Total coverage: 174390 (15%)of 1210379
32 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM dccp #if !defined(_TRACE_DCCP_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_DCCP_H #include <net/sock.h> #include "dccp.h" #include "ccids/ccid3.h" #include <linux/tracepoint.h> #include <trace/events/net_probe_common.h> TRACE_EVENT(dccp_probe, TP_PROTO(struct sock *sk, size_t size), TP_ARGS(sk, size), TP_STRUCT__entry( /* sockaddr_in6 is always bigger than sockaddr_in */ __array(__u8, saddr, sizeof(struct sockaddr_in6)) __array(__u8, daddr, sizeof(struct sockaddr_in6)) __field(__u16, sport) __field(__u16, dport) __field(__u16, size) __field(__u16, tx_s) __field(__u32, tx_rtt) __field(__u32, tx_p) __field(__u32, tx_x_calc) __field(__u64, tx_x_recv) __field(__u64, tx_x) __field(__u32, tx_t_ipi) ), TP_fast_assign( const struct inet_sock *inet = inet_sk(sk); struct ccid3_hc_tx_sock *hc = NULL; if (ccid_get_current_tx_ccid(dccp_sk(sk)) == DCCPC_CCID3) hc = ccid3_hc_tx_sk(sk); memset(__entry->saddr, 0, sizeof(struct sockaddr_in6)); memset(__entry->daddr, 0, sizeof(struct sockaddr_in6)); TP_STORE_ADDR_PORTS(__entry, inet, sk); /* For filtering use */ __entry->sport = ntohs(inet->inet_sport); __entry->dport = ntohs(inet->inet_dport); __entry->size = size; if (hc) { __entry->tx_s = hc->tx_s; __entry->tx_rtt = hc->tx_rtt; __entry->tx_p = hc->tx_p; __entry->tx_x_calc = hc->tx_x_calc; __entry->tx_x_recv = hc->tx_x_recv >> 6; __entry->tx_x = hc->tx_x >> 6; __entry->tx_t_ipi = hc->tx_t_ipi; } else { __entry->tx_s = 0; memset_startat(__entry, 0, tx_rtt); } ), TP_printk("src=%pISpc dest=%pISpc size=%d tx_s=%d tx_rtt=%d " "tx_p=%d tx_x_calc=%u tx_x_recv=%llu tx_x=%llu tx_t_ipi=%d", __entry->saddr, __entry->daddr, __entry->size, __entry->tx_s, __entry->tx_rtt, __entry->tx_p, __entry->tx_x_calc, __entry->tx_x_recv, __entry->tx_x, __entry->tx_t_ipi) ); #endif /* _TRACE_TCP_H */ /* This part must be outside protection */ #undef TRACE_INCLUDE_PATH #define TRACE_INCLUDE_PATH . #undef TRACE_INCLUDE_FILE #define TRACE_INCLUDE_FILE trace #include <trace/define_trace.h>
2861 9376 2741 2740 2741 2713 2371 1138 17 3055 447 2651 795 974 3055 3624 3209 571 701 3628 534 3222 3219 3619 3619 3 1571 2186 435 3551 416 763 8 40 731 598 3617 265 5121 2 5114 4520 4525 236 238 3818 3817 3674 6554 3737 1178 2666 5635 785 49 781 7 7 7 286 317 309 14 4 2 2 2 14 14 14 1101 1098 29 1099 1101 1101 29 29 2 27 29 1101 14 14 1099 1101 6081 6079 1100 1101 1052 1101 1099 321 1101 42 1100 1100 1101 1100 14 1101 1101 1101 1099 1101 1101 1100 1101 1101 2 1101 1106 1108 1110 48 1098 8 1098 1098 1100 1101 1100 1098 1101 1101 15 2948 1 676 627 121 677 677 2139 121 2962 3006 1572 3005 3007 21 21 3002 148 147 21 21 21 3 169 3018 2775 2769 530 3029 172 1028 1029 3166 2685 3160 8 3166 123 3139 3159 8 4 2844 3166 2738 1 3470 3469 3470 630 3470 3659 3656 3662 5 3659 3732 3477 1122 3724 425 221 338 3656 3628 3628 3626 3628 3629 3621 207 206 206 206 207 207 6603 6600 6492 6495 6488 400 400 400 400 400 400 400 400 400 20 380 380 281 380 380 380 1 4 376 376 380 20 17 20 20 20 112 112 112 112 112 8 112 112 112 102 10 112 112 112 112 112 112 112 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 2 2 10354 2842 38 38 266 266 266 32 240 41 225 29 225 758 901 9 178 5533 878 1443 878 1443 3162 2251 952 3165 1228 3166 2248 953 2 3162 5 900 52 52 2253 3166 4 3162 3166 2252 952 3166 2251 956 2252 1 952 952 3165 23 23 23 1 10 2 26 6 6 21 21 93 1165 354 1172 736 11 737 3473 5 3474 1263 2265 11 26 52 1186 737 736 2 735 3166 127 127 23 23 9 9 405 512 118 405 33 95 87 9207 2828 11 11 167 21 11 125 58 58 1 58 58 135 28 135 100 88 59 135 1 135 135 58 100 16 84 77 135 135 135 135 16 123 7 92 87 58 100 135 135 11 130 135 135 28 28 28 28 907 4399 9178 6365 6360 4393 4400 4392 4403 2 4389 4392 4387 9182 12 1178 1088 256 353 928 687 247 2670 2673 2467 282 245 34 2667 929 690 282 639 290 835 500 3 928 928 384 547 929 927 929 2162 561 258 258 2326 1807 901 273 687 690 14 2 76 76 49 34 34 34 49 2 55 249 241 15 15 2 5 247 1 1 2576 76 273 2769 89 3819 10185 11213 168 1 25 3481 13 12285 12284 12285 12320 12613 12602 397 1826 127 10976 12605 7 8929 6785 12601 12610 12614 12634 9829 3414 12605 459 12321 9821 3412 3411 12641 13314 30 12 12 1 12 13263 13314 8648 5218 12 5472 4960 4963 5469 5474 4963 3705 3707 3705 3609 231 3422 3422 2038 2046 2046 2404 2409 2403 21 21 21 21 21 69 69 68 68 58 60 12 3 9 13 46 59 58 48 21 21 427 14 60 437 427 428 428 16 16 16 2299 1970 409 2302 1970 408 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 6215 6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354 6355 6356 6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372 6373 6374 6375 6376 6377 6378 6379 6380 6381 6382 6383 6384 6385 6386 6387 6388 6389 6390 6391 6392 6393 6394 6395 6396 6397 6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414 6415 6416 6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430 6431 6432 6433 6434 6435 6436 6437 6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449 6450 6451 6452 6453 6454 6455 6456 6457 6458 6459 6460 6461 6462 6463 6464 6465 6466 6467 6468 6469 6470 6471 6472 6473 6474 6475 6476 6477 6478 6479 6480 6481 6482 6483 6484 6485 6486 6487 6488 6489 6490 6491 6492 6493 6494 6495 6496 6497 6498 6499 6500 6501 6502 6503 6504 6505 6506 6507 6508 6509 6510 6511 6512 6513 6514 6515 6516 6517 6518 6519 6520 6521 6522 6523 6524 6525 6526 6527 6528 6529 6530 6531 6532 6533 6534 6535 6536 6537 6538 6539 6540 6541 6542 6543 6544 6545 6546 6547 6548 6549 6550 6551 6552 6553 6554 6555 6556 6557 6558 6559 6560 6561 6562 6563 6564 6565 6566 6567 6568 6569 6570 6571 6572 6573 6574 6575 6576 6577 6578 6579 6580 6581 6582 6583 6584 6585 6586 6587 6588 6589 6590 6591 6592 6593 6594 6595 6596 6597 6598 6599 6600 6601 6602 6603 6604 6605 6606 6607 6608 6609 6610 6611 6612 6613 6614 6615 6616 6617 6618 6619 6620 6621 6622 6623 6624 6625 6626 6627 6628 6629 6630 6631 6632 6633 6634 6635 6636 6637 6638 6639 6640 6641 6642 6643 6644 6645 6646 6647 6648 6649 6650 6651 6652 6653 6654 6655 6656 6657 6658 6659 6660 6661 6662 6663 6664 6665 6666 6667 6668 6669 6670 6671 6672 6673 6674 6675 6676 6677 6678 6679 6680 6681 6682 6683 6684 6685 6686 6687 6688 6689 6690 6691 6692 6693 6694 6695 6696 6697 6698 6699 6700 6701 6702 6703 6704 6705 6706 6707 6708 6709 6710 6711 6712 6713 6714 6715 6716 6717 6718 6719 6720 6721 6722 6723 6724 6725 6726 6727 6728 6729 6730 6731 6732 6733 6734 6735 6736 6737 6738 6739 6740 6741 6742 6743 6744 6745 6746 6747 6748 6749 6750 6751 6752 6753 6754 6755 6756 6757 6758 6759 6760 6761 6762 6763 6764 6765 6766 6767 6768 6769 6770 6771 6772 6773 6774 6775 6776 6777 6778 6779 6780 6781 6782 6783 6784 6785 6786 6787 6788 6789 6790 6791 6792 6793 6794 6795 6796 6797 6798 6799 6800 6801 6802 6803 6804 6805 6806 6807 6808 6809 6810 6811 6812 6813 6814 6815 6816 6817 6818 6819 6820 6821 6822 6823 6824 6825 6826 6827 6828 6829 6830 6831 6832 6833 6834 6835 6836 6837 6838 6839 6840 6841 6842 6843 6844 6845 6846 6847 6848 6849 6850 6851 6852 6853 6854 6855 6856 6857 6858 6859 6860 6861 6862 6863 6864 6865 6866 6867 6868 6869 6870 6871 6872 6873 6874 6875 6876 6877 6878 6879 6880 6881 6882 6883 6884 6885 6886 6887 6888 6889 6890 6891 6892 6893 6894 6895 6896 6897 6898 6899 6900 6901 6902 6903 6904 6905 6906 6907 6908 6909 6910 6911 6912 6913 6914 6915 6916 6917 6918 6919 6920 6921 6922 6923 6924 6925 6926 6927 6928 6929 6930 6931 6932 6933 6934 6935 6936 6937 6938 6939 6940 6941 6942 6943 6944 6945 6946 6947 6948 6949 6950 6951 6952 6953 6954 6955 6956 6957 6958 6959 6960 6961 6962 6963 6964 6965 6966 6967 6968 6969 6970 6971 6972 6973 6974 6975 6976 6977 6978 6979 6980 6981 6982 6983 6984 6985 6986 6987 6988 6989 6990 6991 6992 6993 6994 6995 6996 6997 6998 6999 7000 7001 7002 7003 7004 7005 7006 7007 7008 7009 7010 7011 7012 7013 7014 7015 7016 7017 7018 7019 7020 7021 7022 7023 7024 7025 7026 7027 7028 7029 7030 7031 7032 7033 7034 7035 7036 7037 7038 7039 7040 7041 7042 7043 7044 7045 7046 7047 7048 7049 7050 7051 7052 7053 7054 7055 7056 7057 7058 7059 7060 7061 7062 7063 7064 7065 7066 7067 7068 7069 7070 7071 7072 7073 7074 7075 7076 7077 7078 7079 7080 7081 7082 7083 7084 7085 7086 7087 7088 7089 7090 7091 7092 7093 7094 7095 7096 7097 7098 7099 7100 7101 7102 7103 7104 7105 7106 7107 7108 7109 7110 7111 7112 7113 7114 7115 7116 7117 7118 7119 7120 7121 7122 7123 7124 7125 7126 7127 7128 7129 7130 7131 7132 7133 7134 7135 7136 7137 7138 7139 7140 7141 7142 7143 7144 7145 7146 7147 7148 7149 7150 7151 7152 7153 7154 7155 7156 7157 7158 7159 7160 7161 7162 7163 7164 7165 7166 7167 7168 7169 7170 7171 7172 7173 7174 7175 7176 7177 7178 7179 7180 7181 7182 7183 7184 7185 7186 7187 7188 7189 7190 7191 7192 7193 7194 7195 7196 7197 7198 7199 7200 7201 7202 7203 7204 7205 7206 7207 7208 7209 7210 7211 7212 7213 7214 7215 7216 7217 7218 7219 7220 7221 7222 7223 7224 7225 7226 7227 7228 7229 7230 7231 7232 7233 7234 7235 7236 7237 7238 7239 7240 7241 7242 7243 7244 7245 7246 7247 7248 7249 7250 7251 7252 7253 7254 7255 7256 7257 7258 7259 7260 7261 7262 7263 7264 7265 7266 7267 7268 7269 7270 7271 7272 7273 7274 7275 7276 7277 7278 7279 7280 7281 7282 7283 7284 7285 7286 7287 7288 7289 7290 7291 7292 7293 7294 7295 7296 7297 7298 7299 7300 7301 7302 7303 7304 7305 7306 7307 7308 7309 7310 7311 7312 7313 7314 7315 7316 7317 7318 7319 7320 7321 7322 7323 7324 7325 7326 7327 7328 7329 7330 7331 7332 7333 7334 7335 7336 7337 7338 7339 7340 7341 7342 7343 7344 7345 7346 7347 7348 7349 7350 7351 7352 7353 7354 7355 7356 7357 7358 7359 7360 7361 7362 7363 7364 7365 7366 7367 7368 7369 7370 7371 7372 7373 7374 7375 7376 7377 7378 // SPDX-License-Identifier: GPL-2.0-only /* * linux/mm/memory.c * * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds */ /* * demand-loading started 01.12.91 - seems it is high on the list of * things wanted, and it should be easy to implement. - Linus */ /* * Ok, demand-loading was easy, shared pages a little bit tricker. Shared * pages started 02.12.91, seems to work. - Linus. * * Tested sharing by executing about 30 /bin/sh: under the old kernel it * would have taken more than the 6M I have free, but it worked well as * far as I could see. * * Also corrected some "invalidate()"s - I wasn't doing enough of them. */ /* * Real VM (paging to/from disk) started 18.12.91. Much more work and * thought has to go into this. Oh, well.. * 19.12.91 - works, somewhat. Sometimes I get faults, don't know why. * Found it. Everything seems to work now. * 20.12.91 - Ok, making the swap-device changeable like the root. */ /* * 05.04.94 - Multi-page memory management added for v1.1. * Idea by Alex Bligh (alex@cconcepts.co.uk) * * 16.07.99 - Support of BIGMEM added by Gerhard Wichert, Siemens AG * (Gerhard.Wichert@pdb.siemens.de) * * Aug/Sep 2004 Changed to four level page tables (Andi Kleen) */ #include <linux/kernel_stat.h> #include <linux/mm.h> #include <linux/mm_inline.h> #include <linux/sched/mm.h> #include <linux/sched/numa_balancing.h> #include <linux/sched/task.h> #include <linux/hugetlb.h> #include <linux/mman.h> #include <linux/swap.h> #include <linux/highmem.h> #include <linux/pagemap.h> #include <linux/memremap.h> #include <linux/kmsan.h> #include <linux/ksm.h> #include <linux/rmap.h> #include <linux/export.h> #include <linux/delayacct.h> #include <linux/init.h> #include <linux/pfn_t.h> #include <linux/writeback.h> #include <linux/memcontrol.h> #include <linux/mmu_notifier.h> #include <linux/swapops.h> #include <linux/elf.h> #include <linux/gfp.h> #include <linux/migrate.h> #include <linux/string.h> #include <linux/memory-tiers.h> #include <linux/debugfs.h> #include <linux/userfaultfd_k.h> #include <linux/dax.h> #include <linux/oom.h> #include <linux/numa.h> #include <linux/perf_event.h> #include <linux/ptrace.h> #include <linux/vmalloc.h> #include <linux/sched/sysctl.h> #include <trace/events/kmem.h> #include <asm/io.h> #include <asm/mmu_context.h> #include <asm/pgalloc.h> #include <linux/uaccess.h> #include <asm/tlb.h> #include <asm/tlbflush.h> #include "pgalloc-track.h" #include "internal.h" #include "swap.h" #if defined(LAST_CPUPID_NOT_IN_PAGE_FLAGS) && !defined(CONFIG_COMPILE_TEST) #warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_cpupid. #endif static vm_fault_t do_fault(struct vm_fault *vmf); static vm_fault_t do_anonymous_page(struct vm_fault *vmf); static bool vmf_pte_changed(struct vm_fault *vmf); /* * Return true if the original pte was a uffd-wp pte marker (so the pte was * wr-protected). */ static __always_inline bool vmf_orig_pte_uffd_wp(struct vm_fault *vmf) { if (!userfaultfd_wp(vmf->vma)) return false; if (!(vmf->flags & FAULT_FLAG_ORIG_PTE_VALID)) return false; return pte_marker_uffd_wp(vmf->orig_pte); } /* * Randomize the address space (stacks, mmaps, brk, etc.). * * ( When CONFIG_COMPAT_BRK=y we exclude brk from randomization, * as ancient (libc5 based) binaries can segfault. ) */ int randomize_va_space __read_mostly = #ifdef CONFIG_COMPAT_BRK 1; #else 2; #endif #ifndef arch_wants_old_prefaulted_pte static inline bool arch_wants_old_prefaulted_pte(void) { /* * Transitioning a PTE from 'old' to 'young' can be expensive on * some architectures, even if it's performed in hardware. By * default, "false" means prefaulted entries will be 'young'. */ return false; } #endif static int __init disable_randmaps(char *s) { randomize_va_space = 0; return 1; } __setup("norandmaps", disable_randmaps); unsigned long zero_pfn __read_mostly; EXPORT_SYMBOL(zero_pfn); unsigned long highest_memmap_pfn __read_mostly; /* * CONFIG_MMU architectures set up ZERO_PAGE in their paging_init() */ static int __init init_zero_pfn(void) { zero_pfn = page_to_pfn(ZERO_PAGE(0)); return 0; } early_initcall(init_zero_pfn); void mm_trace_rss_stat(struct mm_struct *mm, int member) { trace_rss_stat(mm, member); } /* * Note: this doesn't free the actual pages themselves. That * has been handled earlier when unmapping all the memory regions. */ static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd, unsigned long addr) { pgtable_t token = pmd_pgtable(*pmd); pmd_clear(pmd); pte_free_tlb(tlb, token, addr); mm_dec_nr_ptes(tlb->mm); } static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling) { pmd_t *pmd; unsigned long next; unsigned long start; start = addr; pmd = pmd_offset(pud, addr); do { next = pmd_addr_end(addr, end); if (pmd_none_or_clear_bad(pmd)) continue; free_pte_range(tlb, pmd, addr); } while (pmd++, addr = next, addr != end); start &= PUD_MASK; if (start < floor) return; if (ceiling) { ceiling &= PUD_MASK; if (!ceiling) return; } if (end - 1 > ceiling - 1) return; pmd = pmd_offset(pud, start); pud_clear(pud); pmd_free_tlb(tlb, pmd, start); mm_dec_nr_pmds(tlb->mm); } static inline void free_pud_range(struct mmu_gather *tlb, p4d_t *p4d, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling) { pud_t *pud; unsigned long next; unsigned long start; start = addr; pud = pud_offset(p4d, addr); do { next = pud_addr_end(addr, end); if (pud_none_or_clear_bad(pud)) continue; free_pmd_range(tlb, pud, addr, next, floor, ceiling); } while (pud++, addr = next, addr != end); start &= P4D_MASK; if (start < floor) return; if (ceiling) { ceiling &= P4D_MASK; if (!ceiling) return; } if (end - 1 > ceiling - 1) return; pud = pud_offset(p4d, start); p4d_clear(p4d); pud_free_tlb(tlb, pud, start); mm_dec_nr_puds(tlb->mm); } static inline void free_p4d_range(struct mmu_gather *tlb, pgd_t *pgd, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling) { p4d_t *p4d; unsigned long next; unsigned long start; start = addr; p4d = p4d_offset(pgd, addr); do { next = p4d_addr_end(addr, end); if (p4d_none_or_clear_bad(p4d)) continue; free_pud_range(tlb, p4d, addr, next, floor, ceiling); } while (p4d++, addr = next, addr != end); start &= PGDIR_MASK; if (start < floor) return; if (ceiling) { ceiling &= PGDIR_MASK; if (!ceiling) return; } if (end - 1 > ceiling - 1) return; p4d = p4d_offset(pgd, start); pgd_clear(pgd); p4d_free_tlb(tlb, p4d, start); } /* * This function frees user-level page tables of a process. */ void free_pgd_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling) { pgd_t *pgd; unsigned long next; /* * The next few lines have given us lots of grief... * * Why are we testing PMD* at this top level? Because often * there will be no work to do at all, and we'd prefer not to * go all the way down to the bottom just to discover that. * * Why all these "- 1"s? Because 0 represents both the bottom * of the address space and the top of it (using -1 for the * top wouldn't help much: the masks would do the wrong thing). * The rule is that addr 0 and floor 0 refer to the bottom of * the address space, but end 0 and ceiling 0 refer to the top * Comparisons need to use "end - 1" and "ceiling - 1" (though * that end 0 case should be mythical). * * Wherever addr is brought up or ceiling brought down, we must * be careful to reject "the opposite 0" before it confuses the * subsequent tests. But what about where end is brought down * by PMD_SIZE below? no, end can't go down to 0 there. * * Whereas we round start (addr) and ceiling down, by different * masks at different levels, in order to test whether a table * now has no other vmas using it, so can be freed, we don't * bother to round floor or end up - the tests don't need that. */ addr &= PMD_MASK; if (addr < floor) { addr += PMD_SIZE; if (!addr) return; } if (ceiling) { ceiling &= PMD_MASK; if (!ceiling) return; } if (end - 1 > ceiling - 1) end -= PMD_SIZE; if (addr > end - 1) return; /* * We add page table cache pages with PAGE_SIZE, * (see pte_free_tlb()), flush the tlb if we need */ tlb_change_page_size(tlb, PAGE_SIZE); pgd = pgd_offset(tlb->mm, addr); do { next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(pgd)) continue; free_p4d_range(tlb, pgd, addr, next, floor, ceiling); } while (pgd++, addr = next, addr != end); } void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas, struct vm_area_struct *vma, unsigned long floor, unsigned long ceiling, bool mm_wr_locked) { struct unlink_vma_file_batch vb; do { unsigned long addr = vma->vm_start; struct vm_area_struct *next; /* * Note: USER_PGTABLES_CEILING may be passed as ceiling and may * be 0. This will underflow and is okay. */ next = mas_find(mas, ceiling - 1); if (unlikely(xa_is_zero(next))) next = NULL; /* * Hide vma from rmap and truncate_pagecache before freeing * pgtables */ if (mm_wr_locked) vma_start_write(vma); unlink_anon_vmas(vma); if (is_vm_hugetlb_page(vma)) { unlink_file_vma(vma); hugetlb_free_pgd_range(tlb, addr, vma->vm_end, floor, next ? next->vm_start : ceiling); } else { unlink_file_vma_batch_init(&vb); unlink_file_vma_batch_add(&vb, vma); /* * Optimization: gather nearby vmas into one call down */ while (next && next->vm_start <= vma->vm_end + PMD_SIZE && !is_vm_hugetlb_page(next)) { vma = next; next = mas_find(mas, ceiling - 1); if (unlikely(xa_is_zero(next))) next = NULL; if (mm_wr_locked) vma_start_write(vma); unlink_anon_vmas(vma); unlink_file_vma_batch_add(&vb, vma); } unlink_file_vma_batch_final(&vb); free_pgd_range(tlb, addr, vma->vm_end, floor, next ? next->vm_start : ceiling); } vma = next; } while (vma); } void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte) { spinlock_t *ptl = pmd_lock(mm, pmd); if (likely(pmd_none(*pmd))) { /* Has another populated it ? */ mm_inc_nr_ptes(mm); /* * Ensure all pte setup (eg. pte page lock and page clearing) are * visible before the pte is made visible to other CPUs by being * put into page tables. * * The other side of the story is the pointer chasing in the page * table walking code (when walking the page table without locking; * ie. most of the time). Fortunately, these data accesses consist * of a chain of data-dependent loads, meaning most CPUs (alpha * being the notable exception) will already guarantee loads are * seen in-order. See the alpha page table accessors for the * smp_rmb() barriers in page table walking code. */ smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */ pmd_populate(mm, pmd, *pte); *pte = NULL; } spin_unlock(ptl); } int __pte_alloc(struct mm_struct *mm, pmd_t *pmd) { pgtable_t new = pte_alloc_one(mm); if (!new) return -ENOMEM; pmd_install(mm, pmd, &new); if (new) pte_free(mm, new); return 0; } int __pte_alloc_kernel(pmd_t *pmd) { pte_t *new = pte_alloc_one_kernel(&init_mm); if (!new) return -ENOMEM; spin_lock(&init_mm.page_table_lock); if (likely(pmd_none(*pmd))) { /* Has another populated it ? */ smp_wmb(); /* See comment in pmd_install() */ pmd_populate_kernel(&init_mm, pmd, new); new = NULL; } spin_unlock(&init_mm.page_table_lock); if (new) pte_free_kernel(&init_mm, new); return 0; } static inline void init_rss_vec(int *rss) { memset(rss, 0, sizeof(int) * NR_MM_COUNTERS); } static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss) { int i; for (i = 0; i < NR_MM_COUNTERS; i++) if (rss[i]) add_mm_counter(mm, i, rss[i]); } /* * This function is called to print an error when a bad pte * is found. For example, we might have a PFN-mapped pte in * a region that doesn't allow it. * * The calling function must still handle the error. */ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, pte_t pte, struct page *page) { pgd_t *pgd = pgd_offset(vma->vm_mm, addr); p4d_t *p4d = p4d_offset(pgd, addr); pud_t *pud = pud_offset(p4d, addr); pmd_t *pmd = pmd_offset(pud, addr); struct address_space *mapping; pgoff_t index; static unsigned long resume; static unsigned long nr_shown; static unsigned long nr_unshown; /* * Allow a burst of 60 reports, then keep quiet for that minute; * or allow a steady drip of one report per second. */ if (nr_shown == 60) { if (time_before(jiffies, resume)) { nr_unshown++; return; } if (nr_unshown) { pr_alert("BUG: Bad page map: %lu messages suppressed\n", nr_unshown); nr_unshown = 0; } nr_shown = 0; } if (nr_shown++ == 0) resume = jiffies + 60 * HZ; mapping = vma->vm_file ? vma->vm_file->f_mapping : NULL; index = linear_page_index(vma, addr); pr_alert("BUG: Bad page map in process %s pte:%08llx pmd:%08llx\n", current->comm, (long long)pte_val(pte), (long long)pmd_val(*pmd)); if (page) dump_page(page, "bad pte"); pr_alert("addr:%px vm_flags:%08lx anon_vma:%px mapping:%px index:%lx\n", (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index); pr_alert("file:%pD fault:%ps mmap:%ps read_folio:%ps\n", vma->vm_file, vma->vm_ops ? vma->vm_ops->fault : NULL, vma->vm_file ? vma->vm_file->f_op->mmap : NULL, mapping ? mapping->a_ops->read_folio : NULL); dump_stack(); add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); } /* * vm_normal_page -- This function gets the "struct page" associated with a pte. * * "Special" mappings do not wish to be associated with a "struct page" (either * it doesn't exist, or it exists but they don't want to touch it). In this * case, NULL is returned here. "Normal" mappings do have a struct page. * * There are 2 broad cases. Firstly, an architecture may define a pte_special() * pte bit, in which case this function is trivial. Secondly, an architecture * may not have a spare pte bit, which requires a more complicated scheme, * described below. * * A raw VM_PFNMAP mapping (ie. one that is not COWed) is always considered a * special mapping (even if there are underlying and valid "struct pages"). * COWed pages of a VM_PFNMAP are always normal. * * The way we recognize COWed pages within VM_PFNMAP mappings is through the * rules set up by "remap_pfn_range()": the vma will have the VM_PFNMAP bit * set, and the vm_pgoff will point to the first PFN mapped: thus every special * mapping will always honor the rule * * pfn_of_page == vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT) * * And for normal mappings this is false. * * This restricts such mappings to be a linear translation from virtual address * to pfn. To get around this restriction, we allow arbitrary mappings so long * as the vma is not a COW mapping; in that case, we know that all ptes are * special (because none can have been COWed). * * * In order to support COW of arbitrary special mappings, we have VM_MIXEDMAP. * * VM_MIXEDMAP mappings can likewise contain memory with or without "struct * page" backing, however the difference is that _all_ pages with a struct * page (that is, those where pfn_valid is true) are refcounted and considered * normal pages by the VM. The only exception are zeropages, which are * *never* refcounted. * * The disadvantage is that pages are refcounted (which can be slower and * simply not an option for some PFNMAP users). The advantage is that we * don't have to follow the strict linearity rule of PFNMAP mappings in * order to support COWable mappings. * */ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_t pte) { unsigned long pfn = pte_pfn(pte); if (IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL)) { if (likely(!pte_special(pte))) goto check_pfn; if (vma->vm_ops && vma->vm_ops->find_special_page) return vma->vm_ops->find_special_page(vma, addr); if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)) return NULL; if (is_zero_pfn(pfn)) return NULL; if (pte_devmap(pte)) /* * NOTE: New users of ZONE_DEVICE will not set pte_devmap() * and will have refcounts incremented on their struct pages * when they are inserted into PTEs, thus they are safe to * return here. Legacy ZONE_DEVICE pages that set pte_devmap() * do not have refcounts. Example of legacy ZONE_DEVICE is * MEMORY_DEVICE_FS_DAX type in pmem or virtio_fs drivers. */ return NULL; print_bad_pte(vma, addr, pte, NULL); return NULL; } /* !CONFIG_ARCH_HAS_PTE_SPECIAL case follows: */ if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) { if (vma->vm_flags & VM_MIXEDMAP) { if (!pfn_valid(pfn)) return NULL; if (is_zero_pfn(pfn)) return NULL; goto out; } else { unsigned long off; off = (addr - vma->vm_start) >> PAGE_SHIFT; if (pfn == vma->vm_pgoff + off) return NULL; if (!is_cow_mapping(vma->vm_flags)) return NULL; } } if (is_zero_pfn(pfn)) return NULL; check_pfn: if (unlikely(pfn > highest_memmap_pfn)) { print_bad_pte(vma, addr, pte, NULL); return NULL; } /* * NOTE! We still have PageReserved() pages in the page tables. * eg. VDSO mappings can cause them to exist. */ out: VM_WARN_ON_ONCE(is_zero_pfn(pfn)); return pfn_to_page(pfn); } struct folio *vm_normal_folio(struct vm_area_struct *vma, unsigned long addr, pte_t pte) { struct page *page = vm_normal_page(vma, addr, pte); if (page) return page_folio(page); return NULL; } #ifdef CONFIG_PGTABLE_HAS_HUGE_LEAVES struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t pmd) { unsigned long pfn = pmd_pfn(pmd); /* Currently it's only used for huge pfnmaps */ if (unlikely(pmd_special(pmd))) return NULL; if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) { if (vma->vm_flags & VM_MIXEDMAP) { if (!pfn_valid(pfn)) return NULL; goto out; } else { unsigned long off; off = (addr - vma->vm_start) >> PAGE_SHIFT; if (pfn == vma->vm_pgoff + off) return NULL; if (!is_cow_mapping(vma->vm_flags)) return NULL; } } if (pmd_devmap(pmd)) return NULL; if (is_huge_zero_pmd(pmd)) return NULL; if (unlikely(pfn > highest_memmap_pfn)) return NULL; /* * NOTE! We still have PageReserved() pages in the page tables. * eg. VDSO mappings can cause them to exist. */ out: return pfn_to_page(pfn); } struct folio *vm_normal_folio_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t pmd) { struct page *page = vm_normal_page_pmd(vma, addr, pmd); if (page) return page_folio(page); return NULL; } #endif /** * restore_exclusive_pte - Restore a device-exclusive entry * @vma: VMA covering @address * @folio: the mapped folio * @page: the mapped folio page * @address: the virtual address * @ptep: pte pointer into the locked page table mapping the folio page * @orig_pte: pte value at @ptep * * Restore a device-exclusive non-swap entry to an ordinary present pte. * * The folio and the page table must be locked, and MMU notifiers must have * been called to invalidate any (exclusive) device mappings. * * Locking the folio makes sure that anybody who just converted the pte to * a device-exclusive entry can map it into the device to make forward * progress without others converting it back until the folio was unlocked. * * If the folio lock ever becomes an issue, we can stop relying on the folio * lock; it might make some scenarios with heavy thrashing less likely to * make forward progress, but these scenarios might not be valid use cases. * * Note that the folio lock does not protect against all cases of concurrent * page table modifications (e.g., MADV_DONTNEED, mprotect), so device drivers * must use MMU notifiers to sync against any concurrent changes. */ static void restore_exclusive_pte(struct vm_area_struct *vma, struct folio *folio, struct page *page, unsigned long address, pte_t *ptep, pte_t orig_pte) { pte_t pte; VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio); pte = pte_mkold(mk_pte(page, READ_ONCE(vma->vm_page_prot))); if (pte_swp_soft_dirty(orig_pte)) pte = pte_mksoft_dirty(pte); if (pte_swp_uffd_wp(orig_pte)) pte = pte_mkuffd_wp(pte); if ((vma->vm_flags & VM_WRITE) && can_change_pte_writable(vma, address, pte)) { if (folio_test_dirty(folio)) pte = pte_mkdirty(pte); pte = pte_mkwrite(pte, vma); } set_pte_at(vma->vm_mm, address, ptep, pte); /* * No need to invalidate - it was non-present before. However * secondary CPUs may have mappings that need invalidating. */ update_mmu_cache(vma, address, ptep); } /* * Tries to restore an exclusive pte if the page lock can be acquired without * sleeping. */ static int try_restore_exclusive_pte(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t orig_pte) { struct page *page = pfn_swap_entry_to_page(pte_to_swp_entry(orig_pte)); struct folio *folio = page_folio(page); if (folio_trylock(folio)) { restore_exclusive_pte(vma, folio, page, addr, ptep, orig_pte); folio_unlock(folio); return 0; } return -EBUSY; } /* * copy one vm_area from one task to the other. Assumes the page tables * already present in the new task to be cleared in the whole range * covered by this vma. */ static unsigned long copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, unsigned long addr, int *rss) { unsigned long vm_flags = dst_vma->vm_flags; pte_t orig_pte = ptep_get(src_pte); pte_t pte = orig_pte; struct folio *folio; struct page *page; swp_entry_t entry = pte_to_swp_entry(orig_pte); if (likely(!non_swap_entry(entry))) { if (swap_duplicate(entry) < 0) return -EIO; /* make sure dst_mm is on swapoff's mmlist. */ if (unlikely(list_empty(&dst_mm->mmlist))) { spin_lock(&mmlist_lock); if (list_empty(&dst_mm->mmlist)) list_add(&dst_mm->mmlist, &src_mm->mmlist); spin_unlock(&mmlist_lock); } /* Mark the swap entry as shared. */ if (pte_swp_exclusive(orig_pte)) { pte = pte_swp_clear_exclusive(orig_pte); set_pte_at(src_mm, addr, src_pte, pte); } rss[MM_SWAPENTS]++; } else if (is_migration_entry(entry)) { folio = pfn_swap_entry_folio(entry); rss[mm_counter(folio)]++; if (!is_readable_migration_entry(entry) && is_cow_mapping(vm_flags)) { /* * COW mappings require pages in both parent and child * to be set to read. A previously exclusive entry is * now shared. */ entry = make_readable_migration_entry( swp_offset(entry)); pte = swp_entry_to_pte(entry); if (pte_swp_soft_dirty(orig_pte)) pte = pte_swp_mksoft_dirty(pte); if (pte_swp_uffd_wp(orig_pte)) pte = pte_swp_mkuffd_wp(pte); set_pte_at(src_mm, addr, src_pte, pte); } } else if (is_device_private_entry(entry)) { page = pfn_swap_entry_to_page(entry); folio = page_folio(page); /* * Update rss count even for unaddressable pages, as * they should treated just like normal pages in this * respect. * * We will likely want to have some new rss counters * for unaddressable pages, at some point. But for now * keep things as they are. */ folio_get(folio); rss[mm_counter(folio)]++; /* Cannot fail as these pages cannot get pinned. */ folio_try_dup_anon_rmap_pte(folio, page, dst_vma, src_vma); /* * We do not preserve soft-dirty information, because so * far, checkpoint/restore is the only feature that * requires that. And checkpoint/restore does not work * when a device driver is involved (you cannot easily * save and restore device driver state). */ if (is_writable_device_private_entry(entry) && is_cow_mapping(vm_flags)) { entry = make_readable_device_private_entry( swp_offset(entry)); pte = swp_entry_to_pte(entry); if (pte_swp_uffd_wp(orig_pte)) pte = pte_swp_mkuffd_wp(pte); set_pte_at(src_mm, addr, src_pte, pte); } } else if (is_device_exclusive_entry(entry)) { /* * Make device exclusive entries present by restoring the * original entry then copying as for a present pte. Device * exclusive entries currently only support private writable * (ie. COW) mappings. */ VM_BUG_ON(!is_cow_mapping(src_vma->vm_flags)); if (try_restore_exclusive_pte(src_vma, addr, src_pte, orig_pte)) return -EBUSY; return -ENOENT; } else if (is_pte_marker_entry(entry)) { pte_marker marker = copy_pte_marker(entry, dst_vma); if (marker) set_pte_at(dst_mm, addr, dst_pte, make_pte_marker(marker)); return 0; } if (!userfaultfd_wp(dst_vma)) pte = pte_swp_clear_uffd_wp(pte); set_pte_at(dst_mm, addr, dst_pte, pte); return 0; } /* * Copy a present and normal page. * * NOTE! The usual case is that this isn't required; * instead, the caller can just increase the page refcount * and re-use the pte the traditional way. * * And if we need a pre-allocated page but don't yet have * one, return a negative error to let the preallocation * code know so that it can do so outside the page table * lock. */ static inline int copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, pte_t *dst_pte, pte_t *src_pte, unsigned long addr, int *rss, struct folio **prealloc, struct page *page) { struct folio *new_folio; pte_t pte; new_folio = *prealloc; if (!new_folio) return -EAGAIN; /* * We have a prealloc page, all good! Take it * over and copy the page & arm it. */ if (copy_mc_user_highpage(&new_folio->page, page, addr, src_vma)) return -EHWPOISON; *prealloc = NULL; __folio_mark_uptodate(new_folio); folio_add_new_anon_rmap(new_folio, dst_vma, addr, RMAP_EXCLUSIVE); folio_add_lru_vma(new_folio, dst_vma); rss[MM_ANONPAGES]++; /* All done, just insert the new page copy in the child */ pte = mk_pte(&new_folio->page, dst_vma->vm_page_prot); pte = maybe_mkwrite(pte_mkdirty(pte), dst_vma); if (userfaultfd_pte_wp(dst_vma, ptep_get(src_pte))) /* Uffd-wp needs to be delivered to dest pte as well */ pte = pte_mkuffd_wp(pte); set_pte_at(dst_vma->vm_mm, addr, dst_pte, pte); return 0; } static __always_inline void __copy_present_ptes(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, pte_t *dst_pte, pte_t *src_pte, pte_t pte, unsigned long addr, int nr) { struct mm_struct *src_mm = src_vma->vm_mm; /* If it's a COW mapping, write protect it both processes. */ if (is_cow_mapping(src_vma->vm_flags) && pte_write(pte)) { wrprotect_ptes(src_mm, addr, src_pte, nr); pte = pte_wrprotect(pte); } /* If it's a shared mapping, mark it clean in the child. */ if (src_vma->vm_flags & VM_SHARED) pte = pte_mkclean(pte); pte = pte_mkold(pte); if (!userfaultfd_wp(dst_vma)) pte = pte_clear_uffd_wp(pte); set_ptes(dst_vma->vm_mm, addr, dst_pte, pte, nr); } /* * Copy one present PTE, trying to batch-process subsequent PTEs that map * consecutive pages of the same folio by copying them as well. * * Returns -EAGAIN if one preallocated page is required to copy the next PTE. * Otherwise, returns the number of copied PTEs (at least 1). */ static inline int copy_present_ptes(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, pte_t *dst_pte, pte_t *src_pte, pte_t pte, unsigned long addr, int max_nr, int *rss, struct folio **prealloc) { struct page *page; struct folio *folio; bool any_writable; fpb_t flags = 0; int err, nr; page = vm_normal_page(src_vma, addr, pte); if (unlikely(!page)) goto copy_pte; folio = page_folio(page); /* * If we likely have to copy, just don't bother with batching. Make * sure that the common "small folio" case is as fast as possible * by keeping the batching logic separate. */ if (unlikely(!*prealloc && folio_test_large(folio) && max_nr != 1)) { if (src_vma->vm_flags & VM_SHARED) flags |= FPB_IGNORE_DIRTY; if (!vma_soft_dirty_enabled(src_vma)) flags |= FPB_IGNORE_SOFT_DIRTY; nr = folio_pte_batch(folio, addr, src_pte, pte, max_nr, flags, &any_writable, NULL, NULL); folio_ref_add(folio, nr); if (folio_test_anon(folio)) { if (unlikely(folio_try_dup_anon_rmap_ptes(folio, page, nr, dst_vma, src_vma))) { folio_ref_sub(folio, nr); return -EAGAIN; } rss[MM_ANONPAGES] += nr; VM_WARN_ON_FOLIO(PageAnonExclusive(page), folio); } else { folio_dup_file_rmap_ptes(folio, page, nr, dst_vma); rss[mm_counter_file(folio)] += nr; } if (any_writable) pte = pte_mkwrite(pte, src_vma); __copy_present_ptes(dst_vma, src_vma, dst_pte, src_pte, pte, addr, nr); return nr; } folio_get(folio); if (folio_test_anon(folio)) { /* * If this page may have been pinned by the parent process, * copy the page immediately for the child so that we'll always * guarantee the pinned page won't be randomly replaced in the * future. */ if (unlikely(folio_try_dup_anon_rmap_pte(folio, page, dst_vma, src_vma))) { /* Page may be pinned, we have to copy. */ folio_put(folio); err = copy_present_page(dst_vma, src_vma, dst_pte, src_pte, addr, rss, prealloc, page); return err ? err : 1; } rss[MM_ANONPAGES]++; VM_WARN_ON_FOLIO(PageAnonExclusive(page), folio); } else { folio_dup_file_rmap_pte(folio, page, dst_vma); rss[mm_counter_file(folio)]++; } copy_pte: __copy_present_ptes(dst_vma, src_vma, dst_pte, src_pte, pte, addr, 1); return 1; } static inline struct folio *folio_prealloc(struct mm_struct *src_mm, struct vm_area_struct *vma, unsigned long addr, bool need_zero) { struct folio *new_folio; if (need_zero) new_folio = vma_alloc_zeroed_movable_folio(vma, addr); else new_folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, addr); if (!new_folio) return NULL; if (mem_cgroup_charge(new_folio, src_mm, GFP_KERNEL)) { folio_put(new_folio); return NULL; } folio_throttle_swaprate(new_folio, GFP_KERNEL); return new_folio; } static int copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr, unsigned long end) { struct mm_struct *dst_mm = dst_vma->vm_mm; struct mm_struct *src_mm = src_vma->vm_mm; pte_t *orig_src_pte, *orig_dst_pte; pte_t *src_pte, *dst_pte; pmd_t dummy_pmdval; pte_t ptent; spinlock_t *src_ptl, *dst_ptl; int progress, max_nr, ret = 0; int rss[NR_MM_COUNTERS]; swp_entry_t entry = (swp_entry_t){0}; struct folio *prealloc = NULL; int nr; again: progress = 0; init_rss_vec(rss); /* * copy_pmd_range()'s prior pmd_none_or_clear_bad(src_pmd), and the * error handling here, assume that exclusive mmap_lock on dst and src * protects anon from unexpected THP transitions; with shmem and file * protected by mmap_lock-less collapse skipping areas with anon_vma * (whereas vma_needs_copy() skips areas without anon_vma). A rework * can remove such assumptions later, but this is good enough for now. */ dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl); if (!dst_pte) { ret = -ENOMEM; goto out; } /* * We already hold the exclusive mmap_lock, the copy_pte_range() and * retract_page_tables() are using vma->anon_vma to be exclusive, so * the PTE page is stable, and there is no need to get pmdval and do * pmd_same() check. */ src_pte = pte_offset_map_rw_nolock(src_mm, src_pmd, addr, &dummy_pmdval, &src_ptl); if (!src_pte) { pte_unmap_unlock(dst_pte, dst_ptl); /* ret == 0 */ goto out; } spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); orig_src_pte = src_pte; orig_dst_pte = dst_pte; arch_enter_lazy_mmu_mode(); do { nr = 1; /* * We are holding two locks at this point - either of them * could generate latencies in another task on another CPU. */ if (progress >= 32) { progress = 0; if (need_resched() || spin_needbreak(src_ptl) || spin_needbreak(dst_ptl)) break; } ptent = ptep_get(src_pte); if (pte_none(ptent)) { progress++; continue; } if (unlikely(!pte_present(ptent))) { ret = copy_nonpresent_pte(dst_mm, src_mm, dst_pte, src_pte, dst_vma, src_vma, addr, rss); if (ret == -EIO) { entry = pte_to_swp_entry(ptep_get(src_pte)); break; } else if (ret == -EBUSY) { break; } else if (!ret) { progress += 8; continue; } ptent = ptep_get(src_pte); VM_WARN_ON_ONCE(!pte_present(ptent)); /* * Device exclusive entry restored, continue by copying * the now present pte. */ WARN_ON_ONCE(ret != -ENOENT); } /* copy_present_ptes() will clear `*prealloc' if consumed */ max_nr = (end - addr) / PAGE_SIZE; ret = copy_present_ptes(dst_vma, src_vma, dst_pte, src_pte, ptent, addr, max_nr, rss, &prealloc); /* * If we need a pre-allocated page for this pte, drop the * locks, allocate, and try again. * If copy failed due to hwpoison in source page, break out. */ if (unlikely(ret == -EAGAIN || ret == -EHWPOISON)) break; if (unlikely(prealloc)) { /* * pre-alloc page cannot be reused by next time so as * to strictly follow mempolicy (e.g., alloc_page_vma() * will allocate page according to address). This * could only happen if one pinned pte changed. */ folio_put(prealloc); prealloc = NULL; } nr = ret; progress += 8 * nr; } while (dst_pte += nr, src_pte += nr, addr += PAGE_SIZE * nr, addr != end); arch_leave_lazy_mmu_mode(); pte_unmap_unlock(orig_src_pte, src_ptl); add_mm_rss_vec(dst_mm, rss); pte_unmap_unlock(orig_dst_pte, dst_ptl); cond_resched(); if (ret == -EIO) { VM_WARN_ON_ONCE(!entry.val); if (add_swap_count_continuation(entry, GFP_KERNEL) < 0) { ret = -ENOMEM; goto out; } entry.val = 0; } else if (ret == -EBUSY || unlikely(ret == -EHWPOISON)) { goto out; } else if (ret == -EAGAIN) { prealloc = folio_prealloc(src_mm, src_vma, addr, false); if (!prealloc) return -ENOMEM; } else if (ret < 0) { VM_WARN_ON_ONCE(1); } /* We've captured and resolved the error. Reset, try again. */ ret = 0; if (addr != end) goto again; out: if (unlikely(prealloc)) folio_put(prealloc); return ret; } static inline int copy_pmd_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, pud_t *dst_pud, pud_t *src_pud, unsigned long addr, unsigned long end) { struct mm_struct *dst_mm = dst_vma->vm_mm; struct mm_struct *src_mm = src_vma->vm_mm; pmd_t *src_pmd, *dst_pmd; unsigned long next; dst_pmd = pmd_alloc(dst_mm, dst_pud, addr); if (!dst_pmd) return -ENOMEM; src_pmd = pmd_offset(src_pud, addr); do { next = pmd_addr_end(addr, end); if (is_swap_pmd(*src_pmd) || pmd_trans_huge(*src_pmd) || pmd_devmap(*src_pmd)) { int err; VM_BUG_ON_VMA(next-addr != HPAGE_PMD_SIZE, src_vma); err = copy_huge_pmd(dst_mm, src_mm, dst_pmd, src_pmd, addr, dst_vma, src_vma); if (err == -ENOMEM) return -ENOMEM; if (!err) continue; /* fall through */ } if (pmd_none_or_clear_bad(src_pmd)) continue; if (copy_pte_range(dst_vma, src_vma, dst_pmd, src_pmd, addr, next)) return -ENOMEM; } while (dst_pmd++, src_pmd++, addr = next, addr != end); return 0; } static inline int copy_pud_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, p4d_t *dst_p4d, p4d_t *src_p4d, unsigned long addr, unsigned long end) { struct mm_struct *dst_mm = dst_vma->vm_mm; struct mm_struct *src_mm = src_vma->vm_mm; pud_t *src_pud, *dst_pud; unsigned long next; dst_pud = pud_alloc(dst_mm, dst_p4d, addr); if (!dst_pud) return -ENOMEM; src_pud = pud_offset(src_p4d, addr); do { next = pud_addr_end(addr, end); if (pud_trans_huge(*src_pud) || pud_devmap(*src_pud)) { int err; VM_BUG_ON_VMA(next-addr != HPAGE_PUD_SIZE, src_vma); err = copy_huge_pud(dst_mm, src_mm, dst_pud, src_pud, addr, src_vma); if (err == -ENOMEM) return -ENOMEM; if (!err) continue; /* fall through */ } if (pud_none_or_clear_bad(src_pud)) continue; if (copy_pmd_range(dst_vma, src_vma, dst_pud, src_pud, addr, next)) return -ENOMEM; } while (dst_pud++, src_pud++, addr = next, addr != end); return 0; } static inline int copy_p4d_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, pgd_t *dst_pgd, pgd_t *src_pgd, unsigned long addr, unsigned long end) { struct mm_struct *dst_mm = dst_vma->vm_mm; p4d_t *src_p4d, *dst_p4d; unsigned long next; dst_p4d = p4d_alloc(dst_mm, dst_pgd, addr); if (!dst_p4d) return -ENOMEM; src_p4d = p4d_offset(src_pgd, addr); do { next = p4d_addr_end(addr, end); if (p4d_none_or_clear_bad(src_p4d)) continue; if (copy_pud_range(dst_vma, src_vma, dst_p4d, src_p4d, addr, next)) return -ENOMEM; } while (dst_p4d++, src_p4d++, addr = next, addr != end); return 0; } /* * Return true if the vma needs to copy the pgtable during this fork(). Return * false when we can speed up fork() by allowing lazy page faults later until * when the child accesses the memory range. */ static bool vma_needs_copy(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) { /* * Always copy pgtables when dst_vma has uffd-wp enabled even if it's * file-backed (e.g. shmem). Because when uffd-wp is enabled, pgtable * contains uffd-wp protection information, that's something we can't * retrieve from page cache, and skip copying will lose those info. */ if (userfaultfd_wp(dst_vma)) return true; if (src_vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)) return true; if (src_vma->anon_vma) return true; /* * Don't copy ptes where a page fault will fill them correctly. Fork * becomes much lighter when there are big shared or private readonly * mappings. The tradeoff is that copy_page_range is more efficient * than faulting. */ return false; } int copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) { pgd_t *src_pgd, *dst_pgd; unsigned long addr = src_vma->vm_start; unsigned long end = src_vma->vm_end; struct mm_struct *dst_mm = dst_vma->vm_mm; struct mm_struct *src_mm = src_vma->vm_mm; struct mmu_notifier_range range; unsigned long next, pfn = 0; bool is_cow; int ret; if (!vma_needs_copy(dst_vma, src_vma)) return 0; if (is_vm_hugetlb_page(src_vma)) return copy_hugetlb_page_range(dst_mm, src_mm, dst_vma, src_vma); if (unlikely(src_vma->vm_flags & VM_PFNMAP)) { ret = track_pfn_copy(dst_vma, src_vma, &pfn); if (ret) return ret; } /* * We need to invalidate the secondary MMU mappings only when * there could be a permission downgrade on the ptes of the * parent mm. And a permission downgrade will only happen if * is_cow_mapping() returns true. */ is_cow = is_cow_mapping(src_vma->vm_flags); if (is_cow) { mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE, 0, src_mm, addr, end); mmu_notifier_invalidate_range_start(&range); /* * Disabling preemption is not needed for the write side, as * the read side doesn't spin, but goes to the mmap_lock. * * Use the raw variant of the seqcount_t write API to avoid * lockdep complaining about preemptibility. */ vma_assert_write_locked(src_vma); raw_write_seqcount_begin(&src_mm->write_protect_seq); } ret = 0; dst_pgd = pgd_offset(dst_mm, addr); src_pgd = pgd_offset(src_mm, addr); do { next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(src_pgd)) continue; if (unlikely(copy_p4d_range(dst_vma, src_vma, dst_pgd, src_pgd, addr, next))) { ret = -ENOMEM; break; } } while (dst_pgd++, src_pgd++, addr = next, addr != end); if (is_cow) { raw_write_seqcount_end(&src_mm->write_protect_seq); mmu_notifier_invalidate_range_end(&range); } if (ret && unlikely(src_vma->vm_flags & VM_PFNMAP)) untrack_pfn_copy(dst_vma, pfn); return ret; } /* Whether we should zap all COWed (private) pages too */ static inline bool should_zap_cows(struct zap_details *details) { /* By default, zap all pages */ if (!details || details->reclaim_pt) return true; /* Or, we zap COWed pages only if the caller wants to */ return details->even_cows; } /* Decides whether we should zap this folio with the folio pointer specified */ static inline bool should_zap_folio(struct zap_details *details, struct folio *folio) { /* If we can make a decision without *folio.. */ if (should_zap_cows(details)) return true; /* Otherwise we should only zap non-anon folios */ return !folio_test_anon(folio); } static inline bool zap_drop_markers(struct zap_details *details) { if (!details) return false; return details->zap_flags & ZAP_FLAG_DROP_MARKER; } /* * This function makes sure that we'll replace the none pte with an uffd-wp * swap special pte marker when necessary. Must be with the pgtable lock held. * * Returns true if uffd-wp ptes was installed, false otherwise. */ static inline bool zap_install_uffd_wp_if_needed(struct vm_area_struct *vma, unsigned long addr, pte_t *pte, int nr, struct zap_details *details, pte_t pteval) { bool was_installed = false; #ifdef CONFIG_PTE_MARKER_UFFD_WP /* Zap on anonymous always means dropping everything */ if (vma_is_anonymous(vma)) return false; if (zap_drop_markers(details)) return false; for (;;) { /* the PFN in the PTE is irrelevant. */ if (pte_install_uffd_wp_if_needed(vma, addr, pte, pteval)) was_installed = true; if (--nr == 0) break; pte++; addr += PAGE_SIZE; } #endif return was_installed; } static __always_inline void zap_present_folio_ptes(struct mmu_gather *tlb, struct vm_area_struct *vma, struct folio *folio, struct page *page, pte_t *pte, pte_t ptent, unsigned int nr, unsigned long addr, struct zap_details *details, int *rss, bool *force_flush, bool *force_break, bool *any_skipped) { struct mm_struct *mm = tlb->mm; bool delay_rmap = false; if (!folio_test_anon(folio)) { ptent = get_and_clear_full_ptes(mm, addr, pte, nr, tlb->fullmm); if (pte_dirty(ptent)) { folio_mark_dirty(folio); if (tlb_delay_rmap(tlb)) { delay_rmap = true; *force_flush = true; } } if (pte_young(ptent) && likely(vma_has_recency(vma))) folio_mark_accessed(folio); rss[mm_counter(folio)] -= nr; } else { /* We don't need up-to-date accessed/dirty bits. */ clear_full_ptes(mm, addr, pte, nr, tlb->fullmm); rss[MM_ANONPAGES] -= nr; } /* Checking a single PTE in a batch is sufficient. */ arch_check_zapped_pte(vma, ptent); tlb_remove_tlb_entries(tlb, pte, nr, addr); if (unlikely(userfaultfd_pte_wp(vma, ptent))) *any_skipped = zap_install_uffd_wp_if_needed(vma, addr, pte, nr, details, ptent); if (!delay_rmap) { folio_remove_rmap_ptes(folio, page, nr, vma); if (unlikely(folio_mapcount(folio) < 0)) print_bad_pte(vma, addr, ptent, page); } if (unlikely(__tlb_remove_folio_pages(tlb, page, nr, delay_rmap))) { *force_flush = true; *force_break = true; } } /* * Zap or skip at least one present PTE, trying to batch-process subsequent * PTEs that map consecutive pages of the same folio. * * Returns the number of processed (skipped or zapped) PTEs (at least 1). */ static inline int zap_present_ptes(struct mmu_gather *tlb, struct vm_area_struct *vma, pte_t *pte, pte_t ptent, unsigned int max_nr, unsigned long addr, struct zap_details *details, int *rss, bool *force_flush, bool *force_break, bool *any_skipped) { const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY; struct mm_struct *mm = tlb->mm; struct folio *folio; struct page *page; int nr; page = vm_normal_page(vma, addr, ptent); if (!page) { /* We don't need up-to-date accessed/dirty bits. */ ptep_get_and_clear_full(mm, addr, pte, tlb->fullmm); arch_check_zapped_pte(vma, ptent); tlb_remove_tlb_entry(tlb, pte, addr); if (userfaultfd_pte_wp(vma, ptent)) *any_skipped = zap_install_uffd_wp_if_needed(vma, addr, pte, 1, details, ptent); ksm_might_unmap_zero_page(mm, ptent); return 1; } folio = page_folio(page); if (unlikely(!should_zap_folio(details, folio))) { *any_skipped = true; return 1; } /* * Make sure that the common "small folio" case is as fast as possible * by keeping the batching logic separate. */ if (unlikely(folio_test_large(folio) && max_nr != 1)) { nr = folio_pte_batch(folio, addr, pte, ptent, max_nr, fpb_flags, NULL, NULL, NULL); zap_present_folio_ptes(tlb, vma, folio, page, pte, ptent, nr, addr, details, rss, force_flush, force_break, any_skipped); return nr; } zap_present_folio_ptes(tlb, vma, folio, page, pte, ptent, 1, addr, details, rss, force_flush, force_break, any_skipped); return 1; } static inline int zap_nonpresent_ptes(struct mmu_gather *tlb, struct vm_area_struct *vma, pte_t *pte, pte_t ptent, unsigned int max_nr, unsigned long addr, struct zap_details *details, int *rss, bool *any_skipped) { swp_entry_t entry; int nr = 1; *any_skipped = true; entry = pte_to_swp_entry(ptent); if (is_device_private_entry(entry) || is_device_exclusive_entry(entry)) { struct page *page = pfn_swap_entry_to_page(entry); struct folio *folio = page_folio(page); if (unlikely(!should_zap_folio(details, folio))) return 1; /* * Both device private/exclusive mappings should only * work with anonymous page so far, so we don't need to * consider uffd-wp bit when zap. For more information, * see zap_install_uffd_wp_if_needed(). */ WARN_ON_ONCE(!vma_is_anonymous(vma)); rss[mm_counter(folio)]--; folio_remove_rmap_pte(folio, page, vma); folio_put(folio); } else if (!non_swap_entry(entry)) { /* Genuine swap entries, hence a private anon pages */ if (!should_zap_cows(details)) return 1; nr = swap_pte_batch(pte, max_nr, ptent); rss[MM_SWAPENTS] -= nr; free_swap_and_cache_nr(entry, nr); } else if (is_migration_entry(entry)) { struct folio *folio = pfn_swap_entry_folio(entry); if (!should_zap_folio(details, folio)) return 1; rss[mm_counter(folio)]--; } else if (pte_marker_entry_uffd_wp(entry)) { /* * For anon: always drop the marker; for file: only * drop the marker if explicitly requested. */ if (!vma_is_anonymous(vma) && !zap_drop_markers(details)) return 1; } else if (is_guard_swp_entry(entry)) { /* * Ordinary zapping should not remove guard PTE * markers. Only do so if we should remove PTE markers * in general. */ if (!zap_drop_markers(details)) return 1; } else if (is_hwpoison_entry(entry) || is_poisoned_swp_entry(entry)) { if (!should_zap_cows(details)) return 1; } else { /* We should have covered all the swap entry types */ pr_alert("unrecognized swap entry 0x%lx\n", entry.val); WARN_ON_ONCE(1); } clear_not_present_full_ptes(vma->vm_mm, addr, pte, nr, tlb->fullmm); *any_skipped = zap_install_uffd_wp_if_needed(vma, addr, pte, nr, details, ptent); return nr; } static inline int do_zap_pte_range(struct mmu_gather *tlb, struct vm_area_struct *vma, pte_t *pte, unsigned long addr, unsigned long end, struct zap_details *details, int *rss, bool *force_flush, bool *force_break, bool *any_skipped) { pte_t ptent = ptep_get(pte); int max_nr = (end - addr) / PAGE_SIZE; int nr = 0; /* Skip all consecutive none ptes */ if (pte_none(ptent)) { for (nr = 1; nr < max_nr; nr++) { ptent = ptep_get(pte + nr); if (!pte_none(ptent)) break; } max_nr -= nr; if (!max_nr) return nr; pte += nr; addr += nr * PAGE_SIZE; } if (pte_present(ptent)) nr += zap_present_ptes(tlb, vma, pte, ptent, max_nr, addr, details, rss, force_flush, force_break, any_skipped); else nr += zap_nonpresent_ptes(tlb, vma, pte, ptent, max_nr, addr, details, rss, any_skipped); return nr; } static unsigned long zap_pte_range(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end, struct zap_details *details) { bool force_flush = false, force_break = false; struct mm_struct *mm = tlb->mm; int rss[NR_MM_COUNTERS]; spinlock_t *ptl; pte_t *start_pte; pte_t *pte; pmd_t pmdval; unsigned long start = addr; bool can_reclaim_pt = reclaim_pt_is_enabled(start, end, details); bool direct_reclaim = true; int nr; retry: tlb_change_page_size(tlb, PAGE_SIZE); init_rss_vec(rss); start_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl); if (!pte) return addr; flush_tlb_batched_pending(mm); arch_enter_lazy_mmu_mode(); do { bool any_skipped = false; if (need_resched()) { direct_reclaim = false; break; } nr = do_zap_pte_range(tlb, vma, pte, addr, end, details, rss, &force_flush, &force_break, &any_skipped); if (any_skipped) can_reclaim_pt = false; if (unlikely(force_break)) { addr += nr * PAGE_SIZE; direct_reclaim = false; break; } } while (pte += nr, addr += PAGE_SIZE * nr, addr != end); /* * Fast path: try to hold the pmd lock and unmap the PTE page. * * If the pte lock was released midway (retry case), or if the attempt * to hold the pmd lock failed, then we need to recheck all pte entries * to ensure they are still none, thereby preventing the pte entries * from being repopulated by another thread. */ if (can_reclaim_pt && direct_reclaim && addr == end) direct_reclaim = try_get_and_clear_pmd(mm, pmd, &pmdval); add_mm_rss_vec(mm, rss); arch_leave_lazy_mmu_mode(); /* Do the actual TLB flush before dropping ptl */ if (force_flush) { tlb_flush_mmu_tlbonly(tlb); tlb_flush_rmaps(tlb, vma); } pte_unmap_unlock(start_pte, ptl); /* * If we forced a TLB flush (either due to running out of * batch buffers or because we needed to flush dirty TLB * entries before releasing the ptl), free the batched * memory too. Come back again if we didn't do everything. */ if (force_flush) tlb_flush_mmu(tlb); if (addr != end) { cond_resched(); force_flush = false; force_break = false; goto retry; } if (can_reclaim_pt) { if (direct_reclaim) free_pte(mm, start, tlb, pmdval); else try_to_free_pte(mm, pmd, start, tlb); } return addr; } static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, struct vm_area_struct *vma, pud_t *pud, unsigned long addr, unsigned long end, struct zap_details *details) { pmd_t *pmd; unsigned long next; pmd = pmd_offset(pud, addr); do { next = pmd_addr_end(addr, end); if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) { if (next - addr != HPAGE_PMD_SIZE) __split_huge_pmd(vma, pmd, addr, false, NULL); else if (zap_huge_pmd(tlb, vma, pmd, addr)) { addr = next; continue; } /* fall through */ } else if (details && details->single_folio && folio_test_pmd_mappable(details->single_folio) && next - addr == HPAGE_PMD_SIZE && pmd_none(*pmd)) { spinlock_t *ptl = pmd_lock(tlb->mm, pmd); /* * Take and drop THP pmd lock so that we cannot return * prematurely, while zap_huge_pmd() has cleared *pmd, * but not yet decremented compound_mapcount(). */ spin_unlock(ptl); } if (pmd_none(*pmd)) { addr = next; continue; } addr = zap_pte_range(tlb, vma, pmd, addr, next, details); if (addr != next) pmd--; } while (pmd++, cond_resched(), addr != end); return addr; } static inline unsigned long zap_pud_range(struct mmu_gather *tlb, struct vm_area_struct *vma, p4d_t *p4d, unsigned long addr, unsigned long end, struct zap_details *details) { pud_t *pud; unsigned long next; pud = pud_offset(p4d, addr); do { next = pud_addr_end(addr, end); if (pud_trans_huge(*pud) || pud_devmap(*pud)) { if (next - addr != HPAGE_PUD_SIZE) { mmap_assert_locked(tlb->mm); split_huge_pud(vma, pud, addr); } else if (zap_huge_pud(tlb, vma, pud, addr)) goto next; /* fall through */ } if (pud_none_or_clear_bad(pud)) continue; next = zap_pmd_range(tlb, vma, pud, addr, next, details); next: cond_resched(); } while (pud++, addr = next, addr != end); return addr; } static inline unsigned long zap_p4d_range(struct mmu_gather *tlb, struct vm_area_struct *vma, pgd_t *pgd, unsigned long addr, unsigned long end, struct zap_details *details) { p4d_t *p4d; unsigned long next; p4d = p4d_offset(pgd, addr); do { next = p4d_addr_end(addr, end); if (p4d_none_or_clear_bad(p4d)) continue; next = zap_pud_range(tlb, vma, p4d, addr, next, details); } while (p4d++, addr = next, addr != end); return addr; } void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long addr, unsigned long end, struct zap_details *details) { pgd_t *pgd; unsigned long next; BUG_ON(addr >= end); tlb_start_vma(tlb, vma); pgd = pgd_offset(vma->vm_mm, addr); do { next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(pgd)) continue; next = zap_p4d_range(tlb, vma, pgd, addr, next, details); } while (pgd++, addr = next, addr != end); tlb_end_vma(tlb, vma); } static void unmap_single_vma(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long start_addr, unsigned long end_addr, struct zap_details *details, bool mm_wr_locked) { unsigned long start = max(vma->vm_start, start_addr); unsigned long end; if (start >= vma->vm_end) return; end = min(vma->vm_end, end_addr); if (end <= vma->vm_start) return; if (vma->vm_file) uprobe_munmap(vma, start, end); if (unlikely(vma->vm_flags & VM_PFNMAP)) untrack_pfn(vma, 0, 0, mm_wr_locked); if (start != end) { if (unlikely(is_vm_hugetlb_page(vma))) { /* * It is undesirable to test vma->vm_file as it * should be non-null for valid hugetlb area. * However, vm_file will be NULL in the error * cleanup path of mmap_region. When * hugetlbfs ->mmap method fails, * mmap_region() nullifies vma->vm_file * before calling this function to clean up. * Since no pte has actually been setup, it is * safe to do nothing in this case. */ if (vma->vm_file) { zap_flags_t zap_flags = details ? details->zap_flags : 0; __unmap_hugepage_range(tlb, vma, start, end, NULL, zap_flags); } } else unmap_page_range(tlb, vma, start, end, details); } } /** * unmap_vmas - unmap a range of memory covered by a list of vma's * @tlb: address of the caller's struct mmu_gather * @mas: the maple state * @vma: the starting vma * @start_addr: virtual address at which to start unmapping * @end_addr: virtual address at which to end unmapping * @tree_end: The maximum index to check * @mm_wr_locked: lock flag * * Unmap all pages in the vma list. * * Only addresses between `start' and `end' will be unmapped. * * The VMA list must be sorted in ascending virtual address order. * * unmap_vmas() assumes that the caller will flush the whole unmapped address * range after unmap_vmas() returns. So the only responsibility here is to * ensure that any thus-far unmapped pages are flushed before unmap_vmas() * drops the lock and schedules. */ void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas, struct vm_area_struct *vma, unsigned long start_addr, unsigned long end_addr, unsigned long tree_end, bool mm_wr_locked) { struct mmu_notifier_range range; struct zap_details details = { .zap_flags = ZAP_FLAG_DROP_MARKER | ZAP_FLAG_UNMAP, /* Careful - we need to zap private pages too! */ .even_cows = true, }; mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma->vm_mm, start_addr, end_addr); mmu_notifier_invalidate_range_start(&range); do { unsigned long start = start_addr; unsigned long end = end_addr; hugetlb_zap_begin(vma, &start, &end); unmap_single_vma(tlb, vma, start, end, &details, mm_wr_locked); hugetlb_zap_end(vma, &details); vma = mas_find(mas, tree_end - 1); } while (vma && likely(!xa_is_zero(vma))); mmu_notifier_invalidate_range_end(&range); } /** * zap_page_range_single - remove user pages in a given range * @vma: vm_area_struct holding the applicable pages * @address: starting address of pages to zap * @size: number of bytes to zap * @details: details of shared cache invalidation * * The range must fit into one VMA. */ void zap_page_range_single(struct vm_area_struct *vma, unsigned long address, unsigned long size, struct zap_details *details) { const unsigned long end = address + size; struct mmu_notifier_range range; struct mmu_gather tlb; mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm, address, end); hugetlb_zap_begin(vma, &range.start, &range.end); tlb_gather_mmu(&tlb, vma->vm_mm); update_hiwater_rss(vma->vm_mm); mmu_notifier_invalidate_range_start(&range); /* * unmap 'address-end' not 'range.start-range.end' as range * could have been expanded for hugetlb pmd sharing. */ unmap_single_vma(&tlb, vma, address, end, details, false); mmu_notifier_invalidate_range_end(&range); tlb_finish_mmu(&tlb); hugetlb_zap_end(vma, details); } /** * zap_vma_ptes - remove ptes mapping the vma * @vma: vm_area_struct holding ptes to be zapped * @address: starting address of pages to zap * @size: number of bytes to zap * * This function only unmaps ptes assigned to VM_PFNMAP vmas. * * The entire address range must be fully contained within the vma. * */ void zap_vma_ptes(struct vm_area_struct *vma, unsigned long address, unsigned long size) { if (!range_in_vma(vma, address, address + size) || !(vma->vm_flags & VM_PFNMAP)) return; zap_page_range_single(vma, address, size, NULL); } EXPORT_SYMBOL_GPL(zap_vma_ptes); static pmd_t *walk_to_pmd(struct mm_struct *mm, unsigned long addr) { pgd_t *pgd; p4d_t *p4d; pud_t *pud; pmd_t *pmd; pgd = pgd_offset(mm, addr); p4d = p4d_alloc(mm, pgd, addr); if (!p4d) return NULL; pud = pud_alloc(mm, p4d, addr); if (!pud) return NULL; pmd = pmd_alloc(mm, pud, addr); if (!pmd) return NULL; VM_BUG_ON(pmd_trans_huge(*pmd)); return pmd; } pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr, spinlock_t **ptl) { pmd_t *pmd = walk_to_pmd(mm, addr); if (!pmd) return NULL; return pte_alloc_map_lock(mm, pmd, addr, ptl); } static bool vm_mixed_zeropage_allowed(struct vm_area_struct *vma) { VM_WARN_ON_ONCE(vma->vm_flags & VM_PFNMAP); /* * Whoever wants to forbid the zeropage after some zeropages * might already have been mapped has to scan the page tables and * bail out on any zeropages. Zeropages in COW mappings can * be unshared using FAULT_FLAG_UNSHARE faults. */ if (mm_forbids_zeropage(vma->vm_mm)) return false; /* zeropages in COW mappings are common and unproblematic. */ if (is_cow_mapping(vma->vm_flags)) return true; /* Mappings that do not allow for writable PTEs are unproblematic. */ if (!(vma->vm_flags & (VM_WRITE | VM_MAYWRITE))) return true; /* * Why not allow any VMA that has vm_ops->pfn_mkwrite? GUP could * find the shared zeropage and longterm-pin it, which would * be problematic as soon as the zeropage gets replaced by a different * page due to vma->vm_ops->pfn_mkwrite, because what's mapped would * now differ to what GUP looked up. FSDAX is incompatible to * FOLL_LONGTERM and VM_IO is incompatible to GUP completely (see * check_vma_flags). */ return vma->vm_ops && vma->vm_ops->pfn_mkwrite && (vma_is_fsdax(vma) || vma->vm_flags & VM_IO); } static int validate_page_before_insert(struct vm_area_struct *vma, struct page *page) { struct folio *folio = page_folio(page); if (!folio_ref_count(folio)) return -EINVAL; if (unlikely(is_zero_folio(folio))) { if (!vm_mixed_zeropage_allowed(vma)) return -EINVAL; return 0; } if (folio_test_anon(folio) || folio_test_slab(folio) || page_has_type(page)) return -EINVAL; flush_dcache_folio(folio); return 0; } static int insert_page_into_pte_locked(struct vm_area_struct *vma, pte_t *pte, unsigned long addr, struct page *page, pgprot_t prot, bool mkwrite) { struct folio *folio = page_folio(page); pte_t pteval = ptep_get(pte); if (!pte_none(pteval)) { if (!mkwrite) return -EBUSY; /* see insert_pfn(). */ if (pte_pfn(pteval) != page_to_pfn(page)) { WARN_ON_ONCE(!is_zero_pfn(pte_pfn(pteval))); return -EFAULT; } pteval = maybe_mkwrite(pteval, vma); pteval = pte_mkyoung(pteval); if (ptep_set_access_flags(vma, addr, pte, pteval, 1)) update_mmu_cache(vma, addr, pte); return 0; } /* Ok, finally just insert the thing.. */ pteval = mk_pte(page, prot); if (unlikely(is_zero_folio(folio))) { pteval = pte_mkspecial(pteval); } else { folio_get(folio); pteval = mk_pte(page, prot); if (mkwrite) { pteval = pte_mkyoung(pteval); pteval = maybe_mkwrite(pte_mkdirty(pteval), vma); } inc_mm_counter(vma->vm_mm, mm_counter_file(folio)); folio_add_file_rmap_pte(folio, page, vma); } set_pte_at(vma->vm_mm, addr, pte, pteval); return 0; } static int insert_page(struct vm_area_struct *vma, unsigned long addr, struct page *page, pgprot_t prot, bool mkwrite) { int retval; pte_t *pte; spinlock_t *ptl; retval = validate_page_before_insert(vma, page); if (retval) goto out; retval = -ENOMEM; pte = get_locked_pte(vma->vm_mm, addr, &ptl); if (!pte) goto out; retval = insert_page_into_pte_locked(vma, pte, addr, page, prot, mkwrite); pte_unmap_unlock(pte, ptl); out: return retval; } static int insert_page_in_batch_locked(struct vm_area_struct *vma, pte_t *pte, unsigned long addr, struct page *page, pgprot_t prot) { int err; err = validate_page_before_insert(vma, page); if (err) return err; return insert_page_into_pte_locked(vma, pte, addr, page, prot, false); } /* insert_pages() amortizes the cost of spinlock operations * when inserting pages in a loop. */ static int insert_pages(struct vm_area_struct *vma, unsigned long addr, struct page **pages, unsigned long *num, pgprot_t prot) { pmd_t *pmd = NULL; pte_t *start_pte, *pte; spinlock_t *pte_lock; struct mm_struct *const mm = vma->vm_mm; unsigned long curr_page_idx = 0; unsigned long remaining_pages_total = *num; unsigned long pages_to_write_in_pmd; int ret; more: ret = -EFAULT; pmd = walk_to_pmd(mm, addr); if (!pmd) goto out; pages_to_write_in_pmd = min_t(unsigned long, remaining_pages_total, PTRS_PER_PTE - pte_index(addr)); /* Allocate the PTE if necessary; takes PMD lock once only. */ ret = -ENOMEM; if (pte_alloc(mm, pmd)) goto out; while (pages_to_write_in_pmd) { int pte_idx = 0; const int batch_size = min_t(int, pages_to_write_in_pmd, 8); start_pte = pte_offset_map_lock(mm, pmd, addr, &pte_lock); if (!start_pte) { ret = -EFAULT; goto out; } for (pte = start_pte; pte_idx < batch_size; ++pte, ++pte_idx) { int err = insert_page_in_batch_locked(vma, pte, addr, pages[curr_page_idx], prot); if (unlikely(err)) { pte_unmap_unlock(start_pte, pte_lock); ret = err; remaining_pages_total -= pte_idx; goto out; } addr += PAGE_SIZE; ++curr_page_idx; } pte_unmap_unlock(start_pte, pte_lock); pages_to_write_in_pmd -= batch_size; remaining_pages_total -= batch_size; } if (remaining_pages_total) goto more; ret = 0; out: *num = remaining_pages_total; return ret; } /** * vm_insert_pages - insert multiple pages into user vma, batching the pmd lock. * @vma: user vma to map to * @addr: target start user address of these pages * @pages: source kernel pages * @num: in: number of pages to map. out: number of pages that were *not* * mapped. (0 means all pages were successfully mapped). * * Preferred over vm_insert_page() when inserting multiple pages. * * In case of error, we may have mapped a subset of the provided * pages. It is the caller's responsibility to account for this case. * * The same restrictions apply as in vm_insert_page(). */ int vm_insert_pages(struct vm_area_struct *vma, unsigned long addr, struct page **pages, unsigned long *num) { const unsigned long end_addr = addr + (*num * PAGE_SIZE) - 1; if (addr < vma->vm_start || end_addr >= vma->vm_end) return -EFAULT; if (!(vma->vm_flags & VM_MIXEDMAP)) { BUG_ON(mmap_read_trylock(vma->vm_mm)); BUG_ON(vma->vm_flags & VM_PFNMAP); vm_flags_set(vma, VM_MIXEDMAP); } /* Defer page refcount checking till we're about to map that page. */ return insert_pages(vma, addr, pages, num, vma->vm_page_prot); } EXPORT_SYMBOL(vm_insert_pages); /** * vm_insert_page - insert single page into user vma * @vma: user vma to map to * @addr: target user address of this page * @page: source kernel page * * This allows drivers to insert individual pages they've allocated * into a user vma. The zeropage is supported in some VMAs, * see vm_mixed_zeropage_allowed(). * * The page has to be a nice clean _individual_ kernel allocation. * If you allocate a compound page, you need to have marked it as * such (__GFP_COMP), or manually just split the page up yourself * (see split_page()). * * NOTE! Traditionally this was done with "remap_pfn_range()" which * took an arbitrary page protection parameter. This doesn't allow * that. Your vma protection will have to be set up correctly, which * means that if you want a shared writable mapping, you'd better * ask for a shared writable mapping! * * The page does not need to be reserved. * * Usually this function is called from f_op->mmap() handler * under mm->mmap_lock write-lock, so it can change vma->vm_flags. * Caller must set VM_MIXEDMAP on vma if it wants to call this * function from other places, for example from page-fault handler. * * Return: %0 on success, negative error code otherwise. */ int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, struct page *page) { if (addr < vma->vm_start || addr >= vma->vm_end) return -EFAULT; if (!(vma->vm_flags & VM_MIXEDMAP)) { BUG_ON(mmap_read_trylock(vma->vm_mm)); BUG_ON(vma->vm_flags & VM_PFNMAP); vm_flags_set(vma, VM_MIXEDMAP); } return insert_page(vma, addr, page, vma->vm_page_prot, false); } EXPORT_SYMBOL(vm_insert_page); /* * __vm_map_pages - maps range of kernel pages into user vma * @vma: user vma to map to * @pages: pointer to array of source kernel pages * @num: number of pages in page array * @offset: user's requested vm_pgoff * * This allows drivers to map range of kernel pages into a user vma. * The zeropage is supported in some VMAs, see * vm_mixed_zeropage_allowed(). * * Return: 0 on success and error code otherwise. */ static int __vm_map_pages(struct vm_area_struct *vma, struct page **pages, unsigned long num, unsigned long offset) { unsigned long count = vma_pages(vma); unsigned long uaddr = vma->vm_start; int ret, i; /* Fail if the user requested offset is beyond the end of the object */ if (offset >= num) return -ENXIO; /* Fail if the user requested size exceeds available object size */ if (count > num - offset) return -ENXIO; for (i = 0; i < count; i++) { ret = vm_insert_page(vma, uaddr, pages[offset + i]); if (ret < 0) return ret; uaddr += PAGE_SIZE; } return 0; } /** * vm_map_pages - maps range of kernel pages starts with non zero offset * @vma: user vma to map to * @pages: pointer to array of source kernel pages * @num: number of pages in page array * * Maps an object consisting of @num pages, catering for the user's * requested vm_pgoff * * If we fail to insert any page into the vma, the function will return * immediately leaving any previously inserted pages present. Callers * from the mmap handler may immediately return the error as their caller * will destroy the vma, removing any successfully inserted pages. Other * callers should make their own arrangements for calling unmap_region(). * * Context: Process context. Called by mmap handlers. * Return: 0 on success and error code otherwise. */ int vm_map_pages(struct vm_area_struct *vma, struct page **pages, unsigned long num) { return __vm_map_pages(vma, pages, num, vma->vm_pgoff); } EXPORT_SYMBOL(vm_map_pages); /** * vm_map_pages_zero - map range of kernel pages starts with zero offset * @vma: user vma to map to * @pages: pointer to array of source kernel pages * @num: number of pages in page array * * Similar to vm_map_pages(), except that it explicitly sets the offset * to 0. This function is intended for the drivers that did not consider * vm_pgoff. * * Context: Process context. Called by mmap handlers. * Return: 0 on success and error code otherwise. */ int vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages, unsigned long num) { return __vm_map_pages(vma, pages, num, 0); } EXPORT_SYMBOL(vm_map_pages_zero); static vm_fault_t insert_pfn(struct vm_area_struct *vma, unsigned long addr, pfn_t pfn, pgprot_t prot, bool mkwrite) { struct mm_struct *mm = vma->vm_mm; pte_t *pte, entry; spinlock_t *ptl; pte = get_locked_pte(mm, addr, &ptl); if (!pte) return VM_FAULT_OOM; entry = ptep_get(pte); if (!pte_none(entry)) { if (mkwrite) { /* * For read faults on private mappings the PFN passed * in may not match the PFN we have mapped if the * mapped PFN is a writeable COW page. In the mkwrite * case we are creating a writable PTE for a shared * mapping and we expect the PFNs to match. If they * don't match, we are likely racing with block * allocation and mapping invalidation so just skip the * update. */ if (pte_pfn(entry) != pfn_t_to_pfn(pfn)) { WARN_ON_ONCE(!is_zero_pfn(pte_pfn(entry))); goto out_unlock; } entry = pte_mkyoung(entry); entry = maybe_mkwrite(pte_mkdirty(entry), vma); if (ptep_set_access_flags(vma, addr, pte, entry, 1)) update_mmu_cache(vma, addr, pte); } goto out_unlock; } /* Ok, finally just insert the thing.. */ if (pfn_t_devmap(pfn)) entry = pte_mkdevmap(pfn_t_pte(pfn, prot)); else entry = pte_mkspecial(pfn_t_pte(pfn, prot)); if (mkwrite) { entry = pte_mkyoung(entry); entry = maybe_mkwrite(pte_mkdirty(entry), vma); } set_pte_at(mm, addr, pte, entry); update_mmu_cache(vma, addr, pte); /* XXX: why not for insert_page? */ out_unlock: pte_unmap_unlock(pte, ptl); return VM_FAULT_NOPAGE; } /** * vmf_insert_pfn_prot - insert single pfn into user vma with specified pgprot * @vma: user vma to map to * @addr: target user address of this page * @pfn: source kernel pfn * @pgprot: pgprot flags for the inserted page * * This is exactly like vmf_insert_pfn(), except that it allows drivers * to override pgprot on a per-page basis. * * This only makes sense for IO mappings, and it makes no sense for * COW mappings. In general, using multiple vmas is preferable; * vmf_insert_pfn_prot should only be used if using multiple VMAs is * impractical. * * pgprot typically only differs from @vma->vm_page_prot when drivers set * caching- and encryption bits different than those of @vma->vm_page_prot, * because the caching- or encryption mode may not be known at mmap() time. * * This is ok as long as @vma->vm_page_prot is not used by the core vm * to set caching and encryption bits for those vmas (except for COW pages). * This is ensured by core vm only modifying these page table entries using * functions that don't touch caching- or encryption bits, using pte_modify() * if needed. (See for example mprotect()). * * Also when new page-table entries are created, this is only done using the * fault() callback, and never using the value of vma->vm_page_prot, * except for page-table entries that point to anonymous pages as the result * of COW. * * Context: Process context. May allocate using %GFP_KERNEL. * Return: vm_fault_t value. */ vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn, pgprot_t pgprot) { /* * Technically, architectures with pte_special can avoid all these * restrictions (same for remap_pfn_range). However we would like * consistency in testing and feature parity among all, so we should * try to keep these invariants in place for everybody. */ BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))); BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) == (VM_PFNMAP|VM_MIXEDMAP)); BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags)); BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn)); if (addr < vma->vm_start || addr >= vma->vm_end) return VM_FAULT_SIGBUS; if (!pfn_modify_allowed(pfn, pgprot)) return VM_FAULT_SIGBUS; track_pfn_insert(vma, &pgprot, __pfn_to_pfn_t(pfn, PFN_DEV)); return insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot, false); } EXPORT_SYMBOL(vmf_insert_pfn_prot); /** * vmf_insert_pfn - insert single pfn into user vma * @vma: user vma to map to * @addr: target user address of this page * @pfn: source kernel pfn * * Similar to vm_insert_page, this allows drivers to insert individual pages * they've allocated into a user vma. Same comments apply. * * This function should only be called from a vm_ops->fault handler, and * in that case the handler should return the result of this function. * * vma cannot be a COW mapping. * * As this is called only for pages that do not currently exist, we * do not need to flush old virtual caches or the TLB. * * Context: Process context. May allocate using %GFP_KERNEL. * Return: vm_fault_t value. */ vm_fault_t vmf_insert_pfn(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn) { return vmf_insert_pfn_prot(vma, addr, pfn, vma->vm_page_prot); } EXPORT_SYMBOL(vmf_insert_pfn); static bool vm_mixed_ok(struct vm_area_struct *vma, pfn_t pfn, bool mkwrite) { if (unlikely(is_zero_pfn(pfn_t_to_pfn(pfn))) && (mkwrite || !vm_mixed_zeropage_allowed(vma))) return false; /* these checks mirror the abort conditions in vm_normal_page */ if (vma->vm_flags & VM_MIXEDMAP) return true; if (pfn_t_devmap(pfn)) return true; if (pfn_t_special(pfn)) return true; if (is_zero_pfn(pfn_t_to_pfn(pfn))) return true; return false; } static vm_fault_t __vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, pfn_t pfn, bool mkwrite) { pgprot_t pgprot = vma->vm_page_prot; int err; if (!vm_mixed_ok(vma, pfn, mkwrite)) return VM_FAULT_SIGBUS; if (addr < vma->vm_start || addr >= vma->vm_end) return VM_FAULT_SIGBUS; track_pfn_insert(vma, &pgprot, pfn); if (!pfn_modify_allowed(pfn_t_to_pfn(pfn), pgprot)) return VM_FAULT_SIGBUS; /* * If we don't have pte special, then we have to use the pfn_valid() * based VM_MIXEDMAP scheme (see vm_normal_page), and thus we *must* * refcount the page if pfn_valid is true (hence insert_page rather * than insert_pfn). If a zero_pfn were inserted into a VM_MIXEDMAP * without pte special, it would there be refcounted as a normal page. */ if (!IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL) && !pfn_t_devmap(pfn) && pfn_t_valid(pfn)) { struct page *page; /* * At this point we are committed to insert_page() * regardless of whether the caller specified flags that * result in pfn_t_has_page() == false. */ page = pfn_to_page(pfn_t_to_pfn(pfn)); err = insert_page(vma, addr, page, pgprot, mkwrite); } else { return insert_pfn(vma, addr, pfn, pgprot, mkwrite); } if (err == -ENOMEM) return VM_FAULT_OOM; if (err < 0 && err != -EBUSY) return VM_FAULT_SIGBUS; return VM_FAULT_NOPAGE; } vm_fault_t vmf_insert_page_mkwrite(struct vm_fault *vmf, struct page *page, bool write) { pgprot_t pgprot = vmf->vma->vm_page_prot; unsigned long addr = vmf->address; int err; if (addr < vmf->vma->vm_start || addr >= vmf->vma->vm_end) return VM_FAULT_SIGBUS; err = insert_page(vmf->vma, addr, page, pgprot, write); if (err == -ENOMEM) return VM_FAULT_OOM; if (err < 0 && err != -EBUSY) return VM_FAULT_SIGBUS; return VM_FAULT_NOPAGE; } EXPORT_SYMBOL_GPL(vmf_insert_page_mkwrite); vm_fault_t vmf_insert_mixed(struct vm_area_struct *vma, unsigned long addr, pfn_t pfn) { return __vm_insert_mixed(vma, addr, pfn, false); } EXPORT_SYMBOL(vmf_insert_mixed); /* * If the insertion of PTE failed because someone else already added a * different entry in the mean time, we treat that as success as we assume * the same entry was actually inserted. */ vm_fault_t vmf_insert_mixed_mkwrite(struct vm_area_struct *vma, unsigned long addr, pfn_t pfn) { return __vm_insert_mixed(vma, addr, pfn, true); } /* * maps a range of physical memory into the requested pages. the old * mappings are removed. any references to nonexistent pages results * in null mappings (currently treated as "copy-on-access") */ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd, unsigned long addr, unsigned long end, unsigned long pfn, pgprot_t prot) { pte_t *pte, *mapped_pte; spinlock_t *ptl; int err = 0; mapped_pte = pte = pte_alloc_map_lock(mm, pmd, addr, &ptl); if (!pte) return -ENOMEM; arch_enter_lazy_mmu_mode(); do { BUG_ON(!pte_none(ptep_get(pte))); if (!pfn_modify_allowed(pfn, prot)) { err = -EACCES; break; } set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot))); pfn++; } while (pte++, addr += PAGE_SIZE, addr != end); arch_leave_lazy_mmu_mode(); pte_unmap_unlock(mapped_pte, ptl); return err; } static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud, unsigned long addr, unsigned long end, unsigned long pfn, pgprot_t prot) { pmd_t *pmd; unsigned long next; int err; pfn -= addr >> PAGE_SHIFT; pmd = pmd_alloc(mm, pud, addr); if (!pmd) return -ENOMEM; VM_BUG_ON(pmd_trans_huge(*pmd)); do { next = pmd_addr_end(addr, end); err = remap_pte_range(mm, pmd, addr, next, pfn + (addr >> PAGE_SHIFT), prot); if (err) return err; } while (pmd++, addr = next, addr != end); return 0; } static inline int remap_pud_range(struct mm_struct *mm, p4d_t *p4d, unsigned long addr, unsigned long end, unsigned long pfn, pgprot_t prot) { pud_t *pud; unsigned long next; int err; pfn -= addr >> PAGE_SHIFT; pud = pud_alloc(mm, p4d, addr); if (!pud) return -ENOMEM; do { next = pud_addr_end(addr, end); err = remap_pmd_range(mm, pud, addr, next, pfn + (addr >> PAGE_SHIFT), prot); if (err) return err; } while (pud++, addr = next, addr != end); return 0; } static inline int remap_p4d_range(struct mm_struct *mm, pgd_t *pgd, unsigned long addr, unsigned long end, unsigned long pfn, pgprot_t prot) { p4d_t *p4d; unsigned long next; int err; pfn -= addr >> PAGE_SHIFT; p4d = p4d_alloc(mm, pgd, addr); if (!p4d) return -ENOMEM; do { next = p4d_addr_end(addr, end); err = remap_pud_range(mm, p4d, addr, next, pfn + (addr >> PAGE_SHIFT), prot); if (err) return err; } while (p4d++, addr = next, addr != end); return 0; } static int remap_pfn_range_internal(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn, unsigned long size, pgprot_t prot) { pgd_t *pgd; unsigned long next; unsigned long end = addr + PAGE_ALIGN(size); struct mm_struct *mm = vma->vm_mm; int err; if (WARN_ON_ONCE(!PAGE_ALIGNED(addr))) return -EINVAL; /* * Physically remapped pages are special. Tell the * rest of the world about it: * VM_IO tells people not to look at these pages * (accesses can have side effects). * VM_PFNMAP tells the core MM that the base pages are just * raw PFN mappings, and do not have a "struct page" associated * with them. * VM_DONTEXPAND * Disable vma merging and expanding with mremap(). * VM_DONTDUMP * Omit vma from core dump, even when VM_IO turned off. * * There's a horrible special case to handle copy-on-write * behaviour that some programs depend on. We mark the "original" * un-COW'ed pages by matching them up with "vma->vm_pgoff". * See vm_normal_page() for details. */ if (is_cow_mapping(vma->vm_flags)) { if (addr != vma->vm_start || end != vma->vm_end) return -EINVAL; vma->vm_pgoff = pfn; } vm_flags_set(vma, VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP); BUG_ON(addr >= end); pfn -= addr >> PAGE_SHIFT; pgd = pgd_offset(mm, addr); flush_cache_range(vma, addr, end); do { next = pgd_addr_end(addr, end); err = remap_p4d_range(mm, pgd, addr, next, pfn + (addr >> PAGE_SHIFT), prot); if (err) return err; } while (pgd++, addr = next, addr != end); return 0; } /* * Variant of remap_pfn_range that does not call track_pfn_remap. The caller * must have pre-validated the caching bits of the pgprot_t. */ int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn, unsigned long size, pgprot_t prot) { int error = remap_pfn_range_internal(vma, addr, pfn, size, prot); if (!error) return 0; /* * A partial pfn range mapping is dangerous: it does not * maintain page reference counts, and callers may free * pages due to the error. So zap it early. */ zap_page_range_single(vma, addr, size, NULL); return error; } /** * remap_pfn_range - remap kernel memory to userspace * @vma: user vma to map to * @addr: target page aligned user address to start at * @pfn: page frame number of kernel physical memory address * @size: size of mapping area * @prot: page protection flags for this mapping * * Note: this is only safe if the mm semaphore is held when called. * * Return: %0 on success, negative error code otherwise. */ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn, unsigned long size, pgprot_t prot) { int err; err = track_pfn_remap(vma, &prot, pfn, addr, PAGE_ALIGN(size)); if (err) return -EINVAL; err = remap_pfn_range_notrack(vma, addr, pfn, size, prot); if (err) untrack_pfn(vma, pfn, PAGE_ALIGN(size), true); return err; } EXPORT_SYMBOL(remap_pfn_range); /** * vm_iomap_memory - remap memory to userspace * @vma: user vma to map to * @start: start of the physical memory to be mapped * @len: size of area * * This is a simplified io_remap_pfn_range() for common driver use. The * driver just needs to give us the physical memory range to be mapped, * we'll figure out the rest from the vma information. * * NOTE! Some drivers might want to tweak vma->vm_page_prot first to get * whatever write-combining details or similar. * * Return: %0 on success, negative error code otherwise. */ int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len) { unsigned long vm_len, pfn, pages; /* Check that the physical memory area passed in looks valid */ if (start + len < start) return -EINVAL; /* * You *really* shouldn't map things that aren't page-aligned, * but we've historically allowed it because IO memory might * just have smaller alignment. */ len += start & ~PAGE_MASK; pfn = start >> PAGE_SHIFT; pages = (len + ~PAGE_MASK) >> PAGE_SHIFT; if (pfn + pages < pfn) return -EINVAL; /* We start the mapping 'vm_pgoff' pages into the area */ if (vma->vm_pgoff > pages) return -EINVAL; pfn += vma->vm_pgoff; pages -= vma->vm_pgoff; /* Can we fit all of the mapping? */ vm_len = vma->vm_end - vma->vm_start; if (vm_len >> PAGE_SHIFT > pages) return -EINVAL; /* Ok, let it rip */ return io_remap_pfn_range(vma, vma->vm_start, pfn, vm_len, vma->vm_page_prot); } EXPORT_SYMBOL(vm_iomap_memory); static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd, unsigned long addr, unsigned long end, pte_fn_t fn, void *data, bool create, pgtbl_mod_mask *mask) { pte_t *pte, *mapped_pte; int err = 0; spinlock_t *ptl; if (create) { mapped_pte = pte = (mm == &init_mm) ? pte_alloc_kernel_track(pmd, addr, mask) : pte_alloc_map_lock(mm, pmd, addr, &ptl); if (!pte) return -ENOMEM; } else { mapped_pte = pte = (mm == &init_mm) ? pte_offset_kernel(pmd, addr) : pte_offset_map_lock(mm, pmd, addr, &ptl); if (!pte) return -EINVAL; } arch_enter_lazy_mmu_mode(); if (fn) { do { if (create || !pte_none(ptep_get(pte))) { err = fn(pte, addr, data); if (err) break; } } while (pte++, addr += PAGE_SIZE, addr != end); } *mask |= PGTBL_PTE_MODIFIED; arch_leave_lazy_mmu_mode(); if (mm != &init_mm) pte_unmap_unlock(mapped_pte, ptl); return err; } static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud, unsigned long addr, unsigned long end, pte_fn_t fn, void *data, bool create, pgtbl_mod_mask *mask) { pmd_t *pmd; unsigned long next; int err = 0; BUG_ON(pud_leaf(*pud)); if (create) { pmd = pmd_alloc_track(mm, pud, addr, mask); if (!pmd) return -ENOMEM; } else { pmd = pmd_offset(pud, addr); } do { next = pmd_addr_end(addr, end); if (pmd_none(*pmd) && !create) continue; if (WARN_ON_ONCE(pmd_leaf(*pmd))) return -EINVAL; if (!pmd_none(*pmd) && WARN_ON_ONCE(pmd_bad(*pmd))) { if (!create) continue; pmd_clear_bad(pmd); } err = apply_to_pte_range(mm, pmd, addr, next, fn, data, create, mask); if (err) break; } while (pmd++, addr = next, addr != end); return err; } static int apply_to_pud_range(struct mm_struct *mm, p4d_t *p4d, unsigned long addr, unsigned long end, pte_fn_t fn, void *data, bool create, pgtbl_mod_mask *mask) { pud_t *pud; unsigned long next; int err = 0; if (create) { pud = pud_alloc_track(mm, p4d, addr, mask); if (!pud) return -ENOMEM; } else { pud = pud_offset(p4d, addr); } do { next = pud_addr_end(addr, end); if (pud_none(*pud) && !create) continue; if (WARN_ON_ONCE(pud_leaf(*pud))) return -EINVAL; if (!pud_none(*pud) && WARN_ON_ONCE(pud_bad(*pud))) { if (!create) continue; pud_clear_bad(pud); } err = apply_to_pmd_range(mm, pud, addr, next, fn, data, create, mask); if (err) break; } while (pud++, addr = next, addr != end); return err; } static int apply_to_p4d_range(struct mm_struct *mm, pgd_t *pgd, unsigned long addr, unsigned long end, pte_fn_t fn, void *data, bool create, pgtbl_mod_mask *mask) { p4d_t *p4d; unsigned long next; int err = 0; if (create) { p4d = p4d_alloc_track(mm, pgd, addr, mask); if (!p4d) return -ENOMEM; } else { p4d = p4d_offset(pgd, addr); } do { next = p4d_addr_end(addr, end); if (p4d_none(*p4d) && !create) continue; if (WARN_ON_ONCE(p4d_leaf(*p4d))) return -EINVAL; if (!p4d_none(*p4d) && WARN_ON_ONCE(p4d_bad(*p4d))) { if (!create) continue; p4d_clear_bad(p4d); } err = apply_to_pud_range(mm, p4d, addr, next, fn, data, create, mask); if (err) break; } while (p4d++, addr = next, addr != end); return err; } static int __apply_to_page_range(struct mm_struct *mm, unsigned long addr, unsigned long size, pte_fn_t fn, void *data, bool create) { pgd_t *pgd; unsigned long start = addr, next; unsigned long end = addr + size; pgtbl_mod_mask mask = 0; int err = 0; if (WARN_ON(addr >= end)) return -EINVAL; pgd = pgd_offset(mm, addr); do { next = pgd_addr_end(addr, end); if (pgd_none(*pgd) && !create) continue; if (WARN_ON_ONCE(pgd_leaf(*pgd))) { err = -EINVAL; break; } if (!pgd_none(*pgd) && WARN_ON_ONCE(pgd_bad(*pgd))) { if (!create) continue; pgd_clear_bad(pgd); } err = apply_to_p4d_range(mm, pgd, addr, next, fn, data, create, &mask); if (err) break; } while (pgd++, addr = next, addr != end); if (mask & ARCH_PAGE_TABLE_SYNC_MASK) arch_sync_kernel_mappings(start, start + size); return err; } /* * Scan a region of virtual memory, filling in page tables as necessary * and calling a provided function on each leaf page table. */ int apply_to_page_range(struct mm_struct *mm, unsigned long addr, unsigned long size, pte_fn_t fn, void *data) { return __apply_to_page_range(mm, addr, size, fn, data, true); } EXPORT_SYMBOL_GPL(apply_to_page_range); /* * Scan a region of virtual memory, calling a provided function on * each leaf page table where it exists. * * Unlike apply_to_page_range, this does _not_ fill in page tables * where they are absent. */ int apply_to_existing_page_range(struct mm_struct *mm, unsigned long addr, unsigned long size, pte_fn_t fn, void *data) { return __apply_to_page_range(mm, addr, size, fn, data, false); } /* * handle_pte_fault chooses page fault handler according to an entry which was * read non-atomically. Before making any commitment, on those architectures * or configurations (e.g. i386 with PAE) which might give a mix of unmatched * parts, do_swap_page must check under lock before unmapping the pte and * proceeding (but do_wp_page is only called after already making such a check; * and do_anonymous_page can safely check later on). */ static inline int pte_unmap_same(struct vm_fault *vmf) { int same = 1; #if defined(CONFIG_SMP) || defined(CONFIG_PREEMPTION) if (sizeof(pte_t) > sizeof(unsigned long)) { spin_lock(vmf->ptl); same = pte_same(ptep_get(vmf->pte), vmf->orig_pte); spin_unlock(vmf->ptl); } #endif pte_unmap(vmf->pte); vmf->pte = NULL; return same; } /* * Return: * 0: copied succeeded * -EHWPOISON: copy failed due to hwpoison in source page * -EAGAIN: copied failed (some other reason) */ static inline int __wp_page_copy_user(struct page *dst, struct page *src, struct vm_fault *vmf) { int ret; void *kaddr; void __user *uaddr; struct vm_area_struct *vma = vmf->vma; struct mm_struct *mm = vma->vm_mm; unsigned long addr = vmf->address; if (likely(src)) { if (copy_mc_user_highpage(dst, src, addr, vma)) return -EHWPOISON; return 0; } /* * If the source page was a PFN mapping, we don't have * a "struct page" for it. We do a best-effort copy by * just copying from the original user address. If that * fails, we just zero-fill it. Live with it. */ kaddr = kmap_local_page(dst); pagefault_disable(); uaddr = (void __user *)(addr & PAGE_MASK); /* * On architectures with software "accessed" bits, we would * take a double page fault, so mark it accessed here. */ vmf->pte = NULL; if (!arch_has_hw_pte_young() && !pte_young(vmf->orig_pte)) { pte_t entry; vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl); if (unlikely(!vmf->pte || !pte_same(ptep_get(vmf->pte), vmf->orig_pte))) { /* * Other thread has already handled the fault * and update local tlb only */ if (vmf->pte) update_mmu_tlb(vma, addr, vmf->pte); ret = -EAGAIN; goto pte_unlock; } entry = pte_mkyoung(vmf->orig_pte); if (ptep_set_access_flags(vma, addr, vmf->pte, entry, 0)) update_mmu_cache_range(vmf, vma, addr, vmf->pte, 1); } /* * This really shouldn't fail, because the page is there * in the page tables. But it might just be unreadable, * in which case we just give up and fill the result with * zeroes. */ if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) { if (vmf->pte) goto warn; /* Re-validate under PTL if the page is still mapped */ vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl); if (unlikely(!vmf->pte || !pte_same(ptep_get(vmf->pte), vmf->orig_pte))) { /* The PTE changed under us, update local tlb */ if (vmf->pte) update_mmu_tlb(vma, addr, vmf->pte); ret = -EAGAIN; goto pte_unlock; } /* * The same page can be mapped back since last copy attempt. * Try to copy again under PTL. */ if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) { /* * Give a warn in case there can be some obscure * use-case */ warn: WARN_ON_ONCE(1); clear_page(kaddr); } } ret = 0; pte_unlock: if (vmf->pte) pte_unmap_unlock(vmf->pte, vmf->ptl); pagefault_enable(); kunmap_local(kaddr); flush_dcache_page(dst); return ret; } static gfp_t __get_fault_gfp_mask(struct vm_area_struct *vma) { struct file *vm_file = vma->vm_file; if (vm_file) return mapping_gfp_mask(vm_file->f_mapping) | __GFP_FS | __GFP_IO; /* * Special mappings (e.g. VDSO) do not have any file so fake * a default GFP_KERNEL for them. */ return GFP_KERNEL; } /* * Notify the address space that the page is about to become writable so that * it can prohibit this or wait for the page to get into an appropriate state. * * We do this without the lock held, so that it can sleep if it needs to. */ static vm_fault_t do_page_mkwrite(struct vm_fault *vmf, struct folio *folio) { vm_fault_t ret; unsigned int old_flags = vmf->flags; vmf->flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE; if (vmf->vma->vm_file && IS_SWAPFILE(vmf->vma->vm_file->f_mapping->host)) return VM_FAULT_SIGBUS; ret = vmf->vma->vm_ops->page_mkwrite(vmf); /* Restore original flags so that caller is not surprised */ vmf->flags = old_flags; if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) return ret; if (unlikely(!(ret & VM_FAULT_LOCKED))) { folio_lock(folio); if (!folio->mapping) { folio_unlock(folio); return 0; /* retry */ } ret |= VM_FAULT_LOCKED; } else VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); return ret; } /* * Handle dirtying of a page in shared file mapping on a write fault. * * The function expects the page to be locked and unlocks it. */ static vm_fault_t fault_dirty_shared_page(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; struct address_space *mapping; struct folio *folio = page_folio(vmf->page); bool dirtied; bool page_mkwrite = vma->vm_ops && vma->vm_ops->page_mkwrite; dirtied = folio_mark_dirty(folio); VM_BUG_ON_FOLIO(folio_test_anon(folio), folio); /* * Take a local copy of the address_space - folio.mapping may be zeroed * by truncate after folio_unlock(). The address_space itself remains * pinned by vma->vm_file's reference. We rely on folio_unlock()'s * release semantics to prevent the compiler from undoing this copying. */ mapping = folio_raw_mapping(folio); folio_unlock(folio); if (!page_mkwrite) file_update_time(vma->vm_file); /* * Throttle page dirtying rate down to writeback speed. * * mapping may be NULL here because some device drivers do not * set page.mapping but still dirty their pages * * Drop the mmap_lock before waiting on IO, if we can. The file * is pinning the mapping, as per above. */ if ((dirtied || page_mkwrite) && mapping) { struct file *fpin; fpin = maybe_unlock_mmap_for_io(vmf, NULL); balance_dirty_pages_ratelimited(mapping); if (fpin) { fput(fpin); return VM_FAULT_COMPLETED; } } return 0; } /* * Handle write page faults for pages that can be reused in the current vma * * This can happen either due to the mapping being with the VM_SHARED flag, * or due to us being the last reference standing to the page. In either * case, all we need to do here is to mark the page as writable and update * any related book-keeping. */ static inline void wp_page_reuse(struct vm_fault *vmf, struct folio *folio) __releases(vmf->ptl) { struct vm_area_struct *vma = vmf->vma; pte_t entry; VM_BUG_ON(!(vmf->flags & FAULT_FLAG_WRITE)); VM_WARN_ON(is_zero_pfn(pte_pfn(vmf->orig_pte))); if (folio) { VM_BUG_ON(folio_test_anon(folio) && !PageAnonExclusive(vmf->page)); /* * Clear the folio's cpupid information as the existing * information potentially belongs to a now completely * unrelated process. */ folio_xchg_last_cpupid(folio, (1 << LAST_CPUPID_SHIFT) - 1); } flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte)); entry = pte_mkyoung(vmf->orig_pte); entry = maybe_mkwrite(pte_mkdirty(entry), vma); if (ptep_set_access_flags(vma, vmf->address, vmf->pte, entry, 1)) update_mmu_cache_range(vmf, vma, vmf->address, vmf->pte, 1); pte_unmap_unlock(vmf->pte, vmf->ptl); count_vm_event(PGREUSE); } /* * We could add a bitflag somewhere, but for now, we know that all * vm_ops that have a ->map_pages have been audited and don't need * the mmap_lock to be held. */ static inline vm_fault_t vmf_can_call_fault(const struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; if (vma->vm_ops->map_pages || !(vmf->flags & FAULT_FLAG_VMA_LOCK)) return 0; vma_end_read(vma); return VM_FAULT_RETRY; } /** * __vmf_anon_prepare - Prepare to handle an anonymous fault. * @vmf: The vm_fault descriptor passed from the fault handler. * * When preparing to insert an anonymous page into a VMA from a * fault handler, call this function rather than anon_vma_prepare(). * If this vma does not already have an associated anon_vma and we are * only protected by the per-VMA lock, the caller must retry with the * mmap_lock held. __anon_vma_prepare() will look at adjacent VMAs to * determine if this VMA can share its anon_vma, and that's not safe to * do with only the per-VMA lock held for this VMA. * * Return: 0 if fault handling can proceed. Any other value should be * returned to the caller. */ vm_fault_t __vmf_anon_prepare(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; vm_fault_t ret = 0; if (likely(vma->anon_vma)) return 0; if (vmf->flags & FAULT_FLAG_VMA_LOCK) { if (!mmap_read_trylock(vma->vm_mm)) return VM_FAULT_RETRY; } if (__anon_vma_prepare(vma)) ret = VM_FAULT_OOM; if (vmf->flags & FAULT_FLAG_VMA_LOCK) mmap_read_unlock(vma->vm_mm); return ret; } /* * Handle the case of a page which we actually need to copy to a new page, * either due to COW or unsharing. * * Called with mmap_lock locked and the old page referenced, but * without the ptl held. * * High level logic flow: * * - Allocate a page, copy the content of the old page to the new one. * - Handle book keeping and accounting - cgroups, mmu-notifiers, etc. * - Take the PTL. If the pte changed, bail out and release the allocated page * - If the pte is still the way we remember it, update the page table and all * relevant references. This includes dropping the reference the page-table * held to the old page, as well as updating the rmap. * - In any case, unlock the PTL and drop the reference we took to the old page. */ static vm_fault_t wp_page_copy(struct vm_fault *vmf) { const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE; struct vm_area_struct *vma = vmf->vma; struct mm_struct *mm = vma->vm_mm; struct folio *old_folio = NULL; struct folio *new_folio = NULL; pte_t entry; int page_copied = 0; struct mmu_notifier_range range; vm_fault_t ret; bool pfn_is_zero; delayacct_wpcopy_start(); if (vmf->page) old_folio = page_folio(vmf->page); ret = vmf_anon_prepare(vmf); if (unlikely(ret)) goto out; pfn_is_zero = is_zero_pfn(pte_pfn(vmf->orig_pte)); new_folio = folio_prealloc(mm, vma, vmf->address, pfn_is_zero); if (!new_folio) goto oom; if (!pfn_is_zero) { int err; err = __wp_page_copy_user(&new_folio->page, vmf->page, vmf); if (err) { /* * COW failed, if the fault was solved by other, * it's fine. If not, userspace would re-fault on * the same address and we will handle the fault * from the second attempt. * The -EHWPOISON case will not be retried. */ folio_put(new_folio); if (old_folio) folio_put(old_folio); delayacct_wpcopy_end(); return err == -EHWPOISON ? VM_FAULT_HWPOISON : 0; } kmsan_copy_page_meta(&new_folio->page, vmf->page); } __folio_mark_uptodate(new_folio); mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, vmf->address & PAGE_MASK, (vmf->address & PAGE_MASK) + PAGE_SIZE); mmu_notifier_invalidate_range_start(&range); /* * Re-check the pte - we dropped the lock */ vmf->pte = pte_offset_map_lock(mm, vmf->pmd, vmf->address, &vmf->ptl); if (likely(vmf->pte && pte_same(ptep_get(vmf->pte), vmf->orig_pte))) { if (old_folio) { if (!folio_test_anon(old_folio)) { dec_mm_counter(mm, mm_counter_file(old_folio)); inc_mm_counter(mm, MM_ANONPAGES); } } else { ksm_might_unmap_zero_page(mm, vmf->orig_pte); inc_mm_counter(mm, MM_ANONPAGES); } flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte)); entry = mk_pte(&new_folio->page, vma->vm_page_prot); entry = pte_sw_mkyoung(entry); if (unlikely(unshare)) { if (pte_soft_dirty(vmf->orig_pte)) entry = pte_mksoft_dirty(entry); if (pte_uffd_wp(vmf->orig_pte)) entry = pte_mkuffd_wp(entry); } else { entry = maybe_mkwrite(pte_mkdirty(entry), vma); } /* * Clear the pte entry and flush it first, before updating the * pte with the new entry, to keep TLBs on different CPUs in * sync. This code used to set the new PTE then flush TLBs, but * that left a window where the new PTE could be loaded into * some TLBs while the old PTE remains in others. */ ptep_clear_flush(vma, vmf->address, vmf->pte); folio_add_new_anon_rmap(new_folio, vma, vmf->address, RMAP_EXCLUSIVE); folio_add_lru_vma(new_folio, vma); BUG_ON(unshare && pte_write(entry)); set_pte_at(mm, vmf->address, vmf->pte, entry); update_mmu_cache_range(vmf, vma, vmf->address, vmf->pte, 1); if (old_folio) { /* * Only after switching the pte to the new page may * we remove the mapcount here. Otherwise another * process may come and find the rmap count decremented * before the pte is switched to the new page, and * "reuse" the old page writing into it while our pte * here still points into it and can be read by other * threads. * * The critical issue is to order this * folio_remove_rmap_pte() with the ptp_clear_flush * above. Those stores are ordered by (if nothing else,) * the barrier present in the atomic_add_negative * in folio_remove_rmap_pte(); * * Then the TLB flush in ptep_clear_flush ensures that * no process can access the old page before the * decremented mapcount is visible. And the old page * cannot be reused until after the decremented * mapcount is visible. So transitively, TLBs to * old page will be flushed before it can be reused. */ folio_remove_rmap_pte(old_folio, vmf->page, vma); } /* Free the old page.. */ new_folio = old_folio; page_copied = 1; pte_unmap_unlock(vmf->pte, vmf->ptl); } else if (vmf->pte) { update_mmu_tlb(vma, vmf->address, vmf->pte); pte_unmap_unlock(vmf->pte, vmf->ptl); } mmu_notifier_invalidate_range_end(&range); if (new_folio) folio_put(new_folio); if (old_folio) { if (page_copied) free_swap_cache(old_folio); folio_put(old_folio); } delayacct_wpcopy_end(); return 0; oom: ret = VM_FAULT_OOM; out: if (old_folio) folio_put(old_folio); delayacct_wpcopy_end(); return ret; } /** * finish_mkwrite_fault - finish page fault for a shared mapping, making PTE * writeable once the page is prepared * * @vmf: structure describing the fault * @folio: the folio of vmf->page * * This function handles all that is needed to finish a write page fault in a * shared mapping due to PTE being read-only once the mapped page is prepared. * It handles locking of PTE and modifying it. * * The function expects the page to be locked or other protection against * concurrent faults / writeback (such as DAX radix tree locks). * * Return: %0 on success, %VM_FAULT_NOPAGE when PTE got changed before * we acquired PTE lock. */ static vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf, struct folio *folio) { WARN_ON_ONCE(!(vmf->vma->vm_flags & VM_SHARED)); vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); if (!vmf->pte) return VM_FAULT_NOPAGE; /* * We might have raced with another page fault while we released the * pte_offset_map_lock. */ if (!pte_same(ptep_get(vmf->pte), vmf->orig_pte)) { update_mmu_tlb(vmf->vma, vmf->address, vmf->pte); pte_unmap_unlock(vmf->pte, vmf->ptl); return VM_FAULT_NOPAGE; } wp_page_reuse(vmf, folio); return 0; } /* * Handle write page faults for VM_MIXEDMAP or VM_PFNMAP for a VM_SHARED * mapping */ static vm_fault_t wp_pfn_shared(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; if (vma->vm_ops && vma->vm_ops->pfn_mkwrite) { vm_fault_t ret; pte_unmap_unlock(vmf->pte, vmf->ptl); ret = vmf_can_call_fault(vmf); if (ret) return ret; vmf->flags |= FAULT_FLAG_MKWRITE; ret = vma->vm_ops->pfn_mkwrite(vmf); if (ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)) return ret; return finish_mkwrite_fault(vmf, NULL); } wp_page_reuse(vmf, NULL); return 0; } static vm_fault_t wp_page_shared(struct vm_fault *vmf, struct folio *folio) __releases(vmf->ptl) { struct vm_area_struct *vma = vmf->vma; vm_fault_t ret = 0; folio_get(folio); if (vma->vm_ops && vma->vm_ops->page_mkwrite) { vm_fault_t tmp; pte_unmap_unlock(vmf->pte, vmf->ptl); tmp = vmf_can_call_fault(vmf); if (tmp) { folio_put(folio); return tmp; } tmp = do_page_mkwrite(vmf, folio); if (unlikely(!tmp || (tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) { folio_put(folio); return tmp; } tmp = finish_mkwrite_fault(vmf, folio); if (unlikely(tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) { folio_unlock(folio); folio_put(folio); return tmp; } } else { wp_page_reuse(vmf, folio); folio_lock(folio); } ret |= fault_dirty_shared_page(vmf); folio_put(folio); return ret; } #ifdef CONFIG_TRANSPARENT_HUGEPAGE static bool __wp_can_reuse_large_anon_folio(struct folio *folio, struct vm_area_struct *vma) { bool exclusive = false; /* Let's just free up a large folio if only a single page is mapped. */ if (folio_large_mapcount(folio) <= 1) return false; /* * The assumption for anonymous folios is that each page can only get * mapped once into each MM. The only exception are KSM folios, which * are always small. * * Each taken mapcount must be paired with exactly one taken reference, * whereby the refcount must be incremented before the mapcount when * mapping a page, and the refcount must be decremented after the * mapcount when unmapping a page. * * If all folio references are from mappings, and all mappings are in * the page tables of this MM, then this folio is exclusive to this MM. */ if (folio_test_large_maybe_mapped_shared(folio)) return false; VM_WARN_ON_ONCE(folio_test_ksm(folio)); if (unlikely(folio_test_swapcache(folio))) { /* * Note: freeing up the swapcache will fail if some PTEs are * still swap entries. */ if (!folio_trylock(folio)) return false; folio_free_swap(folio); folio_unlock(folio); } if (folio_large_mapcount(folio) != folio_ref_count(folio)) return false; /* Stabilize the mapcount vs. refcount and recheck. */ folio_lock_large_mapcount(folio); VM_WARN_ON_ONCE(folio_large_mapcount(folio) < folio_ref_count(folio)); if (folio_test_large_maybe_mapped_shared(folio)) goto unlock; if (folio_large_mapcount(folio) != folio_ref_count(folio)) goto unlock; VM_WARN_ON_ONCE_FOLIO(folio_large_mapcount(folio) > folio_nr_pages(folio), folio); VM_WARN_ON_ONCE_FOLIO(folio_entire_mapcount(folio), folio); VM_WARN_ON_ONCE(folio_mm_id(folio, 0) != vma->vm_mm->mm_id && folio_mm_id(folio, 1) != vma->vm_mm->mm_id); /* * Do we need the folio lock? Likely not. If there would have been * references from page migration/swapout, we would have detected * an additional folio reference and never ended up here. */ exclusive = true; unlock: folio_unlock_large_mapcount(folio); return exclusive; } #else /* !CONFIG_TRANSPARENT_HUGEPAGE */ static bool __wp_can_reuse_large_anon_folio(struct folio *folio, struct vm_area_struct *vma) { BUILD_BUG(); } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ static bool wp_can_reuse_anon_folio(struct folio *folio, struct vm_area_struct *vma) { if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && folio_test_large(folio)) return __wp_can_reuse_large_anon_folio(folio, vma); /* * We have to verify under folio lock: these early checks are * just an optimization to avoid locking the folio and freeing * the swapcache if there is little hope that we can reuse. * * KSM doesn't necessarily raise the folio refcount. */ if (folio_test_ksm(folio) || folio_ref_count(folio) > 3) return false; if (!folio_test_lru(folio)) /* * We cannot easily detect+handle references from * remote LRU caches or references to LRU folios. */ lru_add_drain(); if (folio_ref_count(folio) > 1 + folio_test_swapcache(folio)) return false; if (!folio_trylock(folio)) return false; if (folio_test_swapcache(folio)) folio_free_swap(folio); if (folio_test_ksm(folio) || folio_ref_count(folio) != 1) { folio_unlock(folio); return false; } /* * Ok, we've got the only folio reference from our mapping * and the folio is locked, it's dark out, and we're wearing * sunglasses. Hit it. */ folio_move_anon_rmap(folio, vma); folio_unlock(folio); return true; } /* * This routine handles present pages, when * * users try to write to a shared page (FAULT_FLAG_WRITE) * * GUP wants to take a R/O pin on a possibly shared anonymous page * (FAULT_FLAG_UNSHARE) * * It is done by copying the page to a new address and decrementing the * shared-page counter for the old page. * * Note that this routine assumes that the protection checks have been * done by the caller (the low-level page fault routine in most cases). * Thus, with FAULT_FLAG_WRITE, we can safely just mark it writable once we've * done any necessary COW. * * In case of FAULT_FLAG_WRITE, we also mark the page dirty at this point even * though the page will change only once the write actually happens. This * avoids a few races, and potentially makes it more efficient. * * We enter with non-exclusive mmap_lock (to exclude vma changes, * but allow concurrent faults), with pte both mapped and locked. * We return with mmap_lock still held, but pte unmapped and unlocked. */ static vm_fault_t do_wp_page(struct vm_fault *vmf) __releases(vmf->ptl) { const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE; struct vm_area_struct *vma = vmf->vma; struct folio *folio = NULL; pte_t pte; if (likely(!unshare)) { if (userfaultfd_pte_wp(vma, ptep_get(vmf->pte))) { if (!userfaultfd_wp_async(vma)) { pte_unmap_unlock(vmf->pte, vmf->ptl); return handle_userfault(vmf, VM_UFFD_WP); } /* * Nothing needed (cache flush, TLB invalidations, * etc.) because we're only removing the uffd-wp bit, * which is completely invisible to the user. */ pte = pte_clear_uffd_wp(ptep_get(vmf->pte)); set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte); /* * Update this to be prepared for following up CoW * handling */ vmf->orig_pte = pte; } /* * Userfaultfd write-protect can defer flushes. Ensure the TLB * is flushed in this case before copying. */ if (unlikely(userfaultfd_wp(vmf->vma) && mm_tlb_flush_pending(vmf->vma->vm_mm))) flush_tlb_page(vmf->vma, vmf->address); } vmf->page = vm_normal_page(vma, vmf->address, vmf->orig_pte); if (vmf->page) folio = page_folio(vmf->page); /* * Shared mapping: we are guaranteed to have VM_WRITE and * FAULT_FLAG_WRITE set at this point. */ if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) { /* * VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a * VM_PFNMAP VMA. FS DAX also wants ops->pfn_mkwrite called. * * We should not cow pages in a shared writeable mapping. * Just mark the pages writable and/or call ops->pfn_mkwrite. */ if (!vmf->page || is_fsdax_page(vmf->page)) { vmf->page = NULL; return wp_pfn_shared(vmf); } return wp_page_shared(vmf, folio); } /* * Private mapping: create an exclusive anonymous page copy if reuse * is impossible. We might miss VM_WRITE for FOLL_FORCE handling. * * If we encounter a page that is marked exclusive, we must reuse * the page without further checks. */ if (folio && folio_test_anon(folio) && (PageAnonExclusive(vmf->page) || wp_can_reuse_anon_folio(folio, vma))) { if (!PageAnonExclusive(vmf->page)) SetPageAnonExclusive(vmf->page); if (unlikely(unshare)) { pte_unmap_unlock(vmf->pte, vmf->ptl); return 0; } wp_page_reuse(vmf, folio); return 0; } /* * Ok, we need to copy. Oh, well.. */ if (folio) folio_get(folio); pte_unmap_unlock(vmf->pte, vmf->ptl); #ifdef CONFIG_KSM if (folio && folio_test_ksm(folio)) count_vm_event(COW_KSM); #endif return wp_page_copy(vmf); } static void unmap_mapping_range_vma(struct vm_area_struct *vma, unsigned long start_addr, unsigned long end_addr, struct zap_details *details) { zap_page_range_single(vma, start_addr, end_addr - start_addr, details); } static inline void unmap_mapping_range_tree(struct rb_root_cached *root, pgoff_t first_index, pgoff_t last_index, struct zap_details *details) { struct vm_area_struct *vma; pgoff_t vba, vea, zba, zea; vma_interval_tree_foreach(vma, root, first_index, last_index) { vba = vma->vm_pgoff; vea = vba + vma_pages(vma) - 1; zba = max(first_index, vba); zea = min(last_index, vea); unmap_mapping_range_vma(vma, ((zba - vba) << PAGE_SHIFT) + vma->vm_start, ((zea - vba + 1) << PAGE_SHIFT) + vma->vm_start, details); } } /** * unmap_mapping_folio() - Unmap single folio from processes. * @folio: The locked folio to be unmapped. * * Unmap this folio from any userspace process which still has it mmaped. * Typically, for efficiency, the range of nearby pages has already been * unmapped by unmap_mapping_pages() or unmap_mapping_range(). But once * truncation or invalidation holds the lock on a folio, it may find that * the page has been remapped again: and then uses unmap_mapping_folio() * to unmap it finally. */ void unmap_mapping_folio(struct folio *folio) { struct address_space *mapping = folio->mapping; struct zap_details details = { }; pgoff_t first_index; pgoff_t last_index; VM_BUG_ON(!folio_test_locked(folio)); first_index = folio->index; last_index = folio_next_index(folio) - 1; details.even_cows = false; details.single_folio = folio; details.zap_flags = ZAP_FLAG_DROP_MARKER; i_mmap_lock_read(mapping); if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root))) unmap_mapping_range_tree(&mapping->i_mmap, first_index, last_index, &details); i_mmap_unlock_read(mapping); } /** * unmap_mapping_pages() - Unmap pages from processes. * @mapping: The address space containing pages to be unmapped. * @start: Index of first page to be unmapped. * @nr: Number of pages to be unmapped. 0 to unmap to end of file. * @even_cows: Whether to unmap even private COWed pages. * * Unmap the pages in this address space from any userspace process which * has them mmaped. Generally, you want to remove COWed pages as well when * a file is being truncated, but not when invalidating pages from the page * cache. */ void unmap_mapping_pages(struct address_space *mapping, pgoff_t start, pgoff_t nr, bool even_cows) { struct zap_details details = { }; pgoff_t first_index = start; pgoff_t last_index = start + nr - 1; details.even_cows = even_cows; if (last_index < first_index) last_index = ULONG_MAX; i_mmap_lock_read(mapping); if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root))) unmap_mapping_range_tree(&mapping->i_mmap, first_index, last_index, &details); i_mmap_unlock_read(mapping); } EXPORT_SYMBOL_GPL(unmap_mapping_pages); /** * unmap_mapping_range - unmap the portion of all mmaps in the specified * address_space corresponding to the specified byte range in the underlying * file. * * @mapping: the address space containing mmaps to be unmapped. * @holebegin: byte in first page to unmap, relative to the start of * the underlying file. This will be rounded down to a PAGE_SIZE * boundary. Note that this is different from truncate_pagecache(), which * must keep the partial page. In contrast, we must get rid of * partial pages. * @holelen: size of prospective hole in bytes. This will be rounded * up to a PAGE_SIZE boundary. A holelen of zero truncates to the * end of the file. * @even_cows: 1 when truncating a file, unmap even private COWed pages; * but 0 when invalidating pagecache, don't throw away private data. */ void unmap_mapping_range(struct address_space *mapping, loff_t const holebegin, loff_t const holelen, int even_cows) { pgoff_t hba = (pgoff_t)(holebegin) >> PAGE_SHIFT; pgoff_t hlen = ((pgoff_t)(holelen) + PAGE_SIZE - 1) >> PAGE_SHIFT; /* Check for overflow. */ if (sizeof(holelen) > sizeof(hlen)) { long long holeend = (holebegin + holelen + PAGE_SIZE - 1) >> PAGE_SHIFT; if (holeend & ~(long long)ULONG_MAX) hlen = ULONG_MAX - hba + 1; } unmap_mapping_pages(mapping, hba, hlen, even_cows); } EXPORT_SYMBOL(unmap_mapping_range); /* * Restore a potential device exclusive pte to a working pte entry */ static vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf) { struct folio *folio = page_folio(vmf->page); struct vm_area_struct *vma = vmf->vma; struct mmu_notifier_range range; vm_fault_t ret; /* * We need a reference to lock the folio because we don't hold * the PTL so a racing thread can remove the device-exclusive * entry and unmap it. If the folio is free the entry must * have been removed already. If it happens to have already * been re-allocated after being freed all we do is lock and * unlock it. */ if (!folio_try_get(folio)) return 0; ret = folio_lock_or_retry(folio, vmf); if (ret) { folio_put(folio); return ret; } mmu_notifier_range_init_owner(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm, vmf->address & PAGE_MASK, (vmf->address & PAGE_MASK) + PAGE_SIZE, NULL); mmu_notifier_invalidate_range_start(&range); vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); if (likely(vmf->pte && pte_same(ptep_get(vmf->pte), vmf->orig_pte))) restore_exclusive_pte(vma, folio, vmf->page, vmf->address, vmf->pte, vmf->orig_pte); if (vmf->pte) pte_unmap_unlock(vmf->pte, vmf->ptl); folio_unlock(folio); folio_put(folio); mmu_notifier_invalidate_range_end(&range); return 0; } static inline bool should_try_to_free_swap(struct folio *folio, struct vm_area_struct *vma, unsigned int fault_flags) { if (!folio_test_swapcache(folio)) return false; if (mem_cgroup_swap_full(folio) || (vma->vm_flags & VM_LOCKED) || folio_test_mlocked(folio)) return true; /* * If we want to map a page that's in the swapcache writable, we * have to detect via the refcount if we're really the exclusive * user. Try freeing the swapcache to get rid of the swapcache * reference only in case it's likely that we'll be the exlusive user. */ return (fault_flags & FAULT_FLAG_WRITE) && !folio_test_ksm(folio) && folio_ref_count(folio) == (1 + folio_nr_pages(folio)); } static vm_fault_t pte_marker_clear(struct vm_fault *vmf) { vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); if (!vmf->pte) return 0; /* * Be careful so that we will only recover a special uffd-wp pte into a * none pte. Otherwise it means the pte could have changed, so retry. * * This should also cover the case where e.g. the pte changed * quickly from a PTE_MARKER_UFFD_WP into PTE_MARKER_POISONED. * So is_pte_marker() check is not enough to safely drop the pte. */ if (pte_same(vmf->orig_pte, ptep_get(vmf->pte))) pte_clear(vmf->vma->vm_mm, vmf->address, vmf->pte); pte_unmap_unlock(vmf->pte, vmf->ptl); return 0; } static vm_fault_t do_pte_missing(struct vm_fault *vmf) { if (vma_is_anonymous(vmf->vma)) return do_anonymous_page(vmf); else return do_fault(vmf); } /* * This is actually a page-missing access, but with uffd-wp special pte * installed. It means this pte was wr-protected before being unmapped. */ static vm_fault_t pte_marker_handle_uffd_wp(struct vm_fault *vmf) { /* * Just in case there're leftover special ptes even after the region * got unregistered - we can simply clear them. */ if (unlikely(!userfaultfd_wp(vmf->vma))) return pte_marker_clear(vmf); return do_pte_missing(vmf); } static vm_fault_t handle_pte_marker(struct vm_fault *vmf) { swp_entry_t entry = pte_to_swp_entry(vmf->orig_pte); unsigned long marker = pte_marker_get(entry); /* * PTE markers should never be empty. If anything weird happened, * the best thing to do is to kill the process along with its mm. */ if (WARN_ON_ONCE(!marker)) return VM_FAULT_SIGBUS; /* Higher priority than uffd-wp when data corrupted */ if (marker & PTE_MARKER_POISONED) return VM_FAULT_HWPOISON; /* Hitting a guard page is always a fatal condition. */ if (marker & PTE_MARKER_GUARD) return VM_FAULT_SIGSEGV; if (pte_marker_entry_uffd_wp(entry)) return pte_marker_handle_uffd_wp(vmf); /* This is an unknown pte marker */ return VM_FAULT_SIGBUS; } static struct folio *__alloc_swap_folio(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; struct folio *folio; swp_entry_t entry; folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, vmf->address); if (!folio) return NULL; entry = pte_to_swp_entry(vmf->orig_pte); if (mem_cgroup_swapin_charge_folio(folio, vma->vm_mm, GFP_KERNEL, entry)) { folio_put(folio); return NULL; } return folio; } #ifdef CONFIG_TRANSPARENT_HUGEPAGE static inline int non_swapcache_batch(swp_entry_t entry, int max_nr) { struct swap_info_struct *si = swp_swap_info(entry); pgoff_t offset = swp_offset(entry); int i; /* * While allocating a large folio and doing swap_read_folio, which is * the case the being faulted pte doesn't have swapcache. We need to * ensure all PTEs have no cache as well, otherwise, we might go to * swap devices while the content is in swapcache. */ for (i = 0; i < max_nr; i++) { if ((si->swap_map[offset + i] & SWAP_HAS_CACHE)) return i; } return i; } /* * Check if the PTEs within a range are contiguous swap entries * and have consistent swapcache, zeromap. */ static bool can_swapin_thp(struct vm_fault *vmf, pte_t *ptep, int nr_pages) { unsigned long addr; swp_entry_t entry; int idx; pte_t pte; addr = ALIGN_DOWN(vmf->address, nr_pages * PAGE_SIZE); idx = (vmf->address - addr) / PAGE_SIZE; pte = ptep_get(ptep); if (!pte_same(pte, pte_move_swp_offset(vmf->orig_pte, -idx))) return false; entry = pte_to_swp_entry(pte); if (swap_pte_batch(ptep, nr_pages, pte) != nr_pages) return false; /* * swap_read_folio() can't handle the case a large folio is hybridly * from different backends. And they are likely corner cases. Similar * things might be added once zswap support large folios. */ if (unlikely(swap_zeromap_batch(entry, nr_pages, NULL) != nr_pages)) return false; if (unlikely(non_swapcache_batch(entry, nr_pages) != nr_pages)) return false; return true; } static inline unsigned long thp_swap_suitable_orders(pgoff_t swp_offset, unsigned long addr, unsigned long orders) { int order, nr; order = highest_order(orders); /* * To swap in a THP with nr pages, we require that its first swap_offset * is aligned with that number, as it was when the THP was swapped out. * This helps filter out most invalid entries. */ while (orders) { nr = 1 << order; if ((addr >> PAGE_SHIFT) % nr == swp_offset % nr) break; order = next_order(&orders, order); } return orders; } static struct folio *alloc_swap_folio(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; unsigned long orders; struct folio *folio; unsigned long addr; swp_entry_t entry; spinlock_t *ptl; pte_t *pte; gfp_t gfp; int order; /* * If uffd is active for the vma we need per-page fault fidelity to * maintain the uffd semantics. */ if (unlikely(userfaultfd_armed(vma))) goto fallback; /* * A large swapped out folio could be partially or fully in zswap. We * lack handling for such cases, so fallback to swapping in order-0 * folio. */ if (!zswap_never_enabled()) goto fallback; entry = pte_to_swp_entry(vmf->orig_pte); /* * Get a list of all the (large) orders below PMD_ORDER that are enabled * and suitable for swapping THP. */ orders = thp_vma_allowable_orders(vma, vma->vm_flags, TVA_IN_PF | TVA_ENFORCE_SYSFS, BIT(PMD_ORDER) - 1); orders = thp_vma_suitable_orders(vma, vmf->address, orders); orders = thp_swap_suitable_orders(swp_offset(entry), vmf->address, orders); if (!orders) goto fallback; pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address & PMD_MASK, &ptl); if (unlikely(!pte)) goto fallback; /* * For do_swap_page, find the highest order where the aligned range is * completely swap entries with contiguous swap offsets. */ order = highest_order(orders); while (orders) { addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order); if (can_swapin_thp(vmf, pte + pte_index(addr), 1 << order)) break; order = next_order(&orders, order); } pte_unmap_unlock(pte, ptl); /* Try allocating the highest of the remaining orders. */ gfp = vma_thp_gfp_mask(vma); while (orders) { addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order); folio = vma_alloc_folio(gfp, order, vma, addr); if (folio) { if (!mem_cgroup_swapin_charge_folio(folio, vma->vm_mm, gfp, entry)) return folio; count_mthp_stat(order, MTHP_STAT_SWPIN_FALLBACK_CHARGE); folio_put(folio); } count_mthp_stat(order, MTHP_STAT_SWPIN_FALLBACK); order = next_order(&orders, order); } fallback: return __alloc_swap_folio(vmf); } #else /* !CONFIG_TRANSPARENT_HUGEPAGE */ static struct folio *alloc_swap_folio(struct vm_fault *vmf) { return __alloc_swap_folio(vmf); } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ static DECLARE_WAIT_QUEUE_HEAD(swapcache_wq); /* * We enter with non-exclusive mmap_lock (to exclude vma changes, * but allow concurrent faults), and pte mapped but not yet locked. * We return with pte unmapped and unlocked. * * We return with the mmap_lock locked or unlocked in the same cases * as does filemap_fault(). */ vm_fault_t do_swap_page(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; struct folio *swapcache, *folio = NULL; DECLARE_WAITQUEUE(wait, current); struct page *page; struct swap_info_struct *si = NULL; rmap_t rmap_flags = RMAP_NONE; bool need_clear_cache = false; bool exclusive = false; swp_entry_t entry; pte_t pte; vm_fault_t ret = 0; void *shadow = NULL; int nr_pages; unsigned long page_idx; unsigned long address; pte_t *ptep; if (!pte_unmap_same(vmf)) goto out; entry = pte_to_swp_entry(vmf->orig_pte); if (unlikely(non_swap_entry(entry))) { if (is_migration_entry(entry)) { migration_entry_wait(vma->vm_mm, vmf->pmd, vmf->address); } else if (is_device_exclusive_entry(entry)) { vmf->page = pfn_swap_entry_to_page(entry); ret = remove_device_exclusive_entry(vmf); } else if (is_device_private_entry(entry)) { if (vmf->flags & FAULT_FLAG_VMA_LOCK) { /* * migrate_to_ram is not yet ready to operate * under VMA lock. */ vma_end_read(vma); ret = VM_FAULT_RETRY; goto out; } vmf->page = pfn_swap_entry_to_page(entry); vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); if (unlikely(!vmf->pte || !pte_same(ptep_get(vmf->pte), vmf->orig_pte))) goto unlock; /* * Get a page reference while we know the page can't be * freed. */ if (trylock_page(vmf->page)) { struct dev_pagemap *pgmap; get_page(vmf->page); pte_unmap_unlock(vmf->pte, vmf->ptl); pgmap = page_pgmap(vmf->page); ret = pgmap->ops->migrate_to_ram(vmf); unlock_page(vmf->page); put_page(vmf->page); } else { pte_unmap_unlock(vmf->pte, vmf->ptl); } } else if (is_hwpoison_entry(entry)) { ret = VM_FAULT_HWPOISON; } else if (is_pte_marker_entry(entry)) { ret = handle_pte_marker(vmf); } else { print_bad_pte(vma, vmf->address, vmf->orig_pte, NULL); ret = VM_FAULT_SIGBUS; } goto out; } /* Prevent swapoff from happening to us. */ si = get_swap_device(entry); if (unlikely(!si)) goto out; folio = swap_cache_get_folio(entry, vma, vmf->address); if (folio) page = folio_file_page(folio, swp_offset(entry)); swapcache = folio; if (!folio) { if (data_race(si->flags & SWP_SYNCHRONOUS_IO) && __swap_count(entry) == 1) { /* skip swapcache */ folio = alloc_swap_folio(vmf); if (folio) { __folio_set_locked(folio); __folio_set_swapbacked(folio); nr_pages = folio_nr_pages(folio); if (folio_test_large(folio)) entry.val = ALIGN_DOWN(entry.val, nr_pages); /* * Prevent parallel swapin from proceeding with * the cache flag. Otherwise, another thread * may finish swapin first, free the entry, and * swapout reusing the same entry. It's * undetectable as pte_same() returns true due * to entry reuse. */ if (swapcache_prepare(entry, nr_pages)) { /* * Relax a bit to prevent rapid * repeated page faults. */ add_wait_queue(&swapcache_wq, &wait); schedule_timeout_uninterruptible(1); remove_wait_queue(&swapcache_wq, &wait); goto out_page; } need_clear_cache = true; memcg1_swapin(entry, nr_pages); shadow = get_shadow_from_swap_cache(entry); if (shadow) workingset_refault(folio, shadow); folio_add_lru(folio); /* To provide entry to swap_read_folio() */ folio->swap = entry; swap_read_folio(folio, NULL); folio->private = NULL; } } else { folio = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, vmf); swapcache = folio; } if (!folio) { /* * Back out if somebody else faulted in this pte * while we released the pte lock. */ vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); if (likely(vmf->pte && pte_same(ptep_get(vmf->pte), vmf->orig_pte))) ret = VM_FAULT_OOM; goto unlock; } /* Had to read the page from swap area: Major fault */ ret = VM_FAULT_MAJOR; count_vm_event(PGMAJFAULT); count_memcg_event_mm(vma->vm_mm, PGMAJFAULT); page = folio_file_page(folio, swp_offset(entry)); } else if (PageHWPoison(page)) { /* * hwpoisoned dirty swapcache pages are kept for killing * owner processes (which may be unknown at hwpoison time) */ ret = VM_FAULT_HWPOISON; goto out_release; } ret |= folio_lock_or_retry(folio, vmf); if (ret & VM_FAULT_RETRY) goto out_release; if (swapcache) { /* * Make sure folio_free_swap() or swapoff did not release the * swapcache from under us. The page pin, and pte_same test * below, are not enough to exclude that. Even if it is still * swapcache, we need to check that the page's swap has not * changed. */ if (unlikely(!folio_test_swapcache(folio) || page_swap_entry(page).val != entry.val)) goto out_page; /* * KSM sometimes has to copy on read faults, for example, if * page->index of !PageKSM() pages would be nonlinear inside the * anon VMA -- PageKSM() is lost on actual swapout. */ folio = ksm_might_need_to_copy(folio, vma, vmf->address); if (unlikely(!folio)) { ret = VM_FAULT_OOM; folio = swapcache; goto out_page; } else if (unlikely(folio == ERR_PTR(-EHWPOISON))) { ret = VM_FAULT_HWPOISON; folio = swapcache; goto out_page; } if (folio != swapcache) page = folio_page(folio, 0); /* * If we want to map a page that's in the swapcache writable, we * have to detect via the refcount if we're really the exclusive * owner. Try removing the extra reference from the local LRU * caches if required. */ if ((vmf->flags & FAULT_FLAG_WRITE) && folio == swapcache && !folio_test_ksm(folio) && !folio_test_lru(folio)) lru_add_drain(); } folio_throttle_swaprate(folio, GFP_KERNEL); /* * Back out if somebody else already faulted in this pte. */ vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); if (unlikely(!vmf->pte || !pte_same(ptep_get(vmf->pte), vmf->orig_pte))) goto out_nomap; if (unlikely(!folio_test_uptodate(folio))) { ret = VM_FAULT_SIGBUS; goto out_nomap; } /* allocated large folios for SWP_SYNCHRONOUS_IO */ if (folio_test_large(folio) && !folio_test_swapcache(folio)) { unsigned long nr = folio_nr_pages(folio); unsigned long folio_start = ALIGN_DOWN(vmf->address, nr * PAGE_SIZE); unsigned long idx = (vmf->address - folio_start) / PAGE_SIZE; pte_t *folio_ptep = vmf->pte - idx; pte_t folio_pte = ptep_get(folio_ptep); if (!pte_same(folio_pte, pte_move_swp_offset(vmf->orig_pte, -idx)) || swap_pte_batch(folio_ptep, nr, folio_pte) != nr) goto out_nomap; page_idx = idx; address = folio_start; ptep = folio_ptep; goto check_folio; } nr_pages = 1; page_idx = 0; address = vmf->address; ptep = vmf->pte; if (folio_test_large(folio) && folio_test_swapcache(folio)) { int nr = folio_nr_pages(folio); unsigned long idx = folio_page_idx(folio, page); unsigned long folio_start = address - idx * PAGE_SIZE; unsigned long folio_end = folio_start + nr * PAGE_SIZE; pte_t *folio_ptep; pte_t folio_pte; if (unlikely(folio_start < max(address & PMD_MASK, vma->vm_start))) goto check_folio; if (unlikely(folio_end > pmd_addr_end(address, vma->vm_end))) goto check_folio; folio_ptep = vmf->pte - idx; folio_pte = ptep_get(folio_ptep); if (!pte_same(folio_pte, pte_move_swp_offset(vmf->orig_pte, -idx)) || swap_pte_batch(folio_ptep, nr, folio_pte) != nr) goto check_folio; page_idx = idx; address = folio_start; ptep = folio_ptep; nr_pages = nr; entry = folio->swap; page = &folio->page; } check_folio: /* * PG_anon_exclusive reuses PG_mappedtodisk for anon pages. A swap pte * must never point at an anonymous page in the swapcache that is * PG_anon_exclusive. Sanity check that this holds and especially, that * no filesystem set PG_mappedtodisk on a page in the swapcache. Sanity * check after taking the PT lock and making sure that nobody * concurrently faulted in this page and set PG_anon_exclusive. */ BUG_ON(!folio_test_anon(folio) && folio_test_mappedtodisk(folio)); BUG_ON(folio_test_anon(folio) && PageAnonExclusive(page)); /* * Check under PT lock (to protect against concurrent fork() sharing * the swap entry concurrently) for certainly exclusive pages. */ if (!folio_test_ksm(folio)) { exclusive = pte_swp_exclusive(vmf->orig_pte); if (folio != swapcache) { /* * We have a fresh page that is not exposed to the * swapcache -> certainly exclusive. */ exclusive = true; } else if (exclusive && folio_test_writeback(folio) && data_race(si->flags & SWP_STABLE_WRITES)) { /* * This is tricky: not all swap backends support * concurrent page modifications while under writeback. * * So if we stumble over such a page in the swapcache * we must not set the page exclusive, otherwise we can * map it writable without further checks and modify it * while still under writeback. * * For these problematic swap backends, simply drop the * exclusive marker: this is perfectly fine as we start * writeback only if we fully unmapped the page and * there are no unexpected references on the page after * unmapping succeeded. After fully unmapped, no * further GUP references (FOLL_GET and FOLL_PIN) can * appear, so dropping the exclusive marker and mapping * it only R/O is fine. */ exclusive = false; } } /* * Some architectures may have to restore extra metadata to the page * when reading from swap. This metadata may be indexed by swap entry * so this must be called before swap_free(). */ arch_swap_restore(folio_swap(entry, folio), folio); /* * Remove the swap entry and conditionally try to free up the swapcache. * We're already holding a reference on the page but haven't mapped it * yet. */ swap_free_nr(entry, nr_pages); if (should_try_to_free_swap(folio, vma, vmf->flags)) folio_free_swap(folio); add_mm_counter(vma->vm_mm, MM_ANONPAGES, nr_pages); add_mm_counter(vma->vm_mm, MM_SWAPENTS, -nr_pages); pte = mk_pte(page, vma->vm_page_prot); if (pte_swp_soft_dirty(vmf->orig_pte)) pte = pte_mksoft_dirty(pte); if (pte_swp_uffd_wp(vmf->orig_pte)) pte = pte_mkuffd_wp(pte); /* * Same logic as in do_wp_page(); however, optimize for pages that are * certainly not shared either because we just allocated them without * exposing them to the swapcache or because the swap entry indicates * exclusivity. */ if (!folio_test_ksm(folio) && (exclusive || folio_ref_count(folio) == 1)) { if ((vma->vm_flags & VM_WRITE) && !userfaultfd_pte_wp(vma, pte) && !pte_needs_soft_dirty_wp(vma, pte)) { pte = pte_mkwrite(pte, vma); if (vmf->flags & FAULT_FLAG_WRITE) { pte = pte_mkdirty(pte); vmf->flags &= ~FAULT_FLAG_WRITE; } } rmap_flags |= RMAP_EXCLUSIVE; } folio_ref_add(folio, nr_pages - 1); flush_icache_pages(vma, page, nr_pages); vmf->orig_pte = pte_advance_pfn(pte, page_idx); /* ksm created a completely new copy */ if (unlikely(folio != swapcache && swapcache)) { folio_add_new_anon_rmap(folio, vma, address, RMAP_EXCLUSIVE); folio_add_lru_vma(folio, vma); } else if (!folio_test_anon(folio)) { /* * We currently only expect small !anon folios which are either * fully exclusive or fully shared, or new allocated large * folios which are fully exclusive. If we ever get large * folios within swapcache here, we have to be careful. */ VM_WARN_ON_ONCE(folio_test_large(folio) && folio_test_swapcache(folio)); VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio); folio_add_new_anon_rmap(folio, vma, address, rmap_flags); } else { folio_add_anon_rmap_ptes(folio, page, nr_pages, vma, address, rmap_flags); } VM_BUG_ON(!folio_test_anon(folio) || (pte_write(pte) && !PageAnonExclusive(page))); set_ptes(vma->vm_mm, address, ptep, pte, nr_pages); arch_do_swap_page_nr(vma->vm_mm, vma, address, pte, pte, nr_pages); folio_unlock(folio); if (folio != swapcache && swapcache) { /* * Hold the lock to avoid the swap entry to be reused * until we take the PT lock for the pte_same() check * (to avoid false positives from pte_same). For * further safety release the lock after the swap_free * so that the swap count won't change under a * parallel locked swapcache. */ folio_unlock(swapcache); folio_put(swapcache); } if (vmf->flags & FAULT_FLAG_WRITE) { ret |= do_wp_page(vmf); if (ret & VM_FAULT_ERROR) ret &= VM_FAULT_ERROR; goto out; } /* No need to invalidate - it was non-present before */ update_mmu_cache_range(vmf, vma, address, ptep, nr_pages); unlock: if (vmf->pte) pte_unmap_unlock(vmf->pte, vmf->ptl); out: /* Clear the swap cache pin for direct swapin after PTL unlock */ if (need_clear_cache) { swapcache_clear(si, entry, nr_pages); if (waitqueue_active(&swapcache_wq)) wake_up(&swapcache_wq); } if (si) put_swap_device(si); return ret; out_nomap: if (vmf->pte) pte_unmap_unlock(vmf->pte, vmf->ptl); out_page: folio_unlock(folio); out_release: folio_put(folio); if (folio != swapcache && swapcache) { folio_unlock(swapcache); folio_put(swapcache); } if (need_clear_cache) { swapcache_clear(si, entry, nr_pages); if (waitqueue_active(&swapcache_wq)) wake_up(&swapcache_wq); } if (si) put_swap_device(si); return ret; } static bool pte_range_none(pte_t *pte, int nr_pages) { int i; for (i = 0; i < nr_pages; i++) { if (!pte_none(ptep_get_lockless(pte + i))) return false; } return true; } static struct folio *alloc_anon_folio(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; #ifdef CONFIG_TRANSPARENT_HUGEPAGE unsigned long orders; struct folio *folio; unsigned long addr; pte_t *pte; gfp_t gfp; int order; /* * If uffd is active for the vma we need per-page fault fidelity to * maintain the uffd semantics. */ if (unlikely(userfaultfd_armed(vma))) goto fallback; /* * Get a list of all the (large) orders below PMD_ORDER that are enabled * for this vma. Then filter out the orders that can't be allocated over * the faulting address and still be fully contained in the vma. */ orders = thp_vma_allowable_orders(vma, vma->vm_flags, TVA_IN_PF | TVA_ENFORCE_SYSFS, BIT(PMD_ORDER) - 1); orders = thp_vma_suitable_orders(vma, vmf->address, orders); if (!orders) goto fallback; pte = pte_offset_map(vmf->pmd, vmf->address & PMD_MASK); if (!pte) return ERR_PTR(-EAGAIN); /* * Find the highest order where the aligned range is completely * pte_none(). Note that all remaining orders will be completely * pte_none(). */ order = highest_order(orders); while (orders) { addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order); if (pte_range_none(pte + pte_index(addr), 1 << order)) break; order = next_order(&orders, order); } pte_unmap(pte); if (!orders) goto fallback; /* Try allocating the highest of the remaining orders. */ gfp = vma_thp_gfp_mask(vma); while (orders) { addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order); folio = vma_alloc_folio(gfp, order, vma, addr); if (folio) { if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) { count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE); folio_put(folio); goto next; } folio_throttle_swaprate(folio, gfp); /* * When a folio is not zeroed during allocation * (__GFP_ZERO not used) or user folios require special * handling, folio_zero_user() is used to make sure * that the page corresponding to the faulting address * will be hot in the cache after zeroing. */ if (user_alloc_needs_zeroing()) folio_zero_user(folio, vmf->address); return folio; } next: count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK); order = next_order(&orders, order); } fallback: #endif return folio_prealloc(vma->vm_mm, vma, vmf->address, true); } /* * We enter with non-exclusive mmap_lock (to exclude vma changes, * but allow concurrent faults), and pte mapped but not yet locked. * We return with mmap_lock still held, but pte unmapped and unlocked. */ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; unsigned long addr = vmf->address; struct folio *folio; vm_fault_t ret = 0; int nr_pages = 1; pte_t entry; /* File mapping without ->vm_ops ? */ if (vma->vm_flags & VM_SHARED) return VM_FAULT_SIGBUS; /* * Use pte_alloc() instead of pte_alloc_map(), so that OOM can * be distinguished from a transient failure of pte_offset_map(). */ if (pte_alloc(vma->vm_mm, vmf->pmd)) return VM_FAULT_OOM; /* Use the zero-page for reads */ if (!(vmf->flags & FAULT_FLAG_WRITE) && !mm_forbids_zeropage(vma->vm_mm)) { entry = pte_mkspecial(pfn_pte(my_zero_pfn(vmf->address), vma->vm_page_prot)); vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); if (!vmf->pte) goto unlock; if (vmf_pte_changed(vmf)) { update_mmu_tlb(vma, vmf->address, vmf->pte); goto unlock; } ret = check_stable_address_space(vma->vm_mm); if (ret) goto unlock; /* Deliver the page fault to userland, check inside PT lock */ if (userfaultfd_missing(vma)) { pte_unmap_unlock(vmf->pte, vmf->ptl); return handle_userfault(vmf, VM_UFFD_MISSING); } goto setpte; } /* Allocate our own private page. */ ret = vmf_anon_prepare(vmf); if (ret) return ret; /* Returns NULL on OOM or ERR_PTR(-EAGAIN) if we must retry the fault */ folio = alloc_anon_folio(vmf); if (IS_ERR(folio)) return 0; if (!folio) goto oom; nr_pages = folio_nr_pages(folio); addr = ALIGN_DOWN(vmf->address, nr_pages * PAGE_SIZE); /* * The memory barrier inside __folio_mark_uptodate makes sure that * preceding stores to the page contents become visible before * the set_pte_at() write. */ __folio_mark_uptodate(folio); entry = mk_pte(&folio->page, vma->vm_page_prot); entry = pte_sw_mkyoung(entry); if (vma->vm_flags & VM_WRITE) entry = pte_mkwrite(pte_mkdirty(entry), vma); vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, addr, &vmf->ptl); if (!vmf->pte) goto release; if (nr_pages == 1 && vmf_pte_changed(vmf)) { update_mmu_tlb(vma, addr, vmf->pte); goto release; } else if (nr_pages > 1 && !pte_range_none(vmf->pte, nr_pages)) { update_mmu_tlb_range(vma, addr, vmf->pte, nr_pages); goto release; } ret = check_stable_address_space(vma->vm_mm); if (ret) goto release; /* Deliver the page fault to userland, check inside PT lock */ if (userfaultfd_missing(vma)) { pte_unmap_unlock(vmf->pte, vmf->ptl); folio_put(folio); return handle_userfault(vmf, VM_UFFD_MISSING); } folio_ref_add(folio, nr_pages - 1); add_mm_counter(vma->vm_mm, MM_ANONPAGES, nr_pages); count_mthp_stat(folio_order(folio), MTHP_STAT_ANON_FAULT_ALLOC); folio_add_new_anon_rmap(folio, vma, addr, RMAP_EXCLUSIVE); folio_add_lru_vma(folio, vma); setpte: if (vmf_orig_pte_uffd_wp(vmf)) entry = pte_mkuffd_wp(entry); set_ptes(vma->vm_mm, addr, vmf->pte, entry, nr_pages); /* No need to invalidate - it was non-present before */ update_mmu_cache_range(vmf, vma, addr, vmf->pte, nr_pages); unlock: if (vmf->pte) pte_unmap_unlock(vmf->pte, vmf->ptl); return ret; release: folio_put(folio); goto unlock; oom: return VM_FAULT_OOM; } /* * The mmap_lock must have been held on entry, and may have been * released depending on flags and vma->vm_ops->fault() return value. * See filemap_fault() and __lock_page_retry(). */ static vm_fault_t __do_fault(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; struct folio *folio; vm_fault_t ret; /* * Preallocate pte before we take page_lock because this might lead to * deadlocks for memcg reclaim which waits for pages under writeback: * lock_page(A) * SetPageWriteback(A) * unlock_page(A) * lock_page(B) * lock_page(B) * pte_alloc_one * shrink_folio_list * wait_on_page_writeback(A) * SetPageWriteback(B) * unlock_page(B) * # flush A, B to clear the writeback */ if (pmd_none(*vmf->pmd) && !vmf->prealloc_pte) { vmf->prealloc_pte = pte_alloc_one(vma->vm_mm); if (!vmf->prealloc_pte) return VM_FAULT_OOM; } ret = vma->vm_ops->fault(vmf); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY | VM_FAULT_DONE_COW))) return ret; folio = page_folio(vmf->page); if (unlikely(PageHWPoison(vmf->page))) { vm_fault_t poisonret = VM_FAULT_HWPOISON; if (ret & VM_FAULT_LOCKED) { if (page_mapped(vmf->page)) unmap_mapping_folio(folio); /* Retry if a clean folio was removed from the cache. */ if (mapping_evict_folio(folio->mapping, folio)) poisonret = VM_FAULT_NOPAGE; folio_unlock(folio); } folio_put(folio); vmf->page = NULL; return poisonret; } if (unlikely(!(ret & VM_FAULT_LOCKED))) folio_lock(folio); else VM_BUG_ON_PAGE(!folio_test_locked(folio), vmf->page); return ret; } #ifdef CONFIG_TRANSPARENT_HUGEPAGE static void deposit_prealloc_pte(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, vmf->prealloc_pte); /* * We are going to consume the prealloc table, * count that as nr_ptes. */ mm_inc_nr_ptes(vma->vm_mm); vmf->prealloc_pte = NULL; } vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) { struct folio *folio = page_folio(page); struct vm_area_struct *vma = vmf->vma; bool write = vmf->flags & FAULT_FLAG_WRITE; unsigned long haddr = vmf->address & HPAGE_PMD_MASK; pmd_t entry; vm_fault_t ret = VM_FAULT_FALLBACK; /* * It is too late to allocate a small folio, we already have a large * folio in the pagecache: especially s390 KVM cannot tolerate any * PMD mappings, but PTE-mapped THP are fine. So let's simply refuse any * PMD mappings if THPs are disabled. */ if (thp_disabled_by_hw() || vma_thp_disabled(vma, vma->vm_flags)) return ret; if (!thp_vma_suitable_order(vma, haddr, PMD_ORDER)) return ret; if (folio_order(folio) != HPAGE_PMD_ORDER) return ret; page = &folio->page; /* * Just backoff if any subpage of a THP is corrupted otherwise * the corrupted page may mapped by PMD silently to escape the * check. This kind of THP just can be PTE mapped. Access to * the corrupted subpage should trigger SIGBUS as expected. */ if (unlikely(folio_test_has_hwpoisoned(folio))) return ret; /* * Archs like ppc64 need additional space to store information * related to pte entry. Use the preallocated table for that. */ if (arch_needs_pgtable_deposit() && !vmf->prealloc_pte) { vmf->prealloc_pte = pte_alloc_one(vma->vm_mm); if (!vmf->prealloc_pte) return VM_FAULT_OOM; } vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); if (unlikely(!pmd_none(*vmf->pmd))) goto out; flush_icache_pages(vma, page, HPAGE_PMD_NR); entry = mk_huge_pmd(page, vma->vm_page_prot); if (write) entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); add_mm_counter(vma->vm_mm, mm_counter_file(folio), HPAGE_PMD_NR); folio_add_file_rmap_pmd(folio, page, vma); /* * deposit and withdraw with pmd lock held */ if (arch_needs_pgtable_deposit()) deposit_prealloc_pte(vmf); set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry); update_mmu_cache_pmd(vma, haddr, vmf->pmd); /* fault is handled */ ret = 0; count_vm_event(THP_FILE_MAPPED); out: spin_unlock(vmf->ptl); return ret; } #else vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) { return VM_FAULT_FALLBACK; } #endif /** * set_pte_range - Set a range of PTEs to point to pages in a folio. * @vmf: Fault decription. * @folio: The folio that contains @page. * @page: The first page to create a PTE for. * @nr: The number of PTEs to create. * @addr: The first address to create a PTE for. */ void set_pte_range(struct vm_fault *vmf, struct folio *folio, struct page *page, unsigned int nr, unsigned long addr) { struct vm_area_struct *vma = vmf->vma; bool write = vmf->flags & FAULT_FLAG_WRITE; bool prefault = !in_range(vmf->address, addr, nr * PAGE_SIZE); pte_t entry; flush_icache_pages(vma, page, nr); entry = mk_pte(page, vma->vm_page_prot); if (prefault && arch_wants_old_prefaulted_pte()) entry = pte_mkold(entry); else entry = pte_sw_mkyoung(entry); if (write) entry = maybe_mkwrite(pte_mkdirty(entry), vma); if (unlikely(vmf_orig_pte_uffd_wp(vmf))) entry = pte_mkuffd_wp(entry); /* copy-on-write page */ if (write && !(vma->vm_flags & VM_SHARED)) { VM_BUG_ON_FOLIO(nr != 1, folio); folio_add_new_anon_rmap(folio, vma, addr, RMAP_EXCLUSIVE); folio_add_lru_vma(folio, vma); } else { folio_add_file_rmap_ptes(folio, page, nr, vma); } set_ptes(vma->vm_mm, addr, vmf->pte, entry, nr); /* no need to invalidate: a not-present page won't be cached */ update_mmu_cache_range(vmf, vma, addr, vmf->pte, nr); } static bool vmf_pte_changed(struct vm_fault *vmf) { if (vmf->flags & FAULT_FLAG_ORIG_PTE_VALID) return !pte_same(ptep_get(vmf->pte), vmf->orig_pte); return !pte_none(ptep_get(vmf->pte)); } /** * finish_fault - finish page fault once we have prepared the page to fault * * @vmf: structure describing the fault * * This function handles all that is needed to finish a page fault once the * page to fault in is prepared. It handles locking of PTEs, inserts PTE for * given page, adds reverse page mapping, handles memcg charges and LRU * addition. * * The function expects the page to be locked and on success it consumes a * reference of a page being mapped (for the PTE which maps it). * * Return: %0 on success, %VM_FAULT_ code in case of error. */ vm_fault_t finish_fault(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; struct page *page; struct folio *folio; vm_fault_t ret; bool is_cow = (vmf->flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED); int type, nr_pages; unsigned long addr; bool needs_fallback = false; fallback: addr = vmf->address; /* Did we COW the page? */ if (is_cow) page = vmf->cow_page; else page = vmf->page; /* * check even for read faults because we might have lost our CoWed * page */ if (!(vma->vm_flags & VM_SHARED)) { ret = check_stable_address_space(vma->vm_mm); if (ret) return ret; } if (pmd_none(*vmf->pmd)) { if (PageTransCompound(page)) { ret = do_set_pmd(vmf, page); if (ret != VM_FAULT_FALLBACK) return ret; } if (vmf->prealloc_pte) pmd_install(vma->vm_mm, vmf->pmd, &vmf->prealloc_pte); else if (unlikely(pte_alloc(vma->vm_mm, vmf->pmd))) return VM_FAULT_OOM; } folio = page_folio(page); nr_pages = folio_nr_pages(folio); /* * Using per-page fault to maintain the uffd semantics, and same * approach also applies to non-anonymous-shmem faults to avoid * inflating the RSS of the process. */ if (!vma_is_anon_shmem(vma) || unlikely(userfaultfd_armed(vma)) || unlikely(needs_fallback)) { nr_pages = 1; } else if (nr_pages > 1) { pgoff_t idx = folio_page_idx(folio, page); /* The page offset of vmf->address within the VMA. */ pgoff_t vma_off = vmf->pgoff - vmf->vma->vm_pgoff; /* The index of the entry in the pagetable for fault page. */ pgoff_t pte_off = pte_index(vmf->address); /* * Fallback to per-page fault in case the folio size in page * cache beyond the VMA limits and PMD pagetable limits. */ if (unlikely(vma_off < idx || vma_off + (nr_pages - idx) > vma_pages(vma) || pte_off < idx || pte_off + (nr_pages - idx) > PTRS_PER_PTE)) { nr_pages = 1; } else { /* Now we can set mappings for the whole large folio. */ addr = vmf->address - idx * PAGE_SIZE; page = &folio->page; } } vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, addr, &vmf->ptl); if (!vmf->pte) return VM_FAULT_NOPAGE; /* Re-check under ptl */ if (nr_pages == 1 && unlikely(vmf_pte_changed(vmf))) { update_mmu_tlb(vma, addr, vmf->pte); ret = VM_FAULT_NOPAGE; goto unlock; } else if (nr_pages > 1 && !pte_range_none(vmf->pte, nr_pages)) { needs_fallback = true; pte_unmap_unlock(vmf->pte, vmf->ptl); goto fallback; } folio_ref_add(folio, nr_pages - 1); set_pte_range(vmf, folio, page, nr_pages, addr); type = is_cow ? MM_ANONPAGES : mm_counter_file(folio); add_mm_counter(vma->vm_mm, type, nr_pages); ret = 0; unlock: pte_unmap_unlock(vmf->pte, vmf->ptl); return ret; } static unsigned long fault_around_pages __read_mostly = 65536 >> PAGE_SHIFT; #ifdef CONFIG_DEBUG_FS static int fault_around_bytes_get(void *data, u64 *val) { *val = fault_around_pages << PAGE_SHIFT; return 0; } /* * fault_around_bytes must be rounded down to the nearest page order as it's * what do_fault_around() expects to see. */ static int fault_around_bytes_set(void *data, u64 val) { if (val / PAGE_SIZE > PTRS_PER_PTE) return -EINVAL; /* * The minimum value is 1 page, however this results in no fault-around * at all. See should_fault_around(). */ val = max(val, PAGE_SIZE); fault_around_pages = rounddown_pow_of_two(val) >> PAGE_SHIFT; return 0; } DEFINE_DEBUGFS_ATTRIBUTE(fault_around_bytes_fops, fault_around_bytes_get, fault_around_bytes_set, "%llu\n"); static int __init fault_around_debugfs(void) { debugfs_create_file_unsafe("fault_around_bytes", 0644, NULL, NULL, &fault_around_bytes_fops); return 0; } late_initcall(fault_around_debugfs); #endif /* * do_fault_around() tries to map few pages around the fault address. The hope * is that the pages will be needed soon and this will lower the number of * faults to handle. * * It uses vm_ops->map_pages() to map the pages, which skips the page if it's * not ready to be mapped: not up-to-date, locked, etc. * * This function doesn't cross VMA or page table boundaries, in order to call * map_pages() and acquire a PTE lock only once. * * fault_around_pages defines how many pages we'll try to map. * do_fault_around() expects it to be set to a power of two less than or equal * to PTRS_PER_PTE. * * The virtual address of the area that we map is naturally aligned to * fault_around_pages * PAGE_SIZE rounded down to the machine page size * (and therefore to page order). This way it's easier to guarantee * that we don't cross page table boundaries. */ static vm_fault_t do_fault_around(struct vm_fault *vmf) { pgoff_t nr_pages = READ_ONCE(fault_around_pages); pgoff_t pte_off = pte_index(vmf->address); /* The page offset of vmf->address within the VMA. */ pgoff_t vma_off = vmf->pgoff - vmf->vma->vm_pgoff; pgoff_t from_pte, to_pte; vm_fault_t ret; /* The PTE offset of the start address, clamped to the VMA. */ from_pte = max(ALIGN_DOWN(pte_off, nr_pages), pte_off - min(pte_off, vma_off)); /* The PTE offset of the end address, clamped to the VMA and PTE. */ to_pte = min3(from_pte + nr_pages, (pgoff_t)PTRS_PER_PTE, pte_off + vma_pages(vmf->vma) - vma_off) - 1; if (pmd_none(*vmf->pmd)) { vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm); if (!vmf->prealloc_pte) return VM_FAULT_OOM; } rcu_read_lock(); ret = vmf->vma->vm_ops->map_pages(vmf, vmf->pgoff + from_pte - pte_off, vmf->pgoff + to_pte - pte_off); rcu_read_unlock(); return ret; } /* Return true if we should do read fault-around, false otherwise */ static inline bool should_fault_around(struct vm_fault *vmf) { /* No ->map_pages? No way to fault around... */ if (!vmf->vma->vm_ops->map_pages) return false; if (uffd_disable_fault_around(vmf->vma)) return false; /* A single page implies no faulting 'around' at all. */ return fault_around_pages > 1; } static vm_fault_t do_read_fault(struct vm_fault *vmf) { vm_fault_t ret = 0; struct folio *folio; /* * Let's call ->map_pages() first and use ->fault() as fallback * if page by the offset is not ready to be mapped (cold cache or * something). */ if (should_fault_around(vmf)) { ret = do_fault_around(vmf); if (ret) return ret; } ret = vmf_can_call_fault(vmf); if (ret) return ret; ret = __do_fault(vmf); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) return ret; ret |= finish_fault(vmf); folio = page_folio(vmf->page); folio_unlock(folio); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) folio_put(folio); return ret; } static vm_fault_t do_cow_fault(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; struct folio *folio; vm_fault_t ret; ret = vmf_can_call_fault(vmf); if (!ret) ret = vmf_anon_prepare(vmf); if (ret) return ret; folio = folio_prealloc(vma->vm_mm, vma, vmf->address, false); if (!folio) return VM_FAULT_OOM; vmf->cow_page = &folio->page; ret = __do_fault(vmf); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) goto uncharge_out; if (ret & VM_FAULT_DONE_COW) return ret; if (copy_mc_user_highpage(vmf->cow_page, vmf->page, vmf->address, vma)) { ret = VM_FAULT_HWPOISON; goto unlock; } __folio_mark_uptodate(folio); ret |= finish_fault(vmf); unlock: unlock_page(vmf->page); put_page(vmf->page); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) goto uncharge_out; return ret; uncharge_out: folio_put(folio); return ret; } static vm_fault_t do_shared_fault(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; vm_fault_t ret, tmp; struct folio *folio; ret = vmf_can_call_fault(vmf); if (ret) return ret; ret = __do_fault(vmf); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) return ret; folio = page_folio(vmf->page); /* * Check if the backing address space wants to know that the page is * about to become writable */ if (vma->vm_ops->page_mkwrite) { folio_unlock(folio); tmp = do_page_mkwrite(vmf, folio); if (unlikely(!tmp || (tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) { folio_put(folio); return tmp; } } ret |= finish_fault(vmf); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) { folio_unlock(folio); folio_put(folio); return ret; } ret |= fault_dirty_shared_page(vmf); return ret; } /* * We enter with non-exclusive mmap_lock (to exclude vma changes, * but allow concurrent faults). * The mmap_lock may have been released depending on flags and our * return value. See filemap_fault() and __folio_lock_or_retry(). * If mmap_lock is released, vma may become invalid (for example * by other thread calling munmap()). */ static vm_fault_t do_fault(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; struct mm_struct *vm_mm = vma->vm_mm; vm_fault_t ret; /* * The VMA was not fully populated on mmap() or missing VM_DONTEXPAND */ if (!vma->vm_ops->fault) { vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); if (unlikely(!vmf->pte)) ret = VM_FAULT_SIGBUS; else { /* * Make sure this is not a temporary clearing of pte * by holding ptl and checking again. A R/M/W update * of pte involves: take ptl, clearing the pte so that * we don't have concurrent modification by hardware * followed by an update. */ if (unlikely(pte_none(ptep_get(vmf->pte)))) ret = VM_FAULT_SIGBUS; else ret = VM_FAULT_NOPAGE; pte_unmap_unlock(vmf->pte, vmf->ptl); } } else if (!(vmf->flags & FAULT_FLAG_WRITE)) ret = do_read_fault(vmf); else if (!(vma->vm_flags & VM_SHARED)) ret = do_cow_fault(vmf); else ret = do_shared_fault(vmf); /* preallocated pagetable is unused: free it */ if (vmf->prealloc_pte) { pte_free(vm_mm, vmf->prealloc_pte); vmf->prealloc_pte = NULL; } return ret; } int numa_migrate_check(struct folio *folio, struct vm_fault *vmf, unsigned long addr, int *flags, bool writable, int *last_cpupid) { struct vm_area_struct *vma = vmf->vma; /* * Avoid grouping on RO pages in general. RO pages shouldn't hurt as * much anyway since they can be in shared cache state. This misses * the case where a mapping is writable but the process never writes * to it but pte_write gets cleared during protection updates and * pte_dirty has unpredictable behaviour between PTE scan updates, * background writeback, dirty balancing and application behaviour. */ if (!writable) *flags |= TNF_NO_GROUP; /* * Flag if the folio is shared between multiple address spaces. This * is later used when determining whether to group tasks together */ if (folio_maybe_mapped_shared(folio) && (vma->vm_flags & VM_SHARED)) *flags |= TNF_SHARED; /* * For memory tiering mode, cpupid of slow memory page is used * to record page access time. So use default value. */ if (folio_use_access_time(folio)) *last_cpupid = (-1 & LAST_CPUPID_MASK); else *last_cpupid = folio_last_cpupid(folio); /* Record the current PID acceesing VMA */ vma_set_access_pid_bit(vma); count_vm_numa_event(NUMA_HINT_FAULTS); #ifdef CONFIG_NUMA_BALANCING count_memcg_folio_events(folio, NUMA_HINT_FAULTS, 1); #endif if (folio_nid(folio) == numa_node_id()) { count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL); *flags |= TNF_FAULT_LOCAL; } return mpol_misplaced(folio, vmf, addr); } static void numa_rebuild_single_mapping(struct vm_fault *vmf, struct vm_area_struct *vma, unsigned long fault_addr, pte_t *fault_pte, bool writable) { pte_t pte, old_pte; old_pte = ptep_modify_prot_start(vma, fault_addr, fault_pte); pte = pte_modify(old_pte, vma->vm_page_prot); pte = pte_mkyoung(pte); if (writable) pte = pte_mkwrite(pte, vma); ptep_modify_prot_commit(vma, fault_addr, fault_pte, old_pte, pte); update_mmu_cache_range(vmf, vma, fault_addr, fault_pte, 1); } static void numa_rebuild_large_mapping(struct vm_fault *vmf, struct vm_area_struct *vma, struct folio *folio, pte_t fault_pte, bool ignore_writable, bool pte_write_upgrade) { int nr = pte_pfn(fault_pte) - folio_pfn(folio); unsigned long start, end, addr = vmf->address; unsigned long addr_start = addr - (nr << PAGE_SHIFT); unsigned long pt_start = ALIGN_DOWN(addr, PMD_SIZE); pte_t *start_ptep; /* Stay within the VMA and within the page table. */ start = max3(addr_start, pt_start, vma->vm_start); end = min3(addr_start + folio_size(folio), pt_start + PMD_SIZE, vma->vm_end); start_ptep = vmf->pte - ((addr - start) >> PAGE_SHIFT); /* Restore all PTEs' mapping of the large folio */ for (addr = start; addr != end; start_ptep++, addr += PAGE_SIZE) { pte_t ptent = ptep_get(start_ptep); bool writable = false; if (!pte_present(ptent) || !pte_protnone(ptent)) continue; if (pfn_folio(pte_pfn(ptent)) != folio) continue; if (!ignore_writable) { ptent = pte_modify(ptent, vma->vm_page_prot); writable = pte_write(ptent); if (!writable && pte_write_upgrade && can_change_pte_writable(vma, addr, ptent)) writable = true; } numa_rebuild_single_mapping(vmf, vma, addr, start_ptep, writable); } } static vm_fault_t do_numa_page(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; struct folio *folio = NULL; int nid = NUMA_NO_NODE; bool writable = false, ignore_writable = false; bool pte_write_upgrade = vma_wants_manual_pte_write_upgrade(vma); int last_cpupid; int target_nid; pte_t pte, old_pte; int flags = 0, nr_pages; /* * The pte cannot be used safely until we verify, while holding the page * table lock, that its contents have not changed during fault handling. */ spin_lock(vmf->ptl); /* Read the live PTE from the page tables: */ old_pte = ptep_get(vmf->pte); if (unlikely(!pte_same(old_pte, vmf->orig_pte))) { pte_unmap_unlock(vmf->pte, vmf->ptl); return 0; } pte = pte_modify(old_pte, vma->vm_page_prot); /* * Detect now whether the PTE could be writable; this information * is only valid while holding the PT lock. */ writable = pte_write(pte); if (!writable && pte_write_upgrade && can_change_pte_writable(vma, vmf->address, pte)) writable = true; folio = vm_normal_folio(vma, vmf->address, pte); if (!folio || folio_is_zone_device(folio)) goto out_map; nid = folio_nid(folio); nr_pages = folio_nr_pages(folio); target_nid = numa_migrate_check(folio, vmf, vmf->address, &flags, writable, &last_cpupid); if (target_nid == NUMA_NO_NODE) goto out_map; if (migrate_misplaced_folio_prepare(folio, vma, target_nid)) { flags |= TNF_MIGRATE_FAIL; goto out_map; } /* The folio is isolated and isolation code holds a folio reference. */ pte_unmap_unlock(vmf->pte, vmf->ptl); writable = false; ignore_writable = true; /* Migrate to the requested node */ if (!migrate_misplaced_folio(folio, target_nid)) { nid = target_nid; flags |= TNF_MIGRATED; task_numa_fault(last_cpupid, nid, nr_pages, flags); return 0; } flags |= TNF_MIGRATE_FAIL; vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); if (unlikely(!vmf->pte)) return 0; if (unlikely(!pte_same(ptep_get(vmf->pte), vmf->orig_pte))) { pte_unmap_unlock(vmf->pte, vmf->ptl); return 0; } out_map: /* * Make it present again, depending on how arch implements * non-accessible ptes, some can allow access by kernel mode. */ if (folio && folio_test_large(folio)) numa_rebuild_large_mapping(vmf, vma, folio, pte, ignore_writable, pte_write_upgrade); else numa_rebuild_single_mapping(vmf, vma, vmf->address, vmf->pte, writable); pte_unmap_unlock(vmf->pte, vmf->ptl); if (nid != NUMA_NO_NODE) task_numa_fault(last_cpupid, nid, nr_pages, flags); return 0; } static inline vm_fault_t create_huge_pmd(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; if (vma_is_anonymous(vma)) return do_huge_pmd_anonymous_page(vmf); if (vma->vm_ops->huge_fault) return vma->vm_ops->huge_fault(vmf, PMD_ORDER); return VM_FAULT_FALLBACK; } /* `inline' is required to avoid gcc 4.1.2 build error */ static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE; vm_fault_t ret; if (vma_is_anonymous(vma)) { if (likely(!unshare) && userfaultfd_huge_pmd_wp(vma, vmf->orig_pmd)) { if (userfaultfd_wp_async(vmf->vma)) goto split; return handle_userfault(vmf, VM_UFFD_WP); } return do_huge_pmd_wp_page(vmf); } if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) { if (vma->vm_ops->huge_fault) { ret = vma->vm_ops->huge_fault(vmf, PMD_ORDER); if (!(ret & VM_FAULT_FALLBACK)) return ret; } } split: /* COW or write-notify handled on pte level: split pmd. */ __split_huge_pmd(vma, vmf->pmd, vmf->address, false, NULL); return VM_FAULT_FALLBACK; } static vm_fault_t create_huge_pud(struct vm_fault *vmf) { #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \ defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) struct vm_area_struct *vma = vmf->vma; /* No support for anonymous transparent PUD pages yet */ if (vma_is_anonymous(vma)) return VM_FAULT_FALLBACK; if (vma->vm_ops->huge_fault) return vma->vm_ops->huge_fault(vmf, PUD_ORDER); #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ return VM_FAULT_FALLBACK; } static vm_fault_t wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud) { #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \ defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) struct vm_area_struct *vma = vmf->vma; vm_fault_t ret; /* No support for anonymous transparent PUD pages yet */ if (vma_is_anonymous(vma)) goto split; if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) { if (vma->vm_ops->huge_fault) { ret = vma->vm_ops->huge_fault(vmf, PUD_ORDER); if (!(ret & VM_FAULT_FALLBACK)) return ret; } } split: /* COW or write-notify not handled on PUD level: split pud.*/ __split_huge_pud(vma, vmf->pud, vmf->address); #endif /* CONFIG_TRANSPARENT_HUGEPAGE && CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ return VM_FAULT_FALLBACK; } /* * These routines also need to handle stuff like marking pages dirty * and/or accessed for architectures that don't do it in hardware (most * RISC architectures). The early dirtying is also good on the i386. * * There is also a hook called "update_mmu_cache()" that architectures * with external mmu caches can use to update those (ie the Sparc or * PowerPC hashed page tables that act as extended TLBs). * * We enter with non-exclusive mmap_lock (to exclude vma changes, but allow * concurrent faults). * * The mmap_lock may have been released depending on flags and our return value. * See filemap_fault() and __folio_lock_or_retry(). */ static vm_fault_t handle_pte_fault(struct vm_fault *vmf) { pte_t entry; if (unlikely(pmd_none(*vmf->pmd))) { /* * Leave __pte_alloc() until later: because vm_ops->fault may * want to allocate huge page, and if we expose page table * for an instant, it will be difficult to retract from * concurrent faults and from rmap lookups. */ vmf->pte = NULL; vmf->flags &= ~FAULT_FLAG_ORIG_PTE_VALID; } else { pmd_t dummy_pmdval; /* * A regular pmd is established and it can't morph into a huge * pmd by anon khugepaged, since that takes mmap_lock in write * mode; but shmem or file collapse to THP could still morph * it into a huge pmd: just retry later if so. * * Use the maywrite version to indicate that vmf->pte may be * modified, but since we will use pte_same() to detect the * change of the !pte_none() entry, there is no need to recheck * the pmdval. Here we chooes to pass a dummy variable instead * of NULL, which helps new user think about why this place is * special. */ vmf->pte = pte_offset_map_rw_nolock(vmf->vma->vm_mm, vmf->pmd, vmf->address, &dummy_pmdval, &vmf->ptl); if (unlikely(!vmf->pte)) return 0; vmf->orig_pte = ptep_get_lockless(vmf->pte); vmf->flags |= FAULT_FLAG_ORIG_PTE_VALID; if (pte_none(vmf->orig_pte)) { pte_unmap(vmf->pte); vmf->pte = NULL; } } if (!vmf->pte) return do_pte_missing(vmf); if (!pte_present(vmf->orig_pte)) return do_swap_page(vmf); if (pte_protnone(vmf->orig_pte) && vma_is_accessible(vmf->vma)) return do_numa_page(vmf); spin_lock(vmf->ptl); entry = vmf->orig_pte; if (unlikely(!pte_same(ptep_get(vmf->pte), entry))) { update_mmu_tlb(vmf->vma, vmf->address, vmf->pte); goto unlock; } if (vmf->flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) { if (!pte_write(entry)) return do_wp_page(vmf); else if (likely(vmf->flags & FAULT_FLAG_WRITE)) entry = pte_mkdirty(entry); } entry = pte_mkyoung(entry); if (ptep_set_access_flags(vmf->vma, vmf->address, vmf->pte, entry, vmf->flags & FAULT_FLAG_WRITE)) { update_mmu_cache_range(vmf, vmf->vma, vmf->address, vmf->pte, 1); } else { /* Skip spurious TLB flush for retried page fault */ if (vmf->flags & FAULT_FLAG_TRIED) goto unlock; /* * This is needed only for protection faults but the arch code * is not yet telling us if this is a protection fault or not. * This still avoids useless tlb flushes for .text page faults * with threads. */ if (vmf->flags & FAULT_FLAG_WRITE) flush_tlb_fix_spurious_fault(vmf->vma, vmf->address, vmf->pte); } unlock: pte_unmap_unlock(vmf->pte, vmf->ptl); return 0; } /* * On entry, we hold either the VMA lock or the mmap_lock * (FAULT_FLAG_VMA_LOCK tells you which). If VM_FAULT_RETRY is set in * the result, the mmap_lock is not held on exit. See filemap_fault() * and __folio_lock_or_retry(). */ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma, unsigned long address, unsigned int flags) { struct vm_fault vmf = { .vma = vma, .address = address & PAGE_MASK, .real_address = address, .flags = flags, .pgoff = linear_page_index(vma, address), .gfp_mask = __get_fault_gfp_mask(vma), }; struct mm_struct *mm = vma->vm_mm; unsigned long vm_flags = vma->vm_flags; pgd_t *pgd; p4d_t *p4d; vm_fault_t ret; pgd = pgd_offset(mm, address); p4d = p4d_alloc(mm, pgd, address); if (!p4d) return VM_FAULT_OOM; vmf.pud = pud_alloc(mm, p4d, address); if (!vmf.pud) return VM_FAULT_OOM; retry_pud: if (pud_none(*vmf.pud) && thp_vma_allowable_order(vma, vm_flags, TVA_IN_PF | TVA_ENFORCE_SYSFS, PUD_ORDER)) { ret = create_huge_pud(&vmf); if (!(ret & VM_FAULT_FALLBACK)) return ret; } else { pud_t orig_pud = *vmf.pud; barrier(); if (pud_trans_huge(orig_pud) || pud_devmap(orig_pud)) { /* * TODO once we support anonymous PUDs: NUMA case and * FAULT_FLAG_UNSHARE handling. */ if ((flags & FAULT_FLAG_WRITE) && !pud_write(orig_pud)) { ret = wp_huge_pud(&vmf, orig_pud); if (!(ret & VM_FAULT_FALLBACK)) return ret; } else { huge_pud_set_accessed(&vmf, orig_pud); return 0; } } } vmf.pmd = pmd_alloc(mm, vmf.pud, address); if (!vmf.pmd) return VM_FAULT_OOM; /* Huge pud page fault raced with pmd_alloc? */ if (pud_trans_unstable(vmf.pud)) goto retry_pud; if (pmd_none(*vmf.pmd) && thp_vma_allowable_order(vma, vm_flags, TVA_IN_PF | TVA_ENFORCE_SYSFS, PMD_ORDER)) { ret = create_huge_pmd(&vmf); if (!(ret & VM_FAULT_FALLBACK)) return ret; } else { vmf.orig_pmd = pmdp_get_lockless(vmf.pmd); if (unlikely(is_swap_pmd(vmf.orig_pmd))) { VM_BUG_ON(thp_migration_supported() && !is_pmd_migration_entry(vmf.orig_pmd)); if (is_pmd_migration_entry(vmf.orig_pmd)) pmd_migration_entry_wait(mm, vmf.pmd); return 0; } if (pmd_trans_huge(vmf.orig_pmd) || pmd_devmap(vmf.orig_pmd)) { if (pmd_protnone(vmf.orig_pmd) && vma_is_accessible(vma)) return do_huge_pmd_numa_page(&vmf); if ((flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) && !pmd_write(vmf.orig_pmd)) { ret = wp_huge_pmd(&vmf); if (!(ret & VM_FAULT_FALLBACK)) return ret; } else { huge_pmd_set_accessed(&vmf); return 0; } } } return handle_pte_fault(&vmf); } /** * mm_account_fault - Do page fault accounting * @mm: mm from which memcg should be extracted. It can be NULL. * @regs: the pt_regs struct pointer. When set to NULL, will skip accounting * of perf event counters, but we'll still do the per-task accounting to * the task who triggered this page fault. * @address: the faulted address. * @flags: the fault flags. * @ret: the fault retcode. * * This will take care of most of the page fault accounting. Meanwhile, it * will also include the PERF_COUNT_SW_PAGE_FAULTS_[MAJ|MIN] perf counter * updates. However, note that the handling of PERF_COUNT_SW_PAGE_FAULTS should * still be in per-arch page fault handlers at the entry of page fault. */ static inline void mm_account_fault(struct mm_struct *mm, struct pt_regs *regs, unsigned long address, unsigned int flags, vm_fault_t ret) { bool major; /* Incomplete faults will be accounted upon completion. */ if (ret & VM_FAULT_RETRY) return; /* * To preserve the behavior of older kernels, PGFAULT counters record * both successful and failed faults, as opposed to perf counters, * which ignore failed cases. */ count_vm_event(PGFAULT); count_memcg_event_mm(mm, PGFAULT); /* * Do not account for unsuccessful faults (e.g. when the address wasn't * valid). That includes arch_vma_access_permitted() failing before * reaching here. So this is not a "this many hardware page faults" * counter. We should use the hw profiling for that. */ if (ret & VM_FAULT_ERROR) return; /* * We define the fault as a major fault when the final successful fault * is VM_FAULT_MAJOR, or if it retried (which implies that we couldn't * handle it immediately previously). */ major = (ret & VM_FAULT_MAJOR) || (flags & FAULT_FLAG_TRIED); if (major) current->maj_flt++; else current->min_flt++; /* * If the fault is done for GUP, regs will be NULL. We only do the * accounting for the per thread fault counters who triggered the * fault, and we skip the perf event updates. */ if (!regs) return; if (major) perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, regs, address); else perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, address); } #ifdef CONFIG_LRU_GEN static void lru_gen_enter_fault(struct vm_area_struct *vma) { /* the LRU algorithm only applies to accesses with recency */ current->in_lru_fault = vma_has_recency(vma); } static void lru_gen_exit_fault(void) { current->in_lru_fault = false; } #else static void lru_gen_enter_fault(struct vm_area_struct *vma) { } static void lru_gen_exit_fault(void) { } #endif /* CONFIG_LRU_GEN */ static vm_fault_t sanitize_fault_flags(struct vm_area_struct *vma, unsigned int *flags) { if (unlikely(*flags & FAULT_FLAG_UNSHARE)) { if (WARN_ON_ONCE(*flags & FAULT_FLAG_WRITE)) return VM_FAULT_SIGSEGV; /* * FAULT_FLAG_UNSHARE only applies to COW mappings. Let's * just treat it like an ordinary read-fault otherwise. */ if (!is_cow_mapping(vma->vm_flags)) *flags &= ~FAULT_FLAG_UNSHARE; } else if (*flags & FAULT_FLAG_WRITE) { /* Write faults on read-only mappings are impossible ... */ if (WARN_ON_ONCE(!(vma->vm_flags & VM_MAYWRITE))) return VM_FAULT_SIGSEGV; /* ... and FOLL_FORCE only applies to COW mappings. */ if (WARN_ON_ONCE(!(vma->vm_flags & VM_WRITE) && !is_cow_mapping(vma->vm_flags))) return VM_FAULT_SIGSEGV; } #ifdef CONFIG_PER_VMA_LOCK /* * Per-VMA locks can't be used with FAULT_FLAG_RETRY_NOWAIT because of * the assumption that lock is dropped on VM_FAULT_RETRY. */ if (WARN_ON_ONCE((*flags & (FAULT_FLAG_VMA_LOCK | FAULT_FLAG_RETRY_NOWAIT)) == (FAULT_FLAG_VMA_LOCK | FAULT_FLAG_RETRY_NOWAIT))) return VM_FAULT_SIGSEGV; #endif return 0; } /* * By the time we get here, we already hold either the VMA lock or the * mmap_lock (FAULT_FLAG_VMA_LOCK tells you which). * * The mmap_lock may have been released depending on flags and our * return value. See filemap_fault() and __folio_lock_or_retry(). */ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address, unsigned int flags, struct pt_regs *regs) { /* If the fault handler drops the mmap_lock, vma may be freed */ struct mm_struct *mm = vma->vm_mm; vm_fault_t ret; bool is_droppable; __set_current_state(TASK_RUNNING); ret = sanitize_fault_flags(vma, &flags); if (ret) goto out; if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE, flags & FAULT_FLAG_INSTRUCTION, flags & FAULT_FLAG_REMOTE)) { ret = VM_FAULT_SIGSEGV; goto out; } is_droppable = !!(vma->vm_flags & VM_DROPPABLE); /* * Enable the memcg OOM handling for faults triggered in user * space. Kernel faults are handled more gracefully. */ if (flags & FAULT_FLAG_USER) mem_cgroup_enter_user_fault(); lru_gen_enter_fault(vma); if (unlikely(is_vm_hugetlb_page(vma))) ret = hugetlb_fault(vma->vm_mm, vma, address, flags); else ret = __handle_mm_fault(vma, address, flags); /* * Warning: It is no longer safe to dereference vma-> after this point, * because mmap_lock might have been dropped by __handle_mm_fault(), so * vma might be destroyed from underneath us. */ lru_gen_exit_fault(); /* If the mapping is droppable, then errors due to OOM aren't fatal. */ if (is_droppable) ret &= ~VM_FAULT_OOM; if (flags & FAULT_FLAG_USER) { mem_cgroup_exit_user_fault(); /* * The task may have entered a memcg OOM situation but * if the allocation error was handled gracefully (no * VM_FAULT_OOM), there is no need to kill anything. * Just clean up the OOM state peacefully. */ if (task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM)) mem_cgroup_oom_synchronize(false); } out: mm_account_fault(mm, regs, address, flags, ret); return ret; } EXPORT_SYMBOL_GPL(handle_mm_fault); #ifdef CONFIG_LOCK_MM_AND_FIND_VMA #include <linux/extable.h> static inline bool get_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs) { if (likely(mmap_read_trylock(mm))) return true; if (regs && !user_mode(regs)) { unsigned long ip = exception_ip(regs); if (!search_exception_tables(ip)) return false; } return !mmap_read_lock_killable(mm); } static inline bool mmap_upgrade_trylock(struct mm_struct *mm) { /* * We don't have this operation yet. * * It should be easy enough to do: it's basically a * atomic_long_try_cmpxchg_acquire() * from RWSEM_READER_BIAS -> RWSEM_WRITER_LOCKED, but * it also needs the proper lockdep magic etc. */ return false; } static inline bool upgrade_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs) { mmap_read_unlock(mm); if (regs && !user_mode(regs)) { unsigned long ip = exception_ip(regs); if (!search_exception_tables(ip)) return false; } return !mmap_write_lock_killable(mm); } /* * Helper for page fault handling. * * This is kind of equivalent to "mmap_read_lock()" followed * by "find_extend_vma()", except it's a lot more careful about * the locking (and will drop the lock on failure). * * For example, if we have a kernel bug that causes a page * fault, we don't want to just use mmap_read_lock() to get * the mm lock, because that would deadlock if the bug were * to happen while we're holding the mm lock for writing. * * So this checks the exception tables on kernel faults in * order to only do this all for instructions that are actually * expected to fault. * * We can also actually take the mm lock for writing if we * need to extend the vma, which helps the VM layer a lot. */ struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm, unsigned long addr, struct pt_regs *regs) { struct vm_area_struct *vma; if (!get_mmap_lock_carefully(mm, regs)) return NULL; vma = find_vma(mm, addr); if (likely(vma && (vma->vm_start <= addr))) return vma; /* * Well, dang. We might still be successful, but only * if we can extend a vma to do so. */ if (!vma || !(vma->vm_flags & VM_GROWSDOWN)) { mmap_read_unlock(mm); return NULL; } /* * We can try to upgrade the mmap lock atomically, * in which case we can continue to use the vma * we already looked up. * * Otherwise we'll have to drop the mmap lock and * re-take it, and also look up the vma again, * re-checking it. */ if (!mmap_upgrade_trylock(mm)) { if (!upgrade_mmap_lock_carefully(mm, regs)) return NULL; vma = find_vma(mm, addr); if (!vma) goto fail; if (vma->vm_start <= addr) goto success; if (!(vma->vm_flags & VM_GROWSDOWN)) goto fail; } if (expand_stack_locked(vma, addr)) goto fail; success: mmap_write_downgrade(mm); return vma; fail: mmap_write_unlock(mm); return NULL; } #endif #ifdef CONFIG_PER_VMA_LOCK static inline bool __vma_enter_locked(struct vm_area_struct *vma, bool detaching) { unsigned int tgt_refcnt = VMA_LOCK_OFFSET; /* Additional refcnt if the vma is attached. */ if (!detaching) tgt_refcnt++; /* * If vma is detached then only vma_mark_attached() can raise the * vm_refcnt. mmap_write_lock prevents racing with vma_mark_attached(). */ if (!refcount_add_not_zero(VMA_LOCK_OFFSET, &vma->vm_refcnt)) return false; rwsem_acquire(&vma->vmlock_dep_map, 0, 0, _RET_IP_); rcuwait_wait_event(&vma->vm_mm->vma_writer_wait, refcount_read(&vma->vm_refcnt) == tgt_refcnt, TASK_UNINTERRUPTIBLE); lock_acquired(&vma->vmlock_dep_map, _RET_IP_); return true; } static inline void __vma_exit_locked(struct vm_area_struct *vma, bool *detached) { *detached = refcount_sub_and_test(VMA_LOCK_OFFSET, &vma->vm_refcnt); rwsem_release(&vma->vmlock_dep_map, _RET_IP_); } void __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq) { bool locked; /* * __vma_enter_locked() returns false immediately if the vma is not * attached, otherwise it waits until refcnt is indicating that vma * is attached with no readers. */ locked = __vma_enter_locked(vma, false); /* * We should use WRITE_ONCE() here because we can have concurrent reads * from the early lockless pessimistic check in vma_start_read(). * We don't really care about the correctness of that early check, but * we should use WRITE_ONCE() for cleanliness and to keep KCSAN happy. */ WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq); if (locked) { bool detached; __vma_exit_locked(vma, &detached); WARN_ON_ONCE(detached); /* vma should remain attached */ } } EXPORT_SYMBOL_GPL(__vma_start_write); void vma_mark_detached(struct vm_area_struct *vma) { vma_assert_write_locked(vma); vma_assert_attached(vma); /* * We are the only writer, so no need to use vma_refcount_put(). * The condition below is unlikely because the vma has been already * write-locked and readers can increment vm_refcnt only temporarily * before they check vm_lock_seq, realize the vma is locked and drop * back the vm_refcnt. That is a narrow window for observing a raised * vm_refcnt. */ if (unlikely(!refcount_dec_and_test(&vma->vm_refcnt))) { /* Wait until vma is detached with no readers. */ if (__vma_enter_locked(vma, true)) { bool detached; __vma_exit_locked(vma, &detached); WARN_ON_ONCE(!detached); } } } /* * Lookup and lock a VMA under RCU protection. Returned VMA is guaranteed to be * stable and not isolated. If the VMA is not found or is being modified the * function returns NULL. */ struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm, unsigned long address) { MA_STATE(mas, &mm->mm_mt, address, address); struct vm_area_struct *vma; rcu_read_lock(); retry: vma = mas_walk(&mas); if (!vma) goto inval; vma = vma_start_read(mm, vma); if (IS_ERR_OR_NULL(vma)) { /* Check if the VMA got isolated after we found it */ if (PTR_ERR(vma) == -EAGAIN) { count_vm_vma_lock_event(VMA_LOCK_MISS); /* The area was replaced with another one */ goto retry; } /* Failed to lock the VMA */ goto inval; } /* * At this point, we have a stable reference to a VMA: The VMA is * locked and we know it hasn't already been isolated. * From here on, we can access the VMA without worrying about which * fields are accessible for RCU readers. */ /* Check if the vma we locked is the right one. */ if (unlikely(vma->vm_mm != mm || address < vma->vm_start || address >= vma->vm_end)) goto inval_end_read; rcu_read_unlock(); return vma; inval_end_read: vma_end_read(vma); inval: rcu_read_unlock(); count_vm_vma_lock_event(VMA_LOCK_ABORT); return NULL; } #endif /* CONFIG_PER_VMA_LOCK */ #ifndef __PAGETABLE_P4D_FOLDED /* * Allocate p4d page table. * We've already handled the fast-path in-line. */ int __p4d_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) { p4d_t *new = p4d_alloc_one(mm, address); if (!new) return -ENOMEM; spin_lock(&mm->page_table_lock); if (pgd_present(*pgd)) { /* Another has populated it */ p4d_free(mm, new); } else { smp_wmb(); /* See comment in pmd_install() */ pgd_populate(mm, pgd, new); } spin_unlock(&mm->page_table_lock); return 0; } #endif /* __PAGETABLE_P4D_FOLDED */ #ifndef __PAGETABLE_PUD_FOLDED /* * Allocate page upper directory. * We've already handled the fast-path in-line. */ int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address) { pud_t *new = pud_alloc_one(mm, address); if (!new) return -ENOMEM; spin_lock(&mm->page_table_lock); if (!p4d_present(*p4d)) { mm_inc_nr_puds(mm); smp_wmb(); /* See comment in pmd_install() */ p4d_populate(mm, p4d, new); } else /* Another has populated it */ pud_free(mm, new); spin_unlock(&mm->page_table_lock); return 0; } #endif /* __PAGETABLE_PUD_FOLDED */ #ifndef __PAGETABLE_PMD_FOLDED /* * Allocate page middle directory. * We've already handled the fast-path in-line. */ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) { spinlock_t *ptl; pmd_t *new = pmd_alloc_one(mm, address); if (!new) return -ENOMEM; ptl = pud_lock(mm, pud); if (!pud_present(*pud)) { mm_inc_nr_pmds(mm); smp_wmb(); /* See comment in pmd_install() */ pud_populate(mm, pud, new); } else { /* Another has populated it */ pmd_free(mm, new); } spin_unlock(ptl); return 0; } #endif /* __PAGETABLE_PMD_FOLDED */ static inline void pfnmap_args_setup(struct follow_pfnmap_args *args, spinlock_t *lock, pte_t *ptep, pgprot_t pgprot, unsigned long pfn_base, unsigned long addr_mask, bool writable, bool special) { args->lock = lock; args->ptep = ptep; args->pfn = pfn_base + ((args->address & ~addr_mask) >> PAGE_SHIFT); args->addr_mask = addr_mask; args->pgprot = pgprot; args->writable = writable; args->special = special; } static inline void pfnmap_lockdep_assert(struct vm_area_struct *vma) { #ifdef CONFIG_LOCKDEP struct file *file = vma->vm_file; struct address_space *mapping = file ? file->f_mapping : NULL; if (mapping) lockdep_assert(lockdep_is_held(&mapping->i_mmap_rwsem) || lockdep_is_held(&vma->vm_mm->mmap_lock)); else lockdep_assert(lockdep_is_held(&vma->vm_mm->mmap_lock)); #endif } /** * follow_pfnmap_start() - Look up a pfn mapping at a user virtual address * @args: Pointer to struct @follow_pfnmap_args * * The caller needs to setup args->vma and args->address to point to the * virtual address as the target of such lookup. On a successful return, * the results will be put into other output fields. * * After the caller finished using the fields, the caller must invoke * another follow_pfnmap_end() to proper releases the locks and resources * of such look up request. * * During the start() and end() calls, the results in @args will be valid * as proper locks will be held. After the end() is called, all the fields * in @follow_pfnmap_args will be invalid to be further accessed. Further * use of such information after end() may require proper synchronizations * by the caller with page table updates, otherwise it can create a * security bug. * * If the PTE maps a refcounted page, callers are responsible to protect * against invalidation with MMU notifiers; otherwise access to the PFN at * a later point in time can trigger use-after-free. * * Only IO mappings and raw PFN mappings are allowed. The mmap semaphore * should be taken for read, and the mmap semaphore cannot be released * before the end() is invoked. * * This function must not be used to modify PTE content. * * Return: zero on success, negative otherwise. */ int follow_pfnmap_start(struct follow_pfnmap_args *args) { struct vm_area_struct *vma = args->vma; unsigned long address = args->address; struct mm_struct *mm = vma->vm_mm; spinlock_t *lock; pgd_t *pgdp; p4d_t *p4dp, p4d; pud_t *pudp, pud; pmd_t *pmdp, pmd; pte_t *ptep, pte; pfnmap_lockdep_assert(vma); if (unlikely(address < vma->vm_start || address >= vma->vm_end)) goto out; if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) goto out; retry: pgdp = pgd_offset(mm, address); if (pgd_none(*pgdp) || unlikely(pgd_bad(*pgdp))) goto out; p4dp = p4d_offset(pgdp, address); p4d = READ_ONCE(*p4dp); if (p4d_none(p4d) || unlikely(p4d_bad(p4d))) goto out; pudp = pud_offset(p4dp, address); pud = READ_ONCE(*pudp); if (pud_none(pud)) goto out; if (pud_leaf(pud)) { lock = pud_lock(mm, pudp); if (!unlikely(pud_leaf(pud))) { spin_unlock(lock); goto retry; } pfnmap_args_setup(args, lock, NULL, pud_pgprot(pud), pud_pfn(pud), PUD_MASK, pud_write(pud), pud_special(pud)); return 0; } pmdp = pmd_offset(pudp, address); pmd = pmdp_get_lockless(pmdp); if (pmd_leaf(pmd)) { lock = pmd_lock(mm, pmdp); if (!unlikely(pmd_leaf(pmd))) { spin_unlock(lock); goto retry; } pfnmap_args_setup(args, lock, NULL, pmd_pgprot(pmd), pmd_pfn(pmd), PMD_MASK, pmd_write(pmd), pmd_special(pmd)); return 0; } ptep = pte_offset_map_lock(mm, pmdp, address, &lock); if (!ptep) goto out; pte = ptep_get(ptep); if (!pte_present(pte)) goto unlock; pfnmap_args_setup(args, lock, ptep, pte_pgprot(pte), pte_pfn(pte), PAGE_MASK, pte_write(pte), pte_special(pte)); return 0; unlock: pte_unmap_unlock(ptep, lock); out: return -EINVAL; } EXPORT_SYMBOL_GPL(follow_pfnmap_start); /** * follow_pfnmap_end(): End a follow_pfnmap_start() process * @args: Pointer to struct @follow_pfnmap_args * * Must be used in pair of follow_pfnmap_start(). See the start() function * above for more information. */ void follow_pfnmap_end(struct follow_pfnmap_args *args) { if (args->lock) spin_unlock(args->lock); if (args->ptep) pte_unmap(args->ptep); } EXPORT_SYMBOL_GPL(follow_pfnmap_end); #ifdef CONFIG_HAVE_IOREMAP_PROT /** * generic_access_phys - generic implementation for iomem mmap access * @vma: the vma to access * @addr: userspace address, not relative offset within @vma * @buf: buffer to read/write * @len: length of transfer * @write: set to FOLL_WRITE when writing, otherwise reading * * This is a generic implementation for &vm_operations_struct.access for an * iomem mapping. This callback is used by access_process_vm() when the @vma is * not page based. */ int generic_access_phys(struct vm_area_struct *vma, unsigned long addr, void *buf, int len, int write) { resource_size_t phys_addr; pgprot_t prot = __pgprot(0); void __iomem *maddr; int offset = offset_in_page(addr); int ret = -EINVAL; bool writable; struct follow_pfnmap_args args = { .vma = vma, .address = addr }; retry: if (follow_pfnmap_start(&args)) return -EINVAL; prot = args.pgprot; phys_addr = (resource_size_t)args.pfn << PAGE_SHIFT; writable = args.writable; follow_pfnmap_end(&args); if ((write & FOLL_WRITE) && !writable) return -EINVAL; maddr = ioremap_prot(phys_addr, PAGE_ALIGN(len + offset), prot); if (!maddr) return -ENOMEM; if (follow_pfnmap_start(&args)) goto out_unmap; if ((pgprot_val(prot) != pgprot_val(args.pgprot)) || (phys_addr != (args.pfn << PAGE_SHIFT)) || (writable != args.writable)) { follow_pfnmap_end(&args); iounmap(maddr); goto retry; } if (write) memcpy_toio(maddr + offset, buf, len); else memcpy_fromio(buf, maddr + offset, len); ret = len; follow_pfnmap_end(&args); out_unmap: iounmap(maddr); return ret; } EXPORT_SYMBOL_GPL(generic_access_phys); #endif /* * Access another process' address space as given in mm. */ static int __access_remote_vm(struct mm_struct *mm, unsigned long addr, void *buf, int len, unsigned int gup_flags) { void *old_buf = buf; int write = gup_flags & FOLL_WRITE; if (mmap_read_lock_killable(mm)) return 0; /* Untag the address before looking up the VMA */ addr = untagged_addr_remote(mm, addr); /* Avoid triggering the temporary warning in __get_user_pages */ if (!vma_lookup(mm, addr) && !expand_stack(mm, addr)) return 0; /* ignore errors, just check how much was successfully transferred */ while (len) { int bytes, offset; void *maddr; struct vm_area_struct *vma = NULL; struct page *page = get_user_page_vma_remote(mm, addr, gup_flags, &vma); if (IS_ERR(page)) { /* We might need to expand the stack to access it */ vma = vma_lookup(mm, addr); if (!vma) { vma = expand_stack(mm, addr); /* mmap_lock was dropped on failure */ if (!vma) return buf - old_buf; /* Try again if stack expansion worked */ continue; } /* * Check if this is a VM_IO | VM_PFNMAP VMA, which * we can access using slightly different code. */ bytes = 0; #ifdef CONFIG_HAVE_IOREMAP_PROT if (vma->vm_ops && vma->vm_ops->access) bytes = vma->vm_ops->access(vma, addr, buf, len, write); #endif if (bytes <= 0) break; } else { bytes = len; offset = addr & (PAGE_SIZE-1); if (bytes > PAGE_SIZE-offset) bytes = PAGE_SIZE-offset; maddr = kmap_local_page(page); if (write) { copy_to_user_page(vma, page, addr, maddr + offset, buf, bytes); set_page_dirty_lock(page); } else { copy_from_user_page(vma, page, addr, buf, maddr + offset, bytes); } unmap_and_put_page(page, maddr); } len -= bytes; buf += bytes; addr += bytes; } mmap_read_unlock(mm); return buf - old_buf; } /** * access_remote_vm - access another process' address space * @mm: the mm_struct of the target address space * @addr: start address to access * @buf: source or destination buffer * @len: number of bytes to transfer * @gup_flags: flags modifying lookup behaviour * * The caller must hold a reference on @mm. * * Return: number of bytes copied from source to destination. */ int access_remote_vm(struct mm_struct *mm, unsigned long addr, void *buf, int len, unsigned int gup_flags) { return __access_remote_vm(mm, addr, buf, len, gup_flags); } /* * Access another process' address space. * Source/target buffer must be kernel space, * Do not walk the page table directly, use get_user_pages */ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, unsigned int gup_flags) { struct mm_struct *mm; int ret; mm = get_task_mm(tsk); if (!mm) return 0; ret = __access_remote_vm(mm, addr, buf, len, gup_flags); mmput(mm); return ret; } EXPORT_SYMBOL_GPL(access_process_vm); #ifdef CONFIG_BPF_SYSCALL /* * Copy a string from another process's address space as given in mm. * If there is any error return -EFAULT. */ static int __copy_remote_vm_str(struct mm_struct *mm, unsigned long addr, void *buf, int len, unsigned int gup_flags) { void *old_buf = buf; int err = 0; *(char *)buf = '\0'; if (mmap_read_lock_killable(mm)) return -EFAULT; addr = untagged_addr_remote(mm, addr); /* Avoid triggering the temporary warning in __get_user_pages */ if (!vma_lookup(mm, addr)) { err = -EFAULT; goto out; } while (len) { int bytes, offset, retval; void *maddr; struct page *page; struct vm_area_struct *vma = NULL; page = get_user_page_vma_remote(mm, addr, gup_flags, &vma); if (IS_ERR(page)) { /* * Treat as a total failure for now until we decide how * to handle the CONFIG_HAVE_IOREMAP_PROT case and * stack expansion. */ *(char *)buf = '\0'; err = -EFAULT; goto out; } bytes = len; offset = addr & (PAGE_SIZE - 1); if (bytes > PAGE_SIZE - offset) bytes = PAGE_SIZE - offset; maddr = kmap_local_page(page); retval = strscpy(buf, maddr + offset, bytes); if (retval >= 0) { /* Found the end of the string */ buf += retval; unmap_and_put_page(page, maddr); break; } buf += bytes - 1; /* * Because strscpy always NUL terminates we need to * copy the last byte in the page if we are going to * load more pages */ if (bytes != len) { addr += bytes - 1; copy_from_user_page(vma, page, addr, buf, maddr + (PAGE_SIZE - 1), 1); buf += 1; addr += 1; } len -= bytes; unmap_and_put_page(page, maddr); } out: mmap_read_unlock(mm); if (err) return err; return buf - old_buf; } /** * copy_remote_vm_str - copy a string from another process's address space. * @tsk: the task of the target address space * @addr: start address to read from * @buf: destination buffer * @len: number of bytes to copy * @gup_flags: flags modifying lookup behaviour * * The caller must hold a reference on @mm. * * Return: number of bytes copied from @addr (source) to @buf (destination); * not including the trailing NUL. Always guaranteed to leave NUL-terminated * buffer. On any error, return -EFAULT. */ int copy_remote_vm_str(struct task_struct *tsk, unsigned long addr, void *buf, int len, unsigned int gup_flags) { struct mm_struct *mm; int ret; if (unlikely(len == 0)) return 0; mm = get_task_mm(tsk); if (!mm) { *(char *)buf = '\0'; return -EFAULT; } ret = __copy_remote_vm_str(mm, addr, buf, len, gup_flags); mmput(mm); return ret; } EXPORT_SYMBOL_GPL(copy_remote_vm_str); #endif /* CONFIG_BPF_SYSCALL */ /* * Print the name of a VMA. */ void print_vma_addr(char *prefix, unsigned long ip) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma; /* * we might be running from an atomic context so we cannot sleep */ if (!mmap_read_trylock(mm)) return; vma = vma_lookup(mm, ip); if (vma && vma->vm_file) { struct file *f = vma->vm_file; ip -= vma->vm_start; ip += vma->vm_pgoff << PAGE_SHIFT; printk("%s%pD[%lx,%lx+%lx]", prefix, f, ip, vma->vm_start, vma->vm_end - vma->vm_start); } mmap_read_unlock(mm); } #if defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_DEBUG_ATOMIC_SLEEP) void __might_fault(const char *file, int line) { if (pagefault_disabled()) return; __might_sleep(file, line); if (current->mm) might_lock_read(&current->mm->mmap_lock); } EXPORT_SYMBOL(__might_fault); #endif #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS) /* * Process all subpages of the specified huge page with the specified * operation. The target subpage will be processed last to keep its * cache lines hot. */ static inline int process_huge_page( unsigned long addr_hint, unsigned int nr_pages, int (*process_subpage)(unsigned long addr, int idx, void *arg), void *arg) { int i, n, base, l, ret; unsigned long addr = addr_hint & ~(((unsigned long)nr_pages << PAGE_SHIFT) - 1); /* Process target subpage last to keep its cache lines hot */ might_sleep(); n = (addr_hint - addr) / PAGE_SIZE; if (2 * n <= nr_pages) { /* If target subpage in first half of huge page */ base = 0; l = n; /* Process subpages at the end of huge page */ for (i = nr_pages - 1; i >= 2 * n; i--) { cond_resched(); ret = process_subpage(addr + i * PAGE_SIZE, i, arg); if (ret) return ret; } } else { /* If target subpage in second half of huge page */ base = nr_pages - 2 * (nr_pages - n); l = nr_pages - n; /* Process subpages at the begin of huge page */ for (i = 0; i < base; i++) { cond_resched(); ret = process_subpage(addr + i * PAGE_SIZE, i, arg); if (ret) return ret; } } /* * Process remaining subpages in left-right-left-right pattern * towards the target subpage */ for (i = 0; i < l; i++) { int left_idx = base + i; int right_idx = base + 2 * l - 1 - i; cond_resched(); ret = process_subpage(addr + left_idx * PAGE_SIZE, left_idx, arg); if (ret) return ret; cond_resched(); ret = process_subpage(addr + right_idx * PAGE_SIZE, right_idx, arg); if (ret) return ret; } return 0; } static void clear_gigantic_page(struct folio *folio, unsigned long addr_hint, unsigned int nr_pages) { unsigned long addr = ALIGN_DOWN(addr_hint, folio_size(folio)); int i; might_sleep(); for (i = 0; i < nr_pages; i++) { cond_resched(); clear_user_highpage(folio_page(folio, i), addr + i * PAGE_SIZE); } } static int clear_subpage(unsigned long addr, int idx, void *arg) { struct folio *folio = arg; clear_user_highpage(folio_page(folio, idx), addr); return 0; } /** * folio_zero_user - Zero a folio which will be mapped to userspace. * @folio: The folio to zero. * @addr_hint: The address will be accessed or the base address if uncelar. */ void folio_zero_user(struct folio *folio, unsigned long addr_hint) { unsigned int nr_pages = folio_nr_pages(folio); if (unlikely(nr_pages > MAX_ORDER_NR_PAGES)) clear_gigantic_page(folio, addr_hint, nr_pages); else process_huge_page(addr_hint, nr_pages, clear_subpage, folio); } static int copy_user_gigantic_page(struct folio *dst, struct folio *src, unsigned long addr_hint, struct vm_area_struct *vma, unsigned int nr_pages) { unsigned long addr = ALIGN_DOWN(addr_hint, folio_size(dst)); struct page *dst_page; struct page *src_page; int i; for (i = 0; i < nr_pages; i++) { dst_page = folio_page(dst, i); src_page = folio_page(src, i); cond_resched(); if (copy_mc_user_highpage(dst_page, src_page, addr + i*PAGE_SIZE, vma)) return -EHWPOISON; } return 0; } struct copy_subpage_arg { struct folio *dst; struct folio *src; struct vm_area_struct *vma; }; static int copy_subpage(unsigned long addr, int idx, void *arg) { struct copy_subpage_arg *copy_arg = arg; struct page *dst = folio_page(copy_arg->dst, idx); struct page *src = folio_page(copy_arg->src, idx); if (copy_mc_user_highpage(dst, src, addr, copy_arg->vma)) return -EHWPOISON; return 0; } int copy_user_large_folio(struct folio *dst, struct folio *src, unsigned long addr_hint, struct vm_area_struct *vma) { unsigned int nr_pages = folio_nr_pages(dst); struct copy_subpage_arg arg = { .dst = dst, .src = src, .vma = vma, }; if (unlikely(nr_pages > MAX_ORDER_NR_PAGES)) return copy_user_gigantic_page(dst, src, addr_hint, vma, nr_pages); return process_huge_page(addr_hint, nr_pages, copy_subpage, &arg); } long copy_folio_from_user(struct folio *dst_folio, const void __user *usr_src, bool allow_pagefault) { void *kaddr; unsigned long i, rc = 0; unsigned int nr_pages = folio_nr_pages(dst_folio); unsigned long ret_val = nr_pages * PAGE_SIZE; struct page *subpage; for (i = 0; i < nr_pages; i++) { subpage = folio_page(dst_folio, i); kaddr = kmap_local_page(subpage); if (!allow_pagefault) pagefault_disable(); rc = copy_from_user(kaddr, usr_src + i * PAGE_SIZE, PAGE_SIZE); if (!allow_pagefault) pagefault_enable(); kunmap_local(kaddr); ret_val -= (PAGE_SIZE - rc); if (rc) break; flush_dcache_page(subpage); cond_resched(); } return ret_val; } #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */ #if defined(CONFIG_SPLIT_PTE_PTLOCKS) && ALLOC_SPLIT_PTLOCKS static struct kmem_cache *page_ptl_cachep; void __init ptlock_cache_init(void) { page_ptl_cachep = kmem_cache_create("page->ptl", sizeof(spinlock_t), 0, SLAB_PANIC, NULL); } bool ptlock_alloc(struct ptdesc *ptdesc) { spinlock_t *ptl; ptl = kmem_cache_alloc(page_ptl_cachep, GFP_KERNEL); if (!ptl) return false; ptdesc->ptl = ptl; return true; } void ptlock_free(struct ptdesc *ptdesc) { if (ptdesc->ptl) kmem_cache_free(page_ptl_cachep, ptdesc->ptl); } #endif void vma_pgtable_walk_begin(struct vm_area_struct *vma) { if (is_vm_hugetlb_page(vma)) hugetlb_vma_lock_read(vma); } void vma_pgtable_walk_end(struct vm_area_struct *vma) { if (is_vm_hugetlb_page(vma)) hugetlb_vma_unlock_read(vma); }
3 3 1 4 1 8 27 2 25 1 7 17 9 5 9 2 18 2 2 12 2 2 6 14 13 3 10 3 1 3 4 5 3 1 5 5 5 4 1 4 4 3 1 1 3 2 1 1 5 4 4 1 1 1 23 9 1 2 2 2 3 1 2 2 1 1 1 1 1 4 3 1 1 9 1 7 4 2 3 1 5 3 2 2 3 2 3 2 3 3 2 44 1 17 1 6 3 2 1 2 1 2 1 1 2 3 1 1 5 2 2 1 1 1 1 1 3 2 1 1 3 2 1 1 1 34 18 1 17 9 8 17 12 12 12 38 38 43 1 42 37 5 3 39 40 39 1 42 42 42 4 38 38 38 40 39 1 38 38 38 23 14 4 5 14 9 23 23 25 25 24 23 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 /* BlueZ - Bluetooth protocol stack for Linux Copyright (C) 2000-2001 Qualcomm Incorporated Copyright (C) 2009-2010 Gustavo F. Padovan <gustavo@padovan.org> Copyright (C) 2010 Google Inc. Copyright (C) 2011 ProFUSION Embedded Systems Written 2000,2001 by Maxim Krasnyansky <maxk@qualcomm.com> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License version 2 as published by the Free Software Foundation; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS SOFTWARE IS DISCLAIMED. */ /* Bluetooth L2CAP sockets. */ #include <linux/module.h> #include <linux/export.h> #include <linux/filter.h> #include <linux/sched/signal.h> #include <net/bluetooth/bluetooth.h> #include <net/bluetooth/hci_core.h> #include <net/bluetooth/l2cap.h> #include "smp.h" static struct bt_sock_list l2cap_sk_list = { .lock = __RW_LOCK_UNLOCKED(l2cap_sk_list.lock) }; static const struct proto_ops l2cap_sock_ops; static void l2cap_sock_init(struct sock *sk, struct sock *parent); static struct sock *l2cap_sock_alloc(struct net *net, struct socket *sock, int proto, gfp_t prio, int kern); static void l2cap_sock_cleanup_listen(struct sock *parent); bool l2cap_is_socket(struct socket *sock) { return sock && sock->ops == &l2cap_sock_ops; } EXPORT_SYMBOL(l2cap_is_socket); static int l2cap_validate_bredr_psm(u16 psm) { /* PSM must be odd and lsb of upper byte must be 0 */ if ((psm & 0x0101) != 0x0001) return -EINVAL; /* Restrict usage of well-known PSMs */ if (psm < L2CAP_PSM_DYN_START && !capable(CAP_NET_BIND_SERVICE)) return -EACCES; return 0; } static int l2cap_validate_le_psm(u16 psm) { /* Valid LE_PSM ranges are defined only until 0x00ff */ if (psm > L2CAP_PSM_LE_DYN_END) return -EINVAL; /* Restrict fixed, SIG assigned PSM values to CAP_NET_BIND_SERVICE */ if (psm < L2CAP_PSM_LE_DYN_START && !capable(CAP_NET_BIND_SERVICE)) return -EACCES; return 0; } static int l2cap_sock_bind(struct socket *sock, struct sockaddr *addr, int alen) { struct sock *sk = sock->sk; struct l2cap_chan *chan = l2cap_pi(sk)->chan; struct sockaddr_l2 la; int len, err = 0; BT_DBG("sk %p", sk); if (!addr || alen < offsetofend(struct sockaddr, sa_family) || addr->sa_family != AF_BLUETOOTH) return -EINVAL; memset(&la, 0, sizeof(la)); len = min_t(unsigned int, sizeof(la), alen); memcpy(&la, addr, len); if (la.l2_cid && la.l2_psm) return -EINVAL; if (!bdaddr_type_is_valid(la.l2_bdaddr_type)) return -EINVAL; if (bdaddr_type_is_le(la.l2_bdaddr_type)) { /* We only allow ATT user space socket */ if (la.l2_cid && la.l2_cid != cpu_to_le16(L2CAP_CID_ATT)) return -EINVAL; } lock_sock(sk); if (sk->sk_state != BT_OPEN) { err = -EBADFD; goto done; } if (la.l2_psm) { __u16 psm = __le16_to_cpu(la.l2_psm); if (la.l2_bdaddr_type == BDADDR_BREDR) err = l2cap_validate_bredr_psm(psm); else err = l2cap_validate_le_psm(psm); if (err) goto done; } bacpy(&chan->src, &la.l2_bdaddr); chan->src_type = la.l2_bdaddr_type; if (la.l2_cid) err = l2cap_add_scid(chan, __le16_to_cpu(la.l2_cid)); else err = l2cap_add_psm(chan, &la.l2_bdaddr, la.l2_psm); if (err < 0) goto done; switch (chan->chan_type) { case L2CAP_CHAN_CONN_LESS: if (__le16_to_cpu(la.l2_psm) == L2CAP_PSM_3DSP) chan->sec_level = BT_SECURITY_SDP; break; case L2CAP_CHAN_CONN_ORIENTED: if (__le16_to_cpu(la.l2_psm) == L2CAP_PSM_SDP || __le16_to_cpu(la.l2_psm) == L2CAP_PSM_RFCOMM) chan->sec_level = BT_SECURITY_SDP; break; case L2CAP_CHAN_RAW: chan->sec_level = BT_SECURITY_SDP; break; case L2CAP_CHAN_FIXED: /* Fixed channels default to the L2CAP core not holding a * hci_conn reference for them. For fixed channels mapping to * L2CAP sockets we do want to hold a reference so set the * appropriate flag to request it. */ set_bit(FLAG_HOLD_HCI_CONN, &chan->flags); break; } /* Use L2CAP_MODE_LE_FLOWCTL (CoC) in case of LE address and * L2CAP_MODE_EXT_FLOWCTL (ECRED) has not been set. */ if (chan->psm && bdaddr_type_is_le(chan->src_type) && chan->mode != L2CAP_MODE_EXT_FLOWCTL) chan->mode = L2CAP_MODE_LE_FLOWCTL; chan->state = BT_BOUND; sk->sk_state = BT_BOUND; done: release_sock(sk); return err; } static int l2cap_sock_connect(struct socket *sock, struct sockaddr *addr, int alen, int flags) { struct sock *sk = sock->sk; struct l2cap_chan *chan = l2cap_pi(sk)->chan; struct sockaddr_l2 la; int len, err = 0; bool zapped; BT_DBG("sk %p", sk); lock_sock(sk); zapped = sock_flag(sk, SOCK_ZAPPED); release_sock(sk); if (zapped) return -EINVAL; if (!addr || alen < offsetofend(struct sockaddr, sa_family) || addr->sa_family != AF_BLUETOOTH) return -EINVAL; memset(&la, 0, sizeof(la)); len = min_t(unsigned int, sizeof(la), alen); memcpy(&la, addr, len); if (la.l2_cid && la.l2_psm) return -EINVAL; if (!bdaddr_type_is_valid(la.l2_bdaddr_type)) return -EINVAL; /* Check that the socket wasn't bound to something that * conflicts with the address given to connect(). If chan->src * is BDADDR_ANY it means bind() was never used, in which case * chan->src_type and la.l2_bdaddr_type do not need to match. */ if (chan->src_type == BDADDR_BREDR && bacmp(&chan->src, BDADDR_ANY) && bdaddr_type_is_le(la.l2_bdaddr_type)) { /* Old user space versions will try to incorrectly bind * the ATT socket using BDADDR_BREDR. We need to accept * this and fix up the source address type only when * both the source CID and destination CID indicate * ATT. Anything else is an invalid combination. */ if (chan->scid != L2CAP_CID_ATT || la.l2_cid != cpu_to_le16(L2CAP_CID_ATT)) return -EINVAL; /* We don't have the hdev available here to make a * better decision on random vs public, but since all * user space versions that exhibit this issue anyway do * not support random local addresses assuming public * here is good enough. */ chan->src_type = BDADDR_LE_PUBLIC; } if (chan->src_type != BDADDR_BREDR && la.l2_bdaddr_type == BDADDR_BREDR) return -EINVAL; if (bdaddr_type_is_le(la.l2_bdaddr_type)) { /* We only allow ATT user space socket */ if (la.l2_cid && la.l2_cid != cpu_to_le16(L2CAP_CID_ATT)) return -EINVAL; } /* Use L2CAP_MODE_LE_FLOWCTL (CoC) in case of LE address and * L2CAP_MODE_EXT_FLOWCTL (ECRED) has not been set. */ if (chan->psm && bdaddr_type_is_le(chan->src_type) && chan->mode != L2CAP_MODE_EXT_FLOWCTL) chan->mode = L2CAP_MODE_LE_FLOWCTL; err = l2cap_chan_connect(chan, la.l2_psm, __le16_to_cpu(la.l2_cid), &la.l2_bdaddr, la.l2_bdaddr_type, sk->sk_sndtimeo); if (err) return err; lock_sock(sk); err = bt_sock_wait_state(sk, BT_CONNECTED, sock_sndtimeo(sk, flags & O_NONBLOCK)); release_sock(sk); return err; } static int l2cap_sock_listen(struct socket *sock, int backlog) { struct sock *sk = sock->sk; struct l2cap_chan *chan = l2cap_pi(sk)->chan; int err = 0; BT_DBG("sk %p backlog %d", sk, backlog); lock_sock(sk); if (sk->sk_state != BT_BOUND) { err = -EBADFD; goto done; } if (sk->sk_type != SOCK_SEQPACKET && sk->sk_type != SOCK_STREAM) { err = -EINVAL; goto done; } switch (chan->mode) { case L2CAP_MODE_BASIC: case L2CAP_MODE_LE_FLOWCTL: break; case L2CAP_MODE_EXT_FLOWCTL: if (!enable_ecred) { err = -EOPNOTSUPP; goto done; } break; case L2CAP_MODE_ERTM: case L2CAP_MODE_STREAMING: if (!disable_ertm) break; fallthrough; default: err = -EOPNOTSUPP; goto done; } sk->sk_max_ack_backlog = backlog; sk->sk_ack_backlog = 0; /* Listening channels need to use nested locking in order not to * cause lockdep warnings when the created child channels end up * being locked in the same thread as the parent channel. */ atomic_set(&chan->nesting, L2CAP_NESTING_PARENT); chan->state = BT_LISTEN; sk->sk_state = BT_LISTEN; done: release_sock(sk); return err; } static int l2cap_sock_accept(struct socket *sock, struct socket *newsock, struct proto_accept_arg *arg) { DEFINE_WAIT_FUNC(wait, woken_wake_function); struct sock *sk = sock->sk, *nsk; long timeo; int err = 0; lock_sock_nested(sk, L2CAP_NESTING_PARENT); timeo = sock_rcvtimeo(sk, arg->flags & O_NONBLOCK); BT_DBG("sk %p timeo %ld", sk, timeo); /* Wait for an incoming connection. (wake-one). */ add_wait_queue_exclusive(sk_sleep(sk), &wait); while (1) { if (sk->sk_state != BT_LISTEN) { err = -EBADFD; break; } nsk = bt_accept_dequeue(sk, newsock); if (nsk) break; if (!timeo) { err = -EAGAIN; break; } if (signal_pending(current)) { err = sock_intr_errno(timeo); break; } release_sock(sk); timeo = wait_woken(&wait, TASK_INTERRUPTIBLE, timeo); lock_sock_nested(sk, L2CAP_NESTING_PARENT); } remove_wait_queue(sk_sleep(sk), &wait); if (err) goto done; newsock->state = SS_CONNECTED; BT_DBG("new socket %p", nsk); done: release_sock(sk); return err; } static int l2cap_sock_getname(struct socket *sock, struct sockaddr *addr, int peer) { struct sockaddr_l2 *la = (struct sockaddr_l2 *) addr; struct sock *sk = sock->sk; struct l2cap_chan *chan = l2cap_pi(sk)->chan; BT_DBG("sock %p, sk %p", sock, sk); if (peer && sk->sk_state != BT_CONNECTED && sk->sk_state != BT_CONNECT && sk->sk_state != BT_CONNECT2 && sk->sk_state != BT_CONFIG) return -ENOTCONN; memset(la, 0, sizeof(struct sockaddr_l2)); addr->sa_family = AF_BLUETOOTH; la->l2_psm = chan->psm; if (peer) { bacpy(&la->l2_bdaddr, &chan->dst); la->l2_cid = cpu_to_le16(chan->dcid); la->l2_bdaddr_type = chan->dst_type; } else { bacpy(&la->l2_bdaddr, &chan->src); la->l2_cid = cpu_to_le16(chan->scid); la->l2_bdaddr_type = chan->src_type; } return sizeof(struct sockaddr_l2); } static int l2cap_get_mode(struct l2cap_chan *chan) { switch (chan->mode) { case L2CAP_MODE_BASIC: return BT_MODE_BASIC; case L2CAP_MODE_ERTM: return BT_MODE_ERTM; case L2CAP_MODE_STREAMING: return BT_MODE_STREAMING; case L2CAP_MODE_LE_FLOWCTL: return BT_MODE_LE_FLOWCTL; case L2CAP_MODE_EXT_FLOWCTL: return BT_MODE_EXT_FLOWCTL; } return -EINVAL; } static int l2cap_sock_getsockopt_old(struct socket *sock, int optname, char __user *optval, int __user *optlen) { struct sock *sk = sock->sk; struct l2cap_chan *chan = l2cap_pi(sk)->chan; struct l2cap_options opts; struct l2cap_conninfo cinfo; int err = 0; size_t len; u32 opt; BT_DBG("sk %p", sk); if (get_user(len, optlen)) return -EFAULT; lock_sock(sk); switch (optname) { case L2CAP_OPTIONS: /* LE sockets should use BT_SNDMTU/BT_RCVMTU, but since * legacy ATT code depends on getsockopt for * L2CAP_OPTIONS we need to let this pass. */ if (bdaddr_type_is_le(chan->src_type) && chan->scid != L2CAP_CID_ATT) { err = -EINVAL; break; } /* Only BR/EDR modes are supported here */ switch (chan->mode) { case L2CAP_MODE_BASIC: case L2CAP_MODE_ERTM: case L2CAP_MODE_STREAMING: break; default: err = -EINVAL; break; } if (err < 0) break; memset(&opts, 0, sizeof(opts)); opts.imtu = chan->imtu; opts.omtu = chan->omtu; opts.flush_to = chan->flush_to; opts.mode = chan->mode; opts.fcs = chan->fcs; opts.max_tx = chan->max_tx; opts.txwin_size = chan->tx_win; BT_DBG("mode 0x%2.2x", chan->mode); len = min(len, sizeof(opts)); if (copy_to_user(optval, (char *) &opts, len)) err = -EFAULT; break; case L2CAP_LM: switch (chan->sec_level) { case BT_SECURITY_LOW: opt = L2CAP_LM_AUTH; break; case BT_SECURITY_MEDIUM: opt = L2CAP_LM_AUTH | L2CAP_LM_ENCRYPT; break; case BT_SECURITY_HIGH: opt = L2CAP_LM_AUTH | L2CAP_LM_ENCRYPT | L2CAP_LM_SECURE; break; case BT_SECURITY_FIPS: opt = L2CAP_LM_AUTH | L2CAP_LM_ENCRYPT | L2CAP_LM_SECURE | L2CAP_LM_FIPS; break; default: opt = 0; break; } if (test_bit(FLAG_ROLE_SWITCH, &chan->flags)) opt |= L2CAP_LM_MASTER; if (test_bit(FLAG_FORCE_RELIABLE, &chan->flags)) opt |= L2CAP_LM_RELIABLE; if (put_user(opt, (u32 __user *) optval)) err = -EFAULT; break; case L2CAP_CONNINFO: if (sk->sk_state != BT_CONNECTED && !(sk->sk_state == BT_CONNECT2 && test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags))) { err = -ENOTCONN; break; } memset(&cinfo, 0, sizeof(cinfo)); cinfo.hci_handle = chan->conn->hcon->handle; memcpy(cinfo.dev_class, chan->conn->hcon->dev_class, 3); len = min(len, sizeof(cinfo)); if (copy_to_user(optval, (char *) &cinfo, len)) err = -EFAULT; break; default: err = -ENOPROTOOPT; break; } release_sock(sk); return err; } static int l2cap_sock_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) { struct sock *sk = sock->sk; struct l2cap_chan *chan = l2cap_pi(sk)->chan; struct bt_security sec; struct bt_power pwr; u32 phys; int len, mode, err = 0; BT_DBG("sk %p", sk); if (level == SOL_L2CAP) return l2cap_sock_getsockopt_old(sock, optname, optval, optlen); if (level != SOL_BLUETOOTH) return -ENOPROTOOPT; if (get_user(len, optlen)) return -EFAULT; lock_sock(sk); switch (optname) { case BT_SECURITY: if (chan->chan_type != L2CAP_CHAN_CONN_ORIENTED && chan->chan_type != L2CAP_CHAN_FIXED && chan->chan_type != L2CAP_CHAN_RAW) { err = -EINVAL; break; } memset(&sec, 0, sizeof(sec)); if (chan->conn) { sec.level = chan->conn->hcon->sec_level; if (sk->sk_state == BT_CONNECTED) sec.key_size = chan->conn->hcon->enc_key_size; } else { sec.level = chan->sec_level; } len = min_t(unsigned int, len, sizeof(sec)); if (copy_to_user(optval, (char *) &sec, len)) err = -EFAULT; break; case BT_DEFER_SETUP: if (sk->sk_state != BT_BOUND && sk->sk_state != BT_LISTEN) { err = -EINVAL; break; } if (put_user(test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags), (u32 __user *) optval)) err = -EFAULT; break; case BT_FLUSHABLE: if (put_user(test_bit(FLAG_FLUSHABLE, &chan->flags), (u32 __user *) optval)) err = -EFAULT; break; case BT_POWER: if (sk->sk_type != SOCK_SEQPACKET && sk->sk_type != SOCK_STREAM && sk->sk_type != SOCK_RAW) { err = -EINVAL; break; } pwr.force_active = test_bit(FLAG_FORCE_ACTIVE, &chan->flags); len = min_t(unsigned int, len, sizeof(pwr)); if (copy_to_user(optval, (char *) &pwr, len)) err = -EFAULT; break; case BT_CHANNEL_POLICY: if (put_user(chan->chan_policy, (u32 __user *) optval)) err = -EFAULT; break; case BT_SNDMTU: if (!bdaddr_type_is_le(chan->src_type)) { err = -EINVAL; break; } if (sk->sk_state != BT_CONNECTED) { err = -ENOTCONN; break; } if (put_user(chan->omtu, (u16 __user *) optval)) err = -EFAULT; break; case BT_RCVMTU: if (!bdaddr_type_is_le(chan->src_type)) { err = -EINVAL; break; } if (put_user(chan->imtu, (u16 __user *) optval)) err = -EFAULT; break; case BT_PHY: if (sk->sk_state != BT_CONNECTED) { err = -ENOTCONN; break; } phys = hci_conn_get_phy(chan->conn->hcon); if (put_user(phys, (u32 __user *) optval)) err = -EFAULT; break; case BT_MODE: if (!enable_ecred) { err = -ENOPROTOOPT; break; } if (chan->chan_type != L2CAP_CHAN_CONN_ORIENTED) { err = -EINVAL; break; } mode = l2cap_get_mode(chan); if (mode < 0) { err = mode; break; } if (put_user(mode, (u8 __user *) optval)) err = -EFAULT; break; default: err = -ENOPROTOOPT; break; } release_sock(sk); return err; } static bool l2cap_valid_mtu(struct l2cap_chan *chan, u16 mtu) { switch (chan->scid) { case L2CAP_CID_ATT: if (mtu && mtu < L2CAP_LE_MIN_MTU) return false; break; default: if (mtu && mtu < L2CAP_DEFAULT_MIN_MTU) return false; } return true; } static int l2cap_sock_setsockopt_old(struct socket *sock, int optname, sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; struct l2cap_chan *chan = l2cap_pi(sk)->chan; struct l2cap_options opts; int err = 0; u32 opt; BT_DBG("sk %p", sk); lock_sock(sk); switch (optname) { case L2CAP_OPTIONS: if (bdaddr_type_is_le(chan->src_type)) { err = -EINVAL; break; } if (sk->sk_state == BT_CONNECTED) { err = -EINVAL; break; } opts.imtu = chan->imtu; opts.omtu = chan->omtu; opts.flush_to = chan->flush_to; opts.mode = chan->mode; opts.fcs = chan->fcs; opts.max_tx = chan->max_tx; opts.txwin_size = chan->tx_win; err = copy_safe_from_sockptr(&opts, sizeof(opts), optval, optlen); if (err) break; if (opts.txwin_size > L2CAP_DEFAULT_EXT_WINDOW) { err = -EINVAL; break; } if (!l2cap_valid_mtu(chan, opts.imtu)) { err = -EINVAL; break; } /* Only BR/EDR modes are supported here */ switch (opts.mode) { case L2CAP_MODE_BASIC: clear_bit(CONF_STATE2_DEVICE, &chan->conf_state); break; case L2CAP_MODE_ERTM: case L2CAP_MODE_STREAMING: if (!disable_ertm) break; fallthrough; default: err = -EINVAL; break; } if (err < 0) break; chan->mode = opts.mode; BT_DBG("mode 0x%2.2x", chan->mode); chan->imtu = opts.imtu; chan->omtu = opts.omtu; chan->fcs = opts.fcs; chan->max_tx = opts.max_tx; chan->tx_win = opts.txwin_size; chan->flush_to = opts.flush_to; break; case L2CAP_LM: err = copy_safe_from_sockptr(&opt, sizeof(opt), optval, optlen); if (err) break; if (opt & L2CAP_LM_FIPS) { err = -EINVAL; break; } if (opt & L2CAP_LM_AUTH) chan->sec_level = BT_SECURITY_LOW; if (opt & L2CAP_LM_ENCRYPT) chan->sec_level = BT_SECURITY_MEDIUM; if (opt & L2CAP_LM_SECURE) chan->sec_level = BT_SECURITY_HIGH; if (opt & L2CAP_LM_MASTER) set_bit(FLAG_ROLE_SWITCH, &chan->flags); else clear_bit(FLAG_ROLE_SWITCH, &chan->flags); if (opt & L2CAP_LM_RELIABLE) set_bit(FLAG_FORCE_RELIABLE, &chan->flags); else clear_bit(FLAG_FORCE_RELIABLE, &chan->flags); break; default: err = -ENOPROTOOPT; break; } release_sock(sk); return err; } static int l2cap_set_mode(struct l2cap_chan *chan, u8 mode) { switch (mode) { case BT_MODE_BASIC: if (bdaddr_type_is_le(chan->src_type)) return -EINVAL; mode = L2CAP_MODE_BASIC; clear_bit(CONF_STATE2_DEVICE, &chan->conf_state); break; case BT_MODE_ERTM: if (!disable_ertm || bdaddr_type_is_le(chan->src_type)) return -EINVAL; mode = L2CAP_MODE_ERTM; break; case BT_MODE_STREAMING: if (!disable_ertm || bdaddr_type_is_le(chan->src_type)) return -EINVAL; mode = L2CAP_MODE_STREAMING; break; case BT_MODE_LE_FLOWCTL: if (!bdaddr_type_is_le(chan->src_type)) return -EINVAL; mode = L2CAP_MODE_LE_FLOWCTL; break; case BT_MODE_EXT_FLOWCTL: /* TODO: Add support for ECRED PDUs to BR/EDR */ if (!bdaddr_type_is_le(chan->src_type)) return -EINVAL; mode = L2CAP_MODE_EXT_FLOWCTL; break; default: return -EINVAL; } chan->mode = mode; return 0; } static int l2cap_sock_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; struct l2cap_chan *chan = l2cap_pi(sk)->chan; struct bt_security sec; struct bt_power pwr; struct l2cap_conn *conn; int err = 0; u32 opt; u16 mtu; u8 mode; BT_DBG("sk %p", sk); if (level == SOL_L2CAP) return l2cap_sock_setsockopt_old(sock, optname, optval, optlen); if (level != SOL_BLUETOOTH) return -ENOPROTOOPT; lock_sock(sk); switch (optname) { case BT_SECURITY: if (chan->chan_type != L2CAP_CHAN_CONN_ORIENTED && chan->chan_type != L2CAP_CHAN_FIXED && chan->chan_type != L2CAP_CHAN_RAW) { err = -EINVAL; break; } sec.level = BT_SECURITY_LOW; err = copy_safe_from_sockptr(&sec, sizeof(sec), optval, optlen); if (err) break; if (sec.level < BT_SECURITY_LOW || sec.level > BT_SECURITY_FIPS) { err = -EINVAL; break; } chan->sec_level = sec.level; if (!chan->conn) break; conn = chan->conn; /* change security for LE channels */ if (chan->scid == L2CAP_CID_ATT) { if (smp_conn_security(conn->hcon, sec.level)) { err = -EINVAL; break; } set_bit(FLAG_PENDING_SECURITY, &chan->flags); sk->sk_state = BT_CONFIG; chan->state = BT_CONFIG; /* or for ACL link */ } else if ((sk->sk_state == BT_CONNECT2 && test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags)) || sk->sk_state == BT_CONNECTED) { if (!l2cap_chan_check_security(chan, true)) set_bit(BT_SK_SUSPEND, &bt_sk(sk)->flags); else sk->sk_state_change(sk); } else { err = -EINVAL; } break; case BT_DEFER_SETUP: if (sk->sk_state != BT_BOUND && sk->sk_state != BT_LISTEN) { err = -EINVAL; break; } err = copy_safe_from_sockptr(&opt, sizeof(opt), optval, optlen); if (err) break; if (opt) { set_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags); set_bit(FLAG_DEFER_SETUP, &chan->flags); } else { clear_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags); clear_bit(FLAG_DEFER_SETUP, &chan->flags); } break; case BT_FLUSHABLE: err = copy_safe_from_sockptr(&opt, sizeof(opt), optval, optlen); if (err) break; if (opt > BT_FLUSHABLE_ON) { err = -EINVAL; break; } if (opt == BT_FLUSHABLE_OFF) { conn = chan->conn; /* proceed further only when we have l2cap_conn and No Flush support in the LM */ if (!conn || !lmp_no_flush_capable(conn->hcon->hdev)) { err = -EINVAL; break; } } if (opt) set_bit(FLAG_FLUSHABLE, &chan->flags); else clear_bit(FLAG_FLUSHABLE, &chan->flags); break; case BT_POWER: if (chan->chan_type != L2CAP_CHAN_CONN_ORIENTED && chan->chan_type != L2CAP_CHAN_RAW) { err = -EINVAL; break; } pwr.force_active = BT_POWER_FORCE_ACTIVE_ON; err = copy_safe_from_sockptr(&pwr, sizeof(pwr), optval, optlen); if (err) break; if (pwr.force_active) set_bit(FLAG_FORCE_ACTIVE, &chan->flags); else clear_bit(FLAG_FORCE_ACTIVE, &chan->flags); break; case BT_CHANNEL_POLICY: err = copy_safe_from_sockptr(&opt, sizeof(opt), optval, optlen); if (err) break; err = -EOPNOTSUPP; break; case BT_SNDMTU: if (!bdaddr_type_is_le(chan->src_type)) { err = -EINVAL; break; } /* Setting is not supported as it's the remote side that * decides this. */ err = -EPERM; break; case BT_RCVMTU: if (!bdaddr_type_is_le(chan->src_type)) { err = -EINVAL; break; } if (chan->mode == L2CAP_MODE_LE_FLOWCTL && sk->sk_state == BT_CONNECTED) { err = -EISCONN; break; } err = copy_safe_from_sockptr(&mtu, sizeof(mtu), optval, optlen); if (err) break; if (chan->mode == L2CAP_MODE_EXT_FLOWCTL && sk->sk_state == BT_CONNECTED) err = l2cap_chan_reconfigure(chan, mtu); else chan->imtu = mtu; break; case BT_MODE: if (!enable_ecred) { err = -ENOPROTOOPT; break; } BT_DBG("sk->sk_state %u", sk->sk_state); if (sk->sk_state != BT_BOUND) { err = -EINVAL; break; } if (chan->chan_type != L2CAP_CHAN_CONN_ORIENTED) { err = -EINVAL; break; } err = copy_safe_from_sockptr(&mode, sizeof(mode), optval, optlen); if (err) break; BT_DBG("mode %u", mode); err = l2cap_set_mode(chan, mode); if (err) break; BT_DBG("mode 0x%2.2x", chan->mode); break; default: err = -ENOPROTOOPT; break; } release_sock(sk); return err; } static int l2cap_sock_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { struct sock *sk = sock->sk; struct l2cap_chan *chan = l2cap_pi(sk)->chan; struct sockcm_cookie sockc; int err; BT_DBG("sock %p, sk %p", sock, sk); err = sock_error(sk); if (err) return err; if (msg->msg_flags & MSG_OOB) return -EOPNOTSUPP; if (sk->sk_state != BT_CONNECTED) return -ENOTCONN; hci_sockcm_init(&sockc, sk); if (msg->msg_controllen) { err = sock_cmsg_send(sk, msg, &sockc); if (err) return err; } lock_sock(sk); err = bt_sock_wait_ready(sk, msg->msg_flags); release_sock(sk); if (err) return err; l2cap_chan_lock(chan); err = l2cap_chan_send(chan, msg, len, &sockc); l2cap_chan_unlock(chan); return err; } static void l2cap_publish_rx_avail(struct l2cap_chan *chan) { struct sock *sk = chan->data; ssize_t avail = sk->sk_rcvbuf - atomic_read(&sk->sk_rmem_alloc); int expected_skbs, skb_overhead; if (avail <= 0) { l2cap_chan_rx_avail(chan, 0); return; } if (!chan->mps) { l2cap_chan_rx_avail(chan, -1); return; } /* Correct available memory by estimated sk_buff overhead. * This is significant due to small transfer sizes. However, accept * at least one full packet if receive space is non-zero. */ expected_skbs = DIV_ROUND_UP(avail, chan->mps); skb_overhead = expected_skbs * sizeof(struct sk_buff); if (skb_overhead < avail) l2cap_chan_rx_avail(chan, avail - skb_overhead); else l2cap_chan_rx_avail(chan, -1); } static int l2cap_sock_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int flags) { struct sock *sk = sock->sk; struct l2cap_pinfo *pi = l2cap_pi(sk); int err; if (unlikely(flags & MSG_ERRQUEUE)) return sock_recv_errqueue(sk, msg, len, SOL_BLUETOOTH, BT_SCM_ERROR); lock_sock(sk); if (sk->sk_state == BT_CONNECT2 && test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags)) { if (pi->chan->mode == L2CAP_MODE_EXT_FLOWCTL) { sk->sk_state = BT_CONNECTED; pi->chan->state = BT_CONNECTED; __l2cap_ecred_conn_rsp_defer(pi->chan); } else if (bdaddr_type_is_le(pi->chan->src_type)) { sk->sk_state = BT_CONNECTED; pi->chan->state = BT_CONNECTED; __l2cap_le_connect_rsp_defer(pi->chan); } else { sk->sk_state = BT_CONFIG; pi->chan->state = BT_CONFIG; __l2cap_connect_rsp_defer(pi->chan); } err = 0; goto done; } release_sock(sk); if (sock->type == SOCK_STREAM) err = bt_sock_stream_recvmsg(sock, msg, len, flags); else err = bt_sock_recvmsg(sock, msg, len, flags); if (pi->chan->mode != L2CAP_MODE_ERTM && pi->chan->mode != L2CAP_MODE_LE_FLOWCTL && pi->chan->mode != L2CAP_MODE_EXT_FLOWCTL) return err; lock_sock(sk); l2cap_publish_rx_avail(pi->chan); /* Attempt to put pending rx data in the socket buffer */ while (!list_empty(&pi->rx_busy)) { struct l2cap_rx_busy *rx_busy = list_first_entry(&pi->rx_busy, struct l2cap_rx_busy, list); if (__sock_queue_rcv_skb(sk, rx_busy->skb) < 0) goto done; list_del(&rx_busy->list); kfree(rx_busy); } /* Restore data flow when half of the receive buffer is * available. This avoids resending large numbers of * frames. */ if (test_bit(CONN_LOCAL_BUSY, &pi->chan->conn_state) && atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf >> 1) l2cap_chan_busy(pi->chan, 0); done: release_sock(sk); return err; } /* Kill socket (only if zapped and orphan) * Must be called on unlocked socket, with l2cap channel lock. */ static void l2cap_sock_kill(struct sock *sk) { if (!sock_flag(sk, SOCK_ZAPPED) || sk->sk_socket) return; BT_DBG("sk %p state %s", sk, state_to_string(sk->sk_state)); /* Sock is dead, so set chan data to NULL, avoid other task use invalid * sock pointer. */ l2cap_pi(sk)->chan->data = NULL; /* Kill poor orphan */ l2cap_chan_put(l2cap_pi(sk)->chan); sock_set_flag(sk, SOCK_DEAD); sock_put(sk); } static int __l2cap_wait_ack(struct sock *sk, struct l2cap_chan *chan) { DECLARE_WAITQUEUE(wait, current); int err = 0; int timeo = L2CAP_WAIT_ACK_POLL_PERIOD; /* Timeout to prevent infinite loop */ unsigned long timeout = jiffies + L2CAP_WAIT_ACK_TIMEOUT; add_wait_queue(sk_sleep(sk), &wait); set_current_state(TASK_INTERRUPTIBLE); do { BT_DBG("Waiting for %d ACKs, timeout %04d ms", chan->unacked_frames, time_after(jiffies, timeout) ? 0 : jiffies_to_msecs(timeout - jiffies)); if (!timeo) timeo = L2CAP_WAIT_ACK_POLL_PERIOD; if (signal_pending(current)) { err = sock_intr_errno(timeo); break; } release_sock(sk); timeo = schedule_timeout(timeo); lock_sock(sk); set_current_state(TASK_INTERRUPTIBLE); err = sock_error(sk); if (err) break; if (time_after(jiffies, timeout)) { err = -ENOLINK; break; } } while (chan->unacked_frames > 0 && chan->state == BT_CONNECTED); set_current_state(TASK_RUNNING); remove_wait_queue(sk_sleep(sk), &wait); return err; } static int l2cap_sock_shutdown(struct socket *sock, int how) { struct sock *sk = sock->sk; struct l2cap_chan *chan; struct l2cap_conn *conn; int err = 0; BT_DBG("sock %p, sk %p, how %d", sock, sk, how); /* 'how' parameter is mapped to sk_shutdown as follows: * SHUT_RD (0) --> RCV_SHUTDOWN (1) * SHUT_WR (1) --> SEND_SHUTDOWN (2) * SHUT_RDWR (2) --> SHUTDOWN_MASK (3) */ how++; if (!sk) return 0; lock_sock(sk); if ((sk->sk_shutdown & how) == how) goto shutdown_already; BT_DBG("Handling sock shutdown"); /* prevent sk structure from being freed whilst unlocked */ sock_hold(sk); /* prevent chan structure from being freed whilst unlocked */ chan = l2cap_chan_hold_unless_zero(l2cap_pi(sk)->chan); if (!chan) goto shutdown_already; BT_DBG("chan %p state %s", chan, state_to_string(chan->state)); if (chan->mode == L2CAP_MODE_ERTM && chan->unacked_frames > 0 && chan->state == BT_CONNECTED) { err = __l2cap_wait_ack(sk, chan); /* After waiting for ACKs, check whether shutdown * has already been actioned to close the L2CAP * link such as by l2cap_disconnection_req(). */ if ((sk->sk_shutdown & how) == how) goto shutdown_matched; } /* Try setting the RCV_SHUTDOWN bit, return early if SEND_SHUTDOWN * is already set */ if ((how & RCV_SHUTDOWN) && !(sk->sk_shutdown & RCV_SHUTDOWN)) { sk->sk_shutdown |= RCV_SHUTDOWN; if ((sk->sk_shutdown & how) == how) goto shutdown_matched; } sk->sk_shutdown |= SEND_SHUTDOWN; release_sock(sk); l2cap_chan_lock(chan); /* prevent conn structure from being freed */ conn = l2cap_conn_hold_unless_zero(chan->conn); l2cap_chan_unlock(chan); if (conn) /* mutex lock must be taken before l2cap_chan_lock() */ mutex_lock(&conn->lock); l2cap_chan_lock(chan); l2cap_chan_close(chan, 0); l2cap_chan_unlock(chan); if (conn) { mutex_unlock(&conn->lock); l2cap_conn_put(conn); } lock_sock(sk); if (sock_flag(sk, SOCK_LINGER) && sk->sk_lingertime && !(current->flags & PF_EXITING)) err = bt_sock_wait_state(sk, BT_CLOSED, sk->sk_lingertime); shutdown_matched: l2cap_chan_put(chan); sock_put(sk); shutdown_already: if (!err && sk->sk_err) err = -sk->sk_err; release_sock(sk); BT_DBG("Sock shutdown complete err: %d", err); return err; } static int l2cap_sock_release(struct socket *sock) { struct sock *sk = sock->sk; int err; struct l2cap_chan *chan; BT_DBG("sock %p, sk %p", sock, sk); if (!sk) return 0; l2cap_sock_cleanup_listen(sk); bt_sock_unlink(&l2cap_sk_list, sk); err = l2cap_sock_shutdown(sock, SHUT_RDWR); chan = l2cap_pi(sk)->chan; l2cap_chan_hold(chan); l2cap_chan_lock(chan); sock_orphan(sk); l2cap_sock_kill(sk); l2cap_chan_unlock(chan); l2cap_chan_put(chan); return err; } static void l2cap_sock_cleanup_listen(struct sock *parent) { struct sock *sk; BT_DBG("parent %p state %s", parent, state_to_string(parent->sk_state)); /* Close not yet accepted channels */ while ((sk = bt_accept_dequeue(parent, NULL))) { struct l2cap_chan *chan = l2cap_pi(sk)->chan; BT_DBG("child chan %p state %s", chan, state_to_string(chan->state)); l2cap_chan_hold(chan); l2cap_chan_lock(chan); __clear_chan_timer(chan); l2cap_chan_close(chan, ECONNRESET); l2cap_sock_kill(sk); l2cap_chan_unlock(chan); l2cap_chan_put(chan); } } static struct l2cap_chan *l2cap_sock_new_connection_cb(struct l2cap_chan *chan) { struct sock *sk, *parent = chan->data; lock_sock(parent); /* Check for backlog size */ if (sk_acceptq_is_full(parent)) { BT_DBG("backlog full %d", parent->sk_ack_backlog); release_sock(parent); return NULL; } sk = l2cap_sock_alloc(sock_net(parent), NULL, BTPROTO_L2CAP, GFP_ATOMIC, 0); if (!sk) { release_sock(parent); return NULL; } bt_sock_reclassify_lock(sk, BTPROTO_L2CAP); l2cap_sock_init(sk, parent); bt_accept_enqueue(parent, sk, false); release_sock(parent); return l2cap_pi(sk)->chan; } static int l2cap_sock_recv_cb(struct l2cap_chan *chan, struct sk_buff *skb) { struct sock *sk; struct l2cap_pinfo *pi; int err; sk = chan->data; if (!sk) return -ENXIO; pi = l2cap_pi(sk); lock_sock(sk); if (chan->mode == L2CAP_MODE_ERTM && !list_empty(&pi->rx_busy)) { err = -ENOMEM; goto done; } if (chan->mode != L2CAP_MODE_ERTM && chan->mode != L2CAP_MODE_STREAMING && chan->mode != L2CAP_MODE_LE_FLOWCTL && chan->mode != L2CAP_MODE_EXT_FLOWCTL) { /* Even if no filter is attached, we could potentially * get errors from security modules, etc. */ err = sk_filter(sk, skb); if (err) goto done; } err = __sock_queue_rcv_skb(sk, skb); l2cap_publish_rx_avail(chan); /* For ERTM and LE, handle a skb that doesn't fit into the recv * buffer. This is important to do because the data frames * have already been acked, so the skb cannot be discarded. * * Notify the l2cap core that the buffer is full, so the * LOCAL_BUSY state is entered and no more frames are * acked and reassembled until there is buffer space * available. */ if (err < 0 && (chan->mode == L2CAP_MODE_ERTM || chan->mode == L2CAP_MODE_LE_FLOWCTL || chan->mode == L2CAP_MODE_EXT_FLOWCTL)) { struct l2cap_rx_busy *rx_busy = kmalloc(sizeof(*rx_busy), GFP_KERNEL); if (!rx_busy) { err = -ENOMEM; goto done; } rx_busy->skb = skb; list_add_tail(&rx_busy->list, &pi->rx_busy); l2cap_chan_busy(chan, 1); err = 0; } done: release_sock(sk); return err; } static void l2cap_sock_close_cb(struct l2cap_chan *chan) { struct sock *sk = chan->data; if (!sk) return; l2cap_sock_kill(sk); } static void l2cap_sock_teardown_cb(struct l2cap_chan *chan, int err) { struct sock *sk = chan->data; struct sock *parent; if (!sk) return; BT_DBG("chan %p state %s", chan, state_to_string(chan->state)); /* This callback can be called both for server (BT_LISTEN) * sockets as well as "normal" ones. To avoid lockdep warnings * with child socket locking (through l2cap_sock_cleanup_listen) * we need separation into separate nesting levels. The simplest * way to accomplish this is to inherit the nesting level used * for the channel. */ lock_sock_nested(sk, atomic_read(&chan->nesting)); parent = bt_sk(sk)->parent; switch (chan->state) { case BT_OPEN: case BT_BOUND: case BT_CLOSED: break; case BT_LISTEN: l2cap_sock_cleanup_listen(sk); sk->sk_state = BT_CLOSED; chan->state = BT_CLOSED; break; default: sk->sk_state = BT_CLOSED; chan->state = BT_CLOSED; sk->sk_err = err; if (parent) { bt_accept_unlink(sk); parent->sk_data_ready(parent); } else { sk->sk_state_change(sk); } break; } release_sock(sk); /* Only zap after cleanup to avoid use after free race */ sock_set_flag(sk, SOCK_ZAPPED); } static void l2cap_sock_state_change_cb(struct l2cap_chan *chan, int state, int err) { struct sock *sk = chan->data; sk->sk_state = state; if (err) sk->sk_err = err; } static struct sk_buff *l2cap_sock_alloc_skb_cb(struct l2cap_chan *chan, unsigned long hdr_len, unsigned long len, int nb) { struct sock *sk = chan->data; struct sk_buff *skb; int err; l2cap_chan_unlock(chan); skb = bt_skb_send_alloc(sk, hdr_len + len, nb, &err); l2cap_chan_lock(chan); if (!skb) return ERR_PTR(err); /* Channel lock is released before requesting new skb and then * reacquired thus we need to recheck channel state. */ if (chan->state != BT_CONNECTED) { kfree_skb(skb); return ERR_PTR(-ENOTCONN); } skb->priority = READ_ONCE(sk->sk_priority); bt_cb(skb)->l2cap.chan = chan; return skb; } static void l2cap_sock_ready_cb(struct l2cap_chan *chan) { struct sock *sk = chan->data; struct sock *parent; lock_sock(sk); parent = bt_sk(sk)->parent; BT_DBG("sk %p, parent %p", sk, parent); sk->sk_state = BT_CONNECTED; sk->sk_state_change(sk); if (parent) parent->sk_data_ready(parent); release_sock(sk); } static void l2cap_sock_defer_cb(struct l2cap_chan *chan) { struct sock *parent, *sk = chan->data; lock_sock(sk); parent = bt_sk(sk)->parent; if (parent) parent->sk_data_ready(parent); release_sock(sk); } static void l2cap_sock_resume_cb(struct l2cap_chan *chan) { struct sock *sk = chan->data; if (test_and_clear_bit(FLAG_PENDING_SECURITY, &chan->flags)) { sk->sk_state = BT_CONNECTED; chan->state = BT_CONNECTED; } clear_bit(BT_SK_SUSPEND, &bt_sk(sk)->flags); sk->sk_state_change(sk); } static void l2cap_sock_set_shutdown_cb(struct l2cap_chan *chan) { struct sock *sk = chan->data; lock_sock(sk); sk->sk_shutdown = SHUTDOWN_MASK; release_sock(sk); } static long l2cap_sock_get_sndtimeo_cb(struct l2cap_chan *chan) { struct sock *sk = chan->data; return sk->sk_sndtimeo; } static struct pid *l2cap_sock_get_peer_pid_cb(struct l2cap_chan *chan) { struct sock *sk = chan->data; return sk->sk_peer_pid; } static void l2cap_sock_suspend_cb(struct l2cap_chan *chan) { struct sock *sk = chan->data; set_bit(BT_SK_SUSPEND, &bt_sk(sk)->flags); sk->sk_state_change(sk); } static int l2cap_sock_filter(struct l2cap_chan *chan, struct sk_buff *skb) { struct sock *sk = chan->data; switch (chan->mode) { case L2CAP_MODE_ERTM: case L2CAP_MODE_STREAMING: return sk_filter(sk, skb); } return 0; } static const struct l2cap_ops l2cap_chan_ops = { .name = "L2CAP Socket Interface", .new_connection = l2cap_sock_new_connection_cb, .recv = l2cap_sock_recv_cb, .close = l2cap_sock_close_cb, .teardown = l2cap_sock_teardown_cb, .state_change = l2cap_sock_state_change_cb, .ready = l2cap_sock_ready_cb, .defer = l2cap_sock_defer_cb, .resume = l2cap_sock_resume_cb, .suspend = l2cap_sock_suspend_cb, .set_shutdown = l2cap_sock_set_shutdown_cb, .get_sndtimeo = l2cap_sock_get_sndtimeo_cb, .get_peer_pid = l2cap_sock_get_peer_pid_cb, .alloc_skb = l2cap_sock_alloc_skb_cb, .filter = l2cap_sock_filter, }; static void l2cap_sock_destruct(struct sock *sk) { struct l2cap_rx_busy *rx_busy, *next; BT_DBG("sk %p", sk); if (l2cap_pi(sk)->chan) { l2cap_pi(sk)->chan->data = NULL; l2cap_chan_put(l2cap_pi(sk)->chan); } list_for_each_entry_safe(rx_busy, next, &l2cap_pi(sk)->rx_busy, list) { kfree_skb(rx_busy->skb); list_del(&rx_busy->list); kfree(rx_busy); } skb_queue_purge(&sk->sk_receive_queue); skb_queue_purge(&sk->sk_write_queue); } static void l2cap_skb_msg_name(struct sk_buff *skb, void *msg_name, int *msg_namelen) { DECLARE_SOCKADDR(struct sockaddr_l2 *, la, msg_name); memset(la, 0, sizeof(struct sockaddr_l2)); la->l2_family = AF_BLUETOOTH; la->l2_psm = bt_cb(skb)->l2cap.psm; bacpy(&la->l2_bdaddr, &bt_cb(skb)->l2cap.bdaddr); *msg_namelen = sizeof(struct sockaddr_l2); } static void l2cap_sock_init(struct sock *sk, struct sock *parent) { struct l2cap_chan *chan = l2cap_pi(sk)->chan; BT_DBG("sk %p", sk); if (parent) { struct l2cap_chan *pchan = l2cap_pi(parent)->chan; sk->sk_type = parent->sk_type; bt_sk(sk)->flags = bt_sk(parent)->flags; chan->chan_type = pchan->chan_type; chan->imtu = pchan->imtu; chan->omtu = pchan->omtu; chan->conf_state = pchan->conf_state; chan->mode = pchan->mode; chan->fcs = pchan->fcs; chan->max_tx = pchan->max_tx; chan->tx_win = pchan->tx_win; chan->tx_win_max = pchan->tx_win_max; chan->sec_level = pchan->sec_level; chan->flags = pchan->flags; chan->tx_credits = pchan->tx_credits; chan->rx_credits = pchan->rx_credits; if (chan->chan_type == L2CAP_CHAN_FIXED) { chan->scid = pchan->scid; chan->dcid = pchan->scid; } security_sk_clone(parent, sk); } else { switch (sk->sk_type) { case SOCK_RAW: chan->chan_type = L2CAP_CHAN_RAW; break; case SOCK_DGRAM: chan->chan_type = L2CAP_CHAN_CONN_LESS; bt_sk(sk)->skb_msg_name = l2cap_skb_msg_name; break; case SOCK_SEQPACKET: case SOCK_STREAM: chan->chan_type = L2CAP_CHAN_CONN_ORIENTED; break; } chan->imtu = L2CAP_DEFAULT_MTU; chan->omtu = 0; if (!disable_ertm && sk->sk_type == SOCK_STREAM) { chan->mode = L2CAP_MODE_ERTM; set_bit(CONF_STATE2_DEVICE, &chan->conf_state); } else { chan->mode = L2CAP_MODE_BASIC; } l2cap_chan_set_defaults(chan); } /* Default config options */ chan->flush_to = L2CAP_DEFAULT_FLUSH_TO; chan->data = sk; chan->ops = &l2cap_chan_ops; l2cap_publish_rx_avail(chan); } static struct proto l2cap_proto = { .name = "L2CAP", .owner = THIS_MODULE, .obj_size = sizeof(struct l2cap_pinfo) }; static struct sock *l2cap_sock_alloc(struct net *net, struct socket *sock, int proto, gfp_t prio, int kern) { struct sock *sk; struct l2cap_chan *chan; sk = bt_sock_alloc(net, sock, &l2cap_proto, proto, prio, kern); if (!sk) return NULL; sk->sk_destruct = l2cap_sock_destruct; sk->sk_sndtimeo = L2CAP_CONN_TIMEOUT; INIT_LIST_HEAD(&l2cap_pi(sk)->rx_busy); chan = l2cap_chan_create(); if (!chan) { sk_free(sk); if (sock) sock->sk = NULL; return NULL; } l2cap_chan_hold(chan); l2cap_pi(sk)->chan = chan; return sk; } static int l2cap_sock_create(struct net *net, struct socket *sock, int protocol, int kern) { struct sock *sk; BT_DBG("sock %p", sock); sock->state = SS_UNCONNECTED; if (sock->type != SOCK_SEQPACKET && sock->type != SOCK_STREAM && sock->type != SOCK_DGRAM && sock->type != SOCK_RAW) return -ESOCKTNOSUPPORT; if (sock->type == SOCK_RAW && !kern && !capable(CAP_NET_RAW)) return -EPERM; sock->ops = &l2cap_sock_ops; sk = l2cap_sock_alloc(net, sock, protocol, GFP_ATOMIC, kern); if (!sk) return -ENOMEM; l2cap_sock_init(sk, NULL); bt_sock_link(&l2cap_sk_list, sk); return 0; } static const struct proto_ops l2cap_sock_ops = { .family = PF_BLUETOOTH, .owner = THIS_MODULE, .release = l2cap_sock_release, .bind = l2cap_sock_bind, .connect = l2cap_sock_connect, .listen = l2cap_sock_listen, .accept = l2cap_sock_accept, .getname = l2cap_sock_getname, .sendmsg = l2cap_sock_sendmsg, .recvmsg = l2cap_sock_recvmsg, .poll = bt_sock_poll, .ioctl = bt_sock_ioctl, .gettstamp = sock_gettstamp, .mmap = sock_no_mmap, .socketpair = sock_no_socketpair, .shutdown = l2cap_sock_shutdown, .setsockopt = l2cap_sock_setsockopt, .getsockopt = l2cap_sock_getsockopt }; static const struct net_proto_family l2cap_sock_family_ops = { .family = PF_BLUETOOTH, .owner = THIS_MODULE, .create = l2cap_sock_create, }; int __init l2cap_init_sockets(void) { int err; BUILD_BUG_ON(sizeof(struct sockaddr_l2) > sizeof(struct sockaddr)); err = proto_register(&l2cap_proto, 0); if (err < 0) return err; err = bt_sock_register(BTPROTO_L2CAP, &l2cap_sock_family_ops); if (err < 0) { BT_ERR("L2CAP socket registration failed"); goto error; } err = bt_procfs_init(&init_net, "l2cap", &l2cap_sk_list, NULL); if (err < 0) { BT_ERR("Failed to create L2CAP proc file"); bt_sock_unregister(BTPROTO_L2CAP); goto error; } BT_INFO("L2CAP socket layer initialized"); return 0; error: proto_unregister(&l2cap_proto); return err; } void l2cap_cleanup_sockets(void) { bt_procfs_cleanup(&init_net, "l2cap"); bt_sock_unregister(BTPROTO_L2CAP); proto_unregister(&l2cap_proto); }
40 1 3 4 1 28 3 25 5 5 1 1 1 1 1 4 1 2 1 7 4 3 2 2 2 2 2 2 58 32 27 1 5 4 2 9 2 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2008 Patrick McHardy <kaber@trash.net> * * Development of this code funded by Astaro AG (http://www.astaro.com/) */ #include <linux/unaligned.h> #include <linux/kernel.h> #include <linux/netlink.h> #include <linux/netfilter.h> #include <linux/netfilter/nf_tables.h> #include <linux/dccp.h> #include <linux/sctp.h> #include <net/netfilter/nf_tables_core.h> #include <net/netfilter/nf_tables.h> #include <net/tcp.h> struct nft_exthdr { u8 type; u8 offset; u8 len; u8 op; u8 dreg; u8 sreg; u8 flags; }; static unsigned int optlen(const u8 *opt, unsigned int offset) { /* Beware zero-length options: make finite progress */ if (opt[offset] <= TCPOPT_NOP || opt[offset + 1] == 0) return 1; else return opt[offset + 1]; } static int nft_skb_copy_to_reg(const struct sk_buff *skb, int offset, u32 *dest, unsigned int len) { if (len % NFT_REG32_SIZE) dest[len / NFT_REG32_SIZE] = 0; return skb_copy_bits(skb, offset, dest, len); } static void nft_exthdr_ipv6_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { struct nft_exthdr *priv = nft_expr_priv(expr); u32 *dest = &regs->data[priv->dreg]; unsigned int offset = 0; int err; if (pkt->skb->protocol != htons(ETH_P_IPV6)) goto err; err = ipv6_find_hdr(pkt->skb, &offset, priv->type, NULL, NULL); if (priv->flags & NFT_EXTHDR_F_PRESENT) { nft_reg_store8(dest, err >= 0); return; } else if (err < 0) { goto err; } offset += priv->offset; if (nft_skb_copy_to_reg(pkt->skb, offset, dest, priv->len) < 0) goto err; return; err: regs->verdict.code = NFT_BREAK; } /* find the offset to specified option. * * If target header is found, its offset is set in *offset and return option * number. Otherwise, return negative error. * * If the first fragment doesn't contain the End of Options it is considered * invalid. */ static int ipv4_find_option(struct net *net, struct sk_buff *skb, unsigned int *offset, int target) { unsigned char optbuf[sizeof(struct ip_options) + 40]; struct ip_options *opt = (struct ip_options *)optbuf; struct iphdr *iph, _iph; bool found = false; __be32 info; int optlen; iph = skb_header_pointer(skb, 0, sizeof(_iph), &_iph); if (!iph) return -EBADMSG; optlen = iph->ihl * 4 - (int)sizeof(struct iphdr); if (optlen <= 0) return -ENOENT; memset(opt, 0, sizeof(struct ip_options)); /* Copy the options since __ip_options_compile() modifies * the options. */ if (skb_copy_bits(skb, sizeof(struct iphdr), opt->__data, optlen)) return -EBADMSG; opt->optlen = optlen; if (__ip_options_compile(net, opt, NULL, &info)) return -EBADMSG; switch (target) { case IPOPT_SSRR: case IPOPT_LSRR: if (!opt->srr) break; found = target == IPOPT_SSRR ? opt->is_strictroute : !opt->is_strictroute; if (found) *offset = opt->srr; break; case IPOPT_RR: if (!opt->rr) break; *offset = opt->rr; found = true; break; case IPOPT_RA: if (!opt->router_alert) break; *offset = opt->router_alert; found = true; break; default: return -EOPNOTSUPP; } return found ? target : -ENOENT; } static void nft_exthdr_ipv4_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { struct nft_exthdr *priv = nft_expr_priv(expr); u32 *dest = &regs->data[priv->dreg]; struct sk_buff *skb = pkt->skb; unsigned int offset; int err; if (skb->protocol != htons(ETH_P_IP)) goto err; err = ipv4_find_option(nft_net(pkt), skb, &offset, priv->type); if (priv->flags & NFT_EXTHDR_F_PRESENT) { nft_reg_store8(dest, err >= 0); return; } else if (err < 0) { goto err; } offset += priv->offset; if (nft_skb_copy_to_reg(pkt->skb, offset, dest, priv->len) < 0) goto err; return; err: regs->verdict.code = NFT_BREAK; } static void * nft_tcp_header_pointer(const struct nft_pktinfo *pkt, unsigned int len, void *buffer, unsigned int *tcphdr_len) { struct tcphdr *tcph; if (pkt->tprot != IPPROTO_TCP || pkt->fragoff) return NULL; tcph = skb_header_pointer(pkt->skb, nft_thoff(pkt), sizeof(*tcph), buffer); if (!tcph) return NULL; *tcphdr_len = __tcp_hdrlen(tcph); if (*tcphdr_len < sizeof(*tcph) || *tcphdr_len > len) return NULL; return skb_header_pointer(pkt->skb, nft_thoff(pkt), *tcphdr_len, buffer); } static void nft_exthdr_tcp_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { u8 buff[sizeof(struct tcphdr) + MAX_TCP_OPTION_SPACE]; struct nft_exthdr *priv = nft_expr_priv(expr); unsigned int i, optl, tcphdr_len, offset; u32 *dest = &regs->data[priv->dreg]; struct tcphdr *tcph; u8 *opt; tcph = nft_tcp_header_pointer(pkt, sizeof(buff), buff, &tcphdr_len); if (!tcph) goto err; opt = (u8 *)tcph; for (i = sizeof(*tcph); i < tcphdr_len - 1; i += optl) { optl = optlen(opt, i); if (priv->type != opt[i]) continue; if (i + optl > tcphdr_len || priv->len + priv->offset > optl) goto err; offset = i + priv->offset; if (priv->flags & NFT_EXTHDR_F_PRESENT) { nft_reg_store8(dest, 1); } else { if (priv->len % NFT_REG32_SIZE) dest[priv->len / NFT_REG32_SIZE] = 0; memcpy(dest, opt + offset, priv->len); } return; } err: if (priv->flags & NFT_EXTHDR_F_PRESENT) *dest = 0; else regs->verdict.code = NFT_BREAK; } static void nft_exthdr_tcp_set_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { u8 buff[sizeof(struct tcphdr) + MAX_TCP_OPTION_SPACE]; struct nft_exthdr *priv = nft_expr_priv(expr); unsigned int i, optl, tcphdr_len, offset; struct tcphdr *tcph; u8 *opt; tcph = nft_tcp_header_pointer(pkt, sizeof(buff), buff, &tcphdr_len); if (!tcph) goto err; if (skb_ensure_writable(pkt->skb, nft_thoff(pkt) + tcphdr_len)) goto err; tcph = (struct tcphdr *)(pkt->skb->data + nft_thoff(pkt)); opt = (u8 *)tcph; for (i = sizeof(*tcph); i < tcphdr_len - 1; i += optl) { union { __be16 v16; __be32 v32; } old, new; optl = optlen(opt, i); if (priv->type != opt[i]) continue; if (i + optl > tcphdr_len || priv->len + priv->offset > optl) goto err; offset = i + priv->offset; switch (priv->len) { case 2: old.v16 = (__force __be16)get_unaligned((u16 *)(opt + offset)); new.v16 = (__force __be16)nft_reg_load16( &regs->data[priv->sreg]); switch (priv->type) { case TCPOPT_MSS: /* increase can cause connection to stall */ if (ntohs(old.v16) <= ntohs(new.v16)) return; break; } if (old.v16 == new.v16) return; put_unaligned(new.v16, (__be16*)(opt + offset)); inet_proto_csum_replace2(&tcph->check, pkt->skb, old.v16, new.v16, false); break; case 4: new.v32 = nft_reg_load_be32(&regs->data[priv->sreg]); old.v32 = (__force __be32)get_unaligned((u32 *)(opt + offset)); if (old.v32 == new.v32) return; put_unaligned(new.v32, (__be32*)(opt + offset)); inet_proto_csum_replace4(&tcph->check, pkt->skb, old.v32, new.v32, false); break; default: WARN_ON_ONCE(1); break; } return; } return; err: regs->verdict.code = NFT_BREAK; } static void nft_exthdr_tcp_strip_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { u8 buff[sizeof(struct tcphdr) + MAX_TCP_OPTION_SPACE]; struct nft_exthdr *priv = nft_expr_priv(expr); unsigned int i, tcphdr_len, optl; struct tcphdr *tcph; u8 *opt; tcph = nft_tcp_header_pointer(pkt, sizeof(buff), buff, &tcphdr_len); if (!tcph) goto err; if (skb_ensure_writable(pkt->skb, nft_thoff(pkt) + tcphdr_len)) goto drop; tcph = (struct tcphdr *)(pkt->skb->data + nft_thoff(pkt)); opt = (u8 *)tcph; for (i = sizeof(*tcph); i < tcphdr_len - 1; i += optl) { unsigned int j; optl = optlen(opt, i); if (priv->type != opt[i]) continue; if (i + optl > tcphdr_len) goto drop; for (j = 0; j < optl; ++j) { u16 n = TCPOPT_NOP; u16 o = opt[i+j]; if ((i + j) % 2 == 0) { o <<= 8; n <<= 8; } inet_proto_csum_replace2(&tcph->check, pkt->skb, htons(o), htons(n), false); } memset(opt + i, TCPOPT_NOP, optl); return; } /* option not found, continue. This allows to do multiple * option removals per rule. */ return; err: regs->verdict.code = NFT_BREAK; return; drop: /* can't remove, no choice but to drop */ regs->verdict.code = NF_DROP; } static void nft_exthdr_sctp_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { unsigned int offset = nft_thoff(pkt) + sizeof(struct sctphdr); struct nft_exthdr *priv = nft_expr_priv(expr); u32 *dest = &regs->data[priv->dreg]; const struct sctp_chunkhdr *sch; struct sctp_chunkhdr _sch; if (pkt->tprot != IPPROTO_SCTP) goto err; do { sch = skb_header_pointer(pkt->skb, offset, sizeof(_sch), &_sch); if (!sch || !sch->length) break; if (sch->type == priv->type) { if (priv->flags & NFT_EXTHDR_F_PRESENT) { nft_reg_store8(dest, true); return; } if (priv->offset + priv->len > ntohs(sch->length) || offset + ntohs(sch->length) > pkt->skb->len) break; if (nft_skb_copy_to_reg(pkt->skb, offset + priv->offset, dest, priv->len) < 0) break; return; } offset += SCTP_PAD4(ntohs(sch->length)); } while (offset < pkt->skb->len); err: if (priv->flags & NFT_EXTHDR_F_PRESENT) nft_reg_store8(dest, false); else regs->verdict.code = NFT_BREAK; } static void nft_exthdr_dccp_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { struct nft_exthdr *priv = nft_expr_priv(expr); unsigned int thoff, dataoff, optoff, optlen, i; u32 *dest = &regs->data[priv->dreg]; const struct dccp_hdr *dh; struct dccp_hdr _dh; if (pkt->tprot != IPPROTO_DCCP || pkt->fragoff) goto err; thoff = nft_thoff(pkt); dh = skb_header_pointer(pkt->skb, thoff, sizeof(_dh), &_dh); if (!dh) goto err; dataoff = dh->dccph_doff * sizeof(u32); optoff = __dccp_hdr_len(dh); if (dataoff <= optoff) goto err; optlen = dataoff - optoff; for (i = 0; i < optlen; ) { /* Options 0 (DCCPO_PADDING) - 31 (DCCPO_MAX_RESERVED) are 1B in * the length; the remaining options are at least 2B long. In * all cases, the first byte contains the option type. In * multi-byte options, the second byte contains the option * length, which must be at least two: 1 for the type plus 1 for * the length plus 0-253 for any following option data. We * aren't interested in the option data, only the type and the * length, so we don't need to read more than two bytes at a * time. */ unsigned int buflen = optlen - i; u8 buf[2], *bufp; u8 type, len; if (buflen > sizeof(buf)) buflen = sizeof(buf); bufp = skb_header_pointer(pkt->skb, thoff + optoff + i, buflen, &buf); if (!bufp) goto err; type = bufp[0]; if (type == priv->type) { nft_reg_store8(dest, 1); return; } if (type <= DCCPO_MAX_RESERVED) { i++; continue; } if (buflen < 2) goto err; len = bufp[1]; if (len < 2) goto err; i += len; } err: *dest = 0; } static const struct nla_policy nft_exthdr_policy[NFTA_EXTHDR_MAX + 1] = { [NFTA_EXTHDR_DREG] = { .type = NLA_U32 }, [NFTA_EXTHDR_TYPE] = { .type = NLA_U8 }, [NFTA_EXTHDR_OFFSET] = { .type = NLA_U32 }, [NFTA_EXTHDR_LEN] = NLA_POLICY_MAX(NLA_BE32, 255), [NFTA_EXTHDR_FLAGS] = { .type = NLA_U32 }, [NFTA_EXTHDR_OP] = NLA_POLICY_MAX(NLA_BE32, 255), [NFTA_EXTHDR_SREG] = { .type = NLA_U32 }, }; static int nft_exthdr_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { struct nft_exthdr *priv = nft_expr_priv(expr); u32 offset, len, flags = 0, op = NFT_EXTHDR_OP_IPV6; int err; if (!tb[NFTA_EXTHDR_DREG] || !tb[NFTA_EXTHDR_TYPE] || !tb[NFTA_EXTHDR_OFFSET] || !tb[NFTA_EXTHDR_LEN]) return -EINVAL; err = nft_parse_u32_check(tb[NFTA_EXTHDR_OFFSET], U8_MAX, &offset); if (err < 0) return err; err = nft_parse_u32_check(tb[NFTA_EXTHDR_LEN], U8_MAX, &len); if (err < 0) return err; if (tb[NFTA_EXTHDR_FLAGS]) { err = nft_parse_u32_check(tb[NFTA_EXTHDR_FLAGS], U8_MAX, &flags); if (err < 0) return err; if (flags & ~NFT_EXTHDR_F_PRESENT) return -EINVAL; } if (tb[NFTA_EXTHDR_OP]) { err = nft_parse_u32_check(tb[NFTA_EXTHDR_OP], U8_MAX, &op); if (err < 0) return err; } priv->type = nla_get_u8(tb[NFTA_EXTHDR_TYPE]); priv->offset = offset; priv->len = len; priv->flags = flags; priv->op = op; return nft_parse_register_store(ctx, tb[NFTA_EXTHDR_DREG], &priv->dreg, NULL, NFT_DATA_VALUE, priv->len); } static int nft_exthdr_tcp_set_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { struct nft_exthdr *priv = nft_expr_priv(expr); u32 offset, len, flags = 0, op = NFT_EXTHDR_OP_IPV6; int err; if (!tb[NFTA_EXTHDR_SREG] || !tb[NFTA_EXTHDR_TYPE] || !tb[NFTA_EXTHDR_OFFSET] || !tb[NFTA_EXTHDR_LEN]) return -EINVAL; if (tb[NFTA_EXTHDR_DREG] || tb[NFTA_EXTHDR_FLAGS]) return -EINVAL; err = nft_parse_u32_check(tb[NFTA_EXTHDR_OFFSET], U8_MAX, &offset); if (err < 0) return err; err = nft_parse_u32_check(tb[NFTA_EXTHDR_LEN], U8_MAX, &len); if (err < 0) return err; if (offset < 2) return -EOPNOTSUPP; switch (len) { case 2: break; case 4: break; default: return -EOPNOTSUPP; } err = nft_parse_u32_check(tb[NFTA_EXTHDR_OP], U8_MAX, &op); if (err < 0) return err; priv->type = nla_get_u8(tb[NFTA_EXTHDR_TYPE]); priv->offset = offset; priv->len = len; priv->flags = flags; priv->op = op; return nft_parse_register_load(ctx, tb[NFTA_EXTHDR_SREG], &priv->sreg, priv->len); } static int nft_exthdr_tcp_strip_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { struct nft_exthdr *priv = nft_expr_priv(expr); if (tb[NFTA_EXTHDR_SREG] || tb[NFTA_EXTHDR_DREG] || tb[NFTA_EXTHDR_FLAGS] || tb[NFTA_EXTHDR_OFFSET] || tb[NFTA_EXTHDR_LEN]) return -EINVAL; if (!tb[NFTA_EXTHDR_TYPE]) return -EINVAL; priv->type = nla_get_u8(tb[NFTA_EXTHDR_TYPE]); priv->op = NFT_EXTHDR_OP_TCPOPT; return 0; } static int nft_exthdr_ipv4_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { struct nft_exthdr *priv = nft_expr_priv(expr); int err = nft_exthdr_init(ctx, expr, tb); if (err < 0) return err; switch (priv->type) { case IPOPT_SSRR: case IPOPT_LSRR: case IPOPT_RR: case IPOPT_RA: break; default: return -EOPNOTSUPP; } return 0; } static int nft_exthdr_dccp_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { struct nft_exthdr *priv = nft_expr_priv(expr); int err = nft_exthdr_init(ctx, expr, tb); if (err < 0) return err; if (!(priv->flags & NFT_EXTHDR_F_PRESENT)) return -EOPNOTSUPP; return 0; } static int nft_exthdr_dump_common(struct sk_buff *skb, const struct nft_exthdr *priv) { if (nla_put_u8(skb, NFTA_EXTHDR_TYPE, priv->type)) goto nla_put_failure; if (nla_put_be32(skb, NFTA_EXTHDR_OFFSET, htonl(priv->offset))) goto nla_put_failure; if (nla_put_be32(skb, NFTA_EXTHDR_LEN, htonl(priv->len))) goto nla_put_failure; if (nla_put_be32(skb, NFTA_EXTHDR_FLAGS, htonl(priv->flags))) goto nla_put_failure; if (nla_put_be32(skb, NFTA_EXTHDR_OP, htonl(priv->op))) goto nla_put_failure; return 0; nla_put_failure: return -1; } static int nft_exthdr_dump(struct sk_buff *skb, const struct nft_expr *expr, bool reset) { const struct nft_exthdr *priv = nft_expr_priv(expr); if (nft_dump_register(skb, NFTA_EXTHDR_DREG, priv->dreg)) return -1; return nft_exthdr_dump_common(skb, priv); } static int nft_exthdr_dump_set(struct sk_buff *skb, const struct nft_expr *expr, bool reset) { const struct nft_exthdr *priv = nft_expr_priv(expr); if (nft_dump_register(skb, NFTA_EXTHDR_SREG, priv->sreg)) return -1; return nft_exthdr_dump_common(skb, priv); } static int nft_exthdr_dump_strip(struct sk_buff *skb, const struct nft_expr *expr, bool reset) { const struct nft_exthdr *priv = nft_expr_priv(expr); return nft_exthdr_dump_common(skb, priv); } static bool nft_exthdr_reduce(struct nft_regs_track *track, const struct nft_expr *expr) { const struct nft_exthdr *priv = nft_expr_priv(expr); const struct nft_exthdr *exthdr; if (!nft_reg_track_cmp(track, expr, priv->dreg)) { nft_reg_track_update(track, expr, priv->dreg, priv->len); return false; } exthdr = nft_expr_priv(track->regs[priv->dreg].selector); if (priv->type != exthdr->type || priv->op != exthdr->op || priv->flags != exthdr->flags || priv->offset != exthdr->offset || priv->len != exthdr->len) { nft_reg_track_update(track, expr, priv->dreg, priv->len); return false; } if (!track->regs[priv->dreg].bitwise) return true; return nft_expr_reduce_bitwise(track, expr); } static const struct nft_expr_ops nft_exthdr_ipv6_ops = { .type = &nft_exthdr_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_exthdr)), .eval = nft_exthdr_ipv6_eval, .init = nft_exthdr_init, .dump = nft_exthdr_dump, .reduce = nft_exthdr_reduce, }; static const struct nft_expr_ops nft_exthdr_ipv4_ops = { .type = &nft_exthdr_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_exthdr)), .eval = nft_exthdr_ipv4_eval, .init = nft_exthdr_ipv4_init, .dump = nft_exthdr_dump, .reduce = nft_exthdr_reduce, }; static const struct nft_expr_ops nft_exthdr_tcp_ops = { .type = &nft_exthdr_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_exthdr)), .eval = nft_exthdr_tcp_eval, .init = nft_exthdr_init, .dump = nft_exthdr_dump, .reduce = nft_exthdr_reduce, }; static const struct nft_expr_ops nft_exthdr_tcp_set_ops = { .type = &nft_exthdr_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_exthdr)), .eval = nft_exthdr_tcp_set_eval, .init = nft_exthdr_tcp_set_init, .dump = nft_exthdr_dump_set, .reduce = NFT_REDUCE_READONLY, }; static const struct nft_expr_ops nft_exthdr_tcp_strip_ops = { .type = &nft_exthdr_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_exthdr)), .eval = nft_exthdr_tcp_strip_eval, .init = nft_exthdr_tcp_strip_init, .dump = nft_exthdr_dump_strip, .reduce = NFT_REDUCE_READONLY, }; static const struct nft_expr_ops nft_exthdr_sctp_ops = { .type = &nft_exthdr_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_exthdr)), .eval = nft_exthdr_sctp_eval, .init = nft_exthdr_init, .dump = nft_exthdr_dump, .reduce = nft_exthdr_reduce, }; static const struct nft_expr_ops nft_exthdr_dccp_ops = { .type = &nft_exthdr_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_exthdr)), .eval = nft_exthdr_dccp_eval, .init = nft_exthdr_dccp_init, .dump = nft_exthdr_dump, .reduce = nft_exthdr_reduce, }; static const struct nft_expr_ops * nft_exthdr_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[]) { u32 op; if (!tb[NFTA_EXTHDR_OP]) return &nft_exthdr_ipv6_ops; if (tb[NFTA_EXTHDR_SREG] && tb[NFTA_EXTHDR_DREG]) return ERR_PTR(-EOPNOTSUPP); op = ntohl(nla_get_be32(tb[NFTA_EXTHDR_OP])); switch (op) { case NFT_EXTHDR_OP_TCPOPT: if (tb[NFTA_EXTHDR_SREG]) return &nft_exthdr_tcp_set_ops; if (tb[NFTA_EXTHDR_DREG]) return &nft_exthdr_tcp_ops; return &nft_exthdr_tcp_strip_ops; case NFT_EXTHDR_OP_IPV6: if (tb[NFTA_EXTHDR_DREG]) return &nft_exthdr_ipv6_ops; break; case NFT_EXTHDR_OP_IPV4: if (ctx->family != NFPROTO_IPV6) { if (tb[NFTA_EXTHDR_DREG]) return &nft_exthdr_ipv4_ops; } break; case NFT_EXTHDR_OP_SCTP: if (tb[NFTA_EXTHDR_DREG]) return &nft_exthdr_sctp_ops; break; case NFT_EXTHDR_OP_DCCP: if (tb[NFTA_EXTHDR_DREG]) return &nft_exthdr_dccp_ops; break; } return ERR_PTR(-EOPNOTSUPP); } struct nft_expr_type nft_exthdr_type __read_mostly = { .name = "exthdr", .select_ops = nft_exthdr_select_ops, .policy = nft_exthdr_policy, .maxattr = NFTA_EXTHDR_MAX, .owner = THIS_MODULE, };
63 63 63 63 63 63 63 63 63 63 63 63 63 63 63 63 63 63 63 63 63 63 63 63 63 63 63 63 63 62 63 63 63 63 63 63 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause /* Authors: Bernard Metzler <bmt@zurich.ibm.com> */ /* Fredy Neeser */ /* Greg Joyce <greg@opengridcomputing.com> */ /* Copyright (c) 2008-2019, IBM Corporation */ /* Copyright (c) 2017, Open Grid Computing, Inc. */ #include <linux/errno.h> #include <linux/types.h> #include <linux/net.h> #include <linux/inetdevice.h> #include <net/addrconf.h> #include <linux/workqueue.h> #include <net/sock.h> #include <net/tcp.h> #include <linux/inet.h> #include <linux/tcp.h> #include <trace/events/sock.h> #include <rdma/iw_cm.h> #include <rdma/ib_verbs.h> #include <rdma/ib_user_verbs.h> #include "siw.h" #include "siw_cm.h" /* * Set to any combination of * MPA_V2_RDMA_NO_RTR, MPA_V2_RDMA_READ_RTR, MPA_V2_RDMA_WRITE_RTR */ static __be16 rtr_type = MPA_V2_RDMA_READ_RTR | MPA_V2_RDMA_WRITE_RTR; static const bool relaxed_ird_negotiation = true; static void siw_cm_llp_state_change(struct sock *s); static void siw_cm_llp_data_ready(struct sock *s); static void siw_cm_llp_write_space(struct sock *s); static void siw_cm_llp_error_report(struct sock *s); static int siw_cm_upcall(struct siw_cep *cep, enum iw_cm_event_type reason, int status); static void siw_sk_assign_cm_upcalls(struct sock *sk) { struct siw_cep *cep = sk_to_cep(sk); write_lock_bh(&sk->sk_callback_lock); cep->sk_state_change = sk->sk_state_change; cep->sk_data_ready = sk->sk_data_ready; cep->sk_write_space = sk->sk_write_space; cep->sk_error_report = sk->sk_error_report; sk->sk_state_change = siw_cm_llp_state_change; sk->sk_data_ready = siw_cm_llp_data_ready; sk->sk_write_space = siw_cm_llp_write_space; sk->sk_error_report = siw_cm_llp_error_report; write_unlock_bh(&sk->sk_callback_lock); } static void siw_sk_restore_upcalls(struct sock *sk, struct siw_cep *cep) { sk->sk_state_change = cep->sk_state_change; sk->sk_data_ready = cep->sk_data_ready; sk->sk_write_space = cep->sk_write_space; sk->sk_error_report = cep->sk_error_report; sk->sk_user_data = NULL; } static void siw_qp_socket_assoc(struct siw_cep *cep, struct siw_qp *qp) { struct socket *s = cep->sock; struct sock *sk = s->sk; write_lock_bh(&sk->sk_callback_lock); qp->attrs.sk = s; sk->sk_data_ready = siw_qp_llp_data_ready; sk->sk_write_space = siw_qp_llp_write_space; write_unlock_bh(&sk->sk_callback_lock); } static void siw_socket_disassoc(struct socket *s) { struct sock *sk = s->sk; struct siw_cep *cep; if (sk) { write_lock_bh(&sk->sk_callback_lock); cep = sk_to_cep(sk); if (cep) { siw_sk_restore_upcalls(sk, cep); siw_cep_put(cep); } else { pr_warn("siw: cannot restore sk callbacks: no ep\n"); } write_unlock_bh(&sk->sk_callback_lock); } else { pr_warn("siw: cannot restore sk callbacks: no sk\n"); } } static void siw_rtr_data_ready(struct sock *sk) { struct siw_cep *cep; struct siw_qp *qp = NULL; read_descriptor_t rd_desc; trace_sk_data_ready(sk); read_lock(&sk->sk_callback_lock); cep = sk_to_cep(sk); if (!cep) { WARN(1, "No connection endpoint\n"); goto out; } qp = sk_to_qp(sk); memset(&rd_desc, 0, sizeof(rd_desc)); rd_desc.arg.data = qp; rd_desc.count = 1; tcp_read_sock(sk, &rd_desc, siw_tcp_rx_data); /* * Check if first frame was successfully processed. * Signal connection full establishment if yes. * Failed data processing would have already scheduled * connection drop. */ if (!qp->rx_stream.rx_suspend) siw_cm_upcall(cep, IW_CM_EVENT_ESTABLISHED, 0); out: read_unlock(&sk->sk_callback_lock); if (qp) siw_qp_socket_assoc(cep, qp); } static void siw_sk_assign_rtr_upcalls(struct siw_cep *cep) { struct sock *sk = cep->sock->sk; write_lock_bh(&sk->sk_callback_lock); sk->sk_data_ready = siw_rtr_data_ready; sk->sk_write_space = siw_qp_llp_write_space; write_unlock_bh(&sk->sk_callback_lock); } static void siw_cep_socket_assoc(struct siw_cep *cep, struct socket *s) { cep->sock = s; siw_cep_get(cep); s->sk->sk_user_data = cep; siw_sk_assign_cm_upcalls(s->sk); } static struct siw_cep *siw_cep_alloc(struct siw_device *sdev) { struct siw_cep *cep = kzalloc(sizeof(*cep), GFP_KERNEL); unsigned long flags; if (!cep) return NULL; INIT_LIST_HEAD(&cep->listenq); INIT_LIST_HEAD(&cep->devq); INIT_LIST_HEAD(&cep->work_freelist); kref_init(&cep->ref); cep->state = SIW_EPSTATE_IDLE; init_waitqueue_head(&cep->waitq); spin_lock_init(&cep->lock); cep->sdev = sdev; cep->enhanced_rdma_conn_est = false; spin_lock_irqsave(&sdev->lock, flags); list_add_tail(&cep->devq, &sdev->cep_list); spin_unlock_irqrestore(&sdev->lock, flags); siw_dbg_cep(cep, "new endpoint\n"); return cep; } static void siw_cm_free_work(struct siw_cep *cep) { struct list_head *w, *tmp; struct siw_cm_work *work; list_for_each_safe(w, tmp, &cep->work_freelist) { work = list_entry(w, struct siw_cm_work, list); list_del(&work->list); kfree(work); } } static void siw_cancel_mpatimer(struct siw_cep *cep) { spin_lock_bh(&cep->lock); if (cep->mpa_timer) { if (cancel_delayed_work(&cep->mpa_timer->work)) { siw_cep_put(cep); kfree(cep->mpa_timer); /* not needed again */ } cep->mpa_timer = NULL; } spin_unlock_bh(&cep->lock); } static void siw_put_work(struct siw_cm_work *work) { INIT_LIST_HEAD(&work->list); spin_lock_bh(&work->cep->lock); list_add(&work->list, &work->cep->work_freelist); spin_unlock_bh(&work->cep->lock); } static void siw_cep_set_inuse(struct siw_cep *cep) { unsigned long flags; retry: spin_lock_irqsave(&cep->lock, flags); if (cep->in_use) { spin_unlock_irqrestore(&cep->lock, flags); wait_event_interruptible(cep->waitq, !cep->in_use); if (signal_pending(current)) flush_signals(current); goto retry; } else { cep->in_use = 1; spin_unlock_irqrestore(&cep->lock, flags); } } static void siw_cep_set_free(struct siw_cep *cep) { unsigned long flags; spin_lock_irqsave(&cep->lock, flags); cep->in_use = 0; spin_unlock_irqrestore(&cep->lock, flags); wake_up(&cep->waitq); } static void __siw_cep_dealloc(struct kref *ref) { struct siw_cep *cep = container_of(ref, struct siw_cep, ref); struct siw_device *sdev = cep->sdev; unsigned long flags; WARN_ON(cep->listen_cep); /* kfree(NULL) is safe */ kfree(cep->mpa.pdata); spin_lock_bh(&cep->lock); if (!list_empty(&cep->work_freelist)) siw_cm_free_work(cep); spin_unlock_bh(&cep->lock); spin_lock_irqsave(&sdev->lock, flags); list_del(&cep->devq); spin_unlock_irqrestore(&sdev->lock, flags); siw_dbg_cep(cep, "free endpoint\n"); kfree(cep); } static struct siw_cm_work *siw_get_work(struct siw_cep *cep) { struct siw_cm_work *work = NULL; spin_lock_bh(&cep->lock); if (!list_empty(&cep->work_freelist)) { work = list_entry(cep->work_freelist.next, struct siw_cm_work, list); list_del_init(&work->list); } spin_unlock_bh(&cep->lock); return work; } static int siw_cm_alloc_work(struct siw_cep *cep, int num) { struct siw_cm_work *work; while (num--) { work = kmalloc(sizeof(*work), GFP_KERNEL); if (!work) { if (!(list_empty(&cep->work_freelist))) siw_cm_free_work(cep); return -ENOMEM; } work->cep = cep; INIT_LIST_HEAD(&work->list); list_add(&work->list, &cep->work_freelist); } return 0; } /* * siw_cm_upcall() * * Upcall to IWCM to inform about async connection events */ static int siw_cm_upcall(struct siw_cep *cep, enum iw_cm_event_type reason, int status) { struct iw_cm_event event; struct iw_cm_id *id; memset(&event, 0, sizeof(event)); event.status = status; event.event = reason; if (reason == IW_CM_EVENT_CONNECT_REQUEST) { event.provider_data = cep; id = cep->listen_cep->cm_id; } else { id = cep->cm_id; } /* Signal IRD and ORD */ if (reason == IW_CM_EVENT_ESTABLISHED || reason == IW_CM_EVENT_CONNECT_REPLY) { /* Signal negotiated IRD/ORD values we will use */ event.ird = cep->ird; event.ord = cep->ord; } else if (reason == IW_CM_EVENT_CONNECT_REQUEST) { event.ird = cep->ord; event.ord = cep->ird; } /* Signal private data and address information */ if (reason == IW_CM_EVENT_CONNECT_REQUEST || reason == IW_CM_EVENT_CONNECT_REPLY) { u16 pd_len = be16_to_cpu(cep->mpa.hdr.params.pd_len); if (pd_len) { /* * hand over MPA private data */ event.private_data_len = pd_len; event.private_data = cep->mpa.pdata; /* Hide MPA V2 IRD/ORD control */ if (cep->enhanced_rdma_conn_est) { event.private_data_len -= sizeof(struct mpa_v2_data); event.private_data += sizeof(struct mpa_v2_data); } } getname_local(cep->sock, &event.local_addr); getname_peer(cep->sock, &event.remote_addr); } siw_dbg_cep(cep, "[QP %u]: reason=%d, status=%d\n", cep->qp ? qp_id(cep->qp) : UINT_MAX, reason, status); return id->event_handler(id, &event); } static void siw_free_cm_id(struct siw_cep *cep) { if (!cep->cm_id) return; cep->cm_id->rem_ref(cep->cm_id); cep->cm_id = NULL; } static void siw_destroy_cep_sock(struct siw_cep *cep) { if (cep->sock) { siw_socket_disassoc(cep->sock); sock_release(cep->sock); cep->sock = NULL; } } /* * siw_qp_cm_drop() * * Drops established LLP connection if present and not already * scheduled for dropping. Called from user context, SQ workqueue * or receive IRQ. Caller signals if socket can be immediately * closed (basically, if not in IRQ). */ void siw_qp_cm_drop(struct siw_qp *qp, int schedule) { struct siw_cep *cep = qp->cep; qp->rx_stream.rx_suspend = 1; qp->tx_ctx.tx_suspend = 1; if (!qp->cep) return; if (schedule) { siw_cm_queue_work(cep, SIW_CM_WORK_CLOSE_LLP); } else { siw_cep_set_inuse(cep); if (cep->state == SIW_EPSTATE_CLOSED) { siw_dbg_cep(cep, "already closed\n"); goto out; } siw_dbg_cep(cep, "immediate close, state %d\n", cep->state); siw_send_terminate(qp); if (cep->cm_id) { switch (cep->state) { case SIW_EPSTATE_AWAIT_MPAREP: siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, -EINVAL); break; case SIW_EPSTATE_RDMA_MODE: siw_cm_upcall(cep, IW_CM_EVENT_CLOSE, 0); break; case SIW_EPSTATE_IDLE: case SIW_EPSTATE_LISTENING: case SIW_EPSTATE_CONNECTING: case SIW_EPSTATE_AWAIT_MPAREQ: case SIW_EPSTATE_RECVD_MPAREQ: case SIW_EPSTATE_CLOSED: default: break; } siw_free_cm_id(cep); siw_cep_put(cep); } cep->state = SIW_EPSTATE_CLOSED; siw_destroy_cep_sock(cep); if (cep->qp) { cep->qp = NULL; siw_qp_put(qp); } out: siw_cep_set_free(cep); } } void siw_cep_put(struct siw_cep *cep) { WARN_ON(kref_read(&cep->ref) < 1); kref_put(&cep->ref, __siw_cep_dealloc); } static void siw_cep_set_free_and_put(struct siw_cep *cep) { siw_cep_set_free(cep); siw_cep_put(cep); } void siw_cep_get(struct siw_cep *cep) { kref_get(&cep->ref); } /* * Expects params->pd_len in host byte order */ static int siw_send_mpareqrep(struct siw_cep *cep, const void *pdata, u8 pd_len) { struct socket *s = cep->sock; struct mpa_rr *rr = &cep->mpa.hdr; struct kvec iov[3]; struct msghdr msg; int rv; int iovec_num = 0; int mpa_len; memset(&msg, 0, sizeof(msg)); iov[iovec_num].iov_base = rr; iov[iovec_num].iov_len = sizeof(*rr); mpa_len = sizeof(*rr); if (cep->enhanced_rdma_conn_est) { iovec_num++; iov[iovec_num].iov_base = &cep->mpa.v2_ctrl; iov[iovec_num].iov_len = sizeof(cep->mpa.v2_ctrl); mpa_len += sizeof(cep->mpa.v2_ctrl); } if (pd_len) { iovec_num++; iov[iovec_num].iov_base = (char *)pdata; iov[iovec_num].iov_len = pd_len; mpa_len += pd_len; } if (cep->enhanced_rdma_conn_est) pd_len += sizeof(cep->mpa.v2_ctrl); rr->params.pd_len = cpu_to_be16(pd_len); rv = kernel_sendmsg(s, &msg, iov, iovec_num + 1, mpa_len); return rv < 0 ? rv : 0; } /* * Receive MPA Request/Reply header. * * Returns 0 if complete MPA Request/Reply header including * eventual private data was received. Returns -EAGAIN if * header was partially received or negative error code otherwise. * * Context: May be called in process context only */ static int siw_recv_mpa_rr(struct siw_cep *cep) { struct mpa_rr *hdr = &cep->mpa.hdr; struct socket *s = cep->sock; u16 pd_len; int rcvd, to_rcv; if (cep->mpa.bytes_rcvd < sizeof(struct mpa_rr)) { rcvd = ksock_recv(s, (char *)hdr + cep->mpa.bytes_rcvd, sizeof(struct mpa_rr) - cep->mpa.bytes_rcvd, 0); if (rcvd <= 0) return -ECONNABORTED; cep->mpa.bytes_rcvd += rcvd; if (cep->mpa.bytes_rcvd < sizeof(struct mpa_rr)) return -EAGAIN; if (be16_to_cpu(hdr->params.pd_len) > MPA_MAX_PRIVDATA) return -EPROTO; } pd_len = be16_to_cpu(hdr->params.pd_len); /* * At least the MPA Request/Reply header (frame not including * private data) has been received. * Receive (or continue receiving) any private data. */ to_rcv = pd_len - (cep->mpa.bytes_rcvd - sizeof(struct mpa_rr)); if (!to_rcv) { /* * We must have hdr->params.pd_len == 0 and thus received a * complete MPA Request/Reply frame. * Check against peer protocol violation. */ u32 word; rcvd = ksock_recv(s, (char *)&word, sizeof(word), MSG_DONTWAIT); if (rcvd == -EAGAIN) return 0; if (rcvd == 0) { siw_dbg_cep(cep, "peer EOF\n"); return -EPIPE; } if (rcvd < 0) { siw_dbg_cep(cep, "error: %d\n", rcvd); return rcvd; } siw_dbg_cep(cep, "peer sent extra data: %d\n", rcvd); return -EPROTO; } /* * At this point, we must have hdr->params.pd_len != 0. * A private data buffer gets allocated if hdr->params.pd_len != 0. */ if (!cep->mpa.pdata) { cep->mpa.pdata = kmalloc(pd_len + 4, GFP_KERNEL); if (!cep->mpa.pdata) return -ENOMEM; } rcvd = ksock_recv( s, cep->mpa.pdata + cep->mpa.bytes_rcvd - sizeof(struct mpa_rr), to_rcv + 4, MSG_DONTWAIT); if (rcvd < 0) return rcvd; if (rcvd > to_rcv) return -EPROTO; cep->mpa.bytes_rcvd += rcvd; if (to_rcv == rcvd) { siw_dbg_cep(cep, "%d bytes private data received\n", pd_len); return 0; } return -EAGAIN; } /* * siw_proc_mpareq() * * Read MPA Request from socket and signal new connection to IWCM * if success. Caller must hold lock on corresponding listening CEP. */ static int siw_proc_mpareq(struct siw_cep *cep) { struct mpa_rr *req; int version, rv; u16 pd_len; rv = siw_recv_mpa_rr(cep); if (rv) return rv; req = &cep->mpa.hdr; version = __mpa_rr_revision(req->params.bits); pd_len = be16_to_cpu(req->params.pd_len); if (version > MPA_REVISION_2) /* allow for 0, 1, and 2 only */ return -EPROTO; if (memcmp(req->key, MPA_KEY_REQ, 16)) return -EPROTO; /* Prepare for sending MPA reply */ memcpy(req->key, MPA_KEY_REP, 16); if (version == MPA_REVISION_2 && (req->params.bits & MPA_RR_FLAG_ENHANCED)) { /* * MPA version 2 must signal IRD/ORD values and P2P mode * in private data if header flag MPA_RR_FLAG_ENHANCED * is set. */ if (pd_len < sizeof(struct mpa_v2_data)) goto reject_conn; cep->enhanced_rdma_conn_est = true; } /* MPA Markers: currently not supported. Marker TX to be added. */ if (req->params.bits & MPA_RR_FLAG_MARKERS) goto reject_conn; if (req->params.bits & MPA_RR_FLAG_CRC) { /* * RFC 5044, page 27: CRC MUST be used if peer requests it. * siw specific: 'mpa_crc_strict' parameter to reject * connection with CRC if local CRC off enforced by * 'mpa_crc_strict' module parameter. */ if (!mpa_crc_required && mpa_crc_strict) goto reject_conn; /* Enable CRC if requested by module parameter */ if (mpa_crc_required) req->params.bits |= MPA_RR_FLAG_CRC; } if (cep->enhanced_rdma_conn_est) { struct mpa_v2_data *v2 = (struct mpa_v2_data *)cep->mpa.pdata; /* * Peer requested ORD becomes requested local IRD, * peer requested IRD becomes requested local ORD. * IRD and ORD get limited by global maximum values. */ cep->ord = ntohs(v2->ird) & MPA_IRD_ORD_MASK; cep->ord = min(cep->ord, SIW_MAX_ORD_QP); cep->ird = ntohs(v2->ord) & MPA_IRD_ORD_MASK; cep->ird = min(cep->ird, SIW_MAX_IRD_QP); /* May get overwritten by locally negotiated values */ cep->mpa.v2_ctrl.ird = htons(cep->ird); cep->mpa.v2_ctrl.ord = htons(cep->ord); /* * Support for peer sent zero length Write or Read to * let local side enter RTS. Writes are preferred. * Sends would require pre-posting a Receive and are * not supported. * Propose zero length Write if none of Read and Write * is indicated. */ if (v2->ird & MPA_V2_PEER_TO_PEER) { cep->mpa.v2_ctrl.ird |= MPA_V2_PEER_TO_PEER; if (v2->ord & MPA_V2_RDMA_WRITE_RTR) cep->mpa.v2_ctrl.ord |= MPA_V2_RDMA_WRITE_RTR; else if (v2->ord & MPA_V2_RDMA_READ_RTR) cep->mpa.v2_ctrl.ord |= MPA_V2_RDMA_READ_RTR; else cep->mpa.v2_ctrl.ord |= MPA_V2_RDMA_WRITE_RTR; } } cep->state = SIW_EPSTATE_RECVD_MPAREQ; /* Keep reference until IWCM accepts/rejects */ siw_cep_get(cep); rv = siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REQUEST, 0); if (rv) siw_cep_put(cep); return rv; reject_conn: siw_dbg_cep(cep, "reject: crc %d:%d:%d, m %d:%d\n", req->params.bits & MPA_RR_FLAG_CRC ? 1 : 0, mpa_crc_required, mpa_crc_strict, req->params.bits & MPA_RR_FLAG_MARKERS ? 1 : 0, 0); req->params.bits &= ~MPA_RR_FLAG_MARKERS; req->params.bits |= MPA_RR_FLAG_REJECT; if (!mpa_crc_required && mpa_crc_strict) req->params.bits &= ~MPA_RR_FLAG_CRC; if (pd_len) kfree(cep->mpa.pdata); cep->mpa.pdata = NULL; siw_send_mpareqrep(cep, NULL, 0); return -EOPNOTSUPP; } static int siw_proc_mpareply(struct siw_cep *cep) { struct siw_qp_attrs qp_attrs; enum siw_qp_attr_mask qp_attr_mask; struct siw_qp *qp = cep->qp; struct mpa_rr *rep; int rv; u16 rep_ord; u16 rep_ird; bool ird_insufficient = false; enum mpa_v2_ctrl mpa_p2p_mode = MPA_V2_RDMA_NO_RTR; rv = siw_recv_mpa_rr(cep); if (rv) goto out_err; siw_cancel_mpatimer(cep); rep = &cep->mpa.hdr; if (__mpa_rr_revision(rep->params.bits) > MPA_REVISION_2) { /* allow for 0, 1, and 2 only */ rv = -EPROTO; goto out_err; } if (memcmp(rep->key, MPA_KEY_REP, 16)) { siw_init_terminate(qp, TERM_ERROR_LAYER_LLP, LLP_ETYPE_MPA, LLP_ECODE_INVALID_REQ_RESP, 0); siw_send_terminate(qp); rv = -EPROTO; goto out_err; } if (rep->params.bits & MPA_RR_FLAG_REJECT) { siw_dbg_cep(cep, "got mpa reject\n"); siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, -ECONNRESET); return -ECONNRESET; } if (try_gso && rep->params.bits & MPA_RR_FLAG_GSO_EXP) { siw_dbg_cep(cep, "peer allows GSO on TX\n"); qp->tx_ctx.gso_seg_limit = 0; } if ((rep->params.bits & MPA_RR_FLAG_MARKERS) || (mpa_crc_required && !(rep->params.bits & MPA_RR_FLAG_CRC)) || (mpa_crc_strict && !mpa_crc_required && (rep->params.bits & MPA_RR_FLAG_CRC))) { siw_dbg_cep(cep, "reply unsupp: crc %d:%d:%d, m %d:%d\n", rep->params.bits & MPA_RR_FLAG_CRC ? 1 : 0, mpa_crc_required, mpa_crc_strict, rep->params.bits & MPA_RR_FLAG_MARKERS ? 1 : 0, 0); siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, -ECONNREFUSED); return -EINVAL; } if (cep->enhanced_rdma_conn_est) { struct mpa_v2_data *v2; if (__mpa_rr_revision(rep->params.bits) < MPA_REVISION_2 || !(rep->params.bits & MPA_RR_FLAG_ENHANCED)) { /* * Protocol failure: The responder MUST reply with * MPA version 2 and MUST set MPA_RR_FLAG_ENHANCED. */ siw_dbg_cep(cep, "mpa reply error: vers %d, enhcd %d\n", __mpa_rr_revision(rep->params.bits), rep->params.bits & MPA_RR_FLAG_ENHANCED ? 1 : 0); siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, -ECONNRESET); return -EINVAL; } v2 = (struct mpa_v2_data *)cep->mpa.pdata; rep_ird = ntohs(v2->ird) & MPA_IRD_ORD_MASK; rep_ord = ntohs(v2->ord) & MPA_IRD_ORD_MASK; if (cep->ird < rep_ord && (relaxed_ird_negotiation == false || rep_ord > cep->sdev->attrs.max_ird)) { siw_dbg_cep(cep, "ird %d, rep_ord %d, max_ord %d\n", cep->ird, rep_ord, cep->sdev->attrs.max_ord); ird_insufficient = true; } if (cep->ord > rep_ird && relaxed_ird_negotiation == false) { siw_dbg_cep(cep, "ord %d, rep_ird %d\n", cep->ord, rep_ird); ird_insufficient = true; } /* * Always report negotiated peer values to user, * even if IRD/ORD negotiation failed */ cep->ird = rep_ord; cep->ord = rep_ird; if (ird_insufficient) { /* * If the initiator IRD is insuffient for the * responder ORD, send a TERM. */ siw_init_terminate(qp, TERM_ERROR_LAYER_LLP, LLP_ETYPE_MPA, LLP_ECODE_INSUFFICIENT_IRD, 0); siw_send_terminate(qp); rv = -ENOMEM; goto out_err; } if (cep->mpa.v2_ctrl_req.ird & MPA_V2_PEER_TO_PEER) mpa_p2p_mode = cep->mpa.v2_ctrl_req.ord & (MPA_V2_RDMA_WRITE_RTR | MPA_V2_RDMA_READ_RTR); /* * Check if we requested P2P mode, and if peer agrees */ if (mpa_p2p_mode != MPA_V2_RDMA_NO_RTR) { if ((mpa_p2p_mode & v2->ord) == 0) { /* * We requested RTR mode(s), but the peer * did not pick any mode we support. */ siw_dbg_cep(cep, "rtr mode: req %2x, got %2x\n", mpa_p2p_mode, v2->ord & (MPA_V2_RDMA_WRITE_RTR | MPA_V2_RDMA_READ_RTR)); siw_init_terminate(qp, TERM_ERROR_LAYER_LLP, LLP_ETYPE_MPA, LLP_ECODE_NO_MATCHING_RTR, 0); siw_send_terminate(qp); rv = -EPROTO; goto out_err; } mpa_p2p_mode = v2->ord & (MPA_V2_RDMA_WRITE_RTR | MPA_V2_RDMA_READ_RTR); } } memset(&qp_attrs, 0, sizeof(qp_attrs)); if (rep->params.bits & MPA_RR_FLAG_CRC) qp_attrs.flags = SIW_MPA_CRC; qp_attrs.irq_size = cep->ird; qp_attrs.orq_size = cep->ord; qp_attrs.sk = cep->sock; qp_attrs.state = SIW_QP_STATE_RTS; qp_attr_mask = SIW_QP_ATTR_STATE | SIW_QP_ATTR_LLP_HANDLE | SIW_QP_ATTR_ORD | SIW_QP_ATTR_IRD | SIW_QP_ATTR_MPA; /* Move socket RX/TX under QP control */ down_write(&qp->state_lock); if (qp->attrs.state > SIW_QP_STATE_RTR) { rv = -EINVAL; up_write(&qp->state_lock); goto out_err; } rv = siw_qp_modify(qp, &qp_attrs, qp_attr_mask); siw_qp_socket_assoc(cep, qp); up_write(&qp->state_lock); /* Send extra RDMA frame to trigger peer RTS if negotiated */ if (mpa_p2p_mode != MPA_V2_RDMA_NO_RTR) { rv = siw_qp_mpa_rts(qp, mpa_p2p_mode); if (rv) goto out_err; } if (!rv) { rv = siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, 0); if (!rv) cep->state = SIW_EPSTATE_RDMA_MODE; return 0; } out_err: if (rv != -EAGAIN) siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, -EINVAL); return rv; } /* * siw_accept_newconn - accept an incoming pending connection * */ static void siw_accept_newconn(struct siw_cep *cep) { struct socket *s = cep->sock; struct socket *new_s = NULL; struct siw_cep *new_cep = NULL; int rv = 0; /* debug only. should disappear */ if (cep->state != SIW_EPSTATE_LISTENING) goto error; new_cep = siw_cep_alloc(cep->sdev); if (!new_cep) goto error; /* * 4: Allocate a sufficient number of work elements * to allow concurrent handling of local + peer close * events, MPA header processing + MPA timeout. */ if (siw_cm_alloc_work(new_cep, 4) != 0) goto error; /* * Copy saved socket callbacks from listening CEP * and assign new socket with new CEP */ new_cep->sk_state_change = cep->sk_state_change; new_cep->sk_data_ready = cep->sk_data_ready; new_cep->sk_write_space = cep->sk_write_space; new_cep->sk_error_report = cep->sk_error_report; rv = kernel_accept(s, &new_s, O_NONBLOCK); if (rv != 0) { /* * Connection already aborted by peer..? */ siw_dbg_cep(cep, "kernel_accept() error: %d\n", rv); goto error; } new_cep->sock = new_s; siw_cep_get(new_cep); new_s->sk->sk_user_data = new_cep; if (siw_tcp_nagle == false) tcp_sock_set_nodelay(new_s->sk); new_cep->state = SIW_EPSTATE_AWAIT_MPAREQ; rv = siw_cm_queue_work(new_cep, SIW_CM_WORK_MPATIMEOUT); if (rv) goto error; /* * See siw_proc_mpareq() etc. for the use of new_cep->listen_cep. */ new_cep->listen_cep = cep; siw_cep_get(cep); if (atomic_read(&new_s->sk->sk_rmem_alloc)) { /* * MPA REQ already queued */ siw_dbg_cep(cep, "immediate mpa request\n"); siw_cep_set_inuse(new_cep); rv = siw_proc_mpareq(new_cep); if (rv != -EAGAIN) { siw_cep_put(cep); new_cep->listen_cep = NULL; if (rv) { siw_cancel_mpatimer(new_cep); siw_cep_set_free(new_cep); goto error; } } siw_cep_set_free(new_cep); } return; error: if (new_cep) siw_cep_put(new_cep); if (new_s) { siw_socket_disassoc(new_s); sock_release(new_s); new_cep->sock = NULL; } siw_dbg_cep(cep, "error %d\n", rv); } static void siw_cm_work_handler(struct work_struct *w) { struct siw_cm_work *work; struct siw_cep *cep; int release_cep = 0, rv = 0; work = container_of(w, struct siw_cm_work, work.work); cep = work->cep; siw_dbg_cep(cep, "[QP %u]: work type: %d, state %d\n", cep->qp ? qp_id(cep->qp) : UINT_MAX, work->type, cep->state); siw_cep_set_inuse(cep); switch (work->type) { case SIW_CM_WORK_ACCEPT: siw_accept_newconn(cep); break; case SIW_CM_WORK_READ_MPAHDR: if (cep->state == SIW_EPSTATE_AWAIT_MPAREQ) { if (cep->listen_cep) { siw_cep_set_inuse(cep->listen_cep); if (cep->listen_cep->state == SIW_EPSTATE_LISTENING) rv = siw_proc_mpareq(cep); else rv = -EFAULT; siw_cep_set_free(cep->listen_cep); if (rv != -EAGAIN) { siw_cep_put(cep->listen_cep); cep->listen_cep = NULL; if (rv) siw_cep_put(cep); } } } else if (cep->state == SIW_EPSTATE_AWAIT_MPAREP) { rv = siw_proc_mpareply(cep); } else { /* * CEP already moved out of MPA handshake. * any connection management already done. * silently ignore the mpa packet. */ if (cep->state == SIW_EPSTATE_RDMA_MODE) { cep->sock->sk->sk_data_ready(cep->sock->sk); siw_dbg_cep(cep, "already in RDMA mode"); } else { siw_dbg_cep(cep, "out of state: %d\n", cep->state); } } if (rv && rv != -EAGAIN) release_cep = 1; break; case SIW_CM_WORK_CLOSE_LLP: /* * QP scheduled LLP close */ if (cep->qp) siw_send_terminate(cep->qp); if (cep->cm_id) siw_cm_upcall(cep, IW_CM_EVENT_CLOSE, 0); release_cep = 1; break; case SIW_CM_WORK_PEER_CLOSE: if (cep->cm_id) { if (cep->state == SIW_EPSTATE_AWAIT_MPAREP) { /* * MPA reply not received, but connection drop */ siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, -ECONNRESET); } else if (cep->state == SIW_EPSTATE_RDMA_MODE) { /* * NOTE: IW_CM_EVENT_DISCONNECT is given just * to transition IWCM into CLOSING. */ siw_cm_upcall(cep, IW_CM_EVENT_DISCONNECT, 0); siw_cm_upcall(cep, IW_CM_EVENT_CLOSE, 0); } /* * for other states there is no connection * known to the IWCM. */ } else { if (cep->state == SIW_EPSTATE_RECVD_MPAREQ) { /* * Wait for the ulp/CM to call accept/reject */ siw_dbg_cep(cep, "mpa req recvd, wait for ULP\n"); } else if (cep->state == SIW_EPSTATE_AWAIT_MPAREQ) { /* * Socket close before MPA request received. */ if (cep->listen_cep) { siw_dbg_cep(cep, "no mpareq: drop listener\n"); siw_cep_put(cep->listen_cep); cep->listen_cep = NULL; } } } release_cep = 1; break; case SIW_CM_WORK_MPATIMEOUT: cep->mpa_timer = NULL; if (cep->state == SIW_EPSTATE_AWAIT_MPAREP) { /* * MPA request timed out: * Hide any partially received private data and signal * timeout */ cep->mpa.hdr.params.pd_len = 0; if (cep->cm_id) siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, -ETIMEDOUT); release_cep = 1; } else if (cep->state == SIW_EPSTATE_AWAIT_MPAREQ) { /* * No MPA request received after peer TCP stream setup. */ if (cep->listen_cep) { siw_cep_put(cep->listen_cep); cep->listen_cep = NULL; } release_cep = 1; } break; default: WARN(1, "Undefined CM work type: %d\n", work->type); } if (release_cep) { siw_dbg_cep(cep, "release: timer=%s, QP[%u]\n", cep->mpa_timer ? "y" : "n", cep->qp ? qp_id(cep->qp) : UINT_MAX); siw_cancel_mpatimer(cep); cep->state = SIW_EPSTATE_CLOSED; if (cep->qp) { struct siw_qp *qp = cep->qp; /* * Serialize a potential race with application * closing the QP and calling siw_qp_cm_drop() */ siw_qp_get(qp); siw_cep_set_free(cep); siw_qp_llp_close(qp); siw_qp_put(qp); siw_cep_set_inuse(cep); cep->qp = NULL; siw_qp_put(qp); } if (cep->sock) { siw_socket_disassoc(cep->sock); sock_release(cep->sock); cep->sock = NULL; } if (cep->cm_id) { siw_free_cm_id(cep); siw_cep_put(cep); } } siw_cep_set_free(cep); siw_put_work(work); siw_cep_put(cep); } static struct workqueue_struct *siw_cm_wq; int siw_cm_queue_work(struct siw_cep *cep, enum siw_work_type type) { struct siw_cm_work *work = siw_get_work(cep); unsigned long delay = 0; if (!work) { siw_dbg_cep(cep, "failed with no work available\n"); return -ENOMEM; } work->type = type; work->cep = cep; siw_cep_get(cep); INIT_DELAYED_WORK(&work->work, siw_cm_work_handler); if (type == SIW_CM_WORK_MPATIMEOUT) { cep->mpa_timer = work; if (cep->state == SIW_EPSTATE_AWAIT_MPAREP) delay = MPAREQ_TIMEOUT; else delay = MPAREP_TIMEOUT; } siw_dbg_cep(cep, "[QP %u]: work type: %d, timeout %lu\n", cep->qp ? qp_id(cep->qp) : -1, type, delay); queue_delayed_work(siw_cm_wq, &work->work, delay); return 0; } static void siw_cm_llp_data_ready(struct sock *sk) { struct siw_cep *cep; trace_sk_data_ready(sk); read_lock(&sk->sk_callback_lock); cep = sk_to_cep(sk); if (!cep) goto out; siw_dbg_cep(cep, "cep state: %d, socket state %d\n", cep->state, sk->sk_state); if (sk->sk_state != TCP_ESTABLISHED) goto out; switch (cep->state) { case SIW_EPSTATE_RDMA_MODE: case SIW_EPSTATE_LISTENING: break; case SIW_EPSTATE_AWAIT_MPAREQ: case SIW_EPSTATE_AWAIT_MPAREP: siw_cm_queue_work(cep, SIW_CM_WORK_READ_MPAHDR); break; default: siw_dbg_cep(cep, "unexpected data, state %d\n", cep->state); break; } out: read_unlock(&sk->sk_callback_lock); } static void siw_cm_llp_write_space(struct sock *sk) { struct siw_cep *cep = sk_to_cep(sk); if (cep) siw_dbg_cep(cep, "state: %d\n", cep->state); } static void siw_cm_llp_error_report(struct sock *sk) { struct siw_cep *cep = sk_to_cep(sk); if (cep) { siw_dbg_cep(cep, "error %d, socket state: %d, cep state: %d\n", sk->sk_err, sk->sk_state, cep->state); cep->sk_error_report(sk); } } static void siw_cm_llp_state_change(struct sock *sk) { struct siw_cep *cep; void (*orig_state_change)(struct sock *s); read_lock(&sk->sk_callback_lock); cep = sk_to_cep(sk); if (!cep) { /* endpoint already disassociated */ read_unlock(&sk->sk_callback_lock); return; } orig_state_change = cep->sk_state_change; siw_dbg_cep(cep, "state: %d\n", cep->state); switch (sk->sk_state) { case TCP_ESTABLISHED: /* * handle accepting socket as special case where only * new connection is possible */ siw_cm_queue_work(cep, SIW_CM_WORK_ACCEPT); break; case TCP_CLOSE: case TCP_CLOSE_WAIT: if (cep->qp) cep->qp->tx_ctx.tx_suspend = 1; siw_cm_queue_work(cep, SIW_CM_WORK_PEER_CLOSE); break; default: siw_dbg_cep(cep, "unexpected socket state %d\n", sk->sk_state); } read_unlock(&sk->sk_callback_lock); orig_state_change(sk); } static int kernel_bindconnect(struct socket *s, struct sockaddr *laddr, struct sockaddr *raddr, bool afonly) { int rv, flags = 0; size_t size = laddr->sa_family == AF_INET ? sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6); /* * Make address available again asap. */ sock_set_reuseaddr(s->sk); if (afonly) { rv = ip6_sock_set_v6only(s->sk); if (rv) return rv; } rv = s->ops->bind(s, laddr, size); if (rv < 0) return rv; rv = s->ops->connect(s, raddr, size, flags); return rv < 0 ? rv : 0; } int siw_connect(struct iw_cm_id *id, struct iw_cm_conn_param *params) { struct siw_device *sdev = to_siw_dev(id->device); struct siw_qp *qp; struct siw_cep *cep = NULL; struct socket *s = NULL; struct sockaddr *laddr = (struct sockaddr *)&id->local_addr, *raddr = (struct sockaddr *)&id->remote_addr; bool p2p_mode = peer_to_peer, v4 = true; u16 pd_len = params->private_data_len; int version = mpa_version, rv; if (pd_len > MPA_MAX_PRIVDATA) return -EINVAL; if (params->ird > sdev->attrs.max_ird || params->ord > sdev->attrs.max_ord) return -ENOMEM; if (laddr->sa_family == AF_INET6) v4 = false; else if (laddr->sa_family != AF_INET) return -EAFNOSUPPORT; /* * Respect any iwarp port mapping: Use mapped remote address * if valid. Local address must not be mapped, since siw * uses kernel TCP stack. */ if ((v4 && to_sockaddr_in(id->remote_addr).sin_port != 0) || to_sockaddr_in6(id->remote_addr).sin6_port != 0) raddr = (struct sockaddr *)&id->m_remote_addr; qp = siw_qp_id2obj(sdev, params->qpn); if (!qp) { WARN(1, "[QP %u] does not exist\n", params->qpn); rv = -EINVAL; goto error; } siw_dbg_qp(qp, "pd_len %d, laddr %pISp, raddr %pISp\n", pd_len, laddr, raddr); rv = sock_create(v4 ? AF_INET : AF_INET6, SOCK_STREAM, IPPROTO_TCP, &s); if (rv < 0) goto error; /* * NOTE: For simplification, connect() is called in blocking * mode. Might be reconsidered for async connection setup at * TCP level. */ rv = kernel_bindconnect(s, laddr, raddr, id->afonly); if (rv != 0) { siw_dbg_qp(qp, "kernel_bindconnect: error %d\n", rv); goto error; } if (siw_tcp_nagle == false) tcp_sock_set_nodelay(s->sk); cep = siw_cep_alloc(sdev); if (!cep) { rv = -ENOMEM; goto error; } siw_cep_set_inuse(cep); /* Associate QP with CEP */ siw_cep_get(cep); qp->cep = cep; /* siw_qp_get(qp) already done by QP lookup */ cep->qp = qp; id->add_ref(id); cep->cm_id = id; /* * 4: Allocate a sufficient number of work elements * to allow concurrent handling of local + peer close * events, MPA header processing + MPA timeout. */ rv = siw_cm_alloc_work(cep, 4); if (rv != 0) { rv = -ENOMEM; goto error; } cep->ird = params->ird; cep->ord = params->ord; if (p2p_mode && cep->ord == 0) cep->ord = 1; cep->state = SIW_EPSTATE_CONNECTING; /* * Associate CEP with socket */ siw_cep_socket_assoc(cep, s); cep->state = SIW_EPSTATE_AWAIT_MPAREP; /* * Set MPA Request bits: CRC if required, no MPA Markers, * MPA Rev. according to module parameter 'mpa_version', Key 'Request'. */ cep->mpa.hdr.params.bits = 0; if (version > MPA_REVISION_2) { pr_warn("Setting MPA version to %u\n", MPA_REVISION_2); version = MPA_REVISION_2; /* Adjust also module parameter */ mpa_version = MPA_REVISION_2; } __mpa_rr_set_revision(&cep->mpa.hdr.params.bits, version); if (try_gso) cep->mpa.hdr.params.bits |= MPA_RR_FLAG_GSO_EXP; if (mpa_crc_required) cep->mpa.hdr.params.bits |= MPA_RR_FLAG_CRC; /* * If MPA version == 2: * o Include ORD and IRD. * o Indicate peer-to-peer mode, if required by module * parameter 'peer_to_peer'. */ if (version == MPA_REVISION_2) { cep->enhanced_rdma_conn_est = true; cep->mpa.hdr.params.bits |= MPA_RR_FLAG_ENHANCED; cep->mpa.v2_ctrl.ird = htons(cep->ird); cep->mpa.v2_ctrl.ord = htons(cep->ord); if (p2p_mode) { cep->mpa.v2_ctrl.ird |= MPA_V2_PEER_TO_PEER; cep->mpa.v2_ctrl.ord |= rtr_type; } /* Remember own P2P mode requested */ cep->mpa.v2_ctrl_req.ird = cep->mpa.v2_ctrl.ird; cep->mpa.v2_ctrl_req.ord = cep->mpa.v2_ctrl.ord; } memcpy(cep->mpa.hdr.key, MPA_KEY_REQ, 16); rv = siw_send_mpareqrep(cep, params->private_data, pd_len); /* * Reset private data. */ cep->mpa.hdr.params.pd_len = 0; if (rv >= 0) { rv = siw_cm_queue_work(cep, SIW_CM_WORK_MPATIMEOUT); if (!rv) { siw_dbg_cep(cep, "[QP %u]: exit\n", qp_id(qp)); siw_cep_set_free(cep); return 0; } } error: siw_dbg(id->device, "failed: %d\n", rv); if (cep) { siw_socket_disassoc(s); sock_release(s); cep->sock = NULL; cep->qp = NULL; cep->cm_id = NULL; id->rem_ref(id); qp->cep = NULL; siw_cep_put(cep); cep->state = SIW_EPSTATE_CLOSED; siw_cep_set_free_and_put(cep); } else if (s) { sock_release(s); } if (qp) siw_qp_put(qp); return rv; } /* * siw_accept - Let SoftiWARP accept an RDMA connection request * * @id: New connection management id to be used for accepted * connection request * @params: Connection parameters provided by ULP for accepting connection * * Transition QP to RTS state, associate new CM id @id with accepted CEP * and get prepared for TCP input by installing socket callbacks. * Then send MPA Reply and generate the "connection established" event. * Socket callbacks must be installed before sending MPA Reply, because * the latter may cause a first RDMA message to arrive from the RDMA Initiator * side very quickly, at which time the socket callbacks must be ready. */ int siw_accept(struct iw_cm_id *id, struct iw_cm_conn_param *params) { struct siw_device *sdev = to_siw_dev(id->device); struct siw_cep *cep = (struct siw_cep *)id->provider_data; struct siw_qp *qp; struct siw_qp_attrs qp_attrs; int rv = -EINVAL, max_priv_data = MPA_MAX_PRIVDATA; bool wait_for_peer_rts = false; siw_cep_set_inuse(cep); siw_cep_put(cep); /* Free lingering inbound private data */ if (cep->mpa.hdr.params.pd_len) { cep->mpa.hdr.params.pd_len = 0; kfree(cep->mpa.pdata); cep->mpa.pdata = NULL; } siw_cancel_mpatimer(cep); if (cep->state != SIW_EPSTATE_RECVD_MPAREQ) { siw_dbg_cep(cep, "out of state\n"); rv = -ECONNRESET; goto free_cep; } qp = siw_qp_id2obj(sdev, params->qpn); if (!qp) { WARN(1, "[QP %d] does not exist\n", params->qpn); goto free_cep; } down_write(&qp->state_lock); if (qp->attrs.state > SIW_QP_STATE_RTR) goto error_unlock; siw_dbg_cep(cep, "[QP %d]\n", params->qpn); if (try_gso && cep->mpa.hdr.params.bits & MPA_RR_FLAG_GSO_EXP) { siw_dbg_cep(cep, "peer allows GSO on TX\n"); qp->tx_ctx.gso_seg_limit = 0; } if (params->ord > sdev->attrs.max_ord || params->ird > sdev->attrs.max_ird) { siw_dbg_cep( cep, "[QP %u]: ord %d (max %d), ird %d (max %d)\n", qp_id(qp), params->ord, sdev->attrs.max_ord, params->ird, sdev->attrs.max_ird); goto error_unlock; } if (cep->enhanced_rdma_conn_est) max_priv_data -= sizeof(struct mpa_v2_data); if (params->private_data_len > max_priv_data) { siw_dbg_cep( cep, "[QP %u]: private data length: %d (max %d)\n", qp_id(qp), params->private_data_len, max_priv_data); goto error_unlock; } if (cep->enhanced_rdma_conn_est) { if (params->ord > cep->ord) { if (relaxed_ird_negotiation) { params->ord = cep->ord; } else { cep->ird = params->ird; cep->ord = params->ord; goto error_unlock; } } if (params->ird < cep->ird) { if (relaxed_ird_negotiation && cep->ird <= sdev->attrs.max_ird) params->ird = cep->ird; else { rv = -ENOMEM; goto error_unlock; } } if (cep->mpa.v2_ctrl.ord & (MPA_V2_RDMA_WRITE_RTR | MPA_V2_RDMA_READ_RTR)) wait_for_peer_rts = true; /* * Signal back negotiated IRD and ORD values */ cep->mpa.v2_ctrl.ord = htons(params->ord & MPA_IRD_ORD_MASK) | (cep->mpa.v2_ctrl.ord & ~MPA_V2_MASK_IRD_ORD); cep->mpa.v2_ctrl.ird = htons(params->ird & MPA_IRD_ORD_MASK) | (cep->mpa.v2_ctrl.ird & ~MPA_V2_MASK_IRD_ORD); } cep->ird = params->ird; cep->ord = params->ord; cep->cm_id = id; id->add_ref(id); memset(&qp_attrs, 0, sizeof(qp_attrs)); qp_attrs.orq_size = cep->ord; qp_attrs.irq_size = cep->ird; qp_attrs.sk = cep->sock; if (cep->mpa.hdr.params.bits & MPA_RR_FLAG_CRC) qp_attrs.flags = SIW_MPA_CRC; qp_attrs.state = SIW_QP_STATE_RTS; siw_dbg_cep(cep, "[QP%u]: moving to rts\n", qp_id(qp)); /* Associate QP with CEP */ siw_cep_get(cep); qp->cep = cep; /* siw_qp_get(qp) already done by QP lookup */ cep->qp = qp; cep->state = SIW_EPSTATE_RDMA_MODE; /* Move socket RX/TX under QP control */ rv = siw_qp_modify(qp, &qp_attrs, SIW_QP_ATTR_STATE | SIW_QP_ATTR_LLP_HANDLE | SIW_QP_ATTR_ORD | SIW_QP_ATTR_IRD | SIW_QP_ATTR_MPA); up_write(&qp->state_lock); if (rv) goto error; siw_dbg_cep(cep, "[QP %u]: send mpa reply, %d byte pdata\n", qp_id(qp), params->private_data_len); rv = siw_send_mpareqrep(cep, params->private_data, params->private_data_len); if (rv != 0) goto error; if (wait_for_peer_rts) { siw_sk_assign_rtr_upcalls(cep); } else { siw_qp_socket_assoc(cep, qp); rv = siw_cm_upcall(cep, IW_CM_EVENT_ESTABLISHED, 0); if (rv) goto error; } siw_cep_set_free(cep); return 0; error_unlock: up_write(&qp->state_lock); error: siw_destroy_cep_sock(cep); cep->state = SIW_EPSTATE_CLOSED; siw_free_cm_id(cep); if (qp->cep) { siw_cep_put(cep); qp->cep = NULL; } cep->qp = NULL; siw_qp_put(qp); free_cep: siw_cep_set_free_and_put(cep); return rv; } /* * siw_reject() * * Local connection reject case. Send private data back to peer, * close connection and dereference connection id. */ int siw_reject(struct iw_cm_id *id, const void *pdata, u8 pd_len) { struct siw_cep *cep = (struct siw_cep *)id->provider_data; siw_cep_set_inuse(cep); siw_cep_put(cep); siw_cancel_mpatimer(cep); if (cep->state != SIW_EPSTATE_RECVD_MPAREQ) { siw_dbg_cep(cep, "out of state\n"); siw_cep_set_free_and_put(cep); /* put last reference */ return -ECONNRESET; } siw_dbg_cep(cep, "cep->state %d, pd_len %d\n", cep->state, pd_len); if (__mpa_rr_revision(cep->mpa.hdr.params.bits) >= MPA_REVISION_1) { cep->mpa.hdr.params.bits |= MPA_RR_FLAG_REJECT; /* reject */ siw_send_mpareqrep(cep, pdata, pd_len); } siw_destroy_cep_sock(cep); cep->state = SIW_EPSTATE_CLOSED; siw_cep_set_free_and_put(cep); return 0; } /* * siw_create_listen - Create resources for a listener's IWCM ID @id * * Starts listen on the socket address id->local_addr. * */ int siw_create_listen(struct iw_cm_id *id, int backlog) { struct socket *s; struct siw_cep *cep = NULL; struct net_device *ndev = NULL; struct siw_device *sdev = to_siw_dev(id->device); int addr_family = id->local_addr.ss_family; int rv = 0; if (addr_family != AF_INET && addr_family != AF_INET6) return -EAFNOSUPPORT; rv = sock_create(addr_family, SOCK_STREAM, IPPROTO_TCP, &s); if (rv < 0) return rv; /* * Allow binding local port when still in TIME_WAIT from last close. */ sock_set_reuseaddr(s->sk); if (addr_family == AF_INET) { struct sockaddr_in *laddr = &to_sockaddr_in(id->local_addr); /* For wildcard addr, limit binding to current device only */ if (ipv4_is_zeronet(laddr->sin_addr.s_addr)) { ndev = ib_device_get_netdev(id->device, SIW_PORT); if (ndev) { s->sk->sk_bound_dev_if = ndev->ifindex; } else { rv = -ENODEV; goto error; } } rv = s->ops->bind(s, (struct sockaddr *)laddr, sizeof(struct sockaddr_in)); } else { struct sockaddr_in6 *laddr = &to_sockaddr_in6(id->local_addr); if (id->afonly) { rv = ip6_sock_set_v6only(s->sk); if (rv) { siw_dbg(id->device, "ip6_sock_set_v6only erro: %d\n", rv); goto error; } } /* For wildcard addr, limit binding to current device only */ if (ipv6_addr_any(&laddr->sin6_addr)) { ndev = ib_device_get_netdev(id->device, SIW_PORT); if (ndev) { s->sk->sk_bound_dev_if = ndev->ifindex; } else { rv = -ENODEV; goto error; } } rv = s->ops->bind(s, (struct sockaddr *)laddr, sizeof(struct sockaddr_in6)); } if (rv) { siw_dbg(id->device, "socket bind error: %d\n", rv); goto error; } cep = siw_cep_alloc(sdev); if (!cep) { rv = -ENOMEM; goto error; } siw_cep_socket_assoc(cep, s); rv = siw_cm_alloc_work(cep, backlog); if (rv) { siw_dbg(id->device, "alloc_work error %d, backlog %d\n", rv, backlog); goto error; } rv = s->ops->listen(s, backlog); if (rv) { siw_dbg(id->device, "listen error %d\n", rv); goto error; } cep->cm_id = id; id->add_ref(id); /* * In case of a wildcard rdma_listen on a multi-homed device, * a listener's IWCM id is associated with more than one listening CEP. * * We currently use id->provider_data in three different ways: * * o For a listener's IWCM id, id->provider_data points to * the list_head of the list of listening CEPs. * Uses: siw_create_listen(), siw_destroy_listen() * * o For each accepted passive-side IWCM id, id->provider_data * points to the CEP itself. This is a consequence of * - siw_cm_upcall() setting event.provider_data = cep and * - the IWCM's cm_conn_req_handler() setting provider_data of the * new passive-side IWCM id equal to event.provider_data * Uses: siw_accept(), siw_reject() * * o For an active-side IWCM id, id->provider_data is not used at all. * */ if (!id->provider_data) { id->provider_data = kmalloc(sizeof(struct list_head), GFP_KERNEL); if (!id->provider_data) { rv = -ENOMEM; goto error; } INIT_LIST_HEAD((struct list_head *)id->provider_data); } list_add_tail(&cep->listenq, (struct list_head *)id->provider_data); cep->state = SIW_EPSTATE_LISTENING; dev_put(ndev); siw_dbg(id->device, "Listen at laddr %pISp\n", &id->local_addr); return 0; error: siw_dbg(id->device, "failed: %d\n", rv); if (cep) { siw_cep_set_inuse(cep); siw_free_cm_id(cep); cep->sock = NULL; siw_socket_disassoc(s); cep->state = SIW_EPSTATE_CLOSED; siw_cep_set_free_and_put(cep); } sock_release(s); dev_put(ndev); return rv; } static void siw_drop_listeners(struct iw_cm_id *id) { struct list_head *p, *tmp; /* * In case of a wildcard rdma_listen on a multi-homed device, * a listener's IWCM id is associated with more than one listening CEP. */ list_for_each_safe(p, tmp, (struct list_head *)id->provider_data) { struct siw_cep *cep = list_entry(p, struct siw_cep, listenq); list_del(p); siw_dbg_cep(cep, "drop cep, state %d\n", cep->state); siw_cep_set_inuse(cep); siw_free_cm_id(cep); if (cep->sock) { siw_socket_disassoc(cep->sock); sock_release(cep->sock); cep->sock = NULL; } cep->state = SIW_EPSTATE_CLOSED; siw_cep_set_free_and_put(cep); } } int siw_destroy_listen(struct iw_cm_id *id) { if (!id->provider_data) { siw_dbg(id->device, "no cep(s)\n"); return 0; } siw_drop_listeners(id); kfree(id->provider_data); id->provider_data = NULL; return 0; } int siw_cm_init(void) { /* * create_single_workqueue for strict ordering */ siw_cm_wq = create_singlethread_workqueue("siw_cm_wq"); if (!siw_cm_wq) return -ENOMEM; return 0; } void siw_cm_exit(void) { if (siw_cm_wq) destroy_workqueue(siw_cm_wq); }
7 8 1 5 1 1 6 6 4 3 21 21 21 62 1 12 37 17 2 16 7 3 4 3 65 2 1 39 23 4 2 2 1 1 2 2 3 1 10 4 6 7 8 1 7 3 13 1 2 4 4 6 10 4 8 2 8 6 4 5 5 4 3 2 1 1 17 17 2 1 3 10 10 1 9 9 9 2 7 6 1 9 17 1 1 15 14 632 631 2 1 1 117 2 2 17 2 11 5 4 1 2 1 8 2 21 4 1 4 65 5 5 56 76 1 8 3 1 12 10 6 1 26 1 23 2 34 1 5 5 1 7 6 1 3 1 59 1 2 17 1 17 633 2 2 632 629 3 86 1 1 1 1 659 1 90 5 635 623 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 // SPDX-License-Identifier: GPL-2.0 /* * Code related to the io_uring_register() syscall * * Copyright (C) 2023 Jens Axboe */ #include <linux/kernel.h> #include <linux/errno.h> #include <linux/syscalls.h> #include <linux/refcount.h> #include <linux/bits.h> #include <linux/fs.h> #include <linux/file.h> #include <linux/slab.h> #include <linux/uaccess.h> #include <linux/nospec.h> #include <linux/compat.h> #include <linux/io_uring.h> #include <linux/io_uring_types.h> #include "io_uring.h" #include "opdef.h" #include "tctx.h" #include "rsrc.h" #include "sqpoll.h" #include "register.h" #include "cancel.h" #include "kbuf.h" #include "napi.h" #include "eventfd.h" #include "msg_ring.h" #include "memmap.h" #include "zcrx.h" #define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \ IORING_REGISTER_LAST + IORING_OP_LAST) static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args) { struct io_uring_probe *p; size_t size; int i, ret; if (nr_args > IORING_OP_LAST) nr_args = IORING_OP_LAST; size = struct_size(p, ops, nr_args); p = kzalloc(size, GFP_KERNEL); if (!p) return -ENOMEM; ret = -EFAULT; if (copy_from_user(p, arg, size)) goto out; ret = -EINVAL; if (memchr_inv(p, 0, size)) goto out; p->last_op = IORING_OP_LAST - 1; for (i = 0; i < nr_args; i++) { p->ops[i].op = i; if (io_uring_op_supported(i)) p->ops[i].flags = IO_URING_OP_SUPPORTED; } p->ops_len = i; ret = 0; if (copy_to_user(arg, p, size)) ret = -EFAULT; out: kfree(p); return ret; } int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id) { const struct cred *creds; creds = xa_erase(&ctx->personalities, id); if (creds) { put_cred(creds); return 0; } return -EINVAL; } static int io_register_personality(struct io_ring_ctx *ctx) { const struct cred *creds; u32 id; int ret; creds = get_current_cred(); ret = xa_alloc_cyclic(&ctx->personalities, &id, (void *)creds, XA_LIMIT(0, USHRT_MAX), &ctx->pers_next, GFP_KERNEL); if (ret < 0) { put_cred(creds); return ret; } return id; } static __cold int io_parse_restrictions(void __user *arg, unsigned int nr_args, struct io_restriction *restrictions) { struct io_uring_restriction *res; size_t size; int i, ret; if (!arg || nr_args > IORING_MAX_RESTRICTIONS) return -EINVAL; size = array_size(nr_args, sizeof(*res)); if (size == SIZE_MAX) return -EOVERFLOW; res = memdup_user(arg, size); if (IS_ERR(res)) return PTR_ERR(res); ret = -EINVAL; for (i = 0; i < nr_args; i++) { switch (res[i].opcode) { case IORING_RESTRICTION_REGISTER_OP: if (res[i].register_op >= IORING_REGISTER_LAST) goto err; __set_bit(res[i].register_op, restrictions->register_op); break; case IORING_RESTRICTION_SQE_OP: if (res[i].sqe_op >= IORING_OP_LAST) goto err; __set_bit(res[i].sqe_op, restrictions->sqe_op); break; case IORING_RESTRICTION_SQE_FLAGS_ALLOWED: restrictions->sqe_flags_allowed = res[i].sqe_flags; break; case IORING_RESTRICTION_SQE_FLAGS_REQUIRED: restrictions->sqe_flags_required = res[i].sqe_flags; break; default: goto err; } } ret = 0; err: kfree(res); return ret; } static __cold int io_register_restrictions(struct io_ring_ctx *ctx, void __user *arg, unsigned int nr_args) { int ret; /* Restrictions allowed only if rings started disabled */ if (!(ctx->flags & IORING_SETUP_R_DISABLED)) return -EBADFD; /* We allow only a single restrictions registration */ if (ctx->restrictions.registered) return -EBUSY; ret = io_parse_restrictions(arg, nr_args, &ctx->restrictions); /* Reset all restrictions if an error happened */ if (ret != 0) memset(&ctx->restrictions, 0, sizeof(ctx->restrictions)); else ctx->restrictions.registered = true; return ret; } static int io_register_enable_rings(struct io_ring_ctx *ctx) { if (!(ctx->flags & IORING_SETUP_R_DISABLED)) return -EBADFD; if (ctx->flags & IORING_SETUP_SINGLE_ISSUER && !ctx->submitter_task) { WRITE_ONCE(ctx->submitter_task, get_task_struct(current)); /* * Lazy activation attempts would fail if it was polled before * submitter_task is set. */ if (wq_has_sleeper(&ctx->poll_wq)) io_activate_pollwq(ctx); } if (ctx->restrictions.registered) ctx->restricted = 1; ctx->flags &= ~IORING_SETUP_R_DISABLED; if (ctx->sq_data && wq_has_sleeper(&ctx->sq_data->wait)) wake_up(&ctx->sq_data->wait); return 0; } static __cold int __io_register_iowq_aff(struct io_ring_ctx *ctx, cpumask_var_t new_mask) { int ret; if (!(ctx->flags & IORING_SETUP_SQPOLL)) { ret = io_wq_cpu_affinity(current->io_uring, new_mask); } else { mutex_unlock(&ctx->uring_lock); ret = io_sqpoll_wq_cpu_affinity(ctx, new_mask); mutex_lock(&ctx->uring_lock); } return ret; } static __cold int io_register_iowq_aff(struct io_ring_ctx *ctx, void __user *arg, unsigned len) { cpumask_var_t new_mask; int ret; if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) return -ENOMEM; cpumask_clear(new_mask); if (len > cpumask_size()) len = cpumask_size(); #ifdef CONFIG_COMPAT if (in_compat_syscall()) ret = compat_get_bitmap(cpumask_bits(new_mask), (const compat_ulong_t __user *)arg, len * 8 /* CHAR_BIT */); else #endif ret = copy_from_user(new_mask, arg, len); if (ret) { free_cpumask_var(new_mask); return -EFAULT; } ret = __io_register_iowq_aff(ctx, new_mask); free_cpumask_var(new_mask); return ret; } static __cold int io_unregister_iowq_aff(struct io_ring_ctx *ctx) { return __io_register_iowq_aff(ctx, NULL); } static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx, void __user *arg) __must_hold(&ctx->uring_lock) { struct io_tctx_node *node; struct io_uring_task *tctx = NULL; struct io_sq_data *sqd = NULL; __u32 new_count[2]; int i, ret; if (copy_from_user(new_count, arg, sizeof(new_count))) return -EFAULT; for (i = 0; i < ARRAY_SIZE(new_count); i++) if (new_count[i] > INT_MAX) return -EINVAL; if (ctx->flags & IORING_SETUP_SQPOLL) { sqd = ctx->sq_data; if (sqd) { /* * Observe the correct sqd->lock -> ctx->uring_lock * ordering. Fine to drop uring_lock here, we hold * a ref to the ctx. */ refcount_inc(&sqd->refs); mutex_unlock(&ctx->uring_lock); mutex_lock(&sqd->lock); mutex_lock(&ctx->uring_lock); if (sqd->thread) tctx = sqd->thread->io_uring; } } else { tctx = current->io_uring; } BUILD_BUG_ON(sizeof(new_count) != sizeof(ctx->iowq_limits)); for (i = 0; i < ARRAY_SIZE(new_count); i++) if (new_count[i]) ctx->iowq_limits[i] = new_count[i]; ctx->iowq_limits_set = true; if (tctx && tctx->io_wq) { ret = io_wq_max_workers(tctx->io_wq, new_count); if (ret) goto err; } else { memset(new_count, 0, sizeof(new_count)); } if (sqd) { mutex_unlock(&ctx->uring_lock); mutex_unlock(&sqd->lock); io_put_sq_data(sqd); mutex_lock(&ctx->uring_lock); } if (copy_to_user(arg, new_count, sizeof(new_count))) return -EFAULT; /* that's it for SQPOLL, only the SQPOLL task creates requests */ if (sqd) return 0; /* now propagate the restriction to all registered users */ list_for_each_entry(node, &ctx->tctx_list, ctx_node) { tctx = node->task->io_uring; if (WARN_ON_ONCE(!tctx->io_wq)) continue; for (i = 0; i < ARRAY_SIZE(new_count); i++) new_count[i] = ctx->iowq_limits[i]; /* ignore errors, it always returns zero anyway */ (void)io_wq_max_workers(tctx->io_wq, new_count); } return 0; err: if (sqd) { mutex_unlock(&ctx->uring_lock); mutex_unlock(&sqd->lock); io_put_sq_data(sqd); mutex_lock(&ctx->uring_lock); } return ret; } static int io_register_clock(struct io_ring_ctx *ctx, struct io_uring_clock_register __user *arg) { struct io_uring_clock_register reg; if (copy_from_user(&reg, arg, sizeof(reg))) return -EFAULT; if (memchr_inv(&reg.__resv, 0, sizeof(reg.__resv))) return -EINVAL; switch (reg.clockid) { case CLOCK_MONOTONIC: ctx->clock_offset = 0; break; case CLOCK_BOOTTIME: ctx->clock_offset = TK_OFFS_BOOT; break; default: return -EINVAL; } ctx->clockid = reg.clockid; return 0; } /* * State to maintain until we can swap. Both new and old state, used for * either mapping or freeing. */ struct io_ring_ctx_rings { struct io_rings *rings; struct io_uring_sqe *sq_sqes; struct io_mapped_region sq_region; struct io_mapped_region ring_region; }; static void io_register_free_rings(struct io_ring_ctx *ctx, struct io_uring_params *p, struct io_ring_ctx_rings *r) { io_free_region(ctx, &r->sq_region); io_free_region(ctx, &r->ring_region); } #define swap_old(ctx, o, n, field) \ do { \ (o).field = (ctx)->field; \ (ctx)->field = (n).field; \ } while (0) #define RESIZE_FLAGS (IORING_SETUP_CQSIZE | IORING_SETUP_CLAMP) #define COPY_FLAGS (IORING_SETUP_NO_SQARRAY | IORING_SETUP_SQE128 | \ IORING_SETUP_CQE32 | IORING_SETUP_NO_MMAP) static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg) { struct io_uring_region_desc rd; struct io_ring_ctx_rings o = { }, n = { }, *to_free = NULL; size_t size, sq_array_offset; unsigned i, tail, old_head; struct io_uring_params p; int ret; /* for single issuer, must be owner resizing */ if (ctx->flags & IORING_SETUP_SINGLE_ISSUER && current != ctx->submitter_task) return -EEXIST; /* limited to DEFER_TASKRUN for now */ if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN)) return -EINVAL; if (copy_from_user(&p, arg, sizeof(p))) return -EFAULT; if (p.flags & ~RESIZE_FLAGS) return -EINVAL; /* properties that are always inherited */ p.flags |= (ctx->flags & COPY_FLAGS); ret = io_uring_fill_params(p.sq_entries, &p); if (unlikely(ret)) return ret; /* nothing to do, but copy params back */ if (p.sq_entries == ctx->sq_entries && p.cq_entries == ctx->cq_entries) { if (copy_to_user(arg, &p, sizeof(p))) return -EFAULT; return 0; } size = rings_size(p.flags, p.sq_entries, p.cq_entries, &sq_array_offset); if (size == SIZE_MAX) return -EOVERFLOW; memset(&rd, 0, sizeof(rd)); rd.size = PAGE_ALIGN(size); if (p.flags & IORING_SETUP_NO_MMAP) { rd.user_addr = p.cq_off.user_addr; rd.flags |= IORING_MEM_REGION_TYPE_USER; } ret = io_create_region_mmap_safe(ctx, &n.ring_region, &rd, IORING_OFF_CQ_RING); if (ret) { io_register_free_rings(ctx, &p, &n); return ret; } n.rings = io_region_get_ptr(&n.ring_region); /* * At this point n.rings is shared with userspace, just like o.rings * is as well. While we don't expect userspace to modify it while * a resize is in progress, and it's most likely that userspace will * shoot itself in the foot if it does, we can't always assume good * intent... Use read/write once helpers from here on to indicate the * shared nature of it. */ WRITE_ONCE(n.rings->sq_ring_mask, p.sq_entries - 1); WRITE_ONCE(n.rings->cq_ring_mask, p.cq_entries - 1); WRITE_ONCE(n.rings->sq_ring_entries, p.sq_entries); WRITE_ONCE(n.rings->cq_ring_entries, p.cq_entries); if (copy_to_user(arg, &p, sizeof(p))) { io_register_free_rings(ctx, &p, &n); return -EFAULT; } if (p.flags & IORING_SETUP_SQE128) size = array_size(2 * sizeof(struct io_uring_sqe), p.sq_entries); else size = array_size(sizeof(struct io_uring_sqe), p.sq_entries); if (size == SIZE_MAX) { io_register_free_rings(ctx, &p, &n); return -EOVERFLOW; } memset(&rd, 0, sizeof(rd)); rd.size = PAGE_ALIGN(size); if (p.flags & IORING_SETUP_NO_MMAP) { rd.user_addr = p.sq_off.user_addr; rd.flags |= IORING_MEM_REGION_TYPE_USER; } ret = io_create_region_mmap_safe(ctx, &n.sq_region, &rd, IORING_OFF_SQES); if (ret) { io_register_free_rings(ctx, &p, &n); return ret; } n.sq_sqes = io_region_get_ptr(&n.sq_region); /* * If using SQPOLL, park the thread */ if (ctx->sq_data) { mutex_unlock(&ctx->uring_lock); io_sq_thread_park(ctx->sq_data); mutex_lock(&ctx->uring_lock); } /* * We'll do the swap. Grab the ctx->mmap_lock, which will exclude * any new mmap's on the ring fd. Clear out existing mappings to prevent * mmap from seeing them, as we'll unmap them. Any attempt to mmap * existing rings beyond this point will fail. Not that it could proceed * at this point anyway, as the io_uring mmap side needs go grab the * ctx->mmap_lock as well. Likewise, hold the completion lock over the * duration of the actual swap. */ mutex_lock(&ctx->mmap_lock); spin_lock(&ctx->completion_lock); o.rings = ctx->rings; ctx->rings = NULL; o.sq_sqes = ctx->sq_sqes; ctx->sq_sqes = NULL; /* * Now copy SQ and CQ entries, if any. If either of the destination * rings can't hold what is already there, then fail the operation. */ tail = READ_ONCE(o.rings->sq.tail); old_head = READ_ONCE(o.rings->sq.head); if (tail - old_head > p.sq_entries) goto overflow; for (i = old_head; i < tail; i++) { unsigned src_head = i & (ctx->sq_entries - 1); unsigned dst_head = i & (p.sq_entries - 1); n.sq_sqes[dst_head] = o.sq_sqes[src_head]; } WRITE_ONCE(n.rings->sq.head, old_head); WRITE_ONCE(n.rings->sq.tail, tail); tail = READ_ONCE(o.rings->cq.tail); old_head = READ_ONCE(o.rings->cq.head); if (tail - old_head > p.cq_entries) { overflow: /* restore old rings, and return -EOVERFLOW via cleanup path */ ctx->rings = o.rings; ctx->sq_sqes = o.sq_sqes; to_free = &n; ret = -EOVERFLOW; goto out; } for (i = old_head; i < tail; i++) { unsigned src_head = i & (ctx->cq_entries - 1); unsigned dst_head = i & (p.cq_entries - 1); n.rings->cqes[dst_head] = o.rings->cqes[src_head]; } WRITE_ONCE(n.rings->cq.head, old_head); WRITE_ONCE(n.rings->cq.tail, tail); /* invalidate cached cqe refill */ ctx->cqe_cached = ctx->cqe_sentinel = NULL; WRITE_ONCE(n.rings->sq_dropped, READ_ONCE(o.rings->sq_dropped)); atomic_set(&n.rings->sq_flags, atomic_read(&o.rings->sq_flags)); WRITE_ONCE(n.rings->cq_flags, READ_ONCE(o.rings->cq_flags)); WRITE_ONCE(n.rings->cq_overflow, READ_ONCE(o.rings->cq_overflow)); /* all done, store old pointers and assign new ones */ if (!(ctx->flags & IORING_SETUP_NO_SQARRAY)) ctx->sq_array = (u32 *)((char *)n.rings + sq_array_offset); ctx->sq_entries = p.sq_entries; ctx->cq_entries = p.cq_entries; ctx->rings = n.rings; ctx->sq_sqes = n.sq_sqes; swap_old(ctx, o, n, ring_region); swap_old(ctx, o, n, sq_region); to_free = &o; ret = 0; out: spin_unlock(&ctx->completion_lock); mutex_unlock(&ctx->mmap_lock); io_register_free_rings(ctx, &p, to_free); if (ctx->sq_data) io_sq_thread_unpark(ctx->sq_data); return ret; } static int io_register_mem_region(struct io_ring_ctx *ctx, void __user *uarg) { struct io_uring_mem_region_reg __user *reg_uptr = uarg; struct io_uring_mem_region_reg reg; struct io_uring_region_desc __user *rd_uptr; struct io_uring_region_desc rd; int ret; if (io_region_is_set(&ctx->param_region)) return -EBUSY; if (copy_from_user(&reg, reg_uptr, sizeof(reg))) return -EFAULT; rd_uptr = u64_to_user_ptr(reg.region_uptr); if (copy_from_user(&rd, rd_uptr, sizeof(rd))) return -EFAULT; if (memchr_inv(&reg.__resv, 0, sizeof(reg.__resv))) return -EINVAL; if (reg.flags & ~IORING_MEM_REGION_REG_WAIT_ARG) return -EINVAL; /* * This ensures there are no waiters. Waiters are unlocked and it's * hard to synchronise with them, especially if we need to initialise * the region. */ if ((reg.flags & IORING_MEM_REGION_REG_WAIT_ARG) && !(ctx->flags & IORING_SETUP_R_DISABLED)) return -EINVAL; ret = io_create_region_mmap_safe(ctx, &ctx->param_region, &rd, IORING_MAP_OFF_PARAM_REGION); if (ret) return ret; if (copy_to_user(rd_uptr, &rd, sizeof(rd))) { io_free_region(ctx, &ctx->param_region); return -EFAULT; } if (reg.flags & IORING_MEM_REGION_REG_WAIT_ARG) { ctx->cq_wait_arg = io_region_get_ptr(&ctx->param_region); ctx->cq_wait_size = rd.size; } return 0; } static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, void __user *arg, unsigned nr_args) __releases(ctx->uring_lock) __acquires(ctx->uring_lock) { int ret; /* * We don't quiesce the refs for register anymore and so it can't be * dying as we're holding a file ref here. */ if (WARN_ON_ONCE(percpu_ref_is_dying(&ctx->refs))) return -ENXIO; if (ctx->submitter_task && ctx->submitter_task != current) return -EEXIST; if (ctx->restricted) { opcode = array_index_nospec(opcode, IORING_REGISTER_LAST); if (!test_bit(opcode, ctx->restrictions.register_op)) return -EACCES; } switch (opcode) { case IORING_REGISTER_BUFFERS: ret = -EFAULT; if (!arg) break; ret = io_sqe_buffers_register(ctx, arg, nr_args, NULL); break; case IORING_UNREGISTER_BUFFERS: ret = -EINVAL; if (arg || nr_args) break; ret = io_sqe_buffers_unregister(ctx); break; case IORING_REGISTER_FILES: ret = -EFAULT; if (!arg) break; ret = io_sqe_files_register(ctx, arg, nr_args, NULL); break; case IORING_UNREGISTER_FILES: ret = -EINVAL; if (arg || nr_args) break; ret = io_sqe_files_unregister(ctx); break; case IORING_REGISTER_FILES_UPDATE: ret = io_register_files_update(ctx, arg, nr_args); break; case IORING_REGISTER_EVENTFD: ret = -EINVAL; if (nr_args != 1) break; ret = io_eventfd_register(ctx, arg, 0); break; case IORING_REGISTER_EVENTFD_ASYNC: ret = -EINVAL; if (nr_args != 1) break; ret = io_eventfd_register(ctx, arg, 1); break; case IORING_UNREGISTER_EVENTFD: ret = -EINVAL; if (arg || nr_args) break; ret = io_eventfd_unregister(ctx); break; case IORING_REGISTER_PROBE: ret = -EINVAL; if (!arg || nr_args > 256) break; ret = io_probe(ctx, arg, nr_args); break; case IORING_REGISTER_PERSONALITY: ret = -EINVAL; if (arg || nr_args) break; ret = io_register_personality(ctx); break; case IORING_UNREGISTER_PERSONALITY: ret = -EINVAL; if (arg) break; ret = io_unregister_personality(ctx, nr_args); break; case IORING_REGISTER_ENABLE_RINGS: ret = -EINVAL; if (arg || nr_args) break; ret = io_register_enable_rings(ctx); break; case IORING_REGISTER_RESTRICTIONS: ret = io_register_restrictions(ctx, arg, nr_args); break; case IORING_REGISTER_FILES2: ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_FILE); break; case IORING_REGISTER_FILES_UPDATE2: ret = io_register_rsrc_update(ctx, arg, nr_args, IORING_RSRC_FILE); break; case IORING_REGISTER_BUFFERS2: ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_BUFFER); break; case IORING_REGISTER_BUFFERS_UPDATE: ret = io_register_rsrc_update(ctx, arg, nr_args, IORING_RSRC_BUFFER); break; case IORING_REGISTER_IOWQ_AFF: ret = -EINVAL; if (!arg || !nr_args) break; ret = io_register_iowq_aff(ctx, arg, nr_args); break; case IORING_UNREGISTER_IOWQ_AFF: ret = -EINVAL; if (arg || nr_args) break; ret = io_unregister_iowq_aff(ctx); break; case IORING_REGISTER_IOWQ_MAX_WORKERS: ret = -EINVAL; if (!arg || nr_args != 2) break; ret = io_register_iowq_max_workers(ctx, arg); break; case IORING_REGISTER_RING_FDS: ret = io_ringfd_register(ctx, arg, nr_args); break; case IORING_UNREGISTER_RING_FDS: ret = io_ringfd_unregister(ctx, arg, nr_args); break; case IORING_REGISTER_PBUF_RING: ret = -EINVAL; if (!arg || nr_args != 1) break; ret = io_register_pbuf_ring(ctx, arg); break; case IORING_UNREGISTER_PBUF_RING: ret = -EINVAL; if (!arg || nr_args != 1) break; ret = io_unregister_pbuf_ring(ctx, arg); break; case IORING_REGISTER_SYNC_CANCEL: ret = -EINVAL; if (!arg || nr_args != 1) break; ret = io_sync_cancel(ctx, arg); break; case IORING_REGISTER_FILE_ALLOC_RANGE: ret = -EINVAL; if (!arg || nr_args) break; ret = io_register_file_alloc_range(ctx, arg); break; case IORING_REGISTER_PBUF_STATUS: ret = -EINVAL; if (!arg || nr_args != 1) break; ret = io_register_pbuf_status(ctx, arg); break; case IORING_REGISTER_NAPI: ret = -EINVAL; if (!arg || nr_args != 1) break; ret = io_register_napi(ctx, arg); break; case IORING_UNREGISTER_NAPI: ret = -EINVAL; if (nr_args != 1) break; ret = io_unregister_napi(ctx, arg); break; case IORING_REGISTER_CLOCK: ret = -EINVAL; if (!arg || nr_args) break; ret = io_register_clock(ctx, arg); break; case IORING_REGISTER_CLONE_BUFFERS: ret = -EINVAL; if (!arg || nr_args != 1) break; ret = io_register_clone_buffers(ctx, arg); break; case IORING_REGISTER_ZCRX_IFQ: ret = -EINVAL; if (!arg || nr_args != 1) break; ret = io_register_zcrx_ifq(ctx, arg); break; case IORING_REGISTER_RESIZE_RINGS: ret = -EINVAL; if (!arg || nr_args != 1) break; ret = io_register_resize_rings(ctx, arg); break; case IORING_REGISTER_MEM_REGION: ret = -EINVAL; if (!arg || nr_args != 1) break; ret = io_register_mem_region(ctx, arg); break; default: ret = -EINVAL; break; } return ret; } /* * Given an 'fd' value, return the ctx associated with if. If 'registered' is * true, then the registered index is used. Otherwise, the normal fd table. * Caller must call fput() on the returned file, unless it's an ERR_PTR. */ struct file *io_uring_register_get_file(unsigned int fd, bool registered) { struct file *file; if (registered) { /* * Ring fd has been registered via IORING_REGISTER_RING_FDS, we * need only dereference our task private array to find it. */ struct io_uring_task *tctx = current->io_uring; if (unlikely(!tctx || fd >= IO_RINGFD_REG_MAX)) return ERR_PTR(-EINVAL); fd = array_index_nospec(fd, IO_RINGFD_REG_MAX); file = tctx->registered_rings[fd]; if (file) get_file(file); } else { file = fget(fd); } if (unlikely(!file)) return ERR_PTR(-EBADF); if (io_is_uring_fops(file)) return file; fput(file); return ERR_PTR(-EOPNOTSUPP); } /* * "blind" registration opcodes are ones where there's no ring given, and * hence the source fd must be -1. */ static int io_uring_register_blind(unsigned int opcode, void __user *arg, unsigned int nr_args) { switch (opcode) { case IORING_REGISTER_SEND_MSG_RING: { struct io_uring_sqe sqe; if (!arg || nr_args != 1) return -EINVAL; if (copy_from_user(&sqe, arg, sizeof(sqe))) return -EFAULT; /* no flags supported */ if (sqe.flags) return -EINVAL; if (sqe.opcode == IORING_OP_MSG_RING) return io_uring_sync_msg_ring(&sqe); } } return -EINVAL; } SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode, void __user *, arg, unsigned int, nr_args) { struct io_ring_ctx *ctx; long ret = -EBADF; struct file *file; bool use_registered_ring; use_registered_ring = !!(opcode & IORING_REGISTER_USE_REGISTERED_RING); opcode &= ~IORING_REGISTER_USE_REGISTERED_RING; if (opcode >= IORING_REGISTER_LAST) return -EINVAL; if (fd == -1) return io_uring_register_blind(opcode, arg, nr_args); file = io_uring_register_get_file(fd, use_registered_ring); if (IS_ERR(file)) return PTR_ERR(file); ctx = file->private_data; mutex_lock(&ctx->uring_lock); ret = __io_uring_register(ctx, opcode, arg, nr_args); trace_io_uring_register(ctx, opcode, ctx->file_table.data.nr, ctx->buf_table.nr, ret); mutex_unlock(&ctx->uring_lock); fput(file); return ret; }
189 188 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_IRQ_H #define _LINUX_IRQ_H /* * Please do not include this file in generic code. There is currently * no requirement for any architecture to implement anything held * within this file. * * Thanks. --rmk */ #include <linux/cache.h> #include <linux/spinlock.h> #include <linux/cpumask.h> #include <linux/irqhandler.h> #include <linux/irqreturn.h> #include <linux/irqnr.h> #include <linux/topology.h> #include <linux/io.h> #include <linux/slab.h> #include <asm/irq.h> #include <asm/ptrace.h> #include <asm/irq_regs.h> struct seq_file; struct module; struct msi_msg; struct irq_affinity_desc; enum irqchip_irq_state; /* * IRQ line status. * * Bits 0-7 are the same as the IRQF_* bits in linux/interrupt.h * * IRQ_TYPE_NONE - default, unspecified type * IRQ_TYPE_EDGE_RISING - rising edge triggered * IRQ_TYPE_EDGE_FALLING - falling edge triggered * IRQ_TYPE_EDGE_BOTH - rising and falling edge triggered * IRQ_TYPE_LEVEL_HIGH - high level triggered * IRQ_TYPE_LEVEL_LOW - low level triggered * IRQ_TYPE_LEVEL_MASK - Mask to filter out the level bits * IRQ_TYPE_SENSE_MASK - Mask for all the above bits * IRQ_TYPE_DEFAULT - For use by some PICs to ask irq_set_type * to setup the HW to a sane default (used * by irqdomain map() callbacks to synchronize * the HW state and SW flags for a newly * allocated descriptor). * * IRQ_TYPE_PROBE - Special flag for probing in progress * * Bits which can be modified via irq_set/clear/modify_status_flags() * IRQ_LEVEL - Interrupt is level type. Will be also * updated in the code when the above trigger * bits are modified via irq_set_irq_type() * IRQ_PER_CPU - Mark an interrupt PER_CPU. Will protect * it from affinity setting * IRQ_NOPROBE - Interrupt cannot be probed by autoprobing * IRQ_NOREQUEST - Interrupt cannot be requested via * request_irq() * IRQ_NOTHREAD - Interrupt cannot be threaded * IRQ_NOAUTOEN - Interrupt is not automatically enabled in * request/setup_irq() * IRQ_NO_BALANCING - Interrupt cannot be balanced (affinity set) * IRQ_NESTED_THREAD - Interrupt nests into another thread * IRQ_PER_CPU_DEVID - Dev_id is a per-cpu variable * IRQ_IS_POLLED - Always polled by another interrupt. Exclude * it from the spurious interrupt detection * mechanism and from core side polling. * IRQ_DISABLE_UNLAZY - Disable lazy irq disable * IRQ_HIDDEN - Don't show up in /proc/interrupts * IRQ_NO_DEBUG - Exclude from note_interrupt() debugging */ enum { IRQ_TYPE_NONE = 0x00000000, IRQ_TYPE_EDGE_RISING = 0x00000001, IRQ_TYPE_EDGE_FALLING = 0x00000002, IRQ_TYPE_EDGE_BOTH = (IRQ_TYPE_EDGE_FALLING | IRQ_TYPE_EDGE_RISING), IRQ_TYPE_LEVEL_HIGH = 0x00000004, IRQ_TYPE_LEVEL_LOW = 0x00000008, IRQ_TYPE_LEVEL_MASK = (IRQ_TYPE_LEVEL_LOW | IRQ_TYPE_LEVEL_HIGH), IRQ_TYPE_SENSE_MASK = 0x0000000f, IRQ_TYPE_DEFAULT = IRQ_TYPE_SENSE_MASK, IRQ_TYPE_PROBE = 0x00000010, IRQ_LEVEL = (1 << 8), IRQ_PER_CPU = (1 << 9), IRQ_NOPROBE = (1 << 10), IRQ_NOREQUEST = (1 << 11), IRQ_NOAUTOEN = (1 << 12), IRQ_NO_BALANCING = (1 << 13), IRQ_NESTED_THREAD = (1 << 15), IRQ_NOTHREAD = (1 << 16), IRQ_PER_CPU_DEVID = (1 << 17), IRQ_IS_POLLED = (1 << 18), IRQ_DISABLE_UNLAZY = (1 << 19), IRQ_HIDDEN = (1 << 20), IRQ_NO_DEBUG = (1 << 21), }; #define IRQF_MODIFY_MASK \ (IRQ_TYPE_SENSE_MASK | IRQ_NOPROBE | IRQ_NOREQUEST | \ IRQ_NOAUTOEN | IRQ_LEVEL | IRQ_NO_BALANCING | \ IRQ_PER_CPU | IRQ_NESTED_THREAD | IRQ_NOTHREAD | IRQ_PER_CPU_DEVID | \ IRQ_IS_POLLED | IRQ_DISABLE_UNLAZY | IRQ_HIDDEN) #define IRQ_NO_BALANCING_MASK (IRQ_PER_CPU | IRQ_NO_BALANCING) /* * Return value for chip->irq_set_affinity() * * IRQ_SET_MASK_OK - OK, core updates irq_common_data.affinity * IRQ_SET_MASK_NOCOPY - OK, chip did update irq_common_data.affinity * IRQ_SET_MASK_OK_DONE - Same as IRQ_SET_MASK_OK for core. Special code to * support stacked irqchips, which indicates skipping * all descendant irqchips. */ enum { IRQ_SET_MASK_OK = 0, IRQ_SET_MASK_OK_NOCOPY, IRQ_SET_MASK_OK_DONE, }; struct msi_desc; struct irq_domain; /** * struct irq_common_data - per irq data shared by all irqchips * @state_use_accessors: status information for irq chip functions. * Use accessor functions to deal with it * @node: node index useful for balancing * @handler_data: per-IRQ data for the irq_chip methods * @affinity: IRQ affinity on SMP. If this is an IPI * related irq, then this is the mask of the * CPUs to which an IPI can be sent. * @effective_affinity: The effective IRQ affinity on SMP as some irq * chips do not allow multi CPU destinations. * A subset of @affinity. * @msi_desc: MSI descriptor * @ipi_offset: Offset of first IPI target cpu in @affinity. Optional. */ struct irq_common_data { unsigned int __private state_use_accessors; #ifdef CONFIG_NUMA unsigned int node; #endif void *handler_data; struct msi_desc *msi_desc; #ifdef CONFIG_SMP cpumask_var_t affinity; #endif #ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK cpumask_var_t effective_affinity; #endif #ifdef CONFIG_GENERIC_IRQ_IPI unsigned int ipi_offset; #endif }; /** * struct irq_data - per irq chip data passed down to chip functions * @mask: precomputed bitmask for accessing the chip registers * @irq: interrupt number * @hwirq: hardware interrupt number, local to the interrupt domain * @common: point to data shared by all irqchips * @chip: low level interrupt hardware access * @domain: Interrupt translation domain; responsible for mapping * between hwirq number and linux irq number. * @parent_data: pointer to parent struct irq_data to support hierarchy * irq_domain * @chip_data: platform-specific per-chip private data for the chip * methods, to allow shared chip implementations */ struct irq_data { u32 mask; unsigned int irq; irq_hw_number_t hwirq; struct irq_common_data *common; struct irq_chip *chip; struct irq_domain *domain; #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY struct irq_data *parent_data; #endif void *chip_data; }; /* * Bit masks for irq_common_data.state_use_accessors * * IRQD_TRIGGER_MASK - Mask for the trigger type bits * IRQD_SETAFFINITY_PENDING - Affinity setting is pending * IRQD_ACTIVATED - Interrupt has already been activated * IRQD_NO_BALANCING - Balancing disabled for this IRQ * IRQD_PER_CPU - Interrupt is per cpu * IRQD_AFFINITY_SET - Interrupt affinity was set * IRQD_LEVEL - Interrupt is level triggered * IRQD_WAKEUP_STATE - Interrupt is configured for wakeup * from suspend * IRQD_IRQ_DISABLED - Disabled state of the interrupt * IRQD_IRQ_MASKED - Masked state of the interrupt * IRQD_IRQ_INPROGRESS - In progress state of the interrupt * IRQD_WAKEUP_ARMED - Wakeup mode armed * IRQD_FORWARDED_TO_VCPU - The interrupt is forwarded to a VCPU * IRQD_AFFINITY_MANAGED - Affinity is auto-managed by the kernel * IRQD_IRQ_STARTED - Startup state of the interrupt * IRQD_MANAGED_SHUTDOWN - Interrupt was shutdown due to empty affinity * mask. Applies only to affinity managed irqs. * IRQD_SINGLE_TARGET - IRQ allows only a single affinity target * IRQD_DEFAULT_TRIGGER_SET - Expected trigger already been set * IRQD_CAN_RESERVE - Can use reservation mode * IRQD_HANDLE_ENFORCE_IRQCTX - Enforce that handle_irq_*() is only invoked * from actual interrupt context. * IRQD_AFFINITY_ON_ACTIVATE - Affinity is set on activation. Don't call * irq_chip::irq_set_affinity() when deactivated. * IRQD_IRQ_ENABLED_ON_SUSPEND - Interrupt is enabled on suspend by irq pm if * irqchip have flag IRQCHIP_ENABLE_WAKEUP_ON_SUSPEND set. * IRQD_RESEND_WHEN_IN_PROGRESS - Interrupt may fire when already in progress in which * case it must be resent at the next available opportunity. */ enum { IRQD_TRIGGER_MASK = 0xf, IRQD_SETAFFINITY_PENDING = BIT(8), IRQD_ACTIVATED = BIT(9), IRQD_NO_BALANCING = BIT(10), IRQD_PER_CPU = BIT(11), IRQD_AFFINITY_SET = BIT(12), IRQD_LEVEL = BIT(13), IRQD_WAKEUP_STATE = BIT(14), IRQD_IRQ_DISABLED = BIT(16), IRQD_IRQ_MASKED = BIT(17), IRQD_IRQ_INPROGRESS = BIT(18), IRQD_WAKEUP_ARMED = BIT(19), IRQD_FORWARDED_TO_VCPU = BIT(20), IRQD_AFFINITY_MANAGED = BIT(21), IRQD_IRQ_STARTED = BIT(22), IRQD_MANAGED_SHUTDOWN = BIT(23), IRQD_SINGLE_TARGET = BIT(24), IRQD_DEFAULT_TRIGGER_SET = BIT(25), IRQD_CAN_RESERVE = BIT(26), IRQD_HANDLE_ENFORCE_IRQCTX = BIT(27), IRQD_AFFINITY_ON_ACTIVATE = BIT(28), IRQD_IRQ_ENABLED_ON_SUSPEND = BIT(29), IRQD_RESEND_WHEN_IN_PROGRESS = BIT(30), }; #define __irqd_to_state(d) ACCESS_PRIVATE((d)->common, state_use_accessors) static inline bool irqd_is_setaffinity_pending(struct irq_data *d) { return __irqd_to_state(d) & IRQD_SETAFFINITY_PENDING; } static inline bool irqd_is_per_cpu(struct irq_data *d) { return __irqd_to_state(d) & IRQD_PER_CPU; } static inline bool irqd_can_balance(struct irq_data *d) { return !(__irqd_to_state(d) & (IRQD_PER_CPU | IRQD_NO_BALANCING)); } static inline bool irqd_affinity_was_set(struct irq_data *d) { return __irqd_to_state(d) & IRQD_AFFINITY_SET; } static inline void irqd_mark_affinity_was_set(struct irq_data *d) { __irqd_to_state(d) |= IRQD_AFFINITY_SET; } static inline bool irqd_trigger_type_was_set(struct irq_data *d) { return __irqd_to_state(d) & IRQD_DEFAULT_TRIGGER_SET; } static inline u32 irqd_get_trigger_type(struct irq_data *d) { return __irqd_to_state(d) & IRQD_TRIGGER_MASK; } /* * Must only be called inside irq_chip.irq_set_type() functions or * from the DT/ACPI setup code. */ static inline void irqd_set_trigger_type(struct irq_data *d, u32 type) { __irqd_to_state(d) &= ~IRQD_TRIGGER_MASK; __irqd_to_state(d) |= type & IRQD_TRIGGER_MASK; __irqd_to_state(d) |= IRQD_DEFAULT_TRIGGER_SET; } static inline bool irqd_is_level_type(struct irq_data *d) { return __irqd_to_state(d) & IRQD_LEVEL; } /* * Must only be called of irqchip.irq_set_affinity() or low level * hierarchy domain allocation functions. */ static inline void irqd_set_single_target(struct irq_data *d) { __irqd_to_state(d) |= IRQD_SINGLE_TARGET; } static inline bool irqd_is_single_target(struct irq_data *d) { return __irqd_to_state(d) & IRQD_SINGLE_TARGET; } static inline void irqd_set_handle_enforce_irqctx(struct irq_data *d) { __irqd_to_state(d) |= IRQD_HANDLE_ENFORCE_IRQCTX; } static inline bool irqd_is_handle_enforce_irqctx(struct irq_data *d) { return __irqd_to_state(d) & IRQD_HANDLE_ENFORCE_IRQCTX; } static inline bool irqd_is_enabled_on_suspend(struct irq_data *d) { return __irqd_to_state(d) & IRQD_IRQ_ENABLED_ON_SUSPEND; } static inline bool irqd_is_wakeup_set(struct irq_data *d) { return __irqd_to_state(d) & IRQD_WAKEUP_STATE; } static inline bool irqd_irq_disabled(struct irq_data *d) { return __irqd_to_state(d) & IRQD_IRQ_DISABLED; } static inline bool irqd_irq_masked(struct irq_data *d) { return __irqd_to_state(d) & IRQD_IRQ_MASKED; } static inline bool irqd_irq_inprogress(struct irq_data *d) { return __irqd_to_state(d) & IRQD_IRQ_INPROGRESS; } static inline bool irqd_is_wakeup_armed(struct irq_data *d) { return __irqd_to_state(d) & IRQD_WAKEUP_ARMED; } static inline bool irqd_is_forwarded_to_vcpu(struct irq_data *d) { return __irqd_to_state(d) & IRQD_FORWARDED_TO_VCPU; } static inline void irqd_set_forwarded_to_vcpu(struct irq_data *d) { __irqd_to_state(d) |= IRQD_FORWARDED_TO_VCPU; } static inline void irqd_clr_forwarded_to_vcpu(struct irq_data *d) { __irqd_to_state(d) &= ~IRQD_FORWARDED_TO_VCPU; } static inline bool irqd_affinity_is_managed(struct irq_data *d) { return __irqd_to_state(d) & IRQD_AFFINITY_MANAGED; } static inline bool irqd_is_activated(struct irq_data *d) { return __irqd_to_state(d) & IRQD_ACTIVATED; } static inline void irqd_set_activated(struct irq_data *d) { __irqd_to_state(d) |= IRQD_ACTIVATED; } static inline void irqd_clr_activated(struct irq_data *d) { __irqd_to_state(d) &= ~IRQD_ACTIVATED; } static inline bool irqd_is_started(struct irq_data *d) { return __irqd_to_state(d) & IRQD_IRQ_STARTED; } static inline bool irqd_is_managed_and_shutdown(struct irq_data *d) { return __irqd_to_state(d) & IRQD_MANAGED_SHUTDOWN; } static inline void irqd_set_can_reserve(struct irq_data *d) { __irqd_to_state(d) |= IRQD_CAN_RESERVE; } static inline void irqd_clr_can_reserve(struct irq_data *d) { __irqd_to_state(d) &= ~IRQD_CAN_RESERVE; } static inline bool irqd_can_reserve(struct irq_data *d) { return __irqd_to_state(d) & IRQD_CAN_RESERVE; } static inline void irqd_set_affinity_on_activate(struct irq_data *d) { __irqd_to_state(d) |= IRQD_AFFINITY_ON_ACTIVATE; } static inline bool irqd_affinity_on_activate(struct irq_data *d) { return __irqd_to_state(d) & IRQD_AFFINITY_ON_ACTIVATE; } static inline void irqd_set_resend_when_in_progress(struct irq_data *d) { __irqd_to_state(d) |= IRQD_RESEND_WHEN_IN_PROGRESS; } static inline bool irqd_needs_resend_when_in_progress(struct irq_data *d) { return __irqd_to_state(d) & IRQD_RESEND_WHEN_IN_PROGRESS; } #undef __irqd_to_state static inline irq_hw_number_t irqd_to_hwirq(struct irq_data *d) { return d->hwirq; } /** * struct irq_chip - hardware interrupt chip descriptor * * @name: name for /proc/interrupts * @irq_startup: start up the interrupt (defaults to ->enable if NULL) * @irq_shutdown: shut down the interrupt (defaults to ->disable if NULL) * @irq_enable: enable the interrupt (defaults to chip->unmask if NULL) * @irq_disable: disable the interrupt * @irq_ack: start of a new interrupt * @irq_mask: mask an interrupt source * @irq_mask_ack: ack and mask an interrupt source * @irq_unmask: unmask an interrupt source * @irq_eoi: end of interrupt * @irq_set_affinity: Set the CPU affinity on SMP machines. If the force * argument is true, it tells the driver to * unconditionally apply the affinity setting. Sanity * checks against the supplied affinity mask are not * required. This is used for CPU hotplug where the * target CPU is not yet set in the cpu_online_mask. * @irq_retrigger: resend an IRQ to the CPU * @irq_set_type: set the flow type (IRQ_TYPE_LEVEL/etc.) of an IRQ * @irq_set_wake: enable/disable power-management wake-on of an IRQ * @irq_bus_lock: function to lock access to slow bus (i2c) chips * @irq_bus_sync_unlock:function to sync and unlock slow bus (i2c) chips * @irq_cpu_online: configure an interrupt source for a secondary CPU * @irq_cpu_offline: un-configure an interrupt source for a secondary CPU * @irq_suspend: function called from core code on suspend once per * chip, when one or more interrupts are installed * @irq_resume: function called from core code on resume once per chip, * when one ore more interrupts are installed * @irq_pm_shutdown: function called from core code on shutdown once per chip * @irq_calc_mask: Optional function to set irq_data.mask for special cases * @irq_print_chip: optional to print special chip info in show_interrupts * @irq_request_resources: optional to request resources before calling * any other callback related to this irq * @irq_release_resources: optional to release resources acquired with * irq_request_resources * @irq_compose_msi_msg: optional to compose message content for MSI * @irq_write_msi_msg: optional to write message content for MSI * @irq_get_irqchip_state: return the internal state of an interrupt * @irq_set_irqchip_state: set the internal state of a interrupt * @irq_set_vcpu_affinity: optional to target a vCPU in a virtual machine * @ipi_send_single: send a single IPI to destination cpus * @ipi_send_mask: send an IPI to destination cpus in cpumask * @irq_nmi_setup: function called from core code before enabling an NMI * @irq_nmi_teardown: function called from core code after disabling an NMI * @irq_force_complete_move: optional function to force complete pending irq move * @flags: chip specific flags */ struct irq_chip { const char *name; unsigned int (*irq_startup)(struct irq_data *data); void (*irq_shutdown)(struct irq_data *data); void (*irq_enable)(struct irq_data *data); void (*irq_disable)(struct irq_data *data); void (*irq_ack)(struct irq_data *data); void (*irq_mask)(struct irq_data *data); void (*irq_mask_ack)(struct irq_data *data); void (*irq_unmask)(struct irq_data *data); void (*irq_eoi)(struct irq_data *data); int (*irq_set_affinity)(struct irq_data *data, const struct cpumask *dest, bool force); int (*irq_retrigger)(struct irq_data *data); int (*irq_set_type)(struct irq_data *data, unsigned int flow_type); int (*irq_set_wake)(struct irq_data *data, unsigned int on); void (*irq_bus_lock)(struct irq_data *data); void (*irq_bus_sync_unlock)(struct irq_data *data); #ifdef CONFIG_DEPRECATED_IRQ_CPU_ONOFFLINE void (*irq_cpu_online)(struct irq_data *data); void (*irq_cpu_offline)(struct irq_data *data); #endif void (*irq_suspend)(struct irq_data *data); void (*irq_resume)(struct irq_data *data); void (*irq_pm_shutdown)(struct irq_data *data); void (*irq_calc_mask)(struct irq_data *data); void (*irq_print_chip)(struct irq_data *data, struct seq_file *p); int (*irq_request_resources)(struct irq_data *data); void (*irq_release_resources)(struct irq_data *data); void (*irq_compose_msi_msg)(struct irq_data *data, struct msi_msg *msg); void (*irq_write_msi_msg)(struct irq_data *data, struct msi_msg *msg); int (*irq_get_irqchip_state)(struct irq_data *data, enum irqchip_irq_state which, bool *state); int (*irq_set_irqchip_state)(struct irq_data *data, enum irqchip_irq_state which, bool state); int (*irq_set_vcpu_affinity)(struct irq_data *data, void *vcpu_info); void (*ipi_send_single)(struct irq_data *data, unsigned int cpu); void (*ipi_send_mask)(struct irq_data *data, const struct cpumask *dest); int (*irq_nmi_setup)(struct irq_data *data); void (*irq_nmi_teardown)(struct irq_data *data); void (*irq_force_complete_move)(struct irq_data *data); unsigned long flags; }; /* * irq_chip specific flags * * IRQCHIP_SET_TYPE_MASKED: Mask before calling chip.irq_set_type() * IRQCHIP_EOI_IF_HANDLED: Only issue irq_eoi() when irq was handled * IRQCHIP_MASK_ON_SUSPEND: Mask non wake irqs in the suspend path * IRQCHIP_ONOFFLINE_ENABLED: Only call irq_on/off_line callbacks * when irq enabled * IRQCHIP_SKIP_SET_WAKE: Skip chip.irq_set_wake(), for this irq chip * IRQCHIP_ONESHOT_SAFE: One shot does not require mask/unmask * IRQCHIP_EOI_THREADED: Chip requires eoi() on unmask in threaded mode * IRQCHIP_SUPPORTS_LEVEL_MSI: Chip can provide two doorbells for Level MSIs * IRQCHIP_SUPPORTS_NMI: Chip can deliver NMIs, only for root irqchips * IRQCHIP_ENABLE_WAKEUP_ON_SUSPEND: Invokes __enable_irq()/__disable_irq() for wake irqs * in the suspend path if they are in disabled state * IRQCHIP_AFFINITY_PRE_STARTUP: Default affinity update before startup * IRQCHIP_IMMUTABLE: Don't ever change anything in this chip * IRQCHIP_MOVE_DEFERRED: Move the interrupt in actual interrupt context */ enum { IRQCHIP_SET_TYPE_MASKED = (1 << 0), IRQCHIP_EOI_IF_HANDLED = (1 << 1), IRQCHIP_MASK_ON_SUSPEND = (1 << 2), IRQCHIP_ONOFFLINE_ENABLED = (1 << 3), IRQCHIP_SKIP_SET_WAKE = (1 << 4), IRQCHIP_ONESHOT_SAFE = (1 << 5), IRQCHIP_EOI_THREADED = (1 << 6), IRQCHIP_SUPPORTS_LEVEL_MSI = (1 << 7), IRQCHIP_SUPPORTS_NMI = (1 << 8), IRQCHIP_ENABLE_WAKEUP_ON_SUSPEND = (1 << 9), IRQCHIP_AFFINITY_PRE_STARTUP = (1 << 10), IRQCHIP_IMMUTABLE = (1 << 11), IRQCHIP_MOVE_DEFERRED = (1 << 12), }; #include <linux/irqdesc.h> /* * Pick up the arch-dependent methods: */ #include <asm/hw_irq.h> #ifndef NR_IRQS_LEGACY # define NR_IRQS_LEGACY 0 #endif #ifndef ARCH_IRQ_INIT_FLAGS # define ARCH_IRQ_INIT_FLAGS 0 #endif #define IRQ_DEFAULT_INIT_FLAGS ARCH_IRQ_INIT_FLAGS struct irqaction; extern int setup_percpu_irq(unsigned int irq, struct irqaction *new); extern void remove_percpu_irq(unsigned int irq, struct irqaction *act); #ifdef CONFIG_DEPRECATED_IRQ_CPU_ONOFFLINE extern void irq_cpu_online(void); extern void irq_cpu_offline(void); #endif extern int irq_set_affinity_locked(struct irq_data *data, const struct cpumask *cpumask, bool force); extern int irq_set_vcpu_affinity(unsigned int irq, void *vcpu_info); #if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_IRQ_MIGRATION) extern void irq_migrate_all_off_this_cpu(void); extern int irq_affinity_online_cpu(unsigned int cpu); #else # define irq_affinity_online_cpu NULL #endif #if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_PENDING_IRQ) bool irq_can_move_in_process_context(struct irq_data *data); void __irq_move_irq(struct irq_data *data); static inline void irq_move_irq(struct irq_data *data) { if (unlikely(irqd_is_setaffinity_pending(data))) __irq_move_irq(data); } void irq_move_masked_irq(struct irq_data *data); #else static inline bool irq_can_move_in_process_context(struct irq_data *data) { return true; } static inline void irq_move_irq(struct irq_data *data) { } static inline void irq_move_masked_irq(struct irq_data *data) { } #endif extern int no_irq_affinity; #ifdef CONFIG_HARDIRQS_SW_RESEND int irq_set_parent(int irq, int parent_irq); #else static inline int irq_set_parent(int irq, int parent_irq) { return 0; } #endif /* * Built-in IRQ handlers for various IRQ types, * callable via desc->handle_irq() */ extern void handle_level_irq(struct irq_desc *desc); extern void handle_fasteoi_irq(struct irq_desc *desc); extern void handle_edge_irq(struct irq_desc *desc); extern void handle_edge_eoi_irq(struct irq_desc *desc); extern void handle_simple_irq(struct irq_desc *desc); extern void handle_untracked_irq(struct irq_desc *desc); extern void handle_percpu_irq(struct irq_desc *desc); extern void handle_percpu_devid_irq(struct irq_desc *desc); extern void handle_bad_irq(struct irq_desc *desc); extern void handle_nested_irq(unsigned int irq); extern void handle_fasteoi_nmi(struct irq_desc *desc); extern void handle_percpu_devid_fasteoi_nmi(struct irq_desc *desc); extern int irq_chip_compose_msi_msg(struct irq_data *data, struct msi_msg *msg); extern int irq_chip_pm_get(struct irq_data *data); extern int irq_chip_pm_put(struct irq_data *data); #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY extern void handle_fasteoi_ack_irq(struct irq_desc *desc); extern void handle_fasteoi_mask_irq(struct irq_desc *desc); extern int irq_chip_set_parent_state(struct irq_data *data, enum irqchip_irq_state which, bool val); extern int irq_chip_get_parent_state(struct irq_data *data, enum irqchip_irq_state which, bool *state); extern void irq_chip_enable_parent(struct irq_data *data); extern void irq_chip_disable_parent(struct irq_data *data); extern void irq_chip_ack_parent(struct irq_data *data); extern int irq_chip_retrigger_hierarchy(struct irq_data *data); extern void irq_chip_mask_parent(struct irq_data *data); extern void irq_chip_mask_ack_parent(struct irq_data *data); extern void irq_chip_unmask_parent(struct irq_data *data); extern void irq_chip_eoi_parent(struct irq_data *data); extern int irq_chip_set_affinity_parent(struct irq_data *data, const struct cpumask *dest, bool force); extern int irq_chip_set_wake_parent(struct irq_data *data, unsigned int on); extern int irq_chip_set_vcpu_affinity_parent(struct irq_data *data, void *vcpu_info); extern int irq_chip_set_type_parent(struct irq_data *data, unsigned int type); extern int irq_chip_request_resources_parent(struct irq_data *data); extern void irq_chip_release_resources_parent(struct irq_data *data); #endif /* Disable or mask interrupts during a kernel kexec */ extern void machine_kexec_mask_interrupts(void); /* Handling of unhandled and spurious interrupts: */ extern void note_interrupt(struct irq_desc *desc, irqreturn_t action_ret); /* Enable/disable irq debugging output: */ extern int noirqdebug_setup(char *str); /* Checks whether the interrupt can be requested by request_irq(): */ extern int can_request_irq(unsigned int irq, unsigned long irqflags); /* Dummy irq-chip implementations: */ extern struct irq_chip no_irq_chip; extern struct irq_chip dummy_irq_chip; extern void irq_set_chip_and_handler_name(unsigned int irq, const struct irq_chip *chip, irq_flow_handler_t handle, const char *name); static inline void irq_set_chip_and_handler(unsigned int irq, const struct irq_chip *chip, irq_flow_handler_t handle) { irq_set_chip_and_handler_name(irq, chip, handle, NULL); } extern int irq_set_percpu_devid(unsigned int irq); extern int irq_set_percpu_devid_partition(unsigned int irq, const struct cpumask *affinity); extern int irq_get_percpu_devid_partition(unsigned int irq, struct cpumask *affinity); extern void __irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, const char *name); static inline void irq_set_handler(unsigned int irq, irq_flow_handler_t handle) { __irq_set_handler(irq, handle, 0, NULL); } /* * Set a highlevel chained flow handler for a given IRQ. * (a chained handler is automatically enabled and set to * IRQ_NOREQUEST, IRQ_NOPROBE, and IRQ_NOTHREAD) */ static inline void irq_set_chained_handler(unsigned int irq, irq_flow_handler_t handle) { __irq_set_handler(irq, handle, 1, NULL); } /* * Set a highlevel chained flow handler and its data for a given IRQ. * (a chained handler is automatically enabled and set to * IRQ_NOREQUEST, IRQ_NOPROBE, and IRQ_NOTHREAD) */ void irq_set_chained_handler_and_data(unsigned int irq, irq_flow_handler_t handle, void *data); void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set); static inline void irq_set_status_flags(unsigned int irq, unsigned long set) { irq_modify_status(irq, 0, set); } static inline void irq_clear_status_flags(unsigned int irq, unsigned long clr) { irq_modify_status(irq, clr, 0); } static inline void irq_set_noprobe(unsigned int irq) { irq_modify_status(irq, 0, IRQ_NOPROBE); } static inline void irq_set_probe(unsigned int irq) { irq_modify_status(irq, IRQ_NOPROBE, 0); } static inline void irq_set_nothread(unsigned int irq) { irq_modify_status(irq, 0, IRQ_NOTHREAD); } static inline void irq_set_thread(unsigned int irq) { irq_modify_status(irq, IRQ_NOTHREAD, 0); } static inline void irq_set_nested_thread(unsigned int irq, bool nest) { if (nest) irq_set_status_flags(irq, IRQ_NESTED_THREAD); else irq_clear_status_flags(irq, IRQ_NESTED_THREAD); } static inline void irq_set_percpu_devid_flags(unsigned int irq) { irq_set_status_flags(irq, IRQ_NOAUTOEN | IRQ_PER_CPU | IRQ_NOTHREAD | IRQ_NOPROBE | IRQ_PER_CPU_DEVID); } /* Set/get chip/data for an IRQ: */ extern int irq_set_chip(unsigned int irq, const struct irq_chip *chip); extern int irq_set_handler_data(unsigned int irq, void *data); extern int irq_set_chip_data(unsigned int irq, void *data); extern int irq_set_irq_type(unsigned int irq, unsigned int type); extern int irq_set_msi_desc(unsigned int irq, struct msi_desc *entry); extern int irq_set_msi_desc_off(unsigned int irq_base, unsigned int irq_offset, struct msi_desc *entry); extern struct irq_data *irq_get_irq_data(unsigned int irq); static inline struct irq_chip *irq_get_chip(unsigned int irq) { struct irq_data *d = irq_get_irq_data(irq); return d ? d->chip : NULL; } static inline struct irq_chip *irq_data_get_irq_chip(struct irq_data *d) { return d->chip; } static inline void *irq_get_chip_data(unsigned int irq) { struct irq_data *d = irq_get_irq_data(irq); return d ? d->chip_data : NULL; } static inline void *irq_data_get_irq_chip_data(struct irq_data *d) { return d->chip_data; } static inline void *irq_get_handler_data(unsigned int irq) { struct irq_data *d = irq_get_irq_data(irq); return d ? d->common->handler_data : NULL; } static inline void *irq_data_get_irq_handler_data(struct irq_data *d) { return d->common->handler_data; } static inline struct msi_desc *irq_get_msi_desc(unsigned int irq) { struct irq_data *d = irq_get_irq_data(irq); return d ? d->common->msi_desc : NULL; } static inline struct msi_desc *irq_data_get_msi_desc(struct irq_data *d) { return d->common->msi_desc; } static inline u32 irq_get_trigger_type(unsigned int irq) { struct irq_data *d = irq_get_irq_data(irq); return d ? irqd_get_trigger_type(d) : 0; } static inline int irq_common_data_get_node(struct irq_common_data *d) { #ifdef CONFIG_NUMA return d->node; #else return 0; #endif } static inline int irq_data_get_node(struct irq_data *d) { return irq_common_data_get_node(d->common); } static inline const struct cpumask *irq_data_get_affinity_mask(struct irq_data *d) { #ifdef CONFIG_SMP return d->common->affinity; #else return cpumask_of(0); #endif } static inline void irq_data_update_affinity(struct irq_data *d, const struct cpumask *m) { #ifdef CONFIG_SMP cpumask_copy(d->common->affinity, m); #endif } static inline const struct cpumask *irq_get_affinity_mask(int irq) { struct irq_data *d = irq_get_irq_data(irq); return d ? irq_data_get_affinity_mask(d) : NULL; } #ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK static inline const struct cpumask *irq_data_get_effective_affinity_mask(struct irq_data *d) { return d->common->effective_affinity; } static inline void irq_data_update_effective_affinity(struct irq_data *d, const struct cpumask *m) { cpumask_copy(d->common->effective_affinity, m); } #else static inline void irq_data_update_effective_affinity(struct irq_data *d, const struct cpumask *m) { } static inline const struct cpumask *irq_data_get_effective_affinity_mask(struct irq_data *d) { return irq_data_get_affinity_mask(d); } #endif static inline const struct cpumask *irq_get_effective_affinity_mask(unsigned int irq) { struct irq_data *d = irq_get_irq_data(irq); return d ? irq_data_get_effective_affinity_mask(d) : NULL; } unsigned int arch_dynirq_lower_bound(unsigned int from); int __irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node, struct module *owner, const struct irq_affinity_desc *affinity); int __devm_irq_alloc_descs(struct device *dev, int irq, unsigned int from, unsigned int cnt, int node, struct module *owner, const struct irq_affinity_desc *affinity); /* use macros to avoid needing export.h for THIS_MODULE */ #define irq_alloc_descs(irq, from, cnt, node) \ __irq_alloc_descs(irq, from, cnt, node, THIS_MODULE, NULL) #define irq_alloc_desc(node) \ irq_alloc_descs(-1, 1, 1, node) #define irq_alloc_desc_at(at, node) \ irq_alloc_descs(at, at, 1, node) #define irq_alloc_desc_from(from, node) \ irq_alloc_descs(-1, from, 1, node) #define irq_alloc_descs_from(from, cnt, node) \ irq_alloc_descs(-1, from, cnt, node) #define devm_irq_alloc_descs(dev, irq, from, cnt, node) \ __devm_irq_alloc_descs(dev, irq, from, cnt, node, THIS_MODULE, NULL) #define devm_irq_alloc_desc(dev, node) \ devm_irq_alloc_descs(dev, -1, 1, 1, node) #define devm_irq_alloc_desc_at(dev, at, node) \ devm_irq_alloc_descs(dev, at, at, 1, node) #define devm_irq_alloc_desc_from(dev, from, node) \ devm_irq_alloc_descs(dev, -1, from, 1, node) #define devm_irq_alloc_descs_from(dev, from, cnt, node) \ devm_irq_alloc_descs(dev, -1, from, cnt, node) void irq_free_descs(unsigned int irq, unsigned int cnt); static inline void irq_free_desc(unsigned int irq) { irq_free_descs(irq, 1); } #ifdef CONFIG_GENERIC_IRQ_LEGACY void irq_init_desc(unsigned int irq); #endif /** * struct irq_chip_regs - register offsets for struct irq_gci * @enable: Enable register offset to reg_base * @disable: Disable register offset to reg_base * @mask: Mask register offset to reg_base * @ack: Ack register offset to reg_base * @eoi: Eoi register offset to reg_base * @type: Type configuration register offset to reg_base */ struct irq_chip_regs { unsigned long enable; unsigned long disable; unsigned long mask; unsigned long ack; unsigned long eoi; unsigned long type; }; /** * struct irq_chip_type - Generic interrupt chip instance for a flow type * @chip: The real interrupt chip which provides the callbacks * @regs: Register offsets for this chip * @handler: Flow handler associated with this chip * @type: Chip can handle these flow types * @mask_cache_priv: Cached mask register private to the chip type * @mask_cache: Pointer to cached mask register * * A irq_generic_chip can have several instances of irq_chip_type when * it requires different functions and register offsets for different * flow types. */ struct irq_chip_type { struct irq_chip chip; struct irq_chip_regs regs; irq_flow_handler_t handler; u32 type; u32 mask_cache_priv; u32 *mask_cache; }; /** * struct irq_chip_generic - Generic irq chip data structure * @lock: Lock to protect register and cache data access * @reg_base: Register base address (virtual) * @reg_readl: Alternate I/O accessor (defaults to readl if NULL) * @reg_writel: Alternate I/O accessor (defaults to writel if NULL) * @suspend: Function called from core code on suspend once per * chip; can be useful instead of irq_chip::suspend to * handle chip details even when no interrupts are in use * @resume: Function called from core code on resume once per chip; * can be useful instead of irq_chip::suspend to handle * chip details even when no interrupts are in use * @irq_base: Interrupt base nr for this chip * @irq_cnt: Number of interrupts handled by this chip * @mask_cache: Cached mask register shared between all chip types * @wake_enabled: Interrupt can wakeup from suspend * @wake_active: Interrupt is marked as an wakeup from suspend source * @num_ct: Number of available irq_chip_type instances (usually 1) * @private: Private data for non generic chip callbacks * @installed: bitfield to denote installed interrupts * @unused: bitfield to denote unused interrupts * @domain: irq domain pointer * @list: List head for keeping track of instances * @chip_types: Array of interrupt irq_chip_types * * Note, that irq_chip_generic can have multiple irq_chip_type * implementations which can be associated to a particular irq line of * an irq_chip_generic instance. That allows to share and protect * state in an irq_chip_generic instance when we need to implement * different flow mechanisms (level/edge) for it. */ struct irq_chip_generic { raw_spinlock_t lock; void __iomem *reg_base; u32 (*reg_readl)(void __iomem *addr); void (*reg_writel)(u32 val, void __iomem *addr); void (*suspend)(struct irq_chip_generic *gc); void (*resume)(struct irq_chip_generic *gc); unsigned int irq_base; unsigned int irq_cnt; u32 mask_cache; u32 wake_enabled; u32 wake_active; unsigned int num_ct; void *private; unsigned long installed; unsigned long unused; struct irq_domain *domain; struct list_head list; struct irq_chip_type chip_types[]; }; /** * enum irq_gc_flags - Initialization flags for generic irq chips * @IRQ_GC_INIT_MASK_CACHE: Initialize the mask_cache by reading mask reg * @IRQ_GC_INIT_NESTED_LOCK: Set the lock class of the irqs to nested for * irq chips which need to call irq_set_wake() on * the parent irq. Usually GPIO implementations * @IRQ_GC_MASK_CACHE_PER_TYPE: Mask cache is chip type private * @IRQ_GC_NO_MASK: Do not calculate irq_data->mask * @IRQ_GC_BE_IO: Use big-endian register accesses (default: LE) */ enum irq_gc_flags { IRQ_GC_INIT_MASK_CACHE = 1 << 0, IRQ_GC_INIT_NESTED_LOCK = 1 << 1, IRQ_GC_MASK_CACHE_PER_TYPE = 1 << 2, IRQ_GC_NO_MASK = 1 << 3, IRQ_GC_BE_IO = 1 << 4, }; /* * struct irq_domain_chip_generic - Generic irq chip data structure for irq domains * @irqs_per_chip: Number of interrupts per chip * @num_chips: Number of chips * @irq_flags_to_set: IRQ* flags to set on irq setup * @irq_flags_to_clear: IRQ* flags to clear on irq setup * @gc_flags: Generic chip specific setup flags * @exit: Function called on each chip when they are destroyed. * @gc: Array of pointers to generic interrupt chips */ struct irq_domain_chip_generic { unsigned int irqs_per_chip; unsigned int num_chips; unsigned int irq_flags_to_clear; unsigned int irq_flags_to_set; enum irq_gc_flags gc_flags; void (*exit)(struct irq_chip_generic *gc); struct irq_chip_generic *gc[]; }; /** * struct irq_domain_chip_generic_info - Generic chip information structure * @name: Name of the generic interrupt chip * @handler: Interrupt handler used by the generic interrupt chip * @irqs_per_chip: Number of interrupts each chip handles (max 32) * @num_ct: Number of irq_chip_type instances associated with each * chip * @irq_flags_to_clear: IRQ_* bits to clear in the mapping function * @irq_flags_to_set: IRQ_* bits to set in the mapping function * @gc_flags: Generic chip specific setup flags * @init: Function called on each chip when they are created. * Allow to do some additional chip initialisation. * @exit: Function called on each chip when they are destroyed. * Allow to do some chip cleanup operation. */ struct irq_domain_chip_generic_info { const char *name; irq_flow_handler_t handler; unsigned int irqs_per_chip; unsigned int num_ct; unsigned int irq_flags_to_clear; unsigned int irq_flags_to_set; enum irq_gc_flags gc_flags; int (*init)(struct irq_chip_generic *gc); void (*exit)(struct irq_chip_generic *gc); }; /* Generic chip callback functions */ void irq_gc_noop(struct irq_data *d); void irq_gc_mask_disable_reg(struct irq_data *d); void irq_gc_mask_set_bit(struct irq_data *d); void irq_gc_mask_clr_bit(struct irq_data *d); void irq_gc_unmask_enable_reg(struct irq_data *d); void irq_gc_ack_set_bit(struct irq_data *d); void irq_gc_ack_clr_bit(struct irq_data *d); void irq_gc_mask_disable_and_ack_set(struct irq_data *d); void irq_gc_eoi(struct irq_data *d); int irq_gc_set_wake(struct irq_data *d, unsigned int on); /* Setup functions for irq_chip_generic */ int irq_map_generic_chip(struct irq_domain *d, unsigned int virq, irq_hw_number_t hw_irq); void irq_unmap_generic_chip(struct irq_domain *d, unsigned int virq); struct irq_chip_generic * irq_alloc_generic_chip(const char *name, int nr_ct, unsigned int irq_base, void __iomem *reg_base, irq_flow_handler_t handler); void irq_setup_generic_chip(struct irq_chip_generic *gc, u32 msk, enum irq_gc_flags flags, unsigned int clr, unsigned int set); int irq_setup_alt_chip(struct irq_data *d, unsigned int type); void irq_remove_generic_chip(struct irq_chip_generic *gc, u32 msk, unsigned int clr, unsigned int set); struct irq_chip_generic * devm_irq_alloc_generic_chip(struct device *dev, const char *name, int num_ct, unsigned int irq_base, void __iomem *reg_base, irq_flow_handler_t handler); int devm_irq_setup_generic_chip(struct device *dev, struct irq_chip_generic *gc, u32 msk, enum irq_gc_flags flags, unsigned int clr, unsigned int set); struct irq_chip_generic *irq_get_domain_generic_chip(struct irq_domain *d, unsigned int hw_irq); #ifdef CONFIG_GENERIC_IRQ_CHIP int irq_domain_alloc_generic_chips(struct irq_domain *d, const struct irq_domain_chip_generic_info *info); void irq_domain_remove_generic_chips(struct irq_domain *d); #else static inline int irq_domain_alloc_generic_chips(struct irq_domain *d, const struct irq_domain_chip_generic_info *info) { return -EINVAL; } static inline void irq_domain_remove_generic_chips(struct irq_domain *d) { } #endif /* CONFIG_GENERIC_IRQ_CHIP */ int __irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip, int num_ct, const char *name, irq_flow_handler_t handler, unsigned int clr, unsigned int set, enum irq_gc_flags flags); #define irq_alloc_domain_generic_chips(d, irqs_per_chip, num_ct, name, \ handler, clr, set, flags) \ ({ \ MAYBE_BUILD_BUG_ON(irqs_per_chip > 32); \ __irq_alloc_domain_generic_chips(d, irqs_per_chip, num_ct, name,\ handler, clr, set, flags); \ }) static inline void irq_free_generic_chip(struct irq_chip_generic *gc) { kfree(gc); } static inline void irq_destroy_generic_chip(struct irq_chip_generic *gc, u32 msk, unsigned int clr, unsigned int set) { irq_remove_generic_chip(gc, msk, clr, set); irq_free_generic_chip(gc); } static inline struct irq_chip_type *irq_data_get_chip_type(struct irq_data *d) { return container_of(d->chip, struct irq_chip_type, chip); } #define IRQ_MSK(n) (u32)((n) < 32 ? ((1 << (n)) - 1) : UINT_MAX) #ifdef CONFIG_SMP static inline void irq_gc_lock(struct irq_chip_generic *gc) { raw_spin_lock(&gc->lock); } static inline void irq_gc_unlock(struct irq_chip_generic *gc) { raw_spin_unlock(&gc->lock); } #else static inline void irq_gc_lock(struct irq_chip_generic *gc) { } static inline void irq_gc_unlock(struct irq_chip_generic *gc) { } #endif /* * The irqsave variants are for usage in non interrupt code. Do not use * them in irq_chip callbacks. Use irq_gc_lock() instead. */ #define irq_gc_lock_irqsave(gc, flags) \ raw_spin_lock_irqsave(&(gc)->lock, flags) #define irq_gc_unlock_irqrestore(gc, flags) \ raw_spin_unlock_irqrestore(&(gc)->lock, flags) static inline void irq_reg_writel(struct irq_chip_generic *gc, u32 val, int reg_offset) { if (gc->reg_writel) gc->reg_writel(val, gc->reg_base + reg_offset); else writel(val, gc->reg_base + reg_offset); } static inline u32 irq_reg_readl(struct irq_chip_generic *gc, int reg_offset) { if (gc->reg_readl) return gc->reg_readl(gc->reg_base + reg_offset); else return readl(gc->reg_base + reg_offset); } struct irq_matrix; struct irq_matrix *irq_alloc_matrix(unsigned int matrix_bits, unsigned int alloc_start, unsigned int alloc_end); void irq_matrix_online(struct irq_matrix *m); void irq_matrix_offline(struct irq_matrix *m); void irq_matrix_assign_system(struct irq_matrix *m, unsigned int bit, bool replace); int irq_matrix_reserve_managed(struct irq_matrix *m, const struct cpumask *msk); void irq_matrix_remove_managed(struct irq_matrix *m, const struct cpumask *msk); int irq_matrix_alloc_managed(struct irq_matrix *m, const struct cpumask *msk, unsigned int *mapped_cpu); void irq_matrix_reserve(struct irq_matrix *m); void irq_matrix_remove_reserved(struct irq_matrix *m); int irq_matrix_alloc(struct irq_matrix *m, const struct cpumask *msk, bool reserved, unsigned int *mapped_cpu); void irq_matrix_free(struct irq_matrix *m, unsigned int cpu, unsigned int bit, bool managed); void irq_matrix_assign(struct irq_matrix *m, unsigned int bit); unsigned int irq_matrix_available(struct irq_matrix *m, bool cpudown); unsigned int irq_matrix_allocated(struct irq_matrix *m); unsigned int irq_matrix_reserved(struct irq_matrix *m); void irq_matrix_debug_show(struct seq_file *sf, struct irq_matrix *m, int ind); /* Contrary to Linux irqs, for hardware irqs the irq number 0 is valid */ #define INVALID_HWIRQ (~0UL) irq_hw_number_t ipi_get_hwirq(unsigned int irq, unsigned int cpu); int __ipi_send_single(struct irq_desc *desc, unsigned int cpu); int __ipi_send_mask(struct irq_desc *desc, const struct cpumask *dest); int ipi_send_single(unsigned int virq, unsigned int cpu); int ipi_send_mask(unsigned int virq, const struct cpumask *dest); void ipi_mux_process(void); int ipi_mux_create(unsigned int nr_ipi, void (*mux_send)(unsigned int cpu)); #ifdef CONFIG_GENERIC_IRQ_MULTI_HANDLER /* * Registers a generic IRQ handling function as the top-level IRQ handler in * the system, which is generally the first C code called from an assembly * architecture-specific interrupt handler. * * Returns 0 on success, or -EBUSY if an IRQ handler has already been * registered. */ int __init set_handle_irq(void (*handle_irq)(struct pt_regs *)); /* * Allows interrupt handlers to find the irqchip that's been registered as the * top-level IRQ handler. */ extern void (*handle_arch_irq)(struct pt_regs *) __ro_after_init; asmlinkage void generic_handle_arch_irq(struct pt_regs *regs); #else #ifndef set_handle_irq #define set_handle_irq(handle_irq) \ do { \ (void)handle_irq; \ WARN_ON(1); \ } while (0) #endif #endif #endif /* _LINUX_IRQ_H */
3 10 2 3 36 15 21 18 6 6 6 3 36 36 3 3 3 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. */ #include "queueing.h" #include "device.h" #include "peer.h" #include "timers.h" #include "messages.h" #include "cookie.h" #include "socket.h" #include <linux/ip.h> #include <linux/ipv6.h> #include <linux/udp.h> #include <net/ip_tunnels.h> /* Must be called with bh disabled. */ static void update_rx_stats(struct wg_peer *peer, size_t len) { dev_sw_netstats_rx_add(peer->device->dev, len); peer->rx_bytes += len; } #define SKB_TYPE_LE32(skb) (((struct message_header *)(skb)->data)->type) static size_t validate_header_len(struct sk_buff *skb) { if (unlikely(skb->len < sizeof(struct message_header))) return 0; if (SKB_TYPE_LE32(skb) == cpu_to_le32(MESSAGE_DATA) && skb->len >= MESSAGE_MINIMUM_LENGTH) return sizeof(struct message_data); if (SKB_TYPE_LE32(skb) == cpu_to_le32(MESSAGE_HANDSHAKE_INITIATION) && skb->len == sizeof(struct message_handshake_initiation)) return sizeof(struct message_handshake_initiation); if (SKB_TYPE_LE32(skb) == cpu_to_le32(MESSAGE_HANDSHAKE_RESPONSE) && skb->len == sizeof(struct message_handshake_response)) return sizeof(struct message_handshake_response); if (SKB_TYPE_LE32(skb) == cpu_to_le32(MESSAGE_HANDSHAKE_COOKIE) && skb->len == sizeof(struct message_handshake_cookie)) return sizeof(struct message_handshake_cookie); return 0; } static int prepare_skb_header(struct sk_buff *skb, struct wg_device *wg) { size_t data_offset, data_len, header_len; struct udphdr *udp; if (unlikely(!wg_check_packet_protocol(skb) || skb_transport_header(skb) < skb->head || (skb_transport_header(skb) + sizeof(struct udphdr)) > skb_tail_pointer(skb))) return -EINVAL; /* Bogus IP header */ udp = udp_hdr(skb); data_offset = (u8 *)udp - skb->data; if (unlikely(data_offset > U16_MAX || data_offset + sizeof(struct udphdr) > skb->len)) /* Packet has offset at impossible location or isn't big enough * to have UDP fields. */ return -EINVAL; data_len = ntohs(udp->len); if (unlikely(data_len < sizeof(struct udphdr) || data_len > skb->len - data_offset)) /* UDP packet is reporting too small of a size or lying about * its size. */ return -EINVAL; data_len -= sizeof(struct udphdr); data_offset = (u8 *)udp + sizeof(struct udphdr) - skb->data; if (unlikely(!pskb_may_pull(skb, data_offset + sizeof(struct message_header)) || pskb_trim(skb, data_len + data_offset) < 0)) return -EINVAL; skb_pull(skb, data_offset); if (unlikely(skb->len != data_len)) /* Final len does not agree with calculated len */ return -EINVAL; header_len = validate_header_len(skb); if (unlikely(!header_len)) return -EINVAL; __skb_push(skb, data_offset); if (unlikely(!pskb_may_pull(skb, data_offset + header_len))) return -EINVAL; __skb_pull(skb, data_offset); return 0; } static void wg_receive_handshake_packet(struct wg_device *wg, struct sk_buff *skb) { enum cookie_mac_state mac_state; struct wg_peer *peer = NULL; /* This is global, so that our load calculation applies to the whole * system. We don't care about races with it at all. */ static u64 last_under_load; bool packet_needs_cookie; bool under_load; if (SKB_TYPE_LE32(skb) == cpu_to_le32(MESSAGE_HANDSHAKE_COOKIE)) { net_dbg_skb_ratelimited("%s: Receiving cookie response from %pISpfsc\n", wg->dev->name, skb); wg_cookie_message_consume( (struct message_handshake_cookie *)skb->data, wg); return; } under_load = atomic_read(&wg->handshake_queue_len) >= MAX_QUEUED_INCOMING_HANDSHAKES / 8; if (under_load) { last_under_load = ktime_get_coarse_boottime_ns(); } else if (last_under_load) { under_load = !wg_birthdate_has_expired(last_under_load, 1); if (!under_load) last_under_load = 0; } mac_state = wg_cookie_validate_packet(&wg->cookie_checker, skb, under_load); if ((under_load && mac_state == VALID_MAC_WITH_COOKIE) || (!under_load && mac_state == VALID_MAC_BUT_NO_COOKIE)) { packet_needs_cookie = false; } else if (under_load && mac_state == VALID_MAC_BUT_NO_COOKIE) { packet_needs_cookie = true; } else { net_dbg_skb_ratelimited("%s: Invalid MAC of handshake, dropping packet from %pISpfsc\n", wg->dev->name, skb); return; } switch (SKB_TYPE_LE32(skb)) { case cpu_to_le32(MESSAGE_HANDSHAKE_INITIATION): { struct message_handshake_initiation *message = (struct message_handshake_initiation *)skb->data; if (packet_needs_cookie) { wg_packet_send_handshake_cookie(wg, skb, message->sender_index); return; } peer = wg_noise_handshake_consume_initiation(message, wg); if (unlikely(!peer)) { net_dbg_skb_ratelimited("%s: Invalid handshake initiation from %pISpfsc\n", wg->dev->name, skb); return; } wg_socket_set_peer_endpoint_from_skb(peer, skb); net_dbg_ratelimited("%s: Receiving handshake initiation from peer %llu (%pISpfsc)\n", wg->dev->name, peer->internal_id, &peer->endpoint.addr); wg_packet_send_handshake_response(peer); break; } case cpu_to_le32(MESSAGE_HANDSHAKE_RESPONSE): { struct message_handshake_response *message = (struct message_handshake_response *)skb->data; if (packet_needs_cookie) { wg_packet_send_handshake_cookie(wg, skb, message->sender_index); return; } peer = wg_noise_handshake_consume_response(message, wg); if (unlikely(!peer)) { net_dbg_skb_ratelimited("%s: Invalid handshake response from %pISpfsc\n", wg->dev->name, skb); return; } wg_socket_set_peer_endpoint_from_skb(peer, skb); net_dbg_ratelimited("%s: Receiving handshake response from peer %llu (%pISpfsc)\n", wg->dev->name, peer->internal_id, &peer->endpoint.addr); if (wg_noise_handshake_begin_session(&peer->handshake, &peer->keypairs)) { wg_timers_session_derived(peer); wg_timers_handshake_complete(peer); /* Calling this function will either send any existing * packets in the queue and not send a keepalive, which * is the best case, Or, if there's nothing in the * queue, it will send a keepalive, in order to give * immediate confirmation of the session. */ wg_packet_send_keepalive(peer); } break; } } if (unlikely(!peer)) { WARN(1, "Somehow a wrong type of packet wound up in the handshake queue!\n"); return; } local_bh_disable(); update_rx_stats(peer, skb->len); local_bh_enable(); wg_timers_any_authenticated_packet_received(peer); wg_timers_any_authenticated_packet_traversal(peer); wg_peer_put(peer); } void wg_packet_handshake_receive_worker(struct work_struct *work) { struct crypt_queue *queue = container_of(work, struct multicore_worker, work)->ptr; struct wg_device *wg = container_of(queue, struct wg_device, handshake_queue); struct sk_buff *skb; while ((skb = ptr_ring_consume_bh(&queue->ring)) != NULL) { wg_receive_handshake_packet(wg, skb); dev_kfree_skb(skb); atomic_dec(&wg->handshake_queue_len); cond_resched(); } } static void keep_key_fresh(struct wg_peer *peer) { struct noise_keypair *keypair; bool send; if (peer->sent_lastminute_handshake) return; rcu_read_lock_bh(); keypair = rcu_dereference_bh(peer->keypairs.current_keypair); send = keypair && READ_ONCE(keypair->sending.is_valid) && keypair->i_am_the_initiator && wg_birthdate_has_expired(keypair->sending.birthdate, REJECT_AFTER_TIME - KEEPALIVE_TIMEOUT - REKEY_TIMEOUT); rcu_read_unlock_bh(); if (unlikely(send)) { peer->sent_lastminute_handshake = true; wg_packet_send_queued_handshake_initiation(peer, false); } } static bool decrypt_packet(struct sk_buff *skb, struct noise_keypair *keypair) { struct scatterlist sg[MAX_SKB_FRAGS + 8]; struct sk_buff *trailer; unsigned int offset; int num_frags; if (unlikely(!keypair)) return false; if (unlikely(!READ_ONCE(keypair->receiving.is_valid) || wg_birthdate_has_expired(keypair->receiving.birthdate, REJECT_AFTER_TIME) || READ_ONCE(keypair->receiving_counter.counter) >= REJECT_AFTER_MESSAGES)) { WRITE_ONCE(keypair->receiving.is_valid, false); return false; } PACKET_CB(skb)->nonce = le64_to_cpu(((struct message_data *)skb->data)->counter); /* We ensure that the network header is part of the packet before we * call skb_cow_data, so that there's no chance that data is removed * from the skb, so that later we can extract the original endpoint. */ offset = -skb_network_offset(skb); skb_push(skb, offset); num_frags = skb_cow_data(skb, 0, &trailer); offset += sizeof(struct message_data); skb_pull(skb, offset); if (unlikely(num_frags < 0 || num_frags > ARRAY_SIZE(sg))) return false; sg_init_table(sg, num_frags); if (skb_to_sgvec(skb, sg, 0, skb->len) <= 0) return false; if (!chacha20poly1305_decrypt_sg_inplace(sg, skb->len, NULL, 0, PACKET_CB(skb)->nonce, keypair->receiving.key)) return false; /* Another ugly situation of pushing and pulling the header so as to * keep endpoint information intact. */ skb_push(skb, offset); if (pskb_trim(skb, skb->len - noise_encrypted_len(0))) return false; skb_pull(skb, offset); return true; } /* This is RFC6479, a replay detection bitmap algorithm that avoids bitshifts */ static bool counter_validate(struct noise_replay_counter *counter, u64 their_counter) { unsigned long index, index_current, top, i; bool ret = false; spin_lock_bh(&counter->lock); if (unlikely(counter->counter >= REJECT_AFTER_MESSAGES + 1 || their_counter >= REJECT_AFTER_MESSAGES)) goto out; ++their_counter; if (unlikely((COUNTER_WINDOW_SIZE + their_counter) < counter->counter)) goto out; index = their_counter >> ilog2(BITS_PER_LONG); if (likely(their_counter > counter->counter)) { index_current = counter->counter >> ilog2(BITS_PER_LONG); top = min_t(unsigned long, index - index_current, COUNTER_BITS_TOTAL / BITS_PER_LONG); for (i = 1; i <= top; ++i) counter->backtrack[(i + index_current) & ((COUNTER_BITS_TOTAL / BITS_PER_LONG) - 1)] = 0; WRITE_ONCE(counter->counter, their_counter); } index &= (COUNTER_BITS_TOTAL / BITS_PER_LONG) - 1; ret = !test_and_set_bit(their_counter & (BITS_PER_LONG - 1), &counter->backtrack[index]); out: spin_unlock_bh(&counter->lock); return ret; } #include "selftest/counter.c" static void wg_packet_consume_data_done(struct wg_peer *peer, struct sk_buff *skb, struct endpoint *endpoint) { struct net_device *dev = peer->device->dev; unsigned int len, len_before_trim; struct wg_peer *routed_peer; wg_socket_set_peer_endpoint(peer, endpoint); if (unlikely(wg_noise_received_with_keypair(&peer->keypairs, PACKET_CB(skb)->keypair))) { wg_timers_handshake_complete(peer); wg_packet_send_staged_packets(peer); } keep_key_fresh(peer); wg_timers_any_authenticated_packet_received(peer); wg_timers_any_authenticated_packet_traversal(peer); /* A packet with length 0 is a keepalive packet */ if (unlikely(!skb->len)) { update_rx_stats(peer, message_data_len(0)); net_dbg_ratelimited("%s: Receiving keepalive packet from peer %llu (%pISpfsc)\n", dev->name, peer->internal_id, &peer->endpoint.addr); goto packet_processed; } wg_timers_data_received(peer); if (unlikely(skb_network_header(skb) < skb->head)) goto dishonest_packet_size; if (unlikely(!(pskb_network_may_pull(skb, sizeof(struct iphdr)) && (ip_hdr(skb)->version == 4 || (ip_hdr(skb)->version == 6 && pskb_network_may_pull(skb, sizeof(struct ipv6hdr))))))) goto dishonest_packet_type; skb->dev = dev; /* We've already verified the Poly1305 auth tag, which means this packet * was not modified in transit. We can therefore tell the networking * stack that all checksums of every layer of encapsulation have already * been checked "by the hardware" and therefore is unnecessary to check * again in software. */ skb->ip_summed = CHECKSUM_UNNECESSARY; skb->csum_level = ~0; /* All levels */ skb->protocol = ip_tunnel_parse_protocol(skb); if (skb->protocol == htons(ETH_P_IP)) { len = ntohs(ip_hdr(skb)->tot_len); if (unlikely(len < sizeof(struct iphdr))) goto dishonest_packet_size; INET_ECN_decapsulate(skb, PACKET_CB(skb)->ds, ip_hdr(skb)->tos); } else if (skb->protocol == htons(ETH_P_IPV6)) { len = ntohs(ipv6_hdr(skb)->payload_len) + sizeof(struct ipv6hdr); INET_ECN_decapsulate(skb, PACKET_CB(skb)->ds, ipv6_get_dsfield(ipv6_hdr(skb))); } else { goto dishonest_packet_type; } if (unlikely(len > skb->len)) goto dishonest_packet_size; len_before_trim = skb->len; if (unlikely(pskb_trim(skb, len))) goto packet_processed; routed_peer = wg_allowedips_lookup_src(&peer->device->peer_allowedips, skb); wg_peer_put(routed_peer); /* We don't need the extra reference. */ if (unlikely(routed_peer != peer)) goto dishonest_packet_peer; napi_gro_receive(&peer->napi, skb); update_rx_stats(peer, message_data_len(len_before_trim)); return; dishonest_packet_peer: net_dbg_skb_ratelimited("%s: Packet has unallowed src IP (%pISc) from peer %llu (%pISpfsc)\n", dev->name, skb, peer->internal_id, &peer->endpoint.addr); DEV_STATS_INC(dev, rx_errors); DEV_STATS_INC(dev, rx_frame_errors); goto packet_processed; dishonest_packet_type: net_dbg_ratelimited("%s: Packet is neither ipv4 nor ipv6 from peer %llu (%pISpfsc)\n", dev->name, peer->internal_id, &peer->endpoint.addr); DEV_STATS_INC(dev, rx_errors); DEV_STATS_INC(dev, rx_frame_errors); goto packet_processed; dishonest_packet_size: net_dbg_ratelimited("%s: Packet has incorrect size from peer %llu (%pISpfsc)\n", dev->name, peer->internal_id, &peer->endpoint.addr); DEV_STATS_INC(dev, rx_errors); DEV_STATS_INC(dev, rx_length_errors); goto packet_processed; packet_processed: dev_kfree_skb(skb); } int wg_packet_rx_poll(struct napi_struct *napi, int budget) { struct wg_peer *peer = container_of(napi, struct wg_peer, napi); struct noise_keypair *keypair; struct endpoint endpoint; enum packet_state state; struct sk_buff *skb; int work_done = 0; bool free; if (unlikely(budget <= 0)) return 0; while ((skb = wg_prev_queue_peek(&peer->rx_queue)) != NULL && (state = atomic_read_acquire(&PACKET_CB(skb)->state)) != PACKET_STATE_UNCRYPTED) { wg_prev_queue_drop_peeked(&peer->rx_queue); keypair = PACKET_CB(skb)->keypair; free = true; if (unlikely(state != PACKET_STATE_CRYPTED)) goto next; if (unlikely(!counter_validate(&keypair->receiving_counter, PACKET_CB(skb)->nonce))) { net_dbg_ratelimited("%s: Packet has invalid nonce %llu (max %llu)\n", peer->device->dev->name, PACKET_CB(skb)->nonce, READ_ONCE(keypair->receiving_counter.counter)); goto next; } if (unlikely(wg_socket_endpoint_from_skb(&endpoint, skb))) goto next; wg_reset_packet(skb, false); wg_packet_consume_data_done(peer, skb, &endpoint); free = false; next: wg_noise_keypair_put(keypair, false); wg_peer_put(peer); if (unlikely(free)) dev_kfree_skb(skb); if (++work_done >= budget) break; } if (work_done < budget) napi_complete_done(napi, work_done); return work_done; } void wg_packet_decrypt_worker(struct work_struct *work) { struct crypt_queue *queue = container_of(work, struct multicore_worker, work)->ptr; struct sk_buff *skb; while ((skb = ptr_ring_consume_bh(&queue->ring)) != NULL) { enum packet_state state = likely(decrypt_packet(skb, PACKET_CB(skb)->keypair)) ? PACKET_STATE_CRYPTED : PACKET_STATE_DEAD; wg_queue_enqueue_per_peer_rx(skb, state); if (need_resched()) cond_resched(); } } static void wg_packet_consume_data(struct wg_device *wg, struct sk_buff *skb) { __le32 idx = ((struct message_data *)skb->data)->key_idx; struct wg_peer *peer = NULL; int ret; rcu_read_lock_bh(); PACKET_CB(skb)->keypair = (struct noise_keypair *)wg_index_hashtable_lookup( wg->index_hashtable, INDEX_HASHTABLE_KEYPAIR, idx, &peer); if (unlikely(!wg_noise_keypair_get(PACKET_CB(skb)->keypair))) goto err_keypair; if (unlikely(READ_ONCE(peer->is_dead))) goto err; ret = wg_queue_enqueue_per_device_and_peer(&wg->decrypt_queue, &peer->rx_queue, skb, wg->packet_crypt_wq); if (unlikely(ret == -EPIPE)) wg_queue_enqueue_per_peer_rx(skb, PACKET_STATE_DEAD); if (likely(!ret || ret == -EPIPE)) { rcu_read_unlock_bh(); return; } err: wg_noise_keypair_put(PACKET_CB(skb)->keypair, false); err_keypair: rcu_read_unlock_bh(); wg_peer_put(peer); dev_kfree_skb(skb); } void wg_packet_receive(struct wg_device *wg, struct sk_buff *skb) { if (unlikely(prepare_skb_header(skb, wg) < 0)) goto err; switch (SKB_TYPE_LE32(skb)) { case cpu_to_le32(MESSAGE_HANDSHAKE_INITIATION): case cpu_to_le32(MESSAGE_HANDSHAKE_RESPONSE): case cpu_to_le32(MESSAGE_HANDSHAKE_COOKIE): { int cpu, ret = -EBUSY; if (unlikely(!rng_is_initialized())) goto drop; if (atomic_read(&wg->handshake_queue_len) > MAX_QUEUED_INCOMING_HANDSHAKES / 2) { if (spin_trylock_bh(&wg->handshake_queue.ring.producer_lock)) { ret = __ptr_ring_produce(&wg->handshake_queue.ring, skb); spin_unlock_bh(&wg->handshake_queue.ring.producer_lock); } } else ret = ptr_ring_produce_bh(&wg->handshake_queue.ring, skb); if (ret) { drop: net_dbg_skb_ratelimited("%s: Dropping handshake packet from %pISpfsc\n", wg->dev->name, skb); goto err; } atomic_inc(&wg->handshake_queue_len); cpu = wg_cpumask_next_online(&wg->handshake_queue.last_cpu); /* Queues up a call to packet_process_queued_handshake_packets(skb): */ queue_work_on(cpu, wg->handshake_receive_wq, &per_cpu_ptr(wg->handshake_queue.worker, cpu)->work); break; } case cpu_to_le32(MESSAGE_DATA): PACKET_CB(skb)->ds = ip_tunnel_get_dsfield(ip_hdr(skb), skb); wg_packet_consume_data(wg, skb); break; default: WARN(1, "Non-exhaustive parsing of packet header lead to unknown packet type!\n"); goto err; } return; err: dev_kfree_skb(skb); }
161 40 150 40 150 32 154 1 85 28 153 44 2 2 2 2 153 1 3 121 3 121 83 96 96 59 70 95 120 84 1 120 120 35 1 36 100 82 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ext4/readpage.c * * Copyright (C) 2002, Linus Torvalds. * Copyright (C) 2015, Google, Inc. * * This was originally taken from fs/mpage.c * * The ext4_mpage_readpages() function here is intended to * replace mpage_readahead() in the general case, not just for * encrypted files. It has some limitations (see below), where it * will fall back to read_block_full_page(), but these limitations * should only be hit when page_size != block_size. * * This will allow us to attach a callback function to support ext4 * encryption. * * If anything unusual happens, such as: * * - encountering a page which has buffers * - encountering a page which has a non-hole after a hole * - encountering a page with non-contiguous blocks * * then this code just gives up and calls the buffer_head-based read function. * It does handle a page which has holes at the end - that is a common case: * the end-of-file on blocksize < PAGE_SIZE setups. * */ #include <linux/kernel.h> #include <linux/export.h> #include <linux/mm.h> #include <linux/kdev_t.h> #include <linux/gfp.h> #include <linux/bio.h> #include <linux/fs.h> #include <linux/buffer_head.h> #include <linux/blkdev.h> #include <linux/highmem.h> #include <linux/prefetch.h> #include <linux/mpage.h> #include <linux/writeback.h> #include <linux/backing-dev.h> #include <linux/pagevec.h> #include "ext4.h" #define NUM_PREALLOC_POST_READ_CTXS 128 static struct kmem_cache *bio_post_read_ctx_cache; static mempool_t *bio_post_read_ctx_pool; /* postprocessing steps for read bios */ enum bio_post_read_step { STEP_INITIAL = 0, STEP_DECRYPT, STEP_VERITY, STEP_MAX, }; struct bio_post_read_ctx { struct bio *bio; struct work_struct work; unsigned int cur_step; unsigned int enabled_steps; }; static void __read_end_io(struct bio *bio) { struct folio_iter fi; bio_for_each_folio_all(fi, bio) folio_end_read(fi.folio, bio->bi_status == 0); if (bio->bi_private) mempool_free(bio->bi_private, bio_post_read_ctx_pool); bio_put(bio); } static void bio_post_read_processing(struct bio_post_read_ctx *ctx); static void decrypt_work(struct work_struct *work) { struct bio_post_read_ctx *ctx = container_of(work, struct bio_post_read_ctx, work); struct bio *bio = ctx->bio; if (fscrypt_decrypt_bio(bio)) bio_post_read_processing(ctx); else __read_end_io(bio); } static void verity_work(struct work_struct *work) { struct bio_post_read_ctx *ctx = container_of(work, struct bio_post_read_ctx, work); struct bio *bio = ctx->bio; /* * fsverity_verify_bio() may call readahead() again, and although verity * will be disabled for that, decryption may still be needed, causing * another bio_post_read_ctx to be allocated. So to guarantee that * mempool_alloc() never deadlocks we must free the current ctx first. * This is safe because verity is the last post-read step. */ BUILD_BUG_ON(STEP_VERITY + 1 != STEP_MAX); mempool_free(ctx, bio_post_read_ctx_pool); bio->bi_private = NULL; fsverity_verify_bio(bio); __read_end_io(bio); } static void bio_post_read_processing(struct bio_post_read_ctx *ctx) { /* * We use different work queues for decryption and for verity because * verity may require reading metadata pages that need decryption, and * we shouldn't recurse to the same workqueue. */ switch (++ctx->cur_step) { case STEP_DECRYPT: if (ctx->enabled_steps & (1 << STEP_DECRYPT)) { INIT_WORK(&ctx->work, decrypt_work); fscrypt_enqueue_decrypt_work(&ctx->work); return; } ctx->cur_step++; fallthrough; case STEP_VERITY: if (ctx->enabled_steps & (1 << STEP_VERITY)) { INIT_WORK(&ctx->work, verity_work); fsverity_enqueue_verify_work(&ctx->work); return; } ctx->cur_step++; fallthrough; default: __read_end_io(ctx->bio); } } static bool bio_post_read_required(struct bio *bio) { return bio->bi_private && !bio->bi_status; } /* * I/O completion handler for multipage BIOs. * * The mpage code never puts partial pages into a BIO (except for end-of-file). * If a page does not map to a contiguous run of blocks then it simply falls * back to block_read_full_folio(). * * Why is this? If a page's completion depends on a number of different BIOs * which can complete in any order (or at the same time) then determining the * status of that page is hard. See end_buffer_async_read() for the details. * There is no point in duplicating all that complexity. */ static void mpage_end_io(struct bio *bio) { if (bio_post_read_required(bio)) { struct bio_post_read_ctx *ctx = bio->bi_private; ctx->cur_step = STEP_INITIAL; bio_post_read_processing(ctx); return; } __read_end_io(bio); } static inline bool ext4_need_verity(const struct inode *inode, pgoff_t idx) { return fsverity_active(inode) && idx < DIV_ROUND_UP(inode->i_size, PAGE_SIZE); } static void ext4_set_bio_post_read_ctx(struct bio *bio, const struct inode *inode, pgoff_t first_idx) { unsigned int post_read_steps = 0; if (fscrypt_inode_uses_fs_layer_crypto(inode)) post_read_steps |= 1 << STEP_DECRYPT; if (ext4_need_verity(inode, first_idx)) post_read_steps |= 1 << STEP_VERITY; if (post_read_steps) { /* Due to the mempool, this never fails. */ struct bio_post_read_ctx *ctx = mempool_alloc(bio_post_read_ctx_pool, GFP_NOFS); ctx->bio = bio; ctx->enabled_steps = post_read_steps; bio->bi_private = ctx; } } static inline loff_t ext4_readpage_limit(struct inode *inode) { if (IS_ENABLED(CONFIG_FS_VERITY) && IS_VERITY(inode)) return inode->i_sb->s_maxbytes; return i_size_read(inode); } int ext4_mpage_readpages(struct inode *inode, struct readahead_control *rac, struct folio *folio) { struct bio *bio = NULL; sector_t last_block_in_bio = 0; const unsigned blkbits = inode->i_blkbits; const unsigned blocks_per_page = PAGE_SIZE >> blkbits; const unsigned blocksize = 1 << blkbits; sector_t next_block; sector_t block_in_file; sector_t last_block; sector_t last_block_in_file; sector_t first_block; unsigned page_block; struct block_device *bdev = inode->i_sb->s_bdev; int length; unsigned relative_block = 0; struct ext4_map_blocks map; unsigned int nr_pages = rac ? readahead_count(rac) : 1; map.m_pblk = 0; map.m_lblk = 0; map.m_len = 0; map.m_flags = 0; for (; nr_pages; nr_pages--) { int fully_mapped = 1; unsigned first_hole = blocks_per_page; if (rac) folio = readahead_folio(rac); prefetchw(&folio->flags); if (folio_buffers(folio)) goto confused; block_in_file = next_block = (sector_t)folio->index << (PAGE_SHIFT - blkbits); last_block = block_in_file + nr_pages * blocks_per_page; last_block_in_file = (ext4_readpage_limit(inode) + blocksize - 1) >> blkbits; if (last_block > last_block_in_file) last_block = last_block_in_file; page_block = 0; /* * Map blocks using the previous result first. */ if ((map.m_flags & EXT4_MAP_MAPPED) && block_in_file > map.m_lblk && block_in_file < (map.m_lblk + map.m_len)) { unsigned map_offset = block_in_file - map.m_lblk; unsigned last = map.m_len - map_offset; first_block = map.m_pblk + map_offset; for (relative_block = 0; ; relative_block++) { if (relative_block == last) { /* needed? */ map.m_flags &= ~EXT4_MAP_MAPPED; break; } if (page_block == blocks_per_page) break; page_block++; block_in_file++; } } /* * Then do more ext4_map_blocks() calls until we are * done with this folio. */ while (page_block < blocks_per_page) { if (block_in_file < last_block) { map.m_lblk = block_in_file; map.m_len = last_block - block_in_file; if (ext4_map_blocks(NULL, inode, &map, 0) < 0) { set_error_page: folio_zero_segment(folio, 0, folio_size(folio)); folio_unlock(folio); goto next_page; } } if ((map.m_flags & EXT4_MAP_MAPPED) == 0) { fully_mapped = 0; if (first_hole == blocks_per_page) first_hole = page_block; page_block++; block_in_file++; continue; } if (first_hole != blocks_per_page) goto confused; /* hole -> non-hole */ /* Contiguous blocks? */ if (!page_block) first_block = map.m_pblk; else if (first_block + page_block != map.m_pblk) goto confused; for (relative_block = 0; ; relative_block++) { if (relative_block == map.m_len) { /* needed? */ map.m_flags &= ~EXT4_MAP_MAPPED; break; } else if (page_block == blocks_per_page) break; page_block++; block_in_file++; } } if (first_hole != blocks_per_page) { folio_zero_segment(folio, first_hole << blkbits, folio_size(folio)); if (first_hole == 0) { if (ext4_need_verity(inode, folio->index) && !fsverity_verify_folio(folio)) goto set_error_page; folio_end_read(folio, true); continue; } } else if (fully_mapped) { folio_set_mappedtodisk(folio); } /* * This folio will go to BIO. Do we need to send this * BIO off first? */ if (bio && (last_block_in_bio != first_block - 1 || !fscrypt_mergeable_bio(bio, inode, next_block))) { submit_and_realloc: submit_bio(bio); bio = NULL; } if (bio == NULL) { /* * bio_alloc will _always_ be able to allocate a bio if * __GFP_DIRECT_RECLAIM is set, see bio_alloc_bioset(). */ bio = bio_alloc(bdev, bio_max_segs(nr_pages), REQ_OP_READ, GFP_KERNEL); fscrypt_set_bio_crypt_ctx(bio, inode, next_block, GFP_KERNEL); ext4_set_bio_post_read_ctx(bio, inode, folio->index); bio->bi_iter.bi_sector = first_block << (blkbits - 9); bio->bi_end_io = mpage_end_io; if (rac) bio->bi_opf |= REQ_RAHEAD; } length = first_hole << blkbits; if (!bio_add_folio(bio, folio, length, 0)) goto submit_and_realloc; if (((map.m_flags & EXT4_MAP_BOUNDARY) && (relative_block == map.m_len)) || (first_hole != blocks_per_page)) { submit_bio(bio); bio = NULL; } else last_block_in_bio = first_block + blocks_per_page - 1; continue; confused: if (bio) { submit_bio(bio); bio = NULL; } if (!folio_test_uptodate(folio)) block_read_full_folio(folio, ext4_get_block); else folio_unlock(folio); next_page: ; /* A label shall be followed by a statement until C23 */ } if (bio) submit_bio(bio); return 0; } int __init ext4_init_post_read_processing(void) { bio_post_read_ctx_cache = KMEM_CACHE(bio_post_read_ctx, SLAB_RECLAIM_ACCOUNT); if (!bio_post_read_ctx_cache) goto fail; bio_post_read_ctx_pool = mempool_create_slab_pool(NUM_PREALLOC_POST_READ_CTXS, bio_post_read_ctx_cache); if (!bio_post_read_ctx_pool) goto fail_free_cache; return 0; fail_free_cache: kmem_cache_destroy(bio_post_read_ctx_cache); fail: return -ENOMEM; } void ext4_exit_post_read_processing(void) { mempool_destroy(bio_post_read_ctx_pool); kmem_cache_destroy(bio_post_read_ctx_cache); }
124 35 124 21 134 71 83 145 6 71 31 16 31 31 32 120 120 13 11 4 4 3 76 76 2 71 76 32 45 149 33 115 42 73 44 44 44 43 15 15 15 15 4 4 4 41 115 148 5 145 4 144 145 147 119 63 56 58 58 58 5 5 5 5 5 31 118 3 5 9 141 23 16 17 1 5 5 32 158 11 1 147 42 42 34 13 23 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 // SPDX-License-Identifier: GPL-2.0+ /* * Copyright (C) 2007 Alan Stern * Copyright (C) IBM Corporation, 2009 * Copyright (C) 2009, Frederic Weisbecker <fweisbec@gmail.com> * * Thanks to Ingo Molnar for his many suggestions. * * Authors: Alan Stern <stern@rowland.harvard.edu> * K.Prasad <prasad@linux.vnet.ibm.com> * Frederic Weisbecker <fweisbec@gmail.com> */ /* * HW_breakpoint: a unified kernel/user-space hardware breakpoint facility, * using the CPU's debug registers. * This file contains the arch-independent routines. */ #include <linux/hw_breakpoint.h> #include <linux/atomic.h> #include <linux/bug.h> #include <linux/cpu.h> #include <linux/export.h> #include <linux/init.h> #include <linux/irqflags.h> #include <linux/kdebug.h> #include <linux/kernel.h> #include <linux/mutex.h> #include <linux/notifier.h> #include <linux/percpu-rwsem.h> #include <linux/percpu.h> #include <linux/rhashtable.h> #include <linux/sched.h> #include <linux/slab.h> /* * Datastructure to track the total uses of N slots across tasks or CPUs; * bp_slots_histogram::count[N] is the number of assigned N+1 breakpoint slots. */ struct bp_slots_histogram { #ifdef hw_breakpoint_slots atomic_t count[hw_breakpoint_slots(0)]; #else atomic_t *count; #endif }; /* * Per-CPU constraints data. */ struct bp_cpuinfo { /* Number of pinned CPU breakpoints in a CPU. */ unsigned int cpu_pinned; /* Histogram of pinned task breakpoints in a CPU. */ struct bp_slots_histogram tsk_pinned; }; static DEFINE_PER_CPU(struct bp_cpuinfo, bp_cpuinfo[TYPE_MAX]); static struct bp_cpuinfo *get_bp_info(int cpu, enum bp_type_idx type) { return per_cpu_ptr(bp_cpuinfo + type, cpu); } /* Number of pinned CPU breakpoints globally. */ static struct bp_slots_histogram cpu_pinned[TYPE_MAX]; /* Number of pinned CPU-independent task breakpoints. */ static struct bp_slots_histogram tsk_pinned_all[TYPE_MAX]; /* Keep track of the breakpoints attached to tasks */ static struct rhltable task_bps_ht; static const struct rhashtable_params task_bps_ht_params = { .head_offset = offsetof(struct hw_perf_event, bp_list), .key_offset = offsetof(struct hw_perf_event, target), .key_len = sizeof_field(struct hw_perf_event, target), .automatic_shrinking = true, }; static bool constraints_initialized __ro_after_init; /* * Synchronizes accesses to the per-CPU constraints; the locking rules are: * * 1. Atomic updates to bp_cpuinfo::tsk_pinned only require a held read-lock * (due to bp_slots_histogram::count being atomic, no update are lost). * * 2. Holding a write-lock is required for computations that require a * stable snapshot of all bp_cpuinfo::tsk_pinned. * * 3. In all other cases, non-atomic accesses require the appropriately held * lock (read-lock for read-only accesses; write-lock for reads/writes). */ DEFINE_STATIC_PERCPU_RWSEM(bp_cpuinfo_sem); /* * Return mutex to serialize accesses to per-task lists in task_bps_ht. Since * rhltable synchronizes concurrent insertions/deletions, independent tasks may * insert/delete concurrently; therefore, a mutex per task is sufficient. * * Uses task_struct::perf_event_mutex, to avoid extending task_struct with a * hw_breakpoint-only mutex, which may be infrequently used. The caveat here is * that hw_breakpoint may contend with per-task perf event list management. The * assumption is that perf usecases involving hw_breakpoints are very unlikely * to result in unnecessary contention. */ static inline struct mutex *get_task_bps_mutex(struct perf_event *bp) { struct task_struct *tsk = bp->hw.target; return tsk ? &tsk->perf_event_mutex : NULL; } static struct mutex *bp_constraints_lock(struct perf_event *bp) { struct mutex *tsk_mtx = get_task_bps_mutex(bp); if (tsk_mtx) { /* * Fully analogous to the perf_try_init_event() nesting * argument in the comment near perf_event_ctx_lock_nested(); * this child->perf_event_mutex cannot ever deadlock against * the parent->perf_event_mutex usage from * perf_event_task_{en,dis}able(). * * Specifically, inherited events will never occur on * ->perf_event_list. */ mutex_lock_nested(tsk_mtx, SINGLE_DEPTH_NESTING); percpu_down_read(&bp_cpuinfo_sem); } else { percpu_down_write(&bp_cpuinfo_sem); } return tsk_mtx; } static void bp_constraints_unlock(struct mutex *tsk_mtx) { if (tsk_mtx) { percpu_up_read(&bp_cpuinfo_sem); mutex_unlock(tsk_mtx); } else { percpu_up_write(&bp_cpuinfo_sem); } } static bool bp_constraints_is_locked(struct perf_event *bp) { struct mutex *tsk_mtx = get_task_bps_mutex(bp); return percpu_is_write_locked(&bp_cpuinfo_sem) || (tsk_mtx ? mutex_is_locked(tsk_mtx) : percpu_is_read_locked(&bp_cpuinfo_sem)); } static inline void assert_bp_constraints_lock_held(struct perf_event *bp) { struct mutex *tsk_mtx = get_task_bps_mutex(bp); if (tsk_mtx) lockdep_assert_held(tsk_mtx); lockdep_assert_held(&bp_cpuinfo_sem); } #ifdef hw_breakpoint_slots /* * Number of breakpoint slots is constant, and the same for all types. */ static_assert(hw_breakpoint_slots(TYPE_INST) == hw_breakpoint_slots(TYPE_DATA)); static inline int hw_breakpoint_slots_cached(int type) { return hw_breakpoint_slots(type); } static inline int init_breakpoint_slots(void) { return 0; } #else /* * Dynamic number of breakpoint slots. */ static int __nr_bp_slots[TYPE_MAX] __ro_after_init; static inline int hw_breakpoint_slots_cached(int type) { return __nr_bp_slots[type]; } static __init bool bp_slots_histogram_alloc(struct bp_slots_histogram *hist, enum bp_type_idx type) { hist->count = kcalloc(hw_breakpoint_slots_cached(type), sizeof(*hist->count), GFP_KERNEL); return hist->count; } static __init void bp_slots_histogram_free(struct bp_slots_histogram *hist) { kfree(hist->count); } static __init int init_breakpoint_slots(void) { int i, cpu, err_cpu; for (i = 0; i < TYPE_MAX; i++) __nr_bp_slots[i] = hw_breakpoint_slots(i); for_each_possible_cpu(cpu) { for (i = 0; i < TYPE_MAX; i++) { struct bp_cpuinfo *info = get_bp_info(cpu, i); if (!bp_slots_histogram_alloc(&info->tsk_pinned, i)) goto err; } } for (i = 0; i < TYPE_MAX; i++) { if (!bp_slots_histogram_alloc(&cpu_pinned[i], i)) goto err; if (!bp_slots_histogram_alloc(&tsk_pinned_all[i], i)) goto err; } return 0; err: for_each_possible_cpu(err_cpu) { for (i = 0; i < TYPE_MAX; i++) bp_slots_histogram_free(&get_bp_info(err_cpu, i)->tsk_pinned); if (err_cpu == cpu) break; } for (i = 0; i < TYPE_MAX; i++) { bp_slots_histogram_free(&cpu_pinned[i]); bp_slots_histogram_free(&tsk_pinned_all[i]); } return -ENOMEM; } #endif static inline void bp_slots_histogram_add(struct bp_slots_histogram *hist, int old, int val) { const int old_idx = old - 1; const int new_idx = old_idx + val; if (old_idx >= 0) WARN_ON(atomic_dec_return_relaxed(&hist->count[old_idx]) < 0); if (new_idx >= 0) WARN_ON(atomic_inc_return_relaxed(&hist->count[new_idx]) < 0); } static int bp_slots_histogram_max(struct bp_slots_histogram *hist, enum bp_type_idx type) { for (int i = hw_breakpoint_slots_cached(type) - 1; i >= 0; i--) { const int count = atomic_read(&hist->count[i]); /* Catch unexpected writers; we want a stable snapshot. */ ASSERT_EXCLUSIVE_WRITER(hist->count[i]); if (count > 0) return i + 1; WARN(count < 0, "inconsistent breakpoint slots histogram"); } return 0; } static int bp_slots_histogram_max_merge(struct bp_slots_histogram *hist1, struct bp_slots_histogram *hist2, enum bp_type_idx type) { for (int i = hw_breakpoint_slots_cached(type) - 1; i >= 0; i--) { const int count1 = atomic_read(&hist1->count[i]); const int count2 = atomic_read(&hist2->count[i]); /* Catch unexpected writers; we want a stable snapshot. */ ASSERT_EXCLUSIVE_WRITER(hist1->count[i]); ASSERT_EXCLUSIVE_WRITER(hist2->count[i]); if (count1 + count2 > 0) return i + 1; WARN(count1 < 0, "inconsistent breakpoint slots histogram"); WARN(count2 < 0, "inconsistent breakpoint slots histogram"); } return 0; } #ifndef hw_breakpoint_weight static inline int hw_breakpoint_weight(struct perf_event *bp) { return 1; } #endif static inline enum bp_type_idx find_slot_idx(u64 bp_type) { if (bp_type & HW_BREAKPOINT_RW) return TYPE_DATA; return TYPE_INST; } /* * Return the maximum number of pinned breakpoints a task has in this CPU. */ static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type) { struct bp_slots_histogram *tsk_pinned = &get_bp_info(cpu, type)->tsk_pinned; /* * At this point we want to have acquired the bp_cpuinfo_sem as a * writer to ensure that there are no concurrent writers in * toggle_bp_task_slot() to tsk_pinned, and we get a stable snapshot. */ lockdep_assert_held_write(&bp_cpuinfo_sem); return bp_slots_histogram_max_merge(tsk_pinned, &tsk_pinned_all[type], type); } /* * Count the number of breakpoints of the same type and same task. * The given event must be not on the list. * * If @cpu is -1, but the result of task_bp_pinned() is not CPU-independent, * returns a negative value. */ static int task_bp_pinned(int cpu, struct perf_event *bp, enum bp_type_idx type) { struct rhlist_head *head, *pos; struct perf_event *iter; int count = 0; /* * We need a stable snapshot of the per-task breakpoint list. */ assert_bp_constraints_lock_held(bp); rcu_read_lock(); head = rhltable_lookup(&task_bps_ht, &bp->hw.target, task_bps_ht_params); if (!head) goto out; rhl_for_each_entry_rcu(iter, pos, head, hw.bp_list) { if (find_slot_idx(iter->attr.bp_type) != type) continue; if (iter->cpu >= 0) { if (cpu == -1) { count = -1; goto out; } else if (cpu != iter->cpu) continue; } count += hw_breakpoint_weight(iter); } out: rcu_read_unlock(); return count; } static const struct cpumask *cpumask_of_bp(struct perf_event *bp) { if (bp->cpu >= 0) return cpumask_of(bp->cpu); return cpu_possible_mask; } /* * Returns the max pinned breakpoint slots in a given * CPU (cpu > -1) or across all of them (cpu = -1). */ static int max_bp_pinned_slots(struct perf_event *bp, enum bp_type_idx type) { const struct cpumask *cpumask = cpumask_of_bp(bp); int pinned_slots = 0; int cpu; if (bp->hw.target && bp->cpu < 0) { int max_pinned = task_bp_pinned(-1, bp, type); if (max_pinned >= 0) { /* * Fast path: task_bp_pinned() is CPU-independent and * returns the same value for any CPU. */ max_pinned += bp_slots_histogram_max(&cpu_pinned[type], type); return max_pinned; } } for_each_cpu(cpu, cpumask) { struct bp_cpuinfo *info = get_bp_info(cpu, type); int nr; nr = info->cpu_pinned; if (!bp->hw.target) nr += max_task_bp_pinned(cpu, type); else nr += task_bp_pinned(cpu, bp, type); pinned_slots = max(nr, pinned_slots); } return pinned_slots; } /* * Add/remove the given breakpoint in our constraint table */ static int toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type, int weight) { int cpu, next_tsk_pinned; if (!enable) weight = -weight; if (!bp->hw.target) { /* * Update the pinned CPU slots, in per-CPU bp_cpuinfo and in the * global histogram. */ struct bp_cpuinfo *info = get_bp_info(bp->cpu, type); lockdep_assert_held_write(&bp_cpuinfo_sem); bp_slots_histogram_add(&cpu_pinned[type], info->cpu_pinned, weight); info->cpu_pinned += weight; return 0; } /* * If bp->hw.target, tsk_pinned is only modified, but not used * otherwise. We can permit concurrent updates as long as there are no * other uses: having acquired bp_cpuinfo_sem as a reader allows * concurrent updates here. Uses of tsk_pinned will require acquiring * bp_cpuinfo_sem as a writer to stabilize tsk_pinned's value. */ lockdep_assert_held_read(&bp_cpuinfo_sem); /* * Update the pinned task slots, in per-CPU bp_cpuinfo and in the global * histogram. We need to take care of 4 cases: * * 1. This breakpoint targets all CPUs (cpu < 0), and there may only * exist other task breakpoints targeting all CPUs. In this case we * can simply update the global slots histogram. * * 2. This breakpoint targets a specific CPU (cpu >= 0), but there may * only exist other task breakpoints targeting all CPUs. * * a. On enable: remove the existing breakpoints from the global * slots histogram and use the per-CPU histogram. * * b. On disable: re-insert the existing breakpoints into the global * slots histogram and remove from per-CPU histogram. * * 3. Some other existing task breakpoints target specific CPUs. Only * update the per-CPU slots histogram. */ if (!enable) { /* * Remove before updating histograms so we can determine if this * was the last task breakpoint for a specific CPU. */ int ret = rhltable_remove(&task_bps_ht, &bp->hw.bp_list, task_bps_ht_params); if (ret) return ret; } /* * Note: If !enable, next_tsk_pinned will not count the to-be-removed breakpoint. */ next_tsk_pinned = task_bp_pinned(-1, bp, type); if (next_tsk_pinned >= 0) { if (bp->cpu < 0) { /* Case 1: fast path */ if (!enable) next_tsk_pinned += hw_breakpoint_weight(bp); bp_slots_histogram_add(&tsk_pinned_all[type], next_tsk_pinned, weight); } else if (enable) { /* Case 2.a: slow path */ /* Add existing to per-CPU histograms. */ for_each_possible_cpu(cpu) { bp_slots_histogram_add(&get_bp_info(cpu, type)->tsk_pinned, 0, next_tsk_pinned); } /* Add this first CPU-pinned task breakpoint. */ bp_slots_histogram_add(&get_bp_info(bp->cpu, type)->tsk_pinned, next_tsk_pinned, weight); /* Rebalance global task pinned histogram. */ bp_slots_histogram_add(&tsk_pinned_all[type], next_tsk_pinned, -next_tsk_pinned); } else { /* Case 2.b: slow path */ /* Remove this last CPU-pinned task breakpoint. */ bp_slots_histogram_add(&get_bp_info(bp->cpu, type)->tsk_pinned, next_tsk_pinned + hw_breakpoint_weight(bp), weight); /* Remove all from per-CPU histograms. */ for_each_possible_cpu(cpu) { bp_slots_histogram_add(&get_bp_info(cpu, type)->tsk_pinned, next_tsk_pinned, -next_tsk_pinned); } /* Rebalance global task pinned histogram. */ bp_slots_histogram_add(&tsk_pinned_all[type], 0, next_tsk_pinned); } } else { /* Case 3: slow path */ const struct cpumask *cpumask = cpumask_of_bp(bp); for_each_cpu(cpu, cpumask) { next_tsk_pinned = task_bp_pinned(cpu, bp, type); if (!enable) next_tsk_pinned += hw_breakpoint_weight(bp); bp_slots_histogram_add(&get_bp_info(cpu, type)->tsk_pinned, next_tsk_pinned, weight); } } /* * Readers want a stable snapshot of the per-task breakpoint list. */ assert_bp_constraints_lock_held(bp); if (enable) return rhltable_insert(&task_bps_ht, &bp->hw.bp_list, task_bps_ht_params); return 0; } /* * Constraints to check before allowing this new breakpoint counter. * * Note: Flexible breakpoints are currently unimplemented, but outlined in the * below algorithm for completeness. The implementation treats flexible as * pinned due to no guarantee that we currently always schedule flexible events * before a pinned event in a same CPU. * * == Non-pinned counter == (Considered as pinned for now) * * - If attached to a single cpu, check: * * (per_cpu(info->flexible, cpu) || (per_cpu(info->cpu_pinned, cpu) * + max(per_cpu(info->tsk_pinned, cpu)))) < HBP_NUM * * -> If there are already non-pinned counters in this cpu, it means * there is already a free slot for them. * Otherwise, we check that the maximum number of per task * breakpoints (for this cpu) plus the number of per cpu breakpoint * (for this cpu) doesn't cover every registers. * * - If attached to every cpus, check: * * (per_cpu(info->flexible, *) || (max(per_cpu(info->cpu_pinned, *)) * + max(per_cpu(info->tsk_pinned, *)))) < HBP_NUM * * -> This is roughly the same, except we check the number of per cpu * bp for every cpu and we keep the max one. Same for the per tasks * breakpoints. * * * == Pinned counter == * * - If attached to a single cpu, check: * * ((per_cpu(info->flexible, cpu) > 1) + per_cpu(info->cpu_pinned, cpu) * + max(per_cpu(info->tsk_pinned, cpu))) < HBP_NUM * * -> Same checks as before. But now the info->flexible, if any, must keep * one register at least (or they will never be fed). * * - If attached to every cpus, check: * * ((per_cpu(info->flexible, *) > 1) + max(per_cpu(info->cpu_pinned, *)) * + max(per_cpu(info->tsk_pinned, *))) < HBP_NUM */ static int __reserve_bp_slot(struct perf_event *bp, u64 bp_type) { enum bp_type_idx type; int max_pinned_slots; int weight; /* We couldn't initialize breakpoint constraints on boot */ if (!constraints_initialized) return -ENOMEM; /* Basic checks */ if (bp_type == HW_BREAKPOINT_EMPTY || bp_type == HW_BREAKPOINT_INVALID) return -EINVAL; type = find_slot_idx(bp_type); weight = hw_breakpoint_weight(bp); /* Check if this new breakpoint can be satisfied across all CPUs. */ max_pinned_slots = max_bp_pinned_slots(bp, type) + weight; if (max_pinned_slots > hw_breakpoint_slots_cached(type)) return -ENOSPC; return toggle_bp_slot(bp, true, type, weight); } int reserve_bp_slot(struct perf_event *bp) { struct mutex *mtx = bp_constraints_lock(bp); int ret = __reserve_bp_slot(bp, bp->attr.bp_type); bp_constraints_unlock(mtx); return ret; } static void __release_bp_slot(struct perf_event *bp, u64 bp_type) { enum bp_type_idx type; int weight; type = find_slot_idx(bp_type); weight = hw_breakpoint_weight(bp); WARN_ON(toggle_bp_slot(bp, false, type, weight)); } void release_bp_slot(struct perf_event *bp) { struct mutex *mtx = bp_constraints_lock(bp); __release_bp_slot(bp, bp->attr.bp_type); bp_constraints_unlock(mtx); } static int __modify_bp_slot(struct perf_event *bp, u64 old_type, u64 new_type) { int err; __release_bp_slot(bp, old_type); err = __reserve_bp_slot(bp, new_type); if (err) { /* * Reserve the old_type slot back in case * there's no space for the new type. * * This must succeed, because we just released * the old_type slot in the __release_bp_slot * call above. If not, something is broken. */ WARN_ON(__reserve_bp_slot(bp, old_type)); } return err; } static int modify_bp_slot(struct perf_event *bp, u64 old_type, u64 new_type) { struct mutex *mtx = bp_constraints_lock(bp); int ret = __modify_bp_slot(bp, old_type, new_type); bp_constraints_unlock(mtx); return ret; } /* * Allow the kernel debugger to reserve breakpoint slots without * taking a lock using the dbg_* variant of for the reserve and * release breakpoint slots. */ int dbg_reserve_bp_slot(struct perf_event *bp) { int ret; if (bp_constraints_is_locked(bp)) return -1; /* Locks aren't held; disable lockdep assert checking. */ lockdep_off(); ret = __reserve_bp_slot(bp, bp->attr.bp_type); lockdep_on(); return ret; } int dbg_release_bp_slot(struct perf_event *bp) { if (bp_constraints_is_locked(bp)) return -1; /* Locks aren't held; disable lockdep assert checking. */ lockdep_off(); __release_bp_slot(bp, bp->attr.bp_type); lockdep_on(); return 0; } static int hw_breakpoint_parse(struct perf_event *bp, const struct perf_event_attr *attr, struct arch_hw_breakpoint *hw) { int err; err = hw_breakpoint_arch_parse(bp, attr, hw); if (err) return err; if (arch_check_bp_in_kernelspace(hw)) { if (attr->exclude_kernel) return -EINVAL; /* * Don't let unprivileged users set a breakpoint in the trap * path to avoid trap recursion attacks. */ if (!capable(CAP_SYS_ADMIN)) return -EPERM; } return 0; } int register_perf_hw_breakpoint(struct perf_event *bp) { struct arch_hw_breakpoint hw = { }; int err; err = reserve_bp_slot(bp); if (err) return err; err = hw_breakpoint_parse(bp, &bp->attr, &hw); if (err) { release_bp_slot(bp); return err; } bp->hw.info = hw; return 0; } /** * register_user_hw_breakpoint - register a hardware breakpoint for user space * @attr: breakpoint attributes * @triggered: callback to trigger when we hit the breakpoint * @context: context data could be used in the triggered callback * @tsk: pointer to 'task_struct' of the process to which the address belongs */ struct perf_event * register_user_hw_breakpoint(struct perf_event_attr *attr, perf_overflow_handler_t triggered, void *context, struct task_struct *tsk) { return perf_event_create_kernel_counter(attr, -1, tsk, triggered, context); } EXPORT_SYMBOL_GPL(register_user_hw_breakpoint); static void hw_breakpoint_copy_attr(struct perf_event_attr *to, struct perf_event_attr *from) { to->bp_addr = from->bp_addr; to->bp_type = from->bp_type; to->bp_len = from->bp_len; to->disabled = from->disabled; } int modify_user_hw_breakpoint_check(struct perf_event *bp, struct perf_event_attr *attr, bool check) { struct arch_hw_breakpoint hw = { }; int err; err = hw_breakpoint_parse(bp, attr, &hw); if (err) return err; if (check) { struct perf_event_attr old_attr; old_attr = bp->attr; hw_breakpoint_copy_attr(&old_attr, attr); if (memcmp(&old_attr, attr, sizeof(*attr))) return -EINVAL; } if (bp->attr.bp_type != attr->bp_type) { err = modify_bp_slot(bp, bp->attr.bp_type, attr->bp_type); if (err) return err; } hw_breakpoint_copy_attr(&bp->attr, attr); bp->hw.info = hw; return 0; } /** * modify_user_hw_breakpoint - modify a user-space hardware breakpoint * @bp: the breakpoint structure to modify * @attr: new breakpoint attributes */ int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *attr) { int err; /* * modify_user_hw_breakpoint can be invoked with IRQs disabled and hence it * will not be possible to raise IPIs that invoke __perf_event_disable. * So call the function directly after making sure we are targeting the * current task. */ if (irqs_disabled() && bp->ctx && bp->ctx->task == current) perf_event_disable_local(bp); else perf_event_disable(bp); err = modify_user_hw_breakpoint_check(bp, attr, false); if (!bp->attr.disabled) perf_event_enable(bp); return err; } EXPORT_SYMBOL_GPL(modify_user_hw_breakpoint); /** * unregister_hw_breakpoint - unregister a user-space hardware breakpoint * @bp: the breakpoint structure to unregister */ void unregister_hw_breakpoint(struct perf_event *bp) { if (!bp) return; perf_event_release_kernel(bp); } EXPORT_SYMBOL_GPL(unregister_hw_breakpoint); /** * register_wide_hw_breakpoint - register a wide breakpoint in the kernel * @attr: breakpoint attributes * @triggered: callback to trigger when we hit the breakpoint * @context: context data could be used in the triggered callback * * @return a set of per_cpu pointers to perf events */ struct perf_event * __percpu * register_wide_hw_breakpoint(struct perf_event_attr *attr, perf_overflow_handler_t triggered, void *context) { struct perf_event * __percpu *cpu_events, *bp; long err = 0; int cpu; cpu_events = alloc_percpu(typeof(*cpu_events)); if (!cpu_events) return ERR_PTR_PCPU(-ENOMEM); cpus_read_lock(); for_each_online_cpu(cpu) { bp = perf_event_create_kernel_counter(attr, cpu, NULL, triggered, context); if (IS_ERR(bp)) { err = PTR_ERR(bp); break; } per_cpu(*cpu_events, cpu) = bp; } cpus_read_unlock(); if (likely(!err)) return cpu_events; unregister_wide_hw_breakpoint(cpu_events); return ERR_PTR_PCPU(err); } EXPORT_SYMBOL_GPL(register_wide_hw_breakpoint); /** * unregister_wide_hw_breakpoint - unregister a wide breakpoint in the kernel * @cpu_events: the per cpu set of events to unregister */ void unregister_wide_hw_breakpoint(struct perf_event * __percpu *cpu_events) { int cpu; for_each_possible_cpu(cpu) unregister_hw_breakpoint(per_cpu(*cpu_events, cpu)); free_percpu(cpu_events); } EXPORT_SYMBOL_GPL(unregister_wide_hw_breakpoint); /** * hw_breakpoint_is_used - check if breakpoints are currently used * * Returns: true if breakpoints are used, false otherwise. */ bool hw_breakpoint_is_used(void) { int cpu; if (!constraints_initialized) return false; for_each_possible_cpu(cpu) { for (int type = 0; type < TYPE_MAX; ++type) { struct bp_cpuinfo *info = get_bp_info(cpu, type); if (info->cpu_pinned) return true; for (int slot = 0; slot < hw_breakpoint_slots_cached(type); ++slot) { if (atomic_read(&info->tsk_pinned.count[slot])) return true; } } } for (int type = 0; type < TYPE_MAX; ++type) { for (int slot = 0; slot < hw_breakpoint_slots_cached(type); ++slot) { /* * Warn, because if there are CPU pinned counters, * should never get here; bp_cpuinfo::cpu_pinned should * be consistent with the global cpu_pinned histogram. */ if (WARN_ON(atomic_read(&cpu_pinned[type].count[slot]))) return true; if (atomic_read(&tsk_pinned_all[type].count[slot])) return true; } } return false; } static struct notifier_block hw_breakpoint_exceptions_nb = { .notifier_call = hw_breakpoint_exceptions_notify, /* we need to be notified first */ .priority = 0x7fffffff }; static void bp_perf_event_destroy(struct perf_event *event) { release_bp_slot(event); } static int hw_breakpoint_event_init(struct perf_event *bp) { int err; if (bp->attr.type != PERF_TYPE_BREAKPOINT) return -ENOENT; /* * Check if breakpoint type is supported before proceeding. * Also, no branch sampling for breakpoint events. */ if (!hw_breakpoint_slots_cached(find_slot_idx(bp->attr.bp_type)) || has_branch_stack(bp)) return -EOPNOTSUPP; err = register_perf_hw_breakpoint(bp); if (err) return err; bp->destroy = bp_perf_event_destroy; return 0; } static int hw_breakpoint_add(struct perf_event *bp, int flags) { if (!(flags & PERF_EF_START)) bp->hw.state = PERF_HES_STOPPED; if (is_sampling_event(bp)) { bp->hw.last_period = bp->hw.sample_period; perf_swevent_set_period(bp); } return arch_install_hw_breakpoint(bp); } static void hw_breakpoint_del(struct perf_event *bp, int flags) { arch_uninstall_hw_breakpoint(bp); } static void hw_breakpoint_start(struct perf_event *bp, int flags) { bp->hw.state = 0; } static void hw_breakpoint_stop(struct perf_event *bp, int flags) { bp->hw.state = PERF_HES_STOPPED; } static struct pmu perf_breakpoint = { .task_ctx_nr = perf_sw_context, /* could eventually get its own */ .event_init = hw_breakpoint_event_init, .add = hw_breakpoint_add, .del = hw_breakpoint_del, .start = hw_breakpoint_start, .stop = hw_breakpoint_stop, .read = hw_breakpoint_pmu_read, }; int __init init_hw_breakpoint(void) { int ret; ret = rhltable_init(&task_bps_ht, &task_bps_ht_params); if (ret) return ret; ret = init_breakpoint_slots(); if (ret) return ret; constraints_initialized = true; perf_pmu_register(&perf_breakpoint, "breakpoint", PERF_TYPE_BREAKPOINT); return register_die_notifier(&hw_breakpoint_exceptions_nb); }
13 34 56 63 62 52 1 63 52 7 63 52 62 8 302 4 303 327 304 166 5 7 8 166 2 2 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 // SPDX-License-Identifier: GPL-2.0 #include <linux/tcp.h> #include <net/tcp.h> static u32 tcp_rack_reo_wnd(const struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); if (!tp->reord_seen) { /* If reordering has not been observed, be aggressive during * the recovery or starting the recovery by DUPACK threshold. */ if (inet_csk(sk)->icsk_ca_state >= TCP_CA_Recovery) return 0; if (tp->sacked_out >= tp->reordering && !(READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_recovery) & TCP_RACK_NO_DUPTHRESH)) return 0; } /* To be more reordering resilient, allow min_rtt/4 settling delay. * Use min_rtt instead of the smoothed RTT because reordering is * often a path property and less related to queuing or delayed ACKs. * Upon receiving DSACKs, linearly increase the window up to the * smoothed RTT. */ return min((tcp_min_rtt(tp) >> 2) * tp->rack.reo_wnd_steps, tp->srtt_us >> 3); } s32 tcp_rack_skb_timeout(struct tcp_sock *tp, struct sk_buff *skb, u32 reo_wnd) { return tp->rack.rtt_us + reo_wnd - tcp_stamp_us_delta(tp->tcp_mstamp, tcp_skb_timestamp_us(skb)); } /* RACK loss detection (IETF draft draft-ietf-tcpm-rack-01): * * Marks a packet lost, if some packet sent later has been (s)acked. * The underlying idea is similar to the traditional dupthresh and FACK * but they look at different metrics: * * dupthresh: 3 OOO packets delivered (packet count) * FACK: sequence delta to highest sacked sequence (sequence space) * RACK: sent time delta to the latest delivered packet (time domain) * * The advantage of RACK is it applies to both original and retransmitted * packet and therefore is robust against tail losses. Another advantage * is being more resilient to reordering by simply allowing some * "settling delay", instead of tweaking the dupthresh. * * When tcp_rack_detect_loss() detects some packets are lost and we * are not already in the CA_Recovery state, either tcp_rack_reo_timeout() * or tcp_time_to_recover()'s "Trick#1: the loss is proven" code path will * make us enter the CA_Recovery state. */ static void tcp_rack_detect_loss(struct sock *sk, u32 *reo_timeout) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb, *n; u32 reo_wnd; *reo_timeout = 0; reo_wnd = tcp_rack_reo_wnd(sk); list_for_each_entry_safe(skb, n, &tp->tsorted_sent_queue, tcp_tsorted_anchor) { struct tcp_skb_cb *scb = TCP_SKB_CB(skb); s32 remaining; /* Skip ones marked lost but not yet retransmitted */ if ((scb->sacked & TCPCB_LOST) && !(scb->sacked & TCPCB_SACKED_RETRANS)) continue; if (!tcp_skb_sent_after(tp->rack.mstamp, tcp_skb_timestamp_us(skb), tp->rack.end_seq, scb->end_seq)) break; /* A packet is lost if it has not been s/acked beyond * the recent RTT plus the reordering window. */ remaining = tcp_rack_skb_timeout(tp, skb, reo_wnd); if (remaining <= 0) { tcp_mark_skb_lost(sk, skb); list_del_init(&skb->tcp_tsorted_anchor); } else { /* Record maximum wait time */ *reo_timeout = max_t(u32, *reo_timeout, remaining); } } } bool tcp_rack_mark_lost(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); u32 timeout; if (!tp->rack.advanced) return false; /* Reset the advanced flag to avoid unnecessary queue scanning */ tp->rack.advanced = 0; tcp_rack_detect_loss(sk, &timeout); if (timeout) { timeout = usecs_to_jiffies(timeout + TCP_TIMEOUT_MIN_US); inet_csk_reset_xmit_timer(sk, ICSK_TIME_REO_TIMEOUT, timeout, inet_csk(sk)->icsk_rto); } return !!timeout; } /* Record the most recently (re)sent time among the (s)acked packets * This is "Step 3: Advance RACK.xmit_time and update RACK.RTT" from * draft-cheng-tcpm-rack-00.txt */ void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq, u64 xmit_time) { u32 rtt_us; rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, xmit_time); if (rtt_us < tcp_min_rtt(tp) && (sacked & TCPCB_RETRANS)) { /* If the sacked packet was retransmitted, it's ambiguous * whether the retransmission or the original (or the prior * retransmission) was sacked. * * If the original is lost, there is no ambiguity. Otherwise * we assume the original can be delayed up to aRTT + min_rtt. * the aRTT term is bounded by the fast recovery or timeout, * so it's at least one RTT (i.e., retransmission is at least * an RTT later). */ return; } tp->rack.advanced = 1; tp->rack.rtt_us = rtt_us; if (tcp_skb_sent_after(xmit_time, tp->rack.mstamp, end_seq, tp->rack.end_seq)) { tp->rack.mstamp = xmit_time; tp->rack.end_seq = end_seq; } } /* We have waited long enough to accommodate reordering. Mark the expired * packets lost and retransmit them. */ void tcp_rack_reo_timeout(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); u32 timeout, prior_inflight; u32 lost = tp->lost; prior_inflight = tcp_packets_in_flight(tp); tcp_rack_detect_loss(sk, &timeout); if (prior_inflight != tcp_packets_in_flight(tp)) { if (inet_csk(sk)->icsk_ca_state != TCP_CA_Recovery) { tcp_enter_recovery(sk, false); if (!inet_csk(sk)->icsk_ca_ops->cong_control) tcp_cwnd_reduction(sk, 1, tp->lost - lost, 0); } tcp_xmit_retransmit_queue(sk); } if (inet_csk(sk)->icsk_pending != ICSK_TIME_RETRANS) tcp_rearm_rto(sk); } /* Updates the RACK's reo_wnd based on DSACK and no. of recoveries. * * If a DSACK is received that seems like it may have been due to reordering * triggering fast recovery, increment reo_wnd by min_rtt/4 (upper bounded * by srtt), since there is possibility that spurious retransmission was * due to reordering delay longer than reo_wnd. * * Persist the current reo_wnd value for TCP_RACK_RECOVERY_THRESH (16) * no. of successful recoveries (accounts for full DSACK-based loss * recovery undo). After that, reset it to default (min_rtt/4). * * At max, reo_wnd is incremented only once per rtt. So that the new * DSACK on which we are reacting, is due to the spurious retx (approx) * after the reo_wnd has been updated last time. * * reo_wnd is tracked in terms of steps (of min_rtt/4), rather than * absolute value to account for change in rtt. */ void tcp_rack_update_reo_wnd(struct sock *sk, struct rate_sample *rs) { struct tcp_sock *tp = tcp_sk(sk); if ((READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_recovery) & TCP_RACK_STATIC_REO_WND) || !rs->prior_delivered) return; /* Disregard DSACK if a rtt has not passed since we adjusted reo_wnd */ if (before(rs->prior_delivered, tp->rack.last_delivered)) tp->rack.dsack_seen = 0; /* Adjust the reo_wnd if update is pending */ if (tp->rack.dsack_seen) { tp->rack.reo_wnd_steps = min_t(u32, 0xFF, tp->rack.reo_wnd_steps + 1); tp->rack.dsack_seen = 0; tp->rack.last_delivered = tp->delivered; tp->rack.reo_wnd_persist = TCP_RACK_RECOVERY_THRESH; } else if (!tp->rack.reo_wnd_persist) { tp->rack.reo_wnd_steps = 1; } } /* RFC6582 NewReno recovery for non-SACK connection. It simply retransmits * the next unacked packet upon receiving * a) three or more DUPACKs to start the fast recovery * b) an ACK acknowledging new data during the fast recovery. */ void tcp_newreno_mark_lost(struct sock *sk, bool snd_una_advanced) { const u8 state = inet_csk(sk)->icsk_ca_state; struct tcp_sock *tp = tcp_sk(sk); if ((state < TCP_CA_Recovery && tp->sacked_out >= tp->reordering) || (state == TCP_CA_Recovery && snd_una_advanced)) { struct sk_buff *skb = tcp_rtx_queue_head(sk); u32 mss; if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST) return; mss = tcp_skb_mss(skb); if (tcp_skb_pcount(skb) > 1 && skb->len > mss) tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb, mss, mss, GFP_ATOMIC); tcp_mark_skb_lost(sk, skb); } }
2775 2790 1842 17 17 1492 1493 902 543 937 872 73 853 67 923 470 470 102 102 102 102 101 912 2 159 102 101 102 102 102 102 102 102 57 57 38 57 1 4 57 14 14 14 14 57 44 36 28 24 27 67 30 3 33 1 28 61 5 5 1 3 8 45 45 4 46 53 61 9 4 1 5 886 828 888 286 3 5 26 721 47 202 49 3 170 61 291 269 228 83 16 15 1108 1111 1104 440 1 1 1110 1108 1110 1109 1 953 925 925 310 310 310 1111 812 27 837 839 837 16 694 2 692 64 64 1 1 679 1 6 692 672 2 673 824 6 6 6 6 808 14 670 359 820 8 815 689 329 5 1 1 1 2 2 2 2 2 2 816 18 11 11 1046 913 921 7 7 888 812 65 14 822 62 48 14 2 1 2 6 10 839 820 4 825 801 24 825 5 66 807 16 12 24 80 2468 1196 2474 2475 2473 2468 24 2462 2479 47 10 75 51 47 75 75 58 15 5 11 2 72 848 943 47 925 926 924 843 902 843 842 843 19 901 799 741 899 885 900 4 841 4 599 558 902 227 883 962 918 1108 1107 1 7 8 8 1108 4 690 1032 998 39 901 912 961 159 1107 1108 1111 1005 1108 1107 19 1109 962 2780 2782 2781 2769 236 2779 2772 2783 2779 2777 2784 41 2780 2770 231 2771 2777 2775 2772 36 2767 2723 2628 1145 1145 2629 2626 1033 38 2722 2727 2726 2627 2629 1146 1146 1148 72 123 1344 141 15 1344 1344 1344 1344 1344 1342 1344 141 1323 470 470 3 3 3 3 3 1 3 3 3 3 3 3 3 3 3 3 3 2 3 3 3 3 3 3 3 3 3 3 3 2 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 // SPDX-License-Identifier: GPL-2.0-or-later /* * Linux INET6 implementation * Forwarding Information Database * * Authors: * Pedro Roque <roque@di.fc.ul.pt> * * Changes: * Yuji SEKIYA @USAGI: Support default route on router node; * remove ip6_null_entry from the top of * routing table. * Ville Nuorvala: Fixed routing subtrees. */ #define pr_fmt(fmt) "IPv6: " fmt #include <linux/bpf.h> #include <linux/errno.h> #include <linux/types.h> #include <linux/net.h> #include <linux/route.h> #include <linux/netdevice.h> #include <linux/in6.h> #include <linux/init.h> #include <linux/list.h> #include <linux/slab.h> #include <net/ip.h> #include <net/ipv6.h> #include <net/ndisc.h> #include <net/addrconf.h> #include <net/lwtunnel.h> #include <net/fib_notifier.h> #include <net/ip_fib.h> #include <net/ip6_fib.h> #include <net/ip6_route.h> static struct kmem_cache *fib6_node_kmem __read_mostly; struct fib6_cleaner { struct fib6_walker w; struct net *net; int (*func)(struct fib6_info *, void *arg); int sernum; void *arg; bool skip_notify; }; #ifdef CONFIG_IPV6_SUBTREES #define FWS_INIT FWS_S #else #define FWS_INIT FWS_L #endif static struct fib6_info *fib6_find_prefix(struct net *net, struct fib6_table *table, struct fib6_node *fn); static struct fib6_node *fib6_repair_tree(struct net *net, struct fib6_table *table, struct fib6_node *fn); static int fib6_walk(struct net *net, struct fib6_walker *w); static int fib6_walk_continue(struct fib6_walker *w); /* * A routing update causes an increase of the serial number on the * affected subtree. This allows for cached routes to be asynchronously * tested when modifications are made to the destination cache as a * result of redirects, path MTU changes, etc. */ static void fib6_gc_timer_cb(struct timer_list *t); #define FOR_WALKERS(net, w) \ list_for_each_entry(w, &(net)->ipv6.fib6_walkers, lh) static void fib6_walker_link(struct net *net, struct fib6_walker *w) { write_lock_bh(&net->ipv6.fib6_walker_lock); list_add(&w->lh, &net->ipv6.fib6_walkers); write_unlock_bh(&net->ipv6.fib6_walker_lock); } static void fib6_walker_unlink(struct net *net, struct fib6_walker *w) { write_lock_bh(&net->ipv6.fib6_walker_lock); list_del(&w->lh); write_unlock_bh(&net->ipv6.fib6_walker_lock); } static int fib6_new_sernum(struct net *net) { int new, old = atomic_read(&net->ipv6.fib6_sernum); do { new = old < INT_MAX ? old + 1 : 1; } while (!atomic_try_cmpxchg(&net->ipv6.fib6_sernum, &old, new)); return new; } enum { FIB6_NO_SERNUM_CHANGE = 0, }; void fib6_update_sernum(struct net *net, struct fib6_info *f6i) { struct fib6_node *fn; fn = rcu_dereference_protected(f6i->fib6_node, lockdep_is_held(&f6i->fib6_table->tb6_lock)); if (fn) WRITE_ONCE(fn->fn_sernum, fib6_new_sernum(net)); } /* * Auxiliary address test functions for the radix tree. * * These assume a 32bit processor (although it will work on * 64bit processors) */ /* * test bit */ #if defined(__LITTLE_ENDIAN) # define BITOP_BE32_SWIZZLE (0x1F & ~7) #else # define BITOP_BE32_SWIZZLE 0 #endif static __be32 addr_bit_set(const void *token, int fn_bit) { const __be32 *addr = token; /* * Here, * 1 << ((~fn_bit ^ BITOP_BE32_SWIZZLE) & 0x1f) * is optimized version of * htonl(1 << ((~fn_bit)&0x1F)) * See include/asm-generic/bitops/le.h. */ return (__force __be32)(1 << ((~fn_bit ^ BITOP_BE32_SWIZZLE) & 0x1f)) & addr[fn_bit >> 5]; } struct fib6_info *fib6_info_alloc(gfp_t gfp_flags, bool with_fib6_nh) { struct fib6_info *f6i; size_t sz = sizeof(*f6i); if (with_fib6_nh) sz += sizeof(struct fib6_nh); f6i = kzalloc(sz, gfp_flags); if (!f6i) return NULL; /* fib6_siblings is a union with nh_list, so this initializes both */ INIT_LIST_HEAD(&f6i->fib6_siblings); refcount_set(&f6i->fib6_ref, 1); INIT_HLIST_NODE(&f6i->gc_link); return f6i; } void fib6_info_destroy_rcu(struct rcu_head *head) { struct fib6_info *f6i = container_of(head, struct fib6_info, rcu); WARN_ON(f6i->fib6_node); if (f6i->nh) nexthop_put(f6i->nh); else fib6_nh_release(f6i->fib6_nh); ip_fib_metrics_put(f6i->fib6_metrics); kfree(f6i); } EXPORT_SYMBOL_GPL(fib6_info_destroy_rcu); static struct fib6_node *node_alloc(struct net *net) { struct fib6_node *fn; fn = kmem_cache_zalloc(fib6_node_kmem, GFP_ATOMIC); if (fn) net->ipv6.rt6_stats->fib_nodes++; return fn; } static void node_free_immediate(struct net *net, struct fib6_node *fn) { kmem_cache_free(fib6_node_kmem, fn); net->ipv6.rt6_stats->fib_nodes--; } static void node_free(struct net *net, struct fib6_node *fn) { kfree_rcu(fn, rcu); net->ipv6.rt6_stats->fib_nodes--; } static void fib6_free_table(struct fib6_table *table) { inetpeer_invalidate_tree(&table->tb6_peers); kfree(table); } static void fib6_link_table(struct net *net, struct fib6_table *tb) { unsigned int h; /* * Initialize table lock at a single place to give lockdep a key, * tables aren't visible prior to being linked to the list. */ spin_lock_init(&tb->tb6_lock); h = tb->tb6_id & (FIB6_TABLE_HASHSZ - 1); /* * No protection necessary, this is the only list mutatation * operation, tables never disappear once they exist. */ hlist_add_head_rcu(&tb->tb6_hlist, &net->ipv6.fib_table_hash[h]); } #ifdef CONFIG_IPV6_MULTIPLE_TABLES static struct fib6_table *fib6_alloc_table(struct net *net, u32 id) { struct fib6_table *table; table = kzalloc(sizeof(*table), GFP_ATOMIC); if (table) { table->tb6_id = id; rcu_assign_pointer(table->tb6_root.leaf, net->ipv6.fib6_null_entry); table->tb6_root.fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO; inet_peer_base_init(&table->tb6_peers); INIT_HLIST_HEAD(&table->tb6_gc_hlist); } return table; } struct fib6_table *fib6_new_table(struct net *net, u32 id) { struct fib6_table *tb; if (id == 0) id = RT6_TABLE_MAIN; tb = fib6_get_table(net, id); if (tb) return tb; tb = fib6_alloc_table(net, id); if (tb) fib6_link_table(net, tb); return tb; } EXPORT_SYMBOL_GPL(fib6_new_table); struct fib6_table *fib6_get_table(struct net *net, u32 id) { struct fib6_table *tb; struct hlist_head *head; unsigned int h; if (id == 0) id = RT6_TABLE_MAIN; h = id & (FIB6_TABLE_HASHSZ - 1); rcu_read_lock(); head = &net->ipv6.fib_table_hash[h]; hlist_for_each_entry_rcu(tb, head, tb6_hlist) { if (tb->tb6_id == id) { rcu_read_unlock(); return tb; } } rcu_read_unlock(); return NULL; } EXPORT_SYMBOL_GPL(fib6_get_table); static void __net_init fib6_tables_init(struct net *net) { fib6_link_table(net, net->ipv6.fib6_main_tbl); fib6_link_table(net, net->ipv6.fib6_local_tbl); } #else struct fib6_table *fib6_new_table(struct net *net, u32 id) { return fib6_get_table(net, id); } struct fib6_table *fib6_get_table(struct net *net, u32 id) { return net->ipv6.fib6_main_tbl; } struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6, const struct sk_buff *skb, int flags, pol_lookup_t lookup) { struct rt6_info *rt; rt = pol_lookup_func(lookup, net, net->ipv6.fib6_main_tbl, fl6, skb, flags); if (rt->dst.error == -EAGAIN) { ip6_rt_put_flags(rt, flags); rt = net->ipv6.ip6_null_entry; if (!(flags & RT6_LOOKUP_F_DST_NOREF)) dst_hold(&rt->dst); } return &rt->dst; } /* called with rcu lock held; no reference taken on fib6_info */ int fib6_lookup(struct net *net, int oif, struct flowi6 *fl6, struct fib6_result *res, int flags) { return fib6_table_lookup(net, net->ipv6.fib6_main_tbl, oif, fl6, res, flags); } static void __net_init fib6_tables_init(struct net *net) { fib6_link_table(net, net->ipv6.fib6_main_tbl); } #endif unsigned int fib6_tables_seq_read(const struct net *net) { unsigned int h, fib_seq = 0; rcu_read_lock(); for (h = 0; h < FIB6_TABLE_HASHSZ; h++) { const struct hlist_head *head = &net->ipv6.fib_table_hash[h]; const struct fib6_table *tb; hlist_for_each_entry_rcu(tb, head, tb6_hlist) fib_seq += READ_ONCE(tb->fib_seq); } rcu_read_unlock(); return fib_seq; } static int call_fib6_entry_notifier(struct notifier_block *nb, enum fib_event_type event_type, struct fib6_info *rt, struct netlink_ext_ack *extack) { struct fib6_entry_notifier_info info = { .info.extack = extack, .rt = rt, }; return call_fib6_notifier(nb, event_type, &info.info); } static int call_fib6_multipath_entry_notifier(struct notifier_block *nb, enum fib_event_type event_type, struct fib6_info *rt, unsigned int nsiblings, struct netlink_ext_ack *extack) { struct fib6_entry_notifier_info info = { .info.extack = extack, .rt = rt, .nsiblings = nsiblings, }; return call_fib6_notifier(nb, event_type, &info.info); } int call_fib6_entry_notifiers(struct net *net, enum fib_event_type event_type, struct fib6_info *rt, struct netlink_ext_ack *extack) { struct fib6_entry_notifier_info info = { .info.extack = extack, .rt = rt, }; WRITE_ONCE(rt->fib6_table->fib_seq, rt->fib6_table->fib_seq + 1); return call_fib6_notifiers(net, event_type, &info.info); } int call_fib6_multipath_entry_notifiers(struct net *net, enum fib_event_type event_type, struct fib6_info *rt, unsigned int nsiblings, struct netlink_ext_ack *extack) { struct fib6_entry_notifier_info info = { .info.extack = extack, .rt = rt, .nsiblings = nsiblings, }; WRITE_ONCE(rt->fib6_table->fib_seq, rt->fib6_table->fib_seq + 1); return call_fib6_notifiers(net, event_type, &info.info); } int call_fib6_entry_notifiers_replace(struct net *net, struct fib6_info *rt) { struct fib6_entry_notifier_info info = { .rt = rt, .nsiblings = rt->fib6_nsiblings, }; WRITE_ONCE(rt->fib6_table->fib_seq, rt->fib6_table->fib_seq + 1); return call_fib6_notifiers(net, FIB_EVENT_ENTRY_REPLACE, &info.info); } struct fib6_dump_arg { struct net *net; struct notifier_block *nb; struct netlink_ext_ack *extack; }; static int fib6_rt_dump(struct fib6_info *rt, struct fib6_dump_arg *arg) { enum fib_event_type fib_event = FIB_EVENT_ENTRY_REPLACE; int err; if (!rt || rt == arg->net->ipv6.fib6_null_entry) return 0; if (rt->fib6_nsiblings) err = call_fib6_multipath_entry_notifier(arg->nb, fib_event, rt, rt->fib6_nsiblings, arg->extack); else err = call_fib6_entry_notifier(arg->nb, fib_event, rt, arg->extack); return err; } static int fib6_node_dump(struct fib6_walker *w) { int err; err = fib6_rt_dump(w->leaf, w->args); w->leaf = NULL; return err; } static int fib6_table_dump(struct net *net, struct fib6_table *tb, struct fib6_walker *w) { int err; w->root = &tb->tb6_root; spin_lock_bh(&tb->tb6_lock); err = fib6_walk(net, w); spin_unlock_bh(&tb->tb6_lock); return err; } /* Called with rcu_read_lock() */ int fib6_tables_dump(struct net *net, struct notifier_block *nb, struct netlink_ext_ack *extack) { struct fib6_dump_arg arg; struct fib6_walker *w; unsigned int h; int err = 0; w = kzalloc(sizeof(*w), GFP_ATOMIC); if (!w) return -ENOMEM; w->func = fib6_node_dump; arg.net = net; arg.nb = nb; arg.extack = extack; w->args = &arg; for (h = 0; h < FIB6_TABLE_HASHSZ; h++) { struct hlist_head *head = &net->ipv6.fib_table_hash[h]; struct fib6_table *tb; hlist_for_each_entry_rcu(tb, head, tb6_hlist) { err = fib6_table_dump(net, tb, w); if (err) goto out; } } out: kfree(w); /* The tree traversal function should never return a positive value. */ return err > 0 ? -EINVAL : err; } static int fib6_dump_node(struct fib6_walker *w) { int res; struct fib6_info *rt; for_each_fib6_walker_rt(w) { res = rt6_dump_route(rt, w->args, w->skip_in_node); if (res >= 0) { /* Frame is full, suspend walking */ w->leaf = rt; /* We'll restart from this node, so if some routes were * already dumped, skip them next time. */ w->skip_in_node += res; return 1; } w->skip_in_node = 0; /* Multipath routes are dumped in one route with the * RTA_MULTIPATH attribute. Jump 'rt' to point to the * last sibling of this route (no need to dump the * sibling routes again) */ if (rt->fib6_nsiblings) rt = list_last_entry(&rt->fib6_siblings, struct fib6_info, fib6_siblings); } w->leaf = NULL; return 0; } static void fib6_dump_end(struct netlink_callback *cb) { struct net *net = sock_net(cb->skb->sk); struct fib6_walker *w = (void *)cb->args[2]; if (w) { if (cb->args[4]) { cb->args[4] = 0; fib6_walker_unlink(net, w); } cb->args[2] = 0; kfree(w); } cb->done = (void *)cb->args[3]; cb->args[1] = 3; } static int fib6_dump_done(struct netlink_callback *cb) { fib6_dump_end(cb); return cb->done ? cb->done(cb) : 0; } static int fib6_dump_table(struct fib6_table *table, struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); struct fib6_walker *w; int res; w = (void *)cb->args[2]; w->root = &table->tb6_root; if (cb->args[4] == 0) { w->count = 0; w->skip = 0; w->skip_in_node = 0; spin_lock_bh(&table->tb6_lock); res = fib6_walk(net, w); spin_unlock_bh(&table->tb6_lock); if (res > 0) { cb->args[4] = 1; cb->args[5] = READ_ONCE(w->root->fn_sernum); } } else { int sernum = READ_ONCE(w->root->fn_sernum); if (cb->args[5] != sernum) { /* Begin at the root if the tree changed */ cb->args[5] = sernum; w->state = FWS_INIT; w->node = w->root; w->skip = w->count; w->skip_in_node = 0; } else w->skip = 0; spin_lock_bh(&table->tb6_lock); res = fib6_walk_continue(w); spin_unlock_bh(&table->tb6_lock); if (res <= 0) { fib6_walker_unlink(net, w); cb->args[4] = 0; } } return res; } static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) { struct rt6_rtnl_dump_arg arg = { .filter.dump_exceptions = true, .filter.dump_routes = true, .filter.rtnl_held = false, }; const struct nlmsghdr *nlh = cb->nlh; struct net *net = sock_net(skb->sk); unsigned int e = 0, s_e; struct hlist_head *head; struct fib6_walker *w; struct fib6_table *tb; unsigned int h, s_h; int err = 0; rcu_read_lock(); if (cb->strict_check) { err = ip_valid_fib_dump_req(net, nlh, &arg.filter, cb); if (err < 0) goto unlock; } else if (nlmsg_len(nlh) >= sizeof(struct rtmsg)) { struct rtmsg *rtm = nlmsg_data(nlh); if (rtm->rtm_flags & RTM_F_PREFIX) arg.filter.flags = RTM_F_PREFIX; } w = (void *)cb->args[2]; if (!w) { /* New dump: * * 1. allocate and initialize walker. */ w = kzalloc(sizeof(*w), GFP_ATOMIC); if (!w) { err = -ENOMEM; goto unlock; } w->func = fib6_dump_node; cb->args[2] = (long)w; /* 2. hook callback destructor. */ cb->args[3] = (long)cb->done; cb->done = fib6_dump_done; } arg.skb = skb; arg.cb = cb; arg.net = net; w->args = &arg; if (arg.filter.table_id) { tb = fib6_get_table(net, arg.filter.table_id); if (!tb) { if (rtnl_msg_family(cb->nlh) != PF_INET6) goto unlock; NL_SET_ERR_MSG_MOD(cb->extack, "FIB table does not exist"); err = -ENOENT; goto unlock; } if (!cb->args[0]) { err = fib6_dump_table(tb, skb, cb); if (!err) cb->args[0] = 1; } goto unlock; } s_h = cb->args[0]; s_e = cb->args[1]; for (h = s_h; h < FIB6_TABLE_HASHSZ; h++, s_e = 0) { e = 0; head = &net->ipv6.fib_table_hash[h]; hlist_for_each_entry_rcu(tb, head, tb6_hlist) { if (e < s_e) goto next; err = fib6_dump_table(tb, skb, cb); if (err != 0) goto out; next: e++; } } out: cb->args[1] = e; cb->args[0] = h; unlock: rcu_read_unlock(); if (err <= 0) fib6_dump_end(cb); return err; } void fib6_metric_set(struct fib6_info *f6i, int metric, u32 val) { if (!f6i) return; if (f6i->fib6_metrics == &dst_default_metrics) { struct dst_metrics *p = kzalloc(sizeof(*p), GFP_ATOMIC); if (!p) return; refcount_set(&p->refcnt, 1); f6i->fib6_metrics = p; } f6i->fib6_metrics->metrics[metric - 1] = val; } /* * Routing Table * * return the appropriate node for a routing tree "add" operation * by either creating and inserting or by returning an existing * node. */ static struct fib6_node *fib6_add_1(struct net *net, struct fib6_table *table, struct fib6_node *root, struct in6_addr *addr, int plen, int offset, int allow_create, int replace_required, struct netlink_ext_ack *extack) { struct fib6_node *fn, *in, *ln; struct fib6_node *pn = NULL; struct rt6key *key; int bit; __be32 dir = 0; /* insert node in tree */ fn = root; do { struct fib6_info *leaf = rcu_dereference_protected(fn->leaf, lockdep_is_held(&table->tb6_lock)); key = (struct rt6key *)((u8 *)leaf + offset); /* * Prefix match */ if (plen < fn->fn_bit || !ipv6_prefix_equal(&key->addr, addr, fn->fn_bit)) { if (!allow_create) { if (replace_required) { NL_SET_ERR_MSG(extack, "Can not replace route - no match found"); pr_warn("Can't replace route, no match found\n"); return ERR_PTR(-ENOENT); } pr_warn("NLM_F_CREATE should be set when creating new route\n"); } goto insert_above; } /* * Exact match ? */ if (plen == fn->fn_bit) { /* clean up an intermediate node */ if (!(fn->fn_flags & RTN_RTINFO)) { RCU_INIT_POINTER(fn->leaf, NULL); fib6_info_release(leaf); /* remove null_entry in the root node */ } else if (fn->fn_flags & RTN_TL_ROOT && rcu_access_pointer(fn->leaf) == net->ipv6.fib6_null_entry) { RCU_INIT_POINTER(fn->leaf, NULL); } return fn; } /* * We have more bits to go */ /* Try to walk down on tree. */ dir = addr_bit_set(addr, fn->fn_bit); pn = fn; fn = dir ? rcu_dereference_protected(fn->right, lockdep_is_held(&table->tb6_lock)) : rcu_dereference_protected(fn->left, lockdep_is_held(&table->tb6_lock)); } while (fn); if (!allow_create) { /* We should not create new node because * NLM_F_REPLACE was specified without NLM_F_CREATE * I assume it is safe to require NLM_F_CREATE when * REPLACE flag is used! Later we may want to remove the * check for replace_required, because according * to netlink specification, NLM_F_CREATE * MUST be specified if new route is created. * That would keep IPv6 consistent with IPv4 */ if (replace_required) { NL_SET_ERR_MSG(extack, "Can not replace route - no match found"); pr_warn("Can't replace route, no match found\n"); return ERR_PTR(-ENOENT); } pr_warn("NLM_F_CREATE should be set when creating new route\n"); } /* * We walked to the bottom of tree. * Create new leaf node without children. */ ln = node_alloc(net); if (!ln) return ERR_PTR(-ENOMEM); ln->fn_bit = plen; RCU_INIT_POINTER(ln->parent, pn); if (dir) rcu_assign_pointer(pn->right, ln); else rcu_assign_pointer(pn->left, ln); return ln; insert_above: /* * split since we don't have a common prefix anymore or * we have a less significant route. * we've to insert an intermediate node on the list * this new node will point to the one we need to create * and the current */ pn = rcu_dereference_protected(fn->parent, lockdep_is_held(&table->tb6_lock)); /* find 1st bit in difference between the 2 addrs. See comment in __ipv6_addr_diff: bit may be an invalid value, but if it is >= plen, the value is ignored in any case. */ bit = __ipv6_addr_diff(addr, &key->addr, sizeof(*addr)); /* * (intermediate)[in] * / \ * (new leaf node)[ln] (old node)[fn] */ if (plen > bit) { in = node_alloc(net); ln = node_alloc(net); if (!in || !ln) { if (in) node_free_immediate(net, in); if (ln) node_free_immediate(net, ln); return ERR_PTR(-ENOMEM); } /* * new intermediate node. * RTN_RTINFO will * be off since that an address that chooses one of * the branches would not match less specific routes * in the other branch */ in->fn_bit = bit; RCU_INIT_POINTER(in->parent, pn); in->leaf = fn->leaf; fib6_info_hold(rcu_dereference_protected(in->leaf, lockdep_is_held(&table->tb6_lock))); /* update parent pointer */ if (dir) rcu_assign_pointer(pn->right, in); else rcu_assign_pointer(pn->left, in); ln->fn_bit = plen; RCU_INIT_POINTER(ln->parent, in); rcu_assign_pointer(fn->parent, in); if (addr_bit_set(addr, bit)) { rcu_assign_pointer(in->right, ln); rcu_assign_pointer(in->left, fn); } else { rcu_assign_pointer(in->left, ln); rcu_assign_pointer(in->right, fn); } } else { /* plen <= bit */ /* * (new leaf node)[ln] * / \ * (old node)[fn] NULL */ ln = node_alloc(net); if (!ln) return ERR_PTR(-ENOMEM); ln->fn_bit = plen; RCU_INIT_POINTER(ln->parent, pn); if (addr_bit_set(&key->addr, plen)) RCU_INIT_POINTER(ln->right, fn); else RCU_INIT_POINTER(ln->left, fn); rcu_assign_pointer(fn->parent, ln); if (dir) rcu_assign_pointer(pn->right, ln); else rcu_assign_pointer(pn->left, ln); } return ln; } static void __fib6_drop_pcpu_from(struct fib6_nh *fib6_nh, const struct fib6_info *match, const struct fib6_table *table) { int cpu; if (!fib6_nh->rt6i_pcpu) return; rcu_read_lock(); /* release the reference to this fib entry from * all of its cached pcpu routes */ for_each_possible_cpu(cpu) { struct rt6_info **ppcpu_rt; struct rt6_info *pcpu_rt; ppcpu_rt = per_cpu_ptr(fib6_nh->rt6i_pcpu, cpu); /* Paired with xchg() in rt6_get_pcpu_route() */ pcpu_rt = READ_ONCE(*ppcpu_rt); /* only dropping the 'from' reference if the cached route * is using 'match'. The cached pcpu_rt->from only changes * from a fib6_info to NULL (ip6_dst_destroy); it can never * change from one fib6_info reference to another */ if (pcpu_rt && rcu_access_pointer(pcpu_rt->from) == match) { struct fib6_info *from; from = unrcu_pointer(xchg(&pcpu_rt->from, NULL)); fib6_info_release(from); } } rcu_read_unlock(); } struct fib6_nh_pcpu_arg { struct fib6_info *from; const struct fib6_table *table; }; static int fib6_nh_drop_pcpu_from(struct fib6_nh *nh, void *_arg) { struct fib6_nh_pcpu_arg *arg = _arg; __fib6_drop_pcpu_from(nh, arg->from, arg->table); return 0; } static void fib6_drop_pcpu_from(struct fib6_info *f6i, const struct fib6_table *table) { /* Make sure rt6_make_pcpu_route() wont add other percpu routes * while we are cleaning them here. */ f6i->fib6_destroying = 1; mb(); /* paired with the cmpxchg() in rt6_make_pcpu_route() */ if (f6i->nh) { struct fib6_nh_pcpu_arg arg = { .from = f6i, .table = table }; nexthop_for_each_fib6_nh(f6i->nh, fib6_nh_drop_pcpu_from, &arg); } else { struct fib6_nh *fib6_nh; fib6_nh = f6i->fib6_nh; __fib6_drop_pcpu_from(fib6_nh, f6i, table); } } static void fib6_purge_rt(struct fib6_info *rt, struct fib6_node *fn, struct net *net) { struct fib6_table *table = rt->fib6_table; /* Flush all cached dst in exception table */ rt6_flush_exceptions(rt); fib6_drop_pcpu_from(rt, table); if (rt->nh && !list_empty(&rt->nh_list)) list_del_init(&rt->nh_list); if (refcount_read(&rt->fib6_ref) != 1) { /* This route is used as dummy address holder in some split * nodes. It is not leaked, but it still holds other resources, * which must be released in time. So, scan ascendant nodes * and replace dummy references to this route with references * to still alive ones. */ while (fn) { struct fib6_info *leaf = rcu_dereference_protected(fn->leaf, lockdep_is_held(&table->tb6_lock)); struct fib6_info *new_leaf; if (!(fn->fn_flags & RTN_RTINFO) && leaf == rt) { new_leaf = fib6_find_prefix(net, table, fn); fib6_info_hold(new_leaf); rcu_assign_pointer(fn->leaf, new_leaf); fib6_info_release(rt); } fn = rcu_dereference_protected(fn->parent, lockdep_is_held(&table->tb6_lock)); } } fib6_clean_expires(rt); fib6_remove_gc_list(rt); } /* * Insert routing information in a node. */ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt, struct nl_info *info, struct netlink_ext_ack *extack) { struct fib6_info *leaf = rcu_dereference_protected(fn->leaf, lockdep_is_held(&rt->fib6_table->tb6_lock)); struct fib6_info *iter = NULL; struct fib6_info __rcu **ins; struct fib6_info __rcu **fallback_ins = NULL; int replace = (info->nlh && (info->nlh->nlmsg_flags & NLM_F_REPLACE)); int add = (!info->nlh || (info->nlh->nlmsg_flags & NLM_F_CREATE)); int found = 0; bool rt_can_ecmp = rt6_qualify_for_ecmp(rt); bool notify_sibling_rt = false; u16 nlflags = NLM_F_EXCL; int err; if (info->nlh && (info->nlh->nlmsg_flags & NLM_F_APPEND)) nlflags |= NLM_F_APPEND; ins = &fn->leaf; for (iter = leaf; iter; iter = rcu_dereference_protected(iter->fib6_next, lockdep_is_held(&rt->fib6_table->tb6_lock))) { /* * Search for duplicates */ if (iter->fib6_metric == rt->fib6_metric) { /* * Same priority level */ if (info->nlh && (info->nlh->nlmsg_flags & NLM_F_EXCL)) return -EEXIST; nlflags &= ~NLM_F_EXCL; if (replace) { if (rt_can_ecmp == rt6_qualify_for_ecmp(iter)) { found++; break; } fallback_ins = fallback_ins ?: ins; goto next_iter; } if (rt6_duplicate_nexthop(iter, rt)) { if (rt->fib6_nsiblings) rt->fib6_nsiblings = 0; if (!(iter->fib6_flags & RTF_EXPIRES)) return -EEXIST; if (!(rt->fib6_flags & RTF_EXPIRES)) { fib6_clean_expires(iter); fib6_remove_gc_list(iter); } else { fib6_set_expires(iter, rt->expires); fib6_add_gc_list(iter); } if (rt->fib6_pmtu) fib6_metric_set(iter, RTAX_MTU, rt->fib6_pmtu); return -EEXIST; } /* If we have the same destination and the same metric, * but not the same gateway, then the route we try to * add is sibling to this route, increment our counter * of siblings, and later we will add our route to the * list. * Only static routes (which don't have flag * RTF_EXPIRES) are used for ECMPv6. * * To avoid long list, we only had siblings if the * route have a gateway. */ if (rt_can_ecmp && rt6_qualify_for_ecmp(iter)) rt->fib6_nsiblings++; } if (iter->fib6_metric > rt->fib6_metric) break; next_iter: ins = &iter->fib6_next; } if (fallback_ins && !found) { /* No matching route with same ecmp-able-ness found, replace * first matching route */ ins = fallback_ins; iter = rcu_dereference_protected(*ins, lockdep_is_held(&rt->fib6_table->tb6_lock)); found++; } /* Reset round-robin state, if necessary */ if (ins == &fn->leaf) fn->rr_ptr = NULL; /* Link this route to others same route. */ if (rt->fib6_nsiblings) { unsigned int fib6_nsiblings; struct fib6_info *sibling, *temp_sibling; /* Find the first route that have the same metric */ sibling = leaf; notify_sibling_rt = true; while (sibling) { if (sibling->fib6_metric == rt->fib6_metric && rt6_qualify_for_ecmp(sibling)) { list_add_tail_rcu(&rt->fib6_siblings, &sibling->fib6_siblings); break; } sibling = rcu_dereference_protected(sibling->fib6_next, lockdep_is_held(&rt->fib6_table->tb6_lock)); notify_sibling_rt = false; } /* For each sibling in the list, increment the counter of * siblings. BUG() if counters does not match, list of siblings * is broken! */ fib6_nsiblings = 0; list_for_each_entry_safe(sibling, temp_sibling, &rt->fib6_siblings, fib6_siblings) { sibling->fib6_nsiblings++; BUG_ON(sibling->fib6_nsiblings != rt->fib6_nsiblings); fib6_nsiblings++; } BUG_ON(fib6_nsiblings != rt->fib6_nsiblings); rt6_multipath_rebalance(temp_sibling); } /* * insert node */ if (!replace) { if (!add) pr_warn("NLM_F_CREATE should be set when creating new route\n"); add: nlflags |= NLM_F_CREATE; /* The route should only be notified if it is the first * route in the node or if it is added as a sibling * route to the first route in the node. */ if (!info->skip_notify_kernel && (notify_sibling_rt || ins == &fn->leaf)) { enum fib_event_type fib_event; if (notify_sibling_rt) fib_event = FIB_EVENT_ENTRY_APPEND; else fib_event = FIB_EVENT_ENTRY_REPLACE; err = call_fib6_entry_notifiers(info->nl_net, fib_event, rt, extack); if (err) { struct fib6_info *sibling, *next_sibling; /* If the route has siblings, then it first * needs to be unlinked from them. */ if (!rt->fib6_nsiblings) return err; list_for_each_entry_safe(sibling, next_sibling, &rt->fib6_siblings, fib6_siblings) sibling->fib6_nsiblings--; rt->fib6_nsiblings = 0; list_del_rcu(&rt->fib6_siblings); rt6_multipath_rebalance(next_sibling); return err; } } rcu_assign_pointer(rt->fib6_next, iter); fib6_info_hold(rt); rcu_assign_pointer(rt->fib6_node, fn); rcu_assign_pointer(*ins, rt); if (!info->skip_notify) inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags); info->nl_net->ipv6.rt6_stats->fib_rt_entries++; if (!(fn->fn_flags & RTN_RTINFO)) { info->nl_net->ipv6.rt6_stats->fib_route_nodes++; fn->fn_flags |= RTN_RTINFO; } } else { int nsiblings; if (!found) { if (add) goto add; pr_warn("NLM_F_REPLACE set, but no existing node found!\n"); return -ENOENT; } if (!info->skip_notify_kernel && ins == &fn->leaf) { err = call_fib6_entry_notifiers(info->nl_net, FIB_EVENT_ENTRY_REPLACE, rt, extack); if (err) return err; } fib6_info_hold(rt); rcu_assign_pointer(rt->fib6_node, fn); rt->fib6_next = iter->fib6_next; rcu_assign_pointer(*ins, rt); if (!info->skip_notify) inet6_rt_notify(RTM_NEWROUTE, rt, info, NLM_F_REPLACE); if (!(fn->fn_flags & RTN_RTINFO)) { info->nl_net->ipv6.rt6_stats->fib_route_nodes++; fn->fn_flags |= RTN_RTINFO; } nsiblings = iter->fib6_nsiblings; iter->fib6_node = NULL; fib6_purge_rt(iter, fn, info->nl_net); if (rcu_access_pointer(fn->rr_ptr) == iter) fn->rr_ptr = NULL; fib6_info_release(iter); if (nsiblings) { /* Replacing an ECMP route, remove all siblings */ ins = &rt->fib6_next; iter = rcu_dereference_protected(*ins, lockdep_is_held(&rt->fib6_table->tb6_lock)); while (iter) { if (iter->fib6_metric > rt->fib6_metric) break; if (rt6_qualify_for_ecmp(iter)) { *ins = iter->fib6_next; iter->fib6_node = NULL; fib6_purge_rt(iter, fn, info->nl_net); if (rcu_access_pointer(fn->rr_ptr) == iter) fn->rr_ptr = NULL; fib6_info_release(iter); nsiblings--; info->nl_net->ipv6.rt6_stats->fib_rt_entries--; } else { ins = &iter->fib6_next; } iter = rcu_dereference_protected(*ins, lockdep_is_held(&rt->fib6_table->tb6_lock)); } WARN_ON(nsiblings != 0); } } return 0; } static void fib6_start_gc(struct net *net, struct fib6_info *rt) { if (!timer_pending(&net->ipv6.ip6_fib_timer) && (rt->fib6_flags & RTF_EXPIRES)) mod_timer(&net->ipv6.ip6_fib_timer, jiffies + net->ipv6.sysctl.ip6_rt_gc_interval); } void fib6_force_start_gc(struct net *net) { if (!timer_pending(&net->ipv6.ip6_fib_timer)) mod_timer(&net->ipv6.ip6_fib_timer, jiffies + net->ipv6.sysctl.ip6_rt_gc_interval); } static void __fib6_update_sernum_upto_root(struct fib6_info *rt, int sernum) { struct fib6_node *fn = rcu_dereference_protected(rt->fib6_node, lockdep_is_held(&rt->fib6_table->tb6_lock)); /* paired with smp_rmb() in fib6_get_cookie_safe() */ smp_wmb(); while (fn) { WRITE_ONCE(fn->fn_sernum, sernum); fn = rcu_dereference_protected(fn->parent, lockdep_is_held(&rt->fib6_table->tb6_lock)); } } void fib6_update_sernum_upto_root(struct net *net, struct fib6_info *rt) { __fib6_update_sernum_upto_root(rt, fib6_new_sernum(net)); } /* allow ipv4 to update sernum via ipv6_stub */ void fib6_update_sernum_stub(struct net *net, struct fib6_info *f6i) { spin_lock_bh(&f6i->fib6_table->tb6_lock); fib6_update_sernum_upto_root(net, f6i); spin_unlock_bh(&f6i->fib6_table->tb6_lock); } /* * Add routing information to the routing tree. * <destination addr>/<source addr> * with source addr info in sub-trees * Need to own table->tb6_lock */ int fib6_add(struct fib6_node *root, struct fib6_info *rt, struct nl_info *info, struct netlink_ext_ack *extack) { struct fib6_table *table = rt->fib6_table; struct fib6_node *fn; #ifdef CONFIG_IPV6_SUBTREES struct fib6_node *pn = NULL; #endif int err = -ENOMEM; int allow_create = 1; int replace_required = 0; if (info->nlh) { if (!(info->nlh->nlmsg_flags & NLM_F_CREATE)) allow_create = 0; if (info->nlh->nlmsg_flags & NLM_F_REPLACE) replace_required = 1; } if (!allow_create && !replace_required) pr_warn("RTM_NEWROUTE with no NLM_F_CREATE or NLM_F_REPLACE\n"); fn = fib6_add_1(info->nl_net, table, root, &rt->fib6_dst.addr, rt->fib6_dst.plen, offsetof(struct fib6_info, fib6_dst), allow_create, replace_required, extack); if (IS_ERR(fn)) { err = PTR_ERR(fn); fn = NULL; goto out; } #ifdef CONFIG_IPV6_SUBTREES pn = fn; if (rt->fib6_src.plen) { struct fib6_node *sn; if (!rcu_access_pointer(fn->subtree)) { struct fib6_node *sfn; /* * Create subtree. * * fn[main tree] * | * sfn[subtree root] * \ * sn[new leaf node] */ /* Create subtree root node */ sfn = node_alloc(info->nl_net); if (!sfn) goto failure; fib6_info_hold(info->nl_net->ipv6.fib6_null_entry); rcu_assign_pointer(sfn->leaf, info->nl_net->ipv6.fib6_null_entry); sfn->fn_flags = RTN_ROOT; /* Now add the first leaf node to new subtree */ sn = fib6_add_1(info->nl_net, table, sfn, &rt->fib6_src.addr, rt->fib6_src.plen, offsetof(struct fib6_info, fib6_src), allow_create, replace_required, extack); if (IS_ERR(sn)) { /* If it is failed, discard just allocated root, and then (in failure) stale node in main tree. */ node_free_immediate(info->nl_net, sfn); err = PTR_ERR(sn); goto failure; } /* Now link new subtree to main tree */ rcu_assign_pointer(sfn->parent, fn); rcu_assign_pointer(fn->subtree, sfn); } else { sn = fib6_add_1(info->nl_net, table, FIB6_SUBTREE(fn), &rt->fib6_src.addr, rt->fib6_src.plen, offsetof(struct fib6_info, fib6_src), allow_create, replace_required, extack); if (IS_ERR(sn)) { err = PTR_ERR(sn); goto failure; } } if (!rcu_access_pointer(fn->leaf)) { if (fn->fn_flags & RTN_TL_ROOT) { /* put back null_entry for root node */ rcu_assign_pointer(fn->leaf, info->nl_net->ipv6.fib6_null_entry); } else { fib6_info_hold(rt); rcu_assign_pointer(fn->leaf, rt); } } fn = sn; } #endif err = fib6_add_rt2node(fn, rt, info, extack); if (!err) { if (rt->nh) list_add(&rt->nh_list, &rt->nh->f6i_list); __fib6_update_sernum_upto_root(rt, fib6_new_sernum(info->nl_net)); if (rt->fib6_flags & RTF_EXPIRES) fib6_add_gc_list(rt); fib6_start_gc(info->nl_net, rt); } out: if (err) { #ifdef CONFIG_IPV6_SUBTREES /* * If fib6_add_1 has cleared the old leaf pointer in the * super-tree leaf node we have to find a new one for it. */ if (pn != fn) { struct fib6_info *pn_leaf = rcu_dereference_protected(pn->leaf, lockdep_is_held(&table->tb6_lock)); if (pn_leaf == rt) { pn_leaf = NULL; RCU_INIT_POINTER(pn->leaf, NULL); fib6_info_release(rt); } if (!pn_leaf && !(pn->fn_flags & RTN_RTINFO)) { pn_leaf = fib6_find_prefix(info->nl_net, table, pn); if (!pn_leaf) pn_leaf = info->nl_net->ipv6.fib6_null_entry; fib6_info_hold(pn_leaf); rcu_assign_pointer(pn->leaf, pn_leaf); } } #endif goto failure; } else if (fib6_requires_src(rt)) { fib6_routes_require_src_inc(info->nl_net); } return err; failure: /* fn->leaf could be NULL and fib6_repair_tree() needs to be called if: * 1. fn is an intermediate node and we failed to add the new * route to it in both subtree creation failure and fib6_add_rt2node() * failure case. * 2. fn is the root node in the table and we fail to add the first * default route to it. */ if (fn && (!(fn->fn_flags & (RTN_RTINFO|RTN_ROOT)) || (fn->fn_flags & RTN_TL_ROOT && !rcu_access_pointer(fn->leaf)))) fib6_repair_tree(info->nl_net, table, fn); return err; } /* * Routing tree lookup * */ struct lookup_args { int offset; /* key offset on fib6_info */ const struct in6_addr *addr; /* search key */ }; static struct fib6_node *fib6_node_lookup_1(struct fib6_node *root, struct lookup_args *args) { struct fib6_node *fn; __be32 dir; if (unlikely(args->offset == 0)) return NULL; /* * Descend on a tree */ fn = root; for (;;) { struct fib6_node *next; dir = addr_bit_set(args->addr, fn->fn_bit); next = dir ? rcu_dereference(fn->right) : rcu_dereference(fn->left); if (next) { fn = next; continue; } break; } while (fn) { struct fib6_node *subtree = FIB6_SUBTREE(fn); if (subtree || fn->fn_flags & RTN_RTINFO) { struct fib6_info *leaf = rcu_dereference(fn->leaf); struct rt6key *key; if (!leaf) goto backtrack; key = (struct rt6key *) ((u8 *)leaf + args->offset); if (ipv6_prefix_equal(&key->addr, args->addr, key->plen)) { #ifdef CONFIG_IPV6_SUBTREES if (subtree) { struct fib6_node *sfn; sfn = fib6_node_lookup_1(subtree, args + 1); if (!sfn) goto backtrack; fn = sfn; } #endif if (fn->fn_flags & RTN_RTINFO) return fn; } } backtrack: if (fn->fn_flags & RTN_ROOT) break; fn = rcu_dereference(fn->parent); } return NULL; } /* called with rcu_read_lock() held */ struct fib6_node *fib6_node_lookup(struct fib6_node *root, const struct in6_addr *daddr, const struct in6_addr *saddr) { struct fib6_node *fn; struct lookup_args args[] = { { .offset = offsetof(struct fib6_info, fib6_dst), .addr = daddr, }, #ifdef CONFIG_IPV6_SUBTREES { .offset = offsetof(struct fib6_info, fib6_src), .addr = saddr, }, #endif { .offset = 0, /* sentinel */ } }; fn = fib6_node_lookup_1(root, daddr ? args : args + 1); if (!fn || fn->fn_flags & RTN_TL_ROOT) fn = root; return fn; } /* * Get node with specified destination prefix (and source prefix, * if subtrees are used) * exact_match == true means we try to find fn with exact match of * the passed in prefix addr * exact_match == false means we try to find fn with longest prefix * match of the passed in prefix addr. This is useful for finding fn * for cached route as it will be stored in the exception table under * the node with longest prefix length. */ static struct fib6_node *fib6_locate_1(struct fib6_node *root, const struct in6_addr *addr, int plen, int offset, bool exact_match) { struct fib6_node *fn, *prev = NULL; for (fn = root; fn ; ) { struct fib6_info *leaf = rcu_dereference(fn->leaf); struct rt6key *key; /* This node is being deleted */ if (!leaf) { if (plen <= fn->fn_bit) goto out; else goto next; } key = (struct rt6key *)((u8 *)leaf + offset); /* * Prefix match */ if (plen < fn->fn_bit || !ipv6_prefix_equal(&key->addr, addr, fn->fn_bit)) goto out; if (plen == fn->fn_bit) return fn; if (fn->fn_flags & RTN_RTINFO) prev = fn; next: /* * We have more bits to go */ if (addr_bit_set(addr, fn->fn_bit)) fn = rcu_dereference(fn->right); else fn = rcu_dereference(fn->left); } out: if (exact_match) return NULL; else return prev; } struct fib6_node *fib6_locate(struct fib6_node *root, const struct in6_addr *daddr, int dst_len, const struct in6_addr *saddr, int src_len, bool exact_match) { struct fib6_node *fn; fn = fib6_locate_1(root, daddr, dst_len, offsetof(struct fib6_info, fib6_dst), exact_match); #ifdef CONFIG_IPV6_SUBTREES if (src_len) { WARN_ON(saddr == NULL); if (fn) { struct fib6_node *subtree = FIB6_SUBTREE(fn); if (subtree) { fn = fib6_locate_1(subtree, saddr, src_len, offsetof(struct fib6_info, fib6_src), exact_match); } } } #endif if (fn && fn->fn_flags & RTN_RTINFO) return fn; return NULL; } /* * Deletion * */ static struct fib6_info *fib6_find_prefix(struct net *net, struct fib6_table *table, struct fib6_node *fn) { struct fib6_node *child_left, *child_right; if (fn->fn_flags & RTN_ROOT) return net->ipv6.fib6_null_entry; while (fn) { child_left = rcu_dereference_protected(fn->left, lockdep_is_held(&table->tb6_lock)); child_right = rcu_dereference_protected(fn->right, lockdep_is_held(&table->tb6_lock)); if (child_left) return rcu_dereference_protected(child_left->leaf, lockdep_is_held(&table->tb6_lock)); if (child_right) return rcu_dereference_protected(child_right->leaf, lockdep_is_held(&table->tb6_lock)); fn = FIB6_SUBTREE(fn); } return NULL; } /* * Called to trim the tree of intermediate nodes when possible. "fn" * is the node we want to try and remove. * Need to own table->tb6_lock */ static struct fib6_node *fib6_repair_tree(struct net *net, struct fib6_table *table, struct fib6_node *fn) { int children; int nstate; struct fib6_node *child; struct fib6_walker *w; int iter = 0; /* Set fn->leaf to null_entry for root node. */ if (fn->fn_flags & RTN_TL_ROOT) { rcu_assign_pointer(fn->leaf, net->ipv6.fib6_null_entry); return fn; } for (;;) { struct fib6_node *fn_r = rcu_dereference_protected(fn->right, lockdep_is_held(&table->tb6_lock)); struct fib6_node *fn_l = rcu_dereference_protected(fn->left, lockdep_is_held(&table->tb6_lock)); struct fib6_node *pn = rcu_dereference_protected(fn->parent, lockdep_is_held(&table->tb6_lock)); struct fib6_node *pn_r = rcu_dereference_protected(pn->right, lockdep_is_held(&table->tb6_lock)); struct fib6_node *pn_l = rcu_dereference_protected(pn->left, lockdep_is_held(&table->tb6_lock)); struct fib6_info *fn_leaf = rcu_dereference_protected(fn->leaf, lockdep_is_held(&table->tb6_lock)); struct fib6_info *pn_leaf = rcu_dereference_protected(pn->leaf, lockdep_is_held(&table->tb6_lock)); struct fib6_info *new_fn_leaf; pr_debug("fixing tree: plen=%d iter=%d\n", fn->fn_bit, iter); iter++; WARN_ON(fn->fn_flags & RTN_RTINFO); WARN_ON(fn->fn_flags & RTN_TL_ROOT); WARN_ON(fn_leaf); children = 0; child = NULL; if (fn_r) { child = fn_r; children |= 1; } if (fn_l) { child = fn_l; children |= 2; } if (children == 3 || FIB6_SUBTREE(fn) #ifdef CONFIG_IPV6_SUBTREES /* Subtree root (i.e. fn) may have one child */ || (children && fn->fn_flags & RTN_ROOT) #endif ) { new_fn_leaf = fib6_find_prefix(net, table, fn); #if RT6_DEBUG >= 2 if (!new_fn_leaf) { WARN_ON(!new_fn_leaf); new_fn_leaf = net->ipv6.fib6_null_entry; } #endif fib6_info_hold(new_fn_leaf); rcu_assign_pointer(fn->leaf, new_fn_leaf); return pn; } #ifdef CONFIG_IPV6_SUBTREES if (FIB6_SUBTREE(pn) == fn) { WARN_ON(!(fn->fn_flags & RTN_ROOT)); RCU_INIT_POINTER(pn->subtree, NULL); nstate = FWS_L; } else { WARN_ON(fn->fn_flags & RTN_ROOT); #endif if (pn_r == fn) rcu_assign_pointer(pn->right, child); else if (pn_l == fn) rcu_assign_pointer(pn->left, child); #if RT6_DEBUG >= 2 else WARN_ON(1); #endif if (child) rcu_assign_pointer(child->parent, pn); nstate = FWS_R; #ifdef CONFIG_IPV6_SUBTREES } #endif read_lock(&net->ipv6.fib6_walker_lock); FOR_WALKERS(net, w) { if (!child) { if (w->node == fn) { pr_debug("W %p adjusted by delnode 1, s=%d/%d\n", w, w->state, nstate); w->node = pn; w->state = nstate; } } else { if (w->node == fn) { w->node = child; if (children&2) { pr_debug("W %p adjusted by delnode 2, s=%d\n", w, w->state); w->state = w->state >= FWS_R ? FWS_U : FWS_INIT; } else { pr_debug("W %p adjusted by delnode 2, s=%d\n", w, w->state); w->state = w->state >= FWS_C ? FWS_U : FWS_INIT; } } } } read_unlock(&net->ipv6.fib6_walker_lock); node_free(net, fn); if (pn->fn_flags & RTN_RTINFO || FIB6_SUBTREE(pn)) return pn; RCU_INIT_POINTER(pn->leaf, NULL); fib6_info_release(pn_leaf); fn = pn; } } static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn, struct fib6_info __rcu **rtp, struct nl_info *info) { struct fib6_info *leaf, *replace_rt = NULL; struct fib6_walker *w; struct fib6_info *rt = rcu_dereference_protected(*rtp, lockdep_is_held(&table->tb6_lock)); struct net *net = info->nl_net; bool notify_del = false; /* If the deleted route is the first in the node and it is not part of * a multipath route, then we need to replace it with the next route * in the node, if exists. */ leaf = rcu_dereference_protected(fn->leaf, lockdep_is_held(&table->tb6_lock)); if (leaf == rt && !rt->fib6_nsiblings) { if (rcu_access_pointer(rt->fib6_next)) replace_rt = rcu_dereference_protected(rt->fib6_next, lockdep_is_held(&table->tb6_lock)); else notify_del = true; } /* Unlink it */ *rtp = rt->fib6_next; rt->fib6_node = NULL; net->ipv6.rt6_stats->fib_rt_entries--; net->ipv6.rt6_stats->fib_discarded_routes++; /* Reset round-robin state, if necessary */ if (rcu_access_pointer(fn->rr_ptr) == rt) fn->rr_ptr = NULL; /* Remove this entry from other siblings */ if (rt->fib6_nsiblings) { struct fib6_info *sibling, *next_sibling; /* The route is deleted from a multipath route. If this * multipath route is the first route in the node, then we need * to emit a delete notification. Otherwise, we need to skip * the notification. */ if (rt->fib6_metric == leaf->fib6_metric && rt6_qualify_for_ecmp(leaf)) notify_del = true; list_for_each_entry_safe(sibling, next_sibling, &rt->fib6_siblings, fib6_siblings) sibling->fib6_nsiblings--; rt->fib6_nsiblings = 0; list_del_rcu(&rt->fib6_siblings); rt6_multipath_rebalance(next_sibling); } /* Adjust walkers */ read_lock(&net->ipv6.fib6_walker_lock); FOR_WALKERS(net, w) { if (w->state == FWS_C && w->leaf == rt) { pr_debug("walker %p adjusted by delroute\n", w); w->leaf = rcu_dereference_protected(rt->fib6_next, lockdep_is_held(&table->tb6_lock)); if (!w->leaf) w->state = FWS_U; } } read_unlock(&net->ipv6.fib6_walker_lock); /* If it was last route, call fib6_repair_tree() to: * 1. For root node, put back null_entry as how the table was created. * 2. For other nodes, expunge its radix tree node. */ if (!rcu_access_pointer(fn->leaf)) { if (!(fn->fn_flags & RTN_TL_ROOT)) { fn->fn_flags &= ~RTN_RTINFO; net->ipv6.rt6_stats->fib_route_nodes--; } fn = fib6_repair_tree(net, table, fn); } fib6_purge_rt(rt, fn, net); if (!info->skip_notify_kernel) { if (notify_del) call_fib6_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, rt, NULL); else if (replace_rt) call_fib6_entry_notifiers_replace(net, replace_rt); } if (!info->skip_notify) inet6_rt_notify(RTM_DELROUTE, rt, info, 0); fib6_info_release(rt); } /* Need to own table->tb6_lock */ int fib6_del(struct fib6_info *rt, struct nl_info *info) { struct net *net = info->nl_net; struct fib6_info __rcu **rtp; struct fib6_info __rcu **rtp_next; struct fib6_table *table; struct fib6_node *fn; if (rt == net->ipv6.fib6_null_entry) return -ENOENT; table = rt->fib6_table; fn = rcu_dereference_protected(rt->fib6_node, lockdep_is_held(&table->tb6_lock)); if (!fn) return -ENOENT; WARN_ON(!(fn->fn_flags & RTN_RTINFO)); /* * Walk the leaf entries looking for ourself */ for (rtp = &fn->leaf; *rtp; rtp = rtp_next) { struct fib6_info *cur = rcu_dereference_protected(*rtp, lockdep_is_held(&table->tb6_lock)); if (rt == cur) { if (fib6_requires_src(cur)) fib6_routes_require_src_dec(info->nl_net); fib6_del_route(table, fn, rtp, info); return 0; } rtp_next = &cur->fib6_next; } return -ENOENT; } /* * Tree traversal function. * * Certainly, it is not interrupt safe. * However, it is internally reenterable wrt itself and fib6_add/fib6_del. * It means, that we can modify tree during walking * and use this function for garbage collection, clone pruning, * cleaning tree when a device goes down etc. etc. * * It guarantees that every node will be traversed, * and that it will be traversed only once. * * Callback function w->func may return: * 0 -> continue walking. * positive value -> walking is suspended (used by tree dumps, * and probably by gc, if it will be split to several slices) * negative value -> terminate walking. * * The function itself returns: * 0 -> walk is complete. * >0 -> walk is incomplete (i.e. suspended) * <0 -> walk is terminated by an error. * * This function is called with tb6_lock held. */ static int fib6_walk_continue(struct fib6_walker *w) { struct fib6_node *fn, *pn, *left, *right; /* w->root should always be table->tb6_root */ WARN_ON_ONCE(!(w->root->fn_flags & RTN_TL_ROOT)); for (;;) { fn = w->node; if (!fn) return 0; switch (w->state) { #ifdef CONFIG_IPV6_SUBTREES case FWS_S: if (FIB6_SUBTREE(fn)) { w->node = FIB6_SUBTREE(fn); continue; } w->state = FWS_L; fallthrough; #endif case FWS_L: left = rcu_dereference_protected(fn->left, 1); if (left) { w->node = left; w->state = FWS_INIT; continue; } w->state = FWS_R; fallthrough; case FWS_R: right = rcu_dereference_protected(fn->right, 1); if (right) { w->node = right; w->state = FWS_INIT; continue; } w->state = FWS_C; w->leaf = rcu_dereference_protected(fn->leaf, 1); fallthrough; case FWS_C: if (w->leaf && fn->fn_flags & RTN_RTINFO) { int err; if (w->skip) { w->skip--; goto skip; } err = w->func(w); if (err) return err; w->count++; continue; } skip: w->state = FWS_U; fallthrough; case FWS_U: if (fn == w->root) return 0; pn = rcu_dereference_protected(fn->parent, 1); left = rcu_dereference_protected(pn->left, 1); right = rcu_dereference_protected(pn->right, 1); w->node = pn; #ifdef CONFIG_IPV6_SUBTREES if (FIB6_SUBTREE(pn) == fn) { WARN_ON(!(fn->fn_flags & RTN_ROOT)); w->state = FWS_L; continue; } #endif if (left == fn) { w->state = FWS_R; continue; } if (right == fn) { w->state = FWS_C; w->leaf = rcu_dereference_protected(w->node->leaf, 1); continue; } #if RT6_DEBUG >= 2 WARN_ON(1); #endif } } } static int fib6_walk(struct net *net, struct fib6_walker *w) { int res; w->state = FWS_INIT; w->node = w->root; fib6_walker_link(net, w); res = fib6_walk_continue(w); if (res <= 0) fib6_walker_unlink(net, w); return res; } static int fib6_clean_node(struct fib6_walker *w) { int res; struct fib6_info *rt; struct fib6_cleaner *c = container_of(w, struct fib6_cleaner, w); struct nl_info info = { .nl_net = c->net, .skip_notify = c->skip_notify, }; if (c->sernum != FIB6_NO_SERNUM_CHANGE && READ_ONCE(w->node->fn_sernum) != c->sernum) WRITE_ONCE(w->node->fn_sernum, c->sernum); if (!c->func) { WARN_ON_ONCE(c->sernum == FIB6_NO_SERNUM_CHANGE); w->leaf = NULL; return 0; } for_each_fib6_walker_rt(w) { res = c->func(rt, c->arg); if (res == -1) { w->leaf = rt; res = fib6_del(rt, &info); if (res) { #if RT6_DEBUG >= 2 pr_debug("%s: del failed: rt=%p@%p err=%d\n", __func__, rt, rcu_access_pointer(rt->fib6_node), res); #endif continue; } return 0; } else if (res == -2) { if (WARN_ON(!rt->fib6_nsiblings)) continue; rt = list_last_entry(&rt->fib6_siblings, struct fib6_info, fib6_siblings); continue; } WARN_ON(res != 0); } w->leaf = rt; return 0; } /* * Convenient frontend to tree walker. * * func is called on each route. * It may return -2 -> skip multipath route. * -1 -> delete this route. * 0 -> continue walking */ static void fib6_clean_tree(struct net *net, struct fib6_node *root, int (*func)(struct fib6_info *, void *arg), int sernum, void *arg, bool skip_notify) { struct fib6_cleaner c; c.w.root = root; c.w.func = fib6_clean_node; c.w.count = 0; c.w.skip = 0; c.w.skip_in_node = 0; c.func = func; c.sernum = sernum; c.arg = arg; c.net = net; c.skip_notify = skip_notify; fib6_walk(net, &c.w); } static void __fib6_clean_all(struct net *net, int (*func)(struct fib6_info *, void *), int sernum, void *arg, bool skip_notify) { struct fib6_table *table; struct hlist_head *head; unsigned int h; rcu_read_lock(); for (h = 0; h < FIB6_TABLE_HASHSZ; h++) { head = &net->ipv6.fib_table_hash[h]; hlist_for_each_entry_rcu(table, head, tb6_hlist) { spin_lock_bh(&table->tb6_lock); fib6_clean_tree(net, &table->tb6_root, func, sernum, arg, skip_notify); spin_unlock_bh(&table->tb6_lock); } } rcu_read_unlock(); } void fib6_clean_all(struct net *net, int (*func)(struct fib6_info *, void *), void *arg) { __fib6_clean_all(net, func, FIB6_NO_SERNUM_CHANGE, arg, false); } void fib6_clean_all_skip_notify(struct net *net, int (*func)(struct fib6_info *, void *), void *arg) { __fib6_clean_all(net, func, FIB6_NO_SERNUM_CHANGE, arg, true); } static void fib6_flush_trees(struct net *net) { int new_sernum = fib6_new_sernum(net); __fib6_clean_all(net, NULL, new_sernum, NULL, false); } /* * Garbage collection */ static int fib6_age(struct fib6_info *rt, struct fib6_gc_args *gc_args) { unsigned long now = jiffies; /* * check addrconf expiration here. * Routes are expired even if they are in use. */ if (rt->fib6_flags & RTF_EXPIRES && rt->expires) { if (time_after(now, rt->expires)) { pr_debug("expiring %p\n", rt); return -1; } gc_args->more++; } /* Also age clones in the exception table. * Note, that clones are aged out * only if they are not in use now. */ rt6_age_exceptions(rt, gc_args, now); return 0; } static void fib6_gc_table(struct net *net, struct fib6_table *tb6, struct fib6_gc_args *gc_args) { struct fib6_info *rt; struct hlist_node *n; struct nl_info info = { .nl_net = net, .skip_notify = false, }; hlist_for_each_entry_safe(rt, n, &tb6->tb6_gc_hlist, gc_link) if (fib6_age(rt, gc_args) == -1) fib6_del(rt, &info); } static void fib6_gc_all(struct net *net, struct fib6_gc_args *gc_args) { struct fib6_table *table; struct hlist_head *head; unsigned int h; rcu_read_lock(); for (h = 0; h < FIB6_TABLE_HASHSZ; h++) { head = &net->ipv6.fib_table_hash[h]; hlist_for_each_entry_rcu(table, head, tb6_hlist) { spin_lock_bh(&table->tb6_lock); fib6_gc_table(net, table, gc_args); spin_unlock_bh(&table->tb6_lock); } } rcu_read_unlock(); } void fib6_run_gc(unsigned long expires, struct net *net, bool force) { struct fib6_gc_args gc_args; unsigned long now; if (force) { spin_lock_bh(&net->ipv6.fib6_gc_lock); } else if (!spin_trylock_bh(&net->ipv6.fib6_gc_lock)) { mod_timer(&net->ipv6.ip6_fib_timer, jiffies + HZ); return; } gc_args.timeout = expires ? (int)expires : net->ipv6.sysctl.ip6_rt_gc_interval; gc_args.more = 0; fib6_gc_all(net, &gc_args); now = jiffies; net->ipv6.ip6_rt_last_gc = now; if (gc_args.more) mod_timer(&net->ipv6.ip6_fib_timer, round_jiffies(now + net->ipv6.sysctl.ip6_rt_gc_interval)); else timer_delete(&net->ipv6.ip6_fib_timer); spin_unlock_bh(&net->ipv6.fib6_gc_lock); } static void fib6_gc_timer_cb(struct timer_list *t) { struct net *arg = from_timer(arg, t, ipv6.ip6_fib_timer); fib6_run_gc(0, arg, true); } static int __net_init fib6_net_init(struct net *net) { size_t size = sizeof(struct hlist_head) * FIB6_TABLE_HASHSZ; int err; err = fib6_notifier_init(net); if (err) return err; /* Default to 3-tuple */ net->ipv6.sysctl.multipath_hash_fields = FIB_MULTIPATH_HASH_FIELD_DEFAULT_MASK; spin_lock_init(&net->ipv6.fib6_gc_lock); rwlock_init(&net->ipv6.fib6_walker_lock); INIT_LIST_HEAD(&net->ipv6.fib6_walkers); timer_setup(&net->ipv6.ip6_fib_timer, fib6_gc_timer_cb, 0); net->ipv6.rt6_stats = kzalloc(sizeof(*net->ipv6.rt6_stats), GFP_KERNEL); if (!net->ipv6.rt6_stats) goto out_notifier; /* Avoid false sharing : Use at least a full cache line */ size = max_t(size_t, size, L1_CACHE_BYTES); net->ipv6.fib_table_hash = kzalloc(size, GFP_KERNEL); if (!net->ipv6.fib_table_hash) goto out_rt6_stats; net->ipv6.fib6_main_tbl = kzalloc(sizeof(*net->ipv6.fib6_main_tbl), GFP_KERNEL); if (!net->ipv6.fib6_main_tbl) goto out_fib_table_hash; net->ipv6.fib6_main_tbl->tb6_id = RT6_TABLE_MAIN; rcu_assign_pointer(net->ipv6.fib6_main_tbl->tb6_root.leaf, net->ipv6.fib6_null_entry); net->ipv6.fib6_main_tbl->tb6_root.fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO; inet_peer_base_init(&net->ipv6.fib6_main_tbl->tb6_peers); INIT_HLIST_HEAD(&net->ipv6.fib6_main_tbl->tb6_gc_hlist); #ifdef CONFIG_IPV6_MULTIPLE_TABLES net->ipv6.fib6_local_tbl = kzalloc(sizeof(*net->ipv6.fib6_local_tbl), GFP_KERNEL); if (!net->ipv6.fib6_local_tbl) goto out_fib6_main_tbl; net->ipv6.fib6_local_tbl->tb6_id = RT6_TABLE_LOCAL; rcu_assign_pointer(net->ipv6.fib6_local_tbl->tb6_root.leaf, net->ipv6.fib6_null_entry); net->ipv6.fib6_local_tbl->tb6_root.fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO; inet_peer_base_init(&net->ipv6.fib6_local_tbl->tb6_peers); INIT_HLIST_HEAD(&net->ipv6.fib6_local_tbl->tb6_gc_hlist); #endif fib6_tables_init(net); return 0; #ifdef CONFIG_IPV6_MULTIPLE_TABLES out_fib6_main_tbl: kfree(net->ipv6.fib6_main_tbl); #endif out_fib_table_hash: kfree(net->ipv6.fib_table_hash); out_rt6_stats: kfree(net->ipv6.rt6_stats); out_notifier: fib6_notifier_exit(net); return -ENOMEM; } static void fib6_net_exit(struct net *net) { unsigned int i; timer_delete_sync(&net->ipv6.ip6_fib_timer); for (i = 0; i < FIB6_TABLE_HASHSZ; i++) { struct hlist_head *head = &net->ipv6.fib_table_hash[i]; struct hlist_node *tmp; struct fib6_table *tb; hlist_for_each_entry_safe(tb, tmp, head, tb6_hlist) { hlist_del(&tb->tb6_hlist); fib6_free_table(tb); } } kfree(net->ipv6.fib_table_hash); kfree(net->ipv6.rt6_stats); fib6_notifier_exit(net); } static struct pernet_operations fib6_net_ops = { .init = fib6_net_init, .exit = fib6_net_exit, }; static const struct rtnl_msg_handler fib6_rtnl_msg_handlers[] __initconst_or_module = { {.owner = THIS_MODULE, .protocol = PF_INET6, .msgtype = RTM_GETROUTE, .dumpit = inet6_dump_fib, .flags = RTNL_FLAG_DUMP_UNLOCKED | RTNL_FLAG_DUMP_SPLIT_NLM_DONE}, }; int __init fib6_init(void) { int ret = -ENOMEM; fib6_node_kmem = KMEM_CACHE(fib6_node, SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT); if (!fib6_node_kmem) goto out; ret = register_pernet_subsys(&fib6_net_ops); if (ret) goto out_kmem_cache_create; ret = rtnl_register_many(fib6_rtnl_msg_handlers); if (ret) goto out_unregister_subsys; __fib6_flush_trees = fib6_flush_trees; out: return ret; out_unregister_subsys: unregister_pernet_subsys(&fib6_net_ops); out_kmem_cache_create: kmem_cache_destroy(fib6_node_kmem); goto out; } void fib6_gc_cleanup(void) { unregister_pernet_subsys(&fib6_net_ops); kmem_cache_destroy(fib6_node_kmem); } #ifdef CONFIG_PROC_FS static int ipv6_route_native_seq_show(struct seq_file *seq, void *v) { struct fib6_info *rt = v; struct ipv6_route_iter *iter = seq->private; struct fib6_nh *fib6_nh = rt->fib6_nh; unsigned int flags = rt->fib6_flags; const struct net_device *dev; if (rt->nh) fib6_nh = nexthop_fib6_nh(rt->nh); seq_printf(seq, "%pi6 %02x ", &rt->fib6_dst.addr, rt->fib6_dst.plen); #ifdef CONFIG_IPV6_SUBTREES seq_printf(seq, "%pi6 %02x ", &rt->fib6_src.addr, rt->fib6_src.plen); #else seq_puts(seq, "00000000000000000000000000000000 00 "); #endif if (fib6_nh->fib_nh_gw_family) { flags |= RTF_GATEWAY; seq_printf(seq, "%pi6", &fib6_nh->fib_nh_gw6); } else { seq_puts(seq, "00000000000000000000000000000000"); } dev = fib6_nh->fib_nh_dev; seq_printf(seq, " %08x %08x %08x %08x %8s\n", rt->fib6_metric, refcount_read(&rt->fib6_ref), 0, flags, dev ? dev->name : ""); iter->w.leaf = NULL; return 0; } static int ipv6_route_yield(struct fib6_walker *w) { struct ipv6_route_iter *iter = w->args; if (!iter->skip) return 1; do { iter->w.leaf = rcu_dereference_protected( iter->w.leaf->fib6_next, lockdep_is_held(&iter->tbl->tb6_lock)); iter->skip--; if (!iter->skip && iter->w.leaf) return 1; } while (iter->w.leaf); return 0; } static void ipv6_route_seq_setup_walk(struct ipv6_route_iter *iter, struct net *net) { memset(&iter->w, 0, sizeof(iter->w)); iter->w.func = ipv6_route_yield; iter->w.root = &iter->tbl->tb6_root; iter->w.state = FWS_INIT; iter->w.node = iter->w.root; iter->w.args = iter; iter->sernum = READ_ONCE(iter->w.root->fn_sernum); INIT_LIST_HEAD(&iter->w.lh); fib6_walker_link(net, &iter->w); } static struct fib6_table *ipv6_route_seq_next_table(struct fib6_table *tbl, struct net *net) { unsigned int h; struct hlist_node *node; if (tbl) { h = (tbl->tb6_id & (FIB6_TABLE_HASHSZ - 1)) + 1; node = rcu_dereference(hlist_next_rcu(&tbl->tb6_hlist)); } else { h = 0; node = NULL; } while (!node && h < FIB6_TABLE_HASHSZ) { node = rcu_dereference( hlist_first_rcu(&net->ipv6.fib_table_hash[h++])); } return hlist_entry_safe(node, struct fib6_table, tb6_hlist); } static void ipv6_route_check_sernum(struct ipv6_route_iter *iter) { int sernum = READ_ONCE(iter->w.root->fn_sernum); if (iter->sernum != sernum) { iter->sernum = sernum; iter->w.state = FWS_INIT; iter->w.node = iter->w.root; WARN_ON(iter->w.skip); iter->w.skip = iter->w.count; } } static void *ipv6_route_seq_next(struct seq_file *seq, void *v, loff_t *pos) { int r; struct fib6_info *n; struct net *net = seq_file_net(seq); struct ipv6_route_iter *iter = seq->private; ++(*pos); if (!v) goto iter_table; n = rcu_dereference(((struct fib6_info *)v)->fib6_next); if (n) return n; iter_table: ipv6_route_check_sernum(iter); spin_lock_bh(&iter->tbl->tb6_lock); r = fib6_walk_continue(&iter->w); spin_unlock_bh(&iter->tbl->tb6_lock); if (r > 0) { return iter->w.leaf; } else if (r < 0) { fib6_walker_unlink(net, &iter->w); return NULL; } fib6_walker_unlink(net, &iter->w); iter->tbl = ipv6_route_seq_next_table(iter->tbl, net); if (!iter->tbl) return NULL; ipv6_route_seq_setup_walk(iter, net); goto iter_table; } static void *ipv6_route_seq_start(struct seq_file *seq, loff_t *pos) __acquires(RCU) { struct net *net = seq_file_net(seq); struct ipv6_route_iter *iter = seq->private; rcu_read_lock(); iter->tbl = ipv6_route_seq_next_table(NULL, net); iter->skip = *pos; if (iter->tbl) { loff_t p = 0; ipv6_route_seq_setup_walk(iter, net); return ipv6_route_seq_next(seq, NULL, &p); } else { return NULL; } } static bool ipv6_route_iter_active(struct ipv6_route_iter *iter) { struct fib6_walker *w = &iter->w; return w->node && !(w->state == FWS_U && w->node == w->root); } static void ipv6_route_native_seq_stop(struct seq_file *seq, void *v) __releases(RCU) { struct net *net = seq_file_net(seq); struct ipv6_route_iter *iter = seq->private; if (ipv6_route_iter_active(iter)) fib6_walker_unlink(net, &iter->w); rcu_read_unlock(); } #if IS_BUILTIN(CONFIG_IPV6) && defined(CONFIG_BPF_SYSCALL) static int ipv6_route_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta, void *v) { struct bpf_iter__ipv6_route ctx; ctx.meta = meta; ctx.rt = v; return bpf_iter_run_prog(prog, &ctx); } static int ipv6_route_seq_show(struct seq_file *seq, void *v) { struct ipv6_route_iter *iter = seq->private; struct bpf_iter_meta meta; struct bpf_prog *prog; int ret; meta.seq = seq; prog = bpf_iter_get_info(&meta, false); if (!prog) return ipv6_route_native_seq_show(seq, v); ret = ipv6_route_prog_seq_show(prog, &meta, v); iter->w.leaf = NULL; return ret; } static void ipv6_route_seq_stop(struct seq_file *seq, void *v) { struct bpf_iter_meta meta; struct bpf_prog *prog; if (!v) { meta.seq = seq; prog = bpf_iter_get_info(&meta, true); if (prog) (void)ipv6_route_prog_seq_show(prog, &meta, v); } ipv6_route_native_seq_stop(seq, v); } #else static int ipv6_route_seq_show(struct seq_file *seq, void *v) { return ipv6_route_native_seq_show(seq, v); } static void ipv6_route_seq_stop(struct seq_file *seq, void *v) { ipv6_route_native_seq_stop(seq, v); } #endif const struct seq_operations ipv6_route_seq_ops = { .start = ipv6_route_seq_start, .next = ipv6_route_seq_next, .stop = ipv6_route_seq_stop, .show = ipv6_route_seq_show }; #endif /* CONFIG_PROC_FS */
2 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef DRIVERS_PCI_H #define DRIVERS_PCI_H #include <linux/pci.h> struct pcie_tlp_log; /* Number of possible devfns: 0.0 to 1f.7 inclusive */ #define MAX_NR_DEVFNS 256 #define PCI_FIND_CAP_TTL 48 #define PCI_VSEC_ID_INTEL_TBT 0x1234 /* Thunderbolt */ #define PCIE_LINK_RETRAIN_TIMEOUT_MS 1000 /* * Power stable to PERST# inactive. * * See the "Power Sequencing and Reset Signal Timings" table of the PCI Express * Card Electromechanical Specification, Revision 5.1, Section 2.9.2, Symbol * "T_PVPERL". */ #define PCIE_T_PVPERL_MS 100 /* * REFCLK stable before PERST# inactive. * * See the "Power Sequencing and Reset Signal Timings" table of the PCI Express * Card Electromechanical Specification, Revision 5.1, Section 2.9.2, Symbol * "T_PERST-CLK". */ #define PCIE_T_PERST_CLK_US 100 /* * End of conventional reset (PERST# de-asserted) to first configuration * request (device able to respond with a "Request Retry Status" completion), * from PCIe r6.0, sec 6.6.1. */ #define PCIE_T_RRS_READY_MS 100 /* * PCIe r6.0, sec 5.3.3.2.1 <PME Synchronization> * Recommends 1ms to 10ms timeout to check L2 ready. */ #define PCIE_PME_TO_L2_TIMEOUT_US 10000 /* * PCIe r6.0, sec 6.6.1 <Conventional Reset> * * - "With a Downstream Port that does not support Link speeds greater * than 5.0 GT/s, software must wait a minimum of 100 ms following exit * from a Conventional Reset before sending a Configuration Request to * the device immediately below that Port." * * - "With a Downstream Port that supports Link speeds greater than * 5.0 GT/s, software must wait a minimum of 100 ms after Link training * completes before sending a Configuration Request to the device * immediately below that Port." */ #define PCIE_RESET_CONFIG_DEVICE_WAIT_MS 100 /* Message Routing (r[2:0]); PCIe r6.0, sec 2.2.8 */ #define PCIE_MSG_TYPE_R_RC 0 #define PCIE_MSG_TYPE_R_ADDR 1 #define PCIE_MSG_TYPE_R_ID 2 #define PCIE_MSG_TYPE_R_BC 3 #define PCIE_MSG_TYPE_R_LOCAL 4 #define PCIE_MSG_TYPE_R_GATHER 5 /* Power Management Messages; PCIe r6.0, sec 2.2.8.2 */ #define PCIE_MSG_CODE_PME_TURN_OFF 0x19 /* INTx Mechanism Messages; PCIe r6.0, sec 2.2.8.1 */ #define PCIE_MSG_CODE_ASSERT_INTA 0x20 #define PCIE_MSG_CODE_ASSERT_INTB 0x21 #define PCIE_MSG_CODE_ASSERT_INTC 0x22 #define PCIE_MSG_CODE_ASSERT_INTD 0x23 #define PCIE_MSG_CODE_DEASSERT_INTA 0x24 #define PCIE_MSG_CODE_DEASSERT_INTB 0x25 #define PCIE_MSG_CODE_DEASSERT_INTC 0x26 #define PCIE_MSG_CODE_DEASSERT_INTD 0x27 extern const unsigned char pcie_link_speed[]; extern bool pci_early_dump; bool pcie_cap_has_lnkctl(const struct pci_dev *dev); bool pcie_cap_has_lnkctl2(const struct pci_dev *dev); bool pcie_cap_has_rtctl(const struct pci_dev *dev); /* Functions internal to the PCI core code */ #ifdef CONFIG_DMI extern const struct attribute_group pci_dev_smbios_attr_group; #endif enum pci_mmap_api { PCI_MMAP_SYSFS, /* mmap on /sys/bus/pci/devices/<BDF>/resource<N> */ PCI_MMAP_PROCFS /* mmap on /proc/bus/pci/<BDF> */ }; int pci_mmap_fits(struct pci_dev *pdev, int resno, struct vm_area_struct *vmai, enum pci_mmap_api mmap_api); bool pci_reset_supported(struct pci_dev *dev); void pci_init_reset_methods(struct pci_dev *dev); int pci_bridge_secondary_bus_reset(struct pci_dev *dev); int pci_bus_error_reset(struct pci_dev *dev); int __pci_reset_bus(struct pci_bus *bus); struct pci_cap_saved_data { u16 cap_nr; bool cap_extended; unsigned int size; u32 data[]; }; struct pci_cap_saved_state { struct hlist_node next; struct pci_cap_saved_data cap; }; void pci_allocate_cap_save_buffers(struct pci_dev *dev); void pci_free_cap_save_buffers(struct pci_dev *dev); int pci_add_cap_save_buffer(struct pci_dev *dev, char cap, unsigned int size); int pci_add_ext_cap_save_buffer(struct pci_dev *dev, u16 cap, unsigned int size); struct pci_cap_saved_state *pci_find_saved_cap(struct pci_dev *dev, char cap); struct pci_cap_saved_state *pci_find_saved_ext_cap(struct pci_dev *dev, u16 cap); #define PCI_PM_D2_DELAY 200 /* usec; see PCIe r4.0, sec 5.9.1 */ #define PCI_PM_D3HOT_WAIT 10 /* msec */ #define PCI_PM_D3COLD_WAIT 100 /* msec */ void pci_update_current_state(struct pci_dev *dev, pci_power_t state); void pci_refresh_power_state(struct pci_dev *dev); int pci_power_up(struct pci_dev *dev); void pci_disable_enabled_device(struct pci_dev *dev); int pci_finish_runtime_suspend(struct pci_dev *dev); void pcie_clear_device_status(struct pci_dev *dev); void pcie_clear_root_pme_status(struct pci_dev *dev); bool pci_check_pme_status(struct pci_dev *dev); void pci_pme_wakeup_bus(struct pci_bus *bus); void pci_pme_restore(struct pci_dev *dev); bool pci_dev_need_resume(struct pci_dev *dev); void pci_dev_adjust_pme(struct pci_dev *dev); void pci_dev_complete_resume(struct pci_dev *pci_dev); void pci_config_pm_runtime_get(struct pci_dev *dev); void pci_config_pm_runtime_put(struct pci_dev *dev); void pci_pm_init(struct pci_dev *dev); void pci_ea_init(struct pci_dev *dev); void pci_msi_init(struct pci_dev *dev); void pci_msix_init(struct pci_dev *dev); bool pci_bridge_d3_possible(struct pci_dev *dev); void pci_bridge_d3_update(struct pci_dev *dev); int pci_bridge_wait_for_secondary_bus(struct pci_dev *dev, char *reset_type); static inline bool pci_bus_rrs_vendor_id(u32 l) { return (l & 0xffff) == PCI_VENDOR_ID_PCI_SIG; } static inline void pci_wakeup_event(struct pci_dev *dev) { /* Wait 100 ms before the system can be put into a sleep state. */ pm_wakeup_event(&dev->dev, 100); } /** * pci_bar_index_is_valid - Check whether a BAR index is within valid range * @bar: BAR index * * Protects against overflowing &struct pci_dev.resource array. * * Return: true for valid index, false otherwise. */ static inline bool pci_bar_index_is_valid(int bar) { if (bar >= 0 && bar < PCI_NUM_RESOURCES) return true; return false; } static inline bool pci_has_subordinate(struct pci_dev *pci_dev) { return !!(pci_dev->subordinate); } static inline bool pci_power_manageable(struct pci_dev *pci_dev) { /* * Currently we allow normal PCI devices and PCI bridges transition * into D3 if their bridge_d3 is set. */ return !pci_has_subordinate(pci_dev) || pci_dev->bridge_d3; } static inline bool pcie_downstream_port(const struct pci_dev *dev) { int type = pci_pcie_type(dev); return type == PCI_EXP_TYPE_ROOT_PORT || type == PCI_EXP_TYPE_DOWNSTREAM || type == PCI_EXP_TYPE_PCIE_BRIDGE; } void pci_vpd_init(struct pci_dev *dev); extern const struct attribute_group pci_dev_vpd_attr_group; /* PCI Virtual Channel */ int pci_save_vc_state(struct pci_dev *dev); void pci_restore_vc_state(struct pci_dev *dev); void pci_allocate_vc_save_buffers(struct pci_dev *dev); /* PCI /proc functions */ #ifdef CONFIG_PROC_FS int pci_proc_attach_device(struct pci_dev *dev); int pci_proc_detach_device(struct pci_dev *dev); int pci_proc_detach_bus(struct pci_bus *bus); #else static inline int pci_proc_attach_device(struct pci_dev *dev) { return 0; } static inline int pci_proc_detach_device(struct pci_dev *dev) { return 0; } static inline int pci_proc_detach_bus(struct pci_bus *bus) { return 0; } #endif /* Functions for PCI Hotplug drivers to use */ int pci_hp_add_bridge(struct pci_dev *dev); #if defined(CONFIG_SYSFS) && defined(HAVE_PCI_LEGACY) void pci_create_legacy_files(struct pci_bus *bus); void pci_remove_legacy_files(struct pci_bus *bus); #else static inline void pci_create_legacy_files(struct pci_bus *bus) { } static inline void pci_remove_legacy_files(struct pci_bus *bus) { } #endif /* Lock for read/write access to pci device and bus lists */ extern struct rw_semaphore pci_bus_sem; extern struct mutex pci_slot_mutex; extern raw_spinlock_t pci_lock; extern unsigned int pci_pm_d3hot_delay; #ifdef CONFIG_PCI_MSI void pci_no_msi(void); #else static inline void pci_no_msi(void) { } #endif void pci_realloc_get_opt(char *); static inline int pci_no_d1d2(struct pci_dev *dev) { unsigned int parent_dstates = 0; if (dev->bus->self) parent_dstates = dev->bus->self->no_d1d2; return (dev->no_d1d2 || parent_dstates); } #ifdef CONFIG_SYSFS int pci_create_sysfs_dev_files(struct pci_dev *pdev); void pci_remove_sysfs_dev_files(struct pci_dev *pdev); extern const struct attribute_group *pci_dev_groups[]; extern const struct attribute_group *pci_dev_attr_groups[]; extern const struct attribute_group *pcibus_groups[]; extern const struct attribute_group *pci_bus_groups[]; extern const struct attribute_group pci_doe_sysfs_group; #else static inline int pci_create_sysfs_dev_files(struct pci_dev *pdev) { return 0; } static inline void pci_remove_sysfs_dev_files(struct pci_dev *pdev) { } #define pci_dev_groups NULL #define pci_dev_attr_groups NULL #define pcibus_groups NULL #define pci_bus_groups NULL #endif extern unsigned long pci_hotplug_io_size; extern unsigned long pci_hotplug_mmio_size; extern unsigned long pci_hotplug_mmio_pref_size; extern unsigned long pci_hotplug_bus_size; extern unsigned long pci_cardbus_io_size; extern unsigned long pci_cardbus_mem_size; /** * pci_match_one_device - Tell if a PCI device structure has a matching * PCI device id structure * @id: single PCI device id structure to match * @dev: the PCI device structure to match against * * Returns the matching pci_device_id structure or %NULL if there is no match. */ static inline const struct pci_device_id * pci_match_one_device(const struct pci_device_id *id, const struct pci_dev *dev) { if ((id->vendor == PCI_ANY_ID || id->vendor == dev->vendor) && (id->device == PCI_ANY_ID || id->device == dev->device) && (id->subvendor == PCI_ANY_ID || id->subvendor == dev->subsystem_vendor) && (id->subdevice == PCI_ANY_ID || id->subdevice == dev->subsystem_device) && !((id->class ^ dev->class) & id->class_mask)) return id; return NULL; } /* PCI slot sysfs helper code */ #define to_pci_slot(s) container_of(s, struct pci_slot, kobj) extern struct kset *pci_slots_kset; struct pci_slot_attribute { struct attribute attr; ssize_t (*show)(struct pci_slot *, char *); ssize_t (*store)(struct pci_slot *, const char *, size_t); }; #define to_pci_slot_attr(s) container_of(s, struct pci_slot_attribute, attr) enum pci_bar_type { pci_bar_unknown, /* Standard PCI BAR probe */ pci_bar_io, /* An I/O port BAR */ pci_bar_mem32, /* A 32-bit memory BAR */ pci_bar_mem64, /* A 64-bit memory BAR */ }; struct device *pci_get_host_bridge_device(struct pci_dev *dev); void pci_put_host_bridge_device(struct device *dev); unsigned int pci_rescan_bus_bridge_resize(struct pci_dev *bridge); int pci_reassign_bridge_resources(struct pci_dev *bridge, unsigned long type); int __must_check pci_reassign_resource(struct pci_dev *dev, int i, resource_size_t add_size, resource_size_t align); int pci_configure_extended_tags(struct pci_dev *dev, void *ign); bool pci_bus_read_dev_vendor_id(struct pci_bus *bus, int devfn, u32 *pl, int rrs_timeout); bool pci_bus_generic_read_dev_vendor_id(struct pci_bus *bus, int devfn, u32 *pl, int rrs_timeout); int pci_idt_bus_quirk(struct pci_bus *bus, int devfn, u32 *pl, int rrs_timeout); int pci_setup_device(struct pci_dev *dev); void __pci_size_stdbars(struct pci_dev *dev, int count, unsigned int pos, u32 *sizes); int __pci_read_base(struct pci_dev *dev, enum pci_bar_type type, struct resource *res, unsigned int reg, u32 *sizes); void pci_configure_ari(struct pci_dev *dev); void __pci_bus_size_bridges(struct pci_bus *bus, struct list_head *realloc_head); void __pci_bus_assign_resources(const struct pci_bus *bus, struct list_head *realloc_head, struct list_head *fail_head); bool pci_bus_clip_resource(struct pci_dev *dev, int idx); void pci_walk_bus_locked(struct pci_bus *top, int (*cb)(struct pci_dev *, void *), void *userdata); const char *pci_resource_name(struct pci_dev *dev, unsigned int i); bool pci_resource_is_optional(const struct pci_dev *dev, int resno); /** * pci_resource_num - Reverse lookup resource number from device resources * @dev: PCI device * @res: Resource to lookup index for (MUST be a @dev's resource) * * Perform reverse lookup to determine the resource number for @res within * @dev resource array. NOTE: The caller is responsible for ensuring @res is * among @dev's resources! * * Returns: resource number. */ static inline int pci_resource_num(const struct pci_dev *dev, const struct resource *res) { int resno = res - &dev->resource[0]; /* Passing a resource that is not among dev's resources? */ WARN_ON_ONCE(resno >= PCI_NUM_RESOURCES); return resno; } void pci_reassigndev_resource_alignment(struct pci_dev *dev); void pci_disable_bridge_window(struct pci_dev *dev); struct pci_bus *pci_bus_get(struct pci_bus *bus); void pci_bus_put(struct pci_bus *bus); #define PCIE_LNKCAP_SLS2SPEED(lnkcap) \ ({ \ ((lnkcap) == PCI_EXP_LNKCAP_SLS_64_0GB ? PCIE_SPEED_64_0GT : \ (lnkcap) == PCI_EXP_LNKCAP_SLS_32_0GB ? PCIE_SPEED_32_0GT : \ (lnkcap) == PCI_EXP_LNKCAP_SLS_16_0GB ? PCIE_SPEED_16_0GT : \ (lnkcap) == PCI_EXP_LNKCAP_SLS_8_0GB ? PCIE_SPEED_8_0GT : \ (lnkcap) == PCI_EXP_LNKCAP_SLS_5_0GB ? PCIE_SPEED_5_0GT : \ (lnkcap) == PCI_EXP_LNKCAP_SLS_2_5GB ? PCIE_SPEED_2_5GT : \ PCI_SPEED_UNKNOWN); \ }) /* PCIe link information from Link Capabilities 2 */ #define PCIE_LNKCAP2_SLS2SPEED(lnkcap2) \ ((lnkcap2) & PCI_EXP_LNKCAP2_SLS_64_0GB ? PCIE_SPEED_64_0GT : \ (lnkcap2) & PCI_EXP_LNKCAP2_SLS_32_0GB ? PCIE_SPEED_32_0GT : \ (lnkcap2) & PCI_EXP_LNKCAP2_SLS_16_0GB ? PCIE_SPEED_16_0GT : \ (lnkcap2) & PCI_EXP_LNKCAP2_SLS_8_0GB ? PCIE_SPEED_8_0GT : \ (lnkcap2) & PCI_EXP_LNKCAP2_SLS_5_0GB ? PCIE_SPEED_5_0GT : \ (lnkcap2) & PCI_EXP_LNKCAP2_SLS_2_5GB ? PCIE_SPEED_2_5GT : \ PCI_SPEED_UNKNOWN) #define PCIE_LNKCTL2_TLS2SPEED(lnkctl2) \ ((lnkctl2) == PCI_EXP_LNKCTL2_TLS_64_0GT ? PCIE_SPEED_64_0GT : \ (lnkctl2) == PCI_EXP_LNKCTL2_TLS_32_0GT ? PCIE_SPEED_32_0GT : \ (lnkctl2) == PCI_EXP_LNKCTL2_TLS_16_0GT ? PCIE_SPEED_16_0GT : \ (lnkctl2) == PCI_EXP_LNKCTL2_TLS_8_0GT ? PCIE_SPEED_8_0GT : \ (lnkctl2) == PCI_EXP_LNKCTL2_TLS_5_0GT ? PCIE_SPEED_5_0GT : \ (lnkctl2) == PCI_EXP_LNKCTL2_TLS_2_5GT ? PCIE_SPEED_2_5GT : \ PCI_SPEED_UNKNOWN) /* PCIe speed to Mb/s reduced by encoding overhead */ #define PCIE_SPEED2MBS_ENC(speed) \ ((speed) == PCIE_SPEED_64_0GT ? 64000*1/1 : \ (speed) == PCIE_SPEED_32_0GT ? 32000*128/130 : \ (speed) == PCIE_SPEED_16_0GT ? 16000*128/130 : \ (speed) == PCIE_SPEED_8_0GT ? 8000*128/130 : \ (speed) == PCIE_SPEED_5_0GT ? 5000*8/10 : \ (speed) == PCIE_SPEED_2_5GT ? 2500*8/10 : \ 0) static inline int pcie_dev_speed_mbps(enum pci_bus_speed speed) { switch (speed) { case PCIE_SPEED_2_5GT: return 2500; case PCIE_SPEED_5_0GT: return 5000; case PCIE_SPEED_8_0GT: return 8000; case PCIE_SPEED_16_0GT: return 16000; case PCIE_SPEED_32_0GT: return 32000; case PCIE_SPEED_64_0GT: return 64000; default: break; } return -EINVAL; } u8 pcie_get_supported_speeds(struct pci_dev *dev); const char *pci_speed_string(enum pci_bus_speed speed); void __pcie_print_link_status(struct pci_dev *dev, bool verbose); void pcie_report_downtraining(struct pci_dev *dev); static inline void __pcie_update_link_speed(struct pci_bus *bus, u16 linksta, u16 linksta2) { bus->cur_bus_speed = pcie_link_speed[linksta & PCI_EXP_LNKSTA_CLS]; bus->flit_mode = (linksta2 & PCI_EXP_LNKSTA2_FLIT) ? 1 : 0; } void pcie_update_link_speed(struct pci_bus *bus); /* Single Root I/O Virtualization */ struct pci_sriov { int pos; /* Capability position */ int nres; /* Number of resources */ u32 cap; /* SR-IOV Capabilities */ u16 ctrl; /* SR-IOV Control */ u16 total_VFs; /* Total VFs associated with the PF */ u16 initial_VFs; /* Initial VFs associated with the PF */ u16 num_VFs; /* Number of VFs available */ u16 offset; /* First VF Routing ID offset */ u16 stride; /* Following VF stride */ u16 vf_device; /* VF device ID */ u32 pgsz; /* Page size for BAR alignment */ u8 link; /* Function Dependency Link */ u8 max_VF_buses; /* Max buses consumed by VFs */ u16 driver_max_VFs; /* Max num VFs driver supports */ struct pci_dev *dev; /* Lowest numbered PF */ struct pci_dev *self; /* This PF */ u32 class; /* VF device */ u8 hdr_type; /* VF header type */ u16 subsystem_vendor; /* VF subsystem vendor */ u16 subsystem_device; /* VF subsystem device */ resource_size_t barsz[PCI_SRIOV_NUM_BARS]; /* VF BAR size */ bool drivers_autoprobe; /* Auto probing of VFs by driver */ }; #ifdef CONFIG_PCI_DOE void pci_doe_init(struct pci_dev *pdev); void pci_doe_destroy(struct pci_dev *pdev); void pci_doe_disconnected(struct pci_dev *pdev); #else static inline void pci_doe_init(struct pci_dev *pdev) { } static inline void pci_doe_destroy(struct pci_dev *pdev) { } static inline void pci_doe_disconnected(struct pci_dev *pdev) { } #endif #ifdef CONFIG_PCI_NPEM void pci_npem_create(struct pci_dev *dev); void pci_npem_remove(struct pci_dev *dev); #else static inline void pci_npem_create(struct pci_dev *dev) { } static inline void pci_npem_remove(struct pci_dev *dev) { } #endif #if defined(CONFIG_PCI_DOE) && defined(CONFIG_SYSFS) void pci_doe_sysfs_init(struct pci_dev *pci_dev); void pci_doe_sysfs_teardown(struct pci_dev *pdev); #else static inline void pci_doe_sysfs_init(struct pci_dev *pdev) { } static inline void pci_doe_sysfs_teardown(struct pci_dev *pdev) { } #endif /** * pci_dev_set_io_state - Set the new error state if possible. * * @dev: PCI device to set new error_state * @new: the state we want dev to be in * * If the device is experiencing perm_failure, it has to remain in that state. * Any other transition is allowed. * * Returns true if state has been changed to the requested state. */ static inline bool pci_dev_set_io_state(struct pci_dev *dev, pci_channel_state_t new) { pci_channel_state_t old; switch (new) { case pci_channel_io_perm_failure: xchg(&dev->error_state, pci_channel_io_perm_failure); return true; case pci_channel_io_frozen: old = cmpxchg(&dev->error_state, pci_channel_io_normal, pci_channel_io_frozen); return old != pci_channel_io_perm_failure; case pci_channel_io_normal: old = cmpxchg(&dev->error_state, pci_channel_io_frozen, pci_channel_io_normal); return old != pci_channel_io_perm_failure; default: return false; } } static inline int pci_dev_set_disconnected(struct pci_dev *dev, void *unused) { pci_dev_set_io_state(dev, pci_channel_io_perm_failure); pci_doe_disconnected(dev); return 0; } /* pci_dev priv_flags */ #define PCI_DEV_ADDED 0 #define PCI_DPC_RECOVERED 1 #define PCI_DPC_RECOVERING 2 #define PCI_DEV_REMOVED 3 static inline void pci_dev_assign_added(struct pci_dev *dev) { smp_mb__before_atomic(); set_bit(PCI_DEV_ADDED, &dev->priv_flags); smp_mb__after_atomic(); } static inline bool pci_dev_test_and_clear_added(struct pci_dev *dev) { return test_and_clear_bit(PCI_DEV_ADDED, &dev->priv_flags); } static inline bool pci_dev_is_added(const struct pci_dev *dev) { return test_bit(PCI_DEV_ADDED, &dev->priv_flags); } static inline bool pci_dev_test_and_set_removed(struct pci_dev *dev) { return test_and_set_bit(PCI_DEV_REMOVED, &dev->priv_flags); } #ifdef CONFIG_PCIEAER #include <linux/aer.h> #define AER_MAX_MULTI_ERR_DEVICES 5 /* Not likely to have more */ struct aer_err_info { struct pci_dev *dev[AER_MAX_MULTI_ERR_DEVICES]; int error_dev_num; unsigned int id:16; unsigned int severity:2; /* 0:NONFATAL | 1:FATAL | 2:COR */ unsigned int __pad1:5; unsigned int multi_error_valid:1; unsigned int first_error:5; unsigned int __pad2:2; unsigned int tlp_header_valid:1; unsigned int status; /* COR/UNCOR Error Status */ unsigned int mask; /* COR/UNCOR Error Mask */ struct pcie_tlp_log tlp; /* TLP Header */ }; int aer_get_device_error_info(struct pci_dev *dev, struct aer_err_info *info); void aer_print_error(struct pci_dev *dev, struct aer_err_info *info); int pcie_read_tlp_log(struct pci_dev *dev, int where, int where2, unsigned int tlp_len, bool flit, struct pcie_tlp_log *log); unsigned int aer_tlp_log_len(struct pci_dev *dev, u32 aercc); void pcie_print_tlp_log(const struct pci_dev *dev, const struct pcie_tlp_log *log, const char *pfx); #endif /* CONFIG_PCIEAER */ #ifdef CONFIG_PCIEPORTBUS /* Cached RCEC Endpoint Association */ struct rcec_ea { u8 nextbusn; u8 lastbusn; u32 bitmap; }; #endif #ifdef CONFIG_PCIE_DPC void pci_save_dpc_state(struct pci_dev *dev); void pci_restore_dpc_state(struct pci_dev *dev); void pci_dpc_init(struct pci_dev *pdev); void dpc_process_error(struct pci_dev *pdev); pci_ers_result_t dpc_reset_link(struct pci_dev *pdev); bool pci_dpc_recovered(struct pci_dev *pdev); unsigned int dpc_tlp_log_len(struct pci_dev *dev); #else static inline void pci_save_dpc_state(struct pci_dev *dev) { } static inline void pci_restore_dpc_state(struct pci_dev *dev) { } static inline void pci_dpc_init(struct pci_dev *pdev) { } static inline bool pci_dpc_recovered(struct pci_dev *pdev) { return false; } #endif #ifdef CONFIG_PCIEPORTBUS void pci_rcec_init(struct pci_dev *dev); void pci_rcec_exit(struct pci_dev *dev); void pcie_link_rcec(struct pci_dev *rcec); void pcie_walk_rcec(struct pci_dev *rcec, int (*cb)(struct pci_dev *, void *), void *userdata); #else static inline void pci_rcec_init(struct pci_dev *dev) { } static inline void pci_rcec_exit(struct pci_dev *dev) { } static inline void pcie_link_rcec(struct pci_dev *rcec) { } static inline void pcie_walk_rcec(struct pci_dev *rcec, int (*cb)(struct pci_dev *, void *), void *userdata) { } #endif #ifdef CONFIG_PCI_ATS /* Address Translation Service */ void pci_ats_init(struct pci_dev *dev); void pci_restore_ats_state(struct pci_dev *dev); #else static inline void pci_ats_init(struct pci_dev *d) { } static inline void pci_restore_ats_state(struct pci_dev *dev) { } #endif /* CONFIG_PCI_ATS */ #ifdef CONFIG_PCI_PRI void pci_pri_init(struct pci_dev *dev); void pci_restore_pri_state(struct pci_dev *pdev); #else static inline void pci_pri_init(struct pci_dev *dev) { } static inline void pci_restore_pri_state(struct pci_dev *pdev) { } #endif #ifdef CONFIG_PCI_PASID void pci_pasid_init(struct pci_dev *dev); void pci_restore_pasid_state(struct pci_dev *pdev); #else static inline void pci_pasid_init(struct pci_dev *dev) { } static inline void pci_restore_pasid_state(struct pci_dev *pdev) { } #endif #ifdef CONFIG_PCI_IOV int pci_iov_init(struct pci_dev *dev); void pci_iov_release(struct pci_dev *dev); void pci_iov_remove(struct pci_dev *dev); void pci_iov_update_resource(struct pci_dev *dev, int resno); resource_size_t pci_sriov_resource_alignment(struct pci_dev *dev, int resno); void pci_restore_iov_state(struct pci_dev *dev); int pci_iov_bus_range(struct pci_bus *bus); static inline bool pci_resource_is_iov(int resno) { return resno >= PCI_IOV_RESOURCES && resno <= PCI_IOV_RESOURCE_END; } extern const struct attribute_group sriov_pf_dev_attr_group; extern const struct attribute_group sriov_vf_dev_attr_group; #else static inline int pci_iov_init(struct pci_dev *dev) { return -ENODEV; } static inline void pci_iov_release(struct pci_dev *dev) { } static inline void pci_iov_remove(struct pci_dev *dev) { } static inline void pci_iov_update_resource(struct pci_dev *dev, int resno) { } static inline resource_size_t pci_sriov_resource_alignment(struct pci_dev *dev, int resno) { return 0; } static inline void pci_restore_iov_state(struct pci_dev *dev) { } static inline int pci_iov_bus_range(struct pci_bus *bus) { return 0; } static inline bool pci_resource_is_iov(int resno) { return false; } #endif /* CONFIG_PCI_IOV */ #ifdef CONFIG_PCIE_TPH void pci_restore_tph_state(struct pci_dev *dev); void pci_save_tph_state(struct pci_dev *dev); void pci_no_tph(void); void pci_tph_init(struct pci_dev *dev); #else static inline void pci_restore_tph_state(struct pci_dev *dev) { } static inline void pci_save_tph_state(struct pci_dev *dev) { } static inline void pci_no_tph(void) { } static inline void pci_tph_init(struct pci_dev *dev) { } #endif #ifdef CONFIG_PCIE_PTM void pci_ptm_init(struct pci_dev *dev); void pci_save_ptm_state(struct pci_dev *dev); void pci_restore_ptm_state(struct pci_dev *dev); void pci_suspend_ptm(struct pci_dev *dev); void pci_resume_ptm(struct pci_dev *dev); #else static inline void pci_ptm_init(struct pci_dev *dev) { } static inline void pci_save_ptm_state(struct pci_dev *dev) { } static inline void pci_restore_ptm_state(struct pci_dev *dev) { } static inline void pci_suspend_ptm(struct pci_dev *dev) { } static inline void pci_resume_ptm(struct pci_dev *dev) { } #endif unsigned long pci_cardbus_resource_alignment(struct resource *); static inline resource_size_t pci_resource_alignment(struct pci_dev *dev, struct resource *res) { int resno = pci_resource_num(dev, res); if (pci_resource_is_iov(resno)) return pci_sriov_resource_alignment(dev, resno); if (dev->class >> 8 == PCI_CLASS_BRIDGE_CARDBUS) return pci_cardbus_resource_alignment(res); return resource_alignment(res); } void pci_acs_init(struct pci_dev *dev); #ifdef CONFIG_PCI_QUIRKS int pci_dev_specific_acs_enabled(struct pci_dev *dev, u16 acs_flags); int pci_dev_specific_enable_acs(struct pci_dev *dev); int pci_dev_specific_disable_acs_redir(struct pci_dev *dev); int pcie_failed_link_retrain(struct pci_dev *dev); #else static inline int pci_dev_specific_acs_enabled(struct pci_dev *dev, u16 acs_flags) { return -ENOTTY; } static inline int pci_dev_specific_enable_acs(struct pci_dev *dev) { return -ENOTTY; } static inline int pci_dev_specific_disable_acs_redir(struct pci_dev *dev) { return -ENOTTY; } static inline int pcie_failed_link_retrain(struct pci_dev *dev) { return -ENOTTY; } #endif /* PCI error reporting and recovery */ pci_ers_result_t pcie_do_recovery(struct pci_dev *dev, pci_channel_state_t state, pci_ers_result_t (*reset_subordinates)(struct pci_dev *pdev)); bool pcie_wait_for_link(struct pci_dev *pdev, bool active); int pcie_retrain_link(struct pci_dev *pdev, bool use_lt); /* ASPM-related functionality we need even without CONFIG_PCIEASPM */ void pci_save_ltr_state(struct pci_dev *dev); void pci_restore_ltr_state(struct pci_dev *dev); void pci_configure_aspm_l1ss(struct pci_dev *dev); void pci_save_aspm_l1ss_state(struct pci_dev *dev); void pci_restore_aspm_l1ss_state(struct pci_dev *dev); #ifdef CONFIG_PCIEASPM void pcie_aspm_init_link_state(struct pci_dev *pdev); void pcie_aspm_exit_link_state(struct pci_dev *pdev); void pcie_aspm_pm_state_change(struct pci_dev *pdev, bool locked); void pcie_aspm_powersave_config_link(struct pci_dev *pdev); void pci_configure_ltr(struct pci_dev *pdev); void pci_bridge_reconfigure_ltr(struct pci_dev *pdev); #else static inline void pcie_aspm_init_link_state(struct pci_dev *pdev) { } static inline void pcie_aspm_exit_link_state(struct pci_dev *pdev) { } static inline void pcie_aspm_pm_state_change(struct pci_dev *pdev, bool locked) { } static inline void pcie_aspm_powersave_config_link(struct pci_dev *pdev) { } static inline void pci_configure_ltr(struct pci_dev *pdev) { } static inline void pci_bridge_reconfigure_ltr(struct pci_dev *pdev) { } #endif #ifdef CONFIG_PCIE_ECRC void pcie_set_ecrc_checking(struct pci_dev *dev); void pcie_ecrc_get_policy(char *str); #else static inline void pcie_set_ecrc_checking(struct pci_dev *dev) { } static inline void pcie_ecrc_get_policy(char *str) { } #endif #ifdef CONFIG_PCIEPORTBUS void pcie_reset_lbms_count(struct pci_dev *port); int pcie_lbms_count(struct pci_dev *port, unsigned long *val); #else static inline void pcie_reset_lbms_count(struct pci_dev *port) {} static inline int pcie_lbms_count(struct pci_dev *port, unsigned long *val) { return -EOPNOTSUPP; } #endif struct pci_dev_reset_methods { u16 vendor; u16 device; int (*reset)(struct pci_dev *dev, bool probe); }; struct pci_reset_fn_method { int (*reset_fn)(struct pci_dev *pdev, bool probe); char *name; }; extern const struct pci_reset_fn_method pci_reset_fn_methods[]; #ifdef CONFIG_PCI_QUIRKS int pci_dev_specific_reset(struct pci_dev *dev, bool probe); #else static inline int pci_dev_specific_reset(struct pci_dev *dev, bool probe) { return -ENOTTY; } #endif #if defined(CONFIG_PCI_QUIRKS) && defined(CONFIG_ARM64) int acpi_get_rc_resources(struct device *dev, const char *hid, u16 segment, struct resource *res); #else static inline int acpi_get_rc_resources(struct device *dev, const char *hid, u16 segment, struct resource *res) { return -ENODEV; } #endif void pci_rebar_init(struct pci_dev *pdev); int pci_rebar_get_current_size(struct pci_dev *pdev, int bar); int pci_rebar_set_size(struct pci_dev *pdev, int bar, int size); static inline u64 pci_rebar_size_to_bytes(int size) { return 1ULL << (size + 20); } struct device_node; #ifdef CONFIG_OF int of_get_pci_domain_nr(struct device_node *node); int of_pci_get_max_link_speed(struct device_node *node); u32 of_pci_get_slot_power_limit(struct device_node *node, u8 *slot_power_limit_value, u8 *slot_power_limit_scale); bool of_pci_preserve_config(struct device_node *node); int pci_set_of_node(struct pci_dev *dev); void pci_release_of_node(struct pci_dev *dev); void pci_set_bus_of_node(struct pci_bus *bus); void pci_release_bus_of_node(struct pci_bus *bus); int devm_of_pci_bridge_init(struct device *dev, struct pci_host_bridge *bridge); bool of_pci_supply_present(struct device_node *np); #else static inline int of_get_pci_domain_nr(struct device_node *node) { return -1; } static inline int of_pci_get_max_link_speed(struct device_node *node) { return -EINVAL; } static inline u32 of_pci_get_slot_power_limit(struct device_node *node, u8 *slot_power_limit_value, u8 *slot_power_limit_scale) { if (slot_power_limit_value) *slot_power_limit_value = 0; if (slot_power_limit_scale) *slot_power_limit_scale = 0; return 0; } static inline bool of_pci_preserve_config(struct device_node *node) { return false; } static inline int pci_set_of_node(struct pci_dev *dev) { return 0; } static inline void pci_release_of_node(struct pci_dev *dev) { } static inline void pci_set_bus_of_node(struct pci_bus *bus) { } static inline void pci_release_bus_of_node(struct pci_bus *bus) { } static inline int devm_of_pci_bridge_init(struct device *dev, struct pci_host_bridge *bridge) { return 0; } static inline bool of_pci_supply_present(struct device_node *np) { return false; } #endif /* CONFIG_OF */ struct of_changeset; #ifdef CONFIG_PCI_DYNAMIC_OF_NODES void of_pci_make_dev_node(struct pci_dev *pdev); void of_pci_remove_node(struct pci_dev *pdev); int of_pci_add_properties(struct pci_dev *pdev, struct of_changeset *ocs, struct device_node *np); void of_pci_make_host_bridge_node(struct pci_host_bridge *bridge); void of_pci_remove_host_bridge_node(struct pci_host_bridge *bridge); int of_pci_add_host_bridge_properties(struct pci_host_bridge *bridge, struct of_changeset *ocs, struct device_node *np); #else static inline void of_pci_make_dev_node(struct pci_dev *pdev) { } static inline void of_pci_remove_node(struct pci_dev *pdev) { } static inline void of_pci_make_host_bridge_node(struct pci_host_bridge *bridge) { } static inline void of_pci_remove_host_bridge_node(struct pci_host_bridge *bridge) { } #endif #ifdef CONFIG_PCIEAER void pci_no_aer(void); void pci_aer_init(struct pci_dev *dev); void pci_aer_exit(struct pci_dev *dev); extern const struct attribute_group aer_stats_attr_group; void pci_aer_clear_fatal_status(struct pci_dev *dev); int pci_aer_clear_status(struct pci_dev *dev); int pci_aer_raw_clear_status(struct pci_dev *dev); void pci_save_aer_state(struct pci_dev *dev); void pci_restore_aer_state(struct pci_dev *dev); #else static inline void pci_no_aer(void) { } static inline void pci_aer_init(struct pci_dev *d) { } static inline void pci_aer_exit(struct pci_dev *d) { } static inline void pci_aer_clear_fatal_status(struct pci_dev *dev) { } static inline int pci_aer_clear_status(struct pci_dev *dev) { return -EINVAL; } static inline int pci_aer_raw_clear_status(struct pci_dev *dev) { return -EINVAL; } static inline void pci_save_aer_state(struct pci_dev *dev) { } static inline void pci_restore_aer_state(struct pci_dev *dev) { } #endif #ifdef CONFIG_ACPI bool pci_acpi_preserve_config(struct pci_host_bridge *bridge); int pci_acpi_program_hp_params(struct pci_dev *dev); extern const struct attribute_group pci_dev_acpi_attr_group; void pci_set_acpi_fwnode(struct pci_dev *dev); int pci_dev_acpi_reset(struct pci_dev *dev, bool probe); bool acpi_pci_power_manageable(struct pci_dev *dev); bool acpi_pci_bridge_d3(struct pci_dev *dev); int acpi_pci_set_power_state(struct pci_dev *dev, pci_power_t state); pci_power_t acpi_pci_get_power_state(struct pci_dev *dev); void acpi_pci_refresh_power_state(struct pci_dev *dev); int acpi_pci_wakeup(struct pci_dev *dev, bool enable); bool acpi_pci_need_resume(struct pci_dev *dev); pci_power_t acpi_pci_choose_state(struct pci_dev *pdev); #else static inline bool pci_acpi_preserve_config(struct pci_host_bridge *bridge) { return false; } static inline int pci_dev_acpi_reset(struct pci_dev *dev, bool probe) { return -ENOTTY; } static inline void pci_set_acpi_fwnode(struct pci_dev *dev) { } static inline int pci_acpi_program_hp_params(struct pci_dev *dev) { return -ENODEV; } static inline bool acpi_pci_power_manageable(struct pci_dev *dev) { return false; } static inline bool acpi_pci_bridge_d3(struct pci_dev *dev) { return false; } static inline int acpi_pci_set_power_state(struct pci_dev *dev, pci_power_t state) { return -ENODEV; } static inline pci_power_t acpi_pci_get_power_state(struct pci_dev *dev) { return PCI_UNKNOWN; } static inline void acpi_pci_refresh_power_state(struct pci_dev *dev) { } static inline int acpi_pci_wakeup(struct pci_dev *dev, bool enable) { return -ENODEV; } static inline bool acpi_pci_need_resume(struct pci_dev *dev) { return false; } static inline pci_power_t acpi_pci_choose_state(struct pci_dev *pdev) { return PCI_POWER_ERROR; } #endif #ifdef CONFIG_PCIEASPM extern const struct attribute_group aspm_ctrl_attr_group; #endif #ifdef CONFIG_X86_INTEL_MID bool pci_use_mid_pm(void); int mid_pci_set_power_state(struct pci_dev *pdev, pci_power_t state); pci_power_t mid_pci_get_power_state(struct pci_dev *pdev); #else static inline bool pci_use_mid_pm(void) { return false; } static inline int mid_pci_set_power_state(struct pci_dev *pdev, pci_power_t state) { return -ENODEV; } static inline pci_power_t mid_pci_get_power_state(struct pci_dev *pdev) { return PCI_UNKNOWN; } #endif int pcim_intx(struct pci_dev *dev, int enable); int pcim_request_region_exclusive(struct pci_dev *pdev, int bar, const char *name); void pcim_release_region(struct pci_dev *pdev, int bar); /* * Config Address for PCI Configuration Mechanism #1 * * See PCI Local Bus Specification, Revision 3.0, * Section 3.2.2.3.2, Figure 3-2, p. 50. */ #define PCI_CONF1_BUS_SHIFT 16 /* Bus number */ #define PCI_CONF1_DEV_SHIFT 11 /* Device number */ #define PCI_CONF1_FUNC_SHIFT 8 /* Function number */ #define PCI_CONF1_BUS_MASK 0xff #define PCI_CONF1_DEV_MASK 0x1f #define PCI_CONF1_FUNC_MASK 0x7 #define PCI_CONF1_REG_MASK 0xfc /* Limit aligned offset to a maximum of 256B */ #define PCI_CONF1_ENABLE BIT(31) #define PCI_CONF1_BUS(x) (((x) & PCI_CONF1_BUS_MASK) << PCI_CONF1_BUS_SHIFT) #define PCI_CONF1_DEV(x) (((x) & PCI_CONF1_DEV_MASK) << PCI_CONF1_DEV_SHIFT) #define PCI_CONF1_FUNC(x) (((x) & PCI_CONF1_FUNC_MASK) << PCI_CONF1_FUNC_SHIFT) #define PCI_CONF1_REG(x) ((x) & PCI_CONF1_REG_MASK) #define PCI_CONF1_ADDRESS(bus, dev, func, reg) \ (PCI_CONF1_ENABLE | \ PCI_CONF1_BUS(bus) | \ PCI_CONF1_DEV(dev) | \ PCI_CONF1_FUNC(func) | \ PCI_CONF1_REG(reg)) /* * Extension of PCI Config Address for accessing extended PCIe registers * * No standardized specification, but used on lot of non-ECAM-compliant ARM SoCs * or on AMD Barcelona and new CPUs. Reserved bits [27:24] of PCI Config Address * are used for specifying additional 4 high bits of PCI Express register. */ #define PCI_CONF1_EXT_REG_SHIFT 16 #define PCI_CONF1_EXT_REG_MASK 0xf00 #define PCI_CONF1_EXT_REG(x) (((x) & PCI_CONF1_EXT_REG_MASK) << PCI_CONF1_EXT_REG_SHIFT) #define PCI_CONF1_EXT_ADDRESS(bus, dev, func, reg) \ (PCI_CONF1_ADDRESS(bus, dev, func, reg) | \ PCI_CONF1_EXT_REG(reg)) #endif /* DRIVERS_PCI_H */
60 1 4 4 56 56 56 77 77 76 77 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 // SPDX-License-Identifier: GPL-2.0 /* Copyright (C) B.A.T.M.A.N. contributors: * * Martin Hundebøll, Jeppe Ledet-Pedersen */ #include "network-coding.h" #include "main.h" #include <linux/atomic.h> #include <linux/bitops.h> #include <linux/byteorder/generic.h> #include <linux/compiler.h> #include <linux/container_of.h> #include <linux/errno.h> #include <linux/etherdevice.h> #include <linux/gfp.h> #include <linux/if_ether.h> #include <linux/if_packet.h> #include <linux/init.h> #include <linux/jhash.h> #include <linux/jiffies.h> #include <linux/kref.h> #include <linux/list.h> #include <linux/lockdep.h> #include <linux/net.h> #include <linux/netdevice.h> #include <linux/printk.h> #include <linux/random.h> #include <linux/rculist.h> #include <linux/rcupdate.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/stddef.h> #include <linux/string.h> #include <linux/workqueue.h> #include <uapi/linux/batadv_packet.h> #include "hash.h" #include "log.h" #include "originator.h" #include "routing.h" #include "send.h" #include "tvlv.h" static struct lock_class_key batadv_nc_coding_hash_lock_class_key; static struct lock_class_key batadv_nc_decoding_hash_lock_class_key; static void batadv_nc_worker(struct work_struct *work); static int batadv_nc_recv_coded_packet(struct sk_buff *skb, struct batadv_hard_iface *recv_if); /** * batadv_nc_init() - one-time initialization for network coding * * Return: 0 on success or negative error number in case of failure */ int __init batadv_nc_init(void) { /* Register our packet type */ return batadv_recv_handler_register(BATADV_CODED, batadv_nc_recv_coded_packet); } /** * batadv_nc_start_timer() - initialise the nc periodic worker * @bat_priv: the bat priv with all the mesh interface information */ static void batadv_nc_start_timer(struct batadv_priv *bat_priv) { queue_delayed_work(batadv_event_workqueue, &bat_priv->nc.work, msecs_to_jiffies(10)); } /** * batadv_nc_tvlv_container_update() - update the network coding tvlv container * after network coding setting change * @bat_priv: the bat priv with all the mesh interface information */ static void batadv_nc_tvlv_container_update(struct batadv_priv *bat_priv) { char nc_mode; nc_mode = atomic_read(&bat_priv->network_coding); switch (nc_mode) { case 0: batadv_tvlv_container_unregister(bat_priv, BATADV_TVLV_NC, 1); break; case 1: batadv_tvlv_container_register(bat_priv, BATADV_TVLV_NC, 1, NULL, 0); break; } } /** * batadv_nc_status_update() - update the network coding tvlv container after * network coding setting change * @net_dev: the mesh interface net device */ void batadv_nc_status_update(struct net_device *net_dev) { struct batadv_priv *bat_priv = netdev_priv(net_dev); batadv_nc_tvlv_container_update(bat_priv); } /** * batadv_nc_tvlv_ogm_handler_v1() - process incoming nc tvlv container * @bat_priv: the bat priv with all the mesh interface information * @orig: the orig_node of the ogm * @flags: flags indicating the tvlv state (see batadv_tvlv_handler_flags) * @tvlv_value: tvlv buffer containing the gateway data * @tvlv_value_len: tvlv buffer length */ static void batadv_nc_tvlv_ogm_handler_v1(struct batadv_priv *bat_priv, struct batadv_orig_node *orig, u8 flags, void *tvlv_value, u16 tvlv_value_len) { if (flags & BATADV_TVLV_HANDLER_OGM_CIFNOTFND) clear_bit(BATADV_ORIG_CAPA_HAS_NC, &orig->capabilities); else set_bit(BATADV_ORIG_CAPA_HAS_NC, &orig->capabilities); } /** * batadv_nc_mesh_init() - initialise coding hash table and start housekeeping * @bat_priv: the bat priv with all the mesh interface information * * Return: 0 on success or negative error number in case of failure */ int batadv_nc_mesh_init(struct batadv_priv *bat_priv) { bat_priv->nc.timestamp_fwd_flush = jiffies; bat_priv->nc.timestamp_sniffed_purge = jiffies; if (bat_priv->nc.coding_hash || bat_priv->nc.decoding_hash) return 0; bat_priv->nc.coding_hash = batadv_hash_new(128); if (!bat_priv->nc.coding_hash) goto err; batadv_hash_set_lock_class(bat_priv->nc.coding_hash, &batadv_nc_coding_hash_lock_class_key); bat_priv->nc.decoding_hash = batadv_hash_new(128); if (!bat_priv->nc.decoding_hash) { batadv_hash_destroy(bat_priv->nc.coding_hash); goto err; } batadv_hash_set_lock_class(bat_priv->nc.decoding_hash, &batadv_nc_decoding_hash_lock_class_key); INIT_DELAYED_WORK(&bat_priv->nc.work, batadv_nc_worker); batadv_nc_start_timer(bat_priv); batadv_tvlv_handler_register(bat_priv, batadv_nc_tvlv_ogm_handler_v1, NULL, NULL, BATADV_TVLV_NC, 1, BATADV_TVLV_HANDLER_OGM_CIFNOTFND); batadv_nc_tvlv_container_update(bat_priv); return 0; err: return -ENOMEM; } /** * batadv_nc_init_bat_priv() - initialise the nc specific bat_priv variables * @bat_priv: the bat priv with all the mesh interface information */ void batadv_nc_init_bat_priv(struct batadv_priv *bat_priv) { atomic_set(&bat_priv->network_coding, 0); bat_priv->nc.min_tq = 200; bat_priv->nc.max_fwd_delay = 10; bat_priv->nc.max_buffer_time = 200; } /** * batadv_nc_init_orig() - initialise the nc fields of an orig_node * @orig_node: the orig_node which is going to be initialised */ void batadv_nc_init_orig(struct batadv_orig_node *orig_node) { INIT_LIST_HEAD(&orig_node->in_coding_list); INIT_LIST_HEAD(&orig_node->out_coding_list); spin_lock_init(&orig_node->in_coding_list_lock); spin_lock_init(&orig_node->out_coding_list_lock); } /** * batadv_nc_node_release() - release nc_node from lists and queue for free * after rcu grace period * @ref: kref pointer of the nc_node */ static void batadv_nc_node_release(struct kref *ref) { struct batadv_nc_node *nc_node; nc_node = container_of(ref, struct batadv_nc_node, refcount); batadv_orig_node_put(nc_node->orig_node); kfree_rcu(nc_node, rcu); } /** * batadv_nc_node_put() - decrement the nc_node refcounter and possibly * release it * @nc_node: nc_node to be free'd */ static void batadv_nc_node_put(struct batadv_nc_node *nc_node) { if (!nc_node) return; kref_put(&nc_node->refcount, batadv_nc_node_release); } /** * batadv_nc_path_release() - release nc_path from lists and queue for free * after rcu grace period * @ref: kref pointer of the nc_path */ static void batadv_nc_path_release(struct kref *ref) { struct batadv_nc_path *nc_path; nc_path = container_of(ref, struct batadv_nc_path, refcount); kfree_rcu(nc_path, rcu); } /** * batadv_nc_path_put() - decrement the nc_path refcounter and possibly * release it * @nc_path: nc_path to be free'd */ static void batadv_nc_path_put(struct batadv_nc_path *nc_path) { if (!nc_path) return; kref_put(&nc_path->refcount, batadv_nc_path_release); } /** * batadv_nc_packet_free() - frees nc packet * @nc_packet: the nc packet to free * @dropped: whether the packet is freed because is dropped */ static void batadv_nc_packet_free(struct batadv_nc_packet *nc_packet, bool dropped) { if (dropped) kfree_skb(nc_packet->skb); else consume_skb(nc_packet->skb); batadv_nc_path_put(nc_packet->nc_path); kfree(nc_packet); } /** * batadv_nc_to_purge_nc_node() - checks whether an nc node has to be purged * @bat_priv: the bat priv with all the mesh interface information * @nc_node: the nc node to check * * Return: true if the entry has to be purged now, false otherwise */ static bool batadv_nc_to_purge_nc_node(struct batadv_priv *bat_priv, struct batadv_nc_node *nc_node) { if (atomic_read(&bat_priv->mesh_state) != BATADV_MESH_ACTIVE) return true; return batadv_has_timed_out(nc_node->last_seen, BATADV_NC_NODE_TIMEOUT); } /** * batadv_nc_to_purge_nc_path_coding() - checks whether an nc path has timed out * @bat_priv: the bat priv with all the mesh interface information * @nc_path: the nc path to check * * Return: true if the entry has to be purged now, false otherwise */ static bool batadv_nc_to_purge_nc_path_coding(struct batadv_priv *bat_priv, struct batadv_nc_path *nc_path) { if (atomic_read(&bat_priv->mesh_state) != BATADV_MESH_ACTIVE) return true; /* purge the path when no packets has been added for 10 times the * max_fwd_delay time */ return batadv_has_timed_out(nc_path->last_valid, bat_priv->nc.max_fwd_delay * 10); } /** * batadv_nc_to_purge_nc_path_decoding() - checks whether an nc path has timed * out * @bat_priv: the bat priv with all the mesh interface information * @nc_path: the nc path to check * * Return: true if the entry has to be purged now, false otherwise */ static bool batadv_nc_to_purge_nc_path_decoding(struct batadv_priv *bat_priv, struct batadv_nc_path *nc_path) { if (atomic_read(&bat_priv->mesh_state) != BATADV_MESH_ACTIVE) return true; /* purge the path when no packets has been added for 10 times the * max_buffer time */ return batadv_has_timed_out(nc_path->last_valid, bat_priv->nc.max_buffer_time * 10); } /** * batadv_nc_purge_orig_nc_nodes() - go through list of nc nodes and purge stale * entries * @bat_priv: the bat priv with all the mesh interface information * @list: list of nc nodes * @lock: nc node list lock * @to_purge: function in charge to decide whether an entry has to be purged or * not. This function takes the nc node as argument and has to return * a boolean value: true if the entry has to be deleted, false * otherwise */ static void batadv_nc_purge_orig_nc_nodes(struct batadv_priv *bat_priv, struct list_head *list, spinlock_t *lock, bool (*to_purge)(struct batadv_priv *, struct batadv_nc_node *)) { struct batadv_nc_node *nc_node, *nc_node_tmp; /* For each nc_node in list */ spin_lock_bh(lock); list_for_each_entry_safe(nc_node, nc_node_tmp, list, list) { /* if an helper function has been passed as parameter, * ask it if the entry has to be purged or not */ if (to_purge && !to_purge(bat_priv, nc_node)) continue; batadv_dbg(BATADV_DBG_NC, bat_priv, "Removing nc_node %pM -> %pM\n", nc_node->addr, nc_node->orig_node->orig); list_del_rcu(&nc_node->list); batadv_nc_node_put(nc_node); } spin_unlock_bh(lock); } /** * batadv_nc_purge_orig() - purges all nc node data attached of the given * originator * @bat_priv: the bat priv with all the mesh interface information * @orig_node: orig_node with the nc node entries to be purged * @to_purge: function in charge to decide whether an entry has to be purged or * not. This function takes the nc node as argument and has to return * a boolean value: true is the entry has to be deleted, false * otherwise */ void batadv_nc_purge_orig(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node, bool (*to_purge)(struct batadv_priv *, struct batadv_nc_node *)) { /* Check ingoing nc_node's of this orig_node */ batadv_nc_purge_orig_nc_nodes(bat_priv, &orig_node->in_coding_list, &orig_node->in_coding_list_lock, to_purge); /* Check outgoing nc_node's of this orig_node */ batadv_nc_purge_orig_nc_nodes(bat_priv, &orig_node->out_coding_list, &orig_node->out_coding_list_lock, to_purge); } /** * batadv_nc_purge_orig_hash() - traverse entire originator hash to check if * they have timed out nc nodes * @bat_priv: the bat priv with all the mesh interface information */ static void batadv_nc_purge_orig_hash(struct batadv_priv *bat_priv) { struct batadv_hashtable *hash = bat_priv->orig_hash; struct hlist_head *head; struct batadv_orig_node *orig_node; u32 i; if (!hash) return; /* For each orig_node */ for (i = 0; i < hash->size; i++) { head = &hash->table[i]; rcu_read_lock(); hlist_for_each_entry_rcu(orig_node, head, hash_entry) batadv_nc_purge_orig(bat_priv, orig_node, batadv_nc_to_purge_nc_node); rcu_read_unlock(); } } /** * batadv_nc_purge_paths() - traverse all nc paths part of the hash and remove * unused ones * @bat_priv: the bat priv with all the mesh interface information * @hash: hash table containing the nc paths to check * @to_purge: function in charge to decide whether an entry has to be purged or * not. This function takes the nc node as argument and has to return * a boolean value: true is the entry has to be deleted, false * otherwise */ static void batadv_nc_purge_paths(struct batadv_priv *bat_priv, struct batadv_hashtable *hash, bool (*to_purge)(struct batadv_priv *, struct batadv_nc_path *)) { struct hlist_head *head; struct hlist_node *node_tmp; struct batadv_nc_path *nc_path; spinlock_t *lock; /* Protects lists in hash */ u32 i; for (i = 0; i < hash->size; i++) { head = &hash->table[i]; lock = &hash->list_locks[i]; /* For each nc_path in this bin */ spin_lock_bh(lock); hlist_for_each_entry_safe(nc_path, node_tmp, head, hash_entry) { /* if an helper function has been passed as parameter, * ask it if the entry has to be purged or not */ if (to_purge && !to_purge(bat_priv, nc_path)) continue; /* purging an non-empty nc_path should never happen, but * is observed under high CPU load. Delay the purging * until next iteration to allow the packet_list to be * emptied first. */ if (!unlikely(list_empty(&nc_path->packet_list))) { net_ratelimited_function(printk, KERN_WARNING "Skipping free of non-empty nc_path (%pM -> %pM)!\n", nc_path->prev_hop, nc_path->next_hop); continue; } /* nc_path is unused, so remove it */ batadv_dbg(BATADV_DBG_NC, bat_priv, "Remove nc_path %pM -> %pM\n", nc_path->prev_hop, nc_path->next_hop); hlist_del_rcu(&nc_path->hash_entry); batadv_nc_path_put(nc_path); } spin_unlock_bh(lock); } } /** * batadv_nc_hash_key_gen() - computes the nc_path hash key * @key: buffer to hold the final hash key * @src: source ethernet mac address going into the hash key * @dst: destination ethernet mac address going into the hash key */ static void batadv_nc_hash_key_gen(struct batadv_nc_path *key, const char *src, const char *dst) { memcpy(key->prev_hop, src, sizeof(key->prev_hop)); memcpy(key->next_hop, dst, sizeof(key->next_hop)); } /** * batadv_nc_hash_choose() - compute the hash value for an nc path * @data: data to hash * @size: size of the hash table * * Return: the selected index in the hash table for the given data. */ static u32 batadv_nc_hash_choose(const void *data, u32 size) { const struct batadv_nc_path *nc_path = data; u32 hash = 0; hash = jhash(&nc_path->prev_hop, sizeof(nc_path->prev_hop), hash); hash = jhash(&nc_path->next_hop, sizeof(nc_path->next_hop), hash); return hash % size; } /** * batadv_nc_hash_compare() - comparing function used in the network coding hash * tables * @node: node in the local table * @data2: second object to compare the node to * * Return: true if the two entry are the same, false otherwise */ static bool batadv_nc_hash_compare(const struct hlist_node *node, const void *data2) { const struct batadv_nc_path *nc_path1, *nc_path2; nc_path1 = container_of(node, struct batadv_nc_path, hash_entry); nc_path2 = data2; /* Return 1 if the two keys are identical */ if (!batadv_compare_eth(nc_path1->prev_hop, nc_path2->prev_hop)) return false; if (!batadv_compare_eth(nc_path1->next_hop, nc_path2->next_hop)) return false; return true; } /** * batadv_nc_hash_find() - search for an existing nc path and return it * @hash: hash table containing the nc path * @data: search key * * Return: the nc_path if found, NULL otherwise. */ static struct batadv_nc_path * batadv_nc_hash_find(struct batadv_hashtable *hash, void *data) { struct hlist_head *head; struct batadv_nc_path *nc_path, *nc_path_tmp = NULL; int index; if (!hash) return NULL; index = batadv_nc_hash_choose(data, hash->size); head = &hash->table[index]; rcu_read_lock(); hlist_for_each_entry_rcu(nc_path, head, hash_entry) { if (!batadv_nc_hash_compare(&nc_path->hash_entry, data)) continue; if (!kref_get_unless_zero(&nc_path->refcount)) continue; nc_path_tmp = nc_path; break; } rcu_read_unlock(); return nc_path_tmp; } /** * batadv_nc_send_packet() - send non-coded packet and free nc_packet struct * @nc_packet: the nc packet to send */ static void batadv_nc_send_packet(struct batadv_nc_packet *nc_packet) { batadv_send_unicast_skb(nc_packet->skb, nc_packet->neigh_node); nc_packet->skb = NULL; batadv_nc_packet_free(nc_packet, false); } /** * batadv_nc_sniffed_purge() - Checks timestamp of given sniffed nc_packet. * @bat_priv: the bat priv with all the mesh interface information * @nc_path: the nc path the packet belongs to * @nc_packet: the nc packet to be checked * * Checks whether the given sniffed (overheard) nc_packet has hit its buffering * timeout. If so, the packet is no longer kept and the entry deleted from the * queue. Has to be called with the appropriate locks. * * Return: false as soon as the entry in the fifo queue has not been timed out * yet and true otherwise. */ static bool batadv_nc_sniffed_purge(struct batadv_priv *bat_priv, struct batadv_nc_path *nc_path, struct batadv_nc_packet *nc_packet) { unsigned long timeout = bat_priv->nc.max_buffer_time; bool res = false; lockdep_assert_held(&nc_path->packet_list_lock); /* Packets are added to tail, so the remaining packets did not time * out and we can stop processing the current queue */ if (atomic_read(&bat_priv->mesh_state) == BATADV_MESH_ACTIVE && !batadv_has_timed_out(nc_packet->timestamp, timeout)) goto out; /* purge nc packet */ list_del(&nc_packet->list); batadv_nc_packet_free(nc_packet, true); res = true; out: return res; } /** * batadv_nc_fwd_flush() - Checks the timestamp of the given nc packet. * @bat_priv: the bat priv with all the mesh interface information * @nc_path: the nc path the packet belongs to * @nc_packet: the nc packet to be checked * * Checks whether the given nc packet has hit its forward timeout. If so, the * packet is no longer delayed, immediately sent and the entry deleted from the * queue. Has to be called with the appropriate locks. * * Return: false as soon as the entry in the fifo queue has not been timed out * yet and true otherwise. */ static bool batadv_nc_fwd_flush(struct batadv_priv *bat_priv, struct batadv_nc_path *nc_path, struct batadv_nc_packet *nc_packet) { unsigned long timeout = bat_priv->nc.max_fwd_delay; lockdep_assert_held(&nc_path->packet_list_lock); /* Packets are added to tail, so the remaining packets did not time * out and we can stop processing the current queue */ if (atomic_read(&bat_priv->mesh_state) == BATADV_MESH_ACTIVE && !batadv_has_timed_out(nc_packet->timestamp, timeout)) return false; /* Send packet */ batadv_inc_counter(bat_priv, BATADV_CNT_FORWARD); batadv_add_counter(bat_priv, BATADV_CNT_FORWARD_BYTES, nc_packet->skb->len + ETH_HLEN); list_del(&nc_packet->list); batadv_nc_send_packet(nc_packet); return true; } /** * batadv_nc_process_nc_paths() - traverse given nc packet pool and free timed * out nc packets * @bat_priv: the bat priv with all the mesh interface information * @hash: to be processed hash table * @process_fn: Function called to process given nc packet. Should return true * to encourage this function to proceed with the next packet. * Otherwise the rest of the current queue is skipped. */ static void batadv_nc_process_nc_paths(struct batadv_priv *bat_priv, struct batadv_hashtable *hash, bool (*process_fn)(struct batadv_priv *, struct batadv_nc_path *, struct batadv_nc_packet *)) { struct hlist_head *head; struct batadv_nc_packet *nc_packet, *nc_packet_tmp; struct batadv_nc_path *nc_path; bool ret; int i; if (!hash) return; /* Loop hash table bins */ for (i = 0; i < hash->size; i++) { head = &hash->table[i]; /* Loop coding paths */ rcu_read_lock(); hlist_for_each_entry_rcu(nc_path, head, hash_entry) { /* Loop packets */ spin_lock_bh(&nc_path->packet_list_lock); list_for_each_entry_safe(nc_packet, nc_packet_tmp, &nc_path->packet_list, list) { ret = process_fn(bat_priv, nc_path, nc_packet); if (!ret) break; } spin_unlock_bh(&nc_path->packet_list_lock); } rcu_read_unlock(); } } /** * batadv_nc_worker() - periodic task for housekeeping related to network * coding * @work: kernel work struct */ static void batadv_nc_worker(struct work_struct *work) { struct delayed_work *delayed_work; struct batadv_priv_nc *priv_nc; struct batadv_priv *bat_priv; unsigned long timeout; delayed_work = to_delayed_work(work); priv_nc = container_of(delayed_work, struct batadv_priv_nc, work); bat_priv = container_of(priv_nc, struct batadv_priv, nc); batadv_nc_purge_orig_hash(bat_priv); batadv_nc_purge_paths(bat_priv, bat_priv->nc.coding_hash, batadv_nc_to_purge_nc_path_coding); batadv_nc_purge_paths(bat_priv, bat_priv->nc.decoding_hash, batadv_nc_to_purge_nc_path_decoding); timeout = bat_priv->nc.max_fwd_delay; if (batadv_has_timed_out(bat_priv->nc.timestamp_fwd_flush, timeout)) { batadv_nc_process_nc_paths(bat_priv, bat_priv->nc.coding_hash, batadv_nc_fwd_flush); bat_priv->nc.timestamp_fwd_flush = jiffies; } if (batadv_has_timed_out(bat_priv->nc.timestamp_sniffed_purge, bat_priv->nc.max_buffer_time)) { batadv_nc_process_nc_paths(bat_priv, bat_priv->nc.decoding_hash, batadv_nc_sniffed_purge); bat_priv->nc.timestamp_sniffed_purge = jiffies; } /* Schedule a new check */ batadv_nc_start_timer(bat_priv); } /** * batadv_can_nc_with_orig() - checks whether the given orig node is suitable * for coding or not * @bat_priv: the bat priv with all the mesh interface information * @orig_node: neighboring orig node which may be used as nc candidate * @ogm_packet: incoming ogm packet also used for the checks * * Return: true if: * 1) The OGM must have the most recent sequence number. * 2) The TTL must be decremented by one and only one. * 3) The OGM must be received from the first hop from orig_node. * 4) The TQ value of the OGM must be above bat_priv->nc.min_tq. */ static bool batadv_can_nc_with_orig(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node, struct batadv_ogm_packet *ogm_packet) { struct batadv_orig_ifinfo *orig_ifinfo; u32 last_real_seqno; u8 last_ttl; orig_ifinfo = batadv_orig_ifinfo_get(orig_node, BATADV_IF_DEFAULT); if (!orig_ifinfo) return false; last_ttl = orig_ifinfo->last_ttl; last_real_seqno = orig_ifinfo->last_real_seqno; batadv_orig_ifinfo_put(orig_ifinfo); if (last_real_seqno != ntohl(ogm_packet->seqno)) return false; if (last_ttl != ogm_packet->ttl + 1) return false; if (!batadv_compare_eth(ogm_packet->orig, ogm_packet->prev_sender)) return false; if (ogm_packet->tq < bat_priv->nc.min_tq) return false; return true; } /** * batadv_nc_find_nc_node() - search for an existing nc node and return it * @orig_node: orig node originating the ogm packet * @orig_neigh_node: neighboring orig node from which we received the ogm packet * (can be equal to orig_node) * @in_coding: traverse incoming or outgoing network coding list * * Return: the nc_node if found, NULL otherwise. */ static struct batadv_nc_node * batadv_nc_find_nc_node(struct batadv_orig_node *orig_node, struct batadv_orig_node *orig_neigh_node, bool in_coding) { struct batadv_nc_node *nc_node, *nc_node_out = NULL; struct list_head *list; if (in_coding) list = &orig_neigh_node->in_coding_list; else list = &orig_neigh_node->out_coding_list; /* Traverse list of nc_nodes to orig_node */ rcu_read_lock(); list_for_each_entry_rcu(nc_node, list, list) { if (!batadv_compare_eth(nc_node->addr, orig_node->orig)) continue; if (!kref_get_unless_zero(&nc_node->refcount)) continue; /* Found a match */ nc_node_out = nc_node; break; } rcu_read_unlock(); return nc_node_out; } /** * batadv_nc_get_nc_node() - retrieves an nc node or creates the entry if it was * not found * @bat_priv: the bat priv with all the mesh interface information * @orig_node: orig node originating the ogm packet * @orig_neigh_node: neighboring orig node from which we received the ogm packet * (can be equal to orig_node) * @in_coding: traverse incoming or outgoing network coding list * * Return: the nc_node if found or created, NULL in case of an error. */ static struct batadv_nc_node * batadv_nc_get_nc_node(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node, struct batadv_orig_node *orig_neigh_node, bool in_coding) { struct batadv_nc_node *nc_node; spinlock_t *lock; /* Used to lock list selected by "int in_coding" */ struct list_head *list; /* Select ingoing or outgoing coding node */ if (in_coding) { lock = &orig_neigh_node->in_coding_list_lock; list = &orig_neigh_node->in_coding_list; } else { lock = &orig_neigh_node->out_coding_list_lock; list = &orig_neigh_node->out_coding_list; } spin_lock_bh(lock); /* Check if nc_node is already added */ nc_node = batadv_nc_find_nc_node(orig_node, orig_neigh_node, in_coding); /* Node found */ if (nc_node) goto unlock; nc_node = kzalloc(sizeof(*nc_node), GFP_ATOMIC); if (!nc_node) goto unlock; /* Initialize nc_node */ INIT_LIST_HEAD(&nc_node->list); kref_init(&nc_node->refcount); ether_addr_copy(nc_node->addr, orig_node->orig); kref_get(&orig_neigh_node->refcount); nc_node->orig_node = orig_neigh_node; batadv_dbg(BATADV_DBG_NC, bat_priv, "Adding nc_node %pM -> %pM\n", nc_node->addr, nc_node->orig_node->orig); /* Add nc_node to orig_node */ kref_get(&nc_node->refcount); list_add_tail_rcu(&nc_node->list, list); unlock: spin_unlock_bh(lock); return nc_node; } /** * batadv_nc_update_nc_node() - updates stored incoming and outgoing nc node * structs (best called on incoming OGMs) * @bat_priv: the bat priv with all the mesh interface information * @orig_node: orig node originating the ogm packet * @orig_neigh_node: neighboring orig node from which we received the ogm packet * (can be equal to orig_node) * @ogm_packet: incoming ogm packet * @is_single_hop_neigh: orig_node is a single hop neighbor */ void batadv_nc_update_nc_node(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node, struct batadv_orig_node *orig_neigh_node, struct batadv_ogm_packet *ogm_packet, int is_single_hop_neigh) { struct batadv_nc_node *in_nc_node = NULL; struct batadv_nc_node *out_nc_node = NULL; /* Check if network coding is enabled */ if (!atomic_read(&bat_priv->network_coding)) goto out; /* check if orig node is network coding enabled */ if (!test_bit(BATADV_ORIG_CAPA_HAS_NC, &orig_node->capabilities)) goto out; /* accept ogms from 'good' neighbors and single hop neighbors */ if (!batadv_can_nc_with_orig(bat_priv, orig_node, ogm_packet) && !is_single_hop_neigh) goto out; /* Add orig_node as in_nc_node on hop */ in_nc_node = batadv_nc_get_nc_node(bat_priv, orig_node, orig_neigh_node, true); if (!in_nc_node) goto out; in_nc_node->last_seen = jiffies; /* Add hop as out_nc_node on orig_node */ out_nc_node = batadv_nc_get_nc_node(bat_priv, orig_neigh_node, orig_node, false); if (!out_nc_node) goto out; out_nc_node->last_seen = jiffies; out: batadv_nc_node_put(in_nc_node); batadv_nc_node_put(out_nc_node); } /** * batadv_nc_get_path() - get existing nc_path or allocate a new one * @bat_priv: the bat priv with all the mesh interface information * @hash: hash table containing the nc path * @src: ethernet source address - first half of the nc path search key * @dst: ethernet destination address - second half of the nc path search key * * Return: pointer to nc_path if the path was found or created, returns NULL * on error. */ static struct batadv_nc_path *batadv_nc_get_path(struct batadv_priv *bat_priv, struct batadv_hashtable *hash, u8 *src, u8 *dst) { int hash_added; struct batadv_nc_path *nc_path, nc_path_key; batadv_nc_hash_key_gen(&nc_path_key, src, dst); /* Search for existing nc_path */ nc_path = batadv_nc_hash_find(hash, (void *)&nc_path_key); if (nc_path) { /* Set timestamp to delay removal of nc_path */ nc_path->last_valid = jiffies; return nc_path; } /* No existing nc_path was found; create a new */ nc_path = kzalloc(sizeof(*nc_path), GFP_ATOMIC); if (!nc_path) return NULL; /* Initialize nc_path */ INIT_LIST_HEAD(&nc_path->packet_list); spin_lock_init(&nc_path->packet_list_lock); kref_init(&nc_path->refcount); nc_path->last_valid = jiffies; ether_addr_copy(nc_path->next_hop, dst); ether_addr_copy(nc_path->prev_hop, src); batadv_dbg(BATADV_DBG_NC, bat_priv, "Adding nc_path %pM -> %pM\n", nc_path->prev_hop, nc_path->next_hop); /* Add nc_path to hash table */ kref_get(&nc_path->refcount); hash_added = batadv_hash_add(hash, batadv_nc_hash_compare, batadv_nc_hash_choose, &nc_path_key, &nc_path->hash_entry); if (hash_added < 0) { kfree(nc_path); return NULL; } return nc_path; } /** * batadv_nc_random_weight_tq() - scale the receivers TQ-value to avoid unfair * selection of a receiver with slightly lower TQ than the other * @tq: to be weighted tq value * * Return: scaled tq value */ static u8 batadv_nc_random_weight_tq(u8 tq) { /* randomize the estimated packet loss (max TQ - estimated TQ) */ u8 rand_tq = get_random_u32_below(BATADV_TQ_MAX_VALUE + 1 - tq); /* convert to (randomized) estimated tq again */ return BATADV_TQ_MAX_VALUE - rand_tq; } /** * batadv_nc_memxor() - XOR destination with source * @dst: byte array to XOR into * @src: byte array to XOR from * @len: length of destination array */ static void batadv_nc_memxor(char *dst, const char *src, unsigned int len) { unsigned int i; for (i = 0; i < len; ++i) dst[i] ^= src[i]; } /** * batadv_nc_code_packets() - code a received unicast_packet with an nc packet * into a coded_packet and send it * @bat_priv: the bat priv with all the mesh interface information * @skb: data skb to forward * @ethhdr: pointer to the ethernet header inside the skb * @nc_packet: structure containing the packet to the skb can be coded with * @neigh_node: next hop to forward packet to * * Return: true if both packets are consumed, false otherwise. */ static bool batadv_nc_code_packets(struct batadv_priv *bat_priv, struct sk_buff *skb, struct ethhdr *ethhdr, struct batadv_nc_packet *nc_packet, struct batadv_neigh_node *neigh_node) { u8 tq_weighted_neigh, tq_weighted_coding, tq_tmp; struct sk_buff *skb_dest, *skb_src; struct batadv_unicast_packet *packet1; struct batadv_unicast_packet *packet2; struct batadv_coded_packet *coded_packet; struct batadv_neigh_node *neigh_tmp, *router_neigh, *first_dest; struct batadv_neigh_node *router_coding = NULL, *second_dest; struct batadv_neigh_ifinfo *router_neigh_ifinfo = NULL; struct batadv_neigh_ifinfo *router_coding_ifinfo = NULL; u8 *first_source, *second_source; __be32 packet_id1, packet_id2; size_t count; bool res = false; int coding_len; int unicast_size = sizeof(*packet1); int coded_size = sizeof(*coded_packet); int header_add = coded_size - unicast_size; /* TODO: do we need to consider the outgoing interface for * coded packets? */ router_neigh = batadv_orig_router_get(neigh_node->orig_node, BATADV_IF_DEFAULT); if (!router_neigh) goto out; router_neigh_ifinfo = batadv_neigh_ifinfo_get(router_neigh, BATADV_IF_DEFAULT); if (!router_neigh_ifinfo) goto out; neigh_tmp = nc_packet->neigh_node; router_coding = batadv_orig_router_get(neigh_tmp->orig_node, BATADV_IF_DEFAULT); if (!router_coding) goto out; router_coding_ifinfo = batadv_neigh_ifinfo_get(router_coding, BATADV_IF_DEFAULT); if (!router_coding_ifinfo) goto out; tq_tmp = router_neigh_ifinfo->bat_iv.tq_avg; tq_weighted_neigh = batadv_nc_random_weight_tq(tq_tmp); tq_tmp = router_coding_ifinfo->bat_iv.tq_avg; tq_weighted_coding = batadv_nc_random_weight_tq(tq_tmp); /* Select one destination for the MAC-header dst-field based on * weighted TQ-values. */ if (tq_weighted_neigh >= tq_weighted_coding) { /* Destination from nc_packet is selected for MAC-header */ first_dest = nc_packet->neigh_node; first_source = nc_packet->nc_path->prev_hop; second_dest = neigh_node; second_source = ethhdr->h_source; packet1 = (struct batadv_unicast_packet *)nc_packet->skb->data; packet2 = (struct batadv_unicast_packet *)skb->data; packet_id1 = nc_packet->packet_id; packet_id2 = batadv_skb_crc32(skb, skb->data + sizeof(*packet2)); } else { /* Destination for skb is selected for MAC-header */ first_dest = neigh_node; first_source = ethhdr->h_source; second_dest = nc_packet->neigh_node; second_source = nc_packet->nc_path->prev_hop; packet1 = (struct batadv_unicast_packet *)skb->data; packet2 = (struct batadv_unicast_packet *)nc_packet->skb->data; packet_id1 = batadv_skb_crc32(skb, skb->data + sizeof(*packet1)); packet_id2 = nc_packet->packet_id; } /* Instead of zero padding the smallest data buffer, we * code into the largest. */ if (skb->len <= nc_packet->skb->len) { skb_dest = nc_packet->skb; skb_src = skb; } else { skb_dest = skb; skb_src = nc_packet->skb; } /* coding_len is used when decoding the packet shorter packet */ coding_len = skb_src->len - unicast_size; if (skb_linearize(skb_dest) < 0 || skb_linearize(skb_src) < 0) goto out; skb_push(skb_dest, header_add); coded_packet = (struct batadv_coded_packet *)skb_dest->data; skb_reset_mac_header(skb_dest); coded_packet->packet_type = BATADV_CODED; coded_packet->version = BATADV_COMPAT_VERSION; coded_packet->ttl = packet1->ttl; /* Info about first unicast packet */ ether_addr_copy(coded_packet->first_source, first_source); ether_addr_copy(coded_packet->first_orig_dest, packet1->dest); coded_packet->first_crc = packet_id1; coded_packet->first_ttvn = packet1->ttvn; /* Info about second unicast packet */ ether_addr_copy(coded_packet->second_dest, second_dest->addr); ether_addr_copy(coded_packet->second_source, second_source); ether_addr_copy(coded_packet->second_orig_dest, packet2->dest); coded_packet->second_crc = packet_id2; coded_packet->second_ttl = packet2->ttl; coded_packet->second_ttvn = packet2->ttvn; coded_packet->coded_len = htons(coding_len); /* This is where the magic happens: Code skb_src into skb_dest */ batadv_nc_memxor(skb_dest->data + coded_size, skb_src->data + unicast_size, coding_len); /* Update counters accordingly */ if (BATADV_SKB_CB(skb_src)->decoded && BATADV_SKB_CB(skb_dest)->decoded) { /* Both packets are recoded */ count = skb_src->len + ETH_HLEN; count += skb_dest->len + ETH_HLEN; batadv_add_counter(bat_priv, BATADV_CNT_NC_RECODE, 2); batadv_add_counter(bat_priv, BATADV_CNT_NC_RECODE_BYTES, count); } else if (!BATADV_SKB_CB(skb_src)->decoded && !BATADV_SKB_CB(skb_dest)->decoded) { /* Both packets are newly coded */ count = skb_src->len + ETH_HLEN; count += skb_dest->len + ETH_HLEN; batadv_add_counter(bat_priv, BATADV_CNT_NC_CODE, 2); batadv_add_counter(bat_priv, BATADV_CNT_NC_CODE_BYTES, count); } else if (BATADV_SKB_CB(skb_src)->decoded && !BATADV_SKB_CB(skb_dest)->decoded) { /* skb_src recoded and skb_dest is newly coded */ batadv_inc_counter(bat_priv, BATADV_CNT_NC_RECODE); batadv_add_counter(bat_priv, BATADV_CNT_NC_RECODE_BYTES, skb_src->len + ETH_HLEN); batadv_inc_counter(bat_priv, BATADV_CNT_NC_CODE); batadv_add_counter(bat_priv, BATADV_CNT_NC_CODE_BYTES, skb_dest->len + ETH_HLEN); } else if (!BATADV_SKB_CB(skb_src)->decoded && BATADV_SKB_CB(skb_dest)->decoded) { /* skb_src is newly coded and skb_dest is recoded */ batadv_inc_counter(bat_priv, BATADV_CNT_NC_CODE); batadv_add_counter(bat_priv, BATADV_CNT_NC_CODE_BYTES, skb_src->len + ETH_HLEN); batadv_inc_counter(bat_priv, BATADV_CNT_NC_RECODE); batadv_add_counter(bat_priv, BATADV_CNT_NC_RECODE_BYTES, skb_dest->len + ETH_HLEN); } /* skb_src is now coded into skb_dest, so free it */ consume_skb(skb_src); /* avoid duplicate free of skb from nc_packet */ nc_packet->skb = NULL; batadv_nc_packet_free(nc_packet, false); /* Send the coded packet and return true */ batadv_send_unicast_skb(skb_dest, first_dest); res = true; out: batadv_neigh_node_put(router_neigh); batadv_neigh_node_put(router_coding); batadv_neigh_ifinfo_put(router_neigh_ifinfo); batadv_neigh_ifinfo_put(router_coding_ifinfo); return res; } /** * batadv_nc_skb_coding_possible() - true if a decoded skb is available at dst. * @skb: data skb to forward * @dst: destination mac address of the other skb to code with * @src: source mac address of skb * * Whenever we network code a packet we have to check whether we received it in * a network coded form. If so, we may not be able to use it for coding because * some neighbors may also have received (overheard) the packet in the network * coded form without being able to decode it. It is hard to know which of the * neighboring nodes was able to decode the packet, therefore we can only * re-code the packet if the source of the previous encoded packet is involved. * Since the source encoded the packet we can be certain it has all necessary * decode information. * * Return: true if coding of a decoded packet is allowed. */ static bool batadv_nc_skb_coding_possible(struct sk_buff *skb, u8 *dst, u8 *src) { if (BATADV_SKB_CB(skb)->decoded && !batadv_compare_eth(dst, src)) return false; return true; } /** * batadv_nc_path_search() - Find the coding path matching in_nc_node and * out_nc_node to retrieve a buffered packet that can be used for coding. * @bat_priv: the bat priv with all the mesh interface information * @in_nc_node: pointer to skb next hop's neighbor nc node * @out_nc_node: pointer to skb source's neighbor nc node * @skb: data skb to forward * @eth_dst: next hop mac address of skb * * Return: true if coding of a decoded skb is allowed. */ static struct batadv_nc_packet * batadv_nc_path_search(struct batadv_priv *bat_priv, struct batadv_nc_node *in_nc_node, struct batadv_nc_node *out_nc_node, struct sk_buff *skb, u8 *eth_dst) { struct batadv_nc_path *nc_path, nc_path_key; struct batadv_nc_packet *nc_packet_out = NULL; struct batadv_nc_packet *nc_packet, *nc_packet_tmp; struct batadv_hashtable *hash = bat_priv->nc.coding_hash; int idx; if (!hash) return NULL; /* Create almost path key */ batadv_nc_hash_key_gen(&nc_path_key, in_nc_node->addr, out_nc_node->addr); idx = batadv_nc_hash_choose(&nc_path_key, hash->size); /* Check for coding opportunities in this nc_path */ rcu_read_lock(); hlist_for_each_entry_rcu(nc_path, &hash->table[idx], hash_entry) { if (!batadv_compare_eth(nc_path->prev_hop, in_nc_node->addr)) continue; if (!batadv_compare_eth(nc_path->next_hop, out_nc_node->addr)) continue; spin_lock_bh(&nc_path->packet_list_lock); if (list_empty(&nc_path->packet_list)) { spin_unlock_bh(&nc_path->packet_list_lock); continue; } list_for_each_entry_safe(nc_packet, nc_packet_tmp, &nc_path->packet_list, list) { if (!batadv_nc_skb_coding_possible(nc_packet->skb, eth_dst, in_nc_node->addr)) continue; /* Coding opportunity is found! */ list_del(&nc_packet->list); nc_packet_out = nc_packet; break; } spin_unlock_bh(&nc_path->packet_list_lock); break; } rcu_read_unlock(); return nc_packet_out; } /** * batadv_nc_skb_src_search() - Loops through the list of neighboring nodes of * the skb's sender (may be equal to the originator). * @bat_priv: the bat priv with all the mesh interface information * @skb: data skb to forward * @eth_dst: next hop mac address of skb * @eth_src: source mac address of skb * @in_nc_node: pointer to skb next hop's neighbor nc node * * Return: an nc packet if a suitable coding packet was found, NULL otherwise. */ static struct batadv_nc_packet * batadv_nc_skb_src_search(struct batadv_priv *bat_priv, struct sk_buff *skb, u8 *eth_dst, u8 *eth_src, struct batadv_nc_node *in_nc_node) { struct batadv_orig_node *orig_node; struct batadv_nc_node *out_nc_node; struct batadv_nc_packet *nc_packet = NULL; orig_node = batadv_orig_hash_find(bat_priv, eth_src); if (!orig_node) return NULL; rcu_read_lock(); list_for_each_entry_rcu(out_nc_node, &orig_node->out_coding_list, list) { /* Check if the skb is decoded and if recoding is possible */ if (!batadv_nc_skb_coding_possible(skb, out_nc_node->addr, eth_src)) continue; /* Search for an opportunity in this nc_path */ nc_packet = batadv_nc_path_search(bat_priv, in_nc_node, out_nc_node, skb, eth_dst); if (nc_packet) break; } rcu_read_unlock(); batadv_orig_node_put(orig_node); return nc_packet; } /** * batadv_nc_skb_store_before_coding() - set the ethernet src and dst of the * unicast skb before it is stored for use in later decoding * @bat_priv: the bat priv with all the mesh interface information * @skb: data skb to store * @eth_dst_new: new destination mac address of skb */ static void batadv_nc_skb_store_before_coding(struct batadv_priv *bat_priv, struct sk_buff *skb, u8 *eth_dst_new) { struct ethhdr *ethhdr; /* Copy skb header to change the mac header */ skb = pskb_copy_for_clone(skb, GFP_ATOMIC); if (!skb) return; /* Set the mac header as if we actually sent the packet uncoded */ ethhdr = eth_hdr(skb); ether_addr_copy(ethhdr->h_source, ethhdr->h_dest); ether_addr_copy(ethhdr->h_dest, eth_dst_new); /* Set data pointer to MAC header to mimic packets from our tx path */ skb_push(skb, ETH_HLEN); /* Add the packet to the decoding packet pool */ batadv_nc_skb_store_for_decoding(bat_priv, skb); /* batadv_nc_skb_store_for_decoding() clones the skb, so we must free * our ref */ consume_skb(skb); } /** * batadv_nc_skb_dst_search() - Loops through list of neighboring nodes to dst. * @skb: data skb to forward * @neigh_node: next hop to forward packet to * @ethhdr: pointer to the ethernet header inside the skb * * Loops through the list of neighboring nodes the next hop has a good * connection to (receives OGMs with a sufficient quality). We need to find a * neighbor of our next hop that potentially sent a packet which our next hop * also received (overheard) and has stored for later decoding. * * Return: true if the skb was consumed (encoded packet sent) or false otherwise */ static bool batadv_nc_skb_dst_search(struct sk_buff *skb, struct batadv_neigh_node *neigh_node, struct ethhdr *ethhdr) { struct net_device *netdev = neigh_node->if_incoming->mesh_iface; struct batadv_priv *bat_priv = netdev_priv(netdev); struct batadv_orig_node *orig_node = neigh_node->orig_node; struct batadv_nc_node *nc_node; struct batadv_nc_packet *nc_packet = NULL; rcu_read_lock(); list_for_each_entry_rcu(nc_node, &orig_node->in_coding_list, list) { /* Search for coding opportunity with this in_nc_node */ nc_packet = batadv_nc_skb_src_search(bat_priv, skb, neigh_node->addr, ethhdr->h_source, nc_node); /* Opportunity was found, so stop searching */ if (nc_packet) break; } rcu_read_unlock(); if (!nc_packet) return false; /* Save packets for later decoding */ batadv_nc_skb_store_before_coding(bat_priv, skb, neigh_node->addr); batadv_nc_skb_store_before_coding(bat_priv, nc_packet->skb, nc_packet->neigh_node->addr); /* Code and send packets */ if (batadv_nc_code_packets(bat_priv, skb, ethhdr, nc_packet, neigh_node)) return true; /* out of mem ? Coding failed - we have to free the buffered packet * to avoid memleaks. The skb passed as argument will be dealt with * by the calling function. */ batadv_nc_send_packet(nc_packet); return false; } /** * batadv_nc_skb_add_to_path() - buffer skb for later encoding / decoding * @skb: skb to add to path * @nc_path: path to add skb to * @neigh_node: next hop to forward packet to * @packet_id: checksum to identify packet * * Return: true if the packet was buffered or false in case of an error. */ static bool batadv_nc_skb_add_to_path(struct sk_buff *skb, struct batadv_nc_path *nc_path, struct batadv_neigh_node *neigh_node, __be32 packet_id) { struct batadv_nc_packet *nc_packet; nc_packet = kzalloc(sizeof(*nc_packet), GFP_ATOMIC); if (!nc_packet) return false; /* Initialize nc_packet */ nc_packet->timestamp = jiffies; nc_packet->packet_id = packet_id; nc_packet->skb = skb; nc_packet->neigh_node = neigh_node; nc_packet->nc_path = nc_path; /* Add coding packet to list */ spin_lock_bh(&nc_path->packet_list_lock); list_add_tail(&nc_packet->list, &nc_path->packet_list); spin_unlock_bh(&nc_path->packet_list_lock); return true; } /** * batadv_nc_skb_forward() - try to code a packet or add it to the coding packet * buffer * @skb: data skb to forward * @neigh_node: next hop to forward packet to * * Return: true if the skb was consumed (encoded packet sent) or false otherwise */ bool batadv_nc_skb_forward(struct sk_buff *skb, struct batadv_neigh_node *neigh_node) { const struct net_device *netdev = neigh_node->if_incoming->mesh_iface; struct batadv_priv *bat_priv = netdev_priv(netdev); struct batadv_unicast_packet *packet; struct batadv_nc_path *nc_path; struct ethhdr *ethhdr = eth_hdr(skb); __be32 packet_id; u8 *payload; /* Check if network coding is enabled */ if (!atomic_read(&bat_priv->network_coding)) goto out; /* We only handle unicast packets */ payload = skb_network_header(skb); packet = (struct batadv_unicast_packet *)payload; if (packet->packet_type != BATADV_UNICAST) goto out; /* Try to find a coding opportunity and send the skb if one is found */ if (batadv_nc_skb_dst_search(skb, neigh_node, ethhdr)) return true; /* Find or create a nc_path for this src-dst pair */ nc_path = batadv_nc_get_path(bat_priv, bat_priv->nc.coding_hash, ethhdr->h_source, neigh_node->addr); if (!nc_path) goto out; /* Add skb to nc_path */ packet_id = batadv_skb_crc32(skb, payload + sizeof(*packet)); if (!batadv_nc_skb_add_to_path(skb, nc_path, neigh_node, packet_id)) goto free_nc_path; /* Packet is consumed */ return true; free_nc_path: batadv_nc_path_put(nc_path); out: /* Packet is not consumed */ return false; } /** * batadv_nc_skb_store_for_decoding() - save a clone of the skb which can be * used when decoding coded packets * @bat_priv: the bat priv with all the mesh interface information * @skb: data skb to store */ void batadv_nc_skb_store_for_decoding(struct batadv_priv *bat_priv, struct sk_buff *skb) { struct batadv_unicast_packet *packet; struct batadv_nc_path *nc_path; struct ethhdr *ethhdr = eth_hdr(skb); __be32 packet_id; u8 *payload; /* Check if network coding is enabled */ if (!atomic_read(&bat_priv->network_coding)) goto out; /* Check for supported packet type */ payload = skb_network_header(skb); packet = (struct batadv_unicast_packet *)payload; if (packet->packet_type != BATADV_UNICAST) goto out; /* Find existing nc_path or create a new */ nc_path = batadv_nc_get_path(bat_priv, bat_priv->nc.decoding_hash, ethhdr->h_source, ethhdr->h_dest); if (!nc_path) goto out; /* Clone skb and adjust skb->data to point at batman header */ skb = skb_clone(skb, GFP_ATOMIC); if (unlikely(!skb)) goto free_nc_path; if (unlikely(!pskb_may_pull(skb, ETH_HLEN))) goto free_skb; if (unlikely(!skb_pull_rcsum(skb, ETH_HLEN))) goto free_skb; /* Add skb to nc_path */ packet_id = batadv_skb_crc32(skb, payload + sizeof(*packet)); if (!batadv_nc_skb_add_to_path(skb, nc_path, NULL, packet_id)) goto free_skb; batadv_inc_counter(bat_priv, BATADV_CNT_NC_BUFFER); return; free_skb: kfree_skb(skb); free_nc_path: batadv_nc_path_put(nc_path); out: return; } /** * batadv_nc_skb_store_sniffed_unicast() - check if a received unicast packet * should be saved in the decoding buffer and, if so, store it there * @bat_priv: the bat priv with all the mesh interface information * @skb: unicast skb to store */ void batadv_nc_skb_store_sniffed_unicast(struct batadv_priv *bat_priv, struct sk_buff *skb) { struct ethhdr *ethhdr = eth_hdr(skb); if (batadv_is_my_mac(bat_priv, ethhdr->h_dest)) return; /* Set data pointer to MAC header to mimic packets from our tx path */ skb_push(skb, ETH_HLEN); batadv_nc_skb_store_for_decoding(bat_priv, skb); } /** * batadv_nc_skb_decode_packet() - decode given skb using the decode data stored * in nc_packet * @bat_priv: the bat priv with all the mesh interface information * @skb: unicast skb to decode * @nc_packet: decode data needed to decode the skb * * Return: pointer to decoded unicast packet if the packet was decoded or NULL * in case of an error. */ static struct batadv_unicast_packet * batadv_nc_skb_decode_packet(struct batadv_priv *bat_priv, struct sk_buff *skb, struct batadv_nc_packet *nc_packet) { const int h_size = sizeof(struct batadv_unicast_packet); const int h_diff = sizeof(struct batadv_coded_packet) - h_size; struct batadv_unicast_packet *unicast_packet; struct batadv_coded_packet coded_packet_tmp; struct ethhdr *ethhdr, ethhdr_tmp; u8 *orig_dest, ttl, ttvn; unsigned int coding_len; int err; /* Save headers temporarily */ memcpy(&coded_packet_tmp, skb->data, sizeof(coded_packet_tmp)); memcpy(&ethhdr_tmp, skb_mac_header(skb), sizeof(ethhdr_tmp)); if (skb_cow(skb, 0) < 0) return NULL; if (unlikely(!skb_pull_rcsum(skb, h_diff))) return NULL; /* Data points to batman header, so set mac header 14 bytes before * and network to data */ skb_set_mac_header(skb, -ETH_HLEN); skb_reset_network_header(skb); /* Reconstruct original mac header */ ethhdr = eth_hdr(skb); *ethhdr = ethhdr_tmp; /* Select the correct unicast header information based on the location * of our mac address in the coded_packet header */ if (batadv_is_my_mac(bat_priv, coded_packet_tmp.second_dest)) { /* If we are the second destination the packet was overheard, * so the Ethernet address must be copied to h_dest and * pkt_type changed from PACKET_OTHERHOST to PACKET_HOST */ ether_addr_copy(ethhdr->h_dest, coded_packet_tmp.second_dest); skb->pkt_type = PACKET_HOST; orig_dest = coded_packet_tmp.second_orig_dest; ttl = coded_packet_tmp.second_ttl; ttvn = coded_packet_tmp.second_ttvn; } else { orig_dest = coded_packet_tmp.first_orig_dest; ttl = coded_packet_tmp.ttl; ttvn = coded_packet_tmp.first_ttvn; } coding_len = ntohs(coded_packet_tmp.coded_len); if (coding_len > skb->len) return NULL; /* Here the magic is reversed: * extract the missing packet from the received coded packet */ batadv_nc_memxor(skb->data + h_size, nc_packet->skb->data + h_size, coding_len); /* Resize decoded skb if decoded with larger packet */ if (nc_packet->skb->len > coding_len + h_size) { err = pskb_trim_rcsum(skb, coding_len + h_size); if (err) return NULL; } /* Create decoded unicast packet */ unicast_packet = (struct batadv_unicast_packet *)skb->data; unicast_packet->packet_type = BATADV_UNICAST; unicast_packet->version = BATADV_COMPAT_VERSION; unicast_packet->ttl = ttl; ether_addr_copy(unicast_packet->dest, orig_dest); unicast_packet->ttvn = ttvn; batadv_nc_packet_free(nc_packet, false); return unicast_packet; } /** * batadv_nc_find_decoding_packet() - search through buffered decoding data to * find the data needed to decode the coded packet * @bat_priv: the bat priv with all the mesh interface information * @ethhdr: pointer to the ethernet header inside the coded packet * @coded: coded packet we try to find decode data for * * Return: pointer to nc packet if the needed data was found or NULL otherwise. */ static struct batadv_nc_packet * batadv_nc_find_decoding_packet(struct batadv_priv *bat_priv, struct ethhdr *ethhdr, struct batadv_coded_packet *coded) { struct batadv_hashtable *hash = bat_priv->nc.decoding_hash; struct batadv_nc_packet *tmp_nc_packet, *nc_packet = NULL; struct batadv_nc_path *nc_path, nc_path_key; u8 *dest, *source; __be32 packet_id; int index; if (!hash) return NULL; /* Select the correct packet id based on the location of our mac-addr */ dest = ethhdr->h_source; if (!batadv_is_my_mac(bat_priv, coded->second_dest)) { source = coded->second_source; packet_id = coded->second_crc; } else { source = coded->first_source; packet_id = coded->first_crc; } batadv_nc_hash_key_gen(&nc_path_key, source, dest); index = batadv_nc_hash_choose(&nc_path_key, hash->size); /* Search for matching coding path */ rcu_read_lock(); hlist_for_each_entry_rcu(nc_path, &hash->table[index], hash_entry) { /* Find matching nc_packet */ spin_lock_bh(&nc_path->packet_list_lock); list_for_each_entry(tmp_nc_packet, &nc_path->packet_list, list) { if (packet_id == tmp_nc_packet->packet_id) { list_del(&tmp_nc_packet->list); nc_packet = tmp_nc_packet; break; } } spin_unlock_bh(&nc_path->packet_list_lock); if (nc_packet) break; } rcu_read_unlock(); if (!nc_packet) batadv_dbg(BATADV_DBG_NC, bat_priv, "No decoding packet found for %u\n", packet_id); return nc_packet; } /** * batadv_nc_recv_coded_packet() - try to decode coded packet and enqueue the * resulting unicast packet * @skb: incoming coded packet * @recv_if: pointer to interface this packet was received on * * Return: NET_RX_SUCCESS if the packet has been consumed or NET_RX_DROP * otherwise. */ static int batadv_nc_recv_coded_packet(struct sk_buff *skb, struct batadv_hard_iface *recv_if) { struct batadv_priv *bat_priv = netdev_priv(recv_if->mesh_iface); struct batadv_unicast_packet *unicast_packet; struct batadv_coded_packet *coded_packet; struct batadv_nc_packet *nc_packet; struct ethhdr *ethhdr; int hdr_size = sizeof(*coded_packet); /* Check if network coding is enabled */ if (!atomic_read(&bat_priv->network_coding)) goto free_skb; /* Make sure we can access (and remove) header */ if (unlikely(!pskb_may_pull(skb, hdr_size))) goto free_skb; coded_packet = (struct batadv_coded_packet *)skb->data; ethhdr = eth_hdr(skb); /* Verify frame is destined for us */ if (!batadv_is_my_mac(bat_priv, ethhdr->h_dest) && !batadv_is_my_mac(bat_priv, coded_packet->second_dest)) goto free_skb; /* Update stat counter */ if (batadv_is_my_mac(bat_priv, coded_packet->second_dest)) batadv_inc_counter(bat_priv, BATADV_CNT_NC_SNIFFED); nc_packet = batadv_nc_find_decoding_packet(bat_priv, ethhdr, coded_packet); if (!nc_packet) { batadv_inc_counter(bat_priv, BATADV_CNT_NC_DECODE_FAILED); goto free_skb; } /* Make skb's linear, because decoding accesses the entire buffer */ if (skb_linearize(skb) < 0) goto free_nc_packet; if (skb_linearize(nc_packet->skb) < 0) goto free_nc_packet; /* Decode the packet */ unicast_packet = batadv_nc_skb_decode_packet(bat_priv, skb, nc_packet); if (!unicast_packet) { batadv_inc_counter(bat_priv, BATADV_CNT_NC_DECODE_FAILED); goto free_nc_packet; } /* Mark packet as decoded to do correct recoding when forwarding */ BATADV_SKB_CB(skb)->decoded = true; batadv_inc_counter(bat_priv, BATADV_CNT_NC_DECODE); batadv_add_counter(bat_priv, BATADV_CNT_NC_DECODE_BYTES, skb->len + ETH_HLEN); return batadv_recv_unicast_packet(skb, recv_if); free_nc_packet: batadv_nc_packet_free(nc_packet, true); free_skb: kfree_skb(skb); return NET_RX_DROP; } /** * batadv_nc_mesh_free() - clean up network coding memory * @bat_priv: the bat priv with all the mesh interface information */ void batadv_nc_mesh_free(struct batadv_priv *bat_priv) { batadv_tvlv_container_unregister(bat_priv, BATADV_TVLV_NC, 1); batadv_tvlv_handler_unregister(bat_priv, BATADV_TVLV_NC, 1); cancel_delayed_work_sync(&bat_priv->nc.work); batadv_nc_purge_paths(bat_priv, bat_priv->nc.coding_hash, NULL); batadv_hash_destroy(bat_priv->nc.coding_hash); batadv_nc_purge_paths(bat_priv, bat_priv->nc.decoding_hash, NULL); batadv_hash_destroy(bat_priv->nc.decoding_hash); }
507 510 21 21 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 // SPDX-License-Identifier: GPL-2.0 #include <linux/cache.h> #include <linux/sched.h> #include <linux/slab.h> #include <linux/pid_namespace.h> #include "internal.h" /* * /proc/self: */ static const char *proc_self_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *done) { struct pid_namespace *ns = proc_pid_ns(inode->i_sb); pid_t tgid = task_tgid_nr_ns(current, ns); char *name; if (!tgid) return ERR_PTR(-ENOENT); /* max length of unsigned int in decimal + NULL term */ name = kmalloc(10 + 1, dentry ? GFP_KERNEL : GFP_ATOMIC); if (unlikely(!name)) return dentry ? ERR_PTR(-ENOMEM) : ERR_PTR(-ECHILD); sprintf(name, "%u", tgid); set_delayed_call(done, kfree_link, name); return name; } static const struct inode_operations proc_self_inode_operations = { .get_link = proc_self_get_link, }; static unsigned self_inum __ro_after_init; int proc_setup_self(struct super_block *s) { struct inode *root_inode = d_inode(s->s_root); struct proc_fs_info *fs_info = proc_sb_info(s); struct dentry *self; int ret = -ENOMEM; inode_lock(root_inode); self = d_alloc_name(s->s_root, "self"); if (self) { struct inode *inode = new_inode(s); if (inode) { inode->i_ino = self_inum; simple_inode_init_ts(inode); inode->i_mode = S_IFLNK | S_IRWXUGO; inode->i_uid = GLOBAL_ROOT_UID; inode->i_gid = GLOBAL_ROOT_GID; inode->i_op = &proc_self_inode_operations; d_add(self, inode); ret = 0; } else { dput(self); } } inode_unlock(root_inode); if (ret) pr_err("proc_fill_super: can't allocate /proc/self\n"); else fs_info->proc_self = self; return ret; } void __init proc_self_init(void) { proc_alloc_inum(&self_inum); }
402 404 404 404 404 404 338 404 402 404 404 192 192 192 288 7093 6374 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 // SPDX-License-Identifier: GPL-2.0-only /* * mm/percpu-vm.c - vmalloc area based chunk allocation * * Copyright (C) 2010 SUSE Linux Products GmbH * Copyright (C) 2010 Tejun Heo <tj@kernel.org> * * Chunks are mapped into vmalloc areas and populated page by page. * This is the default chunk allocator. */ #include "internal.h" static struct page *pcpu_chunk_page(struct pcpu_chunk *chunk, unsigned int cpu, int page_idx) { /* must not be used on pre-mapped chunk */ WARN_ON(chunk->immutable); return vmalloc_to_page((void *)pcpu_chunk_addr(chunk, cpu, page_idx)); } /** * pcpu_get_pages - get temp pages array * * Returns pointer to array of pointers to struct page which can be indexed * with pcpu_page_idx(). Note that there is only one array and accesses * should be serialized by pcpu_alloc_mutex. * * RETURNS: * Pointer to temp pages array on success. */ static struct page **pcpu_get_pages(void) { static struct page **pages; size_t pages_size = pcpu_nr_units * pcpu_unit_pages * sizeof(pages[0]); lockdep_assert_held(&pcpu_alloc_mutex); if (!pages) pages = pcpu_mem_zalloc(pages_size, GFP_KERNEL); return pages; } /** * pcpu_free_pages - free pages which were allocated for @chunk * @chunk: chunk pages were allocated for * @pages: array of pages to be freed, indexed by pcpu_page_idx() * @page_start: page index of the first page to be freed * @page_end: page index of the last page to be freed + 1 * * Free pages [@page_start and @page_end) in @pages for all units. * The pages were allocated for @chunk. */ static void pcpu_free_pages(struct pcpu_chunk *chunk, struct page **pages, int page_start, int page_end) { unsigned int cpu; int i; for_each_possible_cpu(cpu) { for (i = page_start; i < page_end; i++) { struct page *page = pages[pcpu_page_idx(cpu, i)]; if (page) __free_page(page); } } } /** * pcpu_alloc_pages - allocates pages for @chunk * @chunk: target chunk * @pages: array to put the allocated pages into, indexed by pcpu_page_idx() * @page_start: page index of the first page to be allocated * @page_end: page index of the last page to be allocated + 1 * @gfp: allocation flags passed to the underlying allocator * * Allocate pages [@page_start,@page_end) into @pages for all units. * The allocation is for @chunk. Percpu core doesn't care about the * content of @pages and will pass it verbatim to pcpu_map_pages(). */ static int pcpu_alloc_pages(struct pcpu_chunk *chunk, struct page **pages, int page_start, int page_end, gfp_t gfp) { unsigned int cpu, tcpu; int i; gfp |= __GFP_HIGHMEM; for_each_possible_cpu(cpu) { for (i = page_start; i < page_end; i++) { struct page **pagep = &pages[pcpu_page_idx(cpu, i)]; *pagep = alloc_pages_node(cpu_to_node(cpu), gfp, 0); if (!*pagep) goto err; } } return 0; err: while (--i >= page_start) __free_page(pages[pcpu_page_idx(cpu, i)]); for_each_possible_cpu(tcpu) { if (tcpu == cpu) break; for (i = page_start; i < page_end; i++) __free_page(pages[pcpu_page_idx(tcpu, i)]); } return -ENOMEM; } /** * pcpu_pre_unmap_flush - flush cache prior to unmapping * @chunk: chunk the regions to be flushed belongs to * @page_start: page index of the first page to be flushed * @page_end: page index of the last page to be flushed + 1 * * Pages in [@page_start,@page_end) of @chunk are about to be * unmapped. Flush cache. As each flushing trial can be very * expensive, issue flush on the whole region at once rather than * doing it for each cpu. This could be an overkill but is more * scalable. */ static void pcpu_pre_unmap_flush(struct pcpu_chunk *chunk, int page_start, int page_end) { flush_cache_vunmap( pcpu_chunk_addr(chunk, pcpu_low_unit_cpu, page_start), pcpu_chunk_addr(chunk, pcpu_high_unit_cpu, page_end)); } static void __pcpu_unmap_pages(unsigned long addr, int nr_pages) { vunmap_range_noflush(addr, addr + (nr_pages << PAGE_SHIFT)); } /** * pcpu_unmap_pages - unmap pages out of a pcpu_chunk * @chunk: chunk of interest * @pages: pages array which can be used to pass information to free * @page_start: page index of the first page to unmap * @page_end: page index of the last page to unmap + 1 * * For each cpu, unmap pages [@page_start,@page_end) out of @chunk. * Corresponding elements in @pages were cleared by the caller and can * be used to carry information to pcpu_free_pages() which will be * called after all unmaps are finished. The caller should call * proper pre/post flush functions. */ static void pcpu_unmap_pages(struct pcpu_chunk *chunk, struct page **pages, int page_start, int page_end) { unsigned int cpu; int i; for_each_possible_cpu(cpu) { for (i = page_start; i < page_end; i++) { struct page *page; page = pcpu_chunk_page(chunk, cpu, i); WARN_ON(!page); pages[pcpu_page_idx(cpu, i)] = page; } __pcpu_unmap_pages(pcpu_chunk_addr(chunk, cpu, page_start), page_end - page_start); } } /** * pcpu_post_unmap_tlb_flush - flush TLB after unmapping * @chunk: pcpu_chunk the regions to be flushed belong to * @page_start: page index of the first page to be flushed * @page_end: page index of the last page to be flushed + 1 * * Pages [@page_start,@page_end) of @chunk have been unmapped. Flush * TLB for the regions. This can be skipped if the area is to be * returned to vmalloc as vmalloc will handle TLB flushing lazily. * * As with pcpu_pre_unmap_flush(), TLB flushing also is done at once * for the whole region. */ static void pcpu_post_unmap_tlb_flush(struct pcpu_chunk *chunk, int page_start, int page_end) { flush_tlb_kernel_range( pcpu_chunk_addr(chunk, pcpu_low_unit_cpu, page_start), pcpu_chunk_addr(chunk, pcpu_high_unit_cpu, page_end)); } static int __pcpu_map_pages(unsigned long addr, struct page **pages, int nr_pages) { return vmap_pages_range_noflush(addr, addr + (nr_pages << PAGE_SHIFT), PAGE_KERNEL, pages, PAGE_SHIFT); } /** * pcpu_map_pages - map pages into a pcpu_chunk * @chunk: chunk of interest * @pages: pages array containing pages to be mapped * @page_start: page index of the first page to map * @page_end: page index of the last page to map + 1 * * For each cpu, map pages [@page_start,@page_end) into @chunk. The * caller is responsible for calling pcpu_post_map_flush() after all * mappings are complete. * * This function is responsible for setting up whatever is necessary for * reverse lookup (addr -> chunk). */ static int pcpu_map_pages(struct pcpu_chunk *chunk, struct page **pages, int page_start, int page_end) { unsigned int cpu, tcpu; int i, err; for_each_possible_cpu(cpu) { err = __pcpu_map_pages(pcpu_chunk_addr(chunk, cpu, page_start), &pages[pcpu_page_idx(cpu, page_start)], page_end - page_start); if (err < 0) goto err; for (i = page_start; i < page_end; i++) pcpu_set_page_chunk(pages[pcpu_page_idx(cpu, i)], chunk); } return 0; err: for_each_possible_cpu(tcpu) { __pcpu_unmap_pages(pcpu_chunk_addr(chunk, tcpu, page_start), page_end - page_start); if (tcpu == cpu) break; } pcpu_post_unmap_tlb_flush(chunk, page_start, page_end); return err; } /** * pcpu_post_map_flush - flush cache after mapping * @chunk: pcpu_chunk the regions to be flushed belong to * @page_start: page index of the first page to be flushed * @page_end: page index of the last page to be flushed + 1 * * Pages [@page_start,@page_end) of @chunk have been mapped. Flush * cache. * * As with pcpu_pre_unmap_flush(), TLB flushing also is done at once * for the whole region. */ static void pcpu_post_map_flush(struct pcpu_chunk *chunk, int page_start, int page_end) { flush_cache_vmap( pcpu_chunk_addr(chunk, pcpu_low_unit_cpu, page_start), pcpu_chunk_addr(chunk, pcpu_high_unit_cpu, page_end)); } /** * pcpu_populate_chunk - populate and map an area of a pcpu_chunk * @chunk: chunk of interest * @page_start: the start page * @page_end: the end page * @gfp: allocation flags passed to the underlying memory allocator * * For each cpu, populate and map pages [@page_start,@page_end) into * @chunk. * * CONTEXT: * pcpu_alloc_mutex, does GFP_KERNEL allocation. */ static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int page_start, int page_end, gfp_t gfp) { struct page **pages; pages = pcpu_get_pages(); if (!pages) return -ENOMEM; if (pcpu_alloc_pages(chunk, pages, page_start, page_end, gfp)) return -ENOMEM; if (pcpu_map_pages(chunk, pages, page_start, page_end)) { pcpu_free_pages(chunk, pages, page_start, page_end); return -ENOMEM; } pcpu_post_map_flush(chunk, page_start, page_end); return 0; } /** * pcpu_depopulate_chunk - depopulate and unmap an area of a pcpu_chunk * @chunk: chunk to depopulate * @page_start: the start page * @page_end: the end page * * For each cpu, depopulate and unmap pages [@page_start,@page_end) * from @chunk. * * Caller is required to call pcpu_post_unmap_tlb_flush() if not returning the * region back to vmalloc() which will lazily flush the tlb. * * CONTEXT: * pcpu_alloc_mutex. */ static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int page_start, int page_end) { struct page **pages; /* * If control reaches here, there must have been at least one * successful population attempt so the temp pages array must * be available now. */ pages = pcpu_get_pages(); BUG_ON(!pages); /* unmap and free */ pcpu_pre_unmap_flush(chunk, page_start, page_end); pcpu_unmap_pages(chunk, pages, page_start, page_end); pcpu_free_pages(chunk, pages, page_start, page_end); } static struct pcpu_chunk *pcpu_create_chunk(gfp_t gfp) { struct pcpu_chunk *chunk; struct vm_struct **vms; chunk = pcpu_alloc_chunk(gfp); if (!chunk) return NULL; vms = pcpu_get_vm_areas(pcpu_group_offsets, pcpu_group_sizes, pcpu_nr_groups, pcpu_atom_size); if (!vms) { pcpu_free_chunk(chunk); return NULL; } chunk->data = vms; chunk->base_addr = vms[0]->addr - pcpu_group_offsets[0]; pcpu_stats_chunk_alloc(); trace_percpu_create_chunk(chunk->base_addr); return chunk; } static void pcpu_destroy_chunk(struct pcpu_chunk *chunk) { if (!chunk) return; pcpu_stats_chunk_dealloc(); trace_percpu_destroy_chunk(chunk->base_addr); if (chunk->data) pcpu_free_vm_areas(chunk->data, pcpu_nr_groups); pcpu_free_chunk(chunk); } static struct page *pcpu_addr_to_page(void *addr) { return vmalloc_to_page(addr); } static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai) { /* no extra restriction */ return 0; } /** * pcpu_should_reclaim_chunk - determine if a chunk should go into reclaim * @chunk: chunk of interest * * This is the entry point for percpu reclaim. If a chunk qualifies, it is then * isolated and managed in separate lists at the back of pcpu_slot: sidelined * and to_depopulate respectively. The to_depopulate list holds chunks slated * for depopulation. They no longer contribute to pcpu_nr_empty_pop_pages once * they are on this list. Once depopulated, they are moved onto the sidelined * list which enables them to be pulled back in for allocation if no other chunk * can suffice the allocation. */ static bool pcpu_should_reclaim_chunk(struct pcpu_chunk *chunk) { /* do not reclaim either the first chunk or reserved chunk */ if (chunk == pcpu_first_chunk || chunk == pcpu_reserved_chunk) return false; /* * If it is isolated, it may be on the sidelined list so move it back to * the to_depopulate list. If we hit at least 1/4 pages empty pages AND * there is no system-wide shortage of empty pages aside from this * chunk, move it to the to_depopulate list. */ return ((chunk->isolated && chunk->nr_empty_pop_pages) || (pcpu_nr_empty_pop_pages > (PCPU_EMPTY_POP_PAGES_HIGH + chunk->nr_empty_pop_pages) && chunk->nr_empty_pop_pages >= chunk->nr_pages / 4)); }
74 74 60 61 95 94 88 67 76 74 74 180 181 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. */ #include "peerlookup.h" #include "peer.h" #include "noise.h" static struct hlist_head *pubkey_bucket(struct pubkey_hashtable *table, const u8 pubkey[NOISE_PUBLIC_KEY_LEN]) { /* siphash gives us a secure 64bit number based on a random key. Since * the bits are uniformly distributed, we can then mask off to get the * bits we need. */ const u64 hash = siphash(pubkey, NOISE_PUBLIC_KEY_LEN, &table->key); return &table->hashtable[hash & (HASH_SIZE(table->hashtable) - 1)]; } struct pubkey_hashtable *wg_pubkey_hashtable_alloc(void) { struct pubkey_hashtable *table = kvmalloc(sizeof(*table), GFP_KERNEL); if (!table) return NULL; get_random_bytes(&table->key, sizeof(table->key)); hash_init(table->hashtable); mutex_init(&table->lock); return table; } void wg_pubkey_hashtable_add(struct pubkey_hashtable *table, struct wg_peer *peer) { mutex_lock(&table->lock); hlist_add_head_rcu(&peer->pubkey_hash, pubkey_bucket(table, peer->handshake.remote_static)); mutex_unlock(&table->lock); } void wg_pubkey_hashtable_remove(struct pubkey_hashtable *table, struct wg_peer *peer) { mutex_lock(&table->lock); hlist_del_init_rcu(&peer->pubkey_hash); mutex_unlock(&table->lock); } /* Returns a strong reference to a peer */ struct wg_peer * wg_pubkey_hashtable_lookup(struct pubkey_hashtable *table, const u8 pubkey[NOISE_PUBLIC_KEY_LEN]) { struct wg_peer *iter_peer, *peer = NULL; rcu_read_lock_bh(); hlist_for_each_entry_rcu_bh(iter_peer, pubkey_bucket(table, pubkey), pubkey_hash) { if (!memcmp(pubkey, iter_peer->handshake.remote_static, NOISE_PUBLIC_KEY_LEN)) { peer = iter_peer; break; } } peer = wg_peer_get_maybe_zero(peer); rcu_read_unlock_bh(); return peer; } static struct hlist_head *index_bucket(struct index_hashtable *table, const __le32 index) { /* Since the indices are random and thus all bits are uniformly * distributed, we can find its bucket simply by masking. */ return &table->hashtable[(__force u32)index & (HASH_SIZE(table->hashtable) - 1)]; } struct index_hashtable *wg_index_hashtable_alloc(void) { struct index_hashtable *table = kvmalloc(sizeof(*table), GFP_KERNEL); if (!table) return NULL; hash_init(table->hashtable); spin_lock_init(&table->lock); return table; } /* At the moment, we limit ourselves to 2^20 total peers, which generally might * amount to 2^20*3 items in this hashtable. The algorithm below works by * picking a random number and testing it. We can see that these limits mean we * usually succeed pretty quickly: * * >>> def calculation(tries, size): * ... return (size / 2**32)**(tries - 1) * (1 - (size / 2**32)) * ... * >>> calculation(1, 2**20 * 3) * 0.999267578125 * >>> calculation(2, 2**20 * 3) * 0.0007318854331970215 * >>> calculation(3, 2**20 * 3) * 5.360489012673497e-07 * >>> calculation(4, 2**20 * 3) * 3.9261394135792216e-10 * * At the moment, we don't do any masking, so this algorithm isn't exactly * constant time in either the random guessing or in the hash list lookup. We * could require a minimum of 3 tries, which would successfully mask the * guessing. this would not, however, help with the growing hash lengths, which * is another thing to consider moving forward. */ __le32 wg_index_hashtable_insert(struct index_hashtable *table, struct index_hashtable_entry *entry) { struct index_hashtable_entry *existing_entry; spin_lock_bh(&table->lock); hlist_del_init_rcu(&entry->index_hash); spin_unlock_bh(&table->lock); rcu_read_lock_bh(); search_unused_slot: /* First we try to find an unused slot, randomly, while unlocked. */ entry->index = (__force __le32)get_random_u32(); hlist_for_each_entry_rcu_bh(existing_entry, index_bucket(table, entry->index), index_hash) { if (existing_entry->index == entry->index) /* If it's already in use, we continue searching. */ goto search_unused_slot; } /* Once we've found an unused slot, we lock it, and then double-check * that nobody else stole it from us. */ spin_lock_bh(&table->lock); hlist_for_each_entry_rcu_bh(existing_entry, index_bucket(table, entry->index), index_hash) { if (existing_entry->index == entry->index) { spin_unlock_bh(&table->lock); /* If it was stolen, we start over. */ goto search_unused_slot; } } /* Otherwise, we know we have it exclusively (since we're locked), * so we insert. */ hlist_add_head_rcu(&entry->index_hash, index_bucket(table, entry->index)); spin_unlock_bh(&table->lock); rcu_read_unlock_bh(); return entry->index; } bool wg_index_hashtable_replace(struct index_hashtable *table, struct index_hashtable_entry *old, struct index_hashtable_entry *new) { bool ret; spin_lock_bh(&table->lock); ret = !hlist_unhashed(&old->index_hash); if (unlikely(!ret)) goto out; new->index = old->index; hlist_replace_rcu(&old->index_hash, &new->index_hash); /* Calling init here NULLs out index_hash, and in fact after this * function returns, it's theoretically possible for this to get * reinserted elsewhere. That means the RCU lookup below might either * terminate early or jump between buckets, in which case the packet * simply gets dropped, which isn't terrible. */ INIT_HLIST_NODE(&old->index_hash); out: spin_unlock_bh(&table->lock); return ret; } void wg_index_hashtable_remove(struct index_hashtable *table, struct index_hashtable_entry *entry) { spin_lock_bh(&table->lock); hlist_del_init_rcu(&entry->index_hash); spin_unlock_bh(&table->lock); } /* Returns a strong reference to a entry->peer */ struct index_hashtable_entry * wg_index_hashtable_lookup(struct index_hashtable *table, const enum index_hashtable_type type_mask, const __le32 index, struct wg_peer **peer) { struct index_hashtable_entry *iter_entry, *entry = NULL; rcu_read_lock_bh(); hlist_for_each_entry_rcu_bh(iter_entry, index_bucket(table, index), index_hash) { if (iter_entry->index == index) { if (likely(iter_entry->type & type_mask)) entry = iter_entry; break; } } if (likely(entry)) { entry->peer = wg_peer_get_maybe_zero(entry->peer); if (likely(entry->peer)) *peer = entry->peer; else entry = NULL; } rcu_read_unlock_bh(); return entry; }
33 32 34 33 2 32 53 8 10 3 2 11 6 11 18 9 1 4 11 11 11 10 1 35 1 1 5 4 2 8 7 15 8 5 8 8 8 8 45 22 7 40 25 2 14 39 44 45 28 26 2 27 27 61 73 61 1 57 14 20 23 3 15 5 34 34 21 56 56 57 39 1 12 21 5 37 39 53 53 16 20 1 24 20 17 27 24 21 43 7 17 19 6 37 6 36 43 53 38 10 2 26 32 31 8 9 1 11 18 13 16 20 9 29 6 20 6 21 5 22 28 31 18 5 1 13 26 2 16 8 15 9 26 17 1 8 8 7 8 17 2221 62 18 16 20 22 19 14 31 8 23 5 1 1 2 1 22 1 20 3 30 9 2 41 41 51 35 35 41 6 35 114 2 19 3 91 20 45 47 89 103 4 20 88 100 1 7 58 1 4 52 65 1 1 1 1 1 9 77 23 1 52 74 1 74 8 73 68 4 72 49 8 60 14 46 28 28 18 33 46 13 33 9 37 44 44 3 4 1 2 7 46 46 10 36 1 2 1 69 68 3 10 3 51 4 3 3 2 3 6 4 5 5 5 8 3 10 35 23 2 2 1 1 1 1 1 6 1 1 1 2 2 1 3 1 2 2 542 26 21 1 1 3 2 1 1 1 1 5 29 3 3 1 263 1 2 5 2 1 2 1 1 1 2 1 1 1 1 1 1 1 66 1 2 2 1 2 1 2 1 1 1 2 2 1 4 1 10 2 2 1 2 2 1 1 1 1 2 1 6 6 1 1 2 1 2 1 1 1 1 1 1 1 4 1 43 43 43 43 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 // SPDX-License-Identifier: GPL-2.0 /* * linux/kernel/sys.c * * Copyright (C) 1991, 1992 Linus Torvalds */ #include <linux/export.h> #include <linux/mm.h> #include <linux/mm_inline.h> #include <linux/utsname.h> #include <linux/mman.h> #include <linux/reboot.h> #include <linux/prctl.h> #include <linux/highuid.h> #include <linux/fs.h> #include <linux/kmod.h> #include <linux/ksm.h> #include <linux/perf_event.h> #include <linux/resource.h> #include <linux/kernel.h> #include <linux/workqueue.h> #include <linux/capability.h> #include <linux/device.h> #include <linux/key.h> #include <linux/times.h> #include <linux/posix-timers.h> #include <linux/security.h> #include <linux/random.h> #include <linux/suspend.h> #include <linux/tty.h> #include <linux/signal.h> #include <linux/cn_proc.h> #include <linux/getcpu.h> #include <linux/task_io_accounting_ops.h> #include <linux/seccomp.h> #include <linux/cpu.h> #include <linux/personality.h> #include <linux/ptrace.h> #include <linux/fs_struct.h> #include <linux/file.h> #include <linux/mount.h> #include <linux/gfp.h> #include <linux/syscore_ops.h> #include <linux/version.h> #include <linux/ctype.h> #include <linux/syscall_user_dispatch.h> #include <linux/compat.h> #include <linux/syscalls.h> #include <linux/kprobes.h> #include <linux/user_namespace.h> #include <linux/time_namespace.h> #include <linux/binfmts.h> #include <linux/sched.h> #include <linux/sched/autogroup.h> #include <linux/sched/loadavg.h> #include <linux/sched/stat.h> #include <linux/sched/mm.h> #include <linux/sched/coredump.h> #include <linux/sched/task.h> #include <linux/sched/cputime.h> #include <linux/rcupdate.h> #include <linux/uidgid.h> #include <linux/cred.h> #include <linux/nospec.h> #include <linux/kmsg_dump.h> /* Move somewhere else to avoid recompiling? */ #include <generated/utsrelease.h> #include <linux/uaccess.h> #include <asm/io.h> #include <asm/unistd.h> #include <trace/events/task.h> #include "uid16.h" #ifndef SET_UNALIGN_CTL # define SET_UNALIGN_CTL(a, b) (-EINVAL) #endif #ifndef GET_UNALIGN_CTL # define GET_UNALIGN_CTL(a, b) (-EINVAL) #endif #ifndef SET_FPEMU_CTL # define SET_FPEMU_CTL(a, b) (-EINVAL) #endif #ifndef GET_FPEMU_CTL # define GET_FPEMU_CTL(a, b) (-EINVAL) #endif #ifndef SET_FPEXC_CTL # define SET_FPEXC_CTL(a, b) (-EINVAL) #endif #ifndef GET_FPEXC_CTL # define GET_FPEXC_CTL(a, b) (-EINVAL) #endif #ifndef GET_ENDIAN # define GET_ENDIAN(a, b) (-EINVAL) #endif #ifndef SET_ENDIAN # define SET_ENDIAN(a, b) (-EINVAL) #endif #ifndef GET_TSC_CTL # define GET_TSC_CTL(a) (-EINVAL) #endif #ifndef SET_TSC_CTL # define SET_TSC_CTL(a) (-EINVAL) #endif #ifndef GET_FP_MODE # define GET_FP_MODE(a) (-EINVAL) #endif #ifndef SET_FP_MODE # define SET_FP_MODE(a,b) (-EINVAL) #endif #ifndef SVE_SET_VL # define SVE_SET_VL(a) (-EINVAL) #endif #ifndef SVE_GET_VL # define SVE_GET_VL() (-EINVAL) #endif #ifndef SME_SET_VL # define SME_SET_VL(a) (-EINVAL) #endif #ifndef SME_GET_VL # define SME_GET_VL() (-EINVAL) #endif #ifndef PAC_RESET_KEYS # define PAC_RESET_KEYS(a, b) (-EINVAL) #endif #ifndef PAC_SET_ENABLED_KEYS # define PAC_SET_ENABLED_KEYS(a, b, c) (-EINVAL) #endif #ifndef PAC_GET_ENABLED_KEYS # define PAC_GET_ENABLED_KEYS(a) (-EINVAL) #endif #ifndef SET_TAGGED_ADDR_CTRL # define SET_TAGGED_ADDR_CTRL(a) (-EINVAL) #endif #ifndef GET_TAGGED_ADDR_CTRL # define GET_TAGGED_ADDR_CTRL() (-EINVAL) #endif #ifndef RISCV_V_SET_CONTROL # define RISCV_V_SET_CONTROL(a) (-EINVAL) #endif #ifndef RISCV_V_GET_CONTROL # define RISCV_V_GET_CONTROL() (-EINVAL) #endif #ifndef RISCV_SET_ICACHE_FLUSH_CTX # define RISCV_SET_ICACHE_FLUSH_CTX(a, b) (-EINVAL) #endif #ifndef PPC_GET_DEXCR_ASPECT # define PPC_GET_DEXCR_ASPECT(a, b) (-EINVAL) #endif #ifndef PPC_SET_DEXCR_ASPECT # define PPC_SET_DEXCR_ASPECT(a, b, c) (-EINVAL) #endif /* * this is where the system-wide overflow UID and GID are defined, for * architectures that now have 32-bit UID/GID but didn't in the past */ int overflowuid = DEFAULT_OVERFLOWUID; int overflowgid = DEFAULT_OVERFLOWGID; EXPORT_SYMBOL(overflowuid); EXPORT_SYMBOL(overflowgid); /* * the same as above, but for filesystems which can only store a 16-bit * UID and GID. as such, this is needed on all architectures */ int fs_overflowuid = DEFAULT_FS_OVERFLOWUID; int fs_overflowgid = DEFAULT_FS_OVERFLOWGID; EXPORT_SYMBOL(fs_overflowuid); EXPORT_SYMBOL(fs_overflowgid); /* * Returns true if current's euid is same as p's uid or euid, * or has CAP_SYS_NICE to p's user_ns. * * Called with rcu_read_lock, creds are safe */ static bool set_one_prio_perm(struct task_struct *p) { const struct cred *cred = current_cred(), *pcred = __task_cred(p); if (uid_eq(pcred->uid, cred->euid) || uid_eq(pcred->euid, cred->euid)) return true; if (ns_capable(pcred->user_ns, CAP_SYS_NICE)) return true; return false; } /* * set the priority of a task * - the caller must hold the RCU read lock */ static int set_one_prio(struct task_struct *p, int niceval, int error) { int no_nice; if (!set_one_prio_perm(p)) { error = -EPERM; goto out; } if (niceval < task_nice(p) && !can_nice(p, niceval)) { error = -EACCES; goto out; } no_nice = security_task_setnice(p, niceval); if (no_nice) { error = no_nice; goto out; } if (error == -ESRCH) error = 0; set_user_nice(p, niceval); out: return error; } SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval) { struct task_struct *g, *p; struct user_struct *user; const struct cred *cred = current_cred(); int error = -EINVAL; struct pid *pgrp; kuid_t uid; if (which > PRIO_USER || which < PRIO_PROCESS) goto out; /* normalize: avoid signed division (rounding problems) */ error = -ESRCH; if (niceval < MIN_NICE) niceval = MIN_NICE; if (niceval > MAX_NICE) niceval = MAX_NICE; rcu_read_lock(); switch (which) { case PRIO_PROCESS: if (who) p = find_task_by_vpid(who); else p = current; if (p) error = set_one_prio(p, niceval, error); break; case PRIO_PGRP: if (who) pgrp = find_vpid(who); else pgrp = task_pgrp(current); read_lock(&tasklist_lock); do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { error = set_one_prio(p, niceval, error); } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); read_unlock(&tasklist_lock); break; case PRIO_USER: uid = make_kuid(cred->user_ns, who); user = cred->user; if (!who) uid = cred->uid; else if (!uid_eq(uid, cred->uid)) { user = find_user(uid); if (!user) goto out_unlock; /* No processes for this user */ } for_each_process_thread(g, p) { if (uid_eq(task_uid(p), uid) && task_pid_vnr(p)) error = set_one_prio(p, niceval, error); } if (!uid_eq(uid, cred->uid)) free_uid(user); /* For find_user() */ break; } out_unlock: rcu_read_unlock(); out: return error; } /* * Ugh. To avoid negative return values, "getpriority()" will * not return the normal nice-value, but a negated value that * has been offset by 20 (ie it returns 40..1 instead of -20..19) * to stay compatible. */ SYSCALL_DEFINE2(getpriority, int, which, int, who) { struct task_struct *g, *p; struct user_struct *user; const struct cred *cred = current_cred(); long niceval, retval = -ESRCH; struct pid *pgrp; kuid_t uid; if (which > PRIO_USER || which < PRIO_PROCESS) return -EINVAL; rcu_read_lock(); switch (which) { case PRIO_PROCESS: if (who) p = find_task_by_vpid(who); else p = current; if (p) { niceval = nice_to_rlimit(task_nice(p)); if (niceval > retval) retval = niceval; } break; case PRIO_PGRP: if (who) pgrp = find_vpid(who); else pgrp = task_pgrp(current); read_lock(&tasklist_lock); do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { niceval = nice_to_rlimit(task_nice(p)); if (niceval > retval) retval = niceval; } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); read_unlock(&tasklist_lock); break; case PRIO_USER: uid = make_kuid(cred->user_ns, who); user = cred->user; if (!who) uid = cred->uid; else if (!uid_eq(uid, cred->uid)) { user = find_user(uid); if (!user) goto out_unlock; /* No processes for this user */ } for_each_process_thread(g, p) { if (uid_eq(task_uid(p), uid) && task_pid_vnr(p)) { niceval = nice_to_rlimit(task_nice(p)); if (niceval > retval) retval = niceval; } } if (!uid_eq(uid, cred->uid)) free_uid(user); /* for find_user() */ break; } out_unlock: rcu_read_unlock(); return retval; } /* * Unprivileged users may change the real gid to the effective gid * or vice versa. (BSD-style) * * If you set the real gid at all, or set the effective gid to a value not * equal to the real gid, then the saved gid is set to the new effective gid. * * This makes it possible for a setgid program to completely drop its * privileges, which is often a useful assertion to make when you are doing * a security audit over a program. * * The general idea is that a program which uses just setregid() will be * 100% compatible with BSD. A program which uses just setgid() will be * 100% compatible with POSIX with saved IDs. * * SMP: There are not races, the GIDs are checked only by filesystem * operations (as far as semantic preservation is concerned). */ #ifdef CONFIG_MULTIUSER long __sys_setregid(gid_t rgid, gid_t egid) { struct user_namespace *ns = current_user_ns(); const struct cred *old; struct cred *new; int retval; kgid_t krgid, kegid; krgid = make_kgid(ns, rgid); kegid = make_kgid(ns, egid); if ((rgid != (gid_t) -1) && !gid_valid(krgid)) return -EINVAL; if ((egid != (gid_t) -1) && !gid_valid(kegid)) return -EINVAL; new = prepare_creds(); if (!new) return -ENOMEM; old = current_cred(); retval = -EPERM; if (rgid != (gid_t) -1) { if (gid_eq(old->gid, krgid) || gid_eq(old->egid, krgid) || ns_capable_setid(old->user_ns, CAP_SETGID)) new->gid = krgid; else goto error; } if (egid != (gid_t) -1) { if (gid_eq(old->gid, kegid) || gid_eq(old->egid, kegid) || gid_eq(old->sgid, kegid) || ns_capable_setid(old->user_ns, CAP_SETGID)) new->egid = kegid; else goto error; } if (rgid != (gid_t) -1 || (egid != (gid_t) -1 && !gid_eq(kegid, old->gid))) new->sgid = new->egid; new->fsgid = new->egid; retval = security_task_fix_setgid(new, old, LSM_SETID_RE); if (retval < 0) goto error; return commit_creds(new); error: abort_creds(new); return retval; } SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid) { return __sys_setregid(rgid, egid); } /* * setgid() is implemented like SysV w/ SAVED_IDS * * SMP: Same implicit races as above. */ long __sys_setgid(gid_t gid) { struct user_namespace *ns = current_user_ns(); const struct cred *old; struct cred *new; int retval; kgid_t kgid; kgid = make_kgid(ns, gid); if (!gid_valid(kgid)) return -EINVAL; new = prepare_creds(); if (!new) return -ENOMEM; old = current_cred(); retval = -EPERM; if (ns_capable_setid(old->user_ns, CAP_SETGID)) new->gid = new->egid = new->sgid = new->fsgid = kgid; else if (gid_eq(kgid, old->gid) || gid_eq(kgid, old->sgid)) new->egid = new->fsgid = kgid; else goto error; retval = security_task_fix_setgid(new, old, LSM_SETID_ID); if (retval < 0) goto error; return commit_creds(new); error: abort_creds(new); return retval; } SYSCALL_DEFINE1(setgid, gid_t, gid) { return __sys_setgid(gid); } /* * change the user struct in a credentials set to match the new UID */ static int set_user(struct cred *new) { struct user_struct *new_user; new_user = alloc_uid(new->uid); if (!new_user) return -EAGAIN; free_uid(new->user); new->user = new_user; return 0; } static void flag_nproc_exceeded(struct cred *new) { if (new->ucounts == current_ucounts()) return; /* * We don't fail in case of NPROC limit excess here because too many * poorly written programs don't check set*uid() return code, assuming * it never fails if called by root. We may still enforce NPROC limit * for programs doing set*uid()+execve() by harmlessly deferring the * failure to the execve() stage. */ if (is_rlimit_overlimit(new->ucounts, UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC)) && new->user != INIT_USER) current->flags |= PF_NPROC_EXCEEDED; else current->flags &= ~PF_NPROC_EXCEEDED; } /* * Unprivileged users may change the real uid to the effective uid * or vice versa. (BSD-style) * * If you set the real uid at all, or set the effective uid to a value not * equal to the real uid, then the saved uid is set to the new effective uid. * * This makes it possible for a setuid program to completely drop its * privileges, which is often a useful assertion to make when you are doing * a security audit over a program. * * The general idea is that a program which uses just setreuid() will be * 100% compatible with BSD. A program which uses just setuid() will be * 100% compatible with POSIX with saved IDs. */ long __sys_setreuid(uid_t ruid, uid_t euid) { struct user_namespace *ns = current_user_ns(); const struct cred *old; struct cred *new; int retval; kuid_t kruid, keuid; kruid = make_kuid(ns, ruid); keuid = make_kuid(ns, euid); if ((ruid != (uid_t) -1) && !uid_valid(kruid)) return -EINVAL; if ((euid != (uid_t) -1) && !uid_valid(keuid)) return -EINVAL; new = prepare_creds(); if (!new) return -ENOMEM; old = current_cred(); retval = -EPERM; if (ruid != (uid_t) -1) { new->uid = kruid; if (!uid_eq(old->uid, kruid) && !uid_eq(old->euid, kruid) && !ns_capable_setid(old->user_ns, CAP_SETUID)) goto error; } if (euid != (uid_t) -1) { new->euid = keuid; if (!uid_eq(old->uid, keuid) && !uid_eq(old->euid, keuid) && !uid_eq(old->suid, keuid) && !ns_capable_setid(old->user_ns, CAP_SETUID)) goto error; } if (!uid_eq(new->uid, old->uid)) { retval = set_user(new); if (retval < 0) goto error; } if (ruid != (uid_t) -1 || (euid != (uid_t) -1 && !uid_eq(keuid, old->uid))) new->suid = new->euid; new->fsuid = new->euid; retval = security_task_fix_setuid(new, old, LSM_SETID_RE); if (retval < 0) goto error; retval = set_cred_ucounts(new); if (retval < 0) goto error; flag_nproc_exceeded(new); return commit_creds(new); error: abort_creds(new); return retval; } SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid) { return __sys_setreuid(ruid, euid); } /* * setuid() is implemented like SysV with SAVED_IDS * * Note that SAVED_ID's is deficient in that a setuid root program * like sendmail, for example, cannot set its uid to be a normal * user and then switch back, because if you're root, setuid() sets * the saved uid too. If you don't like this, blame the bright people * in the POSIX committee and/or USG. Note that the BSD-style setreuid() * will allow a root program to temporarily drop privileges and be able to * regain them by swapping the real and effective uid. */ long __sys_setuid(uid_t uid) { struct user_namespace *ns = current_user_ns(); const struct cred *old; struct cred *new; int retval; kuid_t kuid; kuid = make_kuid(ns, uid); if (!uid_valid(kuid)) return -EINVAL; new = prepare_creds(); if (!new) return -ENOMEM; old = current_cred(); retval = -EPERM; if (ns_capable_setid(old->user_ns, CAP_SETUID)) { new->suid = new->uid = kuid; if (!uid_eq(kuid, old->uid)) { retval = set_user(new); if (retval < 0) goto error; } } else if (!uid_eq(kuid, old->uid) && !uid_eq(kuid, new->suid)) { goto error; } new->fsuid = new->euid = kuid; retval = security_task_fix_setuid(new, old, LSM_SETID_ID); if (retval < 0) goto error; retval = set_cred_ucounts(new); if (retval < 0) goto error; flag_nproc_exceeded(new); return commit_creds(new); error: abort_creds(new); return retval; } SYSCALL_DEFINE1(setuid, uid_t, uid) { return __sys_setuid(uid); } /* * This function implements a generic ability to update ruid, euid, * and suid. This allows you to implement the 4.4 compatible seteuid(). */ long __sys_setresuid(uid_t ruid, uid_t euid, uid_t suid) { struct user_namespace *ns = current_user_ns(); const struct cred *old; struct cred *new; int retval; kuid_t kruid, keuid, ksuid; bool ruid_new, euid_new, suid_new; kruid = make_kuid(ns, ruid); keuid = make_kuid(ns, euid); ksuid = make_kuid(ns, suid); if ((ruid != (uid_t) -1) && !uid_valid(kruid)) return -EINVAL; if ((euid != (uid_t) -1) && !uid_valid(keuid)) return -EINVAL; if ((suid != (uid_t) -1) && !uid_valid(ksuid)) return -EINVAL; old = current_cred(); /* check for no-op */ if ((ruid == (uid_t) -1 || uid_eq(kruid, old->uid)) && (euid == (uid_t) -1 || (uid_eq(keuid, old->euid) && uid_eq(keuid, old->fsuid))) && (suid == (uid_t) -1 || uid_eq(ksuid, old->suid))) return 0; ruid_new = ruid != (uid_t) -1 && !uid_eq(kruid, old->uid) && !uid_eq(kruid, old->euid) && !uid_eq(kruid, old->suid); euid_new = euid != (uid_t) -1 && !uid_eq(keuid, old->uid) && !uid_eq(keuid, old->euid) && !uid_eq(keuid, old->suid); suid_new = suid != (uid_t) -1 && !uid_eq(ksuid, old->uid) && !uid_eq(ksuid, old->euid) && !uid_eq(ksuid, old->suid); if ((ruid_new || euid_new || suid_new) && !ns_capable_setid(old->user_ns, CAP_SETUID)) return -EPERM; new = prepare_creds(); if (!new) return -ENOMEM; if (ruid != (uid_t) -1) { new->uid = kruid; if (!uid_eq(kruid, old->uid)) { retval = set_user(new); if (retval < 0) goto error; } } if (euid != (uid_t) -1) new->euid = keuid; if (suid != (uid_t) -1) new->suid = ksuid; new->fsuid = new->euid; retval = security_task_fix_setuid(new, old, LSM_SETID_RES); if (retval < 0) goto error; retval = set_cred_ucounts(new); if (retval < 0) goto error; flag_nproc_exceeded(new); return commit_creds(new); error: abort_creds(new); return retval; } SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid) { return __sys_setresuid(ruid, euid, suid); } SYSCALL_DEFINE3(getresuid, uid_t __user *, ruidp, uid_t __user *, euidp, uid_t __user *, suidp) { const struct cred *cred = current_cred(); int retval; uid_t ruid, euid, suid; ruid = from_kuid_munged(cred->user_ns, cred->uid); euid = from_kuid_munged(cred->user_ns, cred->euid); suid = from_kuid_munged(cred->user_ns, cred->suid); retval = put_user(ruid, ruidp); if (!retval) { retval = put_user(euid, euidp); if (!retval) return put_user(suid, suidp); } return retval; } /* * Same as above, but for rgid, egid, sgid. */ long __sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid) { struct user_namespace *ns = current_user_ns(); const struct cred *old; struct cred *new; int retval; kgid_t krgid, kegid, ksgid; bool rgid_new, egid_new, sgid_new; krgid = make_kgid(ns, rgid); kegid = make_kgid(ns, egid); ksgid = make_kgid(ns, sgid); if ((rgid != (gid_t) -1) && !gid_valid(krgid)) return -EINVAL; if ((egid != (gid_t) -1) && !gid_valid(kegid)) return -EINVAL; if ((sgid != (gid_t) -1) && !gid_valid(ksgid)) return -EINVAL; old = current_cred(); /* check for no-op */ if ((rgid == (gid_t) -1 || gid_eq(krgid, old->gid)) && (egid == (gid_t) -1 || (gid_eq(kegid, old->egid) && gid_eq(kegid, old->fsgid))) && (sgid == (gid_t) -1 || gid_eq(ksgid, old->sgid))) return 0; rgid_new = rgid != (gid_t) -1 && !gid_eq(krgid, old->gid) && !gid_eq(krgid, old->egid) && !gid_eq(krgid, old->sgid); egid_new = egid != (gid_t) -1 && !gid_eq(kegid, old->gid) && !gid_eq(kegid, old->egid) && !gid_eq(kegid, old->sgid); sgid_new = sgid != (gid_t) -1 && !gid_eq(ksgid, old->gid) && !gid_eq(ksgid, old->egid) && !gid_eq(ksgid, old->sgid); if ((rgid_new || egid_new || sgid_new) && !ns_capable_setid(old->user_ns, CAP_SETGID)) return -EPERM; new = prepare_creds(); if (!new) return -ENOMEM; if (rgid != (gid_t) -1) new->gid = krgid; if (egid != (gid_t) -1) new->egid = kegid; if (sgid != (gid_t) -1) new->sgid = ksgid; new->fsgid = new->egid; retval = security_task_fix_setgid(new, old, LSM_SETID_RES); if (retval < 0) goto error; return commit_creds(new); error: abort_creds(new); return retval; } SYSCALL_DEFINE3(setresgid, gid_t, rgid, gid_t, egid, gid_t, sgid) { return __sys_setresgid(rgid, egid, sgid); } SYSCALL_DEFINE3(getresgid, gid_t __user *, rgidp, gid_t __user *, egidp, gid_t __user *, sgidp) { const struct cred *cred = current_cred(); int retval; gid_t rgid, egid, sgid; rgid = from_kgid_munged(cred->user_ns, cred->gid); egid = from_kgid_munged(cred->user_ns, cred->egid); sgid = from_kgid_munged(cred->user_ns, cred->sgid); retval = put_user(rgid, rgidp); if (!retval) { retval = put_user(egid, egidp); if (!retval) retval = put_user(sgid, sgidp); } return retval; } /* * "setfsuid()" sets the fsuid - the uid used for filesystem checks. This * is used for "access()" and for the NFS daemon (letting nfsd stay at * whatever uid it wants to). It normally shadows "euid", except when * explicitly set by setfsuid() or for access.. */ long __sys_setfsuid(uid_t uid) { const struct cred *old; struct cred *new; uid_t old_fsuid; kuid_t kuid; old = current_cred(); old_fsuid = from_kuid_munged(old->user_ns, old->fsuid); kuid = make_kuid(old->user_ns, uid); if (!uid_valid(kuid)) return old_fsuid; new = prepare_creds(); if (!new) return old_fsuid; if (uid_eq(kuid, old->uid) || uid_eq(kuid, old->euid) || uid_eq(kuid, old->suid) || uid_eq(kuid, old->fsuid) || ns_capable_setid(old->user_ns, CAP_SETUID)) { if (!uid_eq(kuid, old->fsuid)) { new->fsuid = kuid; if (security_task_fix_setuid(new, old, LSM_SETID_FS) == 0) goto change_okay; } } abort_creds(new); return old_fsuid; change_okay: commit_creds(new); return old_fsuid; } SYSCALL_DEFINE1(setfsuid, uid_t, uid) { return __sys_setfsuid(uid); } /* * Samma på svenska.. */ long __sys_setfsgid(gid_t gid) { const struct cred *old; struct cred *new; gid_t old_fsgid; kgid_t kgid; old = current_cred(); old_fsgid = from_kgid_munged(old->user_ns, old->fsgid); kgid = make_kgid(old->user_ns, gid); if (!gid_valid(kgid)) return old_fsgid; new = prepare_creds(); if (!new) return old_fsgid; if (gid_eq(kgid, old->gid) || gid_eq(kgid, old->egid) || gid_eq(kgid, old->sgid) || gid_eq(kgid, old->fsgid) || ns_capable_setid(old->user_ns, CAP_SETGID)) { if (!gid_eq(kgid, old->fsgid)) { new->fsgid = kgid; if (security_task_fix_setgid(new,old,LSM_SETID_FS) == 0) goto change_okay; } } abort_creds(new); return old_fsgid; change_okay: commit_creds(new); return old_fsgid; } SYSCALL_DEFINE1(setfsgid, gid_t, gid) { return __sys_setfsgid(gid); } #endif /* CONFIG_MULTIUSER */ /** * sys_getpid - return the thread group id of the current process * * Note, despite the name, this returns the tgid not the pid. The tgid and * the pid are identical unless CLONE_THREAD was specified on clone() in * which case the tgid is the same in all threads of the same group. * * This is SMP safe as current->tgid does not change. */ SYSCALL_DEFINE0(getpid) { return task_tgid_vnr(current); } /* Thread ID - the internal kernel "pid" */ SYSCALL_DEFINE0(gettid) { return task_pid_vnr(current); } /* * Accessing ->real_parent is not SMP-safe, it could * change from under us. However, we can use a stale * value of ->real_parent under rcu_read_lock(), see * release_task()->call_rcu(delayed_put_task_struct). */ SYSCALL_DEFINE0(getppid) { int pid; rcu_read_lock(); pid = task_tgid_vnr(rcu_dereference(current->real_parent)); rcu_read_unlock(); return pid; } SYSCALL_DEFINE0(getuid) { /* Only we change this so SMP safe */ return from_kuid_munged(current_user_ns(), current_uid()); } SYSCALL_DEFINE0(geteuid) { /* Only we change this so SMP safe */ return from_kuid_munged(current_user_ns(), current_euid()); } SYSCALL_DEFINE0(getgid) { /* Only we change this so SMP safe */ return from_kgid_munged(current_user_ns(), current_gid()); } SYSCALL_DEFINE0(getegid) { /* Only we change this so SMP safe */ return from_kgid_munged(current_user_ns(), current_egid()); } static void do_sys_times(struct tms *tms) { u64 tgutime, tgstime, cutime, cstime; thread_group_cputime_adjusted(current, &tgutime, &tgstime); cutime = current->signal->cutime; cstime = current->signal->cstime; tms->tms_utime = nsec_to_clock_t(tgutime); tms->tms_stime = nsec_to_clock_t(tgstime); tms->tms_cutime = nsec_to_clock_t(cutime); tms->tms_cstime = nsec_to_clock_t(cstime); } SYSCALL_DEFINE1(times, struct tms __user *, tbuf) { if (tbuf) { struct tms tmp; do_sys_times(&tmp); if (copy_to_user(tbuf, &tmp, sizeof(struct tms))) return -EFAULT; } force_successful_syscall_return(); return (long) jiffies_64_to_clock_t(get_jiffies_64()); } #ifdef CONFIG_COMPAT static compat_clock_t clock_t_to_compat_clock_t(clock_t x) { return compat_jiffies_to_clock_t(clock_t_to_jiffies(x)); } COMPAT_SYSCALL_DEFINE1(times, struct compat_tms __user *, tbuf) { if (tbuf) { struct tms tms; struct compat_tms tmp; do_sys_times(&tms); /* Convert our struct tms to the compat version. */ tmp.tms_utime = clock_t_to_compat_clock_t(tms.tms_utime); tmp.tms_stime = clock_t_to_compat_clock_t(tms.tms_stime); tmp.tms_cutime = clock_t_to_compat_clock_t(tms.tms_cutime); tmp.tms_cstime = clock_t_to_compat_clock_t(tms.tms_cstime); if (copy_to_user(tbuf, &tmp, sizeof(tmp))) return -EFAULT; } force_successful_syscall_return(); return compat_jiffies_to_clock_t(jiffies); } #endif /* * This needs some heavy checking ... * I just haven't the stomach for it. I also don't fully * understand sessions/pgrp etc. Let somebody who does explain it. * * OK, I think I have the protection semantics right.... this is really * only important on a multi-user system anyway, to make sure one user * can't send a signal to a process owned by another. -TYT, 12/12/91 * * !PF_FORKNOEXEC check to conform completely to POSIX. */ SYSCALL_DEFINE2(setpgid, pid_t, pid, pid_t, pgid) { struct task_struct *p; struct task_struct *group_leader = current->group_leader; struct pid *pids[PIDTYPE_MAX] = { 0 }; struct pid *pgrp; int err; if (!pid) pid = task_pid_vnr(group_leader); if (!pgid) pgid = pid; if (pgid < 0) return -EINVAL; rcu_read_lock(); /* From this point forward we keep holding onto the tasklist lock * so that our parent does not change from under us. -DaveM */ write_lock_irq(&tasklist_lock); err = -ESRCH; p = find_task_by_vpid(pid); if (!p) goto out; err = -EINVAL; if (!thread_group_leader(p)) goto out; if (same_thread_group(p->real_parent, group_leader)) { err = -EPERM; if (task_session(p) != task_session(group_leader)) goto out; err = -EACCES; if (!(p->flags & PF_FORKNOEXEC)) goto out; } else { err = -ESRCH; if (p != group_leader) goto out; } err = -EPERM; if (p->signal->leader) goto out; pgrp = task_pid(p); if (pgid != pid) { struct task_struct *g; pgrp = find_vpid(pgid); g = pid_task(pgrp, PIDTYPE_PGID); if (!g || task_session(g) != task_session(group_leader)) goto out; } err = security_task_setpgid(p, pgid); if (err) goto out; if (task_pgrp(p) != pgrp) change_pid(pids, p, PIDTYPE_PGID, pgrp); err = 0; out: /* All paths lead to here, thus we are safe. -DaveM */ write_unlock_irq(&tasklist_lock); rcu_read_unlock(); free_pids(pids); return err; } static int do_getpgid(pid_t pid) { struct task_struct *p; struct pid *grp; int retval; rcu_read_lock(); if (!pid) grp = task_pgrp(current); else { retval = -ESRCH; p = find_task_by_vpid(pid); if (!p) goto out; grp = task_pgrp(p); if (!grp) goto out; retval = security_task_getpgid(p); if (retval) goto out; } retval = pid_vnr(grp); out: rcu_read_unlock(); return retval; } SYSCALL_DEFINE1(getpgid, pid_t, pid) { return do_getpgid(pid); } #ifdef __ARCH_WANT_SYS_GETPGRP SYSCALL_DEFINE0(getpgrp) { return do_getpgid(0); } #endif SYSCALL_DEFINE1(getsid, pid_t, pid) { struct task_struct *p; struct pid *sid; int retval; rcu_read_lock(); if (!pid) sid = task_session(current); else { retval = -ESRCH; p = find_task_by_vpid(pid); if (!p) goto out; sid = task_session(p); if (!sid) goto out; retval = security_task_getsid(p); if (retval) goto out; } retval = pid_vnr(sid); out: rcu_read_unlock(); return retval; } static void set_special_pids(struct pid **pids, struct pid *pid) { struct task_struct *curr = current->group_leader; if (task_session(curr) != pid) change_pid(pids, curr, PIDTYPE_SID, pid); if (task_pgrp(curr) != pid) change_pid(pids, curr, PIDTYPE_PGID, pid); } int ksys_setsid(void) { struct task_struct *group_leader = current->group_leader; struct pid *sid = task_pid(group_leader); struct pid *pids[PIDTYPE_MAX] = { 0 }; pid_t session = pid_vnr(sid); int err = -EPERM; write_lock_irq(&tasklist_lock); /* Fail if I am already a session leader */ if (group_leader->signal->leader) goto out; /* Fail if a process group id already exists that equals the * proposed session id. */ if (pid_task(sid, PIDTYPE_PGID)) goto out; group_leader->signal->leader = 1; set_special_pids(pids, sid); proc_clear_tty(group_leader); err = session; out: write_unlock_irq(&tasklist_lock); free_pids(pids); if (err > 0) { proc_sid_connector(group_leader); sched_autogroup_create_attach(group_leader); } return err; } SYSCALL_DEFINE0(setsid) { return ksys_setsid(); } DECLARE_RWSEM(uts_sem); #ifdef COMPAT_UTS_MACHINE #define override_architecture(name) \ (personality(current->personality) == PER_LINUX32 && \ copy_to_user(name->machine, COMPAT_UTS_MACHINE, \ sizeof(COMPAT_UTS_MACHINE))) #else #define override_architecture(name) 0 #endif /* * Work around broken programs that cannot handle "Linux 3.0". * Instead we map 3.x to 2.6.40+x, so e.g. 3.0 would be 2.6.40 * And we map 4.x and later versions to 2.6.60+x, so 4.0/5.0/6.0/... would be * 2.6.60. */ static int override_release(char __user *release, size_t len) { int ret = 0; if (current->personality & UNAME26) { const char *rest = UTS_RELEASE; char buf[65] = { 0 }; int ndots = 0; unsigned v; size_t copy; while (*rest) { if (*rest == '.' && ++ndots >= 3) break; if (!isdigit(*rest) && *rest != '.') break; rest++; } v = LINUX_VERSION_PATCHLEVEL + 60; copy = clamp_t(size_t, len, 1, sizeof(buf)); copy = scnprintf(buf, copy, "2.6.%u%s", v, rest); ret = copy_to_user(release, buf, copy + 1); } return ret; } SYSCALL_DEFINE1(newuname, struct new_utsname __user *, name) { struct new_utsname tmp; down_read(&uts_sem); memcpy(&tmp, utsname(), sizeof(tmp)); up_read(&uts_sem); if (copy_to_user(name, &tmp, sizeof(tmp))) return -EFAULT; if (override_release(name->release, sizeof(name->release))) return -EFAULT; if (override_architecture(name)) return -EFAULT; return 0; } #ifdef __ARCH_WANT_SYS_OLD_UNAME /* * Old cruft */ SYSCALL_DEFINE1(uname, struct old_utsname __user *, name) { struct old_utsname tmp; if (!name) return -EFAULT; down_read(&uts_sem); memcpy(&tmp, utsname(), sizeof(tmp)); up_read(&uts_sem); if (copy_to_user(name, &tmp, sizeof(tmp))) return -EFAULT; if (override_release(name->release, sizeof(name->release))) return -EFAULT; if (override_architecture(name)) return -EFAULT; return 0; } SYSCALL_DEFINE1(olduname, struct oldold_utsname __user *, name) { struct oldold_utsname tmp; if (!name) return -EFAULT; memset(&tmp, 0, sizeof(tmp)); down_read(&uts_sem); memcpy(&tmp.sysname, &utsname()->sysname, __OLD_UTS_LEN); memcpy(&tmp.nodename, &utsname()->nodename, __OLD_UTS_LEN); memcpy(&tmp.release, &utsname()->release, __OLD_UTS_LEN); memcpy(&tmp.version, &utsname()->version, __OLD_UTS_LEN); memcpy(&tmp.machine, &utsname()->machine, __OLD_UTS_LEN); up_read(&uts_sem); if (copy_to_user(name, &tmp, sizeof(tmp))) return -EFAULT; if (override_architecture(name)) return -EFAULT; if (override_release(name->release, sizeof(name->release))) return -EFAULT; return 0; } #endif SYSCALL_DEFINE2(sethostname, char __user *, name, int, len) { int errno; char tmp[__NEW_UTS_LEN]; if (!ns_capable(current->nsproxy->uts_ns->user_ns, CAP_SYS_ADMIN)) return -EPERM; if (len < 0 || len > __NEW_UTS_LEN) return -EINVAL; errno = -EFAULT; if (!copy_from_user(tmp, name, len)) { struct new_utsname *u; add_device_randomness(tmp, len); down_write(&uts_sem); u = utsname(); memcpy(u->nodename, tmp, len); memset(u->nodename + len, 0, sizeof(u->nodename) - len); errno = 0; uts_proc_notify(UTS_PROC_HOSTNAME); up_write(&uts_sem); } return errno; } #ifdef __ARCH_WANT_SYS_GETHOSTNAME SYSCALL_DEFINE2(gethostname, char __user *, name, int, len) { int i; struct new_utsname *u; char tmp[__NEW_UTS_LEN + 1]; if (len < 0) return -EINVAL; down_read(&uts_sem); u = utsname(); i = 1 + strlen(u->nodename); if (i > len) i = len; memcpy(tmp, u->nodename, i); up_read(&uts_sem); if (copy_to_user(name, tmp, i)) return -EFAULT; return 0; } #endif /* * Only setdomainname; getdomainname can be implemented by calling * uname() */ SYSCALL_DEFINE2(setdomainname, char __user *, name, int, len) { int errno; char tmp[__NEW_UTS_LEN]; if (!ns_capable(current->nsproxy->uts_ns->user_ns, CAP_SYS_ADMIN)) return -EPERM; if (len < 0 || len > __NEW_UTS_LEN) return -EINVAL; errno = -EFAULT; if (!copy_from_user(tmp, name, len)) { struct new_utsname *u; add_device_randomness(tmp, len); down_write(&uts_sem); u = utsname(); memcpy(u->domainname, tmp, len); memset(u->domainname + len, 0, sizeof(u->domainname) - len); errno = 0; uts_proc_notify(UTS_PROC_DOMAINNAME); up_write(&uts_sem); } return errno; } /* make sure you are allowed to change @tsk limits before calling this */ static int do_prlimit(struct task_struct *tsk, unsigned int resource, struct rlimit *new_rlim, struct rlimit *old_rlim) { struct rlimit *rlim; int retval = 0; if (resource >= RLIM_NLIMITS) return -EINVAL; resource = array_index_nospec(resource, RLIM_NLIMITS); if (new_rlim) { if (new_rlim->rlim_cur > new_rlim->rlim_max) return -EINVAL; if (resource == RLIMIT_NOFILE && new_rlim->rlim_max > sysctl_nr_open) return -EPERM; } /* Holding a refcount on tsk protects tsk->signal from disappearing. */ rlim = tsk->signal->rlim + resource; task_lock(tsk->group_leader); if (new_rlim) { /* * Keep the capable check against init_user_ns until cgroups can * contain all limits. */ if (new_rlim->rlim_max > rlim->rlim_max && !capable(CAP_SYS_RESOURCE)) retval = -EPERM; if (!retval) retval = security_task_setrlimit(tsk, resource, new_rlim); } if (!retval) { if (old_rlim) *old_rlim = *rlim; if (new_rlim) *rlim = *new_rlim; } task_unlock(tsk->group_leader); /* * RLIMIT_CPU handling. Arm the posix CPU timer if the limit is not * infinite. In case of RLIM_INFINITY the posix CPU timer code * ignores the rlimit. */ if (!retval && new_rlim && resource == RLIMIT_CPU && new_rlim->rlim_cur != RLIM_INFINITY && IS_ENABLED(CONFIG_POSIX_TIMERS)) { /* * update_rlimit_cpu can fail if the task is exiting, but there * may be other tasks in the thread group that are not exiting, * and they need their cpu timers adjusted. * * The group_leader is the last task to be released, so if we * cannot update_rlimit_cpu on it, then the entire process is * exiting and we do not need to update at all. */ update_rlimit_cpu(tsk->group_leader, new_rlim->rlim_cur); } return retval; } SYSCALL_DEFINE2(getrlimit, unsigned int, resource, struct rlimit __user *, rlim) { struct rlimit value; int ret; ret = do_prlimit(current, resource, NULL, &value); if (!ret) ret = copy_to_user(rlim, &value, sizeof(*rlim)) ? -EFAULT : 0; return ret; } #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE2(setrlimit, unsigned int, resource, struct compat_rlimit __user *, rlim) { struct rlimit r; struct compat_rlimit r32; if (copy_from_user(&r32, rlim, sizeof(struct compat_rlimit))) return -EFAULT; if (r32.rlim_cur == COMPAT_RLIM_INFINITY) r.rlim_cur = RLIM_INFINITY; else r.rlim_cur = r32.rlim_cur; if (r32.rlim_max == COMPAT_RLIM_INFINITY) r.rlim_max = RLIM_INFINITY; else r.rlim_max = r32.rlim_max; return do_prlimit(current, resource, &r, NULL); } COMPAT_SYSCALL_DEFINE2(getrlimit, unsigned int, resource, struct compat_rlimit __user *, rlim) { struct rlimit r; int ret; ret = do_prlimit(current, resource, NULL, &r); if (!ret) { struct compat_rlimit r32; if (r.rlim_cur > COMPAT_RLIM_INFINITY) r32.rlim_cur = COMPAT_RLIM_INFINITY; else r32.rlim_cur = r.rlim_cur; if (r.rlim_max > COMPAT_RLIM_INFINITY) r32.rlim_max = COMPAT_RLIM_INFINITY; else r32.rlim_max = r.rlim_max; if (copy_to_user(rlim, &r32, sizeof(struct compat_rlimit))) return -EFAULT; } return ret; } #endif #ifdef __ARCH_WANT_SYS_OLD_GETRLIMIT /* * Back compatibility for getrlimit. Needed for some apps. */ SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource, struct rlimit __user *, rlim) { struct rlimit x; if (resource >= RLIM_NLIMITS) return -EINVAL; resource = array_index_nospec(resource, RLIM_NLIMITS); task_lock(current->group_leader); x = current->signal->rlim[resource]; task_unlock(current->group_leader); if (x.rlim_cur > 0x7FFFFFFF) x.rlim_cur = 0x7FFFFFFF; if (x.rlim_max > 0x7FFFFFFF) x.rlim_max = 0x7FFFFFFF; return copy_to_user(rlim, &x, sizeof(x)) ? -EFAULT : 0; } #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource, struct compat_rlimit __user *, rlim) { struct rlimit r; if (resource >= RLIM_NLIMITS) return -EINVAL; resource = array_index_nospec(resource, RLIM_NLIMITS); task_lock(current->group_leader); r = current->signal->rlim[resource]; task_unlock(current->group_leader); if (r.rlim_cur > 0x7FFFFFFF) r.rlim_cur = 0x7FFFFFFF; if (r.rlim_max > 0x7FFFFFFF) r.rlim_max = 0x7FFFFFFF; if (put_user(r.rlim_cur, &rlim->rlim_cur) || put_user(r.rlim_max, &rlim->rlim_max)) return -EFAULT; return 0; } #endif #endif static inline bool rlim64_is_infinity(__u64 rlim64) { #if BITS_PER_LONG < 64 return rlim64 >= ULONG_MAX; #else return rlim64 == RLIM64_INFINITY; #endif } static void rlim_to_rlim64(const struct rlimit *rlim, struct rlimit64 *rlim64) { if (rlim->rlim_cur == RLIM_INFINITY) rlim64->rlim_cur = RLIM64_INFINITY; else rlim64->rlim_cur = rlim->rlim_cur; if (rlim->rlim_max == RLIM_INFINITY) rlim64->rlim_max = RLIM64_INFINITY; else rlim64->rlim_max = rlim->rlim_max; } static void rlim64_to_rlim(const struct rlimit64 *rlim64, struct rlimit *rlim) { if (rlim64_is_infinity(rlim64->rlim_cur)) rlim->rlim_cur = RLIM_INFINITY; else rlim->rlim_cur = (unsigned long)rlim64->rlim_cur; if (rlim64_is_infinity(rlim64->rlim_max)) rlim->rlim_max = RLIM_INFINITY; else rlim->rlim_max = (unsigned long)rlim64->rlim_max; } /* rcu lock must be held */ static int check_prlimit_permission(struct task_struct *task, unsigned int flags) { const struct cred *cred = current_cred(), *tcred; bool id_match; if (current == task) return 0; tcred = __task_cred(task); id_match = (uid_eq(cred->uid, tcred->euid) && uid_eq(cred->uid, tcred->suid) && uid_eq(cred->uid, tcred->uid) && gid_eq(cred->gid, tcred->egid) && gid_eq(cred->gid, tcred->sgid) && gid_eq(cred->gid, tcred->gid)); if (!id_match && !ns_capable(tcred->user_ns, CAP_SYS_RESOURCE)) return -EPERM; return security_task_prlimit(cred, tcred, flags); } SYSCALL_DEFINE4(prlimit64, pid_t, pid, unsigned int, resource, const struct rlimit64 __user *, new_rlim, struct rlimit64 __user *, old_rlim) { struct rlimit64 old64, new64; struct rlimit old, new; struct task_struct *tsk; unsigned int checkflags = 0; int ret; if (old_rlim) checkflags |= LSM_PRLIMIT_READ; if (new_rlim) { if (copy_from_user(&new64, new_rlim, sizeof(new64))) return -EFAULT; rlim64_to_rlim(&new64, &new); checkflags |= LSM_PRLIMIT_WRITE; } rcu_read_lock(); tsk = pid ? find_task_by_vpid(pid) : current; if (!tsk) { rcu_read_unlock(); return -ESRCH; } ret = check_prlimit_permission(tsk, checkflags); if (ret) { rcu_read_unlock(); return ret; } get_task_struct(tsk); rcu_read_unlock(); ret = do_prlimit(tsk, resource, new_rlim ? &new : NULL, old_rlim ? &old : NULL); if (!ret && old_rlim) { rlim_to_rlim64(&old, &old64); if (copy_to_user(old_rlim, &old64, sizeof(old64))) ret = -EFAULT; } put_task_struct(tsk); return ret; } SYSCALL_DEFINE2(setrlimit, unsigned int, resource, struct rlimit __user *, rlim) { struct rlimit new_rlim; if (copy_from_user(&new_rlim, rlim, sizeof(*rlim))) return -EFAULT; return do_prlimit(current, resource, &new_rlim, NULL); } /* * It would make sense to put struct rusage in the task_struct, * except that would make the task_struct be *really big*. After * task_struct gets moved into malloc'ed memory, it would * make sense to do this. It will make moving the rest of the information * a lot simpler! (Which we're not doing right now because we're not * measuring them yet). * * When sampling multiple threads for RUSAGE_SELF, under SMP we might have * races with threads incrementing their own counters. But since word * reads are atomic, we either get new values or old values and we don't * care which for the sums. We always take the siglock to protect reading * the c* fields from p->signal from races with exit.c updating those * fields when reaping, so a sample either gets all the additions of a * given child after it's reaped, or none so this sample is before reaping. * * Locking: * We need to take the siglock for CHILDEREN, SELF and BOTH * for the cases current multithreaded, non-current single threaded * non-current multithreaded. Thread traversal is now safe with * the siglock held. * Strictly speaking, we donot need to take the siglock if we are current and * single threaded, as no one else can take our signal_struct away, no one * else can reap the children to update signal->c* counters, and no one else * can race with the signal-> fields. If we do not take any lock, the * signal-> fields could be read out of order while another thread was just * exiting. So we should place a read memory barrier when we avoid the lock. * On the writer side, write memory barrier is implied in __exit_signal * as __exit_signal releases the siglock spinlock after updating the signal-> * fields. But we don't do this yet to keep things simple. * */ static void accumulate_thread_rusage(struct task_struct *t, struct rusage *r) { r->ru_nvcsw += t->nvcsw; r->ru_nivcsw += t->nivcsw; r->ru_minflt += t->min_flt; r->ru_majflt += t->maj_flt; r->ru_inblock += task_io_get_inblock(t); r->ru_oublock += task_io_get_oublock(t); } void getrusage(struct task_struct *p, int who, struct rusage *r) { struct task_struct *t; unsigned long flags; u64 tgutime, tgstime, utime, stime; unsigned long maxrss; struct mm_struct *mm; struct signal_struct *sig = p->signal; unsigned int seq = 0; retry: memset(r, 0, sizeof(*r)); utime = stime = 0; maxrss = 0; if (who == RUSAGE_THREAD) { task_cputime_adjusted(current, &utime, &stime); accumulate_thread_rusage(p, r); maxrss = sig->maxrss; goto out_thread; } flags = read_seqbegin_or_lock_irqsave(&sig->stats_lock, &seq); switch (who) { case RUSAGE_BOTH: case RUSAGE_CHILDREN: utime = sig->cutime; stime = sig->cstime; r->ru_nvcsw = sig->cnvcsw; r->ru_nivcsw = sig->cnivcsw; r->ru_minflt = sig->cmin_flt; r->ru_majflt = sig->cmaj_flt; r->ru_inblock = sig->cinblock; r->ru_oublock = sig->coublock; maxrss = sig->cmaxrss; if (who == RUSAGE_CHILDREN) break; fallthrough; case RUSAGE_SELF: r->ru_nvcsw += sig->nvcsw; r->ru_nivcsw += sig->nivcsw; r->ru_minflt += sig->min_flt; r->ru_majflt += sig->maj_flt; r->ru_inblock += sig->inblock; r->ru_oublock += sig->oublock; if (maxrss < sig->maxrss) maxrss = sig->maxrss; rcu_read_lock(); __for_each_thread(sig, t) accumulate_thread_rusage(t, r); rcu_read_unlock(); break; default: BUG(); } if (need_seqretry(&sig->stats_lock, seq)) { seq = 1; goto retry; } done_seqretry_irqrestore(&sig->stats_lock, seq, flags); if (who == RUSAGE_CHILDREN) goto out_children; thread_group_cputime_adjusted(p, &tgutime, &tgstime); utime += tgutime; stime += tgstime; out_thread: mm = get_task_mm(p); if (mm) { setmax_mm_hiwater_rss(&maxrss, mm); mmput(mm); } out_children: r->ru_maxrss = maxrss * (PAGE_SIZE / 1024); /* convert pages to KBs */ r->ru_utime = ns_to_kernel_old_timeval(utime); r->ru_stime = ns_to_kernel_old_timeval(stime); } SYSCALL_DEFINE2(getrusage, int, who, struct rusage __user *, ru) { struct rusage r; if (who != RUSAGE_SELF && who != RUSAGE_CHILDREN && who != RUSAGE_THREAD) return -EINVAL; getrusage(current, who, &r); return copy_to_user(ru, &r, sizeof(r)) ? -EFAULT : 0; } #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE2(getrusage, int, who, struct compat_rusage __user *, ru) { struct rusage r; if (who != RUSAGE_SELF && who != RUSAGE_CHILDREN && who != RUSAGE_THREAD) return -EINVAL; getrusage(current, who, &r); return put_compat_rusage(&r, ru); } #endif SYSCALL_DEFINE1(umask, int, mask) { mask = xchg(&current->fs->umask, mask & S_IRWXUGO); return mask; } static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) { CLASS(fd, exe)(fd); struct inode *inode; int err; if (fd_empty(exe)) return -EBADF; inode = file_inode(fd_file(exe)); /* * Because the original mm->exe_file points to executable file, make * sure that this one is executable as well, to avoid breaking an * overall picture. */ if (!S_ISREG(inode->i_mode) || path_noexec(&fd_file(exe)->f_path)) return -EACCES; err = file_permission(fd_file(exe), MAY_EXEC); if (err) return err; return replace_mm_exe_file(mm, fd_file(exe)); } /* * Check arithmetic relations of passed addresses. * * WARNING: we don't require any capability here so be very careful * in what is allowed for modification from userspace. */ static int validate_prctl_map_addr(struct prctl_mm_map *prctl_map) { unsigned long mmap_max_addr = TASK_SIZE; int error = -EINVAL, i; static const unsigned char offsets[] = { offsetof(struct prctl_mm_map, start_code), offsetof(struct prctl_mm_map, end_code), offsetof(struct prctl_mm_map, start_data), offsetof(struct prctl_mm_map, end_data), offsetof(struct prctl_mm_map, start_brk), offsetof(struct prctl_mm_map, brk), offsetof(struct prctl_mm_map, start_stack), offsetof(struct prctl_mm_map, arg_start), offsetof(struct prctl_mm_map, arg_end), offsetof(struct prctl_mm_map, env_start), offsetof(struct prctl_mm_map, env_end), }; /* * Make sure the members are not somewhere outside * of allowed address space. */ for (i = 0; i < ARRAY_SIZE(offsets); i++) { u64 val = *(u64 *)((char *)prctl_map + offsets[i]); if ((unsigned long)val >= mmap_max_addr || (unsigned long)val < mmap_min_addr) goto out; } /* * Make sure the pairs are ordered. */ #define __prctl_check_order(__m1, __op, __m2) \ ((unsigned long)prctl_map->__m1 __op \ (unsigned long)prctl_map->__m2) ? 0 : -EINVAL error = __prctl_check_order(start_code, <, end_code); error |= __prctl_check_order(start_data,<=, end_data); error |= __prctl_check_order(start_brk, <=, brk); error |= __prctl_check_order(arg_start, <=, arg_end); error |= __prctl_check_order(env_start, <=, env_end); if (error) goto out; #undef __prctl_check_order error = -EINVAL; /* * Neither we should allow to override limits if they set. */ if (check_data_rlimit(rlimit(RLIMIT_DATA), prctl_map->brk, prctl_map->start_brk, prctl_map->end_data, prctl_map->start_data)) goto out; error = 0; out: return error; } #ifdef CONFIG_CHECKPOINT_RESTORE static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data_size) { struct prctl_mm_map prctl_map = { .exe_fd = (u32)-1, }; unsigned long user_auxv[AT_VECTOR_SIZE]; struct mm_struct *mm = current->mm; int error; BUILD_BUG_ON(sizeof(user_auxv) != sizeof(mm->saved_auxv)); BUILD_BUG_ON(sizeof(struct prctl_mm_map) > 256); if (opt == PR_SET_MM_MAP_SIZE) return put_user((unsigned int)sizeof(prctl_map), (unsigned int __user *)addr); if (data_size != sizeof(prctl_map)) return -EINVAL; if (copy_from_user(&prctl_map, addr, sizeof(prctl_map))) return -EFAULT; error = validate_prctl_map_addr(&prctl_map); if (error) return error; if (prctl_map.auxv_size) { /* * Someone is trying to cheat the auxv vector. */ if (!prctl_map.auxv || prctl_map.auxv_size > sizeof(mm->saved_auxv)) return -EINVAL; memset(user_auxv, 0, sizeof(user_auxv)); if (copy_from_user(user_auxv, (const void __user *)prctl_map.auxv, prctl_map.auxv_size)) return -EFAULT; /* Last entry must be AT_NULL as specification requires */ user_auxv[AT_VECTOR_SIZE - 2] = AT_NULL; user_auxv[AT_VECTOR_SIZE - 1] = AT_NULL; } if (prctl_map.exe_fd != (u32)-1) { /* * Check if the current user is checkpoint/restore capable. * At the time of this writing, it checks for CAP_SYS_ADMIN * or CAP_CHECKPOINT_RESTORE. * Note that a user with access to ptrace can masquerade an * arbitrary program as any executable, even setuid ones. * This may have implications in the tomoyo subsystem. */ if (!checkpoint_restore_ns_capable(current_user_ns())) return -EPERM; error = prctl_set_mm_exe_file(mm, prctl_map.exe_fd); if (error) return error; } /* * arg_lock protects concurrent updates but we still need mmap_lock for * read to exclude races with sys_brk. */ mmap_read_lock(mm); /* * We don't validate if these members are pointing to * real present VMAs because application may have correspond * VMAs already unmapped and kernel uses these members for statistics * output in procfs mostly, except * * - @start_brk/@brk which are used in do_brk_flags but kernel lookups * for VMAs when updating these members so anything wrong written * here cause kernel to swear at userspace program but won't lead * to any problem in kernel itself */ spin_lock(&mm->arg_lock); mm->start_code = prctl_map.start_code; mm->end_code = prctl_map.end_code; mm->start_data = prctl_map.start_data; mm->end_data = prctl_map.end_data; mm->start_brk = prctl_map.start_brk; mm->brk = prctl_map.brk; mm->start_stack = prctl_map.start_stack; mm->arg_start = prctl_map.arg_start; mm->arg_end = prctl_map.arg_end; mm->env_start = prctl_map.env_start; mm->env_end = prctl_map.env_end; spin_unlock(&mm->arg_lock); /* * Note this update of @saved_auxv is lockless thus * if someone reads this member in procfs while we're * updating -- it may get partly updated results. It's * known and acceptable trade off: we leave it as is to * not introduce additional locks here making the kernel * more complex. */ if (prctl_map.auxv_size) memcpy(mm->saved_auxv, user_auxv, sizeof(user_auxv)); mmap_read_unlock(mm); return 0; } #endif /* CONFIG_CHECKPOINT_RESTORE */ static int prctl_set_auxv(struct mm_struct *mm, unsigned long addr, unsigned long len) { /* * This doesn't move the auxiliary vector itself since it's pinned to * mm_struct, but it permits filling the vector with new values. It's * up to the caller to provide sane values here, otherwise userspace * tools which use this vector might be unhappy. */ unsigned long user_auxv[AT_VECTOR_SIZE] = {}; if (len > sizeof(user_auxv)) return -EINVAL; if (copy_from_user(user_auxv, (const void __user *)addr, len)) return -EFAULT; /* Make sure the last entry is always AT_NULL */ user_auxv[AT_VECTOR_SIZE - 2] = 0; user_auxv[AT_VECTOR_SIZE - 1] = 0; BUILD_BUG_ON(sizeof(user_auxv) != sizeof(mm->saved_auxv)); task_lock(current); memcpy(mm->saved_auxv, user_auxv, len); task_unlock(current); return 0; } static int prctl_set_mm(int opt, unsigned long addr, unsigned long arg4, unsigned long arg5) { struct mm_struct *mm = current->mm; struct prctl_mm_map prctl_map = { .auxv = NULL, .auxv_size = 0, .exe_fd = -1, }; struct vm_area_struct *vma; int error; if (arg5 || (arg4 && (opt != PR_SET_MM_AUXV && opt != PR_SET_MM_MAP && opt != PR_SET_MM_MAP_SIZE))) return -EINVAL; #ifdef CONFIG_CHECKPOINT_RESTORE if (opt == PR_SET_MM_MAP || opt == PR_SET_MM_MAP_SIZE) return prctl_set_mm_map(opt, (const void __user *)addr, arg4); #endif if (!capable(CAP_SYS_RESOURCE)) return -EPERM; if (opt == PR_SET_MM_EXE_FILE) return prctl_set_mm_exe_file(mm, (unsigned int)addr); if (opt == PR_SET_MM_AUXV) return prctl_set_auxv(mm, addr, arg4); if (addr >= TASK_SIZE || addr < mmap_min_addr) return -EINVAL; error = -EINVAL; /* * arg_lock protects concurrent updates of arg boundaries, we need * mmap_lock for a) concurrent sys_brk, b) finding VMA for addr * validation. */ mmap_read_lock(mm); vma = find_vma(mm, addr); spin_lock(&mm->arg_lock); prctl_map.start_code = mm->start_code; prctl_map.end_code = mm->end_code; prctl_map.start_data = mm->start_data; prctl_map.end_data = mm->end_data; prctl_map.start_brk = mm->start_brk; prctl_map.brk = mm->brk; prctl_map.start_stack = mm->start_stack; prctl_map.arg_start = mm->arg_start; prctl_map.arg_end = mm->arg_end; prctl_map.env_start = mm->env_start; prctl_map.env_end = mm->env_end; switch (opt) { case PR_SET_MM_START_CODE: prctl_map.start_code = addr; break; case PR_SET_MM_END_CODE: prctl_map.end_code = addr; break; case PR_SET_MM_START_DATA: prctl_map.start_data = addr; break; case PR_SET_MM_END_DATA: prctl_map.end_data = addr; break; case PR_SET_MM_START_STACK: prctl_map.start_stack = addr; break; case PR_SET_MM_START_BRK: prctl_map.start_brk = addr; break; case PR_SET_MM_BRK: prctl_map.brk = addr; break; case PR_SET_MM_ARG_START: prctl_map.arg_start = addr; break; case PR_SET_MM_ARG_END: prctl_map.arg_end = addr; break; case PR_SET_MM_ENV_START: prctl_map.env_start = addr; break; case PR_SET_MM_ENV_END: prctl_map.env_end = addr; break; default: goto out; } error = validate_prctl_map_addr(&prctl_map); if (error) goto out; switch (opt) { /* * If command line arguments and environment * are placed somewhere else on stack, we can * set them up here, ARG_START/END to setup * command line arguments and ENV_START/END * for environment. */ case PR_SET_MM_START_STACK: case PR_SET_MM_ARG_START: case PR_SET_MM_ARG_END: case PR_SET_MM_ENV_START: case PR_SET_MM_ENV_END: if (!vma) { error = -EFAULT; goto out; } } mm->start_code = prctl_map.start_code; mm->end_code = prctl_map.end_code; mm->start_data = prctl_map.start_data; mm->end_data = prctl_map.end_data; mm->start_brk = prctl_map.start_brk; mm->brk = prctl_map.brk; mm->start_stack = prctl_map.start_stack; mm->arg_start = prctl_map.arg_start; mm->arg_end = prctl_map.arg_end; mm->env_start = prctl_map.env_start; mm->env_end = prctl_map.env_end; error = 0; out: spin_unlock(&mm->arg_lock); mmap_read_unlock(mm); return error; } #ifdef CONFIG_CHECKPOINT_RESTORE static int prctl_get_tid_address(struct task_struct *me, int __user * __user *tid_addr) { return put_user(me->clear_child_tid, tid_addr); } #else static int prctl_get_tid_address(struct task_struct *me, int __user * __user *tid_addr) { return -EINVAL; } #endif static int propagate_has_child_subreaper(struct task_struct *p, void *data) { /* * If task has has_child_subreaper - all its descendants * already have these flag too and new descendants will * inherit it on fork, skip them. * * If we've found child_reaper - skip descendants in * it's subtree as they will never get out pidns. */ if (p->signal->has_child_subreaper || is_child_reaper(task_pid(p))) return 0; p->signal->has_child_subreaper = 1; return 1; } int __weak arch_prctl_spec_ctrl_get(struct task_struct *t, unsigned long which) { return -EINVAL; } int __weak arch_prctl_spec_ctrl_set(struct task_struct *t, unsigned long which, unsigned long ctrl) { return -EINVAL; } int __weak arch_get_shadow_stack_status(struct task_struct *t, unsigned long __user *status) { return -EINVAL; } int __weak arch_set_shadow_stack_status(struct task_struct *t, unsigned long status) { return -EINVAL; } int __weak arch_lock_shadow_stack_status(struct task_struct *t, unsigned long status) { return -EINVAL; } #define PR_IO_FLUSHER (PF_MEMALLOC_NOIO | PF_LOCAL_THROTTLE) #ifdef CONFIG_ANON_VMA_NAME #define ANON_VMA_NAME_MAX_LEN 80 #define ANON_VMA_NAME_INVALID_CHARS "\\`$[]" static inline bool is_valid_name_char(char ch) { /* printable ascii characters, excluding ANON_VMA_NAME_INVALID_CHARS */ return ch > 0x1f && ch < 0x7f && !strchr(ANON_VMA_NAME_INVALID_CHARS, ch); } static int prctl_set_vma(unsigned long opt, unsigned long addr, unsigned long size, unsigned long arg) { struct mm_struct *mm = current->mm; const char __user *uname; struct anon_vma_name *anon_name = NULL; int error; switch (opt) { case PR_SET_VMA_ANON_NAME: uname = (const char __user *)arg; if (uname) { char *name, *pch; name = strndup_user(uname, ANON_VMA_NAME_MAX_LEN); if (IS_ERR(name)) return PTR_ERR(name); for (pch = name; *pch != '\0'; pch++) { if (!is_valid_name_char(*pch)) { kfree(name); return -EINVAL; } } /* anon_vma has its own copy */ anon_name = anon_vma_name_alloc(name); kfree(name); if (!anon_name) return -ENOMEM; } mmap_write_lock(mm); error = madvise_set_anon_name(mm, addr, size, anon_name); mmap_write_unlock(mm); anon_vma_name_put(anon_name); break; default: error = -EINVAL; } return error; } #else /* CONFIG_ANON_VMA_NAME */ static int prctl_set_vma(unsigned long opt, unsigned long start, unsigned long size, unsigned long arg) { return -EINVAL; } #endif /* CONFIG_ANON_VMA_NAME */ static inline unsigned long get_current_mdwe(void) { unsigned long ret = 0; if (test_bit(MMF_HAS_MDWE, &current->mm->flags)) ret |= PR_MDWE_REFUSE_EXEC_GAIN; if (test_bit(MMF_HAS_MDWE_NO_INHERIT, &current->mm->flags)) ret |= PR_MDWE_NO_INHERIT; return ret; } static inline int prctl_set_mdwe(unsigned long bits, unsigned long arg3, unsigned long arg4, unsigned long arg5) { unsigned long current_bits; if (arg3 || arg4 || arg5) return -EINVAL; if (bits & ~(PR_MDWE_REFUSE_EXEC_GAIN | PR_MDWE_NO_INHERIT)) return -EINVAL; /* NO_INHERIT only makes sense with REFUSE_EXEC_GAIN */ if (bits & PR_MDWE_NO_INHERIT && !(bits & PR_MDWE_REFUSE_EXEC_GAIN)) return -EINVAL; /* * EOPNOTSUPP might be more appropriate here in principle, but * existing userspace depends on EINVAL specifically. */ if (!arch_memory_deny_write_exec_supported()) return -EINVAL; current_bits = get_current_mdwe(); if (current_bits && current_bits != bits) return -EPERM; /* Cannot unset the flags */ if (bits & PR_MDWE_NO_INHERIT) set_bit(MMF_HAS_MDWE_NO_INHERIT, &current->mm->flags); if (bits & PR_MDWE_REFUSE_EXEC_GAIN) set_bit(MMF_HAS_MDWE, &current->mm->flags); return 0; } static inline int prctl_get_mdwe(unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5) { if (arg2 || arg3 || arg4 || arg5) return -EINVAL; return get_current_mdwe(); } static int prctl_get_auxv(void __user *addr, unsigned long len) { struct mm_struct *mm = current->mm; unsigned long size = min_t(unsigned long, sizeof(mm->saved_auxv), len); if (size && copy_to_user(addr, mm->saved_auxv, size)) return -EFAULT; return sizeof(mm->saved_auxv); } SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, unsigned long, arg4, unsigned long, arg5) { struct task_struct *me = current; unsigned char comm[sizeof(me->comm)]; long error; error = security_task_prctl(option, arg2, arg3, arg4, arg5); if (error != -ENOSYS) return error; error = 0; switch (option) { case PR_SET_PDEATHSIG: if (!valid_signal(arg2)) { error = -EINVAL; break; } me->pdeath_signal = arg2; break; case PR_GET_PDEATHSIG: error = put_user(me->pdeath_signal, (int __user *)arg2); break; case PR_GET_DUMPABLE: error = get_dumpable(me->mm); break; case PR_SET_DUMPABLE: if (arg2 != SUID_DUMP_DISABLE && arg2 != SUID_DUMP_USER) { error = -EINVAL; break; } set_dumpable(me->mm, arg2); break; case PR_SET_UNALIGN: error = SET_UNALIGN_CTL(me, arg2); break; case PR_GET_UNALIGN: error = GET_UNALIGN_CTL(me, arg2); break; case PR_SET_FPEMU: error = SET_FPEMU_CTL(me, arg2); break; case PR_GET_FPEMU: error = GET_FPEMU_CTL(me, arg2); break; case PR_SET_FPEXC: error = SET_FPEXC_CTL(me, arg2); break; case PR_GET_FPEXC: error = GET_FPEXC_CTL(me, arg2); break; case PR_GET_TIMING: error = PR_TIMING_STATISTICAL; break; case PR_SET_TIMING: if (arg2 != PR_TIMING_STATISTICAL) error = -EINVAL; break; case PR_SET_NAME: comm[sizeof(me->comm) - 1] = 0; if (strncpy_from_user(comm, (char __user *)arg2, sizeof(me->comm) - 1) < 0) return -EFAULT; set_task_comm(me, comm); proc_comm_connector(me); break; case PR_GET_NAME: get_task_comm(comm, me); if (copy_to_user((char __user *)arg2, comm, sizeof(comm))) return -EFAULT; break; case PR_GET_ENDIAN: error = GET_ENDIAN(me, arg2); break; case PR_SET_ENDIAN: error = SET_ENDIAN(me, arg2); break; case PR_GET_SECCOMP: error = prctl_get_seccomp(); break; case PR_SET_SECCOMP: error = prctl_set_seccomp(arg2, (char __user *)arg3); break; case PR_GET_TSC: error = GET_TSC_CTL(arg2); break; case PR_SET_TSC: error = SET_TSC_CTL(arg2); break; case PR_TASK_PERF_EVENTS_DISABLE: error = perf_event_task_disable(); break; case PR_TASK_PERF_EVENTS_ENABLE: error = perf_event_task_enable(); break; case PR_GET_TIMERSLACK: if (current->timer_slack_ns > ULONG_MAX) error = ULONG_MAX; else error = current->timer_slack_ns; break; case PR_SET_TIMERSLACK: if (rt_or_dl_task_policy(current)) break; if (arg2 <= 0) current->timer_slack_ns = current->default_timer_slack_ns; else current->timer_slack_ns = arg2; break; case PR_MCE_KILL: if (arg4 | arg5) return -EINVAL; switch (arg2) { case PR_MCE_KILL_CLEAR: if (arg3 != 0) return -EINVAL; current->flags &= ~PF_MCE_PROCESS; break; case PR_MCE_KILL_SET: current->flags |= PF_MCE_PROCESS; if (arg3 == PR_MCE_KILL_EARLY) current->flags |= PF_MCE_EARLY; else if (arg3 == PR_MCE_KILL_LATE) current->flags &= ~PF_MCE_EARLY; else if (arg3 == PR_MCE_KILL_DEFAULT) current->flags &= ~(PF_MCE_EARLY|PF_MCE_PROCESS); else return -EINVAL; break; default: return -EINVAL; } break; case PR_MCE_KILL_GET: if (arg2 | arg3 | arg4 | arg5) return -EINVAL; if (current->flags & PF_MCE_PROCESS) error = (current->flags & PF_MCE_EARLY) ? PR_MCE_KILL_EARLY : PR_MCE_KILL_LATE; else error = PR_MCE_KILL_DEFAULT; break; case PR_SET_MM: error = prctl_set_mm(arg2, arg3, arg4, arg5); break; case PR_GET_TID_ADDRESS: error = prctl_get_tid_address(me, (int __user * __user *)arg2); break; case PR_SET_CHILD_SUBREAPER: me->signal->is_child_subreaper = !!arg2; if (!arg2) break; walk_process_tree(me, propagate_has_child_subreaper, NULL); break; case PR_GET_CHILD_SUBREAPER: error = put_user(me->signal->is_child_subreaper, (int __user *)arg2); break; case PR_SET_NO_NEW_PRIVS: if (arg2 != 1 || arg3 || arg4 || arg5) return -EINVAL; task_set_no_new_privs(current); break; case PR_GET_NO_NEW_PRIVS: if (arg2 || arg3 || arg4 || arg5) return -EINVAL; return task_no_new_privs(current) ? 1 : 0; case PR_GET_THP_DISABLE: if (arg2 || arg3 || arg4 || arg5) return -EINVAL; error = !!test_bit(MMF_DISABLE_THP, &me->mm->flags); break; case PR_SET_THP_DISABLE: if (arg3 || arg4 || arg5) return -EINVAL; if (mmap_write_lock_killable(me->mm)) return -EINTR; if (arg2) set_bit(MMF_DISABLE_THP, &me->mm->flags); else clear_bit(MMF_DISABLE_THP, &me->mm->flags); mmap_write_unlock(me->mm); break; case PR_MPX_ENABLE_MANAGEMENT: case PR_MPX_DISABLE_MANAGEMENT: /* No longer implemented: */ return -EINVAL; case PR_SET_FP_MODE: error = SET_FP_MODE(me, arg2); break; case PR_GET_FP_MODE: error = GET_FP_MODE(me); break; case PR_SVE_SET_VL: error = SVE_SET_VL(arg2); break; case PR_SVE_GET_VL: error = SVE_GET_VL(); break; case PR_SME_SET_VL: error = SME_SET_VL(arg2); break; case PR_SME_GET_VL: error = SME_GET_VL(); break; case PR_GET_SPECULATION_CTRL: if (arg3 || arg4 || arg5) return -EINVAL; error = arch_prctl_spec_ctrl_get(me, arg2); break; case PR_SET_SPECULATION_CTRL: if (arg4 || arg5) return -EINVAL; error = arch_prctl_spec_ctrl_set(me, arg2, arg3); break; case PR_PAC_RESET_KEYS: if (arg3 || arg4 || arg5) return -EINVAL; error = PAC_RESET_KEYS(me, arg2); break; case PR_PAC_SET_ENABLED_KEYS: if (arg4 || arg5) return -EINVAL; error = PAC_SET_ENABLED_KEYS(me, arg2, arg3); break; case PR_PAC_GET_ENABLED_KEYS: if (arg2 || arg3 || arg4 || arg5) return -EINVAL; error = PAC_GET_ENABLED_KEYS(me); break; case PR_SET_TAGGED_ADDR_CTRL: if (arg3 || arg4 || arg5) return -EINVAL; error = SET_TAGGED_ADDR_CTRL(arg2); break; case PR_GET_TAGGED_ADDR_CTRL: if (arg2 || arg3 || arg4 || arg5) return -EINVAL; error = GET_TAGGED_ADDR_CTRL(); break; case PR_SET_IO_FLUSHER: if (!capable(CAP_SYS_RESOURCE)) return -EPERM; if (arg3 || arg4 || arg5) return -EINVAL; if (arg2 == 1) current->flags |= PR_IO_FLUSHER; else if (!arg2) current->flags &= ~PR_IO_FLUSHER; else return -EINVAL; break; case PR_GET_IO_FLUSHER: if (!capable(CAP_SYS_RESOURCE)) return -EPERM; if (arg2 || arg3 || arg4 || arg5) return -EINVAL; error = (current->flags & PR_IO_FLUSHER) == PR_IO_FLUSHER; break; case PR_SET_SYSCALL_USER_DISPATCH: error = set_syscall_user_dispatch(arg2, arg3, arg4, (char __user *) arg5); break; #ifdef CONFIG_SCHED_CORE case PR_SCHED_CORE: error = sched_core_share_pid(arg2, arg3, arg4, arg5); break; #endif case PR_SET_MDWE: error = prctl_set_mdwe(arg2, arg3, arg4, arg5); break; case PR_GET_MDWE: error = prctl_get_mdwe(arg2, arg3, arg4, arg5); break; case PR_PPC_GET_DEXCR: if (arg3 || arg4 || arg5) return -EINVAL; error = PPC_GET_DEXCR_ASPECT(me, arg2); break; case PR_PPC_SET_DEXCR: if (arg4 || arg5) return -EINVAL; error = PPC_SET_DEXCR_ASPECT(me, arg2, arg3); break; case PR_SET_VMA: error = prctl_set_vma(arg2, arg3, arg4, arg5); break; case PR_GET_AUXV: if (arg4 || arg5) return -EINVAL; error = prctl_get_auxv((void __user *)arg2, arg3); break; #ifdef CONFIG_KSM case PR_SET_MEMORY_MERGE: if (arg3 || arg4 || arg5) return -EINVAL; if (mmap_write_lock_killable(me->mm)) return -EINTR; if (arg2) error = ksm_enable_merge_any(me->mm); else error = ksm_disable_merge_any(me->mm); mmap_write_unlock(me->mm); break; case PR_GET_MEMORY_MERGE: if (arg2 || arg3 || arg4 || arg5) return -EINVAL; error = !!test_bit(MMF_VM_MERGE_ANY, &me->mm->flags); break; #endif case PR_RISCV_V_SET_CONTROL: error = RISCV_V_SET_CONTROL(arg2); break; case PR_RISCV_V_GET_CONTROL: error = RISCV_V_GET_CONTROL(); break; case PR_RISCV_SET_ICACHE_FLUSH_CTX: error = RISCV_SET_ICACHE_FLUSH_CTX(arg2, arg3); break; case PR_GET_SHADOW_STACK_STATUS: if (arg3 || arg4 || arg5) return -EINVAL; error = arch_get_shadow_stack_status(me, (unsigned long __user *) arg2); break; case PR_SET_SHADOW_STACK_STATUS: if (arg3 || arg4 || arg5) return -EINVAL; error = arch_set_shadow_stack_status(me, arg2); break; case PR_LOCK_SHADOW_STACK_STATUS: if (arg3 || arg4 || arg5) return -EINVAL; error = arch_lock_shadow_stack_status(me, arg2); break; case PR_TIMER_CREATE_RESTORE_IDS: if (arg3 || arg4 || arg5) return -EINVAL; error = posixtimer_create_prctl(arg2); break; default: trace_task_prctl_unknown(option, arg2, arg3, arg4, arg5); error = -EINVAL; break; } return error; } SYSCALL_DEFINE3(getcpu, unsigned __user *, cpup, unsigned __user *, nodep, struct getcpu_cache __user *, unused) { int err = 0; int cpu = raw_smp_processor_id(); if (cpup) err |= put_user(cpu, cpup); if (nodep) err |= put_user(cpu_to_node(cpu), nodep); return err ? -EFAULT : 0; } /** * do_sysinfo - fill in sysinfo struct * @info: pointer to buffer to fill */ static int do_sysinfo(struct sysinfo *info) { unsigned long mem_total, sav_total; unsigned int mem_unit, bitcount; struct timespec64 tp; memset(info, 0, sizeof(struct sysinfo)); ktime_get_boottime_ts64(&tp); timens_add_boottime(&tp); info->uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0); get_avenrun(info->loads, 0, SI_LOAD_SHIFT - FSHIFT); info->procs = nr_threads; si_meminfo(info); si_swapinfo(info); /* * If the sum of all the available memory (i.e. ram + swap) * is less than can be stored in a 32 bit unsigned long then * we can be binary compatible with 2.2.x kernels. If not, * well, in that case 2.2.x was broken anyways... * * -Erik Andersen <andersee@debian.org> */ mem_total = info->totalram + info->totalswap; if (mem_total < info->totalram || mem_total < info->totalswap) goto out; bitcount = 0; mem_unit = info->mem_unit; while (mem_unit > 1) { bitcount++; mem_unit >>= 1; sav_total = mem_total; mem_total <<= 1; if (mem_total < sav_total) goto out; } /* * If mem_total did not overflow, multiply all memory values by * info->mem_unit and set it to 1. This leaves things compatible * with 2.2.x, and also retains compatibility with earlier 2.4.x * kernels... */ info->mem_unit = 1; info->totalram <<= bitcount; info->freeram <<= bitcount; info->sharedram <<= bitcount; info->bufferram <<= bitcount; info->totalswap <<= bitcount; info->freeswap <<= bitcount; info->totalhigh <<= bitcount; info->freehigh <<= bitcount; out: return 0; } SYSCALL_DEFINE1(sysinfo, struct sysinfo __user *, info) { struct sysinfo val; do_sysinfo(&val); if (copy_to_user(info, &val, sizeof(struct sysinfo))) return -EFAULT; return 0; } #ifdef CONFIG_COMPAT struct compat_sysinfo { s32 uptime; u32 loads[3]; u32 totalram; u32 freeram; u32 sharedram; u32 bufferram; u32 totalswap; u32 freeswap; u16 procs; u16 pad; u32 totalhigh; u32 freehigh; u32 mem_unit; char _f[20-2*sizeof(u32)-sizeof(int)]; }; COMPAT_SYSCALL_DEFINE1(sysinfo, struct compat_sysinfo __user *, info) { struct sysinfo s; struct compat_sysinfo s_32; do_sysinfo(&s); /* Check to see if any memory value is too large for 32-bit and scale * down if needed */ if (upper_32_bits(s.totalram) || upper_32_bits(s.totalswap)) { int bitcount = 0; while (s.mem_unit < PAGE_SIZE) { s.mem_unit <<= 1; bitcount++; } s.totalram >>= bitcount; s.freeram >>= bitcount; s.sharedram >>= bitcount; s.bufferram >>= bitcount; s.totalswap >>= bitcount; s.freeswap >>= bitcount; s.totalhigh >>= bitcount; s.freehigh >>= bitcount; } memset(&s_32, 0, sizeof(s_32)); s_32.uptime = s.uptime; s_32.loads[0] = s.loads[0]; s_32.loads[1] = s.loads[1]; s_32.loads[2] = s.loads[2]; s_32.totalram = s.totalram; s_32.freeram = s.freeram; s_32.sharedram = s.sharedram; s_32.bufferram = s.bufferram; s_32.totalswap = s.totalswap; s_32.freeswap = s.freeswap; s_32.procs = s.procs; s_32.totalhigh = s.totalhigh; s_32.freehigh = s.freehigh; s_32.mem_unit = s.mem_unit; if (copy_to_user(info, &s_32, sizeof(s_32))) return -EFAULT; return 0; } #endif /* CONFIG_COMPAT */
38 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 /* SPDX-License-Identifier: GPL-2.0-only */ /* * async.h: Asynchronous function calls for boot performance * * (C) Copyright 2009 Intel Corporation * Author: Arjan van de Ven <arjan@linux.intel.com> */ #ifndef __ASYNC_H__ #define __ASYNC_H__ #include <linux/types.h> #include <linux/list.h> #include <linux/numa.h> #include <linux/device.h> typedef u64 async_cookie_t; typedef void (*async_func_t) (void *data, async_cookie_t cookie); struct async_domain { struct list_head pending; unsigned registered:1; }; /* * domain participates in global async_synchronize_full */ #define ASYNC_DOMAIN(_name) \ struct async_domain _name = { .pending = LIST_HEAD_INIT(_name.pending), \ .registered = 1 } /* * domain is free to go out of scope as soon as all pending work is * complete, this domain does not participate in async_synchronize_full */ #define ASYNC_DOMAIN_EXCLUSIVE(_name) \ struct async_domain _name = { .pending = LIST_HEAD_INIT(_name.pending), \ .registered = 0 } async_cookie_t async_schedule_node(async_func_t func, void *data, int node); async_cookie_t async_schedule_node_domain(async_func_t func, void *data, int node, struct async_domain *domain); /** * async_schedule - schedule a function for asynchronous execution * @func: function to execute asynchronously * @data: data pointer to pass to the function * * Returns an async_cookie_t that may be used for checkpointing later. * Note: This function may be called from atomic or non-atomic contexts. */ static inline async_cookie_t async_schedule(async_func_t func, void *data) { return async_schedule_node(func, data, NUMA_NO_NODE); } /** * async_schedule_domain - schedule a function for asynchronous execution within a certain domain * @func: function to execute asynchronously * @data: data pointer to pass to the function * @domain: the domain * * Returns an async_cookie_t that may be used for checkpointing later. * @domain may be used in the async_synchronize_*_domain() functions to * wait within a certain synchronization domain rather than globally. * Note: This function may be called from atomic or non-atomic contexts. */ static inline async_cookie_t async_schedule_domain(async_func_t func, void *data, struct async_domain *domain) { return async_schedule_node_domain(func, data, NUMA_NO_NODE, domain); } /** * async_schedule_dev - A device specific version of async_schedule * @func: function to execute asynchronously * @dev: device argument to be passed to function * * Returns an async_cookie_t that may be used for checkpointing later. * @dev is used as both the argument for the function and to provide NUMA * context for where to run the function. By doing this we can try to * provide for the best possible outcome by operating on the device on the * CPUs closest to the device. * Note: This function may be called from atomic or non-atomic contexts. */ static inline async_cookie_t async_schedule_dev(async_func_t func, struct device *dev) { return async_schedule_node(func, dev, dev_to_node(dev)); } bool async_schedule_dev_nocall(async_func_t func, struct device *dev); /** * async_schedule_dev_domain - A device specific version of async_schedule_domain * @func: function to execute asynchronously * @dev: device argument to be passed to function * @domain: the domain * * Returns an async_cookie_t that may be used for checkpointing later. * @dev is used as both the argument for the function and to provide NUMA * context for where to run the function. By doing this we can try to * provide for the best possible outcome by operating on the device on the * CPUs closest to the device. * @domain may be used in the async_synchronize_*_domain() functions to * wait within a certain synchronization domain rather than globally. * Note: This function may be called from atomic or non-atomic contexts. */ static inline async_cookie_t async_schedule_dev_domain(async_func_t func, struct device *dev, struct async_domain *domain) { return async_schedule_node_domain(func, dev, dev_to_node(dev), domain); } extern void async_synchronize_full(void); extern void async_synchronize_full_domain(struct async_domain *domain); extern void async_synchronize_cookie(async_cookie_t cookie); extern void async_synchronize_cookie_domain(async_cookie_t cookie, struct async_domain *domain); extern bool current_is_async(void); extern void async_init(void); #endif
21 1 383 10 36 12 201 31 23 3 53 7 248 18 1 18 142 2 2 18 18 30 5 3 2 5 5 5 3 5 5 5 5 5 167 43 153 153 24 47 201 200 199 37 7 8 8 8 279 278 279 279 103 105 105 105 95 205 206 205 2 2 202 1 1 136 23 123 138 115 3 16 18 16 177 178 7 5 2 7 7 7 181 7 7 184 91 133 181 189 86 128 3 178 1 179 179 180 1 4 4 45 45 40 6 6 32 32 32 32 32 32 32 32 51 52 102 102 102 102 47 162 162 117 102 102 21 49 50 3 47 44 42 21 2 39 7 7 34 33 30 14 24 17 30 30 22 9 25 6 30 24 30 30 28 38 11 48 48 33 47 17 154 143 9 11 111 2 1 1 168 179 24 4 137 87 184 250 249 6 1 5 1 45 242 25 251 127 5 14 2 3 19 19 16 39 39 31 32 4 2 2 24 32 32 2 3 2 3 3 1 3 1 1 4 3 3 1 1 3 1 1 55 3 4 1 8 9 53 25 14 15 2 2 3 2 10 1 1 1 2 1 25 28 14 12 12 6 4 1 1 1 5 4 2 1 2 3 1 2 1 6 3 136 30 18 9 19 6 6 249 250 249 150 151 5 8 20 42 131 200 18 2 165 187 175 4 190 190 160 159 162 160 162 159 149 31 4 24 15 4 150 5 3 4 9 5 2 151 29 9 148 5 5 52 3 15 9 10 2 6 2 1 1 3 158 157 22 11 1 12 4 25 27 7 2 9 5 6 11 9 4 13 2 12 14 13 13 26 84 4 85 1 85 57 22 30 9 18 5 8 8 3 1 24 13 55 2 4 2 7 9 6 3 12 6 11 15 4 39 136 38 3 33 218 157 163 24 3 5 1 3 8 52 7 147 7 157 154 71 14 149 52 157 4 14 14 1 8 4 4 151 29 175 181 183 181 208 180 246 137 164 170 159 156 164 240 157 162 175 160 164 246 164 94 175 160 221 220 178 162 194 125 94 95 217 131 218 164 220 10 219 200 33 220 137 104 165 218 147 219 49 245 130 139 184 246 160 220 219 220 221 218 246 2 8 2 86 84 1 2 53 11 1 1 2 1 2 2 1 2 2 3 2 184 227 274 15 15 30 30 298 297 160 160 160 160 9 8 8 2 102 102 102 169 446 2 2 4 4 1 1 3 3 3 3 3 4 4 4 1 3 3 3 3 3 71 71 11 66 140 140 3 1 2 2 2 2 2 2 2 2 2 107 107 107 21 2 10 12 1 21 12 3 7 44 2 2 3 10 23 6 1 1 3 4 74 68 44 23 5 1 20 43 6 42 13 13 5 5 5 15 15 8 7 5 10 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 1991, 1992 Linus Torvalds */ /* * Hopefully this will be a rather complete VT102 implementation. * * Beeping thanks to John T Kohl. * * Virtual Consoles, Screen Blanking, Screen Dumping, Color, Graphics * Chars, and VT100 enhancements by Peter MacDonald. * * Copy and paste function by Andrew Haylett, * some enhancements by Alessandro Rubini. * * Code to check for different video-cards mostly by Galen Hunt, * <g-hunt@ee.utah.edu> * * Rudimentary ISO 10646/Unicode/UTF-8 character set support by * Markus Kuhn, <mskuhn@immd4.informatik.uni-erlangen.de>. * * Dynamic allocation of consoles, aeb@cwi.nl, May 1994 * Resizing of consoles, aeb, 940926 * * Code for xterm like mouse click reporting by Peter Orbaek 20-Jul-94 * <poe@daimi.aau.dk> * * User-defined bell sound, new setterm control sequences and printk * redirection by Martin Mares <mj@k332.feld.cvut.cz> 19-Nov-95 * * APM screenblank bug fixed Takashi Manabe <manabe@roy.dsl.tutics.tut.jp> * * Merge with the abstract console driver by Geert Uytterhoeven * <geert@linux-m68k.org>, Jan 1997. * * Original m68k console driver modifications by * * - Arno Griffioen <arno@usn.nl> * - David Carter <carter@cs.bris.ac.uk> * * The abstract console driver provides a generic interface for a text * console. It supports VGA text mode, frame buffer based graphical consoles * and special graphics processors that are only accessible through some * registers (e.g. a TMS340x0 GSP). * * The interface to the hardware is specified using a special structure * (struct consw) which contains function pointers to console operations * (see <linux/console.h> for more information). * * Support for changeable cursor shape * by Pavel Machek <pavel@atrey.karlin.mff.cuni.cz>, August 1997 * * Ported to i386 and con_scrolldelta fixed * by Emmanuel Marty <core@ggi-project.org>, April 1998 * * Resurrected character buffers in videoram plus lots of other trickery * by Martin Mares <mj@atrey.karlin.mff.cuni.cz>, July 1998 * * Removed old-style timers, introduced console_timer, made timer * deletion SMP-safe. 17Jun00, Andrew Morton * * Removed console_lock, enabled interrupts across all console operations * 13 March 2001, Andrew Morton * * Fixed UTF-8 mode so alternate charset modes always work according * to control sequences interpreted in do_con_trol function * preserving backward VT100 semigraphics compatibility, * malformed UTF sequences represented as sequences of replacement glyphs, * original codes or '?' as a last resort if replacement glyph is undefined * by Adam Tla/lka <atlka@pg.gda.pl>, Aug 2006 */ #include <linux/module.h> #include <linux/types.h> #include <linux/sched/signal.h> #include <linux/tty.h> #include <linux/tty_flip.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/errno.h> #include <linux/kd.h> #include <linux/slab.h> #include <linux/vmalloc.h> #include <linux/major.h> #include <linux/mm.h> #include <linux/console.h> #include <linux/init.h> #include <linux/mutex.h> #include <linux/vt_kern.h> #include <linux/selection.h> #include <linux/tiocl.h> #include <linux/kbd_kern.h> #include <linux/consolemap.h> #include <linux/timer.h> #include <linux/interrupt.h> #include <linux/workqueue.h> #include <linux/pm.h> #include <linux/font.h> #include <linux/bitops.h> #include <linux/notifier.h> #include <linux/device.h> #include <linux/io.h> #include <linux/uaccess.h> #include <linux/kdb.h> #include <linux/ctype.h> #include <linux/bsearch.h> #include <linux/gcd.h> #define MAX_NR_CON_DRIVER 16 #define CON_DRIVER_FLAG_MODULE 1 #define CON_DRIVER_FLAG_INIT 2 #define CON_DRIVER_FLAG_ATTR 4 #define CON_DRIVER_FLAG_ZOMBIE 8 struct con_driver { const struct consw *con; const char *desc; struct device *dev; int node; int first; int last; int flag; }; static struct con_driver registered_con_driver[MAX_NR_CON_DRIVER]; const struct consw *conswitchp; /* * Here is the default bell parameters: 750HZ, 1/8th of a second */ #define DEFAULT_BELL_PITCH 750 #define DEFAULT_BELL_DURATION (HZ/8) #define DEFAULT_CURSOR_BLINK_MS 200 struct vc vc_cons [MAX_NR_CONSOLES]; EXPORT_SYMBOL(vc_cons); static const struct consw *con_driver_map[MAX_NR_CONSOLES]; static int con_open(struct tty_struct *, struct file *); static void vc_init(struct vc_data *vc, int do_clear); static void gotoxy(struct vc_data *vc, int new_x, int new_y); static void save_cur(struct vc_data *vc); static void reset_terminal(struct vc_data *vc, int do_clear); static void con_flush_chars(struct tty_struct *tty); static int set_vesa_blanking(u8 __user *mode); static void set_cursor(struct vc_data *vc); static void hide_cursor(struct vc_data *vc); static void console_callback(struct work_struct *ignored); static void con_driver_unregister_callback(struct work_struct *ignored); static void blank_screen_t(struct timer_list *unused); static void set_palette(struct vc_data *vc); static void unblank_screen(void); #define vt_get_kmsg_redirect() vt_kmsg_redirect(-1) int default_utf8 = true; module_param(default_utf8, int, S_IRUGO | S_IWUSR); int global_cursor_default = -1; module_param(global_cursor_default, int, S_IRUGO | S_IWUSR); EXPORT_SYMBOL(global_cursor_default); static int cur_default = CUR_UNDERLINE; module_param(cur_default, int, S_IRUGO | S_IWUSR); /* * ignore_poke: don't unblank the screen when things are typed. This is * mainly for the privacy of braille terminal users. */ static int ignore_poke; int do_poke_blanked_console; int console_blanked; EXPORT_SYMBOL(console_blanked); static enum vesa_blank_mode vesa_blank_mode; static int vesa_off_interval; static int blankinterval; core_param(consoleblank, blankinterval, int, 0444); static DECLARE_WORK(console_work, console_callback); static DECLARE_WORK(con_driver_unregister_work, con_driver_unregister_callback); /* * fg_console is the current virtual console, * last_console is the last used one, * want_console is the console we want to switch to, * saved_* variants are for save/restore around kernel debugger enter/leave */ int fg_console; EXPORT_SYMBOL(fg_console); int last_console; int want_console = -1; static int saved_fg_console; static int saved_last_console; static int saved_want_console; static int saved_vc_mode; static int saved_console_blanked; /* * For each existing display, we have a pointer to console currently visible * on that display, allowing consoles other than fg_console to be refreshed * appropriately. Unless the low-level driver supplies its own display_fg * variable, we use this one for the "master display". */ static struct vc_data *master_display_fg; /* * Unfortunately, we need to delay tty echo when we're currently writing to the * console since the code is (and always was) not re-entrant, so we schedule * all flip requests to process context with schedule-task() and run it from * console_callback(). */ /* * For the same reason, we defer scrollback to the console callback. */ static int scrollback_delta; /* * Hook so that the power management routines can (un)blank * the console on our behalf. */ int (*console_blank_hook)(int); EXPORT_SYMBOL(console_blank_hook); static DEFINE_TIMER(console_timer, blank_screen_t); static int blank_state; static int blank_timer_expired; enum { blank_off = 0, blank_normal_wait, blank_vesa_wait, }; /* * /sys/class/tty/tty0/ * * the attribute 'active' contains the name of the current vc * console and it supports poll() to detect vc switches */ static struct device *tty0dev; /* * Notifier list for console events. */ static ATOMIC_NOTIFIER_HEAD(vt_notifier_list); int register_vt_notifier(struct notifier_block *nb) { return atomic_notifier_chain_register(&vt_notifier_list, nb); } EXPORT_SYMBOL_GPL(register_vt_notifier); int unregister_vt_notifier(struct notifier_block *nb) { return atomic_notifier_chain_unregister(&vt_notifier_list, nb); } EXPORT_SYMBOL_GPL(unregister_vt_notifier); static void notify_write(struct vc_data *vc, unsigned int unicode) { struct vt_notifier_param param = { .vc = vc, .c = unicode }; atomic_notifier_call_chain(&vt_notifier_list, VT_WRITE, &param); } static void notify_update(struct vc_data *vc) { struct vt_notifier_param param = { .vc = vc }; atomic_notifier_call_chain(&vt_notifier_list, VT_UPDATE, &param); } /* * Low-Level Functions */ static inline bool con_is_fg(const struct vc_data *vc) { return vc->vc_num == fg_console; } static inline bool con_should_update(const struct vc_data *vc) { return con_is_visible(vc) && !console_blanked; } static inline u16 *screenpos(const struct vc_data *vc, unsigned int offset, bool viewed) { unsigned long origin = viewed ? vc->vc_visible_origin : vc->vc_origin; return (u16 *)(origin + offset); } static void con_putc(struct vc_data *vc, u16 ca, unsigned int y, unsigned int x) { if (vc->vc_sw->con_putc) vc->vc_sw->con_putc(vc, ca, y, x); else vc->vc_sw->con_putcs(vc, &ca, 1, y, x); } /* Called from the keyboard irq path.. */ static inline void scrolldelta(int lines) { /* FIXME */ /* scrolldelta needs some kind of consistency lock, but the BKL was and still is not protecting versus the scheduled back end */ scrollback_delta += lines; schedule_console_callback(); } void schedule_console_callback(void) { schedule_work(&console_work); } /* * Code to manage unicode-based screen buffers */ /* * Our screen buffer is preceded by an array of line pointers so that * scrolling only implies some pointer shuffling. */ static u32 **vc_uniscr_alloc(unsigned int cols, unsigned int rows) { u32 **uni_lines; void *p; unsigned int memsize, i, col_size = cols * sizeof(**uni_lines); /* allocate everything in one go */ memsize = col_size * rows; memsize += rows * sizeof(*uni_lines); uni_lines = vzalloc(memsize); if (!uni_lines) return NULL; /* initial line pointers */ p = uni_lines + rows; for (i = 0; i < rows; i++) { uni_lines[i] = p; p += col_size; } return uni_lines; } static void vc_uniscr_free(u32 **uni_lines) { vfree(uni_lines); } static void vc_uniscr_set(struct vc_data *vc, u32 **new_uni_lines) { vc_uniscr_free(vc->vc_uni_lines); vc->vc_uni_lines = new_uni_lines; } static void vc_uniscr_putc(struct vc_data *vc, u32 uc) { if (vc->vc_uni_lines) vc->vc_uni_lines[vc->state.y][vc->state.x] = uc; } static void vc_uniscr_insert(struct vc_data *vc, unsigned int nr) { if (vc->vc_uni_lines) { u32 *ln = vc->vc_uni_lines[vc->state.y]; unsigned int x = vc->state.x, cols = vc->vc_cols; memmove(&ln[x + nr], &ln[x], (cols - x - nr) * sizeof(*ln)); memset32(&ln[x], ' ', nr); } } static void vc_uniscr_delete(struct vc_data *vc, unsigned int nr) { if (vc->vc_uni_lines) { u32 *ln = vc->vc_uni_lines[vc->state.y]; unsigned int x = vc->state.x, cols = vc->vc_cols; memmove(&ln[x], &ln[x + nr], (cols - x - nr) * sizeof(*ln)); memset32(&ln[cols - nr], ' ', nr); } } static void vc_uniscr_clear_line(struct vc_data *vc, unsigned int x, unsigned int nr) { if (vc->vc_uni_lines) memset32(&vc->vc_uni_lines[vc->state.y][x], ' ', nr); } static void vc_uniscr_clear_lines(struct vc_data *vc, unsigned int y, unsigned int nr) { if (vc->vc_uni_lines) while (nr--) memset32(vc->vc_uni_lines[y++], ' ', vc->vc_cols); } /* juggling array rotation algorithm (complexity O(N), size complexity O(1)) */ static void juggle_array(u32 **array, unsigned int size, unsigned int nr) { unsigned int gcd_idx; for (gcd_idx = 0; gcd_idx < gcd(nr, size); gcd_idx++) { u32 *gcd_idx_val = array[gcd_idx]; unsigned int dst_idx = gcd_idx; while (1) { unsigned int src_idx = (dst_idx + nr) % size; if (src_idx == gcd_idx) break; array[dst_idx] = array[src_idx]; dst_idx = src_idx; } array[dst_idx] = gcd_idx_val; } } static void vc_uniscr_scroll(struct vc_data *vc, unsigned int top, unsigned int bottom, enum con_scroll dir, unsigned int nr) { u32 **uni_lines = vc->vc_uni_lines; unsigned int size = bottom - top; if (!uni_lines) return; if (dir == SM_DOWN) { juggle_array(&uni_lines[top], size, size - nr); vc_uniscr_clear_lines(vc, top, nr); } else { juggle_array(&uni_lines[top], size, nr); vc_uniscr_clear_lines(vc, bottom - nr, nr); } } static void vc_uniscr_copy_area(u32 **dst_lines, unsigned int dst_cols, unsigned int dst_rows, u32 **src_lines, unsigned int src_cols, unsigned int src_top_row, unsigned int src_bot_row) { unsigned int dst_row = 0; if (!dst_lines) return; while (src_top_row < src_bot_row) { u32 *src_line = src_lines[src_top_row]; u32 *dst_line = dst_lines[dst_row]; memcpy(dst_line, src_line, src_cols * sizeof(*src_line)); if (dst_cols - src_cols) memset32(dst_line + src_cols, ' ', dst_cols - src_cols); src_top_row++; dst_row++; } while (dst_row < dst_rows) { u32 *dst_line = dst_lines[dst_row]; memset32(dst_line, ' ', dst_cols); dst_row++; } } /* * Called from vcs_read() to make sure unicode screen retrieval is possible. * This will initialize the unicode screen buffer if not already done. * This returns 0 if OK, or a negative error code otherwise. * In particular, -ENODATA is returned if the console is not in UTF-8 mode. */ int vc_uniscr_check(struct vc_data *vc) { u32 **uni_lines; unsigned short *p; int x, y, mask; WARN_CONSOLE_UNLOCKED(); if (!vc->vc_utf) return -ENODATA; if (vc->vc_uni_lines) return 0; uni_lines = vc_uniscr_alloc(vc->vc_cols, vc->vc_rows); if (!uni_lines) return -ENOMEM; /* * Let's populate it initially with (imperfect) reverse translation. * This is the next best thing we can do short of having it enabled * from the start even when no users rely on this functionality. True * unicode content will be available after a complete screen refresh. */ p = (unsigned short *)vc->vc_origin; mask = vc->vc_hi_font_mask | 0xff; for (y = 0; y < vc->vc_rows; y++) { u32 *line = uni_lines[y]; for (x = 0; x < vc->vc_cols; x++) { u16 glyph = scr_readw(p++) & mask; line[x] = inverse_translate(vc, glyph, true); } } vc->vc_uni_lines = uni_lines; return 0; } /* * Called from vcs_read() to get the unicode data from the screen. * This must be preceded by a successful call to vc_uniscr_check() once * the console lock has been taken. */ void vc_uniscr_copy_line(const struct vc_data *vc, void *dest, bool viewed, unsigned int row, unsigned int col, unsigned int nr) { u32 **uni_lines = vc->vc_uni_lines; int offset = row * vc->vc_size_row + col * 2; unsigned long pos; if (WARN_ON_ONCE(!uni_lines)) return; pos = (unsigned long)screenpos(vc, offset, viewed); if (pos >= vc->vc_origin && pos < vc->vc_scr_end) { /* * Desired position falls in the main screen buffer. * However the actual row/col might be different if * scrollback is active. */ row = (pos - vc->vc_origin) / vc->vc_size_row; col = ((pos - vc->vc_origin) % vc->vc_size_row) / 2; memcpy(dest, &uni_lines[row][col], nr * sizeof(u32)); } else { /* * Scrollback is active. For now let's simply backtranslate * the screen glyphs until the unicode screen buffer does * synchronize with console display drivers for a scrollback * buffer of its own. */ u16 *p = (u16 *)pos; int mask = vc->vc_hi_font_mask | 0xff; u32 *uni_buf = dest; while (nr--) { u16 glyph = scr_readw(p++) & mask; *uni_buf++ = inverse_translate(vc, glyph, true); } } } static void con_scroll(struct vc_data *vc, unsigned int top, unsigned int bottom, enum con_scroll dir, unsigned int nr) { unsigned int rows = bottom - top; u16 *clear, *dst, *src; if (top + nr >= bottom) nr = rows - 1; if (bottom > vc->vc_rows || top >= bottom || nr < 1) return; vc_uniscr_scroll(vc, top, bottom, dir, nr); if (con_is_visible(vc) && vc->vc_sw->con_scroll(vc, top, bottom, dir, nr)) return; src = clear = (u16 *)(vc->vc_origin + vc->vc_size_row * top); dst = (u16 *)(vc->vc_origin + vc->vc_size_row * (top + nr)); if (dir == SM_UP) { clear = src + (rows - nr) * vc->vc_cols; swap(src, dst); } scr_memmovew(dst, src, (rows - nr) * vc->vc_size_row); scr_memsetw(clear, vc->vc_video_erase_char, vc->vc_size_row * nr); } static void do_update_region(struct vc_data *vc, unsigned long start, int count) { unsigned int xx, yy, offset; u16 *p = (u16 *)start; offset = (start - vc->vc_origin) / 2; xx = offset % vc->vc_cols; yy = offset / vc->vc_cols; for(;;) { u16 attrib = scr_readw(p) & 0xff00; int startx = xx; u16 *q = p; while (xx < vc->vc_cols && count) { if (attrib != (scr_readw(p) & 0xff00)) { if (p > q) vc->vc_sw->con_putcs(vc, q, p-q, yy, startx); startx = xx; q = p; attrib = scr_readw(p) & 0xff00; } p++; xx++; count--; } if (p > q) vc->vc_sw->con_putcs(vc, q, p-q, yy, startx); if (!count) break; xx = 0; yy++; } } void update_region(struct vc_data *vc, unsigned long start, int count) { WARN_CONSOLE_UNLOCKED(); if (con_should_update(vc)) { hide_cursor(vc); do_update_region(vc, start, count); set_cursor(vc); } } EXPORT_SYMBOL(update_region); /* Structure of attributes is hardware-dependent */ static u8 build_attr(struct vc_data *vc, u8 _color, enum vc_intensity _intensity, bool _blink, bool _underline, bool _reverse, bool _italic) { if (vc->vc_sw->con_build_attr) return vc->vc_sw->con_build_attr(vc, _color, _intensity, _blink, _underline, _reverse, _italic); /* * ++roman: I completely changed the attribute format for monochrome * mode (!can_do_color). The formerly used MDA (monochrome display * adapter) format didn't allow the combination of certain effects. * Now the attribute is just a bit vector: * Bit 0..1: intensity (0..2) * Bit 2 : underline * Bit 3 : reverse * Bit 7 : blink */ { u8 a = _color; if (!vc->vc_can_do_color) return _intensity | (_italic << 1) | (_underline << 2) | (_reverse << 3) | (_blink << 7); if (_italic) a = (a & 0xF0) | vc->vc_itcolor; else if (_underline) a = (a & 0xf0) | vc->vc_ulcolor; else if (_intensity == VCI_HALF_BRIGHT) a = (a & 0xf0) | vc->vc_halfcolor; if (_reverse) a = (a & 0x88) | (((a >> 4) | (a << 4)) & 0x77); if (_blink) a ^= 0x80; if (_intensity == VCI_BOLD) a ^= 0x08; if (vc->vc_hi_font_mask == 0x100) a <<= 1; return a; } } static void update_attr(struct vc_data *vc) { vc->vc_attr = build_attr(vc, vc->state.color, vc->state.intensity, vc->state.blink, vc->state.underline, vc->state.reverse ^ vc->vc_decscnm, vc->state.italic); vc->vc_video_erase_char = ' ' | (build_attr(vc, vc->state.color, VCI_NORMAL, vc->state.blink, false, vc->vc_decscnm, false) << 8); } /* Note: inverting the screen twice should revert to the original state */ void invert_screen(struct vc_data *vc, int offset, int count, bool viewed) { u16 *p; WARN_CONSOLE_UNLOCKED(); count /= 2; p = screenpos(vc, offset, viewed); if (vc->vc_sw->con_invert_region) { vc->vc_sw->con_invert_region(vc, p, count); } else { u16 *q = p; int cnt = count; u16 a; if (!vc->vc_can_do_color) { while (cnt--) { a = scr_readw(q); a ^= 0x0800; scr_writew(a, q); q++; } } else if (vc->vc_hi_font_mask == 0x100) { while (cnt--) { a = scr_readw(q); a = (a & 0x11ff) | ((a & 0xe000) >> 4) | ((a & 0x0e00) << 4); scr_writew(a, q); q++; } } else { while (cnt--) { a = scr_readw(q); a = (a & 0x88ff) | ((a & 0x7000) >> 4) | ((a & 0x0700) << 4); scr_writew(a, q); q++; } } } if (con_should_update(vc)) do_update_region(vc, (unsigned long) p, count); notify_update(vc); } /* used by selection: complement pointer position */ void complement_pos(struct vc_data *vc, int offset) { static int old_offset = -1; static unsigned short old; static unsigned short oldx, oldy; WARN_CONSOLE_UNLOCKED(); if (old_offset != -1 && old_offset >= 0 && old_offset < vc->vc_screenbuf_size) { scr_writew(old, screenpos(vc, old_offset, true)); if (con_should_update(vc)) con_putc(vc, old, oldy, oldx); notify_update(vc); } old_offset = offset; if (offset != -1 && offset >= 0 && offset < vc->vc_screenbuf_size) { unsigned short new; u16 *p = screenpos(vc, offset, true); old = scr_readw(p); new = old ^ vc->vc_complement_mask; scr_writew(new, p); if (con_should_update(vc)) { oldx = (offset >> 1) % vc->vc_cols; oldy = (offset >> 1) / vc->vc_cols; con_putc(vc, new, oldy, oldx); } notify_update(vc); } } static void insert_char(struct vc_data *vc, unsigned int nr) { unsigned short *p = (unsigned short *) vc->vc_pos; vc_uniscr_insert(vc, nr); scr_memmovew(p + nr, p, (vc->vc_cols - vc->state.x - nr) * 2); scr_memsetw(p, vc->vc_video_erase_char, nr * 2); vc->vc_need_wrap = 0; if (con_should_update(vc)) do_update_region(vc, (unsigned long) p, vc->vc_cols - vc->state.x); } static void delete_char(struct vc_data *vc, unsigned int nr) { unsigned short *p = (unsigned short *) vc->vc_pos; vc_uniscr_delete(vc, nr); scr_memmovew(p, p + nr, (vc->vc_cols - vc->state.x - nr) * 2); scr_memsetw(p + vc->vc_cols - vc->state.x - nr, vc->vc_video_erase_char, nr * 2); vc->vc_need_wrap = 0; if (con_should_update(vc)) do_update_region(vc, (unsigned long) p, vc->vc_cols - vc->state.x); } static int softcursor_original = -1; static void add_softcursor(struct vc_data *vc) { int i = scr_readw((u16 *) vc->vc_pos); u32 type = vc->vc_cursor_type; if (!(type & CUR_SW)) return; if (softcursor_original != -1) return; softcursor_original = i; i |= CUR_SET(type); i ^= CUR_CHANGE(type); if ((type & CUR_ALWAYS_BG) && (softcursor_original & CUR_BG) == (i & CUR_BG)) i ^= CUR_BG; if ((type & CUR_INVERT_FG_BG) && (i & CUR_FG) == ((i & CUR_BG) >> 4)) i ^= CUR_FG; scr_writew(i, (u16 *)vc->vc_pos); if (con_should_update(vc)) con_putc(vc, i, vc->state.y, vc->state.x); } static void hide_softcursor(struct vc_data *vc) { if (softcursor_original != -1) { scr_writew(softcursor_original, (u16 *)vc->vc_pos); if (con_should_update(vc)) con_putc(vc, softcursor_original, vc->state.y, vc->state.x); softcursor_original = -1; } } static void hide_cursor(struct vc_data *vc) { if (vc_is_sel(vc)) clear_selection(); vc->vc_sw->con_cursor(vc, false); hide_softcursor(vc); } static void set_cursor(struct vc_data *vc) { if (!con_is_fg(vc) || console_blanked || vc->vc_mode == KD_GRAPHICS) return; if (vc->vc_deccm) { if (vc_is_sel(vc)) clear_selection(); add_softcursor(vc); if (CUR_SIZE(vc->vc_cursor_type) != CUR_NONE) vc->vc_sw->con_cursor(vc, true); } else hide_cursor(vc); } static void set_origin(struct vc_data *vc) { WARN_CONSOLE_UNLOCKED(); if (!con_is_visible(vc) || !vc->vc_sw->con_set_origin || !vc->vc_sw->con_set_origin(vc)) vc->vc_origin = (unsigned long)vc->vc_screenbuf; vc->vc_visible_origin = vc->vc_origin; vc->vc_scr_end = vc->vc_origin + vc->vc_screenbuf_size; vc->vc_pos = vc->vc_origin + vc->vc_size_row * vc->state.y + 2 * vc->state.x; } static void save_screen(struct vc_data *vc) { WARN_CONSOLE_UNLOCKED(); if (vc->vc_sw->con_save_screen) vc->vc_sw->con_save_screen(vc); } static void flush_scrollback(struct vc_data *vc) { WARN_CONSOLE_UNLOCKED(); set_origin(vc); if (!con_is_visible(vc)) return; /* * The legacy way for flushing the scrollback buffer is to use a side * effect of the con_switch method. We do it only on the foreground * console as background consoles have no scrollback buffers in that * case and we obviously don't want to switch to them. */ hide_cursor(vc); vc->vc_sw->con_switch(vc); set_cursor(vc); } /* * Redrawing of screen */ void clear_buffer_attributes(struct vc_data *vc) { unsigned short *p = (unsigned short *)vc->vc_origin; int count = vc->vc_screenbuf_size / 2; int mask = vc->vc_hi_font_mask | 0xff; for (; count > 0; count--, p++) { scr_writew((scr_readw(p)&mask) | (vc->vc_video_erase_char & ~mask), p); } } void redraw_screen(struct vc_data *vc, int is_switch) { int redraw = 0; WARN_CONSOLE_UNLOCKED(); if (!vc) { /* strange ... */ /* printk("redraw_screen: tty %d not allocated ??\n", new_console+1); */ return; } if (is_switch) { struct vc_data *old_vc = vc_cons[fg_console].d; if (old_vc == vc) return; if (!con_is_visible(vc)) redraw = 1; *vc->vc_display_fg = vc; fg_console = vc->vc_num; hide_cursor(old_vc); if (!con_is_visible(old_vc)) { save_screen(old_vc); set_origin(old_vc); } if (tty0dev) sysfs_notify(&tty0dev->kobj, NULL, "active"); } else { hide_cursor(vc); redraw = 1; } if (redraw) { bool update; int old_was_color = vc->vc_can_do_color; set_origin(vc); update = vc->vc_sw->con_switch(vc); set_palette(vc); /* * If console changed from mono<->color, the best we can do * is to clear the buffer attributes. As it currently stands, * rebuilding new attributes from the old buffer is not doable * without overly complex code. */ if (old_was_color != vc->vc_can_do_color) { update_attr(vc); clear_buffer_attributes(vc); } if (update && vc->vc_mode != KD_GRAPHICS) do_update_region(vc, vc->vc_origin, vc->vc_screenbuf_size / 2); } set_cursor(vc); if (is_switch) { vt_set_leds_compute_shiftstate(); notify_update(vc); } } EXPORT_SYMBOL(redraw_screen); /* * Allocation, freeing and resizing of VTs. */ int vc_cons_allocated(unsigned int i) { return (i < MAX_NR_CONSOLES && vc_cons[i].d); } static void visual_init(struct vc_data *vc, int num, bool init) { /* ++Geert: vc->vc_sw->con_init determines console size */ if (vc->vc_sw) module_put(vc->vc_sw->owner); vc->vc_sw = conswitchp; if (con_driver_map[num]) vc->vc_sw = con_driver_map[num]; __module_get(vc->vc_sw->owner); vc->vc_num = num; vc->vc_display_fg = &master_display_fg; if (vc->uni_pagedict_loc) con_free_unimap(vc); vc->uni_pagedict_loc = &vc->uni_pagedict; vc->uni_pagedict = NULL; vc->vc_hi_font_mask = 0; vc->vc_complement_mask = 0; vc->vc_can_do_color = 0; vc->vc_cur_blink_ms = DEFAULT_CURSOR_BLINK_MS; vc->vc_sw->con_init(vc, init); if (!vc->vc_complement_mask) vc->vc_complement_mask = vc->vc_can_do_color ? 0x7700 : 0x0800; vc->vc_s_complement_mask = vc->vc_complement_mask; vc->vc_size_row = vc->vc_cols << 1; vc->vc_screenbuf_size = vc->vc_rows * vc->vc_size_row; } static void visual_deinit(struct vc_data *vc) { vc->vc_sw->con_deinit(vc); module_put(vc->vc_sw->owner); } static void vc_port_destruct(struct tty_port *port) { struct vc_data *vc = container_of(port, struct vc_data, port); kfree(vc); } static const struct tty_port_operations vc_port_ops = { .destruct = vc_port_destruct, }; /* * Change # of rows and columns (0 means unchanged/the size of fg_console) * [this is to be used together with some user program * like resize that changes the hardware videomode] */ #define VC_MAXCOL (32767) #define VC_MAXROW (32767) int vc_allocate(unsigned int currcons) /* return 0 on success */ { struct vt_notifier_param param; struct vc_data *vc; int err; WARN_CONSOLE_UNLOCKED(); if (currcons >= MAX_NR_CONSOLES) return -ENXIO; if (vc_cons[currcons].d) return 0; /* due to the granularity of kmalloc, we waste some memory here */ /* the alloc is done in two steps, to optimize the common situation of a 25x80 console (structsize=216, screenbuf_size=4000) */ /* although the numbers above are not valid since long ago, the point is still up-to-date and the comment still has its value even if only as a historical artifact. --mj, July 1998 */ param.vc = vc = kzalloc(sizeof(struct vc_data), GFP_KERNEL); if (!vc) return -ENOMEM; vc_cons[currcons].d = vc; tty_port_init(&vc->port); vc->port.ops = &vc_port_ops; INIT_WORK(&vc_cons[currcons].SAK_work, vc_SAK); visual_init(vc, currcons, true); if (!*vc->uni_pagedict_loc) con_set_default_unimap(vc); err = -EINVAL; if (vc->vc_cols > VC_MAXCOL || vc->vc_rows > VC_MAXROW || vc->vc_screenbuf_size > KMALLOC_MAX_SIZE || !vc->vc_screenbuf_size) goto err_free; err = -ENOMEM; vc->vc_screenbuf = kzalloc(vc->vc_screenbuf_size, GFP_KERNEL); if (!vc->vc_screenbuf) goto err_free; /* If no drivers have overridden us and the user didn't pass a boot option, default to displaying the cursor */ if (global_cursor_default == -1) global_cursor_default = 1; vc_init(vc, 1); vcs_make_sysfs(currcons); atomic_notifier_call_chain(&vt_notifier_list, VT_ALLOCATE, &param); return 0; err_free: visual_deinit(vc); kfree(vc); vc_cons[currcons].d = NULL; return err; } static inline int resize_screen(struct vc_data *vc, int width, int height, bool from_user) { /* Resizes the resolution of the display adapater */ int err = 0; if (vc->vc_sw->con_resize) err = vc->vc_sw->con_resize(vc, width, height, from_user); return err; } /** * vc_do_resize - resizing method for the tty * @tty: tty being resized * @vc: virtual console private data * @cols: columns * @lines: lines * @from_user: invoked by a user? * * Resize a virtual console, clipping according to the actual constraints. If * the caller passes a tty structure then update the termios winsize * information and perform any necessary signal handling. * * Locking: Caller must hold the console semaphore. Takes the termios rwsem and * ctrl.lock of the tty IFF a tty is passed. */ static int vc_do_resize(struct tty_struct *tty, struct vc_data *vc, unsigned int cols, unsigned int lines, bool from_user) { unsigned long old_origin, new_origin, new_scr_end, rlth, rrem, err = 0; unsigned long end; unsigned int old_rows, old_row_size, first_copied_row; unsigned int new_cols, new_rows, new_row_size, new_screen_size; unsigned short *oldscreen, *newscreen; u32 **new_uniscr = NULL; WARN_CONSOLE_UNLOCKED(); if (cols > VC_MAXCOL || lines > VC_MAXROW) return -EINVAL; new_cols = (cols ? cols : vc->vc_cols); new_rows = (lines ? lines : vc->vc_rows); new_row_size = new_cols << 1; new_screen_size = new_row_size * new_rows; if (new_cols == vc->vc_cols && new_rows == vc->vc_rows) { /* * This function is being called here to cover the case * where the userspace calls the FBIOPUT_VSCREENINFO twice, * passing the same fb_var_screeninfo containing the fields * yres/xres equal to a number non-multiple of vc_font.height * and yres_virtual/xres_virtual equal to number lesser than the * vc_font.height and yres/xres. * In the second call, the struct fb_var_screeninfo isn't * being modified by the underlying driver because of the * if above, and this causes the fbcon_display->vrows to become * negative and it eventually leads to out-of-bound * access by the imageblit function. * To give the correct values to the struct and to not have * to deal with possible errors from the code below, we call * the resize_screen here as well. */ return resize_screen(vc, new_cols, new_rows, from_user); } if (new_screen_size > KMALLOC_MAX_SIZE || !new_screen_size) return -EINVAL; newscreen = kzalloc(new_screen_size, GFP_USER); if (!newscreen) return -ENOMEM; if (vc->vc_uni_lines) { new_uniscr = vc_uniscr_alloc(new_cols, new_rows); if (!new_uniscr) { kfree(newscreen); return -ENOMEM; } } if (vc_is_sel(vc)) clear_selection(); old_rows = vc->vc_rows; old_row_size = vc->vc_size_row; err = resize_screen(vc, new_cols, new_rows, from_user); if (err) { kfree(newscreen); vc_uniscr_free(new_uniscr); return err; } vc->vc_rows = new_rows; vc->vc_cols = new_cols; vc->vc_size_row = new_row_size; vc->vc_screenbuf_size = new_screen_size; rlth = min(old_row_size, new_row_size); rrem = new_row_size - rlth; old_origin = vc->vc_origin; new_origin = (long) newscreen; new_scr_end = new_origin + new_screen_size; if (vc->state.y > new_rows) { if (old_rows - vc->state.y < new_rows) { /* * Cursor near the bottom, copy contents from the * bottom of buffer */ first_copied_row = (old_rows - new_rows); } else { /* * Cursor is in no man's land, copy 1/2 screenful * from the top and bottom of cursor position */ first_copied_row = (vc->state.y - new_rows/2); } old_origin += first_copied_row * old_row_size; } else first_copied_row = 0; end = old_origin + old_row_size * min(old_rows, new_rows); vc_uniscr_copy_area(new_uniscr, new_cols, new_rows, vc->vc_uni_lines, rlth/2, first_copied_row, min(old_rows, new_rows)); vc_uniscr_set(vc, new_uniscr); update_attr(vc); while (old_origin < end) { scr_memcpyw((unsigned short *) new_origin, (unsigned short *) old_origin, rlth); if (rrem) scr_memsetw((void *)(new_origin + rlth), vc->vc_video_erase_char, rrem); old_origin += old_row_size; new_origin += new_row_size; } if (new_scr_end > new_origin) scr_memsetw((void *)new_origin, vc->vc_video_erase_char, new_scr_end - new_origin); oldscreen = vc->vc_screenbuf; vc->vc_screenbuf = newscreen; vc->vc_screenbuf_size = new_screen_size; set_origin(vc); kfree(oldscreen); /* do part of a reset_terminal() */ vc->vc_top = 0; vc->vc_bottom = vc->vc_rows; gotoxy(vc, vc->state.x, vc->state.y); save_cur(vc); if (tty) { /* Rewrite the requested winsize data with the actual resulting sizes */ struct winsize ws; memset(&ws, 0, sizeof(ws)); ws.ws_row = vc->vc_rows; ws.ws_col = vc->vc_cols; ws.ws_ypixel = vc->vc_scan_lines; tty_do_resize(tty, &ws); } if (con_is_visible(vc)) update_screen(vc); vt_event_post(VT_EVENT_RESIZE, vc->vc_num, vc->vc_num); notify_update(vc); return err; } /** * __vc_resize - resize a VT * @vc: virtual console * @cols: columns * @rows: rows * @from_user: invoked by a user? * * Resize a virtual console as seen from the console end of things. We use the * common vc_do_resize() method to update the structures. * * Locking: The caller must hold the console sem to protect console internals * and @vc->port.tty. */ int __vc_resize(struct vc_data *vc, unsigned int cols, unsigned int rows, bool from_user) { return vc_do_resize(vc->port.tty, vc, cols, rows, from_user); } EXPORT_SYMBOL(__vc_resize); /** * vt_resize - resize a VT * @tty: tty to resize * @ws: winsize attributes * * Resize a virtual terminal. This is called by the tty layer as we register * our own handler for resizing. The mutual helper does all the actual work. * * Locking: Takes the console sem and the called methods then take the tty * termios_rwsem and the tty ctrl.lock in that order. */ static int vt_resize(struct tty_struct *tty, struct winsize *ws) { struct vc_data *vc = tty->driver_data; int ret; console_lock(); ret = vc_do_resize(tty, vc, ws->ws_col, ws->ws_row, false); console_unlock(); return ret; } struct vc_data *vc_deallocate(unsigned int currcons) { struct vc_data *vc = NULL; WARN_CONSOLE_UNLOCKED(); if (vc_cons_allocated(currcons)) { struct vt_notifier_param param; param.vc = vc = vc_cons[currcons].d; atomic_notifier_call_chain(&vt_notifier_list, VT_DEALLOCATE, &param); vcs_remove_sysfs(currcons); visual_deinit(vc); con_free_unimap(vc); put_pid(vc->vt_pid); vc_uniscr_set(vc, NULL); kfree(vc->vc_screenbuf); vc_cons[currcons].d = NULL; } return vc; } /* * VT102 emulator */ enum { EPecma = 0, EPdec, EPeq, EPgt, EPlt}; #define set_kbd(vc, x) vt_set_kbd_mode_bit((vc)->vc_num, (x)) #define clr_kbd(vc, x) vt_clr_kbd_mode_bit((vc)->vc_num, (x)) #define is_kbd(vc, x) vt_get_kbd_mode_bit((vc)->vc_num, (x)) #define decarm VC_REPEAT #define decckm VC_CKMODE #define kbdapplic VC_APPLIC #define lnm VC_CRLF const unsigned char color_table[] = { 0, 4, 2, 6, 1, 5, 3, 7, 8,12,10,14, 9,13,11,15 }; EXPORT_SYMBOL(color_table); /* the default colour table, for VGA+ colour systems */ unsigned char default_red[] = { 0x00, 0xaa, 0x00, 0xaa, 0x00, 0xaa, 0x00, 0xaa, 0x55, 0xff, 0x55, 0xff, 0x55, 0xff, 0x55, 0xff }; module_param_array(default_red, byte, NULL, S_IRUGO | S_IWUSR); EXPORT_SYMBOL(default_red); unsigned char default_grn[] = { 0x00, 0x00, 0xaa, 0x55, 0x00, 0x00, 0xaa, 0xaa, 0x55, 0x55, 0xff, 0xff, 0x55, 0x55, 0xff, 0xff }; module_param_array(default_grn, byte, NULL, S_IRUGO | S_IWUSR); EXPORT_SYMBOL(default_grn); unsigned char default_blu[] = { 0x00, 0x00, 0x00, 0x00, 0xaa, 0xaa, 0xaa, 0xaa, 0x55, 0x55, 0x55, 0x55, 0xff, 0xff, 0xff, 0xff }; module_param_array(default_blu, byte, NULL, S_IRUGO | S_IWUSR); EXPORT_SYMBOL(default_blu); /* * gotoxy() must verify all boundaries, because the arguments * might also be negative. If the given position is out of * bounds, the cursor is placed at the nearest margin. */ static void gotoxy(struct vc_data *vc, int new_x, int new_y) { int min_y, max_y; if (new_x < 0) vc->state.x = 0; else { if (new_x >= vc->vc_cols) vc->state.x = vc->vc_cols - 1; else vc->state.x = new_x; } if (vc->vc_decom) { min_y = vc->vc_top; max_y = vc->vc_bottom; } else { min_y = 0; max_y = vc->vc_rows; } if (new_y < min_y) vc->state.y = min_y; else if (new_y >= max_y) vc->state.y = max_y - 1; else vc->state.y = new_y; vc->vc_pos = vc->vc_origin + vc->state.y * vc->vc_size_row + (vc->state.x << 1); vc->vc_need_wrap = 0; } /* for absolute user moves, when decom is set */ static void gotoxay(struct vc_data *vc, int new_x, int new_y) { gotoxy(vc, new_x, vc->vc_decom ? (vc->vc_top + new_y) : new_y); } void scrollback(struct vc_data *vc) { scrolldelta(-(vc->vc_rows / 2)); } void scrollfront(struct vc_data *vc, int lines) { if (!lines) lines = vc->vc_rows / 2; scrolldelta(lines); } static void lf(struct vc_data *vc) { /* don't scroll if above bottom of scrolling region, or * if below scrolling region */ if (vc->state.y + 1 == vc->vc_bottom) con_scroll(vc, vc->vc_top, vc->vc_bottom, SM_UP, 1); else if (vc->state.y < vc->vc_rows - 1) { vc->state.y++; vc->vc_pos += vc->vc_size_row; } vc->vc_need_wrap = 0; notify_write(vc, '\n'); } static void ri(struct vc_data *vc) { /* don't scroll if below top of scrolling region, or * if above scrolling region */ if (vc->state.y == vc->vc_top) con_scroll(vc, vc->vc_top, vc->vc_bottom, SM_DOWN, 1); else if (vc->state.y > 0) { vc->state.y--; vc->vc_pos -= vc->vc_size_row; } vc->vc_need_wrap = 0; } static inline void cr(struct vc_data *vc) { vc->vc_pos -= vc->state.x << 1; vc->vc_need_wrap = vc->state.x = 0; notify_write(vc, '\r'); } static inline void bs(struct vc_data *vc) { if (vc->state.x) { vc->vc_pos -= 2; vc->state.x--; vc->vc_need_wrap = 0; notify_write(vc, '\b'); } } static inline void del(struct vc_data *vc) { /* ignored */ } enum CSI_J { CSI_J_CURSOR_TO_END = 0, CSI_J_START_TO_CURSOR = 1, CSI_J_VISIBLE = 2, CSI_J_FULL = 3, }; static void csi_J(struct vc_data *vc, enum CSI_J vpar) { unsigned short *start; unsigned int count; switch (vpar) { case CSI_J_CURSOR_TO_END: vc_uniscr_clear_line(vc, vc->state.x, vc->vc_cols - vc->state.x); vc_uniscr_clear_lines(vc, vc->state.y + 1, vc->vc_rows - vc->state.y - 1); count = (vc->vc_scr_end - vc->vc_pos) >> 1; start = (unsigned short *)vc->vc_pos; break; case CSI_J_START_TO_CURSOR: vc_uniscr_clear_line(vc, 0, vc->state.x + 1); vc_uniscr_clear_lines(vc, 0, vc->state.y); count = ((vc->vc_pos - vc->vc_origin) >> 1) + 1; start = (unsigned short *)vc->vc_origin; break; case CSI_J_FULL: flush_scrollback(vc); fallthrough; case CSI_J_VISIBLE: vc_uniscr_clear_lines(vc, 0, vc->vc_rows); count = vc->vc_cols * vc->vc_rows; start = (unsigned short *)vc->vc_origin; break; default: return; } scr_memsetw(start, vc->vc_video_erase_char, 2 * count); if (con_should_update(vc)) do_update_region(vc, (unsigned long) start, count); vc->vc_need_wrap = 0; } enum { CSI_K_CURSOR_TO_LINEEND = 0, CSI_K_LINESTART_TO_CURSOR = 1, CSI_K_LINE = 2, }; static void csi_K(struct vc_data *vc) { unsigned int count; unsigned short *start = (unsigned short *)vc->vc_pos; int offset; switch (vc->vc_par[0]) { case CSI_K_CURSOR_TO_LINEEND: offset = 0; count = vc->vc_cols - vc->state.x; break; case CSI_K_LINESTART_TO_CURSOR: offset = -vc->state.x; count = vc->state.x + 1; break; case CSI_K_LINE: offset = -vc->state.x; count = vc->vc_cols; break; default: return; } vc_uniscr_clear_line(vc, vc->state.x + offset, count); scr_memsetw(start + offset, vc->vc_video_erase_char, 2 * count); vc->vc_need_wrap = 0; if (con_should_update(vc)) do_update_region(vc, (unsigned long)(start + offset), count); } /* erase the following count positions */ static void csi_X(struct vc_data *vc) { /* not vt100? */ unsigned int count = clamp(vc->vc_par[0], 1, vc->vc_cols - vc->state.x); vc_uniscr_clear_line(vc, vc->state.x, count); scr_memsetw((unsigned short *)vc->vc_pos, vc->vc_video_erase_char, 2 * count); if (con_should_update(vc)) vc->vc_sw->con_clear(vc, vc->state.y, vc->state.x, count); vc->vc_need_wrap = 0; } static void default_attr(struct vc_data *vc) { vc->state.intensity = VCI_NORMAL; vc->state.italic = false; vc->state.underline = false; vc->state.reverse = false; vc->state.blink = false; vc->state.color = vc->vc_def_color; } struct rgb { u8 r; u8 g; u8 b; }; static void rgb_from_256(unsigned int i, struct rgb *c) { if (i < 8) { /* Standard colours. */ c->r = i&1 ? 0xaa : 0x00; c->g = i&2 ? 0xaa : 0x00; c->b = i&4 ? 0xaa : 0x00; } else if (i < 16) { c->r = i&1 ? 0xff : 0x55; c->g = i&2 ? 0xff : 0x55; c->b = i&4 ? 0xff : 0x55; } else if (i < 232) { /* 6x6x6 colour cube. */ i -= 16; c->b = i % 6 * 255 / 6; i /= 6; c->g = i % 6 * 255 / 6; i /= 6; c->r = i * 255 / 6; } else /* Grayscale ramp. */ c->r = c->g = c->b = i * 10 - 2312; } static void rgb_foreground(struct vc_data *vc, const struct rgb *c) { u8 hue = 0, max = max3(c->r, c->g, c->b); if (c->r > max / 2) hue |= 4; if (c->g > max / 2) hue |= 2; if (c->b > max / 2) hue |= 1; if (hue == 7 && max <= 0x55) { hue = 0; vc->state.intensity = VCI_BOLD; } else if (max > 0xaa) vc->state.intensity = VCI_BOLD; else vc->state.intensity = VCI_NORMAL; vc->state.color = (vc->state.color & 0xf0) | hue; } static void rgb_background(struct vc_data *vc, const struct rgb *c) { /* For backgrounds, err on the dark side. */ vc->state.color = (vc->state.color & 0x0f) | (c->r&0x80) >> 1 | (c->g&0x80) >> 2 | (c->b&0x80) >> 3; } /* * ITU T.416 Higher colour modes. They break the usual properties of SGR codes * and thus need to be detected and ignored by hand. That standard also * wants : rather than ; as separators but sequences containing : are currently * completely ignored by the parser. * * Subcommands 3 (CMY) and 4 (CMYK) are so insane there's no point in * supporting them. */ static int vc_t416_color(struct vc_data *vc, int i, void(*set_color)(struct vc_data *vc, const struct rgb *c)) { struct rgb c; i++; if (i > vc->vc_npar) return i; if (vc->vc_par[i] == 5 && i + 1 <= vc->vc_npar) { /* 256 colours */ i++; rgb_from_256(vc->vc_par[i], &c); } else if (vc->vc_par[i] == 2 && i + 3 <= vc->vc_npar) { /* 24 bit */ c.r = vc->vc_par[i + 1]; c.g = vc->vc_par[i + 2]; c.b = vc->vc_par[i + 3]; i += 3; } else return i; set_color(vc, &c); return i; } enum { CSI_m_DEFAULT = 0, CSI_m_BOLD = 1, CSI_m_HALF_BRIGHT = 2, CSI_m_ITALIC = 3, CSI_m_UNDERLINE = 4, CSI_m_BLINK = 5, CSI_m_REVERSE = 7, CSI_m_PRI_FONT = 10, CSI_m_ALT_FONT1 = 11, CSI_m_ALT_FONT2 = 12, CSI_m_DOUBLE_UNDERLINE = 21, CSI_m_NORMAL_INTENSITY = 22, CSI_m_NO_ITALIC = 23, CSI_m_NO_UNDERLINE = 24, CSI_m_NO_BLINK = 25, CSI_m_NO_REVERSE = 27, CSI_m_FG_COLOR_BEG = 30, CSI_m_FG_COLOR_END = 37, CSI_m_FG_COLOR = 38, CSI_m_DEFAULT_FG_COLOR = 39, CSI_m_BG_COLOR_BEG = 40, CSI_m_BG_COLOR_END = 47, CSI_m_BG_COLOR = 48, CSI_m_DEFAULT_BG_COLOR = 49, CSI_m_BRIGHT_FG_COLOR_BEG = 90, CSI_m_BRIGHT_FG_COLOR_END = 97, CSI_m_BRIGHT_FG_COLOR_OFF = CSI_m_BRIGHT_FG_COLOR_BEG - CSI_m_FG_COLOR_BEG, CSI_m_BRIGHT_BG_COLOR_BEG = 100, CSI_m_BRIGHT_BG_COLOR_END = 107, CSI_m_BRIGHT_BG_COLOR_OFF = CSI_m_BRIGHT_BG_COLOR_BEG - CSI_m_BG_COLOR_BEG, }; /* console_lock is held */ static void csi_m(struct vc_data *vc) { int i; for (i = 0; i <= vc->vc_npar; i++) switch (vc->vc_par[i]) { case CSI_m_DEFAULT: /* all attributes off */ default_attr(vc); break; case CSI_m_BOLD: vc->state.intensity = VCI_BOLD; break; case CSI_m_HALF_BRIGHT: vc->state.intensity = VCI_HALF_BRIGHT; break; case CSI_m_ITALIC: vc->state.italic = true; break; case CSI_m_DOUBLE_UNDERLINE: /* * No console drivers support double underline, so * convert it to a single underline. */ case CSI_m_UNDERLINE: vc->state.underline = true; break; case CSI_m_BLINK: vc->state.blink = true; break; case CSI_m_REVERSE: vc->state.reverse = true; break; case CSI_m_PRI_FONT: /* ANSI X3.64-1979 (SCO-ish?) * Select primary font, don't display control chars if * defined, don't set bit 8 on output. */ vc->vc_translate = set_translate(vc->state.Gx_charset[vc->state.charset], vc); vc->vc_disp_ctrl = 0; vc->vc_toggle_meta = 0; break; case CSI_m_ALT_FONT1: /* ANSI X3.64-1979 (SCO-ish?) * Select first alternate font, lets chars < 32 be * displayed as ROM chars. */ vc->vc_translate = set_translate(IBMPC_MAP, vc); vc->vc_disp_ctrl = 1; vc->vc_toggle_meta = 0; break; case CSI_m_ALT_FONT2: /* ANSI X3.64-1979 (SCO-ish?) * Select second alternate font, toggle high bit * before displaying as ROM char. */ vc->vc_translate = set_translate(IBMPC_MAP, vc); vc->vc_disp_ctrl = 1; vc->vc_toggle_meta = 1; break; case CSI_m_NORMAL_INTENSITY: vc->state.intensity = VCI_NORMAL; break; case CSI_m_NO_ITALIC: vc->state.italic = false; break; case CSI_m_NO_UNDERLINE: vc->state.underline = false; break; case CSI_m_NO_BLINK: vc->state.blink = false; break; case CSI_m_NO_REVERSE: vc->state.reverse = false; break; case CSI_m_FG_COLOR: i = vc_t416_color(vc, i, rgb_foreground); break; case CSI_m_BG_COLOR: i = vc_t416_color(vc, i, rgb_background); break; case CSI_m_DEFAULT_FG_COLOR: vc->state.color = (vc->vc_def_color & 0x0f) | (vc->state.color & 0xf0); break; case CSI_m_DEFAULT_BG_COLOR: vc->state.color = (vc->vc_def_color & 0xf0) | (vc->state.color & 0x0f); break; case CSI_m_BRIGHT_FG_COLOR_BEG ... CSI_m_BRIGHT_FG_COLOR_END: vc->state.intensity = VCI_BOLD; vc->vc_par[i] -= CSI_m_BRIGHT_FG_COLOR_OFF; fallthrough; case CSI_m_FG_COLOR_BEG ... CSI_m_FG_COLOR_END: vc->vc_par[i] -= CSI_m_FG_COLOR_BEG; vc->state.color = color_table[vc->vc_par[i]] | (vc->state.color & 0xf0); break; case CSI_m_BRIGHT_BG_COLOR_BEG ... CSI_m_BRIGHT_BG_COLOR_END: vc->vc_par[i] -= CSI_m_BRIGHT_BG_COLOR_OFF; fallthrough; case CSI_m_BG_COLOR_BEG ... CSI_m_BG_COLOR_END: vc->vc_par[i] -= CSI_m_BG_COLOR_BEG; vc->state.color = (color_table[vc->vc_par[i]] << 4) | (vc->state.color & 0x0f); break; } update_attr(vc); } static void respond_string(const char *p, size_t len, struct tty_port *port) { tty_insert_flip_string(port, p, len); tty_flip_buffer_push(port); } static void cursor_report(struct vc_data *vc, struct tty_struct *tty) { char buf[40]; int len; len = sprintf(buf, "\033[%d;%dR", vc->state.y + (vc->vc_decom ? vc->vc_top + 1 : 1), vc->state.x + 1); respond_string(buf, len, tty->port); } static inline void status_report(struct tty_struct *tty) { static const char teminal_ok[] = "\033[0n"; respond_string(teminal_ok, strlen(teminal_ok), tty->port); } static inline void respond_ID(struct tty_struct *tty) { /* terminal answer to an ESC-Z or csi0c query. */ static const char vt102_id[] = "\033[?6c"; respond_string(vt102_id, strlen(vt102_id), tty->port); } void mouse_report(struct tty_struct *tty, int butt, int mrx, int mry) { char buf[8]; int len; len = sprintf(buf, "\033[M%c%c%c", (char)(' ' + butt), (char)('!' + mrx), (char)('!' + mry)); respond_string(buf, len, tty->port); } /* invoked via ioctl(TIOCLINUX) and through set_selection_user */ int mouse_reporting(void) { return vc_cons[fg_console].d->vc_report_mouse; } enum { CSI_DEC_hl_CURSOR_KEYS = 1, /* CKM: cursor keys send ^[Ox/^[[x */ CSI_DEC_hl_132_COLUMNS = 3, /* COLM: 80/132 mode switch */ CSI_DEC_hl_REVERSE_VIDEO = 5, /* SCNM */ CSI_DEC_hl_ORIGIN_MODE = 6, /* OM: origin relative/absolute */ CSI_DEC_hl_AUTOWRAP = 7, /* AWM */ CSI_DEC_hl_AUTOREPEAT = 8, /* ARM */ CSI_DEC_hl_MOUSE_X10 = 9, CSI_DEC_hl_SHOW_CURSOR = 25, /* TCEM */ CSI_DEC_hl_MOUSE_VT200 = 1000, }; /* console_lock is held */ static void csi_DEC_hl(struct vc_data *vc, bool on_off) { unsigned int i; for (i = 0; i <= vc->vc_npar; i++) switch (vc->vc_par[i]) { case CSI_DEC_hl_CURSOR_KEYS: if (on_off) set_kbd(vc, decckm); else clr_kbd(vc, decckm); break; case CSI_DEC_hl_132_COLUMNS: /* unimplemented */ #if 0 vc_resize(deccolm ? 132 : 80, vc->vc_rows); /* this alone does not suffice; some user mode utility has to change the hardware regs */ #endif break; case CSI_DEC_hl_REVERSE_VIDEO: if (vc->vc_decscnm != on_off) { vc->vc_decscnm = on_off; invert_screen(vc, 0, vc->vc_screenbuf_size, false); update_attr(vc); } break; case CSI_DEC_hl_ORIGIN_MODE: vc->vc_decom = on_off; gotoxay(vc, 0, 0); break; case CSI_DEC_hl_AUTOWRAP: vc->vc_decawm = on_off; break; case CSI_DEC_hl_AUTOREPEAT: if (on_off) set_kbd(vc, decarm); else clr_kbd(vc, decarm); break; case CSI_DEC_hl_MOUSE_X10: vc->vc_report_mouse = on_off ? 1 : 0; break; case CSI_DEC_hl_SHOW_CURSOR: vc->vc_deccm = on_off; break; case CSI_DEC_hl_MOUSE_VT200: vc->vc_report_mouse = on_off ? 2 : 0; break; } } enum { CSI_hl_DISPLAY_CTRL = 3, /* handle ansi control chars */ CSI_hl_INSERT = 4, /* IRM: insert/replace */ CSI_hl_AUTO_NL = 20, /* LNM: Enter == CrLf/Lf */ }; /* console_lock is held */ static void csi_hl(struct vc_data *vc, bool on_off) { unsigned int i; for (i = 0; i <= vc->vc_npar; i++) switch (vc->vc_par[i]) { /* ANSI modes set/reset */ case CSI_hl_DISPLAY_CTRL: vc->vc_disp_ctrl = on_off; break; case CSI_hl_INSERT: vc->vc_decim = on_off; break; case CSI_hl_AUTO_NL: if (on_off) set_kbd(vc, lnm); else clr_kbd(vc, lnm); break; } } enum CSI_right_square_bracket { CSI_RSB_COLOR_FOR_UNDERLINE = 1, CSI_RSB_COLOR_FOR_HALF_BRIGHT = 2, CSI_RSB_MAKE_CUR_COLOR_DEFAULT = 8, CSI_RSB_BLANKING_INTERVAL = 9, CSI_RSB_BELL_FREQUENCY = 10, CSI_RSB_BELL_DURATION = 11, CSI_RSB_BRING_CONSOLE_TO_FRONT = 12, CSI_RSB_UNBLANK = 13, CSI_RSB_VESA_OFF_INTERVAL = 14, CSI_RSB_BRING_PREV_CONSOLE_TO_FRONT = 15, CSI_RSB_CURSOR_BLINK_INTERVAL = 16, }; /* * csi_RSB - csi+] (Right Square Bracket) handler * * These are linux console private sequences. * * console_lock is held */ static void csi_RSB(struct vc_data *vc) { switch (vc->vc_par[0]) { case CSI_RSB_COLOR_FOR_UNDERLINE: if (vc->vc_can_do_color && vc->vc_par[1] < 16) { vc->vc_ulcolor = color_table[vc->vc_par[1]]; if (vc->state.underline) update_attr(vc); } break; case CSI_RSB_COLOR_FOR_HALF_BRIGHT: if (vc->vc_can_do_color && vc->vc_par[1] < 16) { vc->vc_halfcolor = color_table[vc->vc_par[1]]; if (vc->state.intensity == VCI_HALF_BRIGHT) update_attr(vc); } break; case CSI_RSB_MAKE_CUR_COLOR_DEFAULT: vc->vc_def_color = vc->vc_attr; if (vc->vc_hi_font_mask == 0x100) vc->vc_def_color >>= 1; default_attr(vc); update_attr(vc); break; case CSI_RSB_BLANKING_INTERVAL: blankinterval = min(vc->vc_par[1], 60U) * 60; poke_blanked_console(); break; case CSI_RSB_BELL_FREQUENCY: if (vc->vc_npar >= 1) vc->vc_bell_pitch = vc->vc_par[1]; else vc->vc_bell_pitch = DEFAULT_BELL_PITCH; break; case CSI_RSB_BELL_DURATION: if (vc->vc_npar >= 1) vc->vc_bell_duration = (vc->vc_par[1] < 2000) ? msecs_to_jiffies(vc->vc_par[1]) : 0; else vc->vc_bell_duration = DEFAULT_BELL_DURATION; break; case CSI_RSB_BRING_CONSOLE_TO_FRONT: if (vc->vc_par[1] >= 1 && vc_cons_allocated(vc->vc_par[1] - 1)) set_console(vc->vc_par[1] - 1); break; case CSI_RSB_UNBLANK: poke_blanked_console(); break; case CSI_RSB_VESA_OFF_INTERVAL: vesa_off_interval = min(vc->vc_par[1], 60U) * 60 * HZ; break; case CSI_RSB_BRING_PREV_CONSOLE_TO_FRONT: set_console(last_console); break; case CSI_RSB_CURSOR_BLINK_INTERVAL: if (vc->vc_npar >= 1 && vc->vc_par[1] >= 50 && vc->vc_par[1] <= USHRT_MAX) vc->vc_cur_blink_ms = vc->vc_par[1]; else vc->vc_cur_blink_ms = DEFAULT_CURSOR_BLINK_MS; break; } } /* console_lock is held */ static void csi_at(struct vc_data *vc, unsigned int nr) { nr = clamp(nr, 1, vc->vc_cols - vc->state.x); insert_char(vc, nr); } /* console_lock is held */ static void csi_L(struct vc_data *vc) { unsigned int nr = clamp(vc->vc_par[0], 1, vc->vc_rows - vc->state.y); con_scroll(vc, vc->state.y, vc->vc_bottom, SM_DOWN, nr); vc->vc_need_wrap = 0; } /* console_lock is held */ static void csi_P(struct vc_data *vc) { unsigned int nr = clamp(vc->vc_par[0], 1, vc->vc_cols - vc->state.x); delete_char(vc, nr); } /* console_lock is held */ static void csi_M(struct vc_data *vc) { unsigned int nr = clamp(vc->vc_par[0], 1, vc->vc_rows - vc->state.y); con_scroll(vc, vc->state.y, vc->vc_bottom, SM_UP, nr); vc->vc_need_wrap = 0; } /* console_lock is held (except via vc_init->reset_terminal */ static void save_cur(struct vc_data *vc) { memcpy(&vc->saved_state, &vc->state, sizeof(vc->state)); } /* console_lock is held */ static void restore_cur(struct vc_data *vc) { memcpy(&vc->state, &vc->saved_state, sizeof(vc->state)); gotoxy(vc, vc->state.x, vc->state.y); vc->vc_translate = set_translate(vc->state.Gx_charset[vc->state.charset], vc); update_attr(vc); vc->vc_need_wrap = 0; } /** * enum vc_ctl_state - control characters state of a vt * * @ESnormal: initial state, no control characters parsed * @ESesc: ESC parsed * @ESsquare: CSI parsed -- modifiers/parameters/ctrl chars expected * @ESgetpars: CSI parsed -- parameters/ctrl chars expected * @ESfunckey: CSI [ parsed * @EShash: ESC # parsed * @ESsetG0: ESC ( parsed * @ESsetG1: ESC ) parsed * @ESpercent: ESC % parsed * @EScsiignore: CSI [0x20-0x3f] parsed * @ESnonstd: OSC parsed * @ESpalette: OSC P parsed * @ESosc: OSC [0-9] parsed * @ESANSI_first: first state for ignoring ansi control sequences * @ESapc: ESC _ parsed * @ESpm: ESC ^ parsed * @ESdcs: ESC P parsed * @ESANSI_last: last state for ignoring ansi control sequences */ enum vc_ctl_state { ESnormal, ESesc, ESsquare, ESgetpars, ESfunckey, EShash, ESsetG0, ESsetG1, ESpercent, EScsiignore, ESnonstd, ESpalette, ESosc, ESANSI_first = ESosc, ESapc, ESpm, ESdcs, ESANSI_last = ESdcs, }; /* console_lock is held (except via vc_init()) */ static void reset_terminal(struct vc_data *vc, int do_clear) { unsigned int i; vc->vc_top = 0; vc->vc_bottom = vc->vc_rows; vc->vc_state = ESnormal; vc->vc_priv = EPecma; vc->vc_translate = set_translate(LAT1_MAP, vc); vc->state.Gx_charset[0] = LAT1_MAP; vc->state.Gx_charset[1] = GRAF_MAP; vc->state.charset = 0; vc->vc_need_wrap = 0; vc->vc_report_mouse = 0; vc->vc_utf = default_utf8; vc->vc_utf_count = 0; vc->vc_disp_ctrl = 0; vc->vc_toggle_meta = 0; vc->vc_decscnm = 0; vc->vc_decom = 0; vc->vc_decawm = 1; vc->vc_deccm = global_cursor_default; vc->vc_decim = 0; vt_reset_keyboard(vc->vc_num); vc->vc_cursor_type = cur_default; vc->vc_complement_mask = vc->vc_s_complement_mask; default_attr(vc); update_attr(vc); bitmap_zero(vc->vc_tab_stop, VC_TABSTOPS_COUNT); for (i = 0; i < VC_TABSTOPS_COUNT; i += 8) set_bit(i, vc->vc_tab_stop); vc->vc_bell_pitch = DEFAULT_BELL_PITCH; vc->vc_bell_duration = DEFAULT_BELL_DURATION; vc->vc_cur_blink_ms = DEFAULT_CURSOR_BLINK_MS; gotoxy(vc, 0, 0); save_cur(vc); if (do_clear) csi_J(vc, CSI_J_VISIBLE); } static void vc_setGx(struct vc_data *vc, unsigned int which, u8 c) { unsigned char *charset = &vc->state.Gx_charset[which]; switch (c) { case '0': *charset = GRAF_MAP; break; case 'B': *charset = LAT1_MAP; break; case 'U': *charset = IBMPC_MAP; break; case 'K': *charset = USER_MAP; break; } if (vc->state.charset == which) vc->vc_translate = set_translate(*charset, vc); } static bool ansi_control_string(enum vc_ctl_state state) { return state >= ESANSI_first && state <= ESANSI_last; } enum { ASCII_NULL = 0, ASCII_BELL = 7, ASCII_BACKSPACE = 8, ASCII_IGNORE_FIRST = ASCII_BACKSPACE, ASCII_HTAB = 9, ASCII_LINEFEED = 10, ASCII_VTAB = 11, ASCII_FORMFEED = 12, ASCII_CAR_RET = 13, ASCII_IGNORE_LAST = ASCII_CAR_RET, ASCII_SHIFTOUT = 14, ASCII_SHIFTIN = 15, ASCII_CANCEL = 24, ASCII_SUBSTITUTE = 26, ASCII_ESCAPE = 27, ASCII_CSI_IGNORE_FIRST = ' ', /* 0x2x, 0x3a and 0x3c - 0x3f */ ASCII_CSI_IGNORE_LAST = '?', ASCII_DEL = 127, ASCII_EXT_CSI = 128 + ASCII_ESCAPE, }; /* * Handle ascii characters in control sequences and change states accordingly. * E.g. ESC sets the state of vc to ESesc. * * Returns: true if @c handled. */ static bool handle_ascii(struct tty_struct *tty, struct vc_data *vc, u8 c) { switch (c) { case ASCII_NULL: return true; case ASCII_BELL: if (ansi_control_string(vc->vc_state)) vc->vc_state = ESnormal; else if (vc->vc_bell_duration) kd_mksound(vc->vc_bell_pitch, vc->vc_bell_duration); return true; case ASCII_BACKSPACE: bs(vc); return true; case ASCII_HTAB: vc->vc_pos -= (vc->state.x << 1); vc->state.x = find_next_bit(vc->vc_tab_stop, min(vc->vc_cols - 1, VC_TABSTOPS_COUNT), vc->state.x + 1); if (vc->state.x >= VC_TABSTOPS_COUNT) vc->state.x = vc->vc_cols - 1; vc->vc_pos += (vc->state.x << 1); notify_write(vc, '\t'); return true; case ASCII_LINEFEED: case ASCII_VTAB: case ASCII_FORMFEED: lf(vc); if (!is_kbd(vc, lnm)) return true; fallthrough; case ASCII_CAR_RET: cr(vc); return true; case ASCII_SHIFTOUT: vc->state.charset = 1; vc->vc_translate = set_translate(vc->state.Gx_charset[1], vc); vc->vc_disp_ctrl = 1; return true; case ASCII_SHIFTIN: vc->state.charset = 0; vc->vc_translate = set_translate(vc->state.Gx_charset[0], vc); vc->vc_disp_ctrl = 0; return true; case ASCII_CANCEL: case ASCII_SUBSTITUTE: vc->vc_state = ESnormal; return true; case ASCII_ESCAPE: vc->vc_state = ESesc; return true; case ASCII_DEL: del(vc); return true; case ASCII_EXT_CSI: vc->vc_state = ESsquare; return true; } return false; } /* * Handle a character (@c) following an ESC (when @vc is in the ESesc state). * E.g. previous ESC with @c == '[' here yields the ESsquare state (that is: * CSI). */ static void handle_esc(struct tty_struct *tty, struct vc_data *vc, u8 c) { vc->vc_state = ESnormal; switch (c) { case '[': vc->vc_state = ESsquare; break; case ']': vc->vc_state = ESnonstd; break; case '_': vc->vc_state = ESapc; break; case '^': vc->vc_state = ESpm; break; case '%': vc->vc_state = ESpercent; break; case 'E': cr(vc); lf(vc); break; case 'M': ri(vc); break; case 'D': lf(vc); break; case 'H': if (vc->state.x < VC_TABSTOPS_COUNT) set_bit(vc->state.x, vc->vc_tab_stop); break; case 'P': vc->vc_state = ESdcs; break; case 'Z': respond_ID(tty); break; case '7': save_cur(vc); break; case '8': restore_cur(vc); break; case '(': vc->vc_state = ESsetG0; break; case ')': vc->vc_state = ESsetG1; break; case '#': vc->vc_state = EShash; break; case 'c': reset_terminal(vc, 1); break; case '>': /* Numeric keypad */ clr_kbd(vc, kbdapplic); break; case '=': /* Appl. keypad */ set_kbd(vc, kbdapplic); break; } } /* * Handle special DEC control sequences ("ESC [ ? parameters char"). Parameters * are in @vc->vc_par and the char is in @c here. */ static void csi_DEC(struct tty_struct *tty, struct vc_data *vc, u8 c) { switch (c) { case 'h': csi_DEC_hl(vc, true); break; case 'l': csi_DEC_hl(vc, false); break; case 'c': if (vc->vc_par[0]) vc->vc_cursor_type = CUR_MAKE(vc->vc_par[0], vc->vc_par[1], vc->vc_par[2]); else vc->vc_cursor_type = cur_default; break; case 'm': clear_selection(); if (vc->vc_par[0]) vc->vc_complement_mask = vc->vc_par[0] << 8 | vc->vc_par[1]; else vc->vc_complement_mask = vc->vc_s_complement_mask; break; case 'n': if (vc->vc_par[0] == 5) status_report(tty); else if (vc->vc_par[0] == 6) cursor_report(vc, tty); break; } } /* * Handle Control Sequence Introducer control characters. That is * "ESC [ parameters char". Parameters are in @vc->vc_par and the char is in * @c here. */ static void csi_ECMA(struct tty_struct *tty, struct vc_data *vc, u8 c) { switch (c) { case 'G': case '`': if (vc->vc_par[0]) vc->vc_par[0]--; gotoxy(vc, vc->vc_par[0], vc->state.y); break; case 'A': if (!vc->vc_par[0]) vc->vc_par[0]++; gotoxy(vc, vc->state.x, vc->state.y - vc->vc_par[0]); break; case 'B': case 'e': if (!vc->vc_par[0]) vc->vc_par[0]++; gotoxy(vc, vc->state.x, vc->state.y + vc->vc_par[0]); break; case 'C': case 'a': if (!vc->vc_par[0]) vc->vc_par[0]++; gotoxy(vc, vc->state.x + vc->vc_par[0], vc->state.y); break; case 'D': if (!vc->vc_par[0]) vc->vc_par[0]++; gotoxy(vc, vc->state.x - vc->vc_par[0], vc->state.y); break; case 'E': if (!vc->vc_par[0]) vc->vc_par[0]++; gotoxy(vc, 0, vc->state.y + vc->vc_par[0]); break; case 'F': if (!vc->vc_par[0]) vc->vc_par[0]++; gotoxy(vc, 0, vc->state.y - vc->vc_par[0]); break; case 'd': if (vc->vc_par[0]) vc->vc_par[0]--; gotoxay(vc, vc->state.x ,vc->vc_par[0]); break; case 'H': case 'f': if (vc->vc_par[0]) vc->vc_par[0]--; if (vc->vc_par[1]) vc->vc_par[1]--; gotoxay(vc, vc->vc_par[1], vc->vc_par[0]); break; case 'J': csi_J(vc, vc->vc_par[0]); break; case 'K': csi_K(vc); break; case 'L': csi_L(vc); break; case 'M': csi_M(vc); break; case 'P': csi_P(vc); break; case 'c': if (!vc->vc_par[0]) respond_ID(tty); break; case 'g': if (!vc->vc_par[0] && vc->state.x < VC_TABSTOPS_COUNT) set_bit(vc->state.x, vc->vc_tab_stop); else if (vc->vc_par[0] == 3) bitmap_zero(vc->vc_tab_stop, VC_TABSTOPS_COUNT); break; case 'h': csi_hl(vc, true); break; case 'l': csi_hl(vc, false); break; case 'm': csi_m(vc); break; case 'n': if (vc->vc_par[0] == 5) status_report(tty); else if (vc->vc_par[0] == 6) cursor_report(vc, tty); break; case 'q': /* DECLL - but only 3 leds */ /* map 0,1,2,3 to 0,1,2,4 */ if (vc->vc_par[0] < 4) vt_set_led_state(vc->vc_num, (vc->vc_par[0] < 3) ? vc->vc_par[0] : 4); break; case 'r': if (!vc->vc_par[0]) vc->vc_par[0]++; if (!vc->vc_par[1]) vc->vc_par[1] = vc->vc_rows; /* Minimum allowed region is 2 lines */ if (vc->vc_par[0] < vc->vc_par[1] && vc->vc_par[1] <= vc->vc_rows) { vc->vc_top = vc->vc_par[0] - 1; vc->vc_bottom = vc->vc_par[1]; gotoxay(vc, 0, 0); } break; case 's': save_cur(vc); break; case 'u': restore_cur(vc); break; case 'X': csi_X(vc); break; case '@': csi_at(vc, vc->vc_par[0]); break; case ']': csi_RSB(vc); break; } } static void vc_reset_params(struct vc_data *vc) { memset(vc->vc_par, 0, sizeof(vc->vc_par)); vc->vc_npar = 0; } /* console_lock is held */ static void do_con_trol(struct tty_struct *tty, struct vc_data *vc, u8 c) { /* * Control characters can be used in the _middle_ * of an escape sequence, aside from ANSI control strings. */ if (ansi_control_string(vc->vc_state) && c >= ASCII_IGNORE_FIRST && c <= ASCII_IGNORE_LAST) return; if (handle_ascii(tty, vc, c)) return; switch(vc->vc_state) { case ESesc: /* ESC */ handle_esc(tty, vc, c); return; case ESnonstd: /* ESC ] aka OSC */ switch (c) { case 'P': /* palette escape sequence */ vc_reset_params(vc); vc->vc_state = ESpalette; return; case 'R': /* reset palette */ reset_palette(vc); break; case '0' ... '9': vc->vc_state = ESosc; return; } vc->vc_state = ESnormal; return; case ESpalette: /* ESC ] P aka OSC P */ if (isxdigit(c)) { vc->vc_par[vc->vc_npar++] = hex_to_bin(c); if (vc->vc_npar == 7) { int i = vc->vc_par[0] * 3, j = 1; vc->vc_palette[i] = 16 * vc->vc_par[j++]; vc->vc_palette[i++] += vc->vc_par[j++]; vc->vc_palette[i] = 16 * vc->vc_par[j++]; vc->vc_palette[i++] += vc->vc_par[j++]; vc->vc_palette[i] = 16 * vc->vc_par[j++]; vc->vc_palette[i] += vc->vc_par[j]; set_palette(vc); vc->vc_state = ESnormal; } } else vc->vc_state = ESnormal; return; case ESsquare: /* ESC [ aka CSI, parameters or modifiers expected */ vc_reset_params(vc); vc->vc_state = ESgetpars; switch (c) { case '[': /* Function key */ vc->vc_state = ESfunckey; return; case '?': vc->vc_priv = EPdec; return; case '>': vc->vc_priv = EPgt; return; case '=': vc->vc_priv = EPeq; return; case '<': vc->vc_priv = EPlt; return; } vc->vc_priv = EPecma; fallthrough; case ESgetpars: /* ESC [ aka CSI, parameters expected */ switch (c) { case ';': if (vc->vc_npar < NPAR - 1) { vc->vc_npar++; return; } break; case '0' ... '9': vc->vc_par[vc->vc_npar] *= 10; vc->vc_par[vc->vc_npar] += c - '0'; return; } if (c >= ASCII_CSI_IGNORE_FIRST && c <= ASCII_CSI_IGNORE_LAST) { vc->vc_state = EScsiignore; return; } /* parameters done, handle the control char @c */ vc->vc_state = ESnormal; switch (vc->vc_priv) { case EPdec: csi_DEC(tty, vc, c); return; case EPecma: csi_ECMA(tty, vc, c); return; default: return; } case EScsiignore: if (c >= ASCII_CSI_IGNORE_FIRST && c <= ASCII_CSI_IGNORE_LAST) return; vc->vc_state = ESnormal; return; case ESpercent: /* ESC % */ vc->vc_state = ESnormal; switch (c) { case '@': /* defined in ISO 2022 */ vc->vc_utf = 0; return; case 'G': /* prelim official escape code */ case '8': /* retained for compatibility */ vc->vc_utf = 1; return; } return; case ESfunckey: /* ESC [ [ aka CSI [ */ vc->vc_state = ESnormal; return; case EShash: /* ESC # */ vc->vc_state = ESnormal; if (c == '8') { /* DEC screen alignment test. kludge :-) */ vc->vc_video_erase_char = (vc->vc_video_erase_char & 0xff00) | 'E'; csi_J(vc, CSI_J_VISIBLE); vc->vc_video_erase_char = (vc->vc_video_erase_char & 0xff00) | ' '; do_update_region(vc, vc->vc_origin, vc->vc_screenbuf_size / 2); } return; case ESsetG0: /* ESC ( */ vc_setGx(vc, 0, c); vc->vc_state = ESnormal; return; case ESsetG1: /* ESC ) */ vc_setGx(vc, 1, c); vc->vc_state = ESnormal; return; case ESapc: /* ESC _ */ return; case ESosc: /* ESC ] [0-9] aka OSC [0-9] */ return; case ESpm: /* ESC ^ */ return; case ESdcs: /* ESC P */ return; default: vc->vc_state = ESnormal; } } /* is_double_width() is based on the wcwidth() implementation by * Markus Kuhn -- 2007-05-26 (Unicode 5.0) * Latest version: https://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c */ struct interval { uint32_t first; uint32_t last; }; static int ucs_cmp(const void *key, const void *elt) { uint32_t ucs = *(uint32_t *)key; struct interval e = *(struct interval *) elt; if (ucs > e.last) return 1; else if (ucs < e.first) return -1; return 0; } static int is_double_width(uint32_t ucs) { static const struct interval double_width[] = { { 0x1100, 0x115F }, { 0x2329, 0x232A }, { 0x2E80, 0x303E }, { 0x3040, 0xA4CF }, { 0xAC00, 0xD7A3 }, { 0xF900, 0xFAFF }, { 0xFE10, 0xFE19 }, { 0xFE30, 0xFE6F }, { 0xFF00, 0xFF60 }, { 0xFFE0, 0xFFE6 }, { 0x20000, 0x2FFFD }, { 0x30000, 0x3FFFD } }; if (ucs < double_width[0].first || ucs > double_width[ARRAY_SIZE(double_width) - 1].last) return 0; return bsearch(&ucs, double_width, ARRAY_SIZE(double_width), sizeof(struct interval), ucs_cmp) != NULL; } struct vc_draw_region { unsigned long from, to; int x; }; static void con_flush(struct vc_data *vc, struct vc_draw_region *draw) { if (draw->x < 0) return; vc->vc_sw->con_putcs(vc, (u16 *)draw->from, (u16 *)draw->to - (u16 *)draw->from, vc->state.y, draw->x); draw->x = -1; } static inline int vc_translate_ascii(const struct vc_data *vc, int c) { if (IS_ENABLED(CONFIG_CONSOLE_TRANSLATIONS)) { if (vc->vc_toggle_meta) c |= 0x80; return vc->vc_translate[c]; } return c; } /** * vc_sanitize_unicode - Replace invalid Unicode code points with ``U+FFFD`` * @c: the received code point */ static inline int vc_sanitize_unicode(const int c) { if (c >= 0xd800 && c <= 0xdfff) return 0xfffd; return c; } /** * vc_translate_unicode - Combine UTF-8 into Unicode in &vc_data.vc_utf_char * @vc: virtual console * @c: UTF-8 byte to translate * @rescan: set to true iff @c wasn't consumed here and needs to be re-processed * * * &vc_data.vc_utf_char is the being-constructed Unicode code point. * * &vc_data.vc_utf_count is the number of continuation bytes still expected to * arrive. * * &vc_data.vc_npar is the number of continuation bytes arrived so far. * * Return: * * %-1 - Input OK so far, @c consumed, further bytes expected. * * %0xFFFD - Possibility 1: input invalid, @c may have been consumed (see * desc. of @rescan). Possibility 2: input OK, @c consumed, * ``U+FFFD`` is the resulting code point. ``U+FFFD`` is valid, * ``REPLACEMENT CHARACTER``. * * otherwise - Input OK, @c consumed, resulting code point returned. */ static int vc_translate_unicode(struct vc_data *vc, int c, bool *rescan) { static const u32 utf8_length_changes[] = {0x7f, 0x7ff, 0xffff, 0x10ffff}; /* Continuation byte received */ if ((c & 0xc0) == 0x80) { /* Unexpected continuation byte? */ if (!vc->vc_utf_count) return 0xfffd; vc->vc_utf_char = (vc->vc_utf_char << 6) | (c & 0x3f); vc->vc_npar++; if (--vc->vc_utf_count) goto need_more_bytes; /* Got a whole character */ c = vc->vc_utf_char; /* Reject overlong sequences */ if (c <= utf8_length_changes[vc->vc_npar - 1] || c > utf8_length_changes[vc->vc_npar]) return 0xfffd; return vc_sanitize_unicode(c); } /* Single ASCII byte or first byte of a sequence received */ if (vc->vc_utf_count) { /* Continuation byte expected */ *rescan = true; vc->vc_utf_count = 0; return 0xfffd; } /* Nothing to do if an ASCII byte was received */ if (c <= 0x7f) return c; /* First byte of a multibyte sequence received */ vc->vc_npar = 0; if ((c & 0xe0) == 0xc0) { vc->vc_utf_count = 1; vc->vc_utf_char = (c & 0x1f); } else if ((c & 0xf0) == 0xe0) { vc->vc_utf_count = 2; vc->vc_utf_char = (c & 0x0f); } else if ((c & 0xf8) == 0xf0) { vc->vc_utf_count = 3; vc->vc_utf_char = (c & 0x07); } else { return 0xfffd; } need_more_bytes: return -1; } static int vc_translate(struct vc_data *vc, int *c, bool *rescan) { /* Do no translation at all in control states */ if (vc->vc_state != ESnormal) return *c; if (vc->vc_utf && !vc->vc_disp_ctrl) return *c = vc_translate_unicode(vc, *c, rescan); /* no utf or alternate charset mode */ return vc_translate_ascii(vc, *c); } static inline unsigned char vc_invert_attr(const struct vc_data *vc) { if (!vc->vc_can_do_color) return vc->vc_attr ^ 0x08; if (vc->vc_hi_font_mask == 0x100) return (vc->vc_attr & 0x11) | ((vc->vc_attr & 0xe0) >> 4) | ((vc->vc_attr & 0x0e) << 4); return (vc->vc_attr & 0x88) | ((vc->vc_attr & 0x70) >> 4) | ((vc->vc_attr & 0x07) << 4); } static bool vc_is_control(struct vc_data *vc, int tc, int c) { /* * A bitmap for codes <32. A bit of 1 indicates that the code * corresponding to that bit number invokes some special action (such * as cursor movement) and should not be displayed as a glyph unless * the disp_ctrl mode is explicitly enabled. */ static const u32 CTRL_ACTION = BIT(ASCII_NULL) | GENMASK(ASCII_SHIFTIN, ASCII_BELL) | BIT(ASCII_CANCEL) | BIT(ASCII_SUBSTITUTE) | BIT(ASCII_ESCAPE); /* Cannot be overridden by disp_ctrl */ static const u32 CTRL_ALWAYS = BIT(ASCII_NULL) | BIT(ASCII_BACKSPACE) | BIT(ASCII_LINEFEED) | BIT(ASCII_SHIFTIN) | BIT(ASCII_SHIFTOUT) | BIT(ASCII_CAR_RET) | BIT(ASCII_FORMFEED) | BIT(ASCII_ESCAPE); if (vc->vc_state != ESnormal) return true; if (!tc) return true; /* * If the original code was a control character we only allow a glyph * to be displayed if the code is not normally used (such as for cursor * movement) or if the disp_ctrl mode has been explicitly enabled. * Certain characters (as given by the CTRL_ALWAYS bitmap) are always * displayed as control characters, as the console would be pretty * useless without them; to display an arbitrary font position use the * direct-to-font zone in UTF-8 mode. */ if (c < BITS_PER_TYPE(CTRL_ALWAYS)) { if (vc->vc_disp_ctrl) return CTRL_ALWAYS & BIT(c); else return vc->vc_utf || (CTRL_ACTION & BIT(c)); } if (c == ASCII_DEL && !vc->vc_disp_ctrl) return true; if (c == ASCII_EXT_CSI) return true; return false; } static int vc_con_write_normal(struct vc_data *vc, int tc, int c, struct vc_draw_region *draw) { int next_c; unsigned char vc_attr = vc->vc_attr; u16 himask = vc->vc_hi_font_mask, charmask = himask ? 0x1ff : 0xff; u8 width = 1; bool inverse = false; if (vc->vc_utf && !vc->vc_disp_ctrl) { if (is_double_width(c)) width = 2; } /* Now try to find out how to display it */ tc = conv_uni_to_pc(vc, tc); if (tc & ~charmask) { if (tc == -1 || tc == -2) return -1; /* nothing to display */ /* Glyph not found */ if ((!vc->vc_utf || vc->vc_disp_ctrl || c < 128) && !(c & ~charmask)) { /* * In legacy mode use the glyph we get by a 1:1 * mapping. * This would make absolutely no sense with Unicode in * mind, but do this for ASCII characters since a font * may lack Unicode mapping info and we don't want to * end up with having question marks only. */ tc = c; } else { /* * Display U+FFFD. If it's not found, display an inverse * question mark. */ tc = conv_uni_to_pc(vc, 0xfffd); if (tc < 0) { inverse = true; tc = conv_uni_to_pc(vc, '?'); if (tc < 0) tc = '?'; vc_attr = vc_invert_attr(vc); con_flush(vc, draw); } } } next_c = c; while (1) { if (vc->vc_need_wrap || vc->vc_decim) con_flush(vc, draw); if (vc->vc_need_wrap) { cr(vc); lf(vc); } if (vc->vc_decim) insert_char(vc, 1); vc_uniscr_putc(vc, next_c); if (himask) tc = ((tc & 0x100) ? himask : 0) | (tc & 0xff); tc |= (vc_attr << 8) & ~himask; scr_writew(tc, (u16 *)vc->vc_pos); if (con_should_update(vc) && draw->x < 0) { draw->x = vc->state.x; draw->from = vc->vc_pos; } if (vc->state.x == vc->vc_cols - 1) { vc->vc_need_wrap = vc->vc_decawm; draw->to = vc->vc_pos + 2; } else { vc->state.x++; draw->to = (vc->vc_pos += 2); } if (!--width) break; /* A space is printed in the second column */ tc = conv_uni_to_pc(vc, ' '); if (tc < 0) tc = ' '; next_c = ' '; } notify_write(vc, c); if (inverse) con_flush(vc, draw); return 0; } /* acquires console_lock */ static int do_con_write(struct tty_struct *tty, const u8 *buf, int count) { struct vc_draw_region draw = { .x = -1, }; int c, tc, n = 0; unsigned int currcons; struct vc_data *vc = tty->driver_data; struct vt_notifier_param param; bool rescan; if (in_interrupt()) return count; console_lock(); currcons = vc->vc_num; if (!vc_cons_allocated(currcons)) { /* could this happen? */ pr_warn_once("con_write: tty %d not allocated\n", currcons+1); console_unlock(); return 0; } /* undraw cursor first */ if (con_is_fg(vc)) hide_cursor(vc); param.vc = vc; while (!tty->flow.stopped && count) { u8 orig = *buf; buf++; n++; count--; rescan_last_byte: c = orig; rescan = false; tc = vc_translate(vc, &c, &rescan); if (tc == -1) continue; param.c = tc; if (atomic_notifier_call_chain(&vt_notifier_list, VT_PREWRITE, &param) == NOTIFY_STOP) continue; if (vc_is_control(vc, tc, c)) { con_flush(vc, &draw); do_con_trol(tty, vc, orig); continue; } if (vc_con_write_normal(vc, tc, c, &draw) < 0) continue; if (rescan) goto rescan_last_byte; } con_flush(vc, &draw); console_conditional_schedule(); notify_update(vc); console_unlock(); return n; } /* * This is the console switching callback. * * Doing console switching in a process context allows * us to do the switches asynchronously (needed when we want * to switch due to a keyboard interrupt). Synchronization * with other console code and prevention of re-entrancy is * ensured with console_lock. */ static void console_callback(struct work_struct *ignored) { console_lock(); if (want_console >= 0) { if (want_console != fg_console && vc_cons_allocated(want_console)) { hide_cursor(vc_cons[fg_console].d); change_console(vc_cons[want_console].d); /* we only changed when the console had already been allocated - a new console is not created in an interrupt routine */ } want_console = -1; } if (do_poke_blanked_console) { /* do not unblank for a LED change */ do_poke_blanked_console = 0; poke_blanked_console(); } if (scrollback_delta) { struct vc_data *vc = vc_cons[fg_console].d; clear_selection(); if (vc->vc_mode == KD_TEXT && vc->vc_sw->con_scrolldelta) vc->vc_sw->con_scrolldelta(vc, scrollback_delta); scrollback_delta = 0; } if (blank_timer_expired) { do_blank_screen(0); blank_timer_expired = 0; } notify_update(vc_cons[fg_console].d); console_unlock(); } int set_console(int nr) { struct vc_data *vc = vc_cons[fg_console].d; if (!vc_cons_allocated(nr) || vt_dont_switch || (vc->vt_mode.mode == VT_AUTO && vc->vc_mode == KD_GRAPHICS)) { /* * Console switch will fail in console_callback() or * change_console() so there is no point scheduling * the callback * * Existing set_console() users don't check the return * value so this shouldn't break anything */ return -EINVAL; } want_console = nr; schedule_console_callback(); return 0; } struct tty_driver *console_driver; #ifdef CONFIG_VT_CONSOLE /** * vt_kmsg_redirect() - sets/gets the kernel message console * @new: the new virtual terminal number or -1 if the console should stay * unchanged * * By default, the kernel messages are always printed on the current virtual * console. However, the user may modify that default with the * %TIOCL_SETKMSGREDIRECT ioctl call. * * This function sets the kernel message console to be @new. It returns the old * virtual console number. The virtual terminal number %0 (both as parameter and * return value) means no redirection (i.e. always printed on the currently * active console). * * The parameter -1 means that only the current console is returned, but the * value is not modified. You may use the macro vt_get_kmsg_redirect() in that * case to make the code more understandable. * * When the kernel is compiled without %CONFIG_VT_CONSOLE, this function ignores * the parameter and always returns %0. */ int vt_kmsg_redirect(int new) { static int kmsg_con; if (new != -1) return xchg(&kmsg_con, new); else return kmsg_con; } /* * Console on virtual terminal * * The console must be locked when we get here. */ static void vt_console_print(struct console *co, const char *b, unsigned count) { struct vc_data *vc = vc_cons[fg_console].d; unsigned char c; static DEFINE_SPINLOCK(printing_lock); const ushort *start; ushort start_x, cnt; int kmsg_console; WARN_CONSOLE_UNLOCKED(); /* this protects against concurrent oops only */ if (!spin_trylock(&printing_lock)) return; kmsg_console = vt_get_kmsg_redirect(); if (kmsg_console && vc_cons_allocated(kmsg_console - 1)) vc = vc_cons[kmsg_console - 1].d; if (!vc_cons_allocated(fg_console)) { /* impossible */ /* printk("vt_console_print: tty %d not allocated ??\n", currcons+1); */ goto quit; } if (vc->vc_mode != KD_TEXT) goto quit; /* undraw cursor first */ if (con_is_fg(vc)) hide_cursor(vc); start = (ushort *)vc->vc_pos; start_x = vc->state.x; cnt = 0; while (count--) { c = *b++; if (c == ASCII_LINEFEED || c == ASCII_CAR_RET || c == ASCII_BACKSPACE || vc->vc_need_wrap) { if (cnt && con_is_visible(vc)) vc->vc_sw->con_putcs(vc, start, cnt, vc->state.y, start_x); cnt = 0; if (c == ASCII_BACKSPACE) { bs(vc); start = (ushort *)vc->vc_pos; start_x = vc->state.x; continue; } if (c != ASCII_CAR_RET) lf(vc); cr(vc); start = (ushort *)vc->vc_pos; start_x = vc->state.x; if (c == ASCII_LINEFEED || c == ASCII_CAR_RET) continue; } vc_uniscr_putc(vc, c); scr_writew((vc->vc_attr << 8) + c, (unsigned short *)vc->vc_pos); notify_write(vc, c); cnt++; if (vc->state.x == vc->vc_cols - 1) { vc->vc_need_wrap = 1; } else { vc->vc_pos += 2; vc->state.x++; } } if (cnt && con_is_visible(vc)) vc->vc_sw->con_putcs(vc, start, cnt, vc->state.y, start_x); set_cursor(vc); notify_update(vc); quit: spin_unlock(&printing_lock); } static struct tty_driver *vt_console_device(struct console *c, int *index) { *index = c->index ? c->index-1 : fg_console; return console_driver; } static int vt_console_setup(struct console *co, char *options) { return co->index >= MAX_NR_CONSOLES ? -EINVAL : 0; } static struct console vt_console_driver = { .name = "tty", .setup = vt_console_setup, .write = vt_console_print, .device = vt_console_device, .unblank = unblank_screen, .flags = CON_PRINTBUFFER, .index = -1, }; #endif /* * Handling of Linux-specific VC ioctls */ /* * Generally a bit racy with respect to console_lock();. * * There are some functions which don't need it. * * There are some functions which can sleep for arbitrary periods * (paste_selection) but we don't need the lock there anyway. * * set_selection_user has locking, and definitely needs it */ int tioclinux(struct tty_struct *tty, unsigned long arg) { char type, data; char __user *p = (char __user *)arg; void __user *param_aligned32 = (u32 __user *)arg + 1; void __user *param = (void __user *)arg + 1; int lines; int ret; if (current->signal->tty != tty && !capable(CAP_SYS_ADMIN)) return -EPERM; if (get_user(type, p)) return -EFAULT; ret = 0; switch (type) { case TIOCL_SETSEL: return set_selection_user(param, tty); case TIOCL_PASTESEL: if (!capable(CAP_SYS_ADMIN)) return -EPERM; return paste_selection(tty); case TIOCL_UNBLANKSCREEN: console_lock(); unblank_screen(); console_unlock(); break; case TIOCL_SELLOADLUT: if (!capable(CAP_SYS_ADMIN)) return -EPERM; return sel_loadlut(param_aligned32); case TIOCL_GETSHIFTSTATE: /* * Make it possible to react to Shift+Mousebutton. Note that * 'shift_state' is an undocumented kernel-internal variable; * programs not closely related to the kernel should not use * this. */ data = vt_get_shift_state(); return put_user(data, p); case TIOCL_GETMOUSEREPORTING: console_lock(); /* May be overkill */ data = mouse_reporting(); console_unlock(); return put_user(data, p); case TIOCL_SETVESABLANK: return set_vesa_blanking(param); case TIOCL_GETKMSGREDIRECT: data = vt_get_kmsg_redirect(); return put_user(data, p); case TIOCL_SETKMSGREDIRECT: if (!capable(CAP_SYS_ADMIN)) return -EPERM; if (get_user(data, p+1)) return -EFAULT; vt_kmsg_redirect(data); break; case TIOCL_GETFGCONSOLE: /* * No locking needed as this is a transiently correct return * anyway if the caller hasn't disabled switching. */ return fg_console; case TIOCL_SCROLLCONSOLE: if (get_user(lines, (s32 __user *)param_aligned32)) return -EFAULT; /* * Needs the console lock here. Note that lots of other calls * need fixing before the lock is actually useful! */ console_lock(); scrollfront(vc_cons[fg_console].d, lines); console_unlock(); break; case TIOCL_BLANKSCREEN: /* until explicitly unblanked, not only poked */ console_lock(); ignore_poke = 1; do_blank_screen(0); console_unlock(); break; case TIOCL_BLANKEDSCREEN: return console_blanked; default: return -EINVAL; } return ret; } /* * /dev/ttyN handling */ static ssize_t con_write(struct tty_struct *tty, const u8 *buf, size_t count) { int retval; retval = do_con_write(tty, buf, count); con_flush_chars(tty); return retval; } static int con_put_char(struct tty_struct *tty, u8 ch) { return do_con_write(tty, &ch, 1); } static unsigned int con_write_room(struct tty_struct *tty) { if (tty->flow.stopped) return 0; return 32768; /* No limit, really; we're not buffering */ } /* * con_throttle and con_unthrottle are only used for * paste_selection(), which has to stuff in a large number of * characters... */ static void con_throttle(struct tty_struct *tty) { } static void con_unthrottle(struct tty_struct *tty) { struct vc_data *vc = tty->driver_data; wake_up_interruptible(&vc->paste_wait); } /* * Turn the Scroll-Lock LED on when the tty is stopped */ static void con_stop(struct tty_struct *tty) { int console_num; if (!tty) return; console_num = tty->index; if (!vc_cons_allocated(console_num)) return; vt_kbd_con_stop(console_num); } /* * Turn the Scroll-Lock LED off when the console is started */ static void con_start(struct tty_struct *tty) { int console_num; if (!tty) return; console_num = tty->index; if (!vc_cons_allocated(console_num)) return; vt_kbd_con_start(console_num); } static void con_flush_chars(struct tty_struct *tty) { struct vc_data *vc = tty->driver_data; if (in_interrupt()) /* from flush_to_ldisc */ return; console_lock(); set_cursor(vc); console_unlock(); } /* * Allocate the console screen memory. */ static int con_install(struct tty_driver *driver, struct tty_struct *tty) { unsigned int currcons = tty->index; struct vc_data *vc; int ret; console_lock(); ret = vc_allocate(currcons); if (ret) goto unlock; vc = vc_cons[currcons].d; /* Still being freed */ if (vc->port.tty) { ret = -ERESTARTSYS; goto unlock; } ret = tty_port_install(&vc->port, driver, tty); if (ret) goto unlock; tty->driver_data = vc; vc->port.tty = tty; tty_port_get(&vc->port); if (!tty->winsize.ws_row && !tty->winsize.ws_col) { tty->winsize.ws_row = vc_cons[currcons].d->vc_rows; tty->winsize.ws_col = vc_cons[currcons].d->vc_cols; } if (vc->vc_utf) tty->termios.c_iflag |= IUTF8; else tty->termios.c_iflag &= ~IUTF8; unlock: console_unlock(); return ret; } static int con_open(struct tty_struct *tty, struct file *filp) { /* everything done in install */ return 0; } static void con_close(struct tty_struct *tty, struct file *filp) { /* Nothing to do - we defer to shutdown */ } static void con_shutdown(struct tty_struct *tty) { struct vc_data *vc = tty->driver_data; BUG_ON(vc == NULL); console_lock(); vc->port.tty = NULL; console_unlock(); } static void con_cleanup(struct tty_struct *tty) { struct vc_data *vc = tty->driver_data; tty_port_put(&vc->port); } /* * We can't deal with anything but the N_TTY ldisc, * because we can sleep in our write() routine. */ static int con_ldisc_ok(struct tty_struct *tty, int ldisc) { return ldisc == N_TTY ? 0 : -EINVAL; } static int default_color = 7; /* white */ static int default_italic_color = 2; // green (ASCII) static int default_underline_color = 3; // cyan (ASCII) module_param_named(color, default_color, int, S_IRUGO | S_IWUSR); module_param_named(italic, default_italic_color, int, S_IRUGO | S_IWUSR); module_param_named(underline, default_underline_color, int, S_IRUGO | S_IWUSR); static void vc_init(struct vc_data *vc, int do_clear) { int j, k ; set_origin(vc); vc->vc_pos = vc->vc_origin; reset_vc(vc); for (j=k=0; j<16; j++) { vc->vc_palette[k++] = default_red[j] ; vc->vc_palette[k++] = default_grn[j] ; vc->vc_palette[k++] = default_blu[j] ; } vc->vc_def_color = default_color; vc->vc_ulcolor = default_underline_color; vc->vc_itcolor = default_italic_color; vc->vc_halfcolor = 0x08; /* grey */ init_waitqueue_head(&vc->paste_wait); reset_terminal(vc, do_clear); } /* * This routine initializes console interrupts, and does nothing * else. If you want the screen to clear, call tty_write with * the appropriate escape-sequence. */ static int __init con_init(void) { const char *display_desc = NULL; struct vc_data *vc; unsigned int currcons = 0, i; console_lock(); if (!conswitchp) conswitchp = &dummy_con; display_desc = conswitchp->con_startup(); if (!display_desc) { fg_console = 0; console_unlock(); return 0; } for (i = 0; i < MAX_NR_CON_DRIVER; i++) { struct con_driver *con_driver = &registered_con_driver[i]; if (con_driver->con == NULL) { con_driver->con = conswitchp; con_driver->desc = display_desc; con_driver->flag = CON_DRIVER_FLAG_INIT; con_driver->first = 0; con_driver->last = MAX_NR_CONSOLES - 1; break; } } for (i = 0; i < MAX_NR_CONSOLES; i++) con_driver_map[i] = conswitchp; if (blankinterval) { blank_state = blank_normal_wait; mod_timer(&console_timer, jiffies + (blankinterval * HZ)); } for (currcons = 0; currcons < MIN_NR_CONSOLES; currcons++) { vc_cons[currcons].d = vc = kzalloc(sizeof(struct vc_data), GFP_NOWAIT); INIT_WORK(&vc_cons[currcons].SAK_work, vc_SAK); tty_port_init(&vc->port); visual_init(vc, currcons, true); /* Assuming vc->vc_{cols,rows,screenbuf_size} are sane here. */ vc->vc_screenbuf = kzalloc(vc->vc_screenbuf_size, GFP_NOWAIT); vc_init(vc, currcons || !vc->vc_sw->con_save_screen); } currcons = fg_console = 0; master_display_fg = vc = vc_cons[currcons].d; set_origin(vc); save_screen(vc); gotoxy(vc, vc->state.x, vc->state.y); csi_J(vc, CSI_J_CURSOR_TO_END); update_screen(vc); pr_info("Console: %s %s %dx%d\n", vc->vc_can_do_color ? "colour" : "mono", display_desc, vc->vc_cols, vc->vc_rows); console_unlock(); #ifdef CONFIG_VT_CONSOLE register_console(&vt_console_driver); #endif return 0; } console_initcall(con_init); static const struct tty_operations con_ops = { .install = con_install, .open = con_open, .close = con_close, .write = con_write, .write_room = con_write_room, .put_char = con_put_char, .flush_chars = con_flush_chars, .ioctl = vt_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = vt_compat_ioctl, #endif .stop = con_stop, .start = con_start, .throttle = con_throttle, .unthrottle = con_unthrottle, .resize = vt_resize, .shutdown = con_shutdown, .cleanup = con_cleanup, .ldisc_ok = con_ldisc_ok, }; static struct cdev vc0_cdev; static ssize_t show_tty_active(struct device *dev, struct device_attribute *attr, char *buf) { return sprintf(buf, "tty%d\n", fg_console + 1); } static DEVICE_ATTR(active, S_IRUGO, show_tty_active, NULL); static struct attribute *vt_dev_attrs[] = { &dev_attr_active.attr, NULL }; ATTRIBUTE_GROUPS(vt_dev); int __init vty_init(const struct file_operations *console_fops) { cdev_init(&vc0_cdev, console_fops); if (cdev_add(&vc0_cdev, MKDEV(TTY_MAJOR, 0), 1) || register_chrdev_region(MKDEV(TTY_MAJOR, 0), 1, "/dev/vc/0") < 0) panic("Couldn't register /dev/tty0 driver\n"); tty0dev = device_create_with_groups(&tty_class, NULL, MKDEV(TTY_MAJOR, 0), NULL, vt_dev_groups, "tty0"); if (IS_ERR(tty0dev)) tty0dev = NULL; vcs_init(); console_driver = tty_alloc_driver(MAX_NR_CONSOLES, TTY_DRIVER_REAL_RAW | TTY_DRIVER_RESET_TERMIOS); if (IS_ERR(console_driver)) panic("Couldn't allocate console driver\n"); console_driver->name = "tty"; console_driver->name_base = 1; console_driver->major = TTY_MAJOR; console_driver->minor_start = 1; console_driver->type = TTY_DRIVER_TYPE_CONSOLE; console_driver->init_termios = tty_std_termios; if (default_utf8) console_driver->init_termios.c_iflag |= IUTF8; tty_set_operations(console_driver, &con_ops); if (tty_register_driver(console_driver)) panic("Couldn't register console driver\n"); kbd_init(); console_map_init(); #ifdef CONFIG_MDA_CONSOLE mda_console_init(); #endif return 0; } static const struct class vtconsole_class = { .name = "vtconsole", }; static int do_bind_con_driver(const struct consw *csw, int first, int last, int deflt) { struct module *owner = csw->owner; const char *desc = NULL; struct con_driver *con_driver; int i, j = -1, k = -1, retval = -ENODEV; if (!try_module_get(owner)) return -ENODEV; WARN_CONSOLE_UNLOCKED(); /* check if driver is registered */ for (i = 0; i < MAX_NR_CON_DRIVER; i++) { con_driver = &registered_con_driver[i]; if (con_driver->con == csw) { desc = con_driver->desc; retval = 0; break; } } if (retval) goto err; if (!(con_driver->flag & CON_DRIVER_FLAG_INIT)) { csw->con_startup(); con_driver->flag |= CON_DRIVER_FLAG_INIT; } if (deflt) { if (conswitchp) module_put(conswitchp->owner); __module_get(owner); conswitchp = csw; } first = max(first, con_driver->first); last = min(last, con_driver->last); for (i = first; i <= last; i++) { int old_was_color; struct vc_data *vc = vc_cons[i].d; if (con_driver_map[i]) module_put(con_driver_map[i]->owner); __module_get(owner); con_driver_map[i] = csw; if (!vc || !vc->vc_sw) continue; j = i; if (con_is_visible(vc)) { k = i; save_screen(vc); } old_was_color = vc->vc_can_do_color; vc->vc_sw->con_deinit(vc); vc->vc_origin = (unsigned long)vc->vc_screenbuf; visual_init(vc, i, false); set_origin(vc); update_attr(vc); /* If the console changed between mono <-> color, then * the attributes in the screenbuf will be wrong. The * following resets all attributes to something sane. */ if (old_was_color != vc->vc_can_do_color) clear_buffer_attributes(vc); } pr_info("Console: switching "); if (!deflt) pr_cont("consoles %d-%d ", first + 1, last + 1); if (j >= 0) { struct vc_data *vc = vc_cons[j].d; pr_cont("to %s %s %dx%d\n", vc->vc_can_do_color ? "colour" : "mono", desc, vc->vc_cols, vc->vc_rows); if (k >= 0) { vc = vc_cons[k].d; update_screen(vc); } } else { pr_cont("to %s\n", desc); } retval = 0; err: module_put(owner); return retval; }; #ifdef CONFIG_VT_HW_CONSOLE_BINDING int do_unbind_con_driver(const struct consw *csw, int first, int last, int deflt) { struct module *owner = csw->owner; const struct consw *defcsw = NULL; struct con_driver *con_driver = NULL, *con_back = NULL; int i, retval = -ENODEV; if (!try_module_get(owner)) return -ENODEV; WARN_CONSOLE_UNLOCKED(); /* check if driver is registered and if it is unbindable */ for (i = 0; i < MAX_NR_CON_DRIVER; i++) { con_driver = &registered_con_driver[i]; if (con_driver->con == csw && con_driver->flag & CON_DRIVER_FLAG_MODULE) { retval = 0; break; } } if (retval) goto err; retval = -ENODEV; /* check if backup driver exists */ for (i = 0; i < MAX_NR_CON_DRIVER; i++) { con_back = &registered_con_driver[i]; if (con_back->con && con_back->con != csw) { defcsw = con_back->con; retval = 0; break; } } if (retval) goto err; if (!con_is_bound(csw)) goto err; first = max(first, con_driver->first); last = min(last, con_driver->last); for (i = first; i <= last; i++) { if (con_driver_map[i] == csw) { module_put(csw->owner); con_driver_map[i] = NULL; } } if (!con_is_bound(defcsw)) { const struct consw *defconsw = conswitchp; defcsw->con_startup(); con_back->flag |= CON_DRIVER_FLAG_INIT; /* * vgacon may change the default driver to point * to dummycon, we restore it here... */ conswitchp = defconsw; } if (!con_is_bound(csw)) con_driver->flag &= ~CON_DRIVER_FLAG_INIT; /* ignore return value, binding should not fail */ do_bind_con_driver(defcsw, first, last, deflt); err: module_put(owner); return retval; } EXPORT_SYMBOL_GPL(do_unbind_con_driver); static int vt_bind(struct con_driver *con) { const struct consw *defcsw = NULL, *csw = NULL; int i, more = 1, first = -1, last = -1, deflt = 0; if (!con->con || !(con->flag & CON_DRIVER_FLAG_MODULE)) goto err; csw = con->con; for (i = 0; i < MAX_NR_CON_DRIVER; i++) { struct con_driver *con = &registered_con_driver[i]; if (con->con && !(con->flag & CON_DRIVER_FLAG_MODULE)) { defcsw = con->con; break; } } if (!defcsw) goto err; while (more) { more = 0; for (i = con->first; i <= con->last; i++) { if (con_driver_map[i] == defcsw) { if (first == -1) first = i; last = i; more = 1; } else if (first != -1) break; } if (first == 0 && last == MAX_NR_CONSOLES -1) deflt = 1; if (first != -1) do_bind_con_driver(csw, first, last, deflt); first = -1; last = -1; deflt = 0; } err: return 0; } static int vt_unbind(struct con_driver *con) { const struct consw *csw = NULL; int i, more = 1, first = -1, last = -1, deflt = 0; int ret; if (!con->con || !(con->flag & CON_DRIVER_FLAG_MODULE)) goto err; csw = con->con; while (more) { more = 0; for (i = con->first; i <= con->last; i++) { if (con_driver_map[i] == csw) { if (first == -1) first = i; last = i; more = 1; } else if (first != -1) break; } if (first == 0 && last == MAX_NR_CONSOLES -1) deflt = 1; if (first != -1) { ret = do_unbind_con_driver(csw, first, last, deflt); if (ret != 0) return ret; } first = -1; last = -1; deflt = 0; } err: return 0; } #else static inline int vt_bind(struct con_driver *con) { return 0; } static inline int vt_unbind(struct con_driver *con) { return 0; } #endif /* CONFIG_VT_HW_CONSOLE_BINDING */ static ssize_t store_bind(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct con_driver *con = dev_get_drvdata(dev); int bind = simple_strtoul(buf, NULL, 0); console_lock(); if (bind) vt_bind(con); else vt_unbind(con); console_unlock(); return count; } static ssize_t show_bind(struct device *dev, struct device_attribute *attr, char *buf) { struct con_driver *con = dev_get_drvdata(dev); int bind; console_lock(); bind = con_is_bound(con->con); console_unlock(); return sysfs_emit(buf, "%i\n", bind); } static ssize_t show_name(struct device *dev, struct device_attribute *attr, char *buf) { struct con_driver *con = dev_get_drvdata(dev); return sysfs_emit(buf, "%s %s\n", (con->flag & CON_DRIVER_FLAG_MODULE) ? "(M)" : "(S)", con->desc); } static DEVICE_ATTR(bind, S_IRUGO|S_IWUSR, show_bind, store_bind); static DEVICE_ATTR(name, S_IRUGO, show_name, NULL); static struct attribute *con_dev_attrs[] = { &dev_attr_bind.attr, &dev_attr_name.attr, NULL }; ATTRIBUTE_GROUPS(con_dev); static int vtconsole_init_device(struct con_driver *con) { con->flag |= CON_DRIVER_FLAG_ATTR; return 0; } static void vtconsole_deinit_device(struct con_driver *con) { con->flag &= ~CON_DRIVER_FLAG_ATTR; } /** * con_is_bound - checks if driver is bound to the console * @csw: console driver * * RETURNS: zero if unbound, nonzero if bound * * Drivers can call this and if zero, they should release * all resources allocated on &consw.con_startup() */ int con_is_bound(const struct consw *csw) { int i, bound = 0; WARN_CONSOLE_UNLOCKED(); for (i = 0; i < MAX_NR_CONSOLES; i++) { if (con_driver_map[i] == csw) { bound = 1; break; } } return bound; } EXPORT_SYMBOL(con_is_bound); /** * con_is_visible - checks whether the current console is visible * @vc: virtual console * * RETURNS: zero if not visible, nonzero if visible */ bool con_is_visible(const struct vc_data *vc) { WARN_CONSOLE_UNLOCKED(); return *vc->vc_display_fg == vc; } EXPORT_SYMBOL(con_is_visible); /** * con_debug_enter - prepare the console for the kernel debugger * @vc: virtual console * * Called when the console is taken over by the kernel debugger, this * function needs to save the current console state, then put the console * into a state suitable for the kernel debugger. */ void con_debug_enter(struct vc_data *vc) { saved_fg_console = fg_console; saved_last_console = last_console; saved_want_console = want_console; saved_vc_mode = vc->vc_mode; saved_console_blanked = console_blanked; vc->vc_mode = KD_TEXT; console_blanked = 0; if (vc->vc_sw->con_debug_enter) vc->vc_sw->con_debug_enter(vc); #ifdef CONFIG_KGDB_KDB /* Set the initial LINES variable if it is not already set */ if (vc->vc_rows < 999) { int linecount; char lns[4]; const char *setargs[3] = { "set", "LINES", lns, }; if (kdbgetintenv(setargs[0], &linecount)) { snprintf(lns, 4, "%i", vc->vc_rows); kdb_set(2, setargs); } } if (vc->vc_cols < 999) { int colcount; char cols[4]; const char *setargs[3] = { "set", "COLUMNS", cols, }; if (kdbgetintenv(setargs[0], &colcount)) { snprintf(cols, 4, "%i", vc->vc_cols); kdb_set(2, setargs); } } #endif /* CONFIG_KGDB_KDB */ } EXPORT_SYMBOL_GPL(con_debug_enter); /** * con_debug_leave - restore console state * * Restore the console state to what it was before the kernel debugger * was invoked. */ void con_debug_leave(void) { struct vc_data *vc; fg_console = saved_fg_console; last_console = saved_last_console; want_console = saved_want_console; console_blanked = saved_console_blanked; vc_cons[fg_console].d->vc_mode = saved_vc_mode; vc = vc_cons[fg_console].d; if (vc->vc_sw->con_debug_leave) vc->vc_sw->con_debug_leave(vc); } EXPORT_SYMBOL_GPL(con_debug_leave); static int do_register_con_driver(const struct consw *csw, int first, int last) { struct module *owner = csw->owner; struct con_driver *con_driver; const char *desc; int i, retval; WARN_CONSOLE_UNLOCKED(); if (!try_module_get(owner)) return -ENODEV; for (i = 0; i < MAX_NR_CON_DRIVER; i++) { con_driver = &registered_con_driver[i]; /* already registered */ if (con_driver->con == csw) { retval = -EBUSY; goto err; } } desc = csw->con_startup(); if (!desc) { retval = -ENODEV; goto err; } retval = -EINVAL; for (i = 0; i < MAX_NR_CON_DRIVER; i++) { con_driver = &registered_con_driver[i]; if (con_driver->con == NULL && !(con_driver->flag & CON_DRIVER_FLAG_ZOMBIE)) { con_driver->con = csw; con_driver->desc = desc; con_driver->node = i; con_driver->flag = CON_DRIVER_FLAG_MODULE | CON_DRIVER_FLAG_INIT; con_driver->first = first; con_driver->last = last; retval = 0; break; } } if (retval) goto err; con_driver->dev = device_create_with_groups(&vtconsole_class, NULL, MKDEV(0, con_driver->node), con_driver, con_dev_groups, "vtcon%i", con_driver->node); if (IS_ERR(con_driver->dev)) { pr_warn("Unable to create device for %s; errno = %ld\n", con_driver->desc, PTR_ERR(con_driver->dev)); con_driver->dev = NULL; } else { vtconsole_init_device(con_driver); } err: module_put(owner); return retval; } /** * do_unregister_con_driver - unregister console driver from console layer * @csw: console driver * * DESCRIPTION: All drivers that registers to the console layer must * call this function upon exit, or if the console driver is in a state * where it won't be able to handle console services, such as the * framebuffer console without loaded framebuffer drivers. * * The driver must unbind first prior to unregistration. */ int do_unregister_con_driver(const struct consw *csw) { int i; /* cannot unregister a bound driver */ if (con_is_bound(csw)) return -EBUSY; if (csw == conswitchp) return -EINVAL; for (i = 0; i < MAX_NR_CON_DRIVER; i++) { struct con_driver *con_driver = &registered_con_driver[i]; if (con_driver->con == csw) { /* * Defer the removal of the sysfs entries since that * will acquire the kernfs s_active lock and we can't * acquire this lock while holding the console lock: * the unbind sysfs entry imposes already the opposite * order. Reset con already here to prevent any later * lookup to succeed and mark this slot as zombie, so * it won't get reused until we complete the removal * in the deferred work. */ con_driver->con = NULL; con_driver->flag = CON_DRIVER_FLAG_ZOMBIE; schedule_work(&con_driver_unregister_work); return 0; } } return -ENODEV; } EXPORT_SYMBOL_GPL(do_unregister_con_driver); static void con_driver_unregister_callback(struct work_struct *ignored) { int i; console_lock(); for (i = 0; i < MAX_NR_CON_DRIVER; i++) { struct con_driver *con_driver = &registered_con_driver[i]; if (!(con_driver->flag & CON_DRIVER_FLAG_ZOMBIE)) continue; console_unlock(); vtconsole_deinit_device(con_driver); device_destroy(&vtconsole_class, MKDEV(0, con_driver->node)); console_lock(); if (WARN_ON_ONCE(con_driver->con)) con_driver->con = NULL; con_driver->desc = NULL; con_driver->dev = NULL; con_driver->node = 0; WARN_ON_ONCE(con_driver->flag != CON_DRIVER_FLAG_ZOMBIE); con_driver->flag = 0; con_driver->first = 0; con_driver->last = 0; } console_unlock(); } /* * If we support more console drivers, this function is used * when a driver wants to take over some existing consoles * and become default driver for newly opened ones. * * do_take_over_console is basically a register followed by bind */ int do_take_over_console(const struct consw *csw, int first, int last, int deflt) { int err; err = do_register_con_driver(csw, first, last); /* * If we get an busy error we still want to bind the console driver * and return success, as we may have unbound the console driver * but not unregistered it. */ if (err == -EBUSY) err = 0; if (!err) do_bind_con_driver(csw, first, last, deflt); return err; } EXPORT_SYMBOL_GPL(do_take_over_console); /* * give_up_console is a wrapper to unregister_con_driver. It will only * work if driver is fully unbound. */ void give_up_console(const struct consw *csw) { console_lock(); do_unregister_con_driver(csw); console_unlock(); } EXPORT_SYMBOL(give_up_console); static int __init vtconsole_class_init(void) { int i; i = class_register(&vtconsole_class); if (i) pr_warn("Unable to create vt console class; errno = %d\n", i); /* Add system drivers to sysfs */ for (i = 0; i < MAX_NR_CON_DRIVER; i++) { struct con_driver *con = &registered_con_driver[i]; if (con->con && !con->dev) { con->dev = device_create_with_groups(&vtconsole_class, NULL, MKDEV(0, con->node), con, con_dev_groups, "vtcon%i", con->node); if (IS_ERR(con->dev)) { pr_warn("Unable to create device for %s; errno = %ld\n", con->desc, PTR_ERR(con->dev)); con->dev = NULL; } else { vtconsole_init_device(con); } } } return 0; } postcore_initcall(vtconsole_class_init); /* * Screen blanking */ static int set_vesa_blanking(u8 __user *mode_user) { u8 mode; if (get_user(mode, mode_user)) return -EFAULT; console_lock(); vesa_blank_mode = (mode <= VESA_BLANK_MAX) ? mode : VESA_NO_BLANKING; console_unlock(); return 0; } void do_blank_screen(int entering_gfx) { struct vc_data *vc = vc_cons[fg_console].d; int i; might_sleep(); WARN_CONSOLE_UNLOCKED(); if (console_blanked) { if (blank_state == blank_vesa_wait) { blank_state = blank_off; vc->vc_sw->con_blank(vc, vesa_blank_mode + 1, 0); } return; } /* entering graphics mode? */ if (entering_gfx) { hide_cursor(vc); save_screen(vc); vc->vc_sw->con_blank(vc, VESA_VSYNC_SUSPEND, 1); console_blanked = fg_console + 1; blank_state = blank_off; set_origin(vc); return; } blank_state = blank_off; /* don't blank graphics */ if (vc->vc_mode != KD_TEXT) { console_blanked = fg_console + 1; return; } hide_cursor(vc); timer_delete_sync(&console_timer); blank_timer_expired = 0; save_screen(vc); /* In case we need to reset origin, blanking hook returns 1 */ i = vc->vc_sw->con_blank(vc, vesa_off_interval ? VESA_VSYNC_SUSPEND : (vesa_blank_mode + 1), 0); console_blanked = fg_console + 1; if (i) set_origin(vc); if (console_blank_hook && console_blank_hook(1)) return; if (vesa_off_interval && vesa_blank_mode) { blank_state = blank_vesa_wait; mod_timer(&console_timer, jiffies + vesa_off_interval); } vt_event_post(VT_EVENT_BLANK, vc->vc_num, vc->vc_num); } EXPORT_SYMBOL(do_blank_screen); /* * Called by timer as well as from vt_console_driver */ void do_unblank_screen(int leaving_gfx) { struct vc_data *vc; /* This should now always be called from a "sane" (read: can schedule) * context for the sake of the low level drivers, except in the special * case of oops_in_progress */ if (!oops_in_progress) might_sleep(); WARN_CONSOLE_UNLOCKED(); ignore_poke = 0; if (!console_blanked) return; if (!vc_cons_allocated(fg_console)) { /* impossible */ pr_warn("unblank_screen: tty %d not allocated ??\n", fg_console + 1); return; } vc = vc_cons[fg_console].d; if (vc->vc_mode != KD_TEXT) return; /* but leave console_blanked != 0 */ if (blankinterval) { mod_timer(&console_timer, jiffies + (blankinterval * HZ)); blank_state = blank_normal_wait; } console_blanked = 0; if (vc->vc_sw->con_blank(vc, VESA_NO_BLANKING, leaving_gfx)) /* Low-level driver cannot restore -> do it ourselves */ update_screen(vc); if (console_blank_hook) console_blank_hook(0); set_palette(vc); set_cursor(vc); vt_event_post(VT_EVENT_UNBLANK, vc->vc_num, vc->vc_num); } EXPORT_SYMBOL(do_unblank_screen); /* * This is called by the outside world to cause a forced unblank, mostly for * oopses. Currently, I just call do_unblank_screen(0), but we could eventually * call it with 1 as an argument and so force a mode restore... that may kill * X or at least garbage the screen but would also make the Oops visible... */ static void unblank_screen(void) { do_unblank_screen(0); } /* * We defer the timer blanking to work queue so it can take the console mutex * (console operations can still happen at irq time, but only from printk which * has the console mutex. Not perfect yet, but better than no locking */ static void blank_screen_t(struct timer_list *unused) { blank_timer_expired = 1; schedule_work(&console_work); } void poke_blanked_console(void) { WARN_CONSOLE_UNLOCKED(); /* Add this so we quickly catch whoever might call us in a non * safe context. Nowadays, unblank_screen() isn't to be called in * atomic contexts and is allowed to schedule (with the special case * of oops_in_progress, but that isn't of any concern for this * function. --BenH. */ might_sleep(); /* This isn't perfectly race free, but a race here would be mostly harmless, * at worst, we'll do a spurious blank and it's unlikely */ timer_delete(&console_timer); blank_timer_expired = 0; if (ignore_poke || !vc_cons[fg_console].d || vc_cons[fg_console].d->vc_mode == KD_GRAPHICS) return; if (console_blanked) unblank_screen(); else if (blankinterval) { mod_timer(&console_timer, jiffies + (blankinterval * HZ)); blank_state = blank_normal_wait; } } /* * Palettes */ static void set_palette(struct vc_data *vc) { WARN_CONSOLE_UNLOCKED(); if (vc->vc_mode != KD_GRAPHICS && vc->vc_sw->con_set_palette) vc->vc_sw->con_set_palette(vc, color_table); } /* * Load palette into the DAC registers. arg points to a colour * map, 3 bytes per colour, 16 colours, range from 0 to 255. */ int con_set_cmap(unsigned char __user *arg) { int i, j, k; unsigned char colormap[3*16]; if (copy_from_user(colormap, arg, sizeof(colormap))) return -EFAULT; console_lock(); for (i = k = 0; i < 16; i++) { default_red[i] = colormap[k++]; default_grn[i] = colormap[k++]; default_blu[i] = colormap[k++]; } for (i = 0; i < MAX_NR_CONSOLES; i++) { if (!vc_cons_allocated(i)) continue; for (j = k = 0; j < 16; j++) { vc_cons[i].d->vc_palette[k++] = default_red[j]; vc_cons[i].d->vc_palette[k++] = default_grn[j]; vc_cons[i].d->vc_palette[k++] = default_blu[j]; } set_palette(vc_cons[i].d); } console_unlock(); return 0; } int con_get_cmap(unsigned char __user *arg) { int i, k; unsigned char colormap[3*16]; console_lock(); for (i = k = 0; i < 16; i++) { colormap[k++] = default_red[i]; colormap[k++] = default_grn[i]; colormap[k++] = default_blu[i]; } console_unlock(); if (copy_to_user(arg, colormap, sizeof(colormap))) return -EFAULT; return 0; } void reset_palette(struct vc_data *vc) { int j, k; for (j=k=0; j<16; j++) { vc->vc_palette[k++] = default_red[j]; vc->vc_palette[k++] = default_grn[j]; vc->vc_palette[k++] = default_blu[j]; } set_palette(vc); } /* * Font switching * * Currently we only support fonts up to 128 pixels wide, at a maximum height * of 128 pixels. Userspace fontdata may have to be stored with 32 bytes * (shorts/ints, depending on width) reserved for each character which is * kinda wasty, but this is done in order to maintain compatibility with the * EGA/VGA fonts. It is up to the actual low-level console-driver convert data * into its favorite format (maybe we should add a `fontoffset' field to the * `display' structure so we won't have to convert the fontdata all the time. * /Jes */ #define max_font_width 64 #define max_font_height 128 #define max_font_glyphs 512 #define max_font_size (max_font_glyphs*max_font_width*max_font_height) static int con_font_get(struct vc_data *vc, struct console_font_op *op) { struct console_font font; int rc = -EINVAL; int c; unsigned int vpitch = op->op == KD_FONT_OP_GET_TALL ? op->height : 32; if (vpitch > max_font_height) return -EINVAL; if (op->data) { font.data = kvzalloc(max_font_size, GFP_KERNEL); if (!font.data) return -ENOMEM; } else font.data = NULL; console_lock(); if (vc->vc_mode != KD_TEXT) rc = -EINVAL; else if (vc->vc_sw->con_font_get) rc = vc->vc_sw->con_font_get(vc, &font, vpitch); else rc = -ENOSYS; console_unlock(); if (rc) goto out; c = (font.width+7)/8 * vpitch * font.charcount; if (op->data && font.charcount > op->charcount) rc = -ENOSPC; if (font.width > op->width || font.height > op->height) rc = -ENOSPC; if (rc) goto out; op->height = font.height; op->width = font.width; op->charcount = font.charcount; if (op->data && copy_to_user(op->data, font.data, c)) rc = -EFAULT; out: kvfree(font.data); return rc; } static int con_font_set(struct vc_data *vc, const struct console_font_op *op) { struct console_font font; int rc = -EINVAL; int size; unsigned int vpitch = op->op == KD_FONT_OP_SET_TALL ? op->height : 32; if (vc->vc_mode != KD_TEXT) return -EINVAL; if (!op->data) return -EINVAL; if (op->charcount > max_font_glyphs) return -EINVAL; if (op->width <= 0 || op->width > max_font_width || !op->height || op->height > max_font_height) return -EINVAL; if (vpitch < op->height) return -EINVAL; size = (op->width+7)/8 * vpitch * op->charcount; if (size > max_font_size) return -ENOSPC; font.data = memdup_user(op->data, size); if (IS_ERR(font.data)) return PTR_ERR(font.data); font.charcount = op->charcount; font.width = op->width; font.height = op->height; console_lock(); if (vc->vc_mode != KD_TEXT) rc = -EINVAL; else if (vc->vc_sw->con_font_set) { if (vc_is_sel(vc)) clear_selection(); rc = vc->vc_sw->con_font_set(vc, &font, vpitch, op->flags); } else rc = -ENOSYS; console_unlock(); kfree(font.data); return rc; } static int con_font_default(struct vc_data *vc, struct console_font_op *op) { struct console_font font = {.width = op->width, .height = op->height}; char name[MAX_FONT_NAME]; char *s = name; int rc; if (!op->data) s = NULL; else if (strncpy_from_user(name, op->data, MAX_FONT_NAME - 1) < 0) return -EFAULT; else name[MAX_FONT_NAME - 1] = 0; console_lock(); if (vc->vc_mode != KD_TEXT) { console_unlock(); return -EINVAL; } if (vc->vc_sw->con_font_default) { if (vc_is_sel(vc)) clear_selection(); rc = vc->vc_sw->con_font_default(vc, &font, s); } else rc = -ENOSYS; console_unlock(); if (!rc) { op->width = font.width; op->height = font.height; } return rc; } int con_font_op(struct vc_data *vc, struct console_font_op *op) { switch (op->op) { case KD_FONT_OP_SET: case KD_FONT_OP_SET_TALL: return con_font_set(vc, op); case KD_FONT_OP_GET: case KD_FONT_OP_GET_TALL: return con_font_get(vc, op); case KD_FONT_OP_SET_DEFAULT: return con_font_default(vc, op); case KD_FONT_OP_COPY: /* was buggy and never really used */ return -EINVAL; } return -ENOSYS; } /* * Interface exported to selection and vcs. */ /* used by selection */ u16 screen_glyph(const struct vc_data *vc, int offset) { u16 w = scr_readw(screenpos(vc, offset, true)); u16 c = w & 0xff; if (w & vc->vc_hi_font_mask) c |= 0x100; return c; } EXPORT_SYMBOL_GPL(screen_glyph); u32 screen_glyph_unicode(const struct vc_data *vc, int n) { u32 **uni_lines = vc->vc_uni_lines; if (uni_lines) return uni_lines[n / vc->vc_cols][n % vc->vc_cols]; return inverse_translate(vc, screen_glyph(vc, n * 2), true); } EXPORT_SYMBOL_GPL(screen_glyph_unicode); /* used by vcs - note the word offset */ unsigned short *screen_pos(const struct vc_data *vc, int w_offset, bool viewed) { return screenpos(vc, 2 * w_offset, viewed); } EXPORT_SYMBOL_GPL(screen_pos); void getconsxy(const struct vc_data *vc, unsigned char xy[static 2]) { /* clamp values if they don't fit */ xy[0] = min(vc->state.x, 0xFFu); xy[1] = min(vc->state.y, 0xFFu); } void putconsxy(struct vc_data *vc, unsigned char xy[static const 2]) { hide_cursor(vc); gotoxy(vc, xy[0], xy[1]); set_cursor(vc); } u16 vcs_scr_readw(const struct vc_data *vc, const u16 *org) { if ((unsigned long)org == vc->vc_pos && softcursor_original != -1) return softcursor_original; return scr_readw(org); } void vcs_scr_writew(struct vc_data *vc, u16 val, u16 *org) { scr_writew(val, org); if ((unsigned long)org == vc->vc_pos) { softcursor_original = -1; add_softcursor(vc); } } void vcs_scr_updated(struct vc_data *vc) { notify_update(vc); }
3993 1641 3972 2501 3992 51 128 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 // SPDX-License-Identifier: GPL-2.0 OR MIT /* * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. * * This is an implementation of the BLAKE2s hash and PRF functions. * * Information: https://blake2.net/ * */ #include <crypto/internal/blake2s.h> #include <linux/types.h> #include <linux/string.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/init.h> #include <linux/bug.h> static inline void blake2s_set_lastblock(struct blake2s_state *state) { state->f[0] = -1; } void blake2s_update(struct blake2s_state *state, const u8 *in, size_t inlen) { const size_t fill = BLAKE2S_BLOCK_SIZE - state->buflen; if (unlikely(!inlen)) return; if (inlen > fill) { memcpy(state->buf + state->buflen, in, fill); blake2s_compress(state, state->buf, 1, BLAKE2S_BLOCK_SIZE); state->buflen = 0; in += fill; inlen -= fill; } if (inlen > BLAKE2S_BLOCK_SIZE) { const size_t nblocks = DIV_ROUND_UP(inlen, BLAKE2S_BLOCK_SIZE); blake2s_compress(state, in, nblocks - 1, BLAKE2S_BLOCK_SIZE); in += BLAKE2S_BLOCK_SIZE * (nblocks - 1); inlen -= BLAKE2S_BLOCK_SIZE * (nblocks - 1); } memcpy(state->buf + state->buflen, in, inlen); state->buflen += inlen; } EXPORT_SYMBOL(blake2s_update); void blake2s_final(struct blake2s_state *state, u8 *out) { WARN_ON(IS_ENABLED(DEBUG) && !out); blake2s_set_lastblock(state); memset(state->buf + state->buflen, 0, BLAKE2S_BLOCK_SIZE - state->buflen); /* Padding */ blake2s_compress(state, state->buf, 1, state->buflen); cpu_to_le32_array(state->h, ARRAY_SIZE(state->h)); memcpy(out, state->h, state->outlen); memzero_explicit(state, sizeof(*state)); } EXPORT_SYMBOL(blake2s_final); static int __init blake2s_mod_init(void) { if (!IS_ENABLED(CONFIG_CRYPTO_MANAGER_DISABLE_TESTS) && WARN_ON(!blake2s_selftest())) return -ENODEV; return 0; } module_init(blake2s_mod_init); MODULE_DESCRIPTION("BLAKE2s hash function"); MODULE_AUTHOR("Jason A. Donenfeld <Jason@zx2c4.com>");
97 78 60 18 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 // SPDX-License-Identifier: GPL-2.0-or-later /* Null security operations. * * Copyright (C) 2016 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #include <net/af_rxrpc.h> #include "ar-internal.h" static int none_init_connection_security(struct rxrpc_connection *conn, struct rxrpc_key_token *token) { return 0; } /* * Allocate an appropriately sized buffer for the amount of data remaining. */ static struct rxrpc_txbuf *none_alloc_txbuf(struct rxrpc_call *call, size_t remain, gfp_t gfp) { return rxrpc_alloc_data_txbuf(call, umin(remain, RXRPC_JUMBO_DATALEN), 1, gfp); } static int none_secure_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb) { txb->pkt_len = txb->len; if (txb->len == RXRPC_JUMBO_DATALEN) txb->jumboable = true; return 0; } static int none_verify_packet(struct rxrpc_call *call, struct sk_buff *skb) { struct rxrpc_skb_priv *sp = rxrpc_skb(skb); sp->flags |= RXRPC_RX_VERIFIED; return 0; } static void none_free_call_crypto(struct rxrpc_call *call) { } static int none_respond_to_challenge(struct rxrpc_connection *conn, struct sk_buff *skb) { return rxrpc_abort_conn(conn, skb, RX_PROTOCOL_ERROR, -EPROTO, rxrpc_eproto_rxnull_challenge); } static int none_verify_response(struct rxrpc_connection *conn, struct sk_buff *skb) { return rxrpc_abort_conn(conn, skb, RX_PROTOCOL_ERROR, -EPROTO, rxrpc_eproto_rxnull_response); } static void none_clear(struct rxrpc_connection *conn) { } static int none_init(void) { return 0; } static void none_exit(void) { } /* * RxRPC Kerberos-based security */ const struct rxrpc_security rxrpc_no_security = { .name = "none", .security_index = RXRPC_SECURITY_NONE, .init = none_init, .exit = none_exit, .init_connection_security = none_init_connection_security, .free_call_crypto = none_free_call_crypto, .alloc_txbuf = none_alloc_txbuf, .secure_packet = none_secure_packet, .verify_packet = none_verify_packet, .respond_to_challenge = none_respond_to_challenge, .verify_response = none_verify_response, .clear = none_clear, };
2 2 2 2 2 8 8 50 50 8 8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 // SPDX-License-Identifier: GPL-2.0-only #include <linux/if_bridge.h> #include <linux/in.h> #include <linux/list.h> #include <linux/netdevice.h> #include <linux/netlink.h> #include <linux/rhashtable.h> #include <linux/rhashtable-types.h> #include <linux/rtnetlink.h> #include <linux/skbuff.h> #include <linux/types.h> #include <net/netlink.h> #include <net/vxlan.h> #include "vxlan_private.h" struct vxlan_mdb_entry_key { union vxlan_addr src; union vxlan_addr dst; __be32 vni; }; struct vxlan_mdb_entry { struct rhash_head rhnode; struct list_head remotes; struct vxlan_mdb_entry_key key; struct hlist_node mdb_node; struct rcu_head rcu; }; #define VXLAN_MDB_REMOTE_F_BLOCKED BIT(0) struct vxlan_mdb_remote { struct list_head list; struct vxlan_rdst __rcu *rd; u8 flags; u8 filter_mode; u8 rt_protocol; struct hlist_head src_list; struct rcu_head rcu; }; #define VXLAN_SGRP_F_DELETE BIT(0) struct vxlan_mdb_src_entry { struct hlist_node node; union vxlan_addr addr; u8 flags; }; struct vxlan_mdb_dump_ctx { long reserved; long entry_idx; long remote_idx; }; struct vxlan_mdb_config_src_entry { union vxlan_addr addr; struct list_head node; }; struct vxlan_mdb_config { struct vxlan_dev *vxlan; struct vxlan_mdb_entry_key group; struct list_head src_list; union vxlan_addr remote_ip; u32 remote_ifindex; __be32 remote_vni; __be16 remote_port; u16 nlflags; u8 flags; u8 filter_mode; u8 rt_protocol; }; struct vxlan_mdb_flush_desc { union vxlan_addr remote_ip; __be32 src_vni; __be32 remote_vni; __be16 remote_port; u8 rt_protocol; }; static const struct rhashtable_params vxlan_mdb_rht_params = { .head_offset = offsetof(struct vxlan_mdb_entry, rhnode), .key_offset = offsetof(struct vxlan_mdb_entry, key), .key_len = sizeof(struct vxlan_mdb_entry_key), .automatic_shrinking = true, }; static int __vxlan_mdb_add(const struct vxlan_mdb_config *cfg, struct netlink_ext_ack *extack); static int __vxlan_mdb_del(const struct vxlan_mdb_config *cfg, struct netlink_ext_ack *extack); static void vxlan_br_mdb_entry_fill(const struct vxlan_dev *vxlan, const struct vxlan_mdb_entry *mdb_entry, const struct vxlan_mdb_remote *remote, struct br_mdb_entry *e) { const union vxlan_addr *dst = &mdb_entry->key.dst; memset(e, 0, sizeof(*e)); e->ifindex = vxlan->dev->ifindex; e->state = MDB_PERMANENT; if (remote->flags & VXLAN_MDB_REMOTE_F_BLOCKED) e->flags |= MDB_FLAGS_BLOCKED; switch (dst->sa.sa_family) { case AF_INET: e->addr.u.ip4 = dst->sin.sin_addr.s_addr; e->addr.proto = htons(ETH_P_IP); break; #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: e->addr.u.ip6 = dst->sin6.sin6_addr; e->addr.proto = htons(ETH_P_IPV6); break; #endif } } static int vxlan_mdb_entry_info_fill_srcs(struct sk_buff *skb, const struct vxlan_mdb_remote *remote) { struct vxlan_mdb_src_entry *ent; struct nlattr *nest; if (hlist_empty(&remote->src_list)) return 0; nest = nla_nest_start(skb, MDBA_MDB_EATTR_SRC_LIST); if (!nest) return -EMSGSIZE; hlist_for_each_entry(ent, &remote->src_list, node) { struct nlattr *nest_ent; nest_ent = nla_nest_start(skb, MDBA_MDB_SRCLIST_ENTRY); if (!nest_ent) goto out_cancel_err; if (vxlan_nla_put_addr(skb, MDBA_MDB_SRCATTR_ADDRESS, &ent->addr) || nla_put_u32(skb, MDBA_MDB_SRCATTR_TIMER, 0)) goto out_cancel_err; nla_nest_end(skb, nest_ent); } nla_nest_end(skb, nest); return 0; out_cancel_err: nla_nest_cancel(skb, nest); return -EMSGSIZE; } static int vxlan_mdb_entry_info_fill(const struct vxlan_dev *vxlan, struct sk_buff *skb, const struct vxlan_mdb_entry *mdb_entry, const struct vxlan_mdb_remote *remote) { struct vxlan_rdst *rd = rtnl_dereference(remote->rd); struct br_mdb_entry e; struct nlattr *nest; nest = nla_nest_start_noflag(skb, MDBA_MDB_ENTRY_INFO); if (!nest) return -EMSGSIZE; vxlan_br_mdb_entry_fill(vxlan, mdb_entry, remote, &e); if (nla_put_nohdr(skb, sizeof(e), &e) || nla_put_u32(skb, MDBA_MDB_EATTR_TIMER, 0)) goto nest_err; if (!vxlan_addr_any(&mdb_entry->key.src) && vxlan_nla_put_addr(skb, MDBA_MDB_EATTR_SOURCE, &mdb_entry->key.src)) goto nest_err; if (nla_put_u8(skb, MDBA_MDB_EATTR_RTPROT, remote->rt_protocol) || nla_put_u8(skb, MDBA_MDB_EATTR_GROUP_MODE, remote->filter_mode) || vxlan_mdb_entry_info_fill_srcs(skb, remote) || vxlan_nla_put_addr(skb, MDBA_MDB_EATTR_DST, &rd->remote_ip)) goto nest_err; if (rd->remote_port && rd->remote_port != vxlan->cfg.dst_port && nla_put_u16(skb, MDBA_MDB_EATTR_DST_PORT, be16_to_cpu(rd->remote_port))) goto nest_err; if (rd->remote_vni != vxlan->default_dst.remote_vni && nla_put_u32(skb, MDBA_MDB_EATTR_VNI, be32_to_cpu(rd->remote_vni))) goto nest_err; if (rd->remote_ifindex && nla_put_u32(skb, MDBA_MDB_EATTR_IFINDEX, rd->remote_ifindex)) goto nest_err; if ((vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA) && mdb_entry->key.vni && nla_put_u32(skb, MDBA_MDB_EATTR_SRC_VNI, be32_to_cpu(mdb_entry->key.vni))) goto nest_err; nla_nest_end(skb, nest); return 0; nest_err: nla_nest_cancel(skb, nest); return -EMSGSIZE; } static int vxlan_mdb_entry_fill(const struct vxlan_dev *vxlan, struct sk_buff *skb, struct vxlan_mdb_dump_ctx *ctx, const struct vxlan_mdb_entry *mdb_entry) { int remote_idx = 0, s_remote_idx = ctx->remote_idx; struct vxlan_mdb_remote *remote; struct nlattr *nest; int err = 0; nest = nla_nest_start_noflag(skb, MDBA_MDB_ENTRY); if (!nest) return -EMSGSIZE; list_for_each_entry(remote, &mdb_entry->remotes, list) { if (remote_idx < s_remote_idx) goto skip; err = vxlan_mdb_entry_info_fill(vxlan, skb, mdb_entry, remote); if (err) break; skip: remote_idx++; } ctx->remote_idx = err ? remote_idx : 0; nla_nest_end(skb, nest); return err; } static int vxlan_mdb_fill(const struct vxlan_dev *vxlan, struct sk_buff *skb, struct vxlan_mdb_dump_ctx *ctx) { int entry_idx = 0, s_entry_idx = ctx->entry_idx; struct vxlan_mdb_entry *mdb_entry; struct nlattr *nest; int err = 0; nest = nla_nest_start_noflag(skb, MDBA_MDB); if (!nest) return -EMSGSIZE; hlist_for_each_entry(mdb_entry, &vxlan->mdb_list, mdb_node) { if (entry_idx < s_entry_idx) goto skip; err = vxlan_mdb_entry_fill(vxlan, skb, ctx, mdb_entry); if (err) break; skip: entry_idx++; } ctx->entry_idx = err ? entry_idx : 0; nla_nest_end(skb, nest); return err; } int vxlan_mdb_dump(struct net_device *dev, struct sk_buff *skb, struct netlink_callback *cb) { struct vxlan_mdb_dump_ctx *ctx = (void *)cb->ctx; struct vxlan_dev *vxlan = netdev_priv(dev); struct br_port_msg *bpm; struct nlmsghdr *nlh; int err; ASSERT_RTNL(); NL_ASSERT_CTX_FITS(struct vxlan_mdb_dump_ctx); nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, RTM_NEWMDB, sizeof(*bpm), NLM_F_MULTI); if (!nlh) return -EMSGSIZE; bpm = nlmsg_data(nlh); memset(bpm, 0, sizeof(*bpm)); bpm->family = AF_BRIDGE; bpm->ifindex = dev->ifindex; err = vxlan_mdb_fill(vxlan, skb, ctx); nlmsg_end(skb, nlh); cb->seq = vxlan->mdb_seq; nl_dump_check_consistent(cb, nlh); return err; } static const struct nla_policy vxlan_mdbe_src_list_entry_pol[MDBE_SRCATTR_MAX + 1] = { [MDBE_SRCATTR_ADDRESS] = NLA_POLICY_RANGE(NLA_BINARY, sizeof(struct in_addr), sizeof(struct in6_addr)), }; static const struct nla_policy vxlan_mdbe_src_list_pol[MDBE_SRC_LIST_MAX + 1] = { [MDBE_SRC_LIST_ENTRY] = NLA_POLICY_NESTED(vxlan_mdbe_src_list_entry_pol), }; static const struct netlink_range_validation vni_range = { .max = VXLAN_N_VID - 1, }; static const struct nla_policy vxlan_mdbe_attrs_pol[MDBE_ATTR_MAX + 1] = { [MDBE_ATTR_SOURCE] = NLA_POLICY_RANGE(NLA_BINARY, sizeof(struct in_addr), sizeof(struct in6_addr)), [MDBE_ATTR_GROUP_MODE] = NLA_POLICY_RANGE(NLA_U8, MCAST_EXCLUDE, MCAST_INCLUDE), [MDBE_ATTR_SRC_LIST] = NLA_POLICY_NESTED(vxlan_mdbe_src_list_pol), [MDBE_ATTR_RTPROT] = NLA_POLICY_MIN(NLA_U8, RTPROT_STATIC), [MDBE_ATTR_DST] = NLA_POLICY_RANGE(NLA_BINARY, sizeof(struct in_addr), sizeof(struct in6_addr)), [MDBE_ATTR_DST_PORT] = { .type = NLA_U16 }, [MDBE_ATTR_VNI] = NLA_POLICY_FULL_RANGE(NLA_U32, &vni_range), [MDBE_ATTR_IFINDEX] = NLA_POLICY_MIN(NLA_S32, 1), [MDBE_ATTR_SRC_VNI] = NLA_POLICY_FULL_RANGE(NLA_U32, &vni_range), }; static bool vxlan_mdb_is_valid_source(const struct nlattr *attr, __be16 proto, struct netlink_ext_ack *extack) { switch (proto) { case htons(ETH_P_IP): if (nla_len(attr) != sizeof(struct in_addr)) { NL_SET_ERR_MSG_MOD(extack, "IPv4 invalid source address length"); return false; } if (ipv4_is_multicast(nla_get_in_addr(attr))) { NL_SET_ERR_MSG_MOD(extack, "IPv4 multicast source address is not allowed"); return false; } break; #if IS_ENABLED(CONFIG_IPV6) case htons(ETH_P_IPV6): { struct in6_addr src; if (nla_len(attr) != sizeof(struct in6_addr)) { NL_SET_ERR_MSG_MOD(extack, "IPv6 invalid source address length"); return false; } src = nla_get_in6_addr(attr); if (ipv6_addr_is_multicast(&src)) { NL_SET_ERR_MSG_MOD(extack, "IPv6 multicast source address is not allowed"); return false; } break; } #endif default: NL_SET_ERR_MSG_MOD(extack, "Invalid protocol used with source address"); return false; } return true; } static void vxlan_mdb_group_set(struct vxlan_mdb_entry_key *group, const struct br_mdb_entry *entry, const struct nlattr *source_attr) { switch (entry->addr.proto) { case htons(ETH_P_IP): group->dst.sa.sa_family = AF_INET; group->dst.sin.sin_addr.s_addr = entry->addr.u.ip4; break; #if IS_ENABLED(CONFIG_IPV6) case htons(ETH_P_IPV6): group->dst.sa.sa_family = AF_INET6; group->dst.sin6.sin6_addr = entry->addr.u.ip6; break; #endif } if (source_attr) vxlan_nla_get_addr(&group->src, source_attr); } static bool vxlan_mdb_is_star_g(const struct vxlan_mdb_entry_key *group) { return !vxlan_addr_any(&group->dst) && vxlan_addr_any(&group->src); } static bool vxlan_mdb_is_sg(const struct vxlan_mdb_entry_key *group) { return !vxlan_addr_any(&group->dst) && !vxlan_addr_any(&group->src); } static int vxlan_mdb_config_src_entry_init(struct vxlan_mdb_config *cfg, __be16 proto, const struct nlattr *src_entry, struct netlink_ext_ack *extack) { struct nlattr *tb[MDBE_SRCATTR_MAX + 1]; struct vxlan_mdb_config_src_entry *src; int err; err = nla_parse_nested(tb, MDBE_SRCATTR_MAX, src_entry, vxlan_mdbe_src_list_entry_pol, extack); if (err) return err; if (NL_REQ_ATTR_CHECK(extack, src_entry, tb, MDBE_SRCATTR_ADDRESS)) return -EINVAL; if (!vxlan_mdb_is_valid_source(tb[MDBE_SRCATTR_ADDRESS], proto, extack)) return -EINVAL; src = kzalloc(sizeof(*src), GFP_KERNEL); if (!src) return -ENOMEM; err = vxlan_nla_get_addr(&src->addr, tb[MDBE_SRCATTR_ADDRESS]); if (err) goto err_free_src; list_add_tail(&src->node, &cfg->src_list); return 0; err_free_src: kfree(src); return err; } static void vxlan_mdb_config_src_entry_fini(struct vxlan_mdb_config_src_entry *src) { list_del(&src->node); kfree(src); } static int vxlan_mdb_config_src_list_init(struct vxlan_mdb_config *cfg, __be16 proto, const struct nlattr *src_list, struct netlink_ext_ack *extack) { struct vxlan_mdb_config_src_entry *src, *tmp; struct nlattr *src_entry; int rem, err; nla_for_each_nested(src_entry, src_list, rem) { err = vxlan_mdb_config_src_entry_init(cfg, proto, src_entry, extack); if (err) goto err_src_entry_init; } return 0; err_src_entry_init: list_for_each_entry_safe_reverse(src, tmp, &cfg->src_list, node) vxlan_mdb_config_src_entry_fini(src); return err; } static void vxlan_mdb_config_src_list_fini(struct vxlan_mdb_config *cfg) { struct vxlan_mdb_config_src_entry *src, *tmp; list_for_each_entry_safe_reverse(src, tmp, &cfg->src_list, node) vxlan_mdb_config_src_entry_fini(src); } static int vxlan_mdb_config_attrs_init(struct vxlan_mdb_config *cfg, const struct br_mdb_entry *entry, const struct nlattr *set_attrs, struct netlink_ext_ack *extack) { struct nlattr *mdbe_attrs[MDBE_ATTR_MAX + 1]; int err; err = nla_parse_nested(mdbe_attrs, MDBE_ATTR_MAX, set_attrs, vxlan_mdbe_attrs_pol, extack); if (err) return err; if (NL_REQ_ATTR_CHECK(extack, set_attrs, mdbe_attrs, MDBE_ATTR_DST)) { NL_SET_ERR_MSG_MOD(extack, "Missing remote destination IP address"); return -EINVAL; } if (mdbe_attrs[MDBE_ATTR_SOURCE] && !vxlan_mdb_is_valid_source(mdbe_attrs[MDBE_ATTR_SOURCE], entry->addr.proto, extack)) return -EINVAL; vxlan_mdb_group_set(&cfg->group, entry, mdbe_attrs[MDBE_ATTR_SOURCE]); /* rtnetlink code only validates that IPv4 group address is * multicast. */ if (!vxlan_addr_is_multicast(&cfg->group.dst) && !vxlan_addr_any(&cfg->group.dst)) { NL_SET_ERR_MSG_MOD(extack, "Group address is not multicast"); return -EINVAL; } if (vxlan_addr_any(&cfg->group.dst) && mdbe_attrs[MDBE_ATTR_SOURCE]) { NL_SET_ERR_MSG_MOD(extack, "Source cannot be specified for the all-zeros entry"); return -EINVAL; } if (vxlan_mdb_is_sg(&cfg->group)) cfg->filter_mode = MCAST_INCLUDE; if (mdbe_attrs[MDBE_ATTR_GROUP_MODE]) { if (!vxlan_mdb_is_star_g(&cfg->group)) { NL_SET_ERR_MSG_MOD(extack, "Filter mode can only be set for (*, G) entries"); return -EINVAL; } cfg->filter_mode = nla_get_u8(mdbe_attrs[MDBE_ATTR_GROUP_MODE]); } if (mdbe_attrs[MDBE_ATTR_SRC_LIST]) { if (!vxlan_mdb_is_star_g(&cfg->group)) { NL_SET_ERR_MSG_MOD(extack, "Source list can only be set for (*, G) entries"); return -EINVAL; } if (!mdbe_attrs[MDBE_ATTR_GROUP_MODE]) { NL_SET_ERR_MSG_MOD(extack, "Source list cannot be set without filter mode"); return -EINVAL; } err = vxlan_mdb_config_src_list_init(cfg, entry->addr.proto, mdbe_attrs[MDBE_ATTR_SRC_LIST], extack); if (err) return err; } if (vxlan_mdb_is_star_g(&cfg->group) && list_empty(&cfg->src_list) && cfg->filter_mode == MCAST_INCLUDE) { NL_SET_ERR_MSG_MOD(extack, "Cannot add (*, G) INCLUDE with an empty source list"); return -EINVAL; } if (mdbe_attrs[MDBE_ATTR_RTPROT]) cfg->rt_protocol = nla_get_u8(mdbe_attrs[MDBE_ATTR_RTPROT]); err = vxlan_nla_get_addr(&cfg->remote_ip, mdbe_attrs[MDBE_ATTR_DST]); if (err) { NL_SET_ERR_MSG_MOD(extack, "Invalid remote destination address"); goto err_src_list_fini; } if (mdbe_attrs[MDBE_ATTR_DST_PORT]) cfg->remote_port = cpu_to_be16(nla_get_u16(mdbe_attrs[MDBE_ATTR_DST_PORT])); if (mdbe_attrs[MDBE_ATTR_VNI]) cfg->remote_vni = cpu_to_be32(nla_get_u32(mdbe_attrs[MDBE_ATTR_VNI])); if (mdbe_attrs[MDBE_ATTR_IFINDEX]) { cfg->remote_ifindex = nla_get_s32(mdbe_attrs[MDBE_ATTR_IFINDEX]); if (!__dev_get_by_index(cfg->vxlan->net, cfg->remote_ifindex)) { NL_SET_ERR_MSG_MOD(extack, "Outgoing interface not found"); err = -EINVAL; goto err_src_list_fini; } } if (mdbe_attrs[MDBE_ATTR_SRC_VNI]) cfg->group.vni = cpu_to_be32(nla_get_u32(mdbe_attrs[MDBE_ATTR_SRC_VNI])); return 0; err_src_list_fini: vxlan_mdb_config_src_list_fini(cfg); return err; } static int vxlan_mdb_config_init(struct vxlan_mdb_config *cfg, struct net_device *dev, struct nlattr *tb[], u16 nlmsg_flags, struct netlink_ext_ack *extack) { struct br_mdb_entry *entry = nla_data(tb[MDBA_SET_ENTRY]); struct vxlan_dev *vxlan = netdev_priv(dev); memset(cfg, 0, sizeof(*cfg)); cfg->vxlan = vxlan; cfg->group.vni = vxlan->default_dst.remote_vni; INIT_LIST_HEAD(&cfg->src_list); cfg->nlflags = nlmsg_flags; cfg->filter_mode = MCAST_EXCLUDE; cfg->rt_protocol = RTPROT_STATIC; cfg->remote_vni = vxlan->default_dst.remote_vni; cfg->remote_port = vxlan->cfg.dst_port; if (entry->ifindex != dev->ifindex) { NL_SET_ERR_MSG_MOD(extack, "Port net device must be the VXLAN net device"); return -EINVAL; } /* State is not part of the entry key and can be ignored on deletion * requests. */ if ((nlmsg_flags & (NLM_F_CREATE | NLM_F_REPLACE)) && entry->state != MDB_PERMANENT) { NL_SET_ERR_MSG_MOD(extack, "MDB entry must be permanent"); return -EINVAL; } if (entry->flags) { NL_SET_ERR_MSG_MOD(extack, "Invalid MDB entry flags"); return -EINVAL; } if (entry->vid) { NL_SET_ERR_MSG_MOD(extack, "VID must not be specified"); return -EINVAL; } if (entry->addr.proto != htons(ETH_P_IP) && entry->addr.proto != htons(ETH_P_IPV6)) { NL_SET_ERR_MSG_MOD(extack, "Group address must be an IPv4 / IPv6 address"); return -EINVAL; } if (NL_REQ_ATTR_CHECK(extack, NULL, tb, MDBA_SET_ENTRY_ATTRS)) { NL_SET_ERR_MSG_MOD(extack, "Missing MDBA_SET_ENTRY_ATTRS attribute"); return -EINVAL; } return vxlan_mdb_config_attrs_init(cfg, entry, tb[MDBA_SET_ENTRY_ATTRS], extack); } static void vxlan_mdb_config_fini(struct vxlan_mdb_config *cfg) { vxlan_mdb_config_src_list_fini(cfg); } static struct vxlan_mdb_entry * vxlan_mdb_entry_lookup(struct vxlan_dev *vxlan, const struct vxlan_mdb_entry_key *group) { return rhashtable_lookup_fast(&vxlan->mdb_tbl, group, vxlan_mdb_rht_params); } static struct vxlan_mdb_remote * vxlan_mdb_remote_lookup(const struct vxlan_mdb_entry *mdb_entry, const union vxlan_addr *addr) { struct vxlan_mdb_remote *remote; list_for_each_entry(remote, &mdb_entry->remotes, list) { struct vxlan_rdst *rd = rtnl_dereference(remote->rd); if (vxlan_addr_equal(addr, &rd->remote_ip)) return remote; } return NULL; } static void vxlan_mdb_rdst_free(struct rcu_head *head) { struct vxlan_rdst *rd = container_of(head, struct vxlan_rdst, rcu); dst_cache_destroy(&rd->dst_cache); kfree(rd); } static int vxlan_mdb_remote_rdst_init(const struct vxlan_mdb_config *cfg, struct vxlan_mdb_remote *remote) { struct vxlan_rdst *rd; int err; rd = kzalloc(sizeof(*rd), GFP_KERNEL); if (!rd) return -ENOMEM; err = dst_cache_init(&rd->dst_cache, GFP_KERNEL); if (err) goto err_free_rdst; rd->remote_ip = cfg->remote_ip; rd->remote_port = cfg->remote_port; rd->remote_vni = cfg->remote_vni; rd->remote_ifindex = cfg->remote_ifindex; rcu_assign_pointer(remote->rd, rd); return 0; err_free_rdst: kfree(rd); return err; } static void vxlan_mdb_remote_rdst_fini(struct vxlan_rdst *rd) { call_rcu(&rd->rcu, vxlan_mdb_rdst_free); } static int vxlan_mdb_remote_init(const struct vxlan_mdb_config *cfg, struct vxlan_mdb_remote *remote) { int err; err = vxlan_mdb_remote_rdst_init(cfg, remote); if (err) return err; remote->flags = cfg->flags; remote->filter_mode = cfg->filter_mode; remote->rt_protocol = cfg->rt_protocol; INIT_HLIST_HEAD(&remote->src_list); return 0; } static void vxlan_mdb_remote_fini(struct vxlan_dev *vxlan, struct vxlan_mdb_remote *remote) { WARN_ON_ONCE(!hlist_empty(&remote->src_list)); vxlan_mdb_remote_rdst_fini(rtnl_dereference(remote->rd)); } static struct vxlan_mdb_src_entry * vxlan_mdb_remote_src_entry_lookup(const struct vxlan_mdb_remote *remote, const union vxlan_addr *addr) { struct vxlan_mdb_src_entry *ent; hlist_for_each_entry(ent, &remote->src_list, node) { if (vxlan_addr_equal(&ent->addr, addr)) return ent; } return NULL; } static struct vxlan_mdb_src_entry * vxlan_mdb_remote_src_entry_add(struct vxlan_mdb_remote *remote, const union vxlan_addr *addr) { struct vxlan_mdb_src_entry *ent; ent = kzalloc(sizeof(*ent), GFP_KERNEL); if (!ent) return NULL; ent->addr = *addr; hlist_add_head(&ent->node, &remote->src_list); return ent; } static void vxlan_mdb_remote_src_entry_del(struct vxlan_mdb_src_entry *ent) { hlist_del(&ent->node); kfree(ent); } static int vxlan_mdb_remote_src_fwd_add(const struct vxlan_mdb_config *cfg, const union vxlan_addr *addr, struct netlink_ext_ack *extack) { struct vxlan_mdb_config sg_cfg; memset(&sg_cfg, 0, sizeof(sg_cfg)); sg_cfg.vxlan = cfg->vxlan; sg_cfg.group.src = *addr; sg_cfg.group.dst = cfg->group.dst; sg_cfg.group.vni = cfg->group.vni; INIT_LIST_HEAD(&sg_cfg.src_list); sg_cfg.remote_ip = cfg->remote_ip; sg_cfg.remote_ifindex = cfg->remote_ifindex; sg_cfg.remote_vni = cfg->remote_vni; sg_cfg.remote_port = cfg->remote_port; sg_cfg.nlflags = cfg->nlflags; sg_cfg.filter_mode = MCAST_INCLUDE; if (cfg->filter_mode == MCAST_EXCLUDE) sg_cfg.flags = VXLAN_MDB_REMOTE_F_BLOCKED; sg_cfg.rt_protocol = cfg->rt_protocol; return __vxlan_mdb_add(&sg_cfg, extack); } static void vxlan_mdb_remote_src_fwd_del(struct vxlan_dev *vxlan, const struct vxlan_mdb_entry_key *group, const struct vxlan_mdb_remote *remote, const union vxlan_addr *addr) { struct vxlan_rdst *rd = rtnl_dereference(remote->rd); struct vxlan_mdb_config sg_cfg; memset(&sg_cfg, 0, sizeof(sg_cfg)); sg_cfg.vxlan = vxlan; sg_cfg.group.src = *addr; sg_cfg.group.dst = group->dst; sg_cfg.group.vni = group->vni; INIT_LIST_HEAD(&sg_cfg.src_list); sg_cfg.remote_ip = rd->remote_ip; __vxlan_mdb_del(&sg_cfg, NULL); } static int vxlan_mdb_remote_src_add(const struct vxlan_mdb_config *cfg, struct vxlan_mdb_remote *remote, const struct vxlan_mdb_config_src_entry *src, struct netlink_ext_ack *extack) { struct vxlan_mdb_src_entry *ent; int err; ent = vxlan_mdb_remote_src_entry_lookup(remote, &src->addr); if (!ent) { ent = vxlan_mdb_remote_src_entry_add(remote, &src->addr); if (!ent) return -ENOMEM; } else if (!(cfg->nlflags & NLM_F_REPLACE)) { NL_SET_ERR_MSG_MOD(extack, "Source entry already exists"); return -EEXIST; } err = vxlan_mdb_remote_src_fwd_add(cfg, &ent->addr, extack); if (err) goto err_src_del; /* Clear flags in case source entry was marked for deletion as part of * replace flow. */ ent->flags = 0; return 0; err_src_del: vxlan_mdb_remote_src_entry_del(ent); return err; } static void vxlan_mdb_remote_src_del(struct vxlan_dev *vxlan, const struct vxlan_mdb_entry_key *group, const struct vxlan_mdb_remote *remote, struct vxlan_mdb_src_entry *ent) { vxlan_mdb_remote_src_fwd_del(vxlan, group, remote, &ent->addr); vxlan_mdb_remote_src_entry_del(ent); } static int vxlan_mdb_remote_srcs_add(const struct vxlan_mdb_config *cfg, struct vxlan_mdb_remote *remote, struct netlink_ext_ack *extack) { struct vxlan_mdb_config_src_entry *src; struct vxlan_mdb_src_entry *ent; struct hlist_node *tmp; int err; list_for_each_entry(src, &cfg->src_list, node) { err = vxlan_mdb_remote_src_add(cfg, remote, src, extack); if (err) goto err_src_del; } return 0; err_src_del: hlist_for_each_entry_safe(ent, tmp, &remote->src_list, node) vxlan_mdb_remote_src_del(cfg->vxlan, &cfg->group, remote, ent); return err; } static void vxlan_mdb_remote_srcs_del(struct vxlan_dev *vxlan, const struct vxlan_mdb_entry_key *group, struct vxlan_mdb_remote *remote) { struct vxlan_mdb_src_entry *ent; struct hlist_node *tmp; hlist_for_each_entry_safe(ent, tmp, &remote->src_list, node) vxlan_mdb_remote_src_del(vxlan, group, remote, ent); } static size_t vxlan_mdb_nlmsg_src_list_size(const struct vxlan_mdb_entry_key *group, const struct vxlan_mdb_remote *remote) { struct vxlan_mdb_src_entry *ent; size_t nlmsg_size; if (hlist_empty(&remote->src_list)) return 0; /* MDBA_MDB_EATTR_SRC_LIST */ nlmsg_size = nla_total_size(0); hlist_for_each_entry(ent, &remote->src_list, node) { /* MDBA_MDB_SRCLIST_ENTRY */ nlmsg_size += nla_total_size(0) + /* MDBA_MDB_SRCATTR_ADDRESS */ nla_total_size(vxlan_addr_size(&group->dst)) + /* MDBA_MDB_SRCATTR_TIMER */ nla_total_size(sizeof(u8)); } return nlmsg_size; } static size_t vxlan_mdb_nlmsg_remote_size(const struct vxlan_dev *vxlan, const struct vxlan_mdb_entry *mdb_entry, const struct vxlan_mdb_remote *remote) { const struct vxlan_mdb_entry_key *group = &mdb_entry->key; struct vxlan_rdst *rd = rtnl_dereference(remote->rd); size_t nlmsg_size; /* MDBA_MDB_ENTRY_INFO */ nlmsg_size = nla_total_size(sizeof(struct br_mdb_entry)) + /* MDBA_MDB_EATTR_TIMER */ nla_total_size(sizeof(u32)); /* MDBA_MDB_EATTR_SOURCE */ if (vxlan_mdb_is_sg(group)) nlmsg_size += nla_total_size(vxlan_addr_size(&group->dst)); /* MDBA_MDB_EATTR_RTPROT */ nlmsg_size += nla_total_size(sizeof(u8)); /* MDBA_MDB_EATTR_SRC_LIST */ nlmsg_size += vxlan_mdb_nlmsg_src_list_size(group, remote); /* MDBA_MDB_EATTR_GROUP_MODE */ nlmsg_size += nla_total_size(sizeof(u8)); /* MDBA_MDB_EATTR_DST */ nlmsg_size += nla_total_size(vxlan_addr_size(&rd->remote_ip)); /* MDBA_MDB_EATTR_DST_PORT */ if (rd->remote_port && rd->remote_port != vxlan->cfg.dst_port) nlmsg_size += nla_total_size(sizeof(u16)); /* MDBA_MDB_EATTR_VNI */ if (rd->remote_vni != vxlan->default_dst.remote_vni) nlmsg_size += nla_total_size(sizeof(u32)); /* MDBA_MDB_EATTR_IFINDEX */ if (rd->remote_ifindex) nlmsg_size += nla_total_size(sizeof(u32)); /* MDBA_MDB_EATTR_SRC_VNI */ if ((vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA) && group->vni) nlmsg_size += nla_total_size(sizeof(u32)); return nlmsg_size; } static size_t vxlan_mdb_nlmsg_size(const struct vxlan_dev *vxlan, const struct vxlan_mdb_entry *mdb_entry, const struct vxlan_mdb_remote *remote) { return NLMSG_ALIGN(sizeof(struct br_port_msg)) + /* MDBA_MDB */ nla_total_size(0) + /* MDBA_MDB_ENTRY */ nla_total_size(0) + /* Remote entry */ vxlan_mdb_nlmsg_remote_size(vxlan, mdb_entry, remote); } static int vxlan_mdb_nlmsg_fill(const struct vxlan_dev *vxlan, struct sk_buff *skb, const struct vxlan_mdb_entry *mdb_entry, const struct vxlan_mdb_remote *remote, int type) { struct nlattr *mdb_nest, *mdb_entry_nest; struct br_port_msg *bpm; struct nlmsghdr *nlh; nlh = nlmsg_put(skb, 0, 0, type, sizeof(*bpm), 0); if (!nlh) return -EMSGSIZE; bpm = nlmsg_data(nlh); memset(bpm, 0, sizeof(*bpm)); bpm->family = AF_BRIDGE; bpm->ifindex = vxlan->dev->ifindex; mdb_nest = nla_nest_start_noflag(skb, MDBA_MDB); if (!mdb_nest) goto cancel; mdb_entry_nest = nla_nest_start_noflag(skb, MDBA_MDB_ENTRY); if (!mdb_entry_nest) goto cancel; if (vxlan_mdb_entry_info_fill(vxlan, skb, mdb_entry, remote)) goto cancel; nla_nest_end(skb, mdb_entry_nest); nla_nest_end(skb, mdb_nest); nlmsg_end(skb, nlh); return 0; cancel: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } static void vxlan_mdb_remote_notify(const struct vxlan_dev *vxlan, const struct vxlan_mdb_entry *mdb_entry, const struct vxlan_mdb_remote *remote, int type) { struct net *net = dev_net(vxlan->dev); struct sk_buff *skb; int err = -ENOBUFS; skb = nlmsg_new(vxlan_mdb_nlmsg_size(vxlan, mdb_entry, remote), GFP_KERNEL); if (!skb) goto errout; err = vxlan_mdb_nlmsg_fill(vxlan, skb, mdb_entry, remote, type); if (err) { kfree_skb(skb); goto errout; } rtnl_notify(skb, net, 0, RTNLGRP_MDB, NULL, GFP_KERNEL); return; errout: rtnl_set_sk_err(net, RTNLGRP_MDB, err); } static int vxlan_mdb_remote_srcs_replace(const struct vxlan_mdb_config *cfg, const struct vxlan_mdb_entry *mdb_entry, struct vxlan_mdb_remote *remote, struct netlink_ext_ack *extack) { struct vxlan_dev *vxlan = cfg->vxlan; struct vxlan_mdb_src_entry *ent; struct hlist_node *tmp; int err; hlist_for_each_entry(ent, &remote->src_list, node) ent->flags |= VXLAN_SGRP_F_DELETE; err = vxlan_mdb_remote_srcs_add(cfg, remote, extack); if (err) goto err_clear_delete; hlist_for_each_entry_safe(ent, tmp, &remote->src_list, node) { if (ent->flags & VXLAN_SGRP_F_DELETE) vxlan_mdb_remote_src_del(vxlan, &mdb_entry->key, remote, ent); } return 0; err_clear_delete: hlist_for_each_entry(ent, &remote->src_list, node) ent->flags &= ~VXLAN_SGRP_F_DELETE; return err; } static int vxlan_mdb_remote_replace(const struct vxlan_mdb_config *cfg, const struct vxlan_mdb_entry *mdb_entry, struct vxlan_mdb_remote *remote, struct netlink_ext_ack *extack) { struct vxlan_rdst *new_rd, *old_rd = rtnl_dereference(remote->rd); struct vxlan_dev *vxlan = cfg->vxlan; int err; err = vxlan_mdb_remote_rdst_init(cfg, remote); if (err) return err; new_rd = rtnl_dereference(remote->rd); err = vxlan_mdb_remote_srcs_replace(cfg, mdb_entry, remote, extack); if (err) goto err_rdst_reset; WRITE_ONCE(remote->flags, cfg->flags); WRITE_ONCE(remote->filter_mode, cfg->filter_mode); remote->rt_protocol = cfg->rt_protocol; vxlan_mdb_remote_notify(vxlan, mdb_entry, remote, RTM_NEWMDB); vxlan_mdb_remote_rdst_fini(old_rd); return 0; err_rdst_reset: rcu_assign_pointer(remote->rd, old_rd); vxlan_mdb_remote_rdst_fini(new_rd); return err; } static int vxlan_mdb_remote_add(const struct vxlan_mdb_config *cfg, struct vxlan_mdb_entry *mdb_entry, struct netlink_ext_ack *extack) { struct vxlan_mdb_remote *remote; int err; remote = vxlan_mdb_remote_lookup(mdb_entry, &cfg->remote_ip); if (remote) { if (!(cfg->nlflags & NLM_F_REPLACE)) { NL_SET_ERR_MSG_MOD(extack, "Replace not specified and MDB remote entry already exists"); return -EEXIST; } return vxlan_mdb_remote_replace(cfg, mdb_entry, remote, extack); } if (!(cfg->nlflags & NLM_F_CREATE)) { NL_SET_ERR_MSG_MOD(extack, "Create not specified and entry does not exist"); return -ENOENT; } remote = kzalloc(sizeof(*remote), GFP_KERNEL); if (!remote) return -ENOMEM; err = vxlan_mdb_remote_init(cfg, remote); if (err) { NL_SET_ERR_MSG_MOD(extack, "Failed to initialize remote MDB entry"); goto err_free_remote; } err = vxlan_mdb_remote_srcs_add(cfg, remote, extack); if (err) goto err_remote_fini; list_add_rcu(&remote->list, &mdb_entry->remotes); vxlan_mdb_remote_notify(cfg->vxlan, mdb_entry, remote, RTM_NEWMDB); return 0; err_remote_fini: vxlan_mdb_remote_fini(cfg->vxlan, remote); err_free_remote: kfree(remote); return err; } static void vxlan_mdb_remote_del(struct vxlan_dev *vxlan, struct vxlan_mdb_entry *mdb_entry, struct vxlan_mdb_remote *remote) { vxlan_mdb_remote_notify(vxlan, mdb_entry, remote, RTM_DELMDB); list_del_rcu(&remote->list); vxlan_mdb_remote_srcs_del(vxlan, &mdb_entry->key, remote); vxlan_mdb_remote_fini(vxlan, remote); kfree_rcu(remote, rcu); } static struct vxlan_mdb_entry * vxlan_mdb_entry_get(struct vxlan_dev *vxlan, const struct vxlan_mdb_entry_key *group) { struct vxlan_mdb_entry *mdb_entry; int err; mdb_entry = vxlan_mdb_entry_lookup(vxlan, group); if (mdb_entry) return mdb_entry; mdb_entry = kzalloc(sizeof(*mdb_entry), GFP_KERNEL); if (!mdb_entry) return ERR_PTR(-ENOMEM); INIT_LIST_HEAD(&mdb_entry->remotes); memcpy(&mdb_entry->key, group, sizeof(mdb_entry->key)); hlist_add_head(&mdb_entry->mdb_node, &vxlan->mdb_list); err = rhashtable_lookup_insert_fast(&vxlan->mdb_tbl, &mdb_entry->rhnode, vxlan_mdb_rht_params); if (err) goto err_free_entry; if (hlist_is_singular_node(&mdb_entry->mdb_node, &vxlan->mdb_list)) vxlan->cfg.flags |= VXLAN_F_MDB; return mdb_entry; err_free_entry: hlist_del(&mdb_entry->mdb_node); kfree(mdb_entry); return ERR_PTR(err); } static void vxlan_mdb_entry_put(struct vxlan_dev *vxlan, struct vxlan_mdb_entry *mdb_entry) { if (!list_empty(&mdb_entry->remotes)) return; if (hlist_is_singular_node(&mdb_entry->mdb_node, &vxlan->mdb_list)) vxlan->cfg.flags &= ~VXLAN_F_MDB; rhashtable_remove_fast(&vxlan->mdb_tbl, &mdb_entry->rhnode, vxlan_mdb_rht_params); hlist_del(&mdb_entry->mdb_node); kfree_rcu(mdb_entry, rcu); } static int __vxlan_mdb_add(const struct vxlan_mdb_config *cfg, struct netlink_ext_ack *extack) { struct vxlan_dev *vxlan = cfg->vxlan; struct vxlan_mdb_entry *mdb_entry; int err; mdb_entry = vxlan_mdb_entry_get(vxlan, &cfg->group); if (IS_ERR(mdb_entry)) return PTR_ERR(mdb_entry); err = vxlan_mdb_remote_add(cfg, mdb_entry, extack); if (err) goto err_entry_put; vxlan->mdb_seq++; return 0; err_entry_put: vxlan_mdb_entry_put(vxlan, mdb_entry); return err; } static int __vxlan_mdb_del(const struct vxlan_mdb_config *cfg, struct netlink_ext_ack *extack) { struct vxlan_dev *vxlan = cfg->vxlan; struct vxlan_mdb_entry *mdb_entry; struct vxlan_mdb_remote *remote; mdb_entry = vxlan_mdb_entry_lookup(vxlan, &cfg->group); if (!mdb_entry) { NL_SET_ERR_MSG_MOD(extack, "Did not find MDB entry"); return -ENOENT; } remote = vxlan_mdb_remote_lookup(mdb_entry, &cfg->remote_ip); if (!remote) { NL_SET_ERR_MSG_MOD(extack, "Did not find MDB remote entry"); return -ENOENT; } vxlan_mdb_remote_del(vxlan, mdb_entry, remote); vxlan_mdb_entry_put(vxlan, mdb_entry); vxlan->mdb_seq++; return 0; } int vxlan_mdb_add(struct net_device *dev, struct nlattr *tb[], u16 nlmsg_flags, struct netlink_ext_ack *extack) { struct vxlan_mdb_config cfg; int err; ASSERT_RTNL(); err = vxlan_mdb_config_init(&cfg, dev, tb, nlmsg_flags, extack); if (err) return err; err = __vxlan_mdb_add(&cfg, extack); vxlan_mdb_config_fini(&cfg); return err; } int vxlan_mdb_del(struct net_device *dev, struct nlattr *tb[], struct netlink_ext_ack *extack) { struct vxlan_mdb_config cfg; int err; ASSERT_RTNL(); err = vxlan_mdb_config_init(&cfg, dev, tb, 0, extack); if (err) return err; err = __vxlan_mdb_del(&cfg, extack); vxlan_mdb_config_fini(&cfg); return err; } static const struct nla_policy vxlan_mdbe_attrs_del_bulk_pol[MDBE_ATTR_MAX + 1] = { [MDBE_ATTR_RTPROT] = NLA_POLICY_MIN(NLA_U8, RTPROT_STATIC), [MDBE_ATTR_DST] = NLA_POLICY_RANGE(NLA_BINARY, sizeof(struct in_addr), sizeof(struct in6_addr)), [MDBE_ATTR_DST_PORT] = { .type = NLA_U16 }, [MDBE_ATTR_VNI] = NLA_POLICY_FULL_RANGE(NLA_U32, &vni_range), [MDBE_ATTR_SRC_VNI] = NLA_POLICY_FULL_RANGE(NLA_U32, &vni_range), [MDBE_ATTR_STATE_MASK] = NLA_POLICY_MASK(NLA_U8, MDB_PERMANENT), }; static int vxlan_mdb_flush_desc_init(struct vxlan_dev *vxlan, struct vxlan_mdb_flush_desc *desc, struct nlattr *tb[], struct netlink_ext_ack *extack) { struct br_mdb_entry *entry = nla_data(tb[MDBA_SET_ENTRY]); struct nlattr *mdbe_attrs[MDBE_ATTR_MAX + 1]; int err; if (entry->ifindex && entry->ifindex != vxlan->dev->ifindex) { NL_SET_ERR_MSG_MOD(extack, "Invalid port net device"); return -EINVAL; } if (entry->vid) { NL_SET_ERR_MSG_MOD(extack, "VID must not be specified"); return -EINVAL; } if (!tb[MDBA_SET_ENTRY_ATTRS]) return 0; err = nla_parse_nested(mdbe_attrs, MDBE_ATTR_MAX, tb[MDBA_SET_ENTRY_ATTRS], vxlan_mdbe_attrs_del_bulk_pol, extack); if (err) return err; if (mdbe_attrs[MDBE_ATTR_STATE_MASK]) { u8 state_mask = nla_get_u8(mdbe_attrs[MDBE_ATTR_STATE_MASK]); if ((state_mask & MDB_PERMANENT) && !(entry->state & MDB_PERMANENT)) { NL_SET_ERR_MSG_MOD(extack, "Only permanent MDB entries are supported"); return -EINVAL; } } if (mdbe_attrs[MDBE_ATTR_RTPROT]) desc->rt_protocol = nla_get_u8(mdbe_attrs[MDBE_ATTR_RTPROT]); if (mdbe_attrs[MDBE_ATTR_DST]) vxlan_nla_get_addr(&desc->remote_ip, mdbe_attrs[MDBE_ATTR_DST]); if (mdbe_attrs[MDBE_ATTR_DST_PORT]) desc->remote_port = cpu_to_be16(nla_get_u16(mdbe_attrs[MDBE_ATTR_DST_PORT])); if (mdbe_attrs[MDBE_ATTR_VNI]) desc->remote_vni = cpu_to_be32(nla_get_u32(mdbe_attrs[MDBE_ATTR_VNI])); if (mdbe_attrs[MDBE_ATTR_SRC_VNI]) desc->src_vni = cpu_to_be32(nla_get_u32(mdbe_attrs[MDBE_ATTR_SRC_VNI])); return 0; } static void vxlan_mdb_remotes_flush(struct vxlan_dev *vxlan, struct vxlan_mdb_entry *mdb_entry, const struct vxlan_mdb_flush_desc *desc) { struct vxlan_mdb_remote *remote, *tmp; list_for_each_entry_safe(remote, tmp, &mdb_entry->remotes, list) { struct vxlan_rdst *rd = rtnl_dereference(remote->rd); __be32 remote_vni; if (desc->remote_ip.sa.sa_family && !vxlan_addr_equal(&desc->remote_ip, &rd->remote_ip)) continue; /* Encapsulation is performed with source VNI if remote VNI * is not set. */ remote_vni = rd->remote_vni ? : mdb_entry->key.vni; if (desc->remote_vni && desc->remote_vni != remote_vni) continue; if (desc->remote_port && desc->remote_port != rd->remote_port) continue; if (desc->rt_protocol && desc->rt_protocol != remote->rt_protocol) continue; vxlan_mdb_remote_del(vxlan, mdb_entry, remote); } } static void vxlan_mdb_flush(struct vxlan_dev *vxlan, const struct vxlan_mdb_flush_desc *desc) { struct vxlan_mdb_entry *mdb_entry; struct hlist_node *tmp; /* The removal of an entry cannot trigger the removal of another entry * since entries are always added to the head of the list. */ hlist_for_each_entry_safe(mdb_entry, tmp, &vxlan->mdb_list, mdb_node) { if (desc->src_vni && desc->src_vni != mdb_entry->key.vni) continue; vxlan_mdb_remotes_flush(vxlan, mdb_entry, desc); /* Entry will only be removed if its remotes list is empty. */ vxlan_mdb_entry_put(vxlan, mdb_entry); } } int vxlan_mdb_del_bulk(struct net_device *dev, struct nlattr *tb[], struct netlink_ext_ack *extack) { struct vxlan_dev *vxlan = netdev_priv(dev); struct vxlan_mdb_flush_desc desc = {}; int err; ASSERT_RTNL(); err = vxlan_mdb_flush_desc_init(vxlan, &desc, tb, extack); if (err) return err; vxlan_mdb_flush(vxlan, &desc); return 0; } static const struct nla_policy vxlan_mdbe_attrs_get_pol[MDBE_ATTR_MAX + 1] = { [MDBE_ATTR_SOURCE] = NLA_POLICY_RANGE(NLA_BINARY, sizeof(struct in_addr), sizeof(struct in6_addr)), [MDBE_ATTR_SRC_VNI] = NLA_POLICY_FULL_RANGE(NLA_U32, &vni_range), }; static int vxlan_mdb_get_parse(struct net_device *dev, struct nlattr *tb[], struct vxlan_mdb_entry_key *group, struct netlink_ext_ack *extack) { struct br_mdb_entry *entry = nla_data(tb[MDBA_GET_ENTRY]); struct nlattr *mdbe_attrs[MDBE_ATTR_MAX + 1]; struct vxlan_dev *vxlan = netdev_priv(dev); int err; memset(group, 0, sizeof(*group)); group->vni = vxlan->default_dst.remote_vni; if (!tb[MDBA_GET_ENTRY_ATTRS]) { vxlan_mdb_group_set(group, entry, NULL); return 0; } err = nla_parse_nested(mdbe_attrs, MDBE_ATTR_MAX, tb[MDBA_GET_ENTRY_ATTRS], vxlan_mdbe_attrs_get_pol, extack); if (err) return err; if (mdbe_attrs[MDBE_ATTR_SOURCE] && !vxlan_mdb_is_valid_source(mdbe_attrs[MDBE_ATTR_SOURCE], entry->addr.proto, extack)) return -EINVAL; vxlan_mdb_group_set(group, entry, mdbe_attrs[MDBE_ATTR_SOURCE]); if (mdbe_attrs[MDBE_ATTR_SRC_VNI]) group->vni = cpu_to_be32(nla_get_u32(mdbe_attrs[MDBE_ATTR_SRC_VNI])); return 0; } static struct sk_buff * vxlan_mdb_get_reply_alloc(const struct vxlan_dev *vxlan, const struct vxlan_mdb_entry *mdb_entry) { struct vxlan_mdb_remote *remote; size_t nlmsg_size; nlmsg_size = NLMSG_ALIGN(sizeof(struct br_port_msg)) + /* MDBA_MDB */ nla_total_size(0) + /* MDBA_MDB_ENTRY */ nla_total_size(0); list_for_each_entry(remote, &mdb_entry->remotes, list) nlmsg_size += vxlan_mdb_nlmsg_remote_size(vxlan, mdb_entry, remote); return nlmsg_new(nlmsg_size, GFP_KERNEL); } static int vxlan_mdb_get_reply_fill(const struct vxlan_dev *vxlan, struct sk_buff *skb, const struct vxlan_mdb_entry *mdb_entry, u32 portid, u32 seq) { struct nlattr *mdb_nest, *mdb_entry_nest; struct vxlan_mdb_remote *remote; struct br_port_msg *bpm; struct nlmsghdr *nlh; int err; nlh = nlmsg_put(skb, portid, seq, RTM_NEWMDB, sizeof(*bpm), 0); if (!nlh) return -EMSGSIZE; bpm = nlmsg_data(nlh); memset(bpm, 0, sizeof(*bpm)); bpm->family = AF_BRIDGE; bpm->ifindex = vxlan->dev->ifindex; mdb_nest = nla_nest_start_noflag(skb, MDBA_MDB); if (!mdb_nest) { err = -EMSGSIZE; goto cancel; } mdb_entry_nest = nla_nest_start_noflag(skb, MDBA_MDB_ENTRY); if (!mdb_entry_nest) { err = -EMSGSIZE; goto cancel; } list_for_each_entry(remote, &mdb_entry->remotes, list) { err = vxlan_mdb_entry_info_fill(vxlan, skb, mdb_entry, remote); if (err) goto cancel; } nla_nest_end(skb, mdb_entry_nest); nla_nest_end(skb, mdb_nest); nlmsg_end(skb, nlh); return 0; cancel: nlmsg_cancel(skb, nlh); return err; } int vxlan_mdb_get(struct net_device *dev, struct nlattr *tb[], u32 portid, u32 seq, struct netlink_ext_ack *extack) { struct vxlan_dev *vxlan = netdev_priv(dev); struct vxlan_mdb_entry *mdb_entry; struct vxlan_mdb_entry_key group; struct sk_buff *skb; int err; ASSERT_RTNL(); err = vxlan_mdb_get_parse(dev, tb, &group, extack); if (err) return err; mdb_entry = vxlan_mdb_entry_lookup(vxlan, &group); if (!mdb_entry) { NL_SET_ERR_MSG_MOD(extack, "MDB entry not found"); return -ENOENT; } skb = vxlan_mdb_get_reply_alloc(vxlan, mdb_entry); if (!skb) return -ENOMEM; err = vxlan_mdb_get_reply_fill(vxlan, skb, mdb_entry, portid, seq); if (err) { NL_SET_ERR_MSG_MOD(extack, "Failed to fill MDB get reply"); goto free; } return rtnl_unicast(skb, dev_net(dev), portid); free: kfree_skb(skb); return err; } struct vxlan_mdb_entry *vxlan_mdb_entry_skb_get(struct vxlan_dev *vxlan, struct sk_buff *skb, __be32 src_vni) { struct vxlan_mdb_entry *mdb_entry; struct vxlan_mdb_entry_key group; if (!is_multicast_ether_addr(eth_hdr(skb)->h_dest) || is_broadcast_ether_addr(eth_hdr(skb)->h_dest)) return NULL; /* When not in collect metadata mode, 'src_vni' is zero, but MDB * entries are stored with the VNI of the VXLAN device. */ if (!(vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA)) src_vni = vxlan->default_dst.remote_vni; memset(&group, 0, sizeof(group)); group.vni = src_vni; switch (skb->protocol) { case htons(ETH_P_IP): if (!pskb_may_pull(skb, sizeof(struct iphdr))) return NULL; group.dst.sa.sa_family = AF_INET; group.dst.sin.sin_addr.s_addr = ip_hdr(skb)->daddr; group.src.sa.sa_family = AF_INET; group.src.sin.sin_addr.s_addr = ip_hdr(skb)->saddr; break; #if IS_ENABLED(CONFIG_IPV6) case htons(ETH_P_IPV6): if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) return NULL; group.dst.sa.sa_family = AF_INET6; group.dst.sin6.sin6_addr = ipv6_hdr(skb)->daddr; group.src.sa.sa_family = AF_INET6; group.src.sin6.sin6_addr = ipv6_hdr(skb)->saddr; break; #endif default: return NULL; } mdb_entry = vxlan_mdb_entry_lookup(vxlan, &group); if (mdb_entry) return mdb_entry; memset(&group.src, 0, sizeof(group.src)); mdb_entry = vxlan_mdb_entry_lookup(vxlan, &group); if (mdb_entry) return mdb_entry; /* No (S, G) or (*, G) found. Look up the all-zeros entry, but only if * the destination IP address is not link-local multicast since we want * to transmit such traffic together with broadcast and unknown unicast * traffic. */ switch (skb->protocol) { case htons(ETH_P_IP): if (ipv4_is_local_multicast(group.dst.sin.sin_addr.s_addr)) return NULL; group.dst.sin.sin_addr.s_addr = 0; break; #if IS_ENABLED(CONFIG_IPV6) case htons(ETH_P_IPV6): if (ipv6_addr_type(&group.dst.sin6.sin6_addr) & IPV6_ADDR_LINKLOCAL) return NULL; memset(&group.dst.sin6.sin6_addr, 0, sizeof(group.dst.sin6.sin6_addr)); break; #endif default: return NULL; } return vxlan_mdb_entry_lookup(vxlan, &group); } netdev_tx_t vxlan_mdb_xmit(struct vxlan_dev *vxlan, const struct vxlan_mdb_entry *mdb_entry, struct sk_buff *skb) { struct vxlan_mdb_remote *remote, *fremote = NULL; __be32 src_vni = mdb_entry->key.vni; list_for_each_entry_rcu(remote, &mdb_entry->remotes, list) { struct sk_buff *skb1; if ((vxlan_mdb_is_star_g(&mdb_entry->key) && READ_ONCE(remote->filter_mode) == MCAST_INCLUDE) || (READ_ONCE(remote->flags) & VXLAN_MDB_REMOTE_F_BLOCKED)) continue; if (!fremote) { fremote = remote; continue; } skb1 = skb_clone(skb, GFP_ATOMIC); if (skb1) vxlan_xmit_one(skb1, vxlan->dev, src_vni, rcu_dereference(remote->rd), false); } if (fremote) vxlan_xmit_one(skb, vxlan->dev, src_vni, rcu_dereference(fremote->rd), false); else kfree_skb_reason(skb, SKB_DROP_REASON_NO_TX_TARGET); return NETDEV_TX_OK; } static void vxlan_mdb_check_empty(void *ptr, void *arg) { WARN_ON_ONCE(1); } int vxlan_mdb_init(struct vxlan_dev *vxlan) { int err; err = rhashtable_init(&vxlan->mdb_tbl, &vxlan_mdb_rht_params); if (err) return err; INIT_HLIST_HEAD(&vxlan->mdb_list); return 0; } void vxlan_mdb_fini(struct vxlan_dev *vxlan) { struct vxlan_mdb_flush_desc desc = {}; vxlan_mdb_flush(vxlan, &desc); WARN_ON_ONCE(vxlan->cfg.flags & VXLAN_F_MDB); rhashtable_free_and_destroy(&vxlan->mdb_tbl, vxlan_mdb_check_empty, NULL); }
7 7 7 7 49 49 49 18 8 347 59 293 341 7 327 14 63 53 20 15 49 49 23 45 59 59 2 3 29 29 21 18 20 39 360 7 355 7 12 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ext4/acl.c * * Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de> */ #include <linux/quotaops.h> #include "ext4_jbd2.h" #include "ext4.h" #include "xattr.h" #include "acl.h" /* * Convert from filesystem to in-memory representation. */ static struct posix_acl * ext4_acl_from_disk(const void *value, size_t size) { const char *end = (char *)value + size; int n, count; struct posix_acl *acl; if (!value) return NULL; if (size < sizeof(ext4_acl_header)) return ERR_PTR(-EINVAL); if (((ext4_acl_header *)value)->a_version != cpu_to_le32(EXT4_ACL_VERSION)) return ERR_PTR(-EINVAL); value = (char *)value + sizeof(ext4_acl_header); count = ext4_acl_count(size); if (count < 0) return ERR_PTR(-EINVAL); if (count == 0) return NULL; acl = posix_acl_alloc(count, GFP_NOFS); if (!acl) return ERR_PTR(-ENOMEM); for (n = 0; n < count; n++) { ext4_acl_entry *entry = (ext4_acl_entry *)value; if ((char *)value + sizeof(ext4_acl_entry_short) > end) goto fail; acl->a_entries[n].e_tag = le16_to_cpu(entry->e_tag); acl->a_entries[n].e_perm = le16_to_cpu(entry->e_perm); switch (acl->a_entries[n].e_tag) { case ACL_USER_OBJ: case ACL_GROUP_OBJ: case ACL_MASK: case ACL_OTHER: value = (char *)value + sizeof(ext4_acl_entry_short); break; case ACL_USER: value = (char *)value + sizeof(ext4_acl_entry); if ((char *)value > end) goto fail; acl->a_entries[n].e_uid = make_kuid(&init_user_ns, le32_to_cpu(entry->e_id)); break; case ACL_GROUP: value = (char *)value + sizeof(ext4_acl_entry); if ((char *)value > end) goto fail; acl->a_entries[n].e_gid = make_kgid(&init_user_ns, le32_to_cpu(entry->e_id)); break; default: goto fail; } } if (value != end) goto fail; return acl; fail: posix_acl_release(acl); return ERR_PTR(-EINVAL); } /* * Convert from in-memory to filesystem representation. */ static void * ext4_acl_to_disk(const struct posix_acl *acl, size_t *size) { ext4_acl_header *ext_acl; char *e; size_t n; *size = ext4_acl_size(acl->a_count); ext_acl = kmalloc(sizeof(ext4_acl_header) + acl->a_count * sizeof(ext4_acl_entry), GFP_NOFS); if (!ext_acl) return ERR_PTR(-ENOMEM); ext_acl->a_version = cpu_to_le32(EXT4_ACL_VERSION); e = (char *)ext_acl + sizeof(ext4_acl_header); for (n = 0; n < acl->a_count; n++) { const struct posix_acl_entry *acl_e = &acl->a_entries[n]; ext4_acl_entry *entry = (ext4_acl_entry *)e; entry->e_tag = cpu_to_le16(acl_e->e_tag); entry->e_perm = cpu_to_le16(acl_e->e_perm); switch (acl_e->e_tag) { case ACL_USER: entry->e_id = cpu_to_le32( from_kuid(&init_user_ns, acl_e->e_uid)); e += sizeof(ext4_acl_entry); break; case ACL_GROUP: entry->e_id = cpu_to_le32( from_kgid(&init_user_ns, acl_e->e_gid)); e += sizeof(ext4_acl_entry); break; case ACL_USER_OBJ: case ACL_GROUP_OBJ: case ACL_MASK: case ACL_OTHER: e += sizeof(ext4_acl_entry_short); break; default: goto fail; } } return (char *)ext_acl; fail: kfree(ext_acl); return ERR_PTR(-EINVAL); } /* * Inode operation get_posix_acl(). * * inode->i_rwsem: don't care */ struct posix_acl * ext4_get_acl(struct inode *inode, int type, bool rcu) { int name_index; char *value = NULL; struct posix_acl *acl; int retval; if (rcu) return ERR_PTR(-ECHILD); switch (type) { case ACL_TYPE_ACCESS: name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS; break; case ACL_TYPE_DEFAULT: name_index = EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT; break; default: BUG(); } retval = ext4_xattr_get(inode, name_index, "", NULL, 0); if (retval > 0) { value = kmalloc(retval, GFP_NOFS); if (!value) return ERR_PTR(-ENOMEM); retval = ext4_xattr_get(inode, name_index, "", value, retval); } if (retval > 0) acl = ext4_acl_from_disk(value, retval); else if (retval == -ENODATA || retval == -ENOSYS) acl = NULL; else acl = ERR_PTR(retval); kfree(value); return acl; } /* * Set the access or default ACL of an inode. * * inode->i_rwsem: down unless called from ext4_new_inode */ static int __ext4_set_acl(handle_t *handle, struct inode *inode, int type, struct posix_acl *acl, int xattr_flags) { int name_index; void *value = NULL; size_t size = 0; int error; switch (type) { case ACL_TYPE_ACCESS: name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS; break; case ACL_TYPE_DEFAULT: name_index = EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT; if (!S_ISDIR(inode->i_mode)) return acl ? -EACCES : 0; break; default: return -EINVAL; } if (acl) { value = ext4_acl_to_disk(acl, &size); if (IS_ERR(value)) return (int)PTR_ERR(value); } error = ext4_xattr_set_handle(handle, inode, name_index, "", value, size, xattr_flags); kfree(value); if (!error) set_cached_acl(inode, type, acl); return error; } int ext4_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type) { handle_t *handle; int error, credits, retries = 0; size_t acl_size = acl ? ext4_acl_size(acl->a_count) : 0; struct inode *inode = d_inode(dentry); umode_t mode = inode->i_mode; int update_mode = 0; error = dquot_initialize(inode); if (error) return error; retry: error = ext4_xattr_set_credits(inode, acl_size, false /* is_create */, &credits); if (error) return error; handle = ext4_journal_start(inode, EXT4_HT_XATTR, credits); if (IS_ERR(handle)) return PTR_ERR(handle); if ((type == ACL_TYPE_ACCESS) && acl) { error = posix_acl_update_mode(idmap, inode, &mode, &acl); if (error) goto out_stop; if (mode != inode->i_mode) update_mode = 1; } error = __ext4_set_acl(handle, inode, type, acl, 0 /* xattr_flags */); if (!error && update_mode) { inode->i_mode = mode; inode_set_ctime_current(inode); error = ext4_mark_inode_dirty(handle, inode); } out_stop: ext4_journal_stop(handle); if (error == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) goto retry; return error; } /* * Initialize the ACLs of a new inode. Called from ext4_new_inode. * * dir->i_rwsem: down * inode->i_rwsem: up (access to inode is still exclusive) */ int ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir) { struct posix_acl *default_acl, *acl; int error; error = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl); if (error) return error; if (default_acl) { error = __ext4_set_acl(handle, inode, ACL_TYPE_DEFAULT, default_acl, XATTR_CREATE); posix_acl_release(default_acl); } else { inode->i_default_acl = NULL; } if (acl) { if (!error) error = __ext4_set_acl(handle, inode, ACL_TYPE_ACCESS, acl, XATTR_CREATE); posix_acl_release(acl); } else { inode->i_acl = NULL; } return error; }
299 573 456 512 476 336 48 162 476 335 48 162 166 50 161 61 1 67 11 10 61 1 53 3 5 165 161 45 157 431 57 262 287 426 9 3 6 116 37 105 314 288 28 19 395 17 292 292 414 18 88 88 450 426 426 425 426 450 509 502 387 96 270 42 99 276 441 509 508 398 398 398 428 429 429 429 508 509 449 275 509 246 355 354 190 6 1 8 8 1 9 180 101 1 155 3 3 51 37 38 8 18 18 316 243 484 485 329 337 321 362 327 3 67 92 64 54 25 15 15 14 1 1 15 299 26 427 427 437 437 436 436 437 26 437 26 425 424 421 4 424 183 96 96 96 96 298 252 84 83 299 299 299 298 299 251 214 213 214 214 74 2 363 347 348 223 147 96 280 348 348 279 96 48 48 205 204 159 467 48 48 39 39 467 467 468 468 468 429 48 96 96 96 84 455 251 429 429 48 187 184 3 186 56 48 6 10 48 356 356 1 153 271 278 279 176 176 270 176 158 158 279 211 16 84 97 516 36 187 97 288 207 57 516 435 57 218 97 452 453 5 450 289 450 229 228 453 425 187 169 213 243 243 213 5 435 406 438 438 79 79 3 76 257 257 257 257 1 1 1 13 13 20 1 1 508 343 356 257 175 175 176 166 11 11 68 106 14 334 333 334 199 157 27 157 160 57 123 6 440 185 278 185 2 27 74 7 48 48 48 48 48 48 48 12 48 48 48 3 7 7 7 7 43 48 7 7 7 15 15 15 15 14 15 6 7 7 15 15 15 15 357 357 356 355 356 336 336 203 233 1 3 236 28 209 236 171 98 97 171 97 97 97 236 236 356 357 440 221 338 338 337 439 221 4 334 4 338 26 26 26 26 3 3 3 149 149 149 149 142 5 1 2 139 141 3 3 3 3 3 3 1 10 10 10 10 10 10 11 11 275 275 71 45 4 284 169 71 71 271 167 338 98 98 431 168 279 338 98 45 423 449 448 60 457 459 449 449 4 65 57 67 67 2 65 4 65 9 2 65 67 65 2 67 67 67 10 10 10 10 9 10 7 3 7 7 7 7 2 10 10 10 10 548 499 473 358 1 130 243 357 207 245 117 208 202 63 156 351 1 349 217 108 56 416 39 379 492 491 491 492 491 492 85 30 430 57 1 57 57 57 57 56 53 53 7 48 7 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 6215 6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354 6355 6356 6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372 6373 6374 6375 6376 6377 6378 6379 6380 6381 6382 6383 6384 6385 6386 6387 6388 6389 6390 6391 6392 6393 6394 6395 6396 6397 6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414 6415 6416 6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430 6431 6432 6433 6434 6435 6436 6437 6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449 6450 6451 6452 6453 6454 6455 6456 6457 6458 6459 6460 6461 6462 6463 6464 6465 6466 6467 6468 6469 6470 6471 6472 6473 6474 6475 6476 6477 6478 6479 6480 6481 6482 6483 6484 6485 6486 6487 6488 6489 6490 6491 6492 6493 6494 6495 6496 6497 6498 6499 6500 6501 6502 6503 6504 6505 6506 6507 6508 6509 6510 6511 6512 6513 6514 6515 6516 6517 6518 6519 6520 6521 6522 6523 6524 6525 6526 6527 6528 6529 6530 6531 6532 6533 6534 6535 6536 6537 6538 6539 6540 6541 6542 6543 6544 6545 6546 6547 6548 6549 6550 6551 6552 6553 6554 6555 6556 6557 6558 6559 6560 6561 6562 6563 6564 6565 6566 6567 6568 6569 6570 6571 6572 6573 6574 6575 6576 6577 6578 6579 6580 6581 6582 6583 6584 6585 6586 6587 6588 6589 6590 6591 6592 6593 6594 6595 6596 6597 6598 6599 6600 6601 6602 6603 6604 6605 6606 6607 6608 6609 6610 6611 6612 6613 6614 6615 6616 6617 6618 6619 6620 6621 6622 6623 6624 6625 6626 6627 6628 6629 6630 6631 6632 6633 6634 6635 6636 6637 6638 6639 6640 6641 6642 6643 6644 6645 6646 6647 6648 6649 6650 6651 6652 6653 6654 6655 6656 6657 6658 6659 6660 6661 6662 6663 6664 6665 6666 6667 6668 6669 6670 6671 6672 6673 6674 6675 6676 6677 6678 6679 6680 6681 6682 6683 6684 6685 6686 6687 6688 6689 6690 6691 6692 6693 6694 6695 6696 6697 6698 6699 6700 6701 6702 6703 6704 6705 6706 6707 6708 6709 6710 6711 6712 6713 6714 6715 6716 6717 6718 6719 6720 6721 6722 6723 6724 6725 6726 6727 6728 6729 6730 6731 6732 6733 6734 6735 6736 6737 6738 6739 6740 6741 6742 6743 6744 6745 6746 6747 6748 6749 6750 6751 6752 6753 6754 6755 6756 6757 6758 6759 6760 6761 6762 6763 6764 6765 6766 6767 6768 6769 6770 6771 6772 6773 6774 6775 6776 6777 6778 6779 6780 6781 6782 6783 6784 6785 6786 6787 6788 6789 6790 6791 6792 6793 6794 6795 6796 6797 6798 6799 6800 6801 6802 6803 6804 6805 6806 6807 6808 6809 6810 6811 6812 6813 6814 6815 6816 6817 6818 6819 6820 6821 6822 6823 6824 6825 6826 6827 6828 6829 6830 6831 6832 6833 6834 6835 6836 6837 6838 6839 6840 6841 6842 6843 6844 6845 6846 6847 6848 6849 6850 6851 6852 6853 6854 6855 6856 6857 6858 6859 6860 6861 6862 6863 6864 6865 6866 6867 6868 6869 6870 6871 6872 6873 6874 6875 6876 6877 6878 6879 6880 6881 6882 6883 6884 6885 6886 6887 6888 6889 6890 6891 6892 6893 6894 6895 6896 6897 6898 6899 6900 6901 6902 6903 6904 6905 6906 6907 6908 6909 6910 6911 6912 6913 6914 6915 6916 6917 6918 6919 6920 6921 6922 6923 6924 6925 6926 6927 6928 6929 6930 6931 6932 6933 6934 6935 6936 6937 6938 6939 6940 6941 6942 6943 6944 6945 6946 6947 6948 6949 6950 6951 6952 6953 6954 6955 6956 6957 6958 6959 6960 6961 6962 6963 6964 6965 6966 6967 6968 6969 6970 6971 6972 6973 6974 6975 6976 6977 6978 6979 6980 6981 6982 6983 6984 6985 6986 6987 6988 6989 6990 6991 6992 6993 6994 6995 6996 6997 6998 6999 7000 7001 7002 7003 7004 7005 7006 7007 7008 7009 7010 7011 7012 7013 7014 7015 7016 7017 7018 7019 7020 7021 7022 7023 7024 7025 7026 7027 7028 7029 7030 7031 7032 7033 7034 7035 7036 7037 7038 7039 7040 7041 7042 7043 7044 7045 7046 7047 7048 7049 7050 7051 7052 7053 7054 7055 7056 7057 7058 7059 7060 7061 7062 7063 7064 7065 7066 7067 7068 7069 7070 7071 7072 7073 7074 7075 7076 7077 7078 7079 7080 7081 7082 7083 7084 7085 7086 7087 7088 7089 7090 7091 7092 7093 7094 7095 7096 7097 7098 7099 7100 7101 7102 7103 7104 7105 7106 7107 7108 7109 7110 7111 7112 7113 7114 7115 7116 7117 7118 7119 7120 7121 7122 7123 7124 7125 7126 7127 7128 7129 7130 7131 7132 7133 7134 7135 7136 7137 7138 7139 7140 7141 7142 7143 7144 7145 7146 7147 7148 7149 7150 7151 7152 7153 7154 7155 7156 7157 7158 7159 7160 7161 7162 7163 7164 7165 7166 7167 7168 7169 7170 7171 7172 7173 7174 7175 7176 7177 7178 7179 7180 7181 7182 7183 7184 7185 7186 7187 7188 7189 7190 7191 7192 7193 7194 7195 7196 7197 7198 7199 7200 7201 7202 7203 7204 7205 7206 7207 7208 7209 7210 7211 7212 7213 7214 7215 7216 7217 7218 7219 7220 7221 7222 7223 7224 7225 7226 7227 7228 7229 7230 7231 7232 7233 7234 7235 7236 7237 7238 7239 7240 7241 7242 7243 7244 7245 7246 7247 7248 7249 7250 7251 7252 7253 7254 7255 7256 7257 7258 7259 7260 7261 7262 7263 7264 7265 7266 7267 7268 7269 7270 7271 7272 7273 7274 7275 7276 7277 7278 7279 7280 7281 7282 7283 7284 7285 7286 7287 7288 7289 7290 7291 7292 7293 7294 7295 7296 7297 7298 7299 7300 7301 7302 7303 7304 7305 7306 7307 7308 7309 7310 7311 7312 7313 7314 7315 7316 7317 7318 7319 7320 7321 7322 7323 7324 7325 7326 7327 7328 7329 7330 7331 7332 7333 7334 7335 7336 7337 7338 7339 7340 7341 7342 7343 7344 7345 7346 7347 7348 7349 7350 7351 7352 7353 7354 7355 7356 7357 7358 7359 7360 7361 7362 7363 7364 7365 7366 7367 7368 7369 7370 7371 7372 7373 7374 7375 7376 7377 7378 7379 7380 7381 7382 7383 7384 7385 7386 7387 7388 7389 7390 7391 7392 7393 7394 7395 7396 7397 7398 7399 7400 7401 7402 7403 7404 7405 7406 7407 7408 7409 7410 7411 7412 7413 7414 7415 7416 7417 7418 7419 7420 7421 7422 7423 7424 7425 7426 7427 7428 7429 7430 7431 7432 7433 7434 7435 7436 7437 7438 7439 7440 7441 7442 7443 7444 7445 7446 7447 7448 7449 7450 7451 7452 7453 7454 7455 7456 7457 7458 7459 7460 7461 7462 7463 7464 7465 7466 7467 7468 7469 7470 7471 7472 7473 7474 7475 7476 7477 7478 7479 7480 7481 7482 7483 7484 7485 7486 7487 7488 7489 7490 7491 7492 7493 7494 7495 7496 7497 7498 7499 7500 7501 7502 7503 7504 7505 7506 7507 7508 7509 7510 7511 7512 7513 7514 7515 7516 7517 7518 7519 7520 7521 7522 7523 7524 7525 7526 7527 7528 7529 7530 7531 7532 7533 7534 7535 7536 7537 7538 7539 7540 7541 7542 7543 7544 7545 7546 7547 7548 7549 7550 7551 7552 7553 7554 7555 7556 7557 7558 7559 7560 7561 7562 7563 7564 7565 7566 7567 7568 7569 7570 7571 7572 7573 7574 7575 7576 7577 7578 7579 7580 7581 7582 7583 7584 7585 7586 7587 7588 7589 7590 7591 7592 7593 7594 7595 7596 7597 7598 7599 7600 7601 7602 7603 7604 7605 7606 7607 7608 7609 7610 7611 7612 7613 7614 7615 7616 7617 7618 7619 7620 7621 7622 7623 7624 7625 7626 7627 7628 7629 7630 7631 7632 7633 7634 7635 7636 7637 7638 7639 7640 7641 7642 7643 7644 7645 7646 7647 7648 7649 7650 7651 7652 7653 7654 7655 7656 7657 7658 7659 7660 7661 7662 7663 7664 7665 7666 7667 7668 7669 7670 7671 7672 7673 7674 7675 7676 7677 7678 7679 7680 7681 7682 7683 7684 7685 7686 7687 7688 7689 7690 7691 7692 7693 7694 7695 7696 7697 7698 7699 7700 7701 7702 7703 7704 7705 7706 7707 7708 7709 7710 7711 7712 7713 7714 7715 7716 7717 7718 7719 7720 7721 7722 7723 7724 7725 7726 7727 7728 7729 7730 7731 7732 7733 7734 7735 7736 7737 7738 7739 7740 7741 7742 7743 7744 7745 7746 7747 7748 7749 7750 7751 7752 7753 7754 7755 7756 7757 7758 7759 7760 7761 7762 7763 7764 7765 7766 7767 7768 7769 7770 7771 7772 7773 7774 7775 7776 7777 7778 7779 7780 7781 7782 7783 7784 7785 7786 7787 7788 7789 7790 7791 7792 7793 7794 7795 7796 7797 7798 7799 7800 7801 7802 7803 7804 7805 7806 7807 7808 7809 7810 7811 7812 7813 7814 7815 7816 7817 7818 7819 7820 7821 7822 7823 7824 7825 7826 7827 7828 7829 7830 7831 7832 7833 7834 7835 7836 7837 7838 7839 7840 7841 7842 7843 7844 7845 7846 7847 7848 7849 7850 7851 7852 7853 7854 7855 7856 7857 7858 7859 7860 7861 7862 7863 7864 7865 7866 7867 7868 7869 7870 7871 7872 7873 7874 7875 7876 7877 7878 7879 7880 7881 7882 7883 7884 7885 7886 7887 7888 7889 7890 7891 7892 7893 7894 7895 7896 7897 7898 7899 7900 7901 7902 7903 7904 7905 7906 7907 7908 7909 7910 7911 7912 7913 7914 7915 7916 7917 // SPDX-License-Identifier: GPL-2.0-only /* * Generic hugetlb support. * (C) Nadia Yvette Chambers, April 2004 */ #include <linux/list.h> #include <linux/init.h> #include <linux/mm.h> #include <linux/seq_file.h> #include <linux/sysctl.h> #include <linux/highmem.h> #include <linux/mmu_notifier.h> #include <linux/nodemask.h> #include <linux/pagemap.h> #include <linux/mempolicy.h> #include <linux/compiler.h> #include <linux/cpumask.h> #include <linux/cpuset.h> #include <linux/mutex.h> #include <linux/memblock.h> #include <linux/minmax.h> #include <linux/sysfs.h> #include <linux/slab.h> #include <linux/sched/mm.h> #include <linux/mmdebug.h> #include <linux/sched/signal.h> #include <linux/rmap.h> #include <linux/string_helpers.h> #include <linux/swap.h> #include <linux/swapops.h> #include <linux/jhash.h> #include <linux/numa.h> #include <linux/llist.h> #include <linux/cma.h> #include <linux/migrate.h> #include <linux/nospec.h> #include <linux/delayacct.h> #include <linux/memory.h> #include <linux/mm_inline.h> #include <linux/padata.h> #include <asm/page.h> #include <asm/pgalloc.h> #include <asm/tlb.h> #include <asm/setup.h> #include <linux/io.h> #include <linux/hugetlb.h> #include <linux/hugetlb_cgroup.h> #include <linux/node.h> #include <linux/page_owner.h> #include "internal.h" #include "hugetlb_vmemmap.h" #include "hugetlb_cma.h" #include <linux/page-isolation.h> int hugetlb_max_hstate __read_mostly; unsigned int default_hstate_idx; struct hstate hstates[HUGE_MAX_HSTATE]; __initdata struct list_head huge_boot_pages[MAX_NUMNODES]; static unsigned long hstate_boot_nrinvalid[HUGE_MAX_HSTATE] __initdata; /* * Due to ordering constraints across the init code for various * architectures, hugetlb hstate cmdline parameters can't simply * be early_param. early_param might call the setup function * before valid hugetlb page sizes are determined, leading to * incorrect rejection of valid hugepagesz= options. * * So, record the parameters early and consume them whenever the * init code is ready for them, by calling hugetlb_parse_params(). */ /* one (hugepagesz=,hugepages=) pair per hstate, one default_hugepagesz */ #define HUGE_MAX_CMDLINE_ARGS (2 * HUGE_MAX_HSTATE + 1) struct hugetlb_cmdline { char *val; int (*setup)(char *val); }; /* for command line parsing */ static struct hstate * __initdata parsed_hstate; static unsigned long __initdata default_hstate_max_huge_pages; static bool __initdata parsed_valid_hugepagesz = true; static bool __initdata parsed_default_hugepagesz; static unsigned int default_hugepages_in_node[MAX_NUMNODES] __initdata; static unsigned long hugepage_allocation_threads __initdata; static char hstate_cmdline_buf[COMMAND_LINE_SIZE] __initdata; static int hstate_cmdline_index __initdata; static struct hugetlb_cmdline hugetlb_params[HUGE_MAX_CMDLINE_ARGS] __initdata; static int hugetlb_param_index __initdata; static __init int hugetlb_add_param(char *s, int (*setup)(char *val)); static __init void hugetlb_parse_params(void); #define hugetlb_early_param(str, func) \ static __init int func##args(char *s) \ { \ return hugetlb_add_param(s, func); \ } \ early_param(str, func##args) /* * Protects updates to hugepage_freelists, hugepage_activelist, nr_huge_pages, * free_huge_pages, and surplus_huge_pages. */ __cacheline_aligned_in_smp DEFINE_SPINLOCK(hugetlb_lock); /* * Serializes faults on the same logical page. This is used to * prevent spurious OOMs when the hugepage pool is fully utilized. */ static int num_fault_mutexes __ro_after_init; struct mutex *hugetlb_fault_mutex_table __ro_after_init; /* Forward declaration */ static int hugetlb_acct_memory(struct hstate *h, long delta); static void hugetlb_vma_lock_free(struct vm_area_struct *vma); static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma); static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma); static void hugetlb_unshare_pmds(struct vm_area_struct *vma, unsigned long start, unsigned long end); static struct resv_map *vma_resv_map(struct vm_area_struct *vma); static void hugetlb_free_folio(struct folio *folio) { if (folio_test_hugetlb_cma(folio)) { hugetlb_cma_free_folio(folio); return; } folio_put(folio); } static inline bool subpool_is_free(struct hugepage_subpool *spool) { if (spool->count) return false; if (spool->max_hpages != -1) return spool->used_hpages == 0; if (spool->min_hpages != -1) return spool->rsv_hpages == spool->min_hpages; return true; } static inline void unlock_or_release_subpool(struct hugepage_subpool *spool, unsigned long irq_flags) { spin_unlock_irqrestore(&spool->lock, irq_flags); /* If no pages are used, and no other handles to the subpool * remain, give up any reservations based on minimum size and * free the subpool */ if (subpool_is_free(spool)) { if (spool->min_hpages != -1) hugetlb_acct_memory(spool->hstate, -spool->min_hpages); kfree(spool); } } struct hugepage_subpool *hugepage_new_subpool(struct hstate *h, long max_hpages, long min_hpages) { struct hugepage_subpool *spool; spool = kzalloc(sizeof(*spool), GFP_KERNEL); if (!spool) return NULL; spin_lock_init(&spool->lock); spool->count = 1; spool->max_hpages = max_hpages; spool->hstate = h; spool->min_hpages = min_hpages; if (min_hpages != -1 && hugetlb_acct_memory(h, min_hpages)) { kfree(spool); return NULL; } spool->rsv_hpages = min_hpages; return spool; } void hugepage_put_subpool(struct hugepage_subpool *spool) { unsigned long flags; spin_lock_irqsave(&spool->lock, flags); BUG_ON(!spool->count); spool->count--; unlock_or_release_subpool(spool, flags); } /* * Subpool accounting for allocating and reserving pages. * Return -ENOMEM if there are not enough resources to satisfy the * request. Otherwise, return the number of pages by which the * global pools must be adjusted (upward). The returned value may * only be different than the passed value (delta) in the case where * a subpool minimum size must be maintained. */ static long hugepage_subpool_get_pages(struct hugepage_subpool *spool, long delta) { long ret = delta; if (!spool) return ret; spin_lock_irq(&spool->lock); if (spool->max_hpages != -1) { /* maximum size accounting */ if ((spool->used_hpages + delta) <= spool->max_hpages) spool->used_hpages += delta; else { ret = -ENOMEM; goto unlock_ret; } } /* minimum size accounting */ if (spool->min_hpages != -1 && spool->rsv_hpages) { if (delta > spool->rsv_hpages) { /* * Asking for more reserves than those already taken on * behalf of subpool. Return difference. */ ret = delta - spool->rsv_hpages; spool->rsv_hpages = 0; } else { ret = 0; /* reserves already accounted for */ spool->rsv_hpages -= delta; } } unlock_ret: spin_unlock_irq(&spool->lock); return ret; } /* * Subpool accounting for freeing and unreserving pages. * Return the number of global page reservations that must be dropped. * The return value may only be different than the passed value (delta) * in the case where a subpool minimum size must be maintained. */ static long hugepage_subpool_put_pages(struct hugepage_subpool *spool, long delta) { long ret = delta; unsigned long flags; if (!spool) return delta; spin_lock_irqsave(&spool->lock, flags); if (spool->max_hpages != -1) /* maximum size accounting */ spool->used_hpages -= delta; /* minimum size accounting */ if (spool->min_hpages != -1 && spool->used_hpages < spool->min_hpages) { if (spool->rsv_hpages + delta <= spool->min_hpages) ret = 0; else ret = spool->rsv_hpages + delta - spool->min_hpages; spool->rsv_hpages += delta; if (spool->rsv_hpages > spool->min_hpages) spool->rsv_hpages = spool->min_hpages; } /* * If hugetlbfs_put_super couldn't free spool due to an outstanding * quota reference, free it now. */ unlock_or_release_subpool(spool, flags); return ret; } static inline struct hugepage_subpool *subpool_inode(struct inode *inode) { return HUGETLBFS_SB(inode->i_sb)->spool; } static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma) { return subpool_inode(file_inode(vma->vm_file)); } /* * hugetlb vma_lock helper routines */ void hugetlb_vma_lock_read(struct vm_area_struct *vma) { if (__vma_shareable_lock(vma)) { struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; down_read(&vma_lock->rw_sema); } else if (__vma_private_lock(vma)) { struct resv_map *resv_map = vma_resv_map(vma); down_read(&resv_map->rw_sema); } } void hugetlb_vma_unlock_read(struct vm_area_struct *vma) { if (__vma_shareable_lock(vma)) { struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; up_read(&vma_lock->rw_sema); } else if (__vma_private_lock(vma)) { struct resv_map *resv_map = vma_resv_map(vma); up_read(&resv_map->rw_sema); } } void hugetlb_vma_lock_write(struct vm_area_struct *vma) { if (__vma_shareable_lock(vma)) { struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; down_write(&vma_lock->rw_sema); } else if (__vma_private_lock(vma)) { struct resv_map *resv_map = vma_resv_map(vma); down_write(&resv_map->rw_sema); } } void hugetlb_vma_unlock_write(struct vm_area_struct *vma) { if (__vma_shareable_lock(vma)) { struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; up_write(&vma_lock->rw_sema); } else if (__vma_private_lock(vma)) { struct resv_map *resv_map = vma_resv_map(vma); up_write(&resv_map->rw_sema); } } int hugetlb_vma_trylock_write(struct vm_area_struct *vma) { if (__vma_shareable_lock(vma)) { struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; return down_write_trylock(&vma_lock->rw_sema); } else if (__vma_private_lock(vma)) { struct resv_map *resv_map = vma_resv_map(vma); return down_write_trylock(&resv_map->rw_sema); } return 1; } void hugetlb_vma_assert_locked(struct vm_area_struct *vma) { if (__vma_shareable_lock(vma)) { struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; lockdep_assert_held(&vma_lock->rw_sema); } else if (__vma_private_lock(vma)) { struct resv_map *resv_map = vma_resv_map(vma); lockdep_assert_held(&resv_map->rw_sema); } } void hugetlb_vma_lock_release(struct kref *kref) { struct hugetlb_vma_lock *vma_lock = container_of(kref, struct hugetlb_vma_lock, refs); kfree(vma_lock); } static void __hugetlb_vma_unlock_write_put(struct hugetlb_vma_lock *vma_lock) { struct vm_area_struct *vma = vma_lock->vma; /* * vma_lock structure may or not be released as a result of put, * it certainly will no longer be attached to vma so clear pointer. * Semaphore synchronizes access to vma_lock->vma field. */ vma_lock->vma = NULL; vma->vm_private_data = NULL; up_write(&vma_lock->rw_sema); kref_put(&vma_lock->refs, hugetlb_vma_lock_release); } static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma) { if (__vma_shareable_lock(vma)) { struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; __hugetlb_vma_unlock_write_put(vma_lock); } else if (__vma_private_lock(vma)) { struct resv_map *resv_map = vma_resv_map(vma); /* no free for anon vmas, but still need to unlock */ up_write(&resv_map->rw_sema); } } static void hugetlb_vma_lock_free(struct vm_area_struct *vma) { /* * Only present in sharable vmas. */ if (!vma || !__vma_shareable_lock(vma)) return; if (vma->vm_private_data) { struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; down_write(&vma_lock->rw_sema); __hugetlb_vma_unlock_write_put(vma_lock); } } static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma) { struct hugetlb_vma_lock *vma_lock; /* Only establish in (flags) sharable vmas */ if (!vma || !(vma->vm_flags & VM_MAYSHARE)) return; /* Should never get here with non-NULL vm_private_data */ if (vma->vm_private_data) return; vma_lock = kmalloc(sizeof(*vma_lock), GFP_KERNEL); if (!vma_lock) { /* * If we can not allocate structure, then vma can not * participate in pmd sharing. This is only a possible * performance enhancement and memory saving issue. * However, the lock is also used to synchronize page * faults with truncation. If the lock is not present, * unlikely races could leave pages in a file past i_size * until the file is removed. Warn in the unlikely case of * allocation failure. */ pr_warn_once("HugeTLB: unable to allocate vma specific lock\n"); return; } kref_init(&vma_lock->refs); init_rwsem(&vma_lock->rw_sema); vma_lock->vma = vma; vma->vm_private_data = vma_lock; } /* Helper that removes a struct file_region from the resv_map cache and returns * it for use. */ static struct file_region * get_file_region_entry_from_cache(struct resv_map *resv, long from, long to) { struct file_region *nrg; VM_BUG_ON(resv->region_cache_count <= 0); resv->region_cache_count--; nrg = list_first_entry(&resv->region_cache, struct file_region, link); list_del(&nrg->link); nrg->from = from; nrg->to = to; return nrg; } static void copy_hugetlb_cgroup_uncharge_info(struct file_region *nrg, struct file_region *rg) { #ifdef CONFIG_CGROUP_HUGETLB nrg->reservation_counter = rg->reservation_counter; nrg->css = rg->css; if (rg->css) css_get(rg->css); #endif } /* Helper that records hugetlb_cgroup uncharge info. */ static void record_hugetlb_cgroup_uncharge_info(struct hugetlb_cgroup *h_cg, struct hstate *h, struct resv_map *resv, struct file_region *nrg) { #ifdef CONFIG_CGROUP_HUGETLB if (h_cg) { nrg->reservation_counter = &h_cg->rsvd_hugepage[hstate_index(h)]; nrg->css = &h_cg->css; /* * The caller will hold exactly one h_cg->css reference for the * whole contiguous reservation region. But this area might be * scattered when there are already some file_regions reside in * it. As a result, many file_regions may share only one css * reference. In order to ensure that one file_region must hold * exactly one h_cg->css reference, we should do css_get for * each file_region and leave the reference held by caller * untouched. */ css_get(&h_cg->css); if (!resv->pages_per_hpage) resv->pages_per_hpage = pages_per_huge_page(h); /* pages_per_hpage should be the same for all entries in * a resv_map. */ VM_BUG_ON(resv->pages_per_hpage != pages_per_huge_page(h)); } else { nrg->reservation_counter = NULL; nrg->css = NULL; } #endif } static void put_uncharge_info(struct file_region *rg) { #ifdef CONFIG_CGROUP_HUGETLB if (rg->css) css_put(rg->css); #endif } static bool has_same_uncharge_info(struct file_region *rg, struct file_region *org) { #ifdef CONFIG_CGROUP_HUGETLB return rg->reservation_counter == org->reservation_counter && rg->css == org->css; #else return true; #endif } static void coalesce_file_region(struct resv_map *resv, struct file_region *rg) { struct file_region *nrg, *prg; prg = list_prev_entry(rg, link); if (&prg->link != &resv->regions && prg->to == rg->from && has_same_uncharge_info(prg, rg)) { prg->to = rg->to; list_del(&rg->link); put_uncharge_info(rg); kfree(rg); rg = prg; } nrg = list_next_entry(rg, link); if (&nrg->link != &resv->regions && nrg->from == rg->to && has_same_uncharge_info(nrg, rg)) { nrg->from = rg->from; list_del(&rg->link); put_uncharge_info(rg); kfree(rg); } } static inline long hugetlb_resv_map_add(struct resv_map *map, struct list_head *rg, long from, long to, struct hstate *h, struct hugetlb_cgroup *cg, long *regions_needed) { struct file_region *nrg; if (!regions_needed) { nrg = get_file_region_entry_from_cache(map, from, to); record_hugetlb_cgroup_uncharge_info(cg, h, map, nrg); list_add(&nrg->link, rg); coalesce_file_region(map, nrg); } else *regions_needed += 1; return to - from; } /* * Must be called with resv->lock held. * * Calling this with regions_needed != NULL will count the number of pages * to be added but will not modify the linked list. And regions_needed will * indicate the number of file_regions needed in the cache to carry out to add * the regions for this range. */ static long add_reservation_in_range(struct resv_map *resv, long f, long t, struct hugetlb_cgroup *h_cg, struct hstate *h, long *regions_needed) { long add = 0; struct list_head *head = &resv->regions; long last_accounted_offset = f; struct file_region *iter, *trg = NULL; struct list_head *rg = NULL; if (regions_needed) *regions_needed = 0; /* In this loop, we essentially handle an entry for the range * [last_accounted_offset, iter->from), at every iteration, with some * bounds checking. */ list_for_each_entry_safe(iter, trg, head, link) { /* Skip irrelevant regions that start before our range. */ if (iter->from < f) { /* If this region ends after the last accounted offset, * then we need to update last_accounted_offset. */ if (iter->to > last_accounted_offset) last_accounted_offset = iter->to; continue; } /* When we find a region that starts beyond our range, we've * finished. */ if (iter->from >= t) { rg = iter->link.prev; break; } /* Add an entry for last_accounted_offset -> iter->from, and * update last_accounted_offset. */ if (iter->from > last_accounted_offset) add += hugetlb_resv_map_add(resv, iter->link.prev, last_accounted_offset, iter->from, h, h_cg, regions_needed); last_accounted_offset = iter->to; } /* Handle the case where our range extends beyond * last_accounted_offset. */ if (!rg) rg = head->prev; if (last_accounted_offset < t) add += hugetlb_resv_map_add(resv, rg, last_accounted_offset, t, h, h_cg, regions_needed); return add; } /* Must be called with resv->lock acquired. Will drop lock to allocate entries. */ static int allocate_file_region_entries(struct resv_map *resv, int regions_needed) __must_hold(&resv->lock) { LIST_HEAD(allocated_regions); int to_allocate = 0, i = 0; struct file_region *trg = NULL, *rg = NULL; VM_BUG_ON(regions_needed < 0); /* * Check for sufficient descriptors in the cache to accommodate * the number of in progress add operations plus regions_needed. * * This is a while loop because when we drop the lock, some other call * to region_add or region_del may have consumed some region_entries, * so we keep looping here until we finally have enough entries for * (adds_in_progress + regions_needed). */ while (resv->region_cache_count < (resv->adds_in_progress + regions_needed)) { to_allocate = resv->adds_in_progress + regions_needed - resv->region_cache_count; /* At this point, we should have enough entries in the cache * for all the existing adds_in_progress. We should only be * needing to allocate for regions_needed. */ VM_BUG_ON(resv->region_cache_count < resv->adds_in_progress); spin_unlock(&resv->lock); for (i = 0; i < to_allocate; i++) { trg = kmalloc(sizeof(*trg), GFP_KERNEL); if (!trg) goto out_of_memory; list_add(&trg->link, &allocated_regions); } spin_lock(&resv->lock); list_splice(&allocated_regions, &resv->region_cache); resv->region_cache_count += to_allocate; } return 0; out_of_memory: list_for_each_entry_safe(rg, trg, &allocated_regions, link) { list_del(&rg->link); kfree(rg); } return -ENOMEM; } /* * Add the huge page range represented by [f, t) to the reserve * map. Regions will be taken from the cache to fill in this range. * Sufficient regions should exist in the cache due to the previous * call to region_chg with the same range, but in some cases the cache will not * have sufficient entries due to races with other code doing region_add or * region_del. The extra needed entries will be allocated. * * regions_needed is the out value provided by a previous call to region_chg. * * Return the number of new huge pages added to the map. This number is greater * than or equal to zero. If file_region entries needed to be allocated for * this operation and we were not able to allocate, it returns -ENOMEM. * region_add of regions of length 1 never allocate file_regions and cannot * fail; region_chg will always allocate at least 1 entry and a region_add for * 1 page will only require at most 1 entry. */ static long region_add(struct resv_map *resv, long f, long t, long in_regions_needed, struct hstate *h, struct hugetlb_cgroup *h_cg) { long add = 0, actual_regions_needed = 0; spin_lock(&resv->lock); retry: /* Count how many regions are actually needed to execute this add. */ add_reservation_in_range(resv, f, t, NULL, NULL, &actual_regions_needed); /* * Check for sufficient descriptors in the cache to accommodate * this add operation. Note that actual_regions_needed may be greater * than in_regions_needed, as the resv_map may have been modified since * the region_chg call. In this case, we need to make sure that we * allocate extra entries, such that we have enough for all the * existing adds_in_progress, plus the excess needed for this * operation. */ if (actual_regions_needed > in_regions_needed && resv->region_cache_count < resv->adds_in_progress + (actual_regions_needed - in_regions_needed)) { /* region_add operation of range 1 should never need to * allocate file_region entries. */ VM_BUG_ON(t - f <= 1); if (allocate_file_region_entries( resv, actual_regions_needed - in_regions_needed)) { return -ENOMEM; } goto retry; } add = add_reservation_in_range(resv, f, t, h_cg, h, NULL); resv->adds_in_progress -= in_regions_needed; spin_unlock(&resv->lock); return add; } /* * Examine the existing reserve map and determine how many * huge pages in the specified range [f, t) are NOT currently * represented. This routine is called before a subsequent * call to region_add that will actually modify the reserve * map to add the specified range [f, t). region_chg does * not change the number of huge pages represented by the * map. A number of new file_region structures is added to the cache as a * placeholder, for the subsequent region_add call to use. At least 1 * file_region structure is added. * * out_regions_needed is the number of regions added to the * resv->adds_in_progress. This value needs to be provided to a follow up call * to region_add or region_abort for proper accounting. * * Returns the number of huge pages that need to be added to the existing * reservation map for the range [f, t). This number is greater or equal to * zero. -ENOMEM is returned if a new file_region structure or cache entry * is needed and can not be allocated. */ static long region_chg(struct resv_map *resv, long f, long t, long *out_regions_needed) { long chg = 0; spin_lock(&resv->lock); /* Count how many hugepages in this range are NOT represented. */ chg = add_reservation_in_range(resv, f, t, NULL, NULL, out_regions_needed); if (*out_regions_needed == 0) *out_regions_needed = 1; if (allocate_file_region_entries(resv, *out_regions_needed)) return -ENOMEM; resv->adds_in_progress += *out_regions_needed; spin_unlock(&resv->lock); return chg; } /* * Abort the in progress add operation. The adds_in_progress field * of the resv_map keeps track of the operations in progress between * calls to region_chg and region_add. Operations are sometimes * aborted after the call to region_chg. In such cases, region_abort * is called to decrement the adds_in_progress counter. regions_needed * is the value returned by the region_chg call, it is used to decrement * the adds_in_progress counter. * * NOTE: The range arguments [f, t) are not needed or used in this * routine. They are kept to make reading the calling code easier as * arguments will match the associated region_chg call. */ static void region_abort(struct resv_map *resv, long f, long t, long regions_needed) { spin_lock(&resv->lock); VM_BUG_ON(!resv->region_cache_count); resv->adds_in_progress -= regions_needed; spin_unlock(&resv->lock); } /* * Delete the specified range [f, t) from the reserve map. If the * t parameter is LONG_MAX, this indicates that ALL regions after f * should be deleted. Locate the regions which intersect [f, t) * and either trim, delete or split the existing regions. * * Returns the number of huge pages deleted from the reserve map. * In the normal case, the return value is zero or more. In the * case where a region must be split, a new region descriptor must * be allocated. If the allocation fails, -ENOMEM will be returned. * NOTE: If the parameter t == LONG_MAX, then we will never split * a region and possibly return -ENOMEM. Callers specifying * t == LONG_MAX do not need to check for -ENOMEM error. */ static long region_del(struct resv_map *resv, long f, long t) { struct list_head *head = &resv->regions; struct file_region *rg, *trg; struct file_region *nrg = NULL; long del = 0; retry: spin_lock(&resv->lock); list_for_each_entry_safe(rg, trg, head, link) { /* * Skip regions before the range to be deleted. file_region * ranges are normally of the form [from, to). However, there * may be a "placeholder" entry in the map which is of the form * (from, to) with from == to. Check for placeholder entries * at the beginning of the range to be deleted. */ if (rg->to <= f && (rg->to != rg->from || rg->to != f)) continue; if (rg->from >= t) break; if (f > rg->from && t < rg->to) { /* Must split region */ /* * Check for an entry in the cache before dropping * lock and attempting allocation. */ if (!nrg && resv->region_cache_count > resv->adds_in_progress) { nrg = list_first_entry(&resv->region_cache, struct file_region, link); list_del(&nrg->link); resv->region_cache_count--; } if (!nrg) { spin_unlock(&resv->lock); nrg = kmalloc(sizeof(*nrg), GFP_KERNEL); if (!nrg) return -ENOMEM; goto retry; } del += t - f; hugetlb_cgroup_uncharge_file_region( resv, rg, t - f, false); /* New entry for end of split region */ nrg->from = t; nrg->to = rg->to; copy_hugetlb_cgroup_uncharge_info(nrg, rg); INIT_LIST_HEAD(&nrg->link); /* Original entry is trimmed */ rg->to = f; list_add(&nrg->link, &rg->link); nrg = NULL; break; } if (f <= rg->from && t >= rg->to) { /* Remove entire region */ del += rg->to - rg->from; hugetlb_cgroup_uncharge_file_region(resv, rg, rg->to - rg->from, true); list_del(&rg->link); kfree(rg); continue; } if (f <= rg->from) { /* Trim beginning of region */ hugetlb_cgroup_uncharge_file_region(resv, rg, t - rg->from, false); del += t - rg->from; rg->from = t; } else { /* Trim end of region */ hugetlb_cgroup_uncharge_file_region(resv, rg, rg->to - f, false); del += rg->to - f; rg->to = f; } } spin_unlock(&resv->lock); kfree(nrg); return del; } /* * A rare out of memory error was encountered which prevented removal of * the reserve map region for a page. The huge page itself was free'ed * and removed from the page cache. This routine will adjust the subpool * usage count, and the global reserve count if needed. By incrementing * these counts, the reserve map entry which could not be deleted will * appear as a "reserved" entry instead of simply dangling with incorrect * counts. */ void hugetlb_fix_reserve_counts(struct inode *inode) { struct hugepage_subpool *spool = subpool_inode(inode); long rsv_adjust; bool reserved = false; rsv_adjust = hugepage_subpool_get_pages(spool, 1); if (rsv_adjust > 0) { struct hstate *h = hstate_inode(inode); if (!hugetlb_acct_memory(h, 1)) reserved = true; } else if (!rsv_adjust) { reserved = true; } if (!reserved) pr_warn("hugetlb: Huge Page Reserved count may go negative.\n"); } /* * Count and return the number of huge pages in the reserve map * that intersect with the range [f, t). */ static long region_count(struct resv_map *resv, long f, long t) { struct list_head *head = &resv->regions; struct file_region *rg; long chg = 0; spin_lock(&resv->lock); /* Locate each segment we overlap with, and count that overlap. */ list_for_each_entry(rg, head, link) { long seg_from; long seg_to; if (rg->to <= f) continue; if (rg->from >= t) break; seg_from = max(rg->from, f); seg_to = min(rg->to, t); chg += seg_to - seg_from; } spin_unlock(&resv->lock); return chg; } /* * Convert the address within this vma to the page offset within * the mapping, huge page units here. */ static pgoff_t vma_hugecache_offset(struct hstate *h, struct vm_area_struct *vma, unsigned long address) { return ((address - vma->vm_start) >> huge_page_shift(h)) + (vma->vm_pgoff >> huge_page_order(h)); } /** * vma_kernel_pagesize - Page size granularity for this VMA. * @vma: The user mapping. * * Folios in this VMA will be aligned to, and at least the size of the * number of bytes returned by this function. * * Return: The default size of the folios allocated when backing a VMA. */ unsigned long vma_kernel_pagesize(struct vm_area_struct *vma) { if (vma->vm_ops && vma->vm_ops->pagesize) return vma->vm_ops->pagesize(vma); return PAGE_SIZE; } EXPORT_SYMBOL_GPL(vma_kernel_pagesize); /* * Return the page size being used by the MMU to back a VMA. In the majority * of cases, the page size used by the kernel matches the MMU size. On * architectures where it differs, an architecture-specific 'strong' * version of this symbol is required. */ __weak unsigned long vma_mmu_pagesize(struct vm_area_struct *vma) { return vma_kernel_pagesize(vma); } /* * Flags for MAP_PRIVATE reservations. These are stored in the bottom * bits of the reservation map pointer, which are always clear due to * alignment. */ #define HPAGE_RESV_OWNER (1UL << 0) #define HPAGE_RESV_UNMAPPED (1UL << 1) #define HPAGE_RESV_MASK (HPAGE_RESV_OWNER | HPAGE_RESV_UNMAPPED) /* * These helpers are used to track how many pages are reserved for * faults in a MAP_PRIVATE mapping. Only the process that called mmap() * is guaranteed to have their future faults succeed. * * With the exception of hugetlb_dup_vma_private() which is called at fork(), * the reserve counters are updated with the hugetlb_lock held. It is safe * to reset the VMA at fork() time as it is not in use yet and there is no * chance of the global counters getting corrupted as a result of the values. * * The private mapping reservation is represented in a subtly different * manner to a shared mapping. A shared mapping has a region map associated * with the underlying file, this region map represents the backing file * pages which have ever had a reservation assigned which this persists even * after the page is instantiated. A private mapping has a region map * associated with the original mmap which is attached to all VMAs which * reference it, this region map represents those offsets which have consumed * reservation ie. where pages have been instantiated. */ static unsigned long get_vma_private_data(struct vm_area_struct *vma) { return (unsigned long)vma->vm_private_data; } static void set_vma_private_data(struct vm_area_struct *vma, unsigned long value) { vma->vm_private_data = (void *)value; } static void resv_map_set_hugetlb_cgroup_uncharge_info(struct resv_map *resv_map, struct hugetlb_cgroup *h_cg, struct hstate *h) { #ifdef CONFIG_CGROUP_HUGETLB if (!h_cg || !h) { resv_map->reservation_counter = NULL; resv_map->pages_per_hpage = 0; resv_map->css = NULL; } else { resv_map->reservation_counter = &h_cg->rsvd_hugepage[hstate_index(h)]; resv_map->pages_per_hpage = pages_per_huge_page(h); resv_map->css = &h_cg->css; } #endif } struct resv_map *resv_map_alloc(void) { struct resv_map *resv_map = kmalloc(sizeof(*resv_map), GFP_KERNEL); struct file_region *rg = kmalloc(sizeof(*rg), GFP_KERNEL); if (!resv_map || !rg) { kfree(resv_map); kfree(rg); return NULL; } kref_init(&resv_map->refs); spin_lock_init(&resv_map->lock); INIT_LIST_HEAD(&resv_map->regions); init_rwsem(&resv_map->rw_sema); resv_map->adds_in_progress = 0; /* * Initialize these to 0. On shared mappings, 0's here indicate these * fields don't do cgroup accounting. On private mappings, these will be * re-initialized to the proper values, to indicate that hugetlb cgroup * reservations are to be un-charged from here. */ resv_map_set_hugetlb_cgroup_uncharge_info(resv_map, NULL, NULL); INIT_LIST_HEAD(&resv_map->region_cache); list_add(&rg->link, &resv_map->region_cache); resv_map->region_cache_count = 1; return resv_map; } void resv_map_release(struct kref *ref) { struct resv_map *resv_map = container_of(ref, struct resv_map, refs); struct list_head *head = &resv_map->region_cache; struct file_region *rg, *trg; /* Clear out any active regions before we release the map. */ region_del(resv_map, 0, LONG_MAX); /* ... and any entries left in the cache */ list_for_each_entry_safe(rg, trg, head, link) { list_del(&rg->link); kfree(rg); } VM_BUG_ON(resv_map->adds_in_progress); kfree(resv_map); } static inline struct resv_map *inode_resv_map(struct inode *inode) { /* * At inode evict time, i_mapping may not point to the original * address space within the inode. This original address space * contains the pointer to the resv_map. So, always use the * address space embedded within the inode. * The VERY common case is inode->mapping == &inode->i_data but, * this may not be true for device special inodes. */ return (struct resv_map *)(&inode->i_data)->i_private_data; } static struct resv_map *vma_resv_map(struct vm_area_struct *vma) { VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma); if (vma->vm_flags & VM_MAYSHARE) { struct address_space *mapping = vma->vm_file->f_mapping; struct inode *inode = mapping->host; return inode_resv_map(inode); } else { return (struct resv_map *)(get_vma_private_data(vma) & ~HPAGE_RESV_MASK); } } static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map) { VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma); VM_BUG_ON_VMA(vma->vm_flags & VM_MAYSHARE, vma); set_vma_private_data(vma, (unsigned long)map); } static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags) { VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma); VM_BUG_ON_VMA(vma->vm_flags & VM_MAYSHARE, vma); set_vma_private_data(vma, get_vma_private_data(vma) | flags); } static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag) { VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma); return (get_vma_private_data(vma) & flag) != 0; } bool __vma_private_lock(struct vm_area_struct *vma) { return !(vma->vm_flags & VM_MAYSHARE) && get_vma_private_data(vma) & ~HPAGE_RESV_MASK && is_vma_resv_set(vma, HPAGE_RESV_OWNER); } void hugetlb_dup_vma_private(struct vm_area_struct *vma) { VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma); /* * Clear vm_private_data * - For shared mappings this is a per-vma semaphore that may be * allocated in a subsequent call to hugetlb_vm_op_open. * Before clearing, make sure pointer is not associated with vma * as this will leak the structure. This is the case when called * via clear_vma_resv_huge_pages() and hugetlb_vm_op_open has already * been called to allocate a new structure. * - For MAP_PRIVATE mappings, this is the reserve map which does * not apply to children. Faults generated by the children are * not guaranteed to succeed, even if read-only. */ if (vma->vm_flags & VM_MAYSHARE) { struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; if (vma_lock && vma_lock->vma != vma) vma->vm_private_data = NULL; } else vma->vm_private_data = NULL; } /* * Reset and decrement one ref on hugepage private reservation. * Called with mm->mmap_lock writer semaphore held. * This function should be only used by move_vma() and operate on * same sized vma. It should never come here with last ref on the * reservation. */ void clear_vma_resv_huge_pages(struct vm_area_struct *vma) { /* * Clear the old hugetlb private page reservation. * It has already been transferred to new_vma. * * During a mremap() operation of a hugetlb vma we call move_vma() * which copies vma into new_vma and unmaps vma. After the copy * operation both new_vma and vma share a reference to the resv_map * struct, and at that point vma is about to be unmapped. We don't * want to return the reservation to the pool at unmap of vma because * the reservation still lives on in new_vma, so simply decrement the * ref here and remove the resv_map reference from this vma. */ struct resv_map *reservations = vma_resv_map(vma); if (reservations && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { resv_map_put_hugetlb_cgroup_uncharge_info(reservations); kref_put(&reservations->refs, resv_map_release); } hugetlb_dup_vma_private(vma); } static void enqueue_hugetlb_folio(struct hstate *h, struct folio *folio) { int nid = folio_nid(folio); lockdep_assert_held(&hugetlb_lock); VM_BUG_ON_FOLIO(folio_ref_count(folio), folio); list_move(&folio->lru, &h->hugepage_freelists[nid]); h->free_huge_pages++; h->free_huge_pages_node[nid]++; folio_set_hugetlb_freed(folio); } static struct folio *dequeue_hugetlb_folio_node_exact(struct hstate *h, int nid) { struct folio *folio; bool pin = !!(current->flags & PF_MEMALLOC_PIN); lockdep_assert_held(&hugetlb_lock); list_for_each_entry(folio, &h->hugepage_freelists[nid], lru) { if (pin && !folio_is_longterm_pinnable(folio)) continue; if (folio_test_hwpoison(folio)) continue; if (is_migrate_isolate_page(&folio->page)) continue; list_move(&folio->lru, &h->hugepage_activelist); folio_ref_unfreeze(folio, 1); folio_clear_hugetlb_freed(folio); h->free_huge_pages--; h->free_huge_pages_node[nid]--; return folio; } return NULL; } static struct folio *dequeue_hugetlb_folio_nodemask(struct hstate *h, gfp_t gfp_mask, int nid, nodemask_t *nmask) { unsigned int cpuset_mems_cookie; struct zonelist *zonelist; struct zone *zone; struct zoneref *z; int node = NUMA_NO_NODE; /* 'nid' should not be NUMA_NO_NODE. Try to catch any misuse of it and rectifiy. */ if (nid == NUMA_NO_NODE) nid = numa_node_id(); zonelist = node_zonelist(nid, gfp_mask); retry_cpuset: cpuset_mems_cookie = read_mems_allowed_begin(); for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(gfp_mask), nmask) { struct folio *folio; if (!cpuset_zone_allowed(zone, gfp_mask)) continue; /* * no need to ask again on the same node. Pool is node rather than * zone aware */ if (zone_to_nid(zone) == node) continue; node = zone_to_nid(zone); folio = dequeue_hugetlb_folio_node_exact(h, node); if (folio) return folio; } if (unlikely(read_mems_allowed_retry(cpuset_mems_cookie))) goto retry_cpuset; return NULL; } static unsigned long available_huge_pages(struct hstate *h) { return h->free_huge_pages - h->resv_huge_pages; } static struct folio *dequeue_hugetlb_folio_vma(struct hstate *h, struct vm_area_struct *vma, unsigned long address, long gbl_chg) { struct folio *folio = NULL; struct mempolicy *mpol; gfp_t gfp_mask; nodemask_t *nodemask; int nid; /* * gbl_chg==1 means the allocation requires a new page that was not * reserved before. Making sure there's at least one free page. */ if (gbl_chg && !available_huge_pages(h)) goto err; gfp_mask = htlb_alloc_mask(h); nid = huge_node(vma, address, gfp_mask, &mpol, &nodemask); if (mpol_is_preferred_many(mpol)) { folio = dequeue_hugetlb_folio_nodemask(h, gfp_mask, nid, nodemask); /* Fallback to all nodes if page==NULL */ nodemask = NULL; } if (!folio) folio = dequeue_hugetlb_folio_nodemask(h, gfp_mask, nid, nodemask); mpol_cond_put(mpol); return folio; err: return NULL; } /* * common helper functions for hstate_next_node_to_{alloc|free}. * We may have allocated or freed a huge page based on a different * nodes_allowed previously, so h->next_node_to_{alloc|free} might * be outside of *nodes_allowed. Ensure that we use an allowed * node for alloc or free. */ static int next_node_allowed(int nid, nodemask_t *nodes_allowed) { nid = next_node_in(nid, *nodes_allowed); VM_BUG_ON(nid >= MAX_NUMNODES); return nid; } static int get_valid_node_allowed(int nid, nodemask_t *nodes_allowed) { if (!node_isset(nid, *nodes_allowed)) nid = next_node_allowed(nid, nodes_allowed); return nid; } /* * returns the previously saved node ["this node"] from which to * allocate a persistent huge page for the pool and advance the * next node from which to allocate, handling wrap at end of node * mask. */ static int hstate_next_node_to_alloc(int *next_node, nodemask_t *nodes_allowed) { int nid; VM_BUG_ON(!nodes_allowed); nid = get_valid_node_allowed(*next_node, nodes_allowed); *next_node = next_node_allowed(nid, nodes_allowed); return nid; } /* * helper for remove_pool_hugetlb_folio() - return the previously saved * node ["this node"] from which to free a huge page. Advance the * next node id whether or not we find a free huge page to free so * that the next attempt to free addresses the next node. */ static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed) { int nid; VM_BUG_ON(!nodes_allowed); nid = get_valid_node_allowed(h->next_nid_to_free, nodes_allowed); h->next_nid_to_free = next_node_allowed(nid, nodes_allowed); return nid; } #define for_each_node_mask_to_alloc(next_node, nr_nodes, node, mask) \ for (nr_nodes = nodes_weight(*mask); \ nr_nodes > 0 && \ ((node = hstate_next_node_to_alloc(next_node, mask)) || 1); \ nr_nodes--) #define for_each_node_mask_to_free(hs, nr_nodes, node, mask) \ for (nr_nodes = nodes_weight(*mask); \ nr_nodes > 0 && \ ((node = hstate_next_node_to_free(hs, mask)) || 1); \ nr_nodes--) #ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE #ifdef CONFIG_CONTIG_ALLOC static struct folio *alloc_gigantic_folio(struct hstate *h, gfp_t gfp_mask, int nid, nodemask_t *nodemask) { struct folio *folio; int order = huge_page_order(h); bool retried = false; if (nid == NUMA_NO_NODE) nid = numa_mem_id(); retry: folio = hugetlb_cma_alloc_folio(h, gfp_mask, nid, nodemask); if (!folio) { if (hugetlb_cma_exclusive_alloc()) return NULL; folio = folio_alloc_gigantic(order, gfp_mask, nid, nodemask); if (!folio) return NULL; } if (folio_ref_freeze(folio, 1)) return folio; pr_warn("HugeTLB: unexpected refcount on PFN %lu\n", folio_pfn(folio)); hugetlb_free_folio(folio); if (!retried) { retried = true; goto retry; } return NULL; } #else /* !CONFIG_CONTIG_ALLOC */ static struct folio *alloc_gigantic_folio(struct hstate *h, gfp_t gfp_mask, int nid, nodemask_t *nodemask) { return NULL; } #endif /* CONFIG_CONTIG_ALLOC */ #else /* !CONFIG_ARCH_HAS_GIGANTIC_PAGE */ static struct folio *alloc_gigantic_folio(struct hstate *h, gfp_t gfp_mask, int nid, nodemask_t *nodemask) { return NULL; } #endif /* * Remove hugetlb folio from lists. * If vmemmap exists for the folio, clear the hugetlb flag so that the * folio appears as just a compound page. Otherwise, wait until after * allocating vmemmap to clear the flag. * * Must be called with hugetlb lock held. */ static void remove_hugetlb_folio(struct hstate *h, struct folio *folio, bool adjust_surplus) { int nid = folio_nid(folio); VM_BUG_ON_FOLIO(hugetlb_cgroup_from_folio(folio), folio); VM_BUG_ON_FOLIO(hugetlb_cgroup_from_folio_rsvd(folio), folio); lockdep_assert_held(&hugetlb_lock); if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported()) return; list_del(&folio->lru); if (folio_test_hugetlb_freed(folio)) { folio_clear_hugetlb_freed(folio); h->free_huge_pages--; h->free_huge_pages_node[nid]--; } if (adjust_surplus) { h->surplus_huge_pages--; h->surplus_huge_pages_node[nid]--; } /* * We can only clear the hugetlb flag after allocating vmemmap * pages. Otherwise, someone (memory error handling) may try to write * to tail struct pages. */ if (!folio_test_hugetlb_vmemmap_optimized(folio)) __folio_clear_hugetlb(folio); h->nr_huge_pages--; h->nr_huge_pages_node[nid]--; } static void add_hugetlb_folio(struct hstate *h, struct folio *folio, bool adjust_surplus) { int nid = folio_nid(folio); VM_BUG_ON_FOLIO(!folio_test_hugetlb_vmemmap_optimized(folio), folio); lockdep_assert_held(&hugetlb_lock); INIT_LIST_HEAD(&folio->lru); h->nr_huge_pages++; h->nr_huge_pages_node[nid]++; if (adjust_surplus) { h->surplus_huge_pages++; h->surplus_huge_pages_node[nid]++; } __folio_set_hugetlb(folio); folio_change_private(folio, NULL); /* * We have to set hugetlb_vmemmap_optimized again as above * folio_change_private(folio, NULL) cleared it. */ folio_set_hugetlb_vmemmap_optimized(folio); arch_clear_hugetlb_flags(folio); enqueue_hugetlb_folio(h, folio); } static void __update_and_free_hugetlb_folio(struct hstate *h, struct folio *folio) { bool clear_flag = folio_test_hugetlb_vmemmap_optimized(folio); if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported()) return; /* * If we don't know which subpages are hwpoisoned, we can't free * the hugepage, so it's leaked intentionally. */ if (folio_test_hugetlb_raw_hwp_unreliable(folio)) return; /* * If folio is not vmemmap optimized (!clear_flag), then the folio * is no longer identified as a hugetlb page. hugetlb_vmemmap_restore_folio * can only be passed hugetlb pages and will BUG otherwise. */ if (clear_flag && hugetlb_vmemmap_restore_folio(h, folio)) { spin_lock_irq(&hugetlb_lock); /* * If we cannot allocate vmemmap pages, just refuse to free the * page and put the page back on the hugetlb free list and treat * as a surplus page. */ add_hugetlb_folio(h, folio, true); spin_unlock_irq(&hugetlb_lock); return; } /* * If vmemmap pages were allocated above, then we need to clear the * hugetlb flag under the hugetlb lock. */ if (folio_test_hugetlb(folio)) { spin_lock_irq(&hugetlb_lock); __folio_clear_hugetlb(folio); spin_unlock_irq(&hugetlb_lock); } /* * Move PageHWPoison flag from head page to the raw error pages, * which makes any healthy subpages reusable. */ if (unlikely(folio_test_hwpoison(folio))) folio_clear_hugetlb_hwpoison(folio); folio_ref_unfreeze(folio, 1); hugetlb_free_folio(folio); } /* * As update_and_free_hugetlb_folio() can be called under any context, so we cannot * use GFP_KERNEL to allocate vmemmap pages. However, we can defer the * actual freeing in a workqueue to prevent from using GFP_ATOMIC to allocate * the vmemmap pages. * * free_hpage_workfn() locklessly retrieves the linked list of pages to be * freed and frees them one-by-one. As the page->mapping pointer is going * to be cleared in free_hpage_workfn() anyway, it is reused as the llist_node * structure of a lockless linked list of huge pages to be freed. */ static LLIST_HEAD(hpage_freelist); static void free_hpage_workfn(struct work_struct *work) { struct llist_node *node; node = llist_del_all(&hpage_freelist); while (node) { struct folio *folio; struct hstate *h; folio = container_of((struct address_space **)node, struct folio, mapping); node = node->next; folio->mapping = NULL; /* * The VM_BUG_ON_FOLIO(!folio_test_hugetlb(folio), folio) in * folio_hstate() is going to trigger because a previous call to * remove_hugetlb_folio() will clear the hugetlb bit, so do * not use folio_hstate() directly. */ h = size_to_hstate(folio_size(folio)); __update_and_free_hugetlb_folio(h, folio); cond_resched(); } } static DECLARE_WORK(free_hpage_work, free_hpage_workfn); static inline void flush_free_hpage_work(struct hstate *h) { if (hugetlb_vmemmap_optimizable(h)) flush_work(&free_hpage_work); } static void update_and_free_hugetlb_folio(struct hstate *h, struct folio *folio, bool atomic) { if (!folio_test_hugetlb_vmemmap_optimized(folio) || !atomic) { __update_and_free_hugetlb_folio(h, folio); return; } /* * Defer freeing to avoid using GFP_ATOMIC to allocate vmemmap pages. * * Only call schedule_work() if hpage_freelist is previously * empty. Otherwise, schedule_work() had been called but the workfn * hasn't retrieved the list yet. */ if (llist_add((struct llist_node *)&folio->mapping, &hpage_freelist)) schedule_work(&free_hpage_work); } static void bulk_vmemmap_restore_error(struct hstate *h, struct list_head *folio_list, struct list_head *non_hvo_folios) { struct folio *folio, *t_folio; if (!list_empty(non_hvo_folios)) { /* * Free any restored hugetlb pages so that restore of the * entire list can be retried. * The idea is that in the common case of ENOMEM errors freeing * hugetlb pages with vmemmap we will free up memory so that we * can allocate vmemmap for more hugetlb pages. */ list_for_each_entry_safe(folio, t_folio, non_hvo_folios, lru) { list_del(&folio->lru); spin_lock_irq(&hugetlb_lock); __folio_clear_hugetlb(folio); spin_unlock_irq(&hugetlb_lock); update_and_free_hugetlb_folio(h, folio, false); cond_resched(); } } else { /* * In the case where there are no folios which can be * immediately freed, we loop through the list trying to restore * vmemmap individually in the hope that someone elsewhere may * have done something to cause success (such as freeing some * memory). If unable to restore a hugetlb page, the hugetlb * page is made a surplus page and removed from the list. * If are able to restore vmemmap and free one hugetlb page, we * quit processing the list to retry the bulk operation. */ list_for_each_entry_safe(folio, t_folio, folio_list, lru) if (hugetlb_vmemmap_restore_folio(h, folio)) { list_del(&folio->lru); spin_lock_irq(&hugetlb_lock); add_hugetlb_folio(h, folio, true); spin_unlock_irq(&hugetlb_lock); } else { list_del(&folio->lru); spin_lock_irq(&hugetlb_lock); __folio_clear_hugetlb(folio); spin_unlock_irq(&hugetlb_lock); update_and_free_hugetlb_folio(h, folio, false); cond_resched(); break; } } } static void update_and_free_pages_bulk(struct hstate *h, struct list_head *folio_list) { long ret; struct folio *folio, *t_folio; LIST_HEAD(non_hvo_folios); /* * First allocate required vmemmmap (if necessary) for all folios. * Carefully handle errors and free up any available hugetlb pages * in an effort to make forward progress. */ retry: ret = hugetlb_vmemmap_restore_folios(h, folio_list, &non_hvo_folios); if (ret < 0) { bulk_vmemmap_restore_error(h, folio_list, &non_hvo_folios); goto retry; } /* * At this point, list should be empty, ret should be >= 0 and there * should only be pages on the non_hvo_folios list. * Do note that the non_hvo_folios list could be empty. * Without HVO enabled, ret will be 0 and there is no need to call * __folio_clear_hugetlb as this was done previously. */ VM_WARN_ON(!list_empty(folio_list)); VM_WARN_ON(ret < 0); if (!list_empty(&non_hvo_folios) && ret) { spin_lock_irq(&hugetlb_lock); list_for_each_entry(folio, &non_hvo_folios, lru) __folio_clear_hugetlb(folio); spin_unlock_irq(&hugetlb_lock); } list_for_each_entry_safe(folio, t_folio, &non_hvo_folios, lru) { update_and_free_hugetlb_folio(h, folio, false); cond_resched(); } } struct hstate *size_to_hstate(unsigned long size) { struct hstate *h; for_each_hstate(h) { if (huge_page_size(h) == size) return h; } return NULL; } void free_huge_folio(struct folio *folio) { /* * Can't pass hstate in here because it is called from the * generic mm code. */ struct hstate *h = folio_hstate(folio); int nid = folio_nid(folio); struct hugepage_subpool *spool = hugetlb_folio_subpool(folio); bool restore_reserve; unsigned long flags; VM_BUG_ON_FOLIO(folio_ref_count(folio), folio); VM_BUG_ON_FOLIO(folio_mapcount(folio), folio); hugetlb_set_folio_subpool(folio, NULL); if (folio_test_anon(folio)) __ClearPageAnonExclusive(&folio->page); folio->mapping = NULL; restore_reserve = folio_test_hugetlb_restore_reserve(folio); folio_clear_hugetlb_restore_reserve(folio); /* * If HPageRestoreReserve was set on page, page allocation consumed a * reservation. If the page was associated with a subpool, there * would have been a page reserved in the subpool before allocation * via hugepage_subpool_get_pages(). Since we are 'restoring' the * reservation, do not call hugepage_subpool_put_pages() as this will * remove the reserved page from the subpool. */ if (!restore_reserve) { /* * A return code of zero implies that the subpool will be * under its minimum size if the reservation is not restored * after page is free. Therefore, force restore_reserve * operation. */ if (hugepage_subpool_put_pages(spool, 1) == 0) restore_reserve = true; } spin_lock_irqsave(&hugetlb_lock, flags); folio_clear_hugetlb_migratable(folio); hugetlb_cgroup_uncharge_folio(hstate_index(h), pages_per_huge_page(h), folio); hugetlb_cgroup_uncharge_folio_rsvd(hstate_index(h), pages_per_huge_page(h), folio); lruvec_stat_mod_folio(folio, NR_HUGETLB, -pages_per_huge_page(h)); mem_cgroup_uncharge(folio); if (restore_reserve) h->resv_huge_pages++; if (folio_test_hugetlb_temporary(folio)) { remove_hugetlb_folio(h, folio, false); spin_unlock_irqrestore(&hugetlb_lock, flags); update_and_free_hugetlb_folio(h, folio, true); } else if (h->surplus_huge_pages_node[nid]) { /* remove the page from active list */ remove_hugetlb_folio(h, folio, true); spin_unlock_irqrestore(&hugetlb_lock, flags); update_and_free_hugetlb_folio(h, folio, true); } else { arch_clear_hugetlb_flags(folio); enqueue_hugetlb_folio(h, folio); spin_unlock_irqrestore(&hugetlb_lock, flags); } } /* * Must be called with the hugetlb lock held */ static void __prep_account_new_huge_page(struct hstate *h, int nid) { lockdep_assert_held(&hugetlb_lock); h->nr_huge_pages++; h->nr_huge_pages_node[nid]++; } static void init_new_hugetlb_folio(struct hstate *h, struct folio *folio) { __folio_set_hugetlb(folio); INIT_LIST_HEAD(&folio->lru); hugetlb_set_folio_subpool(folio, NULL); set_hugetlb_cgroup(folio, NULL); set_hugetlb_cgroup_rsvd(folio, NULL); } static void __prep_new_hugetlb_folio(struct hstate *h, struct folio *folio) { init_new_hugetlb_folio(h, folio); hugetlb_vmemmap_optimize_folio(h, folio); } static void prep_new_hugetlb_folio(struct hstate *h, struct folio *folio, int nid) { __prep_new_hugetlb_folio(h, folio); spin_lock_irq(&hugetlb_lock); __prep_account_new_huge_page(h, nid); spin_unlock_irq(&hugetlb_lock); } /* * Find and lock address space (mapping) in write mode. * * Upon entry, the folio is locked which means that folio_mapping() is * stable. Due to locking order, we can only trylock_write. If we can * not get the lock, simply return NULL to caller. */ struct address_space *hugetlb_folio_mapping_lock_write(struct folio *folio) { struct address_space *mapping = folio_mapping(folio); if (!mapping) return mapping; if (i_mmap_trylock_write(mapping)) return mapping; return NULL; } static struct folio *alloc_buddy_hugetlb_folio(struct hstate *h, gfp_t gfp_mask, int nid, nodemask_t *nmask, nodemask_t *node_alloc_noretry) { int order = huge_page_order(h); struct folio *folio; bool alloc_try_hard = true; bool retry = true; /* * By default we always try hard to allocate the folio with * __GFP_RETRY_MAYFAIL flag. However, if we are allocating folios in * a loop (to adjust global huge page counts) and previous allocation * failed, do not continue to try hard on the same node. Use the * node_alloc_noretry bitmap to manage this state information. */ if (node_alloc_noretry && node_isset(nid, *node_alloc_noretry)) alloc_try_hard = false; if (alloc_try_hard) gfp_mask |= __GFP_RETRY_MAYFAIL; if (nid == NUMA_NO_NODE) nid = numa_mem_id(); retry: folio = __folio_alloc(gfp_mask, order, nid, nmask); /* Ensure hugetlb folio won't have large_rmappable flag set. */ if (folio) folio_clear_large_rmappable(folio); if (folio && !folio_ref_freeze(folio, 1)) { folio_put(folio); if (retry) { /* retry once */ retry = false; goto retry; } /* WOW! twice in a row. */ pr_warn("HugeTLB unexpected inflated folio ref count\n"); folio = NULL; } /* * If we did not specify __GFP_RETRY_MAYFAIL, but still got a * folio this indicates an overall state change. Clear bit so * that we resume normal 'try hard' allocations. */ if (node_alloc_noretry && folio && !alloc_try_hard) node_clear(nid, *node_alloc_noretry); /* * If we tried hard to get a folio but failed, set bit so that * subsequent attempts will not try as hard until there is an * overall state change. */ if (node_alloc_noretry && !folio && alloc_try_hard) node_set(nid, *node_alloc_noretry); if (!folio) { __count_vm_event(HTLB_BUDDY_PGALLOC_FAIL); return NULL; } __count_vm_event(HTLB_BUDDY_PGALLOC); return folio; } static struct folio *only_alloc_fresh_hugetlb_folio(struct hstate *h, gfp_t gfp_mask, int nid, nodemask_t *nmask, nodemask_t *node_alloc_noretry) { struct folio *folio; if (hstate_is_gigantic(h)) folio = alloc_gigantic_folio(h, gfp_mask, nid, nmask); else folio = alloc_buddy_hugetlb_folio(h, gfp_mask, nid, nmask, node_alloc_noretry); if (folio) init_new_hugetlb_folio(h, folio); return folio; } /* * Common helper to allocate a fresh hugetlb page. All specific allocators * should use this function to get new hugetlb pages * * Note that returned page is 'frozen': ref count of head page and all tail * pages is zero. */ static struct folio *alloc_fresh_hugetlb_folio(struct hstate *h, gfp_t gfp_mask, int nid, nodemask_t *nmask) { struct folio *folio; if (hstate_is_gigantic(h)) folio = alloc_gigantic_folio(h, gfp_mask, nid, nmask); else folio = alloc_buddy_hugetlb_folio(h, gfp_mask, nid, nmask, NULL); if (!folio) return NULL; prep_new_hugetlb_folio(h, folio, folio_nid(folio)); return folio; } static void prep_and_add_allocated_folios(struct hstate *h, struct list_head *folio_list) { unsigned long flags; struct folio *folio, *tmp_f; /* Send list for bulk vmemmap optimization processing */ hugetlb_vmemmap_optimize_folios(h, folio_list); /* Add all new pool pages to free lists in one lock cycle */ spin_lock_irqsave(&hugetlb_lock, flags); list_for_each_entry_safe(folio, tmp_f, folio_list, lru) { __prep_account_new_huge_page(h, folio_nid(folio)); enqueue_hugetlb_folio(h, folio); } spin_unlock_irqrestore(&hugetlb_lock, flags); } /* * Allocates a fresh hugetlb page in a node interleaved manner. The page * will later be added to the appropriate hugetlb pool. */ static struct folio *alloc_pool_huge_folio(struct hstate *h, nodemask_t *nodes_allowed, nodemask_t *node_alloc_noretry, int *next_node) { gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE; int nr_nodes, node; for_each_node_mask_to_alloc(next_node, nr_nodes, node, nodes_allowed) { struct folio *folio; folio = only_alloc_fresh_hugetlb_folio(h, gfp_mask, node, nodes_allowed, node_alloc_noretry); if (folio) return folio; } return NULL; } /* * Remove huge page from pool from next node to free. Attempt to keep * persistent huge pages more or less balanced over allowed nodes. * This routine only 'removes' the hugetlb page. The caller must make * an additional call to free the page to low level allocators. * Called with hugetlb_lock locked. */ static struct folio *remove_pool_hugetlb_folio(struct hstate *h, nodemask_t *nodes_allowed, bool acct_surplus) { int nr_nodes, node; struct folio *folio = NULL; lockdep_assert_held(&hugetlb_lock); for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) { /* * If we're returning unused surplus pages, only examine * nodes with surplus pages. */ if ((!acct_surplus || h->surplus_huge_pages_node[node]) && !list_empty(&h->hugepage_freelists[node])) { folio = list_entry(h->hugepage_freelists[node].next, struct folio, lru); remove_hugetlb_folio(h, folio, acct_surplus); break; } } return folio; } /* * Dissolve a given free hugetlb folio into free buddy pages. This function * does nothing for in-use hugetlb folios and non-hugetlb folios. * This function returns values like below: * * -ENOMEM: failed to allocate vmemmap pages to free the freed hugepages * when the system is under memory pressure and the feature of * freeing unused vmemmap pages associated with each hugetlb page * is enabled. * -EBUSY: failed to dissolved free hugepages or the hugepage is in-use * (allocated or reserved.) * 0: successfully dissolved free hugepages or the page is not a * hugepage (considered as already dissolved) */ int dissolve_free_hugetlb_folio(struct folio *folio) { int rc = -EBUSY; retry: /* Not to disrupt normal path by vainly holding hugetlb_lock */ if (!folio_test_hugetlb(folio)) return 0; spin_lock_irq(&hugetlb_lock); if (!folio_test_hugetlb(folio)) { rc = 0; goto out; } if (!folio_ref_count(folio)) { struct hstate *h = folio_hstate(folio); bool adjust_surplus = false; if (!available_huge_pages(h)) goto out; /* * We should make sure that the page is already on the free list * when it is dissolved. */ if (unlikely(!folio_test_hugetlb_freed(folio))) { spin_unlock_irq(&hugetlb_lock); cond_resched(); /* * Theoretically, we should return -EBUSY when we * encounter this race. In fact, we have a chance * to successfully dissolve the page if we do a * retry. Because the race window is quite small. * If we seize this opportunity, it is an optimization * for increasing the success rate of dissolving page. */ goto retry; } if (h->surplus_huge_pages_node[folio_nid(folio)]) adjust_surplus = true; remove_hugetlb_folio(h, folio, adjust_surplus); h->max_huge_pages--; spin_unlock_irq(&hugetlb_lock); /* * Normally update_and_free_hugtlb_folio will allocate required vmemmmap * before freeing the page. update_and_free_hugtlb_folio will fail to * free the page if it can not allocate required vmemmap. We * need to adjust max_huge_pages if the page is not freed. * Attempt to allocate vmemmmap here so that we can take * appropriate action on failure. * * The folio_test_hugetlb check here is because * remove_hugetlb_folio will clear hugetlb folio flag for * non-vmemmap optimized hugetlb folios. */ if (folio_test_hugetlb(folio)) { rc = hugetlb_vmemmap_restore_folio(h, folio); if (rc) { spin_lock_irq(&hugetlb_lock); add_hugetlb_folio(h, folio, adjust_surplus); h->max_huge_pages++; goto out; } } else rc = 0; update_and_free_hugetlb_folio(h, folio, false); return rc; } out: spin_unlock_irq(&hugetlb_lock); return rc; } /* * Dissolve free hugepages in a given pfn range. Used by memory hotplug to * make specified memory blocks removable from the system. * Note that this will dissolve a free gigantic hugepage completely, if any * part of it lies within the given range. * Also note that if dissolve_free_hugetlb_folio() returns with an error, all * free hugetlb folios that were dissolved before that error are lost. */ int dissolve_free_hugetlb_folios(unsigned long start_pfn, unsigned long end_pfn) { unsigned long pfn; struct folio *folio; int rc = 0; unsigned int order; struct hstate *h; if (!hugepages_supported()) return rc; order = huge_page_order(&default_hstate); for_each_hstate(h) order = min(order, huge_page_order(h)); for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << order) { folio = pfn_folio(pfn); rc = dissolve_free_hugetlb_folio(folio); if (rc) break; } return rc; } /* * Allocates a fresh surplus page from the page allocator. */ static struct folio *alloc_surplus_hugetlb_folio(struct hstate *h, gfp_t gfp_mask, int nid, nodemask_t *nmask) { struct folio *folio = NULL; if (hstate_is_gigantic(h)) return NULL; spin_lock_irq(&hugetlb_lock); if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) goto out_unlock; spin_unlock_irq(&hugetlb_lock); folio = only_alloc_fresh_hugetlb_folio(h, gfp_mask, nid, nmask, NULL); if (!folio) return NULL; hugetlb_vmemmap_optimize_folio(h, folio); spin_lock_irq(&hugetlb_lock); /* * nr_huge_pages needs to be adjusted within the same lock cycle * as surplus_pages, otherwise it might confuse * persistent_huge_pages() momentarily. */ __prep_account_new_huge_page(h, folio_nid(folio)); /* * We could have raced with the pool size change. * Double check that and simply deallocate the new page * if we would end up overcommiting the surpluses. Abuse * temporary page to workaround the nasty free_huge_folio * codeflow */ if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) { folio_set_hugetlb_temporary(folio); spin_unlock_irq(&hugetlb_lock); free_huge_folio(folio); return NULL; } h->surplus_huge_pages++; h->surplus_huge_pages_node[folio_nid(folio)]++; out_unlock: spin_unlock_irq(&hugetlb_lock); return folio; } static struct folio *alloc_migrate_hugetlb_folio(struct hstate *h, gfp_t gfp_mask, int nid, nodemask_t *nmask) { struct folio *folio; if (hstate_is_gigantic(h)) return NULL; folio = alloc_fresh_hugetlb_folio(h, gfp_mask, nid, nmask); if (!folio) return NULL; /* fresh huge pages are frozen */ folio_ref_unfreeze(folio, 1); /* * We do not account these pages as surplus because they are only * temporary and will be released properly on the last reference */ folio_set_hugetlb_temporary(folio); return folio; } /* * Use the VMA's mpolicy to allocate a huge page from the buddy. */ static struct folio *alloc_buddy_hugetlb_folio_with_mpol(struct hstate *h, struct vm_area_struct *vma, unsigned long addr) { struct folio *folio = NULL; struct mempolicy *mpol; gfp_t gfp_mask = htlb_alloc_mask(h); int nid; nodemask_t *nodemask; nid = huge_node(vma, addr, gfp_mask, &mpol, &nodemask); if (mpol_is_preferred_many(mpol)) { gfp_t gfp = gfp_mask & ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL); folio = alloc_surplus_hugetlb_folio(h, gfp, nid, nodemask); /* Fallback to all nodes if page==NULL */ nodemask = NULL; } if (!folio) folio = alloc_surplus_hugetlb_folio(h, gfp_mask, nid, nodemask); mpol_cond_put(mpol); return folio; } struct folio *alloc_hugetlb_folio_reserve(struct hstate *h, int preferred_nid, nodemask_t *nmask, gfp_t gfp_mask) { struct folio *folio; spin_lock_irq(&hugetlb_lock); folio = dequeue_hugetlb_folio_nodemask(h, gfp_mask, preferred_nid, nmask); if (folio) { VM_BUG_ON(!h->resv_huge_pages); h->resv_huge_pages--; } spin_unlock_irq(&hugetlb_lock); return folio; } /* folio migration callback function */ struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid, nodemask_t *nmask, gfp_t gfp_mask, bool allow_alloc_fallback) { spin_lock_irq(&hugetlb_lock); if (available_huge_pages(h)) { struct folio *folio; folio = dequeue_hugetlb_folio_nodemask(h, gfp_mask, preferred_nid, nmask); if (folio) { spin_unlock_irq(&hugetlb_lock); return folio; } } spin_unlock_irq(&hugetlb_lock); /* We cannot fallback to other nodes, as we could break the per-node pool. */ if (!allow_alloc_fallback) gfp_mask |= __GFP_THISNODE; return alloc_migrate_hugetlb_folio(h, gfp_mask, preferred_nid, nmask); } static nodemask_t *policy_mbind_nodemask(gfp_t gfp) { #ifdef CONFIG_NUMA struct mempolicy *mpol = get_task_policy(current); /* * Only enforce MPOL_BIND policy which overlaps with cpuset policy * (from policy_nodemask) specifically for hugetlb case */ if (mpol->mode == MPOL_BIND && (apply_policy_zone(mpol, gfp_zone(gfp)) && cpuset_nodemask_valid_mems_allowed(&mpol->nodes))) return &mpol->nodes; #endif return NULL; } /* * Increase the hugetlb pool such that it can accommodate a reservation * of size 'delta'. */ static int gather_surplus_pages(struct hstate *h, long delta) __must_hold(&hugetlb_lock) { LIST_HEAD(surplus_list); struct folio *folio, *tmp; int ret; long i; long needed, allocated; bool alloc_ok = true; int node; nodemask_t *mbind_nodemask, alloc_nodemask; mbind_nodemask = policy_mbind_nodemask(htlb_alloc_mask(h)); if (mbind_nodemask) nodes_and(alloc_nodemask, *mbind_nodemask, cpuset_current_mems_allowed); else alloc_nodemask = cpuset_current_mems_allowed; lockdep_assert_held(&hugetlb_lock); needed = (h->resv_huge_pages + delta) - h->free_huge_pages; if (needed <= 0) { h->resv_huge_pages += delta; return 0; } allocated = 0; ret = -ENOMEM; retry: spin_unlock_irq(&hugetlb_lock); for (i = 0; i < needed; i++) { folio = NULL; /* Prioritize current node */ if (node_isset(numa_mem_id(), alloc_nodemask)) folio = alloc_surplus_hugetlb_folio(h, htlb_alloc_mask(h), numa_mem_id(), NULL); if (!folio) { for_each_node_mask(node, alloc_nodemask) { if (node == numa_mem_id()) continue; folio = alloc_surplus_hugetlb_folio(h, htlb_alloc_mask(h), node, NULL); if (folio) break; } } if (!folio) { alloc_ok = false; break; } list_add(&folio->lru, &surplus_list); cond_resched(); } allocated += i; /* * After retaking hugetlb_lock, we need to recalculate 'needed' * because either resv_huge_pages or free_huge_pages may have changed. */ spin_lock_irq(&hugetlb_lock); needed = (h->resv_huge_pages + delta) - (h->free_huge_pages + allocated); if (needed > 0) { if (alloc_ok) goto retry; /* * We were not able to allocate enough pages to * satisfy the entire reservation so we free what * we've allocated so far. */ goto free; } /* * The surplus_list now contains _at_least_ the number of extra pages * needed to accommodate the reservation. Add the appropriate number * of pages to the hugetlb pool and free the extras back to the buddy * allocator. Commit the entire reservation here to prevent another * process from stealing the pages as they are added to the pool but * before they are reserved. */ needed += allocated; h->resv_huge_pages += delta; ret = 0; /* Free the needed pages to the hugetlb pool */ list_for_each_entry_safe(folio, tmp, &surplus_list, lru) { if ((--needed) < 0) break; /* Add the page to the hugetlb allocator */ enqueue_hugetlb_folio(h, folio); } free: spin_unlock_irq(&hugetlb_lock); /* * Free unnecessary surplus pages to the buddy allocator. * Pages have no ref count, call free_huge_folio directly. */ list_for_each_entry_safe(folio, tmp, &surplus_list, lru) free_huge_folio(folio); spin_lock_irq(&hugetlb_lock); return ret; } /* * This routine has two main purposes: * 1) Decrement the reservation count (resv_huge_pages) by the value passed * in unused_resv_pages. This corresponds to the prior adjustments made * to the associated reservation map. * 2) Free any unused surplus pages that may have been allocated to satisfy * the reservation. As many as unused_resv_pages may be freed. */ static void return_unused_surplus_pages(struct hstate *h, unsigned long unused_resv_pages) { unsigned long nr_pages; LIST_HEAD(page_list); lockdep_assert_held(&hugetlb_lock); /* Uncommit the reservation */ h->resv_huge_pages -= unused_resv_pages; if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported()) goto out; /* * Part (or even all) of the reservation could have been backed * by pre-allocated pages. Only free surplus pages. */ nr_pages = min(unused_resv_pages, h->surplus_huge_pages); /* * We want to release as many surplus pages as possible, spread * evenly across all nodes with memory. Iterate across these nodes * until we can no longer free unreserved surplus pages. This occurs * when the nodes with surplus pages have no free pages. * remove_pool_hugetlb_folio() will balance the freed pages across the * on-line nodes with memory and will handle the hstate accounting. */ while (nr_pages--) { struct folio *folio; folio = remove_pool_hugetlb_folio(h, &node_states[N_MEMORY], 1); if (!folio) goto out; list_add(&folio->lru, &page_list); } out: spin_unlock_irq(&hugetlb_lock); update_and_free_pages_bulk(h, &page_list); spin_lock_irq(&hugetlb_lock); } /* * vma_needs_reservation, vma_commit_reservation and vma_end_reservation * are used by the huge page allocation routines to manage reservations. * * vma_needs_reservation is called to determine if the huge page at addr * within the vma has an associated reservation. If a reservation is * needed, the value 1 is returned. The caller is then responsible for * managing the global reservation and subpool usage counts. After * the huge page has been allocated, vma_commit_reservation is called * to add the page to the reservation map. If the page allocation fails, * the reservation must be ended instead of committed. vma_end_reservation * is called in such cases. * * In the normal case, vma_commit_reservation returns the same value * as the preceding vma_needs_reservation call. The only time this * is not the case is if a reserve map was changed between calls. It * is the responsibility of the caller to notice the difference and * take appropriate action. * * vma_add_reservation is used in error paths where a reservation must * be restored when a newly allocated huge page must be freed. It is * to be called after calling vma_needs_reservation to determine if a * reservation exists. * * vma_del_reservation is used in error paths where an entry in the reserve * map was created during huge page allocation and must be removed. It is to * be called after calling vma_needs_reservation to determine if a reservation * exists. */ enum vma_resv_mode { VMA_NEEDS_RESV, VMA_COMMIT_RESV, VMA_END_RESV, VMA_ADD_RESV, VMA_DEL_RESV, }; static long __vma_reservation_common(struct hstate *h, struct vm_area_struct *vma, unsigned long addr, enum vma_resv_mode mode) { struct resv_map *resv; pgoff_t idx; long ret; long dummy_out_regions_needed; resv = vma_resv_map(vma); if (!resv) return 1; idx = vma_hugecache_offset(h, vma, addr); switch (mode) { case VMA_NEEDS_RESV: ret = region_chg(resv, idx, idx + 1, &dummy_out_regions_needed); /* We assume that vma_reservation_* routines always operate on * 1 page, and that adding to resv map a 1 page entry can only * ever require 1 region. */ VM_BUG_ON(dummy_out_regions_needed != 1); break; case VMA_COMMIT_RESV: ret = region_add(resv, idx, idx + 1, 1, NULL, NULL); /* region_add calls of range 1 should never fail. */ VM_BUG_ON(ret < 0); break; case VMA_END_RESV: region_abort(resv, idx, idx + 1, 1); ret = 0; break; case VMA_ADD_RESV: if (vma->vm_flags & VM_MAYSHARE) { ret = region_add(resv, idx, idx + 1, 1, NULL, NULL); /* region_add calls of range 1 should never fail. */ VM_BUG_ON(ret < 0); } else { region_abort(resv, idx, idx + 1, 1); ret = region_del(resv, idx, idx + 1); } break; case VMA_DEL_RESV: if (vma->vm_flags & VM_MAYSHARE) { region_abort(resv, idx, idx + 1, 1); ret = region_del(resv, idx, idx + 1); } else { ret = region_add(resv, idx, idx + 1, 1, NULL, NULL); /* region_add calls of range 1 should never fail. */ VM_BUG_ON(ret < 0); } break; default: BUG(); } if (vma->vm_flags & VM_MAYSHARE || mode == VMA_DEL_RESV) return ret; /* * We know private mapping must have HPAGE_RESV_OWNER set. * * In most cases, reserves always exist for private mappings. * However, a file associated with mapping could have been * hole punched or truncated after reserves were consumed. * As subsequent fault on such a range will not use reserves. * Subtle - The reserve map for private mappings has the * opposite meaning than that of shared mappings. If NO * entry is in the reserve map, it means a reservation exists. * If an entry exists in the reserve map, it means the * reservation has already been consumed. As a result, the * return value of this routine is the opposite of the * value returned from reserve map manipulation routines above. */ if (ret > 0) return 0; if (ret == 0) return 1; return ret; } static long vma_needs_reservation(struct hstate *h, struct vm_area_struct *vma, unsigned long addr) { return __vma_reservation_common(h, vma, addr, VMA_NEEDS_RESV); } static long vma_commit_reservation(struct hstate *h, struct vm_area_struct *vma, unsigned long addr) { return __vma_reservation_common(h, vma, addr, VMA_COMMIT_RESV); } static void vma_end_reservation(struct hstate *h, struct vm_area_struct *vma, unsigned long addr) { (void)__vma_reservation_common(h, vma, addr, VMA_END_RESV); } static long vma_add_reservation(struct hstate *h, struct vm_area_struct *vma, unsigned long addr) { return __vma_reservation_common(h, vma, addr, VMA_ADD_RESV); } static long vma_del_reservation(struct hstate *h, struct vm_area_struct *vma, unsigned long addr) { return __vma_reservation_common(h, vma, addr, VMA_DEL_RESV); } /* * This routine is called to restore reservation information on error paths. * It should ONLY be called for folios allocated via alloc_hugetlb_folio(), * and the hugetlb mutex should remain held when calling this routine. * * It handles two specific cases: * 1) A reservation was in place and the folio consumed the reservation. * hugetlb_restore_reserve is set in the folio. * 2) No reservation was in place for the page, so hugetlb_restore_reserve is * not set. However, alloc_hugetlb_folio always updates the reserve map. * * In case 1, free_huge_folio later in the error path will increment the * global reserve count. But, free_huge_folio does not have enough context * to adjust the reservation map. This case deals primarily with private * mappings. Adjust the reserve map here to be consistent with global * reserve count adjustments to be made by free_huge_folio. Make sure the * reserve map indicates there is a reservation present. * * In case 2, simply undo reserve map modifications done by alloc_hugetlb_folio. */ void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma, unsigned long address, struct folio *folio) { long rc = vma_needs_reservation(h, vma, address); if (folio_test_hugetlb_restore_reserve(folio)) { if (unlikely(rc < 0)) /* * Rare out of memory condition in reserve map * manipulation. Clear hugetlb_restore_reserve so * that global reserve count will not be incremented * by free_huge_folio. This will make it appear * as though the reservation for this folio was * consumed. This may prevent the task from * faulting in the folio at a later time. This * is better than inconsistent global huge page * accounting of reserve counts. */ folio_clear_hugetlb_restore_reserve(folio); else if (rc) (void)vma_add_reservation(h, vma, address); else vma_end_reservation(h, vma, address); } else { if (!rc) { /* * This indicates there is an entry in the reserve map * not added by alloc_hugetlb_folio. We know it was added * before the alloc_hugetlb_folio call, otherwise * hugetlb_restore_reserve would be set on the folio. * Remove the entry so that a subsequent allocation * does not consume a reservation. */ rc = vma_del_reservation(h, vma, address); if (rc < 0) /* * VERY rare out of memory condition. Since * we can not delete the entry, set * hugetlb_restore_reserve so that the reserve * count will be incremented when the folio * is freed. This reserve will be consumed * on a subsequent allocation. */ folio_set_hugetlb_restore_reserve(folio); } else if (rc < 0) { /* * Rare out of memory condition from * vma_needs_reservation call. Memory allocation is * only attempted if a new entry is needed. Therefore, * this implies there is not an entry in the * reserve map. * * For shared mappings, no entry in the map indicates * no reservation. We are done. */ if (!(vma->vm_flags & VM_MAYSHARE)) /* * For private mappings, no entry indicates * a reservation is present. Since we can * not add an entry, set hugetlb_restore_reserve * on the folio so reserve count will be * incremented when freed. This reserve will * be consumed on a subsequent allocation. */ folio_set_hugetlb_restore_reserve(folio); } else /* * No reservation present, do nothing */ vma_end_reservation(h, vma, address); } } /* * alloc_and_dissolve_hugetlb_folio - Allocate a new folio and dissolve * the old one * @h: struct hstate old page belongs to * @old_folio: Old folio to dissolve * @list: List to isolate the page in case we need to * Returns 0 on success, otherwise negated error. */ static int alloc_and_dissolve_hugetlb_folio(struct hstate *h, struct folio *old_folio, struct list_head *list) { gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE; int nid = folio_nid(old_folio); struct folio *new_folio = NULL; int ret = 0; retry: spin_lock_irq(&hugetlb_lock); if (!folio_test_hugetlb(old_folio)) { /* * Freed from under us. Drop new_folio too. */ goto free_new; } else if (folio_ref_count(old_folio)) { bool isolated; /* * Someone has grabbed the folio, try to isolate it here. * Fail with -EBUSY if not possible. */ spin_unlock_irq(&hugetlb_lock); isolated = folio_isolate_hugetlb(old_folio, list); ret = isolated ? 0 : -EBUSY; spin_lock_irq(&hugetlb_lock); goto free_new; } else if (!folio_test_hugetlb_freed(old_folio)) { /* * Folio's refcount is 0 but it has not been enqueued in the * freelist yet. Race window is small, so we can succeed here if * we retry. */ spin_unlock_irq(&hugetlb_lock); cond_resched(); goto retry; } else { if (!new_folio) { spin_unlock_irq(&hugetlb_lock); new_folio = alloc_buddy_hugetlb_folio(h, gfp_mask, nid, NULL, NULL); if (!new_folio) return -ENOMEM; __prep_new_hugetlb_folio(h, new_folio); goto retry; } /* * Ok, old_folio is still a genuine free hugepage. Remove it from * the freelist and decrease the counters. These will be * incremented again when calling __prep_account_new_huge_page() * and enqueue_hugetlb_folio() for new_folio. The counters will * remain stable since this happens under the lock. */ remove_hugetlb_folio(h, old_folio, false); /* * Ref count on new_folio is already zero as it was dropped * earlier. It can be directly added to the pool free list. */ __prep_account_new_huge_page(h, nid); enqueue_hugetlb_folio(h, new_folio); /* * Folio has been replaced, we can safely free the old one. */ spin_unlock_irq(&hugetlb_lock); update_and_free_hugetlb_folio(h, old_folio, false); } return ret; free_new: spin_unlock_irq(&hugetlb_lock); if (new_folio) update_and_free_hugetlb_folio(h, new_folio, false); return ret; } int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list) { struct hstate *h; struct folio *folio = page_folio(page); int ret = -EBUSY; /* * The page might have been dissolved from under our feet, so make sure * to carefully check the state under the lock. * Return success when racing as if we dissolved the page ourselves. */ spin_lock_irq(&hugetlb_lock); if (folio_test_hugetlb(folio)) { h = folio_hstate(folio); } else { spin_unlock_irq(&hugetlb_lock); return 0; } spin_unlock_irq(&hugetlb_lock); /* * Fence off gigantic pages as there is a cyclic dependency between * alloc_contig_range and them. Return -ENOMEM as this has the effect * of bailing out right away without further retrying. */ if (hstate_is_gigantic(h)) return -ENOMEM; if (folio_ref_count(folio) && folio_isolate_hugetlb(folio, list)) ret = 0; else if (!folio_ref_count(folio)) ret = alloc_and_dissolve_hugetlb_folio(h, folio, list); return ret; } /* * replace_free_hugepage_folios - Replace free hugepage folios in a given pfn * range with new folios. * @start_pfn: start pfn of the given pfn range * @end_pfn: end pfn of the given pfn range * Returns 0 on success, otherwise negated error. */ int replace_free_hugepage_folios(unsigned long start_pfn, unsigned long end_pfn) { struct hstate *h; struct folio *folio; int ret = 0; LIST_HEAD(isolate_list); while (start_pfn < end_pfn) { folio = pfn_folio(start_pfn); if (folio_test_hugetlb(folio)) { h = folio_hstate(folio); } else { start_pfn++; continue; } if (!folio_ref_count(folio)) { ret = alloc_and_dissolve_hugetlb_folio(h, folio, &isolate_list); if (ret) break; putback_movable_pages(&isolate_list); } start_pfn++; } return ret; } void wait_for_freed_hugetlb_folios(void) { if (llist_empty(&hpage_freelist)) return; flush_work(&free_hpage_work); } typedef enum { /* * For either 0/1: we checked the per-vma resv map, and one resv * count either can be reused (0), or an extra needed (1). */ MAP_CHG_REUSE = 0, MAP_CHG_NEEDED = 1, /* * Cannot use per-vma resv count can be used, hence a new resv * count is enforced. * * NOTE: This is mostly identical to MAP_CHG_NEEDED, except * that currently vma_needs_reservation() has an unwanted side * effect to either use end() or commit() to complete the * transaction. Hence it needs to differenciate from NEEDED. */ MAP_CHG_ENFORCED = 2, } map_chg_state; /* * NOTE! "cow_from_owner" represents a very hacky usage only used in CoW * faults of hugetlb private mappings on top of a non-page-cache folio (in * which case even if there's a private vma resv map it won't cover such * allocation). New call sites should (probably) never set it to true!! * When it's set, the allocation will bypass all vma level reservations. */ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, unsigned long addr, bool cow_from_owner) { struct hugepage_subpool *spool = subpool_vma(vma); struct hstate *h = hstate_vma(vma); struct folio *folio; long retval, gbl_chg; map_chg_state map_chg; int ret, idx; struct hugetlb_cgroup *h_cg = NULL; gfp_t gfp = htlb_alloc_mask(h) | __GFP_RETRY_MAYFAIL; idx = hstate_index(h); /* Whether we need a separate per-vma reservation? */ if (cow_from_owner) { /* * Special case! Since it's a CoW on top of a reserved * page, the private resv map doesn't count. So it cannot * consume the per-vma resv map even if it's reserved. */ map_chg = MAP_CHG_ENFORCED; } else { /* * Examine the region/reserve map to determine if the process * has a reservation for the page to be allocated. A return * code of zero indicates a reservation exists (no change). */ retval = vma_needs_reservation(h, vma, addr); if (retval < 0) return ERR_PTR(-ENOMEM); map_chg = retval ? MAP_CHG_NEEDED : MAP_CHG_REUSE; } /* * Whether we need a separate global reservation? * * Processes that did not create the mapping will have no * reserves as indicated by the region/reserve map. Check * that the allocation will not exceed the subpool limit. * Or if it can get one from the pool reservation directly. */ if (map_chg) { gbl_chg = hugepage_subpool_get_pages(spool, 1); if (gbl_chg < 0) goto out_end_reservation; } else { /* * If we have the vma reservation ready, no need for extra * global reservation. */ gbl_chg = 0; } /* * If this allocation is not consuming a per-vma reservation, * charge the hugetlb cgroup now. */ if (map_chg) { ret = hugetlb_cgroup_charge_cgroup_rsvd( idx, pages_per_huge_page(h), &h_cg); if (ret) goto out_subpool_put; } ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg); if (ret) goto out_uncharge_cgroup_reservation; spin_lock_irq(&hugetlb_lock); /* * glb_chg is passed to indicate whether or not a page must be taken * from the global free pool (global change). gbl_chg == 0 indicates * a reservation exists for the allocation. */ folio = dequeue_hugetlb_folio_vma(h, vma, addr, gbl_chg); if (!folio) { spin_unlock_irq(&hugetlb_lock); folio = alloc_buddy_hugetlb_folio_with_mpol(h, vma, addr); if (!folio) goto out_uncharge_cgroup; spin_lock_irq(&hugetlb_lock); list_add(&folio->lru, &h->hugepage_activelist); folio_ref_unfreeze(folio, 1); /* Fall through */ } /* * Either dequeued or buddy-allocated folio needs to add special * mark to the folio when it consumes a global reservation. */ if (!gbl_chg) { folio_set_hugetlb_restore_reserve(folio); h->resv_huge_pages--; } hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, folio); /* If allocation is not consuming a reservation, also store the * hugetlb_cgroup pointer on the page. */ if (map_chg) { hugetlb_cgroup_commit_charge_rsvd(idx, pages_per_huge_page(h), h_cg, folio); } spin_unlock_irq(&hugetlb_lock); hugetlb_set_folio_subpool(folio, spool); if (map_chg != MAP_CHG_ENFORCED) { /* commit() is only needed if the map_chg is not enforced */ retval = vma_commit_reservation(h, vma, addr); /* * Check for possible race conditions. When it happens.. * The page was added to the reservation map between * vma_needs_reservation and vma_commit_reservation. * This indicates a race with hugetlb_reserve_pages. * Adjust for the subpool count incremented above AND * in hugetlb_reserve_pages for the same page. Also, * the reservation count added in hugetlb_reserve_pages * no longer applies. */ if (unlikely(map_chg == MAP_CHG_NEEDED && retval == 0)) { long rsv_adjust; rsv_adjust = hugepage_subpool_put_pages(spool, 1); hugetlb_acct_memory(h, -rsv_adjust); if (map_chg) { spin_lock_irq(&hugetlb_lock); hugetlb_cgroup_uncharge_folio_rsvd( hstate_index(h), pages_per_huge_page(h), folio); spin_unlock_irq(&hugetlb_lock); } } } ret = mem_cgroup_charge_hugetlb(folio, gfp); /* * Unconditionally increment NR_HUGETLB here. If it turns out that * mem_cgroup_charge_hugetlb failed, then immediately free the page and * decrement NR_HUGETLB. */ lruvec_stat_mod_folio(folio, NR_HUGETLB, pages_per_huge_page(h)); if (ret == -ENOMEM) { free_huge_folio(folio); return ERR_PTR(-ENOMEM); } return folio; out_uncharge_cgroup: hugetlb_cgroup_uncharge_cgroup(idx, pages_per_huge_page(h), h_cg); out_uncharge_cgroup_reservation: if (map_chg) hugetlb_cgroup_uncharge_cgroup_rsvd(idx, pages_per_huge_page(h), h_cg); out_subpool_put: if (map_chg) hugepage_subpool_put_pages(spool, 1); out_end_reservation: if (map_chg != MAP_CHG_ENFORCED) vma_end_reservation(h, vma, addr); return ERR_PTR(-ENOSPC); } static __init void *alloc_bootmem(struct hstate *h, int nid, bool node_exact) { struct huge_bootmem_page *m; int listnode = nid; if (hugetlb_early_cma(h)) m = hugetlb_cma_alloc_bootmem(h, &listnode, node_exact); else { if (node_exact) m = memblock_alloc_exact_nid_raw(huge_page_size(h), huge_page_size(h), 0, MEMBLOCK_ALLOC_ACCESSIBLE, nid); else { m = memblock_alloc_try_nid_raw(huge_page_size(h), huge_page_size(h), 0, MEMBLOCK_ALLOC_ACCESSIBLE, nid); /* * For pre-HVO to work correctly, pages need to be on * the list for the node they were actually allocated * from. That node may be different in the case of * fallback by memblock_alloc_try_nid_raw. So, * extract the actual node first. */ if (m) listnode = early_pfn_to_nid(PHYS_PFN(virt_to_phys(m))); } if (m) { m->flags = 0; m->cma = NULL; } } if (m) { /* * Use the beginning of the huge page to store the * huge_bootmem_page struct (until gather_bootmem * puts them into the mem_map). * * Put them into a private list first because mem_map * is not up yet. */ INIT_LIST_HEAD(&m->list); list_add(&m->list, &huge_boot_pages[listnode]); m->hstate = h; } return m; } int alloc_bootmem_huge_page(struct hstate *h, int nid) __attribute__ ((weak, alias("__alloc_bootmem_huge_page"))); int __alloc_bootmem_huge_page(struct hstate *h, int nid) { struct huge_bootmem_page *m = NULL; /* initialize for clang */ int nr_nodes, node = nid; /* do node specific alloc */ if (nid != NUMA_NO_NODE) { m = alloc_bootmem(h, node, true); if (!m) return 0; goto found; } /* allocate from next node when distributing huge pages */ for_each_node_mask_to_alloc(&h->next_nid_to_alloc, nr_nodes, node, &node_states[N_ONLINE]) { m = alloc_bootmem(h, node, false); if (!m) return 0; goto found; } found: /* * Only initialize the head struct page in memmap_init_reserved_pages, * rest of the struct pages will be initialized by the HugeTLB * subsystem itself. * The head struct page is used to get folio information by the HugeTLB * subsystem like zone id and node id. */ memblock_reserved_mark_noinit(virt_to_phys((void *)m + PAGE_SIZE), huge_page_size(h) - PAGE_SIZE); return 1; } /* Initialize [start_page:end_page_number] tail struct pages of a hugepage */ static void __init hugetlb_folio_init_tail_vmemmap(struct folio *folio, unsigned long start_page_number, unsigned long end_page_number) { enum zone_type zone = zone_idx(folio_zone(folio)); int nid = folio_nid(folio); unsigned long head_pfn = folio_pfn(folio); unsigned long pfn, end_pfn = head_pfn + end_page_number; int ret; for (pfn = head_pfn + start_page_number; pfn < end_pfn; pfn++) { struct page *page = pfn_to_page(pfn); __init_single_page(page, pfn, zone, nid); prep_compound_tail((struct page *)folio, pfn - head_pfn); ret = page_ref_freeze(page, 1); VM_BUG_ON(!ret); } } static void __init hugetlb_folio_init_vmemmap(struct folio *folio, struct hstate *h, unsigned long nr_pages) { int ret; /* Prepare folio head */ __folio_clear_reserved(folio); __folio_set_head(folio); ret = folio_ref_freeze(folio, 1); VM_BUG_ON(!ret); /* Initialize the necessary tail struct pages */ hugetlb_folio_init_tail_vmemmap(folio, 1, nr_pages); prep_compound_head((struct page *)folio, huge_page_order(h)); } static bool __init hugetlb_bootmem_page_prehvo(struct huge_bootmem_page *m) { return m->flags & HUGE_BOOTMEM_HVO; } static bool __init hugetlb_bootmem_page_earlycma(struct huge_bootmem_page *m) { return m->flags & HUGE_BOOTMEM_CMA; } /* * memblock-allocated pageblocks might not have the migrate type set * if marked with the 'noinit' flag. Set it to the default (MIGRATE_MOVABLE) * here, or MIGRATE_CMA if this was a page allocated through an early CMA * reservation. * * In case of vmemmap optimized folios, the tail vmemmap pages are mapped * read-only, but that's ok - for sparse vmemmap this does not write to * the page structure. */ static void __init hugetlb_bootmem_init_migratetype(struct folio *folio, struct hstate *h) { unsigned long nr_pages = pages_per_huge_page(h), i; WARN_ON_ONCE(!pageblock_aligned(folio_pfn(folio))); for (i = 0; i < nr_pages; i += pageblock_nr_pages) { if (folio_test_hugetlb_cma(folio)) init_cma_pageblock(folio_page(folio, i)); else set_pageblock_migratetype(folio_page(folio, i), MIGRATE_MOVABLE); } } static void __init prep_and_add_bootmem_folios(struct hstate *h, struct list_head *folio_list) { unsigned long flags; struct folio *folio, *tmp_f; /* Send list for bulk vmemmap optimization processing */ hugetlb_vmemmap_optimize_bootmem_folios(h, folio_list); list_for_each_entry_safe(folio, tmp_f, folio_list, lru) { if (!folio_test_hugetlb_vmemmap_optimized(folio)) { /* * If HVO fails, initialize all tail struct pages * We do not worry about potential long lock hold * time as this is early in boot and there should * be no contention. */ hugetlb_folio_init_tail_vmemmap(folio, HUGETLB_VMEMMAP_RESERVE_PAGES, pages_per_huge_page(h)); } hugetlb_bootmem_init_migratetype(folio, h); /* Subdivide locks to achieve better parallel performance */ spin_lock_irqsave(&hugetlb_lock, flags); __prep_account_new_huge_page(h, folio_nid(folio)); enqueue_hugetlb_folio(h, folio); spin_unlock_irqrestore(&hugetlb_lock, flags); } } bool __init hugetlb_bootmem_page_zones_valid(int nid, struct huge_bootmem_page *m) { unsigned long start_pfn; bool valid; if (m->flags & HUGE_BOOTMEM_ZONES_VALID) { /* * Already validated, skip check. */ return true; } if (hugetlb_bootmem_page_earlycma(m)) { valid = cma_validate_zones(m->cma); goto out; } start_pfn = virt_to_phys(m) >> PAGE_SHIFT; valid = !pfn_range_intersects_zones(nid, start_pfn, pages_per_huge_page(m->hstate)); out: if (!valid) hstate_boot_nrinvalid[hstate_index(m->hstate)]++; return valid; } /* * Free a bootmem page that was found to be invalid (intersecting with * multiple zones). * * Since it intersects with multiple zones, we can't just do a free * operation on all pages at once, but instead have to walk all * pages, freeing them one by one. */ static void __init hugetlb_bootmem_free_invalid_page(int nid, struct page *page, struct hstate *h) { unsigned long npages = pages_per_huge_page(h); unsigned long pfn; while (npages--) { pfn = page_to_pfn(page); __init_page_from_nid(pfn, nid); free_reserved_page(page); page++; } } /* * Put bootmem huge pages into the standard lists after mem_map is up. * Note: This only applies to gigantic (order > MAX_PAGE_ORDER) pages. */ static void __init gather_bootmem_prealloc_node(unsigned long nid) { LIST_HEAD(folio_list); struct huge_bootmem_page *m, *tm; struct hstate *h = NULL, *prev_h = NULL; list_for_each_entry_safe(m, tm, &huge_boot_pages[nid], list) { struct page *page = virt_to_page(m); struct folio *folio = (void *)page; h = m->hstate; if (!hugetlb_bootmem_page_zones_valid(nid, m)) { /* * Can't use this page. Initialize the * page structures if that hasn't already * been done, and give them to the page * allocator. */ hugetlb_bootmem_free_invalid_page(nid, page, h); continue; } /* * It is possible to have multiple huge page sizes (hstates) * in this list. If so, process each size separately. */ if (h != prev_h && prev_h != NULL) prep_and_add_bootmem_folios(prev_h, &folio_list); prev_h = h; VM_BUG_ON(!hstate_is_gigantic(h)); WARN_ON(folio_ref_count(folio) != 1); hugetlb_folio_init_vmemmap(folio, h, HUGETLB_VMEMMAP_RESERVE_PAGES); init_new_hugetlb_folio(h, folio); if (hugetlb_bootmem_page_prehvo(m)) /* * If pre-HVO was done, just set the * flag, the HVO code will then skip * this folio. */ folio_set_hugetlb_vmemmap_optimized(folio); if (hugetlb_bootmem_page_earlycma(m)) folio_set_hugetlb_cma(folio); list_add(&folio->lru, &folio_list); /* * We need to restore the 'stolen' pages to totalram_pages * in order to fix confusing memory reports from free(1) and * other side-effects, like CommitLimit going negative. * * For CMA pages, this is done in init_cma_pageblock * (via hugetlb_bootmem_init_migratetype), so skip it here. */ if (!folio_test_hugetlb_cma(folio)) adjust_managed_page_count(page, pages_per_huge_page(h)); cond_resched(); } prep_and_add_bootmem_folios(h, &folio_list); } static void __init gather_bootmem_prealloc_parallel(unsigned long start, unsigned long end, void *arg) { int nid; for (nid = start; nid < end; nid++) gather_bootmem_prealloc_node(nid); } static void __init gather_bootmem_prealloc(void) { struct padata_mt_job job = { .thread_fn = gather_bootmem_prealloc_parallel, .fn_arg = NULL, .start = 0, .size = nr_node_ids, .align = 1, .min_chunk = 1, .max_threads = num_node_state(N_MEMORY), .numa_aware = true, }; padata_do_multithreaded(&job); } static void __init hugetlb_hstate_alloc_pages_onenode(struct hstate *h, int nid) { unsigned long i; char buf[32]; LIST_HEAD(folio_list); for (i = 0; i < h->max_huge_pages_node[nid]; ++i) { if (hstate_is_gigantic(h)) { if (!alloc_bootmem_huge_page(h, nid)) break; } else { struct folio *folio; gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE; folio = only_alloc_fresh_hugetlb_folio(h, gfp_mask, nid, &node_states[N_MEMORY], NULL); if (!folio) break; list_add(&folio->lru, &folio_list); } cond_resched(); } if (!list_empty(&folio_list)) prep_and_add_allocated_folios(h, &folio_list); if (i == h->max_huge_pages_node[nid]) return; string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32); pr_warn("HugeTLB: allocating %u of page size %s failed node%d. Only allocated %lu hugepages.\n", h->max_huge_pages_node[nid], buf, nid, i); h->max_huge_pages -= (h->max_huge_pages_node[nid] - i); h->max_huge_pages_node[nid] = i; } static bool __init hugetlb_hstate_alloc_pages_specific_nodes(struct hstate *h) { int i; bool node_specific_alloc = false; for_each_online_node(i) { if (h->max_huge_pages_node[i] > 0) { hugetlb_hstate_alloc_pages_onenode(h, i); node_specific_alloc = true; } } return node_specific_alloc; } static void __init hugetlb_hstate_alloc_pages_errcheck(unsigned long allocated, struct hstate *h) { if (allocated < h->max_huge_pages) { char buf[32]; string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32); pr_warn("HugeTLB: allocating %lu of page size %s failed. Only allocated %lu hugepages.\n", h->max_huge_pages, buf, allocated); h->max_huge_pages = allocated; } } static void __init hugetlb_pages_alloc_boot_node(unsigned long start, unsigned long end, void *arg) { struct hstate *h = (struct hstate *)arg; int i, num = end - start; nodemask_t node_alloc_noretry; LIST_HEAD(folio_list); int next_node = first_online_node; /* Bit mask controlling how hard we retry per-node allocations.*/ nodes_clear(node_alloc_noretry); for (i = 0; i < num; ++i) { struct folio *folio = alloc_pool_huge_folio(h, &node_states[N_MEMORY], &node_alloc_noretry, &next_node); if (!folio) break; list_move(&folio->lru, &folio_list); cond_resched(); } prep_and_add_allocated_folios(h, &folio_list); } static unsigned long __init hugetlb_gigantic_pages_alloc_boot(struct hstate *h) { unsigned long i; for (i = 0; i < h->max_huge_pages; ++i) { if (!alloc_bootmem_huge_page(h, NUMA_NO_NODE)) break; cond_resched(); } return i; } static unsigned long __init hugetlb_pages_alloc_boot(struct hstate *h) { struct padata_mt_job job = { .fn_arg = h, .align = 1, .numa_aware = true }; unsigned long jiffies_start; unsigned long jiffies_end; job.thread_fn = hugetlb_pages_alloc_boot_node; job.start = 0; job.size = h->max_huge_pages; /* * job.max_threads is 25% of the available cpu threads by default. * * On large servers with terabytes of memory, huge page allocation * can consume a considerably amount of time. * * Tests below show how long it takes to allocate 1 TiB of memory with 2MiB huge pages. * 2MiB huge pages. Using more threads can significantly improve allocation time. * * +-----------------------+-------+-------+-------+-------+-------+ * | threads | 8 | 16 | 32 | 64 | 128 | * +-----------------------+-------+-------+-------+-------+-------+ * | skylake 144 cpus | 44s | 22s | 16s | 19s | 20s | * | cascade lake 192 cpus | 39s | 20s | 11s | 10s | 9s | * +-----------------------+-------+-------+-------+-------+-------+ */ if (hugepage_allocation_threads == 0) { hugepage_allocation_threads = num_online_cpus() / 4; hugepage_allocation_threads = max(hugepage_allocation_threads, 1); } job.max_threads = hugepage_allocation_threads; job.min_chunk = h->max_huge_pages / hugepage_allocation_threads; jiffies_start = jiffies; padata_do_multithreaded(&job); jiffies_end = jiffies; pr_info("HugeTLB: allocation took %dms with hugepage_allocation_threads=%ld\n", jiffies_to_msecs(jiffies_end - jiffies_start), hugepage_allocation_threads); return h->nr_huge_pages; } /* * NOTE: this routine is called in different contexts for gigantic and * non-gigantic pages. * - For gigantic pages, this is called early in the boot process and * pages are allocated from memblock allocated or something similar. * Gigantic pages are actually added to pools later with the routine * gather_bootmem_prealloc. * - For non-gigantic pages, this is called later in the boot process after * all of mm is up and functional. Pages are allocated from buddy and * then added to hugetlb pools. */ static void __init hugetlb_hstate_alloc_pages(struct hstate *h) { unsigned long allocated; /* * Skip gigantic hugepages allocation if early CMA * reservations are not available. */ if (hstate_is_gigantic(h) && hugetlb_cma_total_size() && !hugetlb_early_cma(h)) { pr_warn_once("HugeTLB: hugetlb_cma is enabled, skip boot time allocation\n"); return; } /* do node specific alloc */ if (hugetlb_hstate_alloc_pages_specific_nodes(h)) return; /* below will do all node balanced alloc */ if (hstate_is_gigantic(h)) allocated = hugetlb_gigantic_pages_alloc_boot(h); else allocated = hugetlb_pages_alloc_boot(h); hugetlb_hstate_alloc_pages_errcheck(allocated, h); } static void __init hugetlb_init_hstates(void) { struct hstate *h, *h2; for_each_hstate(h) { /* oversize hugepages were init'ed in early boot */ if (!hstate_is_gigantic(h)) hugetlb_hstate_alloc_pages(h); /* * Set demote order for each hstate. Note that * h->demote_order is initially 0. * - We can not demote gigantic pages if runtime freeing * is not supported, so skip this. * - If CMA allocation is possible, we can not demote * HUGETLB_PAGE_ORDER or smaller size pages. */ if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported()) continue; if (hugetlb_cma_total_size() && h->order <= HUGETLB_PAGE_ORDER) continue; for_each_hstate(h2) { if (h2 == h) continue; if (h2->order < h->order && h2->order > h->demote_order) h->demote_order = h2->order; } } } static void __init report_hugepages(void) { struct hstate *h; unsigned long nrinvalid; for_each_hstate(h) { char buf[32]; nrinvalid = hstate_boot_nrinvalid[hstate_index(h)]; h->max_huge_pages -= nrinvalid; string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32); pr_info("HugeTLB: registered %s page size, pre-allocated %ld pages\n", buf, h->free_huge_pages); if (nrinvalid) pr_info("HugeTLB: %s page size: %lu invalid page%s discarded\n", buf, nrinvalid, nrinvalid > 1 ? "s" : ""); pr_info("HugeTLB: %d KiB vmemmap can be freed for a %s page\n", hugetlb_vmemmap_optimizable_size(h) / SZ_1K, buf); } } #ifdef CONFIG_HIGHMEM static void try_to_free_low(struct hstate *h, unsigned long count, nodemask_t *nodes_allowed) { int i; LIST_HEAD(page_list); lockdep_assert_held(&hugetlb_lock); if (hstate_is_gigantic(h)) return; /* * Collect pages to be freed on a list, and free after dropping lock */ for_each_node_mask(i, *nodes_allowed) { struct folio *folio, *next; struct list_head *freel = &h->hugepage_freelists[i]; list_for_each_entry_safe(folio, next, freel, lru) { if (count >= h->nr_huge_pages) goto out; if (folio_test_highmem(folio)) continue; remove_hugetlb_folio(h, folio, false); list_add(&folio->lru, &page_list); } } out: spin_unlock_irq(&hugetlb_lock); update_and_free_pages_bulk(h, &page_list); spin_lock_irq(&hugetlb_lock); } #else static inline void try_to_free_low(struct hstate *h, unsigned long count, nodemask_t *nodes_allowed) { } #endif /* * Increment or decrement surplus_huge_pages. Keep node-specific counters * balanced by operating on them in a round-robin fashion. * Returns 1 if an adjustment was made. */ static int adjust_pool_surplus(struct hstate *h, nodemask_t *nodes_allowed, int delta) { int nr_nodes, node; lockdep_assert_held(&hugetlb_lock); VM_BUG_ON(delta != -1 && delta != 1); if (delta < 0) { for_each_node_mask_to_alloc(&h->next_nid_to_alloc, nr_nodes, node, nodes_allowed) { if (h->surplus_huge_pages_node[node]) goto found; } } else { for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) { if (h->surplus_huge_pages_node[node] < h->nr_huge_pages_node[node]) goto found; } } return 0; found: h->surplus_huge_pages += delta; h->surplus_huge_pages_node[node] += delta; return 1; } #define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages) static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid, nodemask_t *nodes_allowed) { unsigned long persistent_free_count; unsigned long min_count; unsigned long allocated; struct folio *folio; LIST_HEAD(page_list); NODEMASK_ALLOC(nodemask_t, node_alloc_noretry, GFP_KERNEL); /* * Bit mask controlling how hard we retry per-node allocations. * If we can not allocate the bit mask, do not attempt to allocate * the requested huge pages. */ if (node_alloc_noretry) nodes_clear(*node_alloc_noretry); else return -ENOMEM; /* * resize_lock mutex prevents concurrent adjustments to number of * pages in hstate via the proc/sysfs interfaces. */ mutex_lock(&h->resize_lock); flush_free_hpage_work(h); spin_lock_irq(&hugetlb_lock); /* * Check for a node specific request. * Changing node specific huge page count may require a corresponding * change to the global count. In any case, the passed node mask * (nodes_allowed) will restrict alloc/free to the specified node. */ if (nid != NUMA_NO_NODE) { unsigned long old_count = count; count += persistent_huge_pages(h) - (h->nr_huge_pages_node[nid] - h->surplus_huge_pages_node[nid]); /* * User may have specified a large count value which caused the * above calculation to overflow. In this case, they wanted * to allocate as many huge pages as possible. Set count to * largest possible value to align with their intention. */ if (count < old_count) count = ULONG_MAX; } /* * Gigantic pages runtime allocation depend on the capability for large * page range allocation. * If the system does not provide this feature, return an error when * the user tries to allocate gigantic pages but let the user free the * boottime allocated gigantic pages. */ if (hstate_is_gigantic(h) && !IS_ENABLED(CONFIG_CONTIG_ALLOC)) { if (count > persistent_huge_pages(h)) { spin_unlock_irq(&hugetlb_lock); mutex_unlock(&h->resize_lock); NODEMASK_FREE(node_alloc_noretry); return -EINVAL; } /* Fall through to decrease pool */ } /* * Increase the pool size * First take pages out of surplus state. Then make up the * remaining difference by allocating fresh huge pages. * * We might race with alloc_surplus_hugetlb_folio() here and be unable * to convert a surplus huge page to a normal huge page. That is * not critical, though, it just means the overall size of the * pool might be one hugepage larger than it needs to be, but * within all the constraints specified by the sysctls. */ while (h->surplus_huge_pages && count > persistent_huge_pages(h)) { if (!adjust_pool_surplus(h, nodes_allowed, -1)) break; } allocated = 0; while (count > (persistent_huge_pages(h) + allocated)) { /* * If this allocation races such that we no longer need the * page, free_huge_folio will handle it by freeing the page * and reducing the surplus. */ spin_unlock_irq(&hugetlb_lock); /* yield cpu to avoid soft lockup */ cond_resched(); folio = alloc_pool_huge_folio(h, nodes_allowed, node_alloc_noretry, &h->next_nid_to_alloc); if (!folio) { prep_and_add_allocated_folios(h, &page_list); spin_lock_irq(&hugetlb_lock); goto out; } list_add(&folio->lru, &page_list); allocated++; /* Bail for signals. Probably ctrl-c from user */ if (signal_pending(current)) { prep_and_add_allocated_folios(h, &page_list); spin_lock_irq(&hugetlb_lock); goto out; } spin_lock_irq(&hugetlb_lock); } /* Add allocated pages to the pool */ if (!list_empty(&page_list)) { spin_unlock_irq(&hugetlb_lock); prep_and_add_allocated_folios(h, &page_list); spin_lock_irq(&hugetlb_lock); } /* * Decrease the pool size * First return free pages to the buddy allocator (being careful * to keep enough around to satisfy reservations). Then place * pages into surplus state as needed so the pool will shrink * to the desired size as pages become free. * * By placing pages into the surplus state independent of the * overcommit value, we are allowing the surplus pool size to * exceed overcommit. There are few sane options here. Since * alloc_surplus_hugetlb_folio() is checking the global counter, * though, we'll note that we're not allowed to exceed surplus * and won't grow the pool anywhere else. Not until one of the * sysctls are changed, or the surplus pages go out of use. * * min_count is the expected number of persistent pages, we * shouldn't calculate min_count by using * resv_huge_pages + persistent_huge_pages() - free_huge_pages, * because there may exist free surplus huge pages, and this will * lead to subtracting twice. Free surplus huge pages come from HVO * failing to restore vmemmap, see comments in the callers of * hugetlb_vmemmap_restore_folio(). Thus, we should calculate * persistent free count first. */ persistent_free_count = h->free_huge_pages; if (h->free_huge_pages > persistent_huge_pages(h)) { if (h->free_huge_pages > h->surplus_huge_pages) persistent_free_count -= h->surplus_huge_pages; else persistent_free_count = 0; } min_count = h->resv_huge_pages + persistent_huge_pages(h) - persistent_free_count; min_count = max(count, min_count); try_to_free_low(h, min_count, nodes_allowed); /* * Collect pages to be removed on list without dropping lock */ while (min_count < persistent_huge_pages(h)) { folio = remove_pool_hugetlb_folio(h, nodes_allowed, 0); if (!folio) break; list_add(&folio->lru, &page_list); } /* free the pages after dropping lock */ spin_unlock_irq(&hugetlb_lock); update_and_free_pages_bulk(h, &page_list); flush_free_hpage_work(h); spin_lock_irq(&hugetlb_lock); while (count < persistent_huge_pages(h)) { if (!adjust_pool_surplus(h, nodes_allowed, 1)) break; } out: h->max_huge_pages = persistent_huge_pages(h); spin_unlock_irq(&hugetlb_lock); mutex_unlock(&h->resize_lock); NODEMASK_FREE(node_alloc_noretry); return 0; } static long demote_free_hugetlb_folios(struct hstate *src, struct hstate *dst, struct list_head *src_list) { long rc; struct folio *folio, *next; LIST_HEAD(dst_list); LIST_HEAD(ret_list); rc = hugetlb_vmemmap_restore_folios(src, src_list, &ret_list); list_splice_init(&ret_list, src_list); /* * Taking target hstate mutex synchronizes with set_max_huge_pages. * Without the mutex, pages added to target hstate could be marked * as surplus. * * Note that we already hold src->resize_lock. To prevent deadlock, * use the convention of always taking larger size hstate mutex first. */ mutex_lock(&dst->resize_lock); list_for_each_entry_safe(folio, next, src_list, lru) { int i; bool cma; if (folio_test_hugetlb_vmemmap_optimized(folio)) continue; cma = folio_test_hugetlb_cma(folio); list_del(&folio->lru); split_page_owner(&folio->page, huge_page_order(src), huge_page_order(dst)); pgalloc_tag_split(folio, huge_page_order(src), huge_page_order(dst)); for (i = 0; i < pages_per_huge_page(src); i += pages_per_huge_page(dst)) { struct page *page = folio_page(folio, i); /* Careful: see __split_huge_page_tail() */ struct folio *new_folio = (struct folio *)page; clear_compound_head(page); prep_compound_page(page, dst->order); new_folio->mapping = NULL; init_new_hugetlb_folio(dst, new_folio); /* Copy the CMA flag so that it is freed correctly */ if (cma) folio_set_hugetlb_cma(new_folio); list_add(&new_folio->lru, &dst_list); } } prep_and_add_allocated_folios(dst, &dst_list); mutex_unlock(&dst->resize_lock); return rc; } static long demote_pool_huge_page(struct hstate *src, nodemask_t *nodes_allowed, unsigned long nr_to_demote) __must_hold(&hugetlb_lock) { int nr_nodes, node; struct hstate *dst; long rc = 0; long nr_demoted = 0; lockdep_assert_held(&hugetlb_lock); /* We should never get here if no demote order */ if (!src->demote_order) { pr_warn("HugeTLB: NULL demote order passed to demote_pool_huge_page.\n"); return -EINVAL; /* internal error */ } dst = size_to_hstate(PAGE_SIZE << src->demote_order); for_each_node_mask_to_free(src, nr_nodes, node, nodes_allowed) { LIST_HEAD(list); struct folio *folio, *next; list_for_each_entry_safe(folio, next, &src->hugepage_freelists[node], lru) { if (folio_test_hwpoison(folio)) continue; remove_hugetlb_folio(src, folio, false); list_add(&folio->lru, &list); if (++nr_demoted == nr_to_demote) break; } spin_unlock_irq(&hugetlb_lock); rc = demote_free_hugetlb_folios(src, dst, &list); spin_lock_irq(&hugetlb_lock); list_for_each_entry_safe(folio, next, &list, lru) { list_del(&folio->lru); add_hugetlb_folio(src, folio, false); nr_demoted--; } if (rc < 0 || nr_demoted == nr_to_demote) break; } /* * Not absolutely necessary, but for consistency update max_huge_pages * based on pool changes for the demoted page. */ src->max_huge_pages -= nr_demoted; dst->max_huge_pages += nr_demoted << (huge_page_order(src) - huge_page_order(dst)); if (rc < 0) return rc; if (nr_demoted) return nr_demoted; /* * Only way to get here is if all pages on free lists are poisoned. * Return -EBUSY so that caller will not retry. */ return -EBUSY; } #define HSTATE_ATTR_RO(_name) \ static struct kobj_attribute _name##_attr = __ATTR_RO(_name) #define HSTATE_ATTR_WO(_name) \ static struct kobj_attribute _name##_attr = __ATTR_WO(_name) #define HSTATE_ATTR(_name) \ static struct kobj_attribute _name##_attr = __ATTR_RW(_name) static struct kobject *hugepages_kobj; static struct kobject *hstate_kobjs[HUGE_MAX_HSTATE]; static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp); static struct hstate *kobj_to_hstate(struct kobject *kobj, int *nidp) { int i; for (i = 0; i < HUGE_MAX_HSTATE; i++) if (hstate_kobjs[i] == kobj) { if (nidp) *nidp = NUMA_NO_NODE; return &hstates[i]; } return kobj_to_node_hstate(kobj, nidp); } static ssize_t nr_hugepages_show_common(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { struct hstate *h; unsigned long nr_huge_pages; int nid; h = kobj_to_hstate(kobj, &nid); if (nid == NUMA_NO_NODE) nr_huge_pages = h->nr_huge_pages; else nr_huge_pages = h->nr_huge_pages_node[nid]; return sysfs_emit(buf, "%lu\n", nr_huge_pages); } static ssize_t __nr_hugepages_store_common(bool obey_mempolicy, struct hstate *h, int nid, unsigned long count, size_t len) { int err; nodemask_t nodes_allowed, *n_mask; if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported()) return -EINVAL; if (nid == NUMA_NO_NODE) { /* * global hstate attribute */ if (!(obey_mempolicy && init_nodemask_of_mempolicy(&nodes_allowed))) n_mask = &node_states[N_MEMORY]; else n_mask = &nodes_allowed; } else { /* * Node specific request. count adjustment happens in * set_max_huge_pages() after acquiring hugetlb_lock. */ init_nodemask_of_node(&nodes_allowed, nid); n_mask = &nodes_allowed; } err = set_max_huge_pages(h, count, nid, n_mask); return err ? err : len; } static ssize_t nr_hugepages_store_common(bool obey_mempolicy, struct kobject *kobj, const char *buf, size_t len) { struct hstate *h; unsigned long count; int nid; int err; err = kstrtoul(buf, 10, &count); if (err) return err; h = kobj_to_hstate(kobj, &nid); return __nr_hugepages_store_common(obey_mempolicy, h, nid, count, len); } static ssize_t nr_hugepages_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return nr_hugepages_show_common(kobj, attr, buf); } static ssize_t nr_hugepages_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t len) { return nr_hugepages_store_common(false, kobj, buf, len); } HSTATE_ATTR(nr_hugepages); #ifdef CONFIG_NUMA /* * hstate attribute for optionally mempolicy-based constraint on persistent * huge page alloc/free. */ static ssize_t nr_hugepages_mempolicy_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return nr_hugepages_show_common(kobj, attr, buf); } static ssize_t nr_hugepages_mempolicy_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t len) { return nr_hugepages_store_common(true, kobj, buf, len); } HSTATE_ATTR(nr_hugepages_mempolicy); #endif static ssize_t nr_overcommit_hugepages_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { struct hstate *h = kobj_to_hstate(kobj, NULL); return sysfs_emit(buf, "%lu\n", h->nr_overcommit_huge_pages); } static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { int err; unsigned long input; struct hstate *h = kobj_to_hstate(kobj, NULL); if (hstate_is_gigantic(h)) return -EINVAL; err = kstrtoul(buf, 10, &input); if (err) return err; spin_lock_irq(&hugetlb_lock); h->nr_overcommit_huge_pages = input; spin_unlock_irq(&hugetlb_lock); return count; } HSTATE_ATTR(nr_overcommit_hugepages); static ssize_t free_hugepages_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { struct hstate *h; unsigned long free_huge_pages; int nid; h = kobj_to_hstate(kobj, &nid); if (nid == NUMA_NO_NODE) free_huge_pages = h->free_huge_pages; else free_huge_pages = h->free_huge_pages_node[nid]; return sysfs_emit(buf, "%lu\n", free_huge_pages); } HSTATE_ATTR_RO(free_hugepages); static ssize_t resv_hugepages_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { struct hstate *h = kobj_to_hstate(kobj, NULL); return sysfs_emit(buf, "%lu\n", h->resv_huge_pages); } HSTATE_ATTR_RO(resv_hugepages); static ssize_t surplus_hugepages_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { struct hstate *h; unsigned long surplus_huge_pages; int nid; h = kobj_to_hstate(kobj, &nid); if (nid == NUMA_NO_NODE) surplus_huge_pages = h->surplus_huge_pages; else surplus_huge_pages = h->surplus_huge_pages_node[nid]; return sysfs_emit(buf, "%lu\n", surplus_huge_pages); } HSTATE_ATTR_RO(surplus_hugepages); static ssize_t demote_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t len) { unsigned long nr_demote; unsigned long nr_available; nodemask_t nodes_allowed, *n_mask; struct hstate *h; int err; int nid; err = kstrtoul(buf, 10, &nr_demote); if (err) return err; h = kobj_to_hstate(kobj, &nid); if (nid != NUMA_NO_NODE) { init_nodemask_of_node(&nodes_allowed, nid); n_mask = &nodes_allowed; } else { n_mask = &node_states[N_MEMORY]; } /* Synchronize with other sysfs operations modifying huge pages */ mutex_lock(&h->resize_lock); spin_lock_irq(&hugetlb_lock); while (nr_demote) { long rc; /* * Check for available pages to demote each time thorough the * loop as demote_pool_huge_page will drop hugetlb_lock. */ if (nid != NUMA_NO_NODE) nr_available = h->free_huge_pages_node[nid]; else nr_available = h->free_huge_pages; nr_available -= h->resv_huge_pages; if (!nr_available) break; rc = demote_pool_huge_page(h, n_mask, nr_demote); if (rc < 0) { err = rc; break; } nr_demote -= rc; } spin_unlock_irq(&hugetlb_lock); mutex_unlock(&h->resize_lock); if (err) return err; return len; } HSTATE_ATTR_WO(demote); static ssize_t demote_size_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { struct hstate *h = kobj_to_hstate(kobj, NULL); unsigned long demote_size = (PAGE_SIZE << h->demote_order) / SZ_1K; return sysfs_emit(buf, "%lukB\n", demote_size); } static ssize_t demote_size_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { struct hstate *h, *demote_hstate; unsigned long demote_size; unsigned int demote_order; demote_size = (unsigned long)memparse(buf, NULL); demote_hstate = size_to_hstate(demote_size); if (!demote_hstate) return -EINVAL; demote_order = demote_hstate->order; if (demote_order < HUGETLB_PAGE_ORDER) return -EINVAL; /* demote order must be smaller than hstate order */ h = kobj_to_hstate(kobj, NULL); if (demote_order >= h->order) return -EINVAL; /* resize_lock synchronizes access to demote size and writes */ mutex_lock(&h->resize_lock); h->demote_order = demote_order; mutex_unlock(&h->resize_lock); return count; } HSTATE_ATTR(demote_size); static struct attribute *hstate_attrs[] = { &nr_hugepages_attr.attr, &nr_overcommit_hugepages_attr.attr, &free_hugepages_attr.attr, &resv_hugepages_attr.attr, &surplus_hugepages_attr.attr, #ifdef CONFIG_NUMA &nr_hugepages_mempolicy_attr.attr, #endif NULL, }; static const struct attribute_group hstate_attr_group = { .attrs = hstate_attrs, }; static struct attribute *hstate_demote_attrs[] = { &demote_size_attr.attr, &demote_attr.attr, NULL, }; static const struct attribute_group hstate_demote_attr_group = { .attrs = hstate_demote_attrs, }; static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent, struct kobject **hstate_kobjs, const struct attribute_group *hstate_attr_group) { int retval; int hi = hstate_index(h); hstate_kobjs[hi] = kobject_create_and_add(h->name, parent); if (!hstate_kobjs[hi]) return -ENOMEM; retval = sysfs_create_group(hstate_kobjs[hi], hstate_attr_group); if (retval) { kobject_put(hstate_kobjs[hi]); hstate_kobjs[hi] = NULL; return retval; } if (h->demote_order) { retval = sysfs_create_group(hstate_kobjs[hi], &hstate_demote_attr_group); if (retval) { pr_warn("HugeTLB unable to create demote interfaces for %s\n", h->name); sysfs_remove_group(hstate_kobjs[hi], hstate_attr_group); kobject_put(hstate_kobjs[hi]); hstate_kobjs[hi] = NULL; return retval; } } return 0; } #ifdef CONFIG_NUMA static bool hugetlb_sysfs_initialized __ro_after_init; /* * node_hstate/s - associate per node hstate attributes, via their kobjects, * with node devices in node_devices[] using a parallel array. The array * index of a node device or _hstate == node id. * This is here to avoid any static dependency of the node device driver, in * the base kernel, on the hugetlb module. */ struct node_hstate { struct kobject *hugepages_kobj; struct kobject *hstate_kobjs[HUGE_MAX_HSTATE]; }; static struct node_hstate node_hstates[MAX_NUMNODES]; /* * A subset of global hstate attributes for node devices */ static struct attribute *per_node_hstate_attrs[] = { &nr_hugepages_attr.attr, &free_hugepages_attr.attr, &surplus_hugepages_attr.attr, NULL, }; static const struct attribute_group per_node_hstate_attr_group = { .attrs = per_node_hstate_attrs, }; /* * kobj_to_node_hstate - lookup global hstate for node device hstate attr kobj. * Returns node id via non-NULL nidp. */ static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp) { int nid; for (nid = 0; nid < nr_node_ids; nid++) { struct node_hstate *nhs = &node_hstates[nid]; int i; for (i = 0; i < HUGE_MAX_HSTATE; i++) if (nhs->hstate_kobjs[i] == kobj) { if (nidp) *nidp = nid; return &hstates[i]; } } BUG(); return NULL; } /* * Unregister hstate attributes from a single node device. * No-op if no hstate attributes attached. */ void hugetlb_unregister_node(struct node *node) { struct hstate *h; struct node_hstate *nhs = &node_hstates[node->dev.id]; if (!nhs->hugepages_kobj) return; /* no hstate attributes */ for_each_hstate(h) { int idx = hstate_index(h); struct kobject *hstate_kobj = nhs->hstate_kobjs[idx]; if (!hstate_kobj) continue; if (h->demote_order) sysfs_remove_group(hstate_kobj, &hstate_demote_attr_group); sysfs_remove_group(hstate_kobj, &per_node_hstate_attr_group); kobject_put(hstate_kobj); nhs->hstate_kobjs[idx] = NULL; } kobject_put(nhs->hugepages_kobj); nhs->hugepages_kobj = NULL; } /* * Register hstate attributes for a single node device. * No-op if attributes already registered. */ void hugetlb_register_node(struct node *node) { struct hstate *h; struct node_hstate *nhs = &node_hstates[node->dev.id]; int err; if (!hugetlb_sysfs_initialized) return; if (nhs->hugepages_kobj) return; /* already allocated */ nhs->hugepages_kobj = kobject_create_and_add("hugepages", &node->dev.kobj); if (!nhs->hugepages_kobj) return; for_each_hstate(h) { err = hugetlb_sysfs_add_hstate(h, nhs->hugepages_kobj, nhs->hstate_kobjs, &per_node_hstate_attr_group); if (err) { pr_err("HugeTLB: Unable to add hstate %s for node %d\n", h->name, node->dev.id); hugetlb_unregister_node(node); break; } } } /* * hugetlb init time: register hstate attributes for all registered node * devices of nodes that have memory. All on-line nodes should have * registered their associated device by this time. */ static void __init hugetlb_register_all_nodes(void) { int nid; for_each_online_node(nid) hugetlb_register_node(node_devices[nid]); } #else /* !CONFIG_NUMA */ static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp) { BUG(); if (nidp) *nidp = -1; return NULL; } static void hugetlb_register_all_nodes(void) { } #endif static void __init hugetlb_sysfs_init(void) { struct hstate *h; int err; hugepages_kobj = kobject_create_and_add("hugepages", mm_kobj); if (!hugepages_kobj) return; for_each_hstate(h) { err = hugetlb_sysfs_add_hstate(h, hugepages_kobj, hstate_kobjs, &hstate_attr_group); if (err) pr_err("HugeTLB: Unable to add hstate %s\n", h->name); } #ifdef CONFIG_NUMA hugetlb_sysfs_initialized = true; #endif hugetlb_register_all_nodes(); } #ifdef CONFIG_SYSCTL static void hugetlb_sysctl_init(void); #else static inline void hugetlb_sysctl_init(void) { } #endif static int __init hugetlb_init(void) { int i; BUILD_BUG_ON(sizeof_field(struct page, private) * BITS_PER_BYTE < __NR_HPAGEFLAGS); if (!hugepages_supported()) { if (hugetlb_max_hstate || default_hstate_max_huge_pages) pr_warn("HugeTLB: huge pages not supported, ignoring associated command-line parameters\n"); return 0; } /* * Make sure HPAGE_SIZE (HUGETLB_PAGE_ORDER) hstate exists. Some * architectures depend on setup being done here. */ hugetlb_add_hstate(HUGETLB_PAGE_ORDER); if (!parsed_default_hugepagesz) { /* * If we did not parse a default huge page size, set * default_hstate_idx to HPAGE_SIZE hstate. And, if the * number of huge pages for this default size was implicitly * specified, set that here as well. * Note that the implicit setting will overwrite an explicit * setting. A warning will be printed in this case. */ default_hstate_idx = hstate_index(size_to_hstate(HPAGE_SIZE)); if (default_hstate_max_huge_pages) { if (default_hstate.max_huge_pages) { char buf[32]; string_get_size(huge_page_size(&default_hstate), 1, STRING_UNITS_2, buf, 32); pr_warn("HugeTLB: Ignoring hugepages=%lu associated with %s page size\n", default_hstate.max_huge_pages, buf); pr_warn("HugeTLB: Using hugepages=%lu for number of default huge pages\n", default_hstate_max_huge_pages); } default_hstate.max_huge_pages = default_hstate_max_huge_pages; for_each_online_node(i) default_hstate.max_huge_pages_node[i] = default_hugepages_in_node[i]; } } hugetlb_cma_check(); hugetlb_init_hstates(); gather_bootmem_prealloc(); report_hugepages(); hugetlb_sysfs_init(); hugetlb_cgroup_file_init(); hugetlb_sysctl_init(); #ifdef CONFIG_SMP num_fault_mutexes = roundup_pow_of_two(8 * num_possible_cpus()); #else num_fault_mutexes = 1; #endif hugetlb_fault_mutex_table = kmalloc_array(num_fault_mutexes, sizeof(struct mutex), GFP_KERNEL); BUG_ON(!hugetlb_fault_mutex_table); for (i = 0; i < num_fault_mutexes; i++) mutex_init(&hugetlb_fault_mutex_table[i]); return 0; } subsys_initcall(hugetlb_init); /* Overwritten by architectures with more huge page sizes */ bool __init __attribute((weak)) arch_hugetlb_valid_size(unsigned long size) { return size == HPAGE_SIZE; } void __init hugetlb_add_hstate(unsigned int order) { struct hstate *h; unsigned long i; if (size_to_hstate(PAGE_SIZE << order)) { return; } BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE); BUG_ON(order < order_base_2(__NR_USED_SUBPAGE)); h = &hstates[hugetlb_max_hstate++]; __mutex_init(&h->resize_lock, "resize mutex", &h->resize_key); h->order = order; h->mask = ~(huge_page_size(h) - 1); for (i = 0; i < MAX_NUMNODES; ++i) INIT_LIST_HEAD(&h->hugepage_freelists[i]); INIT_LIST_HEAD(&h->hugepage_activelist); snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB", huge_page_size(h)/SZ_1K); parsed_hstate = h; } bool __init __weak hugetlb_node_alloc_supported(void) { return true; } static void __init hugepages_clear_pages_in_node(void) { if (!hugetlb_max_hstate) { default_hstate_max_huge_pages = 0; memset(default_hugepages_in_node, 0, sizeof(default_hugepages_in_node)); } else { parsed_hstate->max_huge_pages = 0; memset(parsed_hstate->max_huge_pages_node, 0, sizeof(parsed_hstate->max_huge_pages_node)); } } static __init int hugetlb_add_param(char *s, int (*setup)(char *)) { size_t len; char *p; if (hugetlb_param_index >= HUGE_MAX_CMDLINE_ARGS) return -EINVAL; len = strlen(s) + 1; if (len + hstate_cmdline_index > sizeof(hstate_cmdline_buf)) return -EINVAL; p = &hstate_cmdline_buf[hstate_cmdline_index]; memcpy(p, s, len); hstate_cmdline_index += len; hugetlb_params[hugetlb_param_index].val = p; hugetlb_params[hugetlb_param_index].setup = setup; hugetlb_param_index++; return 0; } static __init void hugetlb_parse_params(void) { int i; struct hugetlb_cmdline *hcp; for (i = 0; i < hugetlb_param_index; i++) { hcp = &hugetlb_params[i]; hcp->setup(hcp->val); } hugetlb_cma_validate_params(); } /* * hugepages command line processing * hugepages normally follows a valid hugepagsz or default_hugepagsz * specification. If not, ignore the hugepages value. hugepages can also * be the first huge page command line option in which case it implicitly * specifies the number of huge pages for the default size. */ static int __init hugepages_setup(char *s) { unsigned long *mhp; static unsigned long *last_mhp; int node = NUMA_NO_NODE; int count; unsigned long tmp; char *p = s; if (!parsed_valid_hugepagesz) { pr_warn("HugeTLB: hugepages=%s does not follow a valid hugepagesz, ignoring\n", s); parsed_valid_hugepagesz = true; return -EINVAL; } /* * !hugetlb_max_hstate means we haven't parsed a hugepagesz= parameter * yet, so this hugepages= parameter goes to the "default hstate". * Otherwise, it goes with the previously parsed hugepagesz or * default_hugepagesz. */ else if (!hugetlb_max_hstate) mhp = &default_hstate_max_huge_pages; else mhp = &parsed_hstate->max_huge_pages; if (mhp == last_mhp) { pr_warn("HugeTLB: hugepages= specified twice without interleaving hugepagesz=, ignoring hugepages=%s\n", s); return 1; } while (*p) { count = 0; if (sscanf(p, "%lu%n", &tmp, &count) != 1) goto invalid; /* Parameter is node format */ if (p[count] == ':') { if (!hugetlb_node_alloc_supported()) { pr_warn("HugeTLB: architecture can't support node specific alloc, ignoring!\n"); return 1; } if (tmp >= MAX_NUMNODES || !node_online(tmp)) goto invalid; node = array_index_nospec(tmp, MAX_NUMNODES); p += count + 1; /* Parse hugepages */ if (sscanf(p, "%lu%n", &tmp, &count) != 1) goto invalid; if (!hugetlb_max_hstate) default_hugepages_in_node[node] = tmp; else parsed_hstate->max_huge_pages_node[node] = tmp; *mhp += tmp; /* Go to parse next node*/ if (p[count] == ',') p += count + 1; else break; } else { if (p != s) goto invalid; *mhp = tmp; break; } } last_mhp = mhp; return 0; invalid: pr_warn("HugeTLB: Invalid hugepages parameter %s\n", p); hugepages_clear_pages_in_node(); return -EINVAL; } hugetlb_early_param("hugepages", hugepages_setup); /* * hugepagesz command line processing * A specific huge page size can only be specified once with hugepagesz. * hugepagesz is followed by hugepages on the command line. The global * variable 'parsed_valid_hugepagesz' is used to determine if prior * hugepagesz argument was valid. */ static int __init hugepagesz_setup(char *s) { unsigned long size; struct hstate *h; parsed_valid_hugepagesz = false; size = (unsigned long)memparse(s, NULL); if (!arch_hugetlb_valid_size(size)) { pr_err("HugeTLB: unsupported hugepagesz=%s\n", s); return -EINVAL; } h = size_to_hstate(size); if (h) { /* * hstate for this size already exists. This is normally * an error, but is allowed if the existing hstate is the * default hstate. More specifically, it is only allowed if * the number of huge pages for the default hstate was not * previously specified. */ if (!parsed_default_hugepagesz || h != &default_hstate || default_hstate.max_huge_pages) { pr_warn("HugeTLB: hugepagesz=%s specified twice, ignoring\n", s); return -EINVAL; } /* * No need to call hugetlb_add_hstate() as hstate already * exists. But, do set parsed_hstate so that a following * hugepages= parameter will be applied to this hstate. */ parsed_hstate = h; parsed_valid_hugepagesz = true; return 0; } hugetlb_add_hstate(ilog2(size) - PAGE_SHIFT); parsed_valid_hugepagesz = true; return 0; } hugetlb_early_param("hugepagesz", hugepagesz_setup); /* * default_hugepagesz command line input * Only one instance of default_hugepagesz allowed on command line. */ static int __init default_hugepagesz_setup(char *s) { unsigned long size; int i; parsed_valid_hugepagesz = false; if (parsed_default_hugepagesz) { pr_err("HugeTLB: default_hugepagesz previously specified, ignoring %s\n", s); return -EINVAL; } size = (unsigned long)memparse(s, NULL); if (!arch_hugetlb_valid_size(size)) { pr_err("HugeTLB: unsupported default_hugepagesz=%s\n", s); return -EINVAL; } hugetlb_add_hstate(ilog2(size) - PAGE_SHIFT); parsed_valid_hugepagesz = true; parsed_default_hugepagesz = true; default_hstate_idx = hstate_index(size_to_hstate(size)); /* * The number of default huge pages (for this size) could have been * specified as the first hugetlb parameter: hugepages=X. If so, * then default_hstate_max_huge_pages is set. If the default huge * page size is gigantic (> MAX_PAGE_ORDER), then the pages must be * allocated here from bootmem allocator. */ if (default_hstate_max_huge_pages) { default_hstate.max_huge_pages = default_hstate_max_huge_pages; /* * Since this is an early parameter, we can't check * NUMA node state yet, so loop through MAX_NUMNODES. */ for (i = 0; i < MAX_NUMNODES; i++) { if (default_hugepages_in_node[i] != 0) default_hstate.max_huge_pages_node[i] = default_hugepages_in_node[i]; } default_hstate_max_huge_pages = 0; } return 0; } hugetlb_early_param("default_hugepagesz", default_hugepagesz_setup); static bool __hugetlb_bootmem_allocated __initdata; bool __init hugetlb_bootmem_allocated(void) { return __hugetlb_bootmem_allocated; } void __init hugetlb_bootmem_alloc(void) { struct hstate *h; int i; if (__hugetlb_bootmem_allocated) return; for (i = 0; i < MAX_NUMNODES; i++) INIT_LIST_HEAD(&huge_boot_pages[i]); hugetlb_parse_params(); for_each_hstate(h) { h->next_nid_to_alloc = first_online_node; h->next_nid_to_free = first_online_node; if (hstate_is_gigantic(h)) hugetlb_hstate_alloc_pages(h); } __hugetlb_bootmem_allocated = true; } /* * hugepage_alloc_threads command line parsing. * * When set, use this specific number of threads for the boot * allocation of hugepages. */ static int __init hugepage_alloc_threads_setup(char *s) { unsigned long allocation_threads; if (kstrtoul(s, 0, &allocation_threads) != 0) return 1; if (allocation_threads == 0) return 1; hugepage_allocation_threads = allocation_threads; return 1; } __setup("hugepage_alloc_threads=", hugepage_alloc_threads_setup); static unsigned int allowed_mems_nr(struct hstate *h) { int node; unsigned int nr = 0; nodemask_t *mbind_nodemask; unsigned int *array = h->free_huge_pages_node; gfp_t gfp_mask = htlb_alloc_mask(h); mbind_nodemask = policy_mbind_nodemask(gfp_mask); for_each_node_mask(node, cpuset_current_mems_allowed) { if (!mbind_nodemask || node_isset(node, *mbind_nodemask)) nr += array[node]; } return nr; } #ifdef CONFIG_SYSCTL static int proc_hugetlb_doulongvec_minmax(const struct ctl_table *table, int write, void *buffer, size_t *length, loff_t *ppos, unsigned long *out) { struct ctl_table dup_table; /* * In order to avoid races with __do_proc_doulongvec_minmax(), we * can duplicate the @table and alter the duplicate of it. */ dup_table = *table; dup_table.data = out; return proc_doulongvec_minmax(&dup_table, write, buffer, length, ppos); } static int hugetlb_sysctl_handler_common(bool obey_mempolicy, const struct ctl_table *table, int write, void *buffer, size_t *length, loff_t *ppos) { struct hstate *h = &default_hstate; unsigned long tmp = h->max_huge_pages; int ret; if (!hugepages_supported()) return -EOPNOTSUPP; ret = proc_hugetlb_doulongvec_minmax(table, write, buffer, length, ppos, &tmp); if (ret) goto out; if (write) ret = __nr_hugepages_store_common(obey_mempolicy, h, NUMA_NO_NODE, tmp, *length); out: return ret; } static int hugetlb_sysctl_handler(const struct ctl_table *table, int write, void *buffer, size_t *length, loff_t *ppos) { return hugetlb_sysctl_handler_common(false, table, write, buffer, length, ppos); } #ifdef CONFIG_NUMA static int hugetlb_mempolicy_sysctl_handler(const struct ctl_table *table, int write, void *buffer, size_t *length, loff_t *ppos) { return hugetlb_sysctl_handler_common(true, table, write, buffer, length, ppos); } #endif /* CONFIG_NUMA */ static int hugetlb_overcommit_handler(const struct ctl_table *table, int write, void *buffer, size_t *length, loff_t *ppos) { struct hstate *h = &default_hstate; unsigned long tmp; int ret; if (!hugepages_supported()) return -EOPNOTSUPP; tmp = h->nr_overcommit_huge_pages; if (write && hstate_is_gigantic(h)) return -EINVAL; ret = proc_hugetlb_doulongvec_minmax(table, write, buffer, length, ppos, &tmp); if (ret) goto out; if (write) { spin_lock_irq(&hugetlb_lock); h->nr_overcommit_huge_pages = tmp; spin_unlock_irq(&hugetlb_lock); } out: return ret; } static const struct ctl_table hugetlb_table[] = { { .procname = "nr_hugepages", .data = NULL, .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = hugetlb_sysctl_handler, }, #ifdef CONFIG_NUMA { .procname = "nr_hugepages_mempolicy", .data = NULL, .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = &hugetlb_mempolicy_sysctl_handler, }, #endif { .procname = "hugetlb_shm_group", .data = &sysctl_hugetlb_shm_group, .maxlen = sizeof(gid_t), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "nr_overcommit_hugepages", .data = NULL, .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = hugetlb_overcommit_handler, }, }; static void __init hugetlb_sysctl_init(void) { register_sysctl_init("vm", hugetlb_table); } #endif /* CONFIG_SYSCTL */ void hugetlb_report_meminfo(struct seq_file *m) { struct hstate *h; unsigned long total = 0; if (!hugepages_supported()) return; for_each_hstate(h) { unsigned long count = h->nr_huge_pages; total += huge_page_size(h) * count; if (h == &default_hstate) seq_printf(m, "HugePages_Total: %5lu\n" "HugePages_Free: %5lu\n" "HugePages_Rsvd: %5lu\n" "HugePages_Surp: %5lu\n" "Hugepagesize: %8lu kB\n", count, h->free_huge_pages, h->resv_huge_pages, h->surplus_huge_pages, huge_page_size(h) / SZ_1K); } seq_printf(m, "Hugetlb: %8lu kB\n", total / SZ_1K); } int hugetlb_report_node_meminfo(char *buf, int len, int nid) { struct hstate *h = &default_hstate; if (!hugepages_supported()) return 0; return sysfs_emit_at(buf, len, "Node %d HugePages_Total: %5u\n" "Node %d HugePages_Free: %5u\n" "Node %d HugePages_Surp: %5u\n", nid, h->nr_huge_pages_node[nid], nid, h->free_huge_pages_node[nid], nid, h->surplus_huge_pages_node[nid]); } void hugetlb_show_meminfo_node(int nid) { struct hstate *h; if (!hugepages_supported()) return; for_each_hstate(h) printk("Node %d hugepages_total=%u hugepages_free=%u hugepages_surp=%u hugepages_size=%lukB\n", nid, h->nr_huge_pages_node[nid], h->free_huge_pages_node[nid], h->surplus_huge_pages_node[nid], huge_page_size(h) / SZ_1K); } void hugetlb_report_usage(struct seq_file *m, struct mm_struct *mm) { seq_printf(m, "HugetlbPages:\t%8lu kB\n", K(atomic_long_read(&mm->hugetlb_usage))); } /* Return the number pages of memory we physically have, in PAGE_SIZE units. */ unsigned long hugetlb_total_pages(void) { struct hstate *h; unsigned long nr_total_pages = 0; for_each_hstate(h) nr_total_pages += h->nr_huge_pages * pages_per_huge_page(h); return nr_total_pages; } static int hugetlb_acct_memory(struct hstate *h, long delta) { int ret = -ENOMEM; if (!delta) return 0; spin_lock_irq(&hugetlb_lock); /* * When cpuset is configured, it breaks the strict hugetlb page * reservation as the accounting is done on a global variable. Such * reservation is completely rubbish in the presence of cpuset because * the reservation is not checked against page availability for the * current cpuset. Application can still potentially OOM'ed by kernel * with lack of free htlb page in cpuset that the task is in. * Attempt to enforce strict accounting with cpuset is almost * impossible (or too ugly) because cpuset is too fluid that * task or memory node can be dynamically moved between cpusets. * * The change of semantics for shared hugetlb mapping with cpuset is * undesirable. However, in order to preserve some of the semantics, * we fall back to check against current free page availability as * a best attempt and hopefully to minimize the impact of changing * semantics that cpuset has. * * Apart from cpuset, we also have memory policy mechanism that * also determines from which node the kernel will allocate memory * in a NUMA system. So similar to cpuset, we also should consider * the memory policy of the current task. Similar to the description * above. */ if (delta > 0) { if (gather_surplus_pages(h, delta) < 0) goto out; if (delta > allowed_mems_nr(h)) { return_unused_surplus_pages(h, delta); goto out; } } ret = 0; if (delta < 0) return_unused_surplus_pages(h, (unsigned long) -delta); out: spin_unlock_irq(&hugetlb_lock); return ret; } static void hugetlb_vm_op_open(struct vm_area_struct *vma) { struct resv_map *resv = vma_resv_map(vma); /* * HPAGE_RESV_OWNER indicates a private mapping. * This new VMA should share its siblings reservation map if present. * The VMA will only ever have a valid reservation map pointer where * it is being copied for another still existing VMA. As that VMA * has a reference to the reservation map it cannot disappear until * after this open call completes. It is therefore safe to take a * new reference here without additional locking. */ if (resv && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { resv_map_dup_hugetlb_cgroup_uncharge_info(resv); kref_get(&resv->refs); } /* * vma_lock structure for sharable mappings is vma specific. * Clear old pointer (if copied via vm_area_dup) and allocate * new structure. Before clearing, make sure vma_lock is not * for this vma. */ if (vma->vm_flags & VM_MAYSHARE) { struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; if (vma_lock) { if (vma_lock->vma != vma) { vma->vm_private_data = NULL; hugetlb_vma_lock_alloc(vma); } else pr_warn("HugeTLB: vma_lock already exists in %s.\n", __func__); } else hugetlb_vma_lock_alloc(vma); } } static void hugetlb_vm_op_close(struct vm_area_struct *vma) { struct hstate *h = hstate_vma(vma); struct resv_map *resv; struct hugepage_subpool *spool = subpool_vma(vma); unsigned long reserve, start, end; long gbl_reserve; hugetlb_vma_lock_free(vma); resv = vma_resv_map(vma); if (!resv || !is_vma_resv_set(vma, HPAGE_RESV_OWNER)) return; start = vma_hugecache_offset(h, vma, vma->vm_start); end = vma_hugecache_offset(h, vma, vma->vm_end); reserve = (end - start) - region_count(resv, start, end); hugetlb_cgroup_uncharge_counter(resv, start, end); if (reserve) { /* * Decrement reserve counts. The global reserve count may be * adjusted if the subpool has a minimum size. */ gbl_reserve = hugepage_subpool_put_pages(spool, reserve); hugetlb_acct_memory(h, -gbl_reserve); } kref_put(&resv->refs, resv_map_release); } static int hugetlb_vm_op_split(struct vm_area_struct *vma, unsigned long addr) { if (addr & ~(huge_page_mask(hstate_vma(vma)))) return -EINVAL; /* * PMD sharing is only possible for PUD_SIZE-aligned address ranges * in HugeTLB VMAs. If we will lose PUD_SIZE alignment due to this * split, unshare PMDs in the PUD_SIZE interval surrounding addr now. */ if (addr & ~PUD_MASK) { /* * hugetlb_vm_op_split is called right before we attempt to * split the VMA. We will need to unshare PMDs in the old and * new VMAs, so let's unshare before we split. */ unsigned long floor = addr & PUD_MASK; unsigned long ceil = floor + PUD_SIZE; if (floor >= vma->vm_start && ceil <= vma->vm_end) hugetlb_unshare_pmds(vma, floor, ceil); } return 0; } static unsigned long hugetlb_vm_op_pagesize(struct vm_area_struct *vma) { return huge_page_size(hstate_vma(vma)); } /* * We cannot handle pagefaults against hugetlb pages at all. They cause * handle_mm_fault() to try to instantiate regular-sized pages in the * hugepage VMA. do_page_fault() is supposed to trap this, so BUG is we get * this far. */ static vm_fault_t hugetlb_vm_op_fault(struct vm_fault *vmf) { BUG(); return 0; } /* * When a new function is introduced to vm_operations_struct and added * to hugetlb_vm_ops, please consider adding the function to shm_vm_ops. * This is because under System V memory model, mappings created via * shmget/shmat with "huge page" specified are backed by hugetlbfs files, * their original vm_ops are overwritten with shm_vm_ops. */ const struct vm_operations_struct hugetlb_vm_ops = { .fault = hugetlb_vm_op_fault, .open = hugetlb_vm_op_open, .close = hugetlb_vm_op_close, .may_split = hugetlb_vm_op_split, .pagesize = hugetlb_vm_op_pagesize, }; static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page, bool try_mkwrite) { pte_t entry; unsigned int shift = huge_page_shift(hstate_vma(vma)); if (try_mkwrite && (vma->vm_flags & VM_WRITE)) { entry = huge_pte_mkwrite(huge_pte_mkdirty(mk_huge_pte(page, vma->vm_page_prot))); } else { entry = huge_pte_wrprotect(mk_huge_pte(page, vma->vm_page_prot)); } entry = pte_mkyoung(entry); entry = arch_make_huge_pte(entry, shift, vma->vm_flags); return entry; } static void set_huge_ptep_writable(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) { pte_t entry; entry = huge_pte_mkwrite(huge_pte_mkdirty(huge_ptep_get(vma->vm_mm, address, ptep))); if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1)) update_mmu_cache(vma, address, ptep); } static void set_huge_ptep_maybe_writable(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) { if (vma->vm_flags & VM_WRITE) set_huge_ptep_writable(vma, address, ptep); } bool is_hugetlb_entry_migration(pte_t pte) { swp_entry_t swp; if (huge_pte_none(pte) || pte_present(pte)) return false; swp = pte_to_swp_entry(pte); if (is_migration_entry(swp)) return true; else return false; } bool is_hugetlb_entry_hwpoisoned(pte_t pte) { swp_entry_t swp; if (huge_pte_none(pte) || pte_present(pte)) return false; swp = pte_to_swp_entry(pte); if (is_hwpoison_entry(swp)) return true; else return false; } static void hugetlb_install_folio(struct vm_area_struct *vma, pte_t *ptep, unsigned long addr, struct folio *new_folio, pte_t old, unsigned long sz) { pte_t newpte = make_huge_pte(vma, &new_folio->page, true); __folio_mark_uptodate(new_folio); hugetlb_add_new_anon_rmap(new_folio, vma, addr); if (userfaultfd_wp(vma) && huge_pte_uffd_wp(old)) newpte = huge_pte_mkuffd_wp(newpte); set_huge_pte_at(vma->vm_mm, addr, ptep, newpte, sz); hugetlb_count_add(pages_per_huge_page(hstate_vma(vma)), vma->vm_mm); folio_set_hugetlb_migratable(new_folio); } int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) { pte_t *src_pte, *dst_pte, entry; struct folio *pte_folio; unsigned long addr; bool cow = is_cow_mapping(src_vma->vm_flags); struct hstate *h = hstate_vma(src_vma); unsigned long sz = huge_page_size(h); unsigned long npages = pages_per_huge_page(h); struct mmu_notifier_range range; unsigned long last_addr_mask; int ret = 0; if (cow) { mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, src, src_vma->vm_start, src_vma->vm_end); mmu_notifier_invalidate_range_start(&range); vma_assert_write_locked(src_vma); raw_write_seqcount_begin(&src->write_protect_seq); } else { /* * For shared mappings the vma lock must be held before * calling hugetlb_walk() in the src vma. Otherwise, the * returned ptep could go away if part of a shared pmd and * another thread calls huge_pmd_unshare. */ hugetlb_vma_lock_read(src_vma); } last_addr_mask = hugetlb_mask_last_page(h); for (addr = src_vma->vm_start; addr < src_vma->vm_end; addr += sz) { spinlock_t *src_ptl, *dst_ptl; src_pte = hugetlb_walk(src_vma, addr, sz); if (!src_pte) { addr |= last_addr_mask; continue; } dst_pte = huge_pte_alloc(dst, dst_vma, addr, sz); if (!dst_pte) { ret = -ENOMEM; break; } /* * If the pagetables are shared don't copy or take references. * * dst_pte == src_pte is the common case of src/dest sharing. * However, src could have 'unshared' and dst shares with * another vma. So page_count of ptep page is checked instead * to reliably determine whether pte is shared. */ if (page_count(virt_to_page(dst_pte)) > 1) { addr |= last_addr_mask; continue; } dst_ptl = huge_pte_lock(h, dst, dst_pte); src_ptl = huge_pte_lockptr(h, src, src_pte); spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); entry = huge_ptep_get(src_vma->vm_mm, addr, src_pte); again: if (huge_pte_none(entry)) { /* * Skip if src entry none. */ ; } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) { if (!userfaultfd_wp(dst_vma)) entry = huge_pte_clear_uffd_wp(entry); set_huge_pte_at(dst, addr, dst_pte, entry, sz); } else if (unlikely(is_hugetlb_entry_migration(entry))) { swp_entry_t swp_entry = pte_to_swp_entry(entry); bool uffd_wp = pte_swp_uffd_wp(entry); if (!is_readable_migration_entry(swp_entry) && cow) { /* * COW mappings require pages in both * parent and child to be set to read. */ swp_entry = make_readable_migration_entry( swp_offset(swp_entry)); entry = swp_entry_to_pte(swp_entry); if (userfaultfd_wp(src_vma) && uffd_wp) entry = pte_swp_mkuffd_wp(entry); set_huge_pte_at(src, addr, src_pte, entry, sz); } if (!userfaultfd_wp(dst_vma)) entry = huge_pte_clear_uffd_wp(entry); set_huge_pte_at(dst, addr, dst_pte, entry, sz); } else if (unlikely(is_pte_marker(entry))) { pte_marker marker = copy_pte_marker( pte_to_swp_entry(entry), dst_vma); if (marker) set_huge_pte_at(dst, addr, dst_pte, make_pte_marker(marker), sz); } else { entry = huge_ptep_get(src_vma->vm_mm, addr, src_pte); pte_folio = page_folio(pte_page(entry)); folio_get(pte_folio); /* * Failing to duplicate the anon rmap is a rare case * where we see pinned hugetlb pages while they're * prone to COW. We need to do the COW earlier during * fork. * * When pre-allocating the page or copying data, we * need to be without the pgtable locks since we could * sleep during the process. */ if (!folio_test_anon(pte_folio)) { hugetlb_add_file_rmap(pte_folio); } else if (hugetlb_try_dup_anon_rmap(pte_folio, src_vma)) { pte_t src_pte_old = entry; struct folio *new_folio; spin_unlock(src_ptl); spin_unlock(dst_ptl); /* Do not use reserve as it's private owned */ new_folio = alloc_hugetlb_folio(dst_vma, addr, false); if (IS_ERR(new_folio)) { folio_put(pte_folio); ret = PTR_ERR(new_folio); break; } ret = copy_user_large_folio(new_folio, pte_folio, addr, dst_vma); folio_put(pte_folio); if (ret) { folio_put(new_folio); break; } /* Install the new hugetlb folio if src pte stable */ dst_ptl = huge_pte_lock(h, dst, dst_pte); src_ptl = huge_pte_lockptr(h, src, src_pte); spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); entry = huge_ptep_get(src_vma->vm_mm, addr, src_pte); if (!pte_same(src_pte_old, entry)) { restore_reserve_on_error(h, dst_vma, addr, new_folio); folio_put(new_folio); /* huge_ptep of dst_pte won't change as in child */ goto again; } hugetlb_install_folio(dst_vma, dst_pte, addr, new_folio, src_pte_old, sz); spin_unlock(src_ptl); spin_unlock(dst_ptl); continue; } if (cow) { /* * No need to notify as we are downgrading page * table protection not changing it to point * to a new page. * * See Documentation/mm/mmu_notifier.rst */ huge_ptep_set_wrprotect(src, addr, src_pte); entry = huge_pte_wrprotect(entry); } if (!userfaultfd_wp(dst_vma)) entry = huge_pte_clear_uffd_wp(entry); set_huge_pte_at(dst, addr, dst_pte, entry, sz); hugetlb_count_add(npages, dst); } spin_unlock(src_ptl); spin_unlock(dst_ptl); } if (cow) { raw_write_seqcount_end(&src->write_protect_seq); mmu_notifier_invalidate_range_end(&range); } else { hugetlb_vma_unlock_read(src_vma); } return ret; } static void move_huge_pte(struct vm_area_struct *vma, unsigned long old_addr, unsigned long new_addr, pte_t *src_pte, pte_t *dst_pte, unsigned long sz) { bool need_clear_uffd_wp = vma_has_uffd_without_event_remap(vma); struct hstate *h = hstate_vma(vma); struct mm_struct *mm = vma->vm_mm; spinlock_t *src_ptl, *dst_ptl; pte_t pte; dst_ptl = huge_pte_lock(h, mm, dst_pte); src_ptl = huge_pte_lockptr(h, mm, src_pte); /* * We don't have to worry about the ordering of src and dst ptlocks * because exclusive mmap_lock (or the i_mmap_lock) prevents deadlock. */ if (src_ptl != dst_ptl) spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); pte = huge_ptep_get_and_clear(mm, old_addr, src_pte, sz); if (need_clear_uffd_wp && pte_marker_uffd_wp(pte)) huge_pte_clear(mm, new_addr, dst_pte, sz); else { if (need_clear_uffd_wp) { if (pte_present(pte)) pte = huge_pte_clear_uffd_wp(pte); else if (is_swap_pte(pte)) pte = pte_swp_clear_uffd_wp(pte); } set_huge_pte_at(mm, new_addr, dst_pte, pte, sz); } if (src_ptl != dst_ptl) spin_unlock(src_ptl); spin_unlock(dst_ptl); } int move_hugetlb_page_tables(struct vm_area_struct *vma, struct vm_area_struct *new_vma, unsigned long old_addr, unsigned long new_addr, unsigned long len) { struct hstate *h = hstate_vma(vma); struct address_space *mapping = vma->vm_file->f_mapping; unsigned long sz = huge_page_size(h); struct mm_struct *mm = vma->vm_mm; unsigned long old_end = old_addr + len; unsigned long last_addr_mask; pte_t *src_pte, *dst_pte; struct mmu_notifier_range range; bool shared_pmd = false; mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, old_addr, old_end); adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end); /* * In case of shared PMDs, we should cover the maximum possible * range. */ flush_cache_range(vma, range.start, range.end); mmu_notifier_invalidate_range_start(&range); last_addr_mask = hugetlb_mask_last_page(h); /* Prevent race with file truncation */ hugetlb_vma_lock_write(vma); i_mmap_lock_write(mapping); for (; old_addr < old_end; old_addr += sz, new_addr += sz) { src_pte = hugetlb_walk(vma, old_addr, sz); if (!src_pte) { old_addr |= last_addr_mask; new_addr |= last_addr_mask; continue; } if (huge_pte_none(huge_ptep_get(mm, old_addr, src_pte))) continue; if (huge_pmd_unshare(mm, vma, old_addr, src_pte)) { shared_pmd = true; old_addr |= last_addr_mask; new_addr |= last_addr_mask; continue; } dst_pte = huge_pte_alloc(mm, new_vma, new_addr, sz); if (!dst_pte) break; move_huge_pte(vma, old_addr, new_addr, src_pte, dst_pte, sz); } if (shared_pmd) flush_hugetlb_tlb_range(vma, range.start, range.end); else flush_hugetlb_tlb_range(vma, old_end - len, old_end); mmu_notifier_invalidate_range_end(&range); i_mmap_unlock_write(mapping); hugetlb_vma_unlock_write(vma); return len + old_addr - old_end; } void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long start, unsigned long end, struct page *ref_page, zap_flags_t zap_flags) { struct mm_struct *mm = vma->vm_mm; unsigned long address; pte_t *ptep; pte_t pte; spinlock_t *ptl; struct page *page; struct hstate *h = hstate_vma(vma); unsigned long sz = huge_page_size(h); bool adjust_reservation = false; unsigned long last_addr_mask; bool force_flush = false; WARN_ON(!is_vm_hugetlb_page(vma)); BUG_ON(start & ~huge_page_mask(h)); BUG_ON(end & ~huge_page_mask(h)); /* * This is a hugetlb vma, all the pte entries should point * to huge page. */ tlb_change_page_size(tlb, sz); tlb_start_vma(tlb, vma); last_addr_mask = hugetlb_mask_last_page(h); address = start; for (; address < end; address += sz) { ptep = hugetlb_walk(vma, address, sz); if (!ptep) { address |= last_addr_mask; continue; } ptl = huge_pte_lock(h, mm, ptep); if (huge_pmd_unshare(mm, vma, address, ptep)) { spin_unlock(ptl); tlb_flush_pmd_range(tlb, address & PUD_MASK, PUD_SIZE); force_flush = true; address |= last_addr_mask; continue; } pte = huge_ptep_get(mm, address, ptep); if (huge_pte_none(pte)) { spin_unlock(ptl); continue; } /* * Migrating hugepage or HWPoisoned hugepage is already * unmapped and its refcount is dropped, so just clear pte here. */ if (unlikely(!pte_present(pte))) { /* * If the pte was wr-protected by uffd-wp in any of the * swap forms, meanwhile the caller does not want to * drop the uffd-wp bit in this zap, then replace the * pte with a marker. */ if (pte_swp_uffd_wp_any(pte) && !(zap_flags & ZAP_FLAG_DROP_MARKER)) set_huge_pte_at(mm, address, ptep, make_pte_marker(PTE_MARKER_UFFD_WP), sz); else huge_pte_clear(mm, address, ptep, sz); spin_unlock(ptl); continue; } page = pte_page(pte); /* * If a reference page is supplied, it is because a specific * page is being unmapped, not a range. Ensure the page we * are about to unmap is the actual page of interest. */ if (ref_page) { if (page != ref_page) { spin_unlock(ptl); continue; } /* * Mark the VMA as having unmapped its page so that * future faults in this VMA will fail rather than * looking like data was lost */ set_vma_resv_flags(vma, HPAGE_RESV_UNMAPPED); } pte = huge_ptep_get_and_clear(mm, address, ptep, sz); tlb_remove_huge_tlb_entry(h, tlb, ptep, address); if (huge_pte_dirty(pte)) set_page_dirty(page); /* Leave a uffd-wp pte marker if needed */ if (huge_pte_uffd_wp(pte) && !(zap_flags & ZAP_FLAG_DROP_MARKER)) set_huge_pte_at(mm, address, ptep, make_pte_marker(PTE_MARKER_UFFD_WP), sz); hugetlb_count_sub(pages_per_huge_page(h), mm); hugetlb_remove_rmap(page_folio(page)); /* * Restore the reservation for anonymous page, otherwise the * backing page could be stolen by someone. * If there we are freeing a surplus, do not set the restore * reservation bit. */ if (!h->surplus_huge_pages && __vma_private_lock(vma) && folio_test_anon(page_folio(page))) { folio_set_hugetlb_restore_reserve(page_folio(page)); /* Reservation to be adjusted after the spin lock */ adjust_reservation = true; } spin_unlock(ptl); /* * Adjust the reservation for the region that will have the * reserve restored. Keep in mind that vma_needs_reservation() changes * resv->adds_in_progress if it succeeds. If this is not done, * do_exit() will not see it, and will keep the reservation * forever. */ if (adjust_reservation) { int rc = vma_needs_reservation(h, vma, address); if (rc < 0) /* Pressumably allocate_file_region_entries failed * to allocate a file_region struct. Clear * hugetlb_restore_reserve so that global reserve * count will not be incremented by free_huge_folio. * Act as if we consumed the reservation. */ folio_clear_hugetlb_restore_reserve(page_folio(page)); else if (rc) vma_add_reservation(h, vma, address); } tlb_remove_page_size(tlb, page, huge_page_size(h)); /* * Bail out after unmapping reference page if supplied */ if (ref_page) break; } tlb_end_vma(tlb, vma); /* * If we unshared PMDs, the TLB flush was not recorded in mmu_gather. We * could defer the flush until now, since by holding i_mmap_rwsem we * guaranteed that the last refernece would not be dropped. But we must * do the flushing before we return, as otherwise i_mmap_rwsem will be * dropped and the last reference to the shared PMDs page might be * dropped as well. * * In theory we could defer the freeing of the PMD pages as well, but * huge_pmd_unshare() relies on the exact page_count for the PMD page to * detect sharing, so we cannot defer the release of the page either. * Instead, do flush now. */ if (force_flush) tlb_flush_mmu_tlbonly(tlb); } void __hugetlb_zap_begin(struct vm_area_struct *vma, unsigned long *start, unsigned long *end) { if (!vma->vm_file) /* hugetlbfs_file_mmap error */ return; adjust_range_if_pmd_sharing_possible(vma, start, end); hugetlb_vma_lock_write(vma); if (vma->vm_file) i_mmap_lock_write(vma->vm_file->f_mapping); } void __hugetlb_zap_end(struct vm_area_struct *vma, struct zap_details *details) { zap_flags_t zap_flags = details ? details->zap_flags : 0; if (!vma->vm_file) /* hugetlbfs_file_mmap error */ return; if (zap_flags & ZAP_FLAG_UNMAP) { /* final unmap */ /* * Unlock and free the vma lock before releasing i_mmap_rwsem. * When the vma_lock is freed, this makes the vma ineligible * for pmd sharing. And, i_mmap_rwsem is required to set up * pmd sharing. This is important as page tables for this * unmapped range will be asynchrously deleted. If the page * tables are shared, there will be issues when accessed by * someone else. */ __hugetlb_vma_unlock_write_free(vma); } else { hugetlb_vma_unlock_write(vma); } if (vma->vm_file) i_mmap_unlock_write(vma->vm_file->f_mapping); } void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, unsigned long end, struct page *ref_page, zap_flags_t zap_flags) { struct mmu_notifier_range range; struct mmu_gather tlb; mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm, start, end); adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end); mmu_notifier_invalidate_range_start(&range); tlb_gather_mmu(&tlb, vma->vm_mm); __unmap_hugepage_range(&tlb, vma, start, end, ref_page, zap_flags); mmu_notifier_invalidate_range_end(&range); tlb_finish_mmu(&tlb); } /* * This is called when the original mapper is failing to COW a MAP_PRIVATE * mapping it owns the reserve page for. The intention is to unmap the page * from other VMAs and let the children be SIGKILLed if they are faulting the * same region. */ static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, struct page *page, unsigned long address) { struct hstate *h = hstate_vma(vma); struct vm_area_struct *iter_vma; struct address_space *mapping; pgoff_t pgoff; /* * vm_pgoff is in PAGE_SIZE units, hence the different calculation * from page cache lookup which is in HPAGE_SIZE units. */ address = address & huge_page_mask(h); pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; mapping = vma->vm_file->f_mapping; /* * Take the mapping lock for the duration of the table walk. As * this mapping should be shared between all the VMAs, * __unmap_hugepage_range() is called as the lock is already held */ i_mmap_lock_write(mapping); vma_interval_tree_foreach(iter_vma, &mapping->i_mmap, pgoff, pgoff) { /* Do not unmap the current VMA */ if (iter_vma == vma) continue; /* * Shared VMAs have their own reserves and do not affect * MAP_PRIVATE accounting but it is possible that a shared * VMA is using the same page so check and skip such VMAs. */ if (iter_vma->vm_flags & VM_MAYSHARE) continue; /* * Unmap the page from other VMAs without their own reserves. * They get marked to be SIGKILLed if they fault in these * areas. This is because a future no-page fault on this VMA * could insert a zeroed page instead of the data existing * from the time of fork. This would look like data corruption */ if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER)) unmap_hugepage_range(iter_vma, address, address + huge_page_size(h), page, 0); } i_mmap_unlock_write(mapping); } /* * hugetlb_wp() should be called with page lock of the original hugepage held. * Called with hugetlb_fault_mutex_table held and pte_page locked so we * cannot race with other handlers or page migration. * Keep the pte_same checks anyway to make transition from the mutex easier. */ static vm_fault_t hugetlb_wp(struct folio *pagecache_folio, struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; struct mm_struct *mm = vma->vm_mm; const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE; pte_t pte = huge_ptep_get(mm, vmf->address, vmf->pte); struct hstate *h = hstate_vma(vma); struct folio *old_folio; struct folio *new_folio; bool cow_from_owner = 0; vm_fault_t ret = 0; struct mmu_notifier_range range; /* * Never handle CoW for uffd-wp protected pages. It should be only * handled when the uffd-wp protection is removed. * * Note that only the CoW optimization path (in hugetlb_no_page()) * can trigger this, because hugetlb_fault() will always resolve * uffd-wp bit first. */ if (!unshare && huge_pte_uffd_wp(pte)) return 0; /* Let's take out MAP_SHARED mappings first. */ if (vma->vm_flags & VM_MAYSHARE) { set_huge_ptep_writable(vma, vmf->address, vmf->pte); return 0; } old_folio = page_folio(pte_page(pte)); delayacct_wpcopy_start(); retry_avoidcopy: /* * If no-one else is actually using this page, we're the exclusive * owner and can reuse this page. * * Note that we don't rely on the (safer) folio refcount here, because * copying the hugetlb folio when there are unexpected (temporary) * folio references could harm simple fork()+exit() users when * we run out of free hugetlb folios: we would have to kill processes * in scenarios that used to work. As a side effect, there can still * be leaks between processes, for example, with FOLL_GET users. */ if (folio_mapcount(old_folio) == 1 && folio_test_anon(old_folio)) { if (!PageAnonExclusive(&old_folio->page)) { folio_move_anon_rmap(old_folio, vma); SetPageAnonExclusive(&old_folio->page); } if (likely(!unshare)) set_huge_ptep_maybe_writable(vma, vmf->address, vmf->pte); delayacct_wpcopy_end(); return 0; } VM_BUG_ON_PAGE(folio_test_anon(old_folio) && PageAnonExclusive(&old_folio->page), &old_folio->page); /* * If the process that created a MAP_PRIVATE mapping is about to * perform a COW due to a shared page count, attempt to satisfy * the allocation without using the existing reserves. The pagecache * page is used to determine if the reserve at this address was * consumed or not. If reserves were used, a partial faulted mapping * at the time of fork() could consume its reserves on COW instead * of the full address range. */ if (is_vma_resv_set(vma, HPAGE_RESV_OWNER) && old_folio != pagecache_folio) cow_from_owner = true; folio_get(old_folio); /* * Drop page table lock as buddy allocator may be called. It will * be acquired again before returning to the caller, as expected. */ spin_unlock(vmf->ptl); new_folio = alloc_hugetlb_folio(vma, vmf->address, cow_from_owner); if (IS_ERR(new_folio)) { /* * If a process owning a MAP_PRIVATE mapping fails to COW, * it is due to references held by a child and an insufficient * huge page pool. To guarantee the original mappers * reliability, unmap the page from child processes. The child * may get SIGKILLed if it later faults. */ if (cow_from_owner) { struct address_space *mapping = vma->vm_file->f_mapping; pgoff_t idx; u32 hash; folio_put(old_folio); /* * Drop hugetlb_fault_mutex and vma_lock before * unmapping. unmapping needs to hold vma_lock * in write mode. Dropping vma_lock in read mode * here is OK as COW mappings do not interact with * PMD sharing. * * Reacquire both after unmap operation. */ idx = vma_hugecache_offset(h, vma, vmf->address); hash = hugetlb_fault_mutex_hash(mapping, idx); hugetlb_vma_unlock_read(vma); mutex_unlock(&hugetlb_fault_mutex_table[hash]); unmap_ref_private(mm, vma, &old_folio->page, vmf->address); mutex_lock(&hugetlb_fault_mutex_table[hash]); hugetlb_vma_lock_read(vma); spin_lock(vmf->ptl); vmf->pte = hugetlb_walk(vma, vmf->address, huge_page_size(h)); if (likely(vmf->pte && pte_same(huge_ptep_get(mm, vmf->address, vmf->pte), pte))) goto retry_avoidcopy; /* * race occurs while re-acquiring page table * lock, and our job is done. */ delayacct_wpcopy_end(); return 0; } ret = vmf_error(PTR_ERR(new_folio)); goto out_release_old; } /* * When the original hugepage is shared one, it does not have * anon_vma prepared. */ ret = __vmf_anon_prepare(vmf); if (unlikely(ret)) goto out_release_all; if (copy_user_large_folio(new_folio, old_folio, vmf->real_address, vma)) { ret = VM_FAULT_HWPOISON_LARGE | VM_FAULT_SET_HINDEX(hstate_index(h)); goto out_release_all; } __folio_mark_uptodate(new_folio); mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, vmf->address, vmf->address + huge_page_size(h)); mmu_notifier_invalidate_range_start(&range); /* * Retake the page table lock to check for racing updates * before the page tables are altered */ spin_lock(vmf->ptl); vmf->pte = hugetlb_walk(vma, vmf->address, huge_page_size(h)); if (likely(vmf->pte && pte_same(huge_ptep_get(mm, vmf->address, vmf->pte), pte))) { pte_t newpte = make_huge_pte(vma, &new_folio->page, !unshare); /* Break COW or unshare */ huge_ptep_clear_flush(vma, vmf->address, vmf->pte); hugetlb_remove_rmap(old_folio); hugetlb_add_new_anon_rmap(new_folio, vma, vmf->address); if (huge_pte_uffd_wp(pte)) newpte = huge_pte_mkuffd_wp(newpte); set_huge_pte_at(mm, vmf->address, vmf->pte, newpte, huge_page_size(h)); folio_set_hugetlb_migratable(new_folio); /* Make the old page be freed below */ new_folio = old_folio; } spin_unlock(vmf->ptl); mmu_notifier_invalidate_range_end(&range); out_release_all: /* * No restore in case of successful pagetable update (Break COW or * unshare) */ if (new_folio != old_folio) restore_reserve_on_error(h, vma, vmf->address, new_folio); folio_put(new_folio); out_release_old: folio_put(old_folio); spin_lock(vmf->ptl); /* Caller expects lock to be held */ delayacct_wpcopy_end(); return ret; } /* * Return whether there is a pagecache page to back given address within VMA. */ bool hugetlbfs_pagecache_present(struct hstate *h, struct vm_area_struct *vma, unsigned long address) { struct address_space *mapping = vma->vm_file->f_mapping; pgoff_t idx = linear_page_index(vma, address); struct folio *folio; folio = filemap_get_folio(mapping, idx); if (IS_ERR(folio)) return false; folio_put(folio); return true; } int hugetlb_add_to_page_cache(struct folio *folio, struct address_space *mapping, pgoff_t idx) { struct inode *inode = mapping->host; struct hstate *h = hstate_inode(inode); int err; idx <<= huge_page_order(h); __folio_set_locked(folio); err = __filemap_add_folio(mapping, folio, idx, GFP_KERNEL, NULL); if (unlikely(err)) { __folio_clear_locked(folio); return err; } folio_clear_hugetlb_restore_reserve(folio); /* * mark folio dirty so that it will not be removed from cache/file * by non-hugetlbfs specific code paths. */ folio_mark_dirty(folio); spin_lock(&inode->i_lock); inode->i_blocks += blocks_per_huge_page(h); spin_unlock(&inode->i_lock); return 0; } static inline vm_fault_t hugetlb_handle_userfault(struct vm_fault *vmf, struct address_space *mapping, unsigned long reason) { u32 hash; /* * vma_lock and hugetlb_fault_mutex must be dropped before handling * userfault. Also mmap_lock could be dropped due to handling * userfault, any vma operation should be careful from here. */ hugetlb_vma_unlock_read(vmf->vma); hash = hugetlb_fault_mutex_hash(mapping, vmf->pgoff); mutex_unlock(&hugetlb_fault_mutex_table[hash]); return handle_userfault(vmf, reason); } /* * Recheck pte with pgtable lock. Returns true if pte didn't change, or * false if pte changed or is changing. */ static bool hugetlb_pte_stable(struct hstate *h, struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t old_pte) { spinlock_t *ptl; bool same; ptl = huge_pte_lock(h, mm, ptep); same = pte_same(huge_ptep_get(mm, addr, ptep), old_pte); spin_unlock(ptl); return same; } static vm_fault_t hugetlb_no_page(struct address_space *mapping, struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; struct mm_struct *mm = vma->vm_mm; struct hstate *h = hstate_vma(vma); vm_fault_t ret = VM_FAULT_SIGBUS; int anon_rmap = 0; unsigned long size; struct folio *folio; pte_t new_pte; bool new_folio, new_pagecache_folio = false; u32 hash = hugetlb_fault_mutex_hash(mapping, vmf->pgoff); /* * Currently, we are forced to kill the process in the event the * original mapper has unmapped pages from the child due to a failed * COW/unsharing. Warn that such a situation has occurred as it may not * be obvious. */ if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) { pr_warn_ratelimited("PID %d killed due to inadequate hugepage pool\n", current->pid); goto out; } /* * Use page lock to guard against racing truncation * before we get page_table_lock. */ new_folio = false; folio = filemap_lock_hugetlb_folio(h, mapping, vmf->pgoff); if (IS_ERR(folio)) { size = i_size_read(mapping->host) >> huge_page_shift(h); if (vmf->pgoff >= size) goto out; /* Check for page in userfault range */ if (userfaultfd_missing(vma)) { /* * Since hugetlb_no_page() was examining pte * without pgtable lock, we need to re-test under * lock because the pte may not be stable and could * have changed from under us. Try to detect * either changed or during-changing ptes and retry * properly when needed. * * Note that userfaultfd is actually fine with * false positives (e.g. caused by pte changed), * but not wrong logical events (e.g. caused by * reading a pte during changing). The latter can * confuse the userspace, so the strictness is very * much preferred. E.g., MISSING event should * never happen on the page after UFFDIO_COPY has * correctly installed the page and returned. */ if (!hugetlb_pte_stable(h, mm, vmf->address, vmf->pte, vmf->orig_pte)) { ret = 0; goto out; } return hugetlb_handle_userfault(vmf, mapping, VM_UFFD_MISSING); } if (!(vma->vm_flags & VM_MAYSHARE)) { ret = __vmf_anon_prepare(vmf); if (unlikely(ret)) goto out; } folio = alloc_hugetlb_folio(vma, vmf->address, false); if (IS_ERR(folio)) { /* * Returning error will result in faulting task being * sent SIGBUS. The hugetlb fault mutex prevents two * tasks from racing to fault in the same page which * could result in false unable to allocate errors. * Page migration does not take the fault mutex, but * does a clear then write of pte's under page table * lock. Page fault code could race with migration, * notice the clear pte and try to allocate a page * here. Before returning error, get ptl and make * sure there really is no pte entry. */ if (hugetlb_pte_stable(h, mm, vmf->address, vmf->pte, vmf->orig_pte)) ret = vmf_error(PTR_ERR(folio)); else ret = 0; goto out; } folio_zero_user(folio, vmf->real_address); __folio_mark_uptodate(folio); new_folio = true; if (vma->vm_flags & VM_MAYSHARE) { int err = hugetlb_add_to_page_cache(folio, mapping, vmf->pgoff); if (err) { /* * err can't be -EEXIST which implies someone * else consumed the reservation since hugetlb * fault mutex is held when add a hugetlb page * to the page cache. So it's safe to call * restore_reserve_on_error() here. */ restore_reserve_on_error(h, vma, vmf->address, folio); folio_put(folio); ret = VM_FAULT_SIGBUS; goto out; } new_pagecache_folio = true; } else { folio_lock(folio); anon_rmap = 1; } } else { /* * If memory error occurs between mmap() and fault, some process * don't have hwpoisoned swap entry for errored virtual address. * So we need to block hugepage fault by PG_hwpoison bit check. */ if (unlikely(folio_test_hwpoison(folio))) { ret = VM_FAULT_HWPOISON_LARGE | VM_FAULT_SET_HINDEX(hstate_index(h)); goto backout_unlocked; } /* Check for page in userfault range. */ if (userfaultfd_minor(vma)) { folio_unlock(folio); folio_put(folio); /* See comment in userfaultfd_missing() block above */ if (!hugetlb_pte_stable(h, mm, vmf->address, vmf->pte, vmf->orig_pte)) { ret = 0; goto out; } return hugetlb_handle_userfault(vmf, mapping, VM_UFFD_MINOR); } } /* * If we are going to COW a private mapping later, we examine the * pending reservations for this page now. This will ensure that * any allocations necessary to record that reservation occur outside * the spinlock. */ if ((vmf->flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { if (vma_needs_reservation(h, vma, vmf->address) < 0) { ret = VM_FAULT_OOM; goto backout_unlocked; } /* Just decrements count, does not deallocate */ vma_end_reservation(h, vma, vmf->address); } vmf->ptl = huge_pte_lock(h, mm, vmf->pte); ret = 0; /* If pte changed from under us, retry */ if (!pte_same(huge_ptep_get(mm, vmf->address, vmf->pte), vmf->orig_pte)) goto backout; if (anon_rmap) hugetlb_add_new_anon_rmap(folio, vma, vmf->address); else hugetlb_add_file_rmap(folio); new_pte = make_huge_pte(vma, &folio->page, vma->vm_flags & VM_SHARED); /* * If this pte was previously wr-protected, keep it wr-protected even * if populated. */ if (unlikely(pte_marker_uffd_wp(vmf->orig_pte))) new_pte = huge_pte_mkuffd_wp(new_pte); set_huge_pte_at(mm, vmf->address, vmf->pte, new_pte, huge_page_size(h)); hugetlb_count_add(pages_per_huge_page(h), mm); if ((vmf->flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { /* Optimization, do the COW without a second fault */ ret = hugetlb_wp(folio, vmf); } spin_unlock(vmf->ptl); /* * Only set hugetlb_migratable in newly allocated pages. Existing pages * found in the pagecache may not have hugetlb_migratable if they have * been isolated for migration. */ if (new_folio) folio_set_hugetlb_migratable(folio); folio_unlock(folio); out: hugetlb_vma_unlock_read(vma); /* * We must check to release the per-VMA lock. __vmf_anon_prepare() is * the only way ret can be set to VM_FAULT_RETRY. */ if (unlikely(ret & VM_FAULT_RETRY)) vma_end_read(vma); mutex_unlock(&hugetlb_fault_mutex_table[hash]); return ret; backout: spin_unlock(vmf->ptl); backout_unlocked: if (new_folio && !new_pagecache_folio) restore_reserve_on_error(h, vma, vmf->address, folio); folio_unlock(folio); folio_put(folio); goto out; } #ifdef CONFIG_SMP u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx) { unsigned long key[2]; u32 hash; key[0] = (unsigned long) mapping; key[1] = idx; hash = jhash2((u32 *)&key, sizeof(key)/(sizeof(u32)), 0); return hash & (num_fault_mutexes - 1); } #else /* * For uniprocessor systems we always use a single mutex, so just * return 0 and avoid the hashing overhead. */ u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx) { return 0; } #endif vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, unsigned int flags) { vm_fault_t ret; u32 hash; struct folio *folio = NULL; struct folio *pagecache_folio = NULL; struct hstate *h = hstate_vma(vma); struct address_space *mapping; int need_wait_lock = 0; struct vm_fault vmf = { .vma = vma, .address = address & huge_page_mask(h), .real_address = address, .flags = flags, .pgoff = vma_hugecache_offset(h, vma, address & huge_page_mask(h)), /* TODO: Track hugetlb faults using vm_fault */ /* * Some fields may not be initialized, be careful as it may * be hard to debug if called functions make assumptions */ }; /* * Serialize hugepage allocation and instantiation, so that we don't * get spurious allocation failures if two CPUs race to instantiate * the same page in the page cache. */ mapping = vma->vm_file->f_mapping; hash = hugetlb_fault_mutex_hash(mapping, vmf.pgoff); mutex_lock(&hugetlb_fault_mutex_table[hash]); /* * Acquire vma lock before calling huge_pte_alloc and hold * until finished with vmf.pte. This prevents huge_pmd_unshare from * being called elsewhere and making the vmf.pte no longer valid. */ hugetlb_vma_lock_read(vma); vmf.pte = huge_pte_alloc(mm, vma, vmf.address, huge_page_size(h)); if (!vmf.pte) { hugetlb_vma_unlock_read(vma); mutex_unlock(&hugetlb_fault_mutex_table[hash]); return VM_FAULT_OOM; } vmf.orig_pte = huge_ptep_get(mm, vmf.address, vmf.pte); if (huge_pte_none_mostly(vmf.orig_pte)) { if (is_pte_marker(vmf.orig_pte)) { pte_marker marker = pte_marker_get(pte_to_swp_entry(vmf.orig_pte)); if (marker & PTE_MARKER_POISONED) { ret = VM_FAULT_HWPOISON_LARGE | VM_FAULT_SET_HINDEX(hstate_index(h)); goto out_mutex; } else if (WARN_ON_ONCE(marker & PTE_MARKER_GUARD)) { /* This isn't supported in hugetlb. */ ret = VM_FAULT_SIGSEGV; goto out_mutex; } } /* * Other PTE markers should be handled the same way as none PTE. * * hugetlb_no_page will drop vma lock and hugetlb fault * mutex internally, which make us return immediately. */ return hugetlb_no_page(mapping, &vmf); } ret = 0; /* * vmf.orig_pte could be a migration/hwpoison vmf.orig_pte at this * point, so this check prevents the kernel from going below assuming * that we have an active hugepage in pagecache. This goto expects * the 2nd page fault, and is_hugetlb_entry_(migration|hwpoisoned) * check will properly handle it. */ if (!pte_present(vmf.orig_pte)) { if (unlikely(is_hugetlb_entry_migration(vmf.orig_pte))) { /* * Release the hugetlb fault lock now, but retain * the vma lock, because it is needed to guard the * huge_pte_lockptr() later in * migration_entry_wait_huge(). The vma lock will * be released there. */ mutex_unlock(&hugetlb_fault_mutex_table[hash]); migration_entry_wait_huge(vma, vmf.address, vmf.pte); return 0; } else if (unlikely(is_hugetlb_entry_hwpoisoned(vmf.orig_pte))) ret = VM_FAULT_HWPOISON_LARGE | VM_FAULT_SET_HINDEX(hstate_index(h)); goto out_mutex; } /* * If we are going to COW/unshare the mapping later, we examine the * pending reservations for this page now. This will ensure that any * allocations necessary to record that reservation occur outside the * spinlock. Also lookup the pagecache page now as it is used to * determine if a reservation has been consumed. */ if ((flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) && !(vma->vm_flags & VM_MAYSHARE) && !huge_pte_write(vmf.orig_pte)) { if (vma_needs_reservation(h, vma, vmf.address) < 0) { ret = VM_FAULT_OOM; goto out_mutex; } /* Just decrements count, does not deallocate */ vma_end_reservation(h, vma, vmf.address); pagecache_folio = filemap_lock_hugetlb_folio(h, mapping, vmf.pgoff); if (IS_ERR(pagecache_folio)) pagecache_folio = NULL; } vmf.ptl = huge_pte_lock(h, mm, vmf.pte); /* Check for a racing update before calling hugetlb_wp() */ if (unlikely(!pte_same(vmf.orig_pte, huge_ptep_get(mm, vmf.address, vmf.pte)))) goto out_ptl; /* Handle userfault-wp first, before trying to lock more pages */ if (userfaultfd_wp(vma) && huge_pte_uffd_wp(huge_ptep_get(mm, vmf.address, vmf.pte)) && (flags & FAULT_FLAG_WRITE) && !huge_pte_write(vmf.orig_pte)) { if (!userfaultfd_wp_async(vma)) { spin_unlock(vmf.ptl); if (pagecache_folio) { folio_unlock(pagecache_folio); folio_put(pagecache_folio); } hugetlb_vma_unlock_read(vma); mutex_unlock(&hugetlb_fault_mutex_table[hash]); return handle_userfault(&vmf, VM_UFFD_WP); } vmf.orig_pte = huge_pte_clear_uffd_wp(vmf.orig_pte); set_huge_pte_at(mm, vmf.address, vmf.pte, vmf.orig_pte, huge_page_size(hstate_vma(vma))); /* Fallthrough to CoW */ } /* * hugetlb_wp() requires page locks of pte_page(vmf.orig_pte) and * pagecache_folio, so here we need take the former one * when folio != pagecache_folio or !pagecache_folio. */ folio = page_folio(pte_page(vmf.orig_pte)); if (folio != pagecache_folio) if (!folio_trylock(folio)) { need_wait_lock = 1; goto out_ptl; } folio_get(folio); if (flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) { if (!huge_pte_write(vmf.orig_pte)) { ret = hugetlb_wp(pagecache_folio, &vmf); goto out_put_page; } else if (likely(flags & FAULT_FLAG_WRITE)) { vmf.orig_pte = huge_pte_mkdirty(vmf.orig_pte); } } vmf.orig_pte = pte_mkyoung(vmf.orig_pte); if (huge_ptep_set_access_flags(vma, vmf.address, vmf.pte, vmf.orig_pte, flags & FAULT_FLAG_WRITE)) update_mmu_cache(vma, vmf.address, vmf.pte); out_put_page: if (folio != pagecache_folio) folio_unlock(folio); folio_put(folio); out_ptl: spin_unlock(vmf.ptl); if (pagecache_folio) { folio_unlock(pagecache_folio); folio_put(pagecache_folio); } out_mutex: hugetlb_vma_unlock_read(vma); /* * We must check to release the per-VMA lock. __vmf_anon_prepare() in * hugetlb_wp() is the only way ret can be set to VM_FAULT_RETRY. */ if (unlikely(ret & VM_FAULT_RETRY)) vma_end_read(vma); mutex_unlock(&hugetlb_fault_mutex_table[hash]); /* * Generally it's safe to hold refcount during waiting page lock. But * here we just wait to defer the next page fault to avoid busy loop and * the page is not used after unlocked before returning from the current * page fault. So we are safe from accessing freed page, even if we wait * here without taking refcount. */ if (need_wait_lock) folio_wait_locked(folio); return ret; } #ifdef CONFIG_USERFAULTFD /* * Can probably be eliminated, but still used by hugetlb_mfill_atomic_pte(). */ static struct folio *alloc_hugetlb_folio_vma(struct hstate *h, struct vm_area_struct *vma, unsigned long address) { struct mempolicy *mpol; nodemask_t *nodemask; struct folio *folio; gfp_t gfp_mask; int node; gfp_mask = htlb_alloc_mask(h); node = huge_node(vma, address, gfp_mask, &mpol, &nodemask); /* * This is used to allocate a temporary hugetlb to hold the copied * content, which will then be copied again to the final hugetlb * consuming a reservation. Set the alloc_fallback to false to indicate * that breaking the per-node hugetlb pool is not allowed in this case. */ folio = alloc_hugetlb_folio_nodemask(h, node, nodemask, gfp_mask, false); mpol_cond_put(mpol); return folio; } /* * Used by userfaultfd UFFDIO_* ioctls. Based on userfaultfd's mfill_atomic_pte * with modifications for hugetlb pages. */ int hugetlb_mfill_atomic_pte(pte_t *dst_pte, struct vm_area_struct *dst_vma, unsigned long dst_addr, unsigned long src_addr, uffd_flags_t flags, struct folio **foliop) { struct mm_struct *dst_mm = dst_vma->vm_mm; bool is_continue = uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE); bool wp_enabled = (flags & MFILL_ATOMIC_WP); struct hstate *h = hstate_vma(dst_vma); struct address_space *mapping = dst_vma->vm_file->f_mapping; pgoff_t idx = vma_hugecache_offset(h, dst_vma, dst_addr); unsigned long size = huge_page_size(h); int vm_shared = dst_vma->vm_flags & VM_SHARED; pte_t _dst_pte; spinlock_t *ptl; int ret = -ENOMEM; struct folio *folio; bool folio_in_pagecache = false; if (uffd_flags_mode_is(flags, MFILL_ATOMIC_POISON)) { ptl = huge_pte_lock(h, dst_mm, dst_pte); /* Don't overwrite any existing PTEs (even markers) */ if (!huge_pte_none(huge_ptep_get(dst_mm, dst_addr, dst_pte))) { spin_unlock(ptl); return -EEXIST; } _dst_pte = make_pte_marker(PTE_MARKER_POISONED); set_huge_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte, size); /* No need to invalidate - it was non-present before */ update_mmu_cache(dst_vma, dst_addr, dst_pte); spin_unlock(ptl); return 0; } if (is_continue) { ret = -EFAULT; folio = filemap_lock_hugetlb_folio(h, mapping, idx); if (IS_ERR(folio)) goto out; folio_in_pagecache = true; } else if (!*foliop) { /* If a folio already exists, then it's UFFDIO_COPY for * a non-missing case. Return -EEXIST. */ if (vm_shared && hugetlbfs_pagecache_present(h, dst_vma, dst_addr)) { ret = -EEXIST; goto out; } folio = alloc_hugetlb_folio(dst_vma, dst_addr, false); if (IS_ERR(folio)) { ret = -ENOMEM; goto out; } ret = copy_folio_from_user(folio, (const void __user *) src_addr, false); /* fallback to copy_from_user outside mmap_lock */ if (unlikely(ret)) { ret = -ENOENT; /* Free the allocated folio which may have * consumed a reservation. */ restore_reserve_on_error(h, dst_vma, dst_addr, folio); folio_put(folio); /* Allocate a temporary folio to hold the copied * contents. */ folio = alloc_hugetlb_folio_vma(h, dst_vma, dst_addr); if (!folio) { ret = -ENOMEM; goto out; } *foliop = folio; /* Set the outparam foliop and return to the caller to * copy the contents outside the lock. Don't free the * folio. */ goto out; } } else { if (vm_shared && hugetlbfs_pagecache_present(h, dst_vma, dst_addr)) { folio_put(*foliop); ret = -EEXIST; *foliop = NULL; goto out; } folio = alloc_hugetlb_folio(dst_vma, dst_addr, false); if (IS_ERR(folio)) { folio_put(*foliop); ret = -ENOMEM; *foliop = NULL; goto out; } ret = copy_user_large_folio(folio, *foliop, dst_addr, dst_vma); folio_put(*foliop); *foliop = NULL; if (ret) { folio_put(folio); goto out; } } /* * If we just allocated a new page, we need a memory barrier to ensure * that preceding stores to the page become visible before the * set_pte_at() write. The memory barrier inside __folio_mark_uptodate * is what we need. * * In the case where we have not allocated a new page (is_continue), * the page must already be uptodate. UFFDIO_CONTINUE already includes * an earlier smp_wmb() to ensure that prior stores will be visible * before the set_pte_at() write. */ if (!is_continue) __folio_mark_uptodate(folio); else WARN_ON_ONCE(!folio_test_uptodate(folio)); /* Add shared, newly allocated pages to the page cache. */ if (vm_shared && !is_continue) { ret = -EFAULT; if (idx >= (i_size_read(mapping->host) >> huge_page_shift(h))) goto out_release_nounlock; /* * Serialization between remove_inode_hugepages() and * hugetlb_add_to_page_cache() below happens through the * hugetlb_fault_mutex_table that here must be hold by * the caller. */ ret = hugetlb_add_to_page_cache(folio, mapping, idx); if (ret) goto out_release_nounlock; folio_in_pagecache = true; } ptl = huge_pte_lock(h, dst_mm, dst_pte); ret = -EIO; if (folio_test_hwpoison(folio)) goto out_release_unlock; /* * We allow to overwrite a pte marker: consider when both MISSING|WP * registered, we firstly wr-protect a none pte which has no page cache * page backing it, then access the page. */ ret = -EEXIST; if (!huge_pte_none_mostly(huge_ptep_get(dst_mm, dst_addr, dst_pte))) goto out_release_unlock; if (folio_in_pagecache) hugetlb_add_file_rmap(folio); else hugetlb_add_new_anon_rmap(folio, dst_vma, dst_addr); /* * For either: (1) CONTINUE on a non-shared VMA, or (2) UFFDIO_COPY * with wp flag set, don't set pte write bit. */ _dst_pte = make_huge_pte(dst_vma, &folio->page, !wp_enabled && !(is_continue && !vm_shared)); /* * Always mark UFFDIO_COPY page dirty; note that this may not be * extremely important for hugetlbfs for now since swapping is not * supported, but we should still be clear in that this page cannot be * thrown away at will, even if write bit not set. */ _dst_pte = huge_pte_mkdirty(_dst_pte); _dst_pte = pte_mkyoung(_dst_pte); if (wp_enabled) _dst_pte = huge_pte_mkuffd_wp(_dst_pte); set_huge_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte, size); hugetlb_count_add(pages_per_huge_page(h), dst_mm); /* No need to invalidate - it was non-present before */ update_mmu_cache(dst_vma, dst_addr, dst_pte); spin_unlock(ptl); if (!is_continue) folio_set_hugetlb_migratable(folio); if (vm_shared || is_continue) folio_unlock(folio); ret = 0; out: return ret; out_release_unlock: spin_unlock(ptl); if (vm_shared || is_continue) folio_unlock(folio); out_release_nounlock: if (!folio_in_pagecache) restore_reserve_on_error(h, dst_vma, dst_addr, folio); folio_put(folio); goto out; } #endif /* CONFIG_USERFAULTFD */ long hugetlb_change_protection(struct vm_area_struct *vma, unsigned long address, unsigned long end, pgprot_t newprot, unsigned long cp_flags) { struct mm_struct *mm = vma->vm_mm; unsigned long start = address; pte_t *ptep; pte_t pte; struct hstate *h = hstate_vma(vma); long pages = 0, psize = huge_page_size(h); bool shared_pmd = false; struct mmu_notifier_range range; unsigned long last_addr_mask; bool uffd_wp = cp_flags & MM_CP_UFFD_WP; bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE; /* * In the case of shared PMDs, the area to flush could be beyond * start/end. Set range.start/range.end to cover the maximum possible * range if PMD sharing is possible. */ mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_VMA, 0, mm, start, end); adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end); BUG_ON(address >= end); flush_cache_range(vma, range.start, range.end); mmu_notifier_invalidate_range_start(&range); hugetlb_vma_lock_write(vma); i_mmap_lock_write(vma->vm_file->f_mapping); last_addr_mask = hugetlb_mask_last_page(h); for (; address < end; address += psize) { spinlock_t *ptl; ptep = hugetlb_walk(vma, address, psize); if (!ptep) { if (!uffd_wp) { address |= last_addr_mask; continue; } /* * Userfaultfd wr-protect requires pgtable * pre-allocations to install pte markers. */ ptep = huge_pte_alloc(mm, vma, address, psize); if (!ptep) { pages = -ENOMEM; break; } } ptl = huge_pte_lock(h, mm, ptep); if (huge_pmd_unshare(mm, vma, address, ptep)) { /* * When uffd-wp is enabled on the vma, unshare * shouldn't happen at all. Warn about it if it * happened due to some reason. */ WARN_ON_ONCE(uffd_wp || uffd_wp_resolve); pages++; spin_unlock(ptl); shared_pmd = true; address |= last_addr_mask; continue; } pte = huge_ptep_get(mm, address, ptep); if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) { /* Nothing to do. */ } else if (unlikely(is_hugetlb_entry_migration(pte))) { swp_entry_t entry = pte_to_swp_entry(pte); struct page *page = pfn_swap_entry_to_page(entry); pte_t newpte = pte; if (is_writable_migration_entry(entry)) { if (PageAnon(page)) entry = make_readable_exclusive_migration_entry( swp_offset(entry)); else entry = make_readable_migration_entry( swp_offset(entry)); newpte = swp_entry_to_pte(entry); pages++; } if (uffd_wp) newpte = pte_swp_mkuffd_wp(newpte); else if (uffd_wp_resolve) newpte = pte_swp_clear_uffd_wp(newpte); if (!pte_same(pte, newpte)) set_huge_pte_at(mm, address, ptep, newpte, psize); } else if (unlikely(is_pte_marker(pte))) { /* * Do nothing on a poison marker; page is * corrupted, permissons do not apply. Here * pte_marker_uffd_wp()==true implies !poison * because they're mutual exclusive. */ if (pte_marker_uffd_wp(pte) && uffd_wp_resolve) /* Safe to modify directly (non-present->none). */ huge_pte_clear(mm, address, ptep, psize); } else if (!huge_pte_none(pte)) { pte_t old_pte; unsigned int shift = huge_page_shift(hstate_vma(vma)); old_pte = huge_ptep_modify_prot_start(vma, address, ptep); pte = huge_pte_modify(old_pte, newprot); pte = arch_make_huge_pte(pte, shift, vma->vm_flags); if (uffd_wp) pte = huge_pte_mkuffd_wp(pte); else if (uffd_wp_resolve) pte = huge_pte_clear_uffd_wp(pte); huge_ptep_modify_prot_commit(vma, address, ptep, old_pte, pte); pages++; } else { /* None pte */ if (unlikely(uffd_wp)) /* Safe to modify directly (none->non-present). */ set_huge_pte_at(mm, address, ptep, make_pte_marker(PTE_MARKER_UFFD_WP), psize); } spin_unlock(ptl); } /* * Must flush TLB before releasing i_mmap_rwsem: x86's huge_pmd_unshare * may have cleared our pud entry and done put_page on the page table: * once we release i_mmap_rwsem, another task can do the final put_page * and that page table be reused and filled with junk. If we actually * did unshare a page of pmds, flush the range corresponding to the pud. */ if (shared_pmd) flush_hugetlb_tlb_range(vma, range.start, range.end); else flush_hugetlb_tlb_range(vma, start, end); /* * No need to call mmu_notifier_arch_invalidate_secondary_tlbs() we are * downgrading page table protection not changing it to point to a new * page. * * See Documentation/mm/mmu_notifier.rst */ i_mmap_unlock_write(vma->vm_file->f_mapping); hugetlb_vma_unlock_write(vma); mmu_notifier_invalidate_range_end(&range); return pages > 0 ? (pages << h->order) : pages; } /* Return true if reservation was successful, false otherwise. */ bool hugetlb_reserve_pages(struct inode *inode, long from, long to, struct vm_area_struct *vma, vm_flags_t vm_flags) { long chg = -1, add = -1; struct hstate *h = hstate_inode(inode); struct hugepage_subpool *spool = subpool_inode(inode); struct resv_map *resv_map; struct hugetlb_cgroup *h_cg = NULL; long gbl_reserve, regions_needed = 0; /* This should never happen */ if (from > to) { VM_WARN(1, "%s called with a negative range\n", __func__); return false; } /* * vma specific semaphore used for pmd sharing and fault/truncation * synchronization */ hugetlb_vma_lock_alloc(vma); /* * Only apply hugepage reservation if asked. At fault time, an * attempt will be made for VM_NORESERVE to allocate a page * without using reserves */ if (vm_flags & VM_NORESERVE) return true; /* * Shared mappings base their reservation on the number of pages that * are already allocated on behalf of the file. Private mappings need * to reserve the full area even if read-only as mprotect() may be * called to make the mapping read-write. Assume !vma is a shm mapping */ if (!vma || vma->vm_flags & VM_MAYSHARE) { /* * resv_map can not be NULL as hugetlb_reserve_pages is only * called for inodes for which resv_maps were created (see * hugetlbfs_get_inode). */ resv_map = inode_resv_map(inode); chg = region_chg(resv_map, from, to, &regions_needed); } else { /* Private mapping. */ resv_map = resv_map_alloc(); if (!resv_map) goto out_err; chg = to - from; set_vma_resv_map(vma, resv_map); set_vma_resv_flags(vma, HPAGE_RESV_OWNER); } if (chg < 0) goto out_err; if (hugetlb_cgroup_charge_cgroup_rsvd(hstate_index(h), chg * pages_per_huge_page(h), &h_cg) < 0) goto out_err; if (vma && !(vma->vm_flags & VM_MAYSHARE) && h_cg) { /* For private mappings, the hugetlb_cgroup uncharge info hangs * of the resv_map. */ resv_map_set_hugetlb_cgroup_uncharge_info(resv_map, h_cg, h); } /* * There must be enough pages in the subpool for the mapping. If * the subpool has a minimum size, there may be some global * reservations already in place (gbl_reserve). */ gbl_reserve = hugepage_subpool_get_pages(spool, chg); if (gbl_reserve < 0) goto out_uncharge_cgroup; /* * Check enough hugepages are available for the reservation. * Hand the pages back to the subpool if there are not */ if (hugetlb_acct_memory(h, gbl_reserve) < 0) goto out_put_pages; /* * Account for the reservations made. Shared mappings record regions * that have reservations as they are shared by multiple VMAs. * When the last VMA disappears, the region map says how much * the reservation was and the page cache tells how much of * the reservation was consumed. Private mappings are per-VMA and * only the consumed reservations are tracked. When the VMA * disappears, the original reservation is the VMA size and the * consumed reservations are stored in the map. Hence, nothing * else has to be done for private mappings here */ if (!vma || vma->vm_flags & VM_MAYSHARE) { add = region_add(resv_map, from, to, regions_needed, h, h_cg); if (unlikely(add < 0)) { hugetlb_acct_memory(h, -gbl_reserve); goto out_put_pages; } else if (unlikely(chg > add)) { /* * pages in this range were added to the reserve * map between region_chg and region_add. This * indicates a race with alloc_hugetlb_folio. Adjust * the subpool and reserve counts modified above * based on the difference. */ long rsv_adjust; /* * hugetlb_cgroup_uncharge_cgroup_rsvd() will put the * reference to h_cg->css. See comment below for detail. */ hugetlb_cgroup_uncharge_cgroup_rsvd( hstate_index(h), (chg - add) * pages_per_huge_page(h), h_cg); rsv_adjust = hugepage_subpool_put_pages(spool, chg - add); hugetlb_acct_memory(h, -rsv_adjust); } else if (h_cg) { /* * The file_regions will hold their own reference to * h_cg->css. So we should release the reference held * via hugetlb_cgroup_charge_cgroup_rsvd() when we are * done. */ hugetlb_cgroup_put_rsvd_cgroup(h_cg); } } return true; out_put_pages: /* put back original number of pages, chg */ (void)hugepage_subpool_put_pages(spool, chg); out_uncharge_cgroup: hugetlb_cgroup_uncharge_cgroup_rsvd(hstate_index(h), chg * pages_per_huge_page(h), h_cg); out_err: hugetlb_vma_lock_free(vma); if (!vma || vma->vm_flags & VM_MAYSHARE) /* Only call region_abort if the region_chg succeeded but the * region_add failed or didn't run. */ if (chg >= 0 && add < 0) region_abort(resv_map, from, to, regions_needed); if (vma && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { kref_put(&resv_map->refs, resv_map_release); set_vma_resv_map(vma, NULL); } return false; } long hugetlb_unreserve_pages(struct inode *inode, long start, long end, long freed) { struct hstate *h = hstate_inode(inode); struct resv_map *resv_map = inode_resv_map(inode); long chg = 0; struct hugepage_subpool *spool = subpool_inode(inode); long gbl_reserve; /* * Since this routine can be called in the evict inode path for all * hugetlbfs inodes, resv_map could be NULL. */ if (resv_map) { chg = region_del(resv_map, start, end); /* * region_del() can fail in the rare case where a region * must be split and another region descriptor can not be * allocated. If end == LONG_MAX, it will not fail. */ if (chg < 0) return chg; } spin_lock(&inode->i_lock); inode->i_blocks -= (blocks_per_huge_page(h) * freed); spin_unlock(&inode->i_lock); /* * If the subpool has a minimum size, the number of global * reservations to be released may be adjusted. * * Note that !resv_map implies freed == 0. So (chg - freed) * won't go negative. */ gbl_reserve = hugepage_subpool_put_pages(spool, (chg - freed)); hugetlb_acct_memory(h, -gbl_reserve); return 0; } #ifdef CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING static unsigned long page_table_shareable(struct vm_area_struct *svma, struct vm_area_struct *vma, unsigned long addr, pgoff_t idx) { unsigned long saddr = ((idx - svma->vm_pgoff) << PAGE_SHIFT) + svma->vm_start; unsigned long sbase = saddr & PUD_MASK; unsigned long s_end = sbase + PUD_SIZE; /* Allow segments to share if only one is marked locked */ unsigned long vm_flags = vma->vm_flags & ~VM_LOCKED_MASK; unsigned long svm_flags = svma->vm_flags & ~VM_LOCKED_MASK; /* * match the virtual addresses, permission and the alignment of the * page table page. * * Also, vma_lock (vm_private_data) is required for sharing. */ if (pmd_index(addr) != pmd_index(saddr) || vm_flags != svm_flags || !range_in_vma(svma, sbase, s_end) || !svma->vm_private_data) return 0; return saddr; } bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr) { unsigned long start = addr & PUD_MASK; unsigned long end = start + PUD_SIZE; #ifdef CONFIG_USERFAULTFD if (uffd_disable_huge_pmd_share(vma)) return false; #endif /* * check on proper vm_flags and page table alignment */ if (!(vma->vm_flags & VM_MAYSHARE)) return false; if (!vma->vm_private_data) /* vma lock required for sharing */ return false; if (!range_in_vma(vma, start, end)) return false; return true; } /* * Determine if start,end range within vma could be mapped by shared pmd. * If yes, adjust start and end to cover range associated with possible * shared pmd mappings. */ void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma, unsigned long *start, unsigned long *end) { unsigned long v_start = ALIGN(vma->vm_start, PUD_SIZE), v_end = ALIGN_DOWN(vma->vm_end, PUD_SIZE); /* * vma needs to span at least one aligned PUD size, and the range * must be at least partially within in. */ if (!(vma->vm_flags & VM_MAYSHARE) || !(v_end > v_start) || (*end <= v_start) || (*start >= v_end)) return; /* Extend the range to be PUD aligned for a worst case scenario */ if (*start > v_start) *start = ALIGN_DOWN(*start, PUD_SIZE); if (*end < v_end) *end = ALIGN(*end, PUD_SIZE); } /* * Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc() * and returns the corresponding pte. While this is not necessary for the * !shared pmd case because we can allocate the pmd later as well, it makes the * code much cleaner. pmd allocation is essential for the shared case because * pud has to be populated inside the same i_mmap_rwsem section - otherwise * racing tasks could either miss the sharing (see huge_pte_offset) or select a * bad pmd for sharing. */ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pud_t *pud) { struct address_space *mapping = vma->vm_file->f_mapping; pgoff_t idx = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; struct vm_area_struct *svma; unsigned long saddr; pte_t *spte = NULL; pte_t *pte; i_mmap_lock_read(mapping); vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) { if (svma == vma) continue; saddr = page_table_shareable(svma, vma, addr, idx); if (saddr) { spte = hugetlb_walk(svma, saddr, vma_mmu_pagesize(svma)); if (spte) { ptdesc_pmd_pts_inc(virt_to_ptdesc(spte)); break; } } } if (!spte) goto out; spin_lock(&mm->page_table_lock); if (pud_none(*pud)) { pud_populate(mm, pud, (pmd_t *)((unsigned long)spte & PAGE_MASK)); mm_inc_nr_pmds(mm); } else { ptdesc_pmd_pts_dec(virt_to_ptdesc(spte)); } spin_unlock(&mm->page_table_lock); out: pte = (pte_t *)pmd_alloc(mm, pud, addr); i_mmap_unlock_read(mapping); return pte; } /* * unmap huge page backed by shared pte. * * Called with page table lock held. * * returns: 1 successfully unmapped a shared pte page * 0 the underlying pte page is not shared, or it is the last user */ int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { unsigned long sz = huge_page_size(hstate_vma(vma)); pgd_t *pgd = pgd_offset(mm, addr); p4d_t *p4d = p4d_offset(pgd, addr); pud_t *pud = pud_offset(p4d, addr); i_mmap_assert_write_locked(vma->vm_file->f_mapping); hugetlb_vma_assert_locked(vma); if (sz != PMD_SIZE) return 0; if (!ptdesc_pmd_pts_count(virt_to_ptdesc(ptep))) return 0; pud_clear(pud); ptdesc_pmd_pts_dec(virt_to_ptdesc(ptep)); mm_dec_nr_pmds(mm); return 1; } #else /* !CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING */ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pud_t *pud) { return NULL; } int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { return 0; } void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma, unsigned long *start, unsigned long *end) { } bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr) { return false; } #endif /* CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING */ #ifdef CONFIG_ARCH_WANT_GENERAL_HUGETLB pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, unsigned long sz) { pgd_t *pgd; p4d_t *p4d; pud_t *pud; pte_t *pte = NULL; pgd = pgd_offset(mm, addr); p4d = p4d_alloc(mm, pgd, addr); if (!p4d) return NULL; pud = pud_alloc(mm, p4d, addr); if (pud) { if (sz == PUD_SIZE) { pte = (pte_t *)pud; } else { BUG_ON(sz != PMD_SIZE); if (want_pmd_share(vma, addr) && pud_none(*pud)) pte = huge_pmd_share(mm, vma, addr, pud); else pte = (pte_t *)pmd_alloc(mm, pud, addr); } } if (pte) { pte_t pteval = ptep_get_lockless(pte); BUG_ON(pte_present(pteval) && !pte_huge(pteval)); } return pte; } /* * huge_pte_offset() - Walk the page table to resolve the hugepage * entry at address @addr * * Return: Pointer to page table entry (PUD or PMD) for * address @addr, or NULL if a !p*d_present() entry is encountered and the * size @sz doesn't match the hugepage size at this level of the page * table. */ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr, unsigned long sz) { pgd_t *pgd; p4d_t *p4d; pud_t *pud; pmd_t *pmd; pgd = pgd_offset(mm, addr); if (!pgd_present(*pgd)) return NULL; p4d = p4d_offset(pgd, addr); if (!p4d_present(*p4d)) return NULL; pud = pud_offset(p4d, addr); if (sz == PUD_SIZE) /* must be pud huge, non-present or none */ return (pte_t *)pud; if (!pud_present(*pud)) return NULL; /* must have a valid entry and size to go further */ pmd = pmd_offset(pud, addr); /* must be pmd huge, non-present or none */ return (pte_t *)pmd; } /* * Return a mask that can be used to update an address to the last huge * page in a page table page mapping size. Used to skip non-present * page table entries when linearly scanning address ranges. Architectures * with unique huge page to page table relationships can define their own * version of this routine. */ unsigned long hugetlb_mask_last_page(struct hstate *h) { unsigned long hp_size = huge_page_size(h); if (hp_size == PUD_SIZE) return P4D_SIZE - PUD_SIZE; else if (hp_size == PMD_SIZE) return PUD_SIZE - PMD_SIZE; else return 0UL; } #else /* See description above. Architectures can provide their own version. */ __weak unsigned long hugetlb_mask_last_page(struct hstate *h) { #ifdef CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING if (huge_page_size(h) == PMD_SIZE) return PUD_SIZE - PMD_SIZE; #endif return 0UL; } #endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */ /** * folio_isolate_hugetlb - try to isolate an allocated hugetlb folio * @folio: the folio to isolate * @list: the list to add the folio to on success * * Isolate an allocated (refcount > 0) hugetlb folio, marking it as * isolated/non-migratable, and moving it from the active list to the * given list. * * Isolation will fail if @folio is not an allocated hugetlb folio, or if * it is already isolated/non-migratable. * * On success, an additional folio reference is taken that must be dropped * using folio_putback_hugetlb() to undo the isolation. * * Return: True if isolation worked, otherwise False. */ bool folio_isolate_hugetlb(struct folio *folio, struct list_head *list) { bool ret = true; spin_lock_irq(&hugetlb_lock); if (!folio_test_hugetlb(folio) || !folio_test_hugetlb_migratable(folio) || !folio_try_get(folio)) { ret = false; goto unlock; } folio_clear_hugetlb_migratable(folio); list_move_tail(&folio->lru, list); unlock: spin_unlock_irq(&hugetlb_lock); return ret; } int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb, bool unpoison) { int ret = 0; *hugetlb = false; spin_lock_irq(&hugetlb_lock); if (folio_test_hugetlb(folio)) { *hugetlb = true; if (folio_test_hugetlb_freed(folio)) ret = 0; else if (folio_test_hugetlb_migratable(folio) || unpoison) ret = folio_try_get(folio); else ret = -EBUSY; } spin_unlock_irq(&hugetlb_lock); return ret; } int get_huge_page_for_hwpoison(unsigned long pfn, int flags, bool *migratable_cleared) { int ret; spin_lock_irq(&hugetlb_lock); ret = __get_huge_page_for_hwpoison(pfn, flags, migratable_cleared); spin_unlock_irq(&hugetlb_lock); return ret; } /** * folio_putback_hugetlb - unisolate a hugetlb folio * @folio: the isolated hugetlb folio * * Putback/un-isolate the hugetlb folio that was previous isolated using * folio_isolate_hugetlb(): marking it non-isolated/migratable and putting it * back onto the active list. * * Will drop the additional folio reference obtained through * folio_isolate_hugetlb(). */ void folio_putback_hugetlb(struct folio *folio) { spin_lock_irq(&hugetlb_lock); folio_set_hugetlb_migratable(folio); list_move_tail(&folio->lru, &(folio_hstate(folio))->hugepage_activelist); spin_unlock_irq(&hugetlb_lock); folio_put(folio); } void move_hugetlb_state(struct folio *old_folio, struct folio *new_folio, int reason) { struct hstate *h = folio_hstate(old_folio); hugetlb_cgroup_migrate(old_folio, new_folio); set_page_owner_migrate_reason(&new_folio->page, reason); /* * transfer temporary state of the new hugetlb folio. This is * reverse to other transitions because the newpage is going to * be final while the old one will be freed so it takes over * the temporary status. * * Also note that we have to transfer the per-node surplus state * here as well otherwise the global surplus count will not match * the per-node's. */ if (folio_test_hugetlb_temporary(new_folio)) { int old_nid = folio_nid(old_folio); int new_nid = folio_nid(new_folio); folio_set_hugetlb_temporary(old_folio); folio_clear_hugetlb_temporary(new_folio); /* * There is no need to transfer the per-node surplus state * when we do not cross the node. */ if (new_nid == old_nid) return; spin_lock_irq(&hugetlb_lock); if (h->surplus_huge_pages_node[old_nid]) { h->surplus_huge_pages_node[old_nid]--; h->surplus_huge_pages_node[new_nid]++; } spin_unlock_irq(&hugetlb_lock); } /* * Our old folio is isolated and has "migratable" cleared until it * is putback. As migration succeeded, set the new folio "migratable" * and add it to the active list. */ spin_lock_irq(&hugetlb_lock); folio_set_hugetlb_migratable(new_folio); list_move_tail(&new_folio->lru, &(folio_hstate(new_folio))->hugepage_activelist); spin_unlock_irq(&hugetlb_lock); } static void hugetlb_unshare_pmds(struct vm_area_struct *vma, unsigned long start, unsigned long end) { struct hstate *h = hstate_vma(vma); unsigned long sz = huge_page_size(h); struct mm_struct *mm = vma->vm_mm; struct mmu_notifier_range range; unsigned long address; spinlock_t *ptl; pte_t *ptep; if (!(vma->vm_flags & VM_MAYSHARE)) return; if (start >= end) return; flush_cache_range(vma, start, end); /* * No need to call adjust_range_if_pmd_sharing_possible(), because * we have already done the PUD_SIZE alignment. */ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, start, end); mmu_notifier_invalidate_range_start(&range); hugetlb_vma_lock_write(vma); i_mmap_lock_write(vma->vm_file->f_mapping); for (address = start; address < end; address += PUD_SIZE) { ptep = hugetlb_walk(vma, address, sz); if (!ptep) continue; ptl = huge_pte_lock(h, mm, ptep); huge_pmd_unshare(mm, vma, address, ptep); spin_unlock(ptl); } flush_hugetlb_tlb_range(vma, start, end); i_mmap_unlock_write(vma->vm_file->f_mapping); hugetlb_vma_unlock_write(vma); /* * No need to call mmu_notifier_arch_invalidate_secondary_tlbs(), see * Documentation/mm/mmu_notifier.rst. */ mmu_notifier_invalidate_range_end(&range); } /* * This function will unconditionally remove all the shared pmd pgtable entries * within the specific vma for a hugetlbfs memory range. */ void hugetlb_unshare_all_pmds(struct vm_area_struct *vma) { hugetlb_unshare_pmds(vma, ALIGN(vma->vm_start, PUD_SIZE), ALIGN_DOWN(vma->vm_end, PUD_SIZE)); }
110 84 25 98 1 97 87 87 87 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 // SPDX-License-Identifier: GPL-2.0 #include <net/ip.h> #include <net/udp.h> #include <net/udplite.h> #include <asm/checksum.h> #ifndef _HAVE_ARCH_IPV6_CSUM __sum16 csum_ipv6_magic(const struct in6_addr *saddr, const struct in6_addr *daddr, __u32 len, __u8 proto, __wsum csum) { int carry; __u32 ulen; __u32 uproto; __u32 sum = (__force u32)csum; sum += (__force u32)saddr->s6_addr32[0]; carry = (sum < (__force u32)saddr->s6_addr32[0]); sum += carry; sum += (__force u32)saddr->s6_addr32[1]; carry = (sum < (__force u32)saddr->s6_addr32[1]); sum += carry; sum += (__force u32)saddr->s6_addr32[2]; carry = (sum < (__force u32)saddr->s6_addr32[2]); sum += carry; sum += (__force u32)saddr->s6_addr32[3]; carry = (sum < (__force u32)saddr->s6_addr32[3]); sum += carry; sum += (__force u32)daddr->s6_addr32[0]; carry = (sum < (__force u32)daddr->s6_addr32[0]); sum += carry; sum += (__force u32)daddr->s6_addr32[1]; carry = (sum < (__force u32)daddr->s6_addr32[1]); sum += carry; sum += (__force u32)daddr->s6_addr32[2]; carry = (sum < (__force u32)daddr->s6_addr32[2]); sum += carry; sum += (__force u32)daddr->s6_addr32[3]; carry = (sum < (__force u32)daddr->s6_addr32[3]); sum += carry; ulen = (__force u32)htonl((__u32) len); sum += ulen; carry = (sum < ulen); sum += carry; uproto = (__force u32)htonl(proto); sum += uproto; carry = (sum < uproto); sum += carry; return csum_fold((__force __wsum)sum); } EXPORT_SYMBOL(csum_ipv6_magic); #endif int udp6_csum_init(struct sk_buff *skb, struct udphdr *uh, int proto) { int err; UDP_SKB_CB(skb)->partial_cov = 0; UDP_SKB_CB(skb)->cscov = skb->len; if (proto == IPPROTO_UDPLITE) { err = udplite_checksum_init(skb, uh); if (err) return err; if (UDP_SKB_CB(skb)->partial_cov) { skb->csum = ip6_compute_pseudo(skb, proto); return 0; } } /* To support RFC 6936 (allow zero checksum in UDP/IPV6 for tunnels) * we accept a checksum of zero here. When we find the socket * for the UDP packet we'll check if that socket allows zero checksum * for IPv6 (set by socket option). * * Note, we are only interested in != 0 or == 0, thus the * force to int. */ err = (__force int)skb_checksum_init_zero_check(skb, proto, uh->check, ip6_compute_pseudo); if (err) return err; if (skb->ip_summed == CHECKSUM_COMPLETE && !skb->csum_valid) { /* If SW calculated the value, we know it's bad */ if (skb->csum_complete_sw) return 1; /* HW says the value is bad. Let's validate that. * skb->csum is no longer the full packet checksum, * so don't treat is as such. */ skb_checksum_complete_unset(skb); } return 0; } EXPORT_SYMBOL(udp6_csum_init); /* Function to set UDP checksum for an IPv6 UDP packet. This is intended * for the simple case like when setting the checksum for a UDP tunnel. */ void udp6_set_csum(bool nocheck, struct sk_buff *skb, const struct in6_addr *saddr, const struct in6_addr *daddr, int len) { struct udphdr *uh = udp_hdr(skb); if (nocheck) uh->check = 0; else if (skb_is_gso(skb)) uh->check = ~udp_v6_check(len, saddr, daddr, 0); else if (skb->ip_summed == CHECKSUM_PARTIAL) { uh->check = 0; uh->check = udp_v6_check(len, saddr, daddr, lco_csum(skb)); if (uh->check == 0) uh->check = CSUM_MANGLED_0; } else { skb->ip_summed = CHECKSUM_PARTIAL; skb->csum_start = skb_transport_header(skb) - skb->head; skb->csum_offset = offsetof(struct udphdr, check); uh->check = ~udp_v6_check(len, saddr, daddr, 0); } } EXPORT_SYMBOL(udp6_set_csum);
1674 1675 1673 4095 4099 1674 1675 12270 12270 12266 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 // SPDX-License-Identifier: GPL-2.0-or-later /* * linux/drivers/net/netconsole.c * * Copyright (C) 2001 Ingo Molnar <mingo@redhat.com> * * This file contains the implementation of an IRQ-safe, crash-safe * kernel console implementation that outputs kernel messages to the * network. * * Modification history: * * 2001-09-17 started by Ingo Molnar. * 2003-08-11 2.6 port by Matt Mackall * simplified options * generic card hooks * works non-modular * 2003-09-07 rewritten with netpoll api */ /**************************************************************** * ****************************************************************/ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/mm.h> #include <linux/init.h> #include <linux/module.h> #include <linux/slab.h> #include <linux/console.h> #include <linux/moduleparam.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/netpoll.h> #include <linux/inet.h> #include <linux/configfs.h> #include <linux/etherdevice.h> #include <linux/u64_stats_sync.h> #include <linux/utsname.h> #include <linux/rtnetlink.h> MODULE_AUTHOR("Matt Mackall <mpm@selenic.com>"); MODULE_DESCRIPTION("Console driver for network interfaces"); MODULE_LICENSE("GPL"); #define MAX_PARAM_LENGTH 256 #define MAX_EXTRADATA_ENTRY_LEN 256 #define MAX_EXTRADATA_VALUE_LEN 200 /* The number 3 comes from userdata entry format characters (' ', '=', '\n') */ #define MAX_EXTRADATA_NAME_LEN (MAX_EXTRADATA_ENTRY_LEN - \ MAX_EXTRADATA_VALUE_LEN - 3) #define MAX_EXTRADATA_ITEMS 16 #define MAX_PRINT_CHUNK 1000 static char config[MAX_PARAM_LENGTH]; module_param_string(netconsole, config, MAX_PARAM_LENGTH, 0); MODULE_PARM_DESC(netconsole, " netconsole=[src-port]@[src-ip]/[dev],[tgt-port]@<tgt-ip>/[tgt-macaddr]"); static bool oops_only; module_param(oops_only, bool, 0600); MODULE_PARM_DESC(oops_only, "Only log oops messages"); #define NETCONSOLE_PARAM_TARGET_PREFIX "cmdline" #ifndef MODULE static int __init option_setup(char *opt) { strscpy(config, opt, MAX_PARAM_LENGTH); return 1; } __setup("netconsole=", option_setup); #endif /* MODULE */ /* Linked list of all configured targets */ static LIST_HEAD(target_list); /* target_cleanup_list is used to track targets that need to be cleaned outside * of target_list_lock. It should be cleaned in the same function it is * populated. */ static LIST_HEAD(target_cleanup_list); /* This needs to be a spinlock because write_msg() cannot sleep */ static DEFINE_SPINLOCK(target_list_lock); /* This needs to be a mutex because netpoll_cleanup might sleep */ static DEFINE_MUTEX(target_cleanup_list_lock); /* * Console driver for extended netconsoles. Registered on the first use to * avoid unnecessarily enabling ext message formatting. */ static struct console netconsole_ext; struct netconsole_target_stats { u64_stats_t xmit_drop_count; u64_stats_t enomem_count; struct u64_stats_sync syncp; }; /* Features enabled in sysdata. Contrary to userdata, this data is populated by * the kernel. The fields are designed as bitwise flags, allowing multiple * features to be set in sysdata_fields. */ enum sysdata_feature { /* Populate the CPU that sends the message */ SYSDATA_CPU_NR = BIT(0), /* Populate the task name (as in current->comm) in sysdata */ SYSDATA_TASKNAME = BIT(1), /* Kernel release/version as part of sysdata */ SYSDATA_RELEASE = BIT(2), }; /** * struct netconsole_target - Represents a configured netconsole target. * @list: Links this target into the target_list. * @group: Links us into the configfs subsystem hierarchy. * @userdata_group: Links to the userdata configfs hierarchy * @extradata_complete: Cached, formatted string of append * @userdata_length: String length of usedata in extradata_complete. * @sysdata_fields: Sysdata features enabled. * @stats: Packet send stats for the target. Used for debugging. * @enabled: On / off knob to enable / disable target. * Visible from userspace (read-write). * We maintain a strict 1:1 correspondence between this and * whether the corresponding netpoll is active or inactive. * Also, other parameters of a target may be modified at * runtime only when it is disabled (enabled == 0). * @extended: Denotes whether console is extended or not. * @release: Denotes whether kernel release version should be prepended * to the message. Depends on extended console. * @np: The netpoll structure for this target. * Contains the other userspace visible parameters: * dev_name (read-write) * local_port (read-write) * remote_port (read-write) * local_ip (read-write) * remote_ip (read-write) * local_mac (read-only) * remote_mac (read-write) * @buf: The buffer used to send the full msg to the network stack */ struct netconsole_target { struct list_head list; #ifdef CONFIG_NETCONSOLE_DYNAMIC struct config_group group; struct config_group userdata_group; char extradata_complete[MAX_EXTRADATA_ENTRY_LEN * MAX_EXTRADATA_ITEMS]; size_t userdata_length; /* bit-wise with sysdata_feature bits */ u32 sysdata_fields; #endif struct netconsole_target_stats stats; bool enabled; bool extended; bool release; struct netpoll np; /* protected by target_list_lock */ char buf[MAX_PRINT_CHUNK]; }; #ifdef CONFIG_NETCONSOLE_DYNAMIC static struct configfs_subsystem netconsole_subsys; static DEFINE_MUTEX(dynamic_netconsole_mutex); static int __init dynamic_netconsole_init(void) { config_group_init(&netconsole_subsys.su_group); mutex_init(&netconsole_subsys.su_mutex); return configfs_register_subsystem(&netconsole_subsys); } static void __exit dynamic_netconsole_exit(void) { configfs_unregister_subsystem(&netconsole_subsys); } /* * Targets that were created by parsing the boot/module option string * do not exist in the configfs hierarchy (and have NULL names) and will * never go away, so make these a no-op for them. */ static void netconsole_target_get(struct netconsole_target *nt) { if (config_item_name(&nt->group.cg_item)) config_group_get(&nt->group); } static void netconsole_target_put(struct netconsole_target *nt) { if (config_item_name(&nt->group.cg_item)) config_group_put(&nt->group); } #else /* !CONFIG_NETCONSOLE_DYNAMIC */ static int __init dynamic_netconsole_init(void) { return 0; } static void __exit dynamic_netconsole_exit(void) { } /* * No danger of targets going away from under us when dynamic * reconfigurability is off. */ static void netconsole_target_get(struct netconsole_target *nt) { } static void netconsole_target_put(struct netconsole_target *nt) { } static void populate_configfs_item(struct netconsole_target *nt, int cmdline_count) { } #endif /* CONFIG_NETCONSOLE_DYNAMIC */ /* Allocate and initialize with defaults. * Note that these targets get their config_item fields zeroed-out. */ static struct netconsole_target *alloc_and_init(void) { struct netconsole_target *nt; nt = kzalloc(sizeof(*nt), GFP_KERNEL); if (!nt) return nt; if (IS_ENABLED(CONFIG_NETCONSOLE_EXTENDED_LOG)) nt->extended = true; if (IS_ENABLED(CONFIG_NETCONSOLE_PREPEND_RELEASE)) nt->release = true; nt->np.name = "netconsole"; strscpy(nt->np.dev_name, "eth0", IFNAMSIZ); nt->np.local_port = 6665; nt->np.remote_port = 6666; eth_broadcast_addr(nt->np.remote_mac); return nt; } /* Clean up every target in the cleanup_list and move the clean targets back to * the main target_list. */ static void netconsole_process_cleanups_core(void) { struct netconsole_target *nt, *tmp; unsigned long flags; /* The cleanup needs RTNL locked */ ASSERT_RTNL(); mutex_lock(&target_cleanup_list_lock); list_for_each_entry_safe(nt, tmp, &target_cleanup_list, list) { /* all entries in the cleanup_list needs to be disabled */ WARN_ON_ONCE(nt->enabled); do_netpoll_cleanup(&nt->np); /* moved the cleaned target to target_list. Need to hold both * locks */ spin_lock_irqsave(&target_list_lock, flags); list_move(&nt->list, &target_list); spin_unlock_irqrestore(&target_list_lock, flags); } WARN_ON_ONCE(!list_empty(&target_cleanup_list)); mutex_unlock(&target_cleanup_list_lock); } #ifdef CONFIG_NETCONSOLE_DYNAMIC /* * Our subsystem hierarchy is: * * /sys/kernel/config/netconsole/ * | * <target>/ * | enabled * | release * | dev_name * | local_port * | remote_port * | local_ip * | remote_ip * | local_mac * | remote_mac * | transmit_errors * | userdata/ * | <key>/ * | value * | ... * | * <target>/... */ static struct netconsole_target *to_target(struct config_item *item) { struct config_group *cfg_group; cfg_group = to_config_group(item); if (!cfg_group) return NULL; return container_of(to_config_group(item), struct netconsole_target, group); } /* Do the list cleanup with the rtnl lock hold. rtnl lock is necessary because * netdev might be cleaned-up by calling __netpoll_cleanup(), */ static void netconsole_process_cleanups(void) { /* rtnl lock is called here, because it has precedence over * target_cleanup_list_lock mutex and target_cleanup_list */ rtnl_lock(); netconsole_process_cleanups_core(); rtnl_unlock(); } /* Get rid of possible trailing newline, returning the new length */ static void trim_newline(char *s, size_t maxlen) { size_t len; len = strnlen(s, maxlen); if (s[len - 1] == '\n') s[len - 1] = '\0'; } /* * Attribute operations for netconsole_target. */ static ssize_t enabled_show(struct config_item *item, char *buf) { return sysfs_emit(buf, "%d\n", to_target(item)->enabled); } static ssize_t extended_show(struct config_item *item, char *buf) { return sysfs_emit(buf, "%d\n", to_target(item)->extended); } static ssize_t release_show(struct config_item *item, char *buf) { return sysfs_emit(buf, "%d\n", to_target(item)->release); } static ssize_t dev_name_show(struct config_item *item, char *buf) { return sysfs_emit(buf, "%s\n", to_target(item)->np.dev_name); } static ssize_t local_port_show(struct config_item *item, char *buf) { return sysfs_emit(buf, "%d\n", to_target(item)->np.local_port); } static ssize_t remote_port_show(struct config_item *item, char *buf) { return sysfs_emit(buf, "%d\n", to_target(item)->np.remote_port); } static ssize_t local_ip_show(struct config_item *item, char *buf) { struct netconsole_target *nt = to_target(item); if (nt->np.ipv6) return sysfs_emit(buf, "%pI6c\n", &nt->np.local_ip.in6); else return sysfs_emit(buf, "%pI4\n", &nt->np.local_ip); } static ssize_t remote_ip_show(struct config_item *item, char *buf) { struct netconsole_target *nt = to_target(item); if (nt->np.ipv6) return sysfs_emit(buf, "%pI6c\n", &nt->np.remote_ip.in6); else return sysfs_emit(buf, "%pI4\n", &nt->np.remote_ip); } static ssize_t local_mac_show(struct config_item *item, char *buf) { struct net_device *dev = to_target(item)->np.dev; static const u8 bcast[ETH_ALEN] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }; return sysfs_emit(buf, "%pM\n", dev ? dev->dev_addr : bcast); } static ssize_t remote_mac_show(struct config_item *item, char *buf) { return sysfs_emit(buf, "%pM\n", to_target(item)->np.remote_mac); } static ssize_t transmit_errors_show(struct config_item *item, char *buf) { struct netconsole_target *nt = to_target(item); u64 xmit_drop_count, enomem_count; unsigned int start; do { start = u64_stats_fetch_begin(&nt->stats.syncp); xmit_drop_count = u64_stats_read(&nt->stats.xmit_drop_count); enomem_count = u64_stats_read(&nt->stats.enomem_count); } while (u64_stats_fetch_retry(&nt->stats.syncp, start)); return sysfs_emit(buf, "%llu\n", xmit_drop_count + enomem_count); } /* configfs helper to display if cpu_nr sysdata feature is enabled */ static ssize_t sysdata_cpu_nr_enabled_show(struct config_item *item, char *buf) { struct netconsole_target *nt = to_target(item->ci_parent); bool cpu_nr_enabled; mutex_lock(&dynamic_netconsole_mutex); cpu_nr_enabled = !!(nt->sysdata_fields & SYSDATA_CPU_NR); mutex_unlock(&dynamic_netconsole_mutex); return sysfs_emit(buf, "%d\n", cpu_nr_enabled); } /* configfs helper to display if taskname sysdata feature is enabled */ static ssize_t sysdata_taskname_enabled_show(struct config_item *item, char *buf) { struct netconsole_target *nt = to_target(item->ci_parent); bool taskname_enabled; mutex_lock(&dynamic_netconsole_mutex); taskname_enabled = !!(nt->sysdata_fields & SYSDATA_TASKNAME); mutex_unlock(&dynamic_netconsole_mutex); return sysfs_emit(buf, "%d\n", taskname_enabled); } static ssize_t sysdata_release_enabled_show(struct config_item *item, char *buf) { struct netconsole_target *nt = to_target(item->ci_parent); bool release_enabled; mutex_lock(&dynamic_netconsole_mutex); release_enabled = !!(nt->sysdata_fields & SYSDATA_TASKNAME); mutex_unlock(&dynamic_netconsole_mutex); return sysfs_emit(buf, "%d\n", release_enabled); } /* * This one is special -- targets created through the configfs interface * are not enabled (and the corresponding netpoll activated) by default. * The user is expected to set the desired parameters first (which * would enable him to dynamically add new netpoll targets for new * network interfaces as and when they come up). */ static ssize_t enabled_store(struct config_item *item, const char *buf, size_t count) { struct netconsole_target *nt = to_target(item); unsigned long flags; bool enabled; ssize_t ret; mutex_lock(&dynamic_netconsole_mutex); ret = kstrtobool(buf, &enabled); if (ret) goto out_unlock; ret = -EINVAL; if (enabled == nt->enabled) { pr_info("network logging has already %s\n", nt->enabled ? "started" : "stopped"); goto out_unlock; } if (enabled) { /* true */ if (nt->release && !nt->extended) { pr_err("Not enabling netconsole. Release feature requires extended log message"); goto out_unlock; } if (nt->extended && !console_is_registered(&netconsole_ext)) register_console(&netconsole_ext); /* * Skip netpoll_parse_options() -- all the attributes are * already configured via configfs. Just print them out. */ netpoll_print_options(&nt->np); ret = netpoll_setup(&nt->np); if (ret) goto out_unlock; nt->enabled = true; pr_info("network logging started\n"); } else { /* false */ /* We need to disable the netconsole before cleaning it up * otherwise we might end up in write_msg() with * nt->np.dev == NULL and nt->enabled == true */ mutex_lock(&target_cleanup_list_lock); spin_lock_irqsave(&target_list_lock, flags); nt->enabled = false; /* Remove the target from the list, while holding * target_list_lock */ list_move(&nt->list, &target_cleanup_list); spin_unlock_irqrestore(&target_list_lock, flags); mutex_unlock(&target_cleanup_list_lock); } ret = strnlen(buf, count); /* Deferred cleanup */ netconsole_process_cleanups(); out_unlock: mutex_unlock(&dynamic_netconsole_mutex); return ret; } static ssize_t release_store(struct config_item *item, const char *buf, size_t count) { struct netconsole_target *nt = to_target(item); bool release; ssize_t ret; mutex_lock(&dynamic_netconsole_mutex); if (nt->enabled) { pr_err("target (%s) is enabled, disable to update parameters\n", config_item_name(&nt->group.cg_item)); ret = -EINVAL; goto out_unlock; } ret = kstrtobool(buf, &release); if (ret) goto out_unlock; nt->release = release; ret = strnlen(buf, count); out_unlock: mutex_unlock(&dynamic_netconsole_mutex); return ret; } static ssize_t extended_store(struct config_item *item, const char *buf, size_t count) { struct netconsole_target *nt = to_target(item); bool extended; ssize_t ret; mutex_lock(&dynamic_netconsole_mutex); if (nt->enabled) { pr_err("target (%s) is enabled, disable to update parameters\n", config_item_name(&nt->group.cg_item)); ret = -EINVAL; goto out_unlock; } ret = kstrtobool(buf, &extended); if (ret) goto out_unlock; nt->extended = extended; ret = strnlen(buf, count); out_unlock: mutex_unlock(&dynamic_netconsole_mutex); return ret; } static ssize_t dev_name_store(struct config_item *item, const char *buf, size_t count) { struct netconsole_target *nt = to_target(item); mutex_lock(&dynamic_netconsole_mutex); if (nt->enabled) { pr_err("target (%s) is enabled, disable to update parameters\n", config_item_name(&nt->group.cg_item)); mutex_unlock(&dynamic_netconsole_mutex); return -EINVAL; } strscpy(nt->np.dev_name, buf, IFNAMSIZ); trim_newline(nt->np.dev_name, IFNAMSIZ); mutex_unlock(&dynamic_netconsole_mutex); return strnlen(buf, count); } static ssize_t local_port_store(struct config_item *item, const char *buf, size_t count) { struct netconsole_target *nt = to_target(item); ssize_t ret = -EINVAL; mutex_lock(&dynamic_netconsole_mutex); if (nt->enabled) { pr_err("target (%s) is enabled, disable to update parameters\n", config_item_name(&nt->group.cg_item)); goto out_unlock; } ret = kstrtou16(buf, 10, &nt->np.local_port); if (ret < 0) goto out_unlock; ret = strnlen(buf, count); out_unlock: mutex_unlock(&dynamic_netconsole_mutex); return ret; } static ssize_t remote_port_store(struct config_item *item, const char *buf, size_t count) { struct netconsole_target *nt = to_target(item); ssize_t ret = -EINVAL; mutex_lock(&dynamic_netconsole_mutex); if (nt->enabled) { pr_err("target (%s) is enabled, disable to update parameters\n", config_item_name(&nt->group.cg_item)); goto out_unlock; } ret = kstrtou16(buf, 10, &nt->np.remote_port); if (ret < 0) goto out_unlock; ret = strnlen(buf, count); out_unlock: mutex_unlock(&dynamic_netconsole_mutex); return ret; } static ssize_t local_ip_store(struct config_item *item, const char *buf, size_t count) { struct netconsole_target *nt = to_target(item); ssize_t ret = -EINVAL; mutex_lock(&dynamic_netconsole_mutex); if (nt->enabled) { pr_err("target (%s) is enabled, disable to update parameters\n", config_item_name(&nt->group.cg_item)); goto out_unlock; } if (strnchr(buf, count, ':')) { const char *end; if (in6_pton(buf, count, nt->np.local_ip.in6.s6_addr, -1, &end) > 0) { if (*end && *end != '\n') { pr_err("invalid IPv6 address at: <%c>\n", *end); goto out_unlock; } nt->np.ipv6 = true; } else goto out_unlock; } else { if (!nt->np.ipv6) nt->np.local_ip.ip = in_aton(buf); else goto out_unlock; } ret = strnlen(buf, count); out_unlock: mutex_unlock(&dynamic_netconsole_mutex); return ret; } static ssize_t remote_ip_store(struct config_item *item, const char *buf, size_t count) { struct netconsole_target *nt = to_target(item); ssize_t ret = -EINVAL; mutex_lock(&dynamic_netconsole_mutex); if (nt->enabled) { pr_err("target (%s) is enabled, disable to update parameters\n", config_item_name(&nt->group.cg_item)); goto out_unlock; } if (strnchr(buf, count, ':')) { const char *end; if (in6_pton(buf, count, nt->np.remote_ip.in6.s6_addr, -1, &end) > 0) { if (*end && *end != '\n') { pr_err("invalid IPv6 address at: <%c>\n", *end); goto out_unlock; } nt->np.ipv6 = true; } else goto out_unlock; } else { if (!nt->np.ipv6) nt->np.remote_ip.ip = in_aton(buf); else goto out_unlock; } ret = strnlen(buf, count); out_unlock: mutex_unlock(&dynamic_netconsole_mutex); return ret; } /* Count number of entries we have in extradata. * This is important because the extradata_complete only supports * MAX_EXTRADATA_ITEMS entries. Before enabling any new {user,sys}data * feature, number of entries needs to checked for available space. */ static size_t count_extradata_entries(struct netconsole_target *nt) { size_t entries; /* Userdata entries */ entries = list_count_nodes(&nt->userdata_group.cg_children); /* Plus sysdata entries */ if (nt->sysdata_fields & SYSDATA_CPU_NR) entries += 1; if (nt->sysdata_fields & SYSDATA_TASKNAME) entries += 1; if (nt->sysdata_fields & SYSDATA_RELEASE) entries += 1; return entries; } static ssize_t remote_mac_store(struct config_item *item, const char *buf, size_t count) { struct netconsole_target *nt = to_target(item); u8 remote_mac[ETH_ALEN]; ssize_t ret = -EINVAL; mutex_lock(&dynamic_netconsole_mutex); if (nt->enabled) { pr_err("target (%s) is enabled, disable to update parameters\n", config_item_name(&nt->group.cg_item)); goto out_unlock; } if (!mac_pton(buf, remote_mac)) goto out_unlock; if (buf[MAC_ADDR_STR_LEN] && buf[MAC_ADDR_STR_LEN] != '\n') goto out_unlock; memcpy(nt->np.remote_mac, remote_mac, ETH_ALEN); ret = strnlen(buf, count); out_unlock: mutex_unlock(&dynamic_netconsole_mutex); return ret; } struct userdatum { struct config_item item; char value[MAX_EXTRADATA_VALUE_LEN]; }; static struct userdatum *to_userdatum(struct config_item *item) { return container_of(item, struct userdatum, item); } struct userdata { struct config_group group; }; static struct userdata *to_userdata(struct config_item *item) { return container_of(to_config_group(item), struct userdata, group); } static struct netconsole_target *userdata_to_target(struct userdata *ud) { struct config_group *netconsole_group; netconsole_group = to_config_group(ud->group.cg_item.ci_parent); return to_target(&netconsole_group->cg_item); } static ssize_t userdatum_value_show(struct config_item *item, char *buf) { return sysfs_emit(buf, "%s\n", &(to_userdatum(item)->value[0])); } static void update_userdata(struct netconsole_target *nt) { int complete_idx = 0, child_count = 0; struct list_head *entry; /* Clear the current string in case the last userdatum was deleted */ nt->userdata_length = 0; nt->extradata_complete[0] = 0; list_for_each(entry, &nt->userdata_group.cg_children) { struct userdatum *udm_item; struct config_item *item; if (WARN_ON_ONCE(child_count >= MAX_EXTRADATA_ITEMS)) break; child_count++; item = container_of(entry, struct config_item, ci_entry); udm_item = to_userdatum(item); /* Skip userdata with no value set */ if (strnlen(udm_item->value, MAX_EXTRADATA_VALUE_LEN) == 0) continue; /* This doesn't overflow extradata_complete since it will write * one entry length (1/MAX_EXTRADATA_ITEMS long), entry count is * checked to not exceed MAX items with child_count above */ complete_idx += scnprintf(&nt->extradata_complete[complete_idx], MAX_EXTRADATA_ENTRY_LEN, " %s=%s\n", item->ci_name, udm_item->value); } nt->userdata_length = strnlen(nt->extradata_complete, sizeof(nt->extradata_complete)); } static ssize_t userdatum_value_store(struct config_item *item, const char *buf, size_t count) { struct userdatum *udm = to_userdatum(item); struct netconsole_target *nt; struct userdata *ud; ssize_t ret; if (count > MAX_EXTRADATA_VALUE_LEN) return -EMSGSIZE; mutex_lock(&dynamic_netconsole_mutex); ret = strscpy(udm->value, buf, sizeof(udm->value)); if (ret < 0) goto out_unlock; trim_newline(udm->value, sizeof(udm->value)); ud = to_userdata(item->ci_parent); nt = userdata_to_target(ud); update_userdata(nt); ret = count; out_unlock: mutex_unlock(&dynamic_netconsole_mutex); return ret; } /* disable_sysdata_feature - Disable sysdata feature and clean sysdata * @nt: target that is disabling the feature * @feature: feature being disabled */ static void disable_sysdata_feature(struct netconsole_target *nt, enum sysdata_feature feature) { nt->sysdata_fields &= ~feature; nt->extradata_complete[nt->userdata_length] = 0; } static ssize_t sysdata_release_enabled_store(struct config_item *item, const char *buf, size_t count) { struct netconsole_target *nt = to_target(item->ci_parent); bool release_enabled, curr; ssize_t ret; ret = kstrtobool(buf, &release_enabled); if (ret) return ret; mutex_lock(&dynamic_netconsole_mutex); curr = !!(nt->sysdata_fields & SYSDATA_RELEASE); if (release_enabled == curr) goto unlock_ok; if (release_enabled && count_extradata_entries(nt) >= MAX_EXTRADATA_ITEMS) { ret = -ENOSPC; goto unlock; } if (release_enabled) nt->sysdata_fields |= SYSDATA_RELEASE; else disable_sysdata_feature(nt, SYSDATA_RELEASE); unlock_ok: ret = strnlen(buf, count); unlock: mutex_unlock(&dynamic_netconsole_mutex); return ret; } static ssize_t sysdata_taskname_enabled_store(struct config_item *item, const char *buf, size_t count) { struct netconsole_target *nt = to_target(item->ci_parent); bool taskname_enabled, curr; ssize_t ret; ret = kstrtobool(buf, &taskname_enabled); if (ret) return ret; mutex_lock(&dynamic_netconsole_mutex); curr = !!(nt->sysdata_fields & SYSDATA_TASKNAME); if (taskname_enabled == curr) goto unlock_ok; if (taskname_enabled && count_extradata_entries(nt) >= MAX_EXTRADATA_ITEMS) { ret = -ENOSPC; goto unlock; } if (taskname_enabled) nt->sysdata_fields |= SYSDATA_TASKNAME; else disable_sysdata_feature(nt, SYSDATA_TASKNAME); unlock_ok: ret = strnlen(buf, count); unlock: mutex_unlock(&dynamic_netconsole_mutex); return ret; } /* configfs helper to sysdata cpu_nr feature */ static ssize_t sysdata_cpu_nr_enabled_store(struct config_item *item, const char *buf, size_t count) { struct netconsole_target *nt = to_target(item->ci_parent); bool cpu_nr_enabled, curr; ssize_t ret; ret = kstrtobool(buf, &cpu_nr_enabled); if (ret) return ret; mutex_lock(&dynamic_netconsole_mutex); curr = !!(nt->sysdata_fields & SYSDATA_CPU_NR); if (cpu_nr_enabled == curr) /* no change requested */ goto unlock_ok; if (cpu_nr_enabled && count_extradata_entries(nt) >= MAX_EXTRADATA_ITEMS) { /* user wants the new feature, but there is no space in the * buffer. */ ret = -ENOSPC; goto unlock; } if (cpu_nr_enabled) nt->sysdata_fields |= SYSDATA_CPU_NR; else /* This is special because extradata_complete might have * remaining data from previous sysdata, and it needs to be * cleaned. */ disable_sysdata_feature(nt, SYSDATA_CPU_NR); unlock_ok: ret = strnlen(buf, count); unlock: mutex_unlock(&dynamic_netconsole_mutex); return ret; } CONFIGFS_ATTR(userdatum_, value); CONFIGFS_ATTR(sysdata_, cpu_nr_enabled); CONFIGFS_ATTR(sysdata_, taskname_enabled); CONFIGFS_ATTR(sysdata_, release_enabled); static struct configfs_attribute *userdatum_attrs[] = { &userdatum_attr_value, NULL, }; static void userdatum_release(struct config_item *item) { kfree(to_userdatum(item)); } static struct configfs_item_operations userdatum_ops = { .release = userdatum_release, }; static const struct config_item_type userdatum_type = { .ct_item_ops = &userdatum_ops, .ct_attrs = userdatum_attrs, .ct_owner = THIS_MODULE, }; static struct config_item *userdatum_make_item(struct config_group *group, const char *name) { struct netconsole_target *nt; struct userdatum *udm; struct userdata *ud; if (strlen(name) > MAX_EXTRADATA_NAME_LEN) return ERR_PTR(-ENAMETOOLONG); ud = to_userdata(&group->cg_item); nt = userdata_to_target(ud); if (count_extradata_entries(nt) >= MAX_EXTRADATA_ITEMS) return ERR_PTR(-ENOSPC); udm = kzalloc(sizeof(*udm), GFP_KERNEL); if (!udm) return ERR_PTR(-ENOMEM); config_item_init_type_name(&udm->item, name, &userdatum_type); return &udm->item; } static void userdatum_drop(struct config_group *group, struct config_item *item) { struct netconsole_target *nt; struct userdata *ud; ud = to_userdata(&group->cg_item); nt = userdata_to_target(ud); mutex_lock(&dynamic_netconsole_mutex); update_userdata(nt); config_item_put(item); mutex_unlock(&dynamic_netconsole_mutex); } static struct configfs_attribute *userdata_attrs[] = { &sysdata_attr_cpu_nr_enabled, &sysdata_attr_taskname_enabled, &sysdata_attr_release_enabled, NULL, }; static struct configfs_group_operations userdata_ops = { .make_item = userdatum_make_item, .drop_item = userdatum_drop, }; static const struct config_item_type userdata_type = { .ct_item_ops = &userdatum_ops, .ct_group_ops = &userdata_ops, .ct_attrs = userdata_attrs, .ct_owner = THIS_MODULE, }; CONFIGFS_ATTR(, enabled); CONFIGFS_ATTR(, extended); CONFIGFS_ATTR(, dev_name); CONFIGFS_ATTR(, local_port); CONFIGFS_ATTR(, remote_port); CONFIGFS_ATTR(, local_ip); CONFIGFS_ATTR(, remote_ip); CONFIGFS_ATTR_RO(, local_mac); CONFIGFS_ATTR(, remote_mac); CONFIGFS_ATTR(, release); CONFIGFS_ATTR_RO(, transmit_errors); static struct configfs_attribute *netconsole_target_attrs[] = { &attr_enabled, &attr_extended, &attr_release, &attr_dev_name, &attr_local_port, &attr_remote_port, &attr_local_ip, &attr_remote_ip, &attr_local_mac, &attr_remote_mac, &attr_transmit_errors, NULL, }; /* * Item operations and type for netconsole_target. */ static void netconsole_target_release(struct config_item *item) { kfree(to_target(item)); } static struct configfs_item_operations netconsole_target_item_ops = { .release = netconsole_target_release, }; static const struct config_item_type netconsole_target_type = { .ct_attrs = netconsole_target_attrs, .ct_item_ops = &netconsole_target_item_ops, .ct_owner = THIS_MODULE, }; static void init_target_config_group(struct netconsole_target *nt, const char *name) { config_group_init_type_name(&nt->group, name, &netconsole_target_type); config_group_init_type_name(&nt->userdata_group, "userdata", &userdata_type); configfs_add_default_group(&nt->userdata_group, &nt->group); } static struct netconsole_target *find_cmdline_target(const char *name) { struct netconsole_target *nt, *ret = NULL; unsigned long flags; spin_lock_irqsave(&target_list_lock, flags); list_for_each_entry(nt, &target_list, list) { if (!strcmp(nt->group.cg_item.ci_name, name)) { ret = nt; break; } } spin_unlock_irqrestore(&target_list_lock, flags); return ret; } /* * Group operations and type for netconsole_subsys. */ static struct config_group *make_netconsole_target(struct config_group *group, const char *name) { struct netconsole_target *nt; unsigned long flags; /* Checking if a target by this name was created at boot time. If so, * attach a configfs entry to that target. This enables dynamic * control. */ if (!strncmp(name, NETCONSOLE_PARAM_TARGET_PREFIX, strlen(NETCONSOLE_PARAM_TARGET_PREFIX))) { nt = find_cmdline_target(name); if (nt) { init_target_config_group(nt, name); return &nt->group; } } nt = alloc_and_init(); if (!nt) return ERR_PTR(-ENOMEM); /* Initialize the config_group member */ init_target_config_group(nt, name); /* Adding, but it is disabled */ spin_lock_irqsave(&target_list_lock, flags); list_add(&nt->list, &target_list); spin_unlock_irqrestore(&target_list_lock, flags); return &nt->group; } static void drop_netconsole_target(struct config_group *group, struct config_item *item) { unsigned long flags; struct netconsole_target *nt = to_target(item); spin_lock_irqsave(&target_list_lock, flags); list_del(&nt->list); spin_unlock_irqrestore(&target_list_lock, flags); /* * The target may have never been enabled, or was manually disabled * before being removed so netpoll may have already been cleaned up. */ if (nt->enabled) netpoll_cleanup(&nt->np); config_item_put(&nt->group.cg_item); } static struct configfs_group_operations netconsole_subsys_group_ops = { .make_group = make_netconsole_target, .drop_item = drop_netconsole_target, }; static const struct config_item_type netconsole_subsys_type = { .ct_group_ops = &netconsole_subsys_group_ops, .ct_owner = THIS_MODULE, }; /* The netconsole configfs subsystem */ static struct configfs_subsystem netconsole_subsys = { .su_group = { .cg_item = { .ci_namebuf = "netconsole", .ci_type = &netconsole_subsys_type, }, }, }; static void populate_configfs_item(struct netconsole_target *nt, int cmdline_count) { char target_name[16]; snprintf(target_name, sizeof(target_name), "%s%d", NETCONSOLE_PARAM_TARGET_PREFIX, cmdline_count); init_target_config_group(nt, target_name); } static int sysdata_append_cpu_nr(struct netconsole_target *nt, int offset) { /* Append cpu=%d at extradata_complete after userdata str */ return scnprintf(&nt->extradata_complete[offset], MAX_EXTRADATA_ENTRY_LEN, " cpu=%u\n", raw_smp_processor_id()); } static int sysdata_append_taskname(struct netconsole_target *nt, int offset) { return scnprintf(&nt->extradata_complete[offset], MAX_EXTRADATA_ENTRY_LEN, " taskname=%s\n", current->comm); } static int sysdata_append_release(struct netconsole_target *nt, int offset) { return scnprintf(&nt->extradata_complete[offset], MAX_EXTRADATA_ENTRY_LEN, " release=%s\n", init_utsname()->release); } /* * prepare_extradata - append sysdata at extradata_complete in runtime * @nt: target to send message to */ static int prepare_extradata(struct netconsole_target *nt) { u32 fields = SYSDATA_CPU_NR | SYSDATA_TASKNAME; int extradata_len; /* userdata was appended when configfs write helper was called * by update_userdata(). */ extradata_len = nt->userdata_length; if (!(nt->sysdata_fields & fields)) goto out; if (nt->sysdata_fields & SYSDATA_CPU_NR) extradata_len += sysdata_append_cpu_nr(nt, extradata_len); if (nt->sysdata_fields & SYSDATA_TASKNAME) extradata_len += sysdata_append_taskname(nt, extradata_len); if (nt->sysdata_fields & SYSDATA_RELEASE) extradata_len += sysdata_append_release(nt, extradata_len); WARN_ON_ONCE(extradata_len > MAX_EXTRADATA_ENTRY_LEN * MAX_EXTRADATA_ITEMS); out: return extradata_len; } #else /* CONFIG_NETCONSOLE_DYNAMIC not set */ static int prepare_extradata(struct netconsole_target *nt) { return 0; } #endif /* CONFIG_NETCONSOLE_DYNAMIC */ /* Handle network interface device notifications */ static int netconsole_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { unsigned long flags; struct netconsole_target *nt, *tmp; struct net_device *dev = netdev_notifier_info_to_dev(ptr); bool stopped = false; if (!(event == NETDEV_CHANGENAME || event == NETDEV_UNREGISTER || event == NETDEV_RELEASE || event == NETDEV_JOIN)) goto done; mutex_lock(&target_cleanup_list_lock); spin_lock_irqsave(&target_list_lock, flags); list_for_each_entry_safe(nt, tmp, &target_list, list) { netconsole_target_get(nt); if (nt->np.dev == dev) { switch (event) { case NETDEV_CHANGENAME: strscpy(nt->np.dev_name, dev->name, IFNAMSIZ); break; case NETDEV_RELEASE: case NETDEV_JOIN: case NETDEV_UNREGISTER: nt->enabled = false; list_move(&nt->list, &target_cleanup_list); stopped = true; } } netconsole_target_put(nt); } spin_unlock_irqrestore(&target_list_lock, flags); mutex_unlock(&target_cleanup_list_lock); if (stopped) { const char *msg = "had an event"; switch (event) { case NETDEV_UNREGISTER: msg = "unregistered"; break; case NETDEV_RELEASE: msg = "released slaves"; break; case NETDEV_JOIN: msg = "is joining a master device"; break; } pr_info("network logging stopped on interface %s as it %s\n", dev->name, msg); } /* Process target_cleanup_list entries. By the end, target_cleanup_list * should be empty */ netconsole_process_cleanups_core(); done: return NOTIFY_DONE; } static struct notifier_block netconsole_netdev_notifier = { .notifier_call = netconsole_netdev_event, }; /** * send_udp - Wrapper for netpoll_send_udp that counts errors * @nt: target to send message to * @msg: message to send * @len: length of message * * Calls netpoll_send_udp and classifies the return value. If an error * occurred it increments statistics in nt->stats accordingly. * Only calls netpoll_send_udp if CONFIG_NETCONSOLE_DYNAMIC is disabled. */ static void send_udp(struct netconsole_target *nt, const char *msg, int len) { int result = netpoll_send_udp(&nt->np, msg, len); if (IS_ENABLED(CONFIG_NETCONSOLE_DYNAMIC)) { if (result == NET_XMIT_DROP) { u64_stats_update_begin(&nt->stats.syncp); u64_stats_inc(&nt->stats.xmit_drop_count); u64_stats_update_end(&nt->stats.syncp); } else if (result == -ENOMEM) { u64_stats_update_begin(&nt->stats.syncp); u64_stats_inc(&nt->stats.enomem_count); u64_stats_update_end(&nt->stats.syncp); } } } static void send_msg_no_fragmentation(struct netconsole_target *nt, const char *msg, int msg_len, int release_len) { const char *extradata = NULL; const char *release; #ifdef CONFIG_NETCONSOLE_DYNAMIC extradata = nt->extradata_complete; #endif if (release_len) { release = init_utsname()->release; scnprintf(nt->buf, MAX_PRINT_CHUNK, "%s,%s", release, msg); msg_len += release_len; } else { memcpy(nt->buf, msg, msg_len); } if (extradata) msg_len += scnprintf(&nt->buf[msg_len], MAX_PRINT_CHUNK - msg_len, "%s", extradata); send_udp(nt, nt->buf, msg_len); } static void append_release(char *buf) { const char *release; release = init_utsname()->release; scnprintf(buf, MAX_PRINT_CHUNK, "%s,", release); } static void send_fragmented_body(struct netconsole_target *nt, const char *msgbody, int header_len, int msgbody_len, int extradata_len) { int sent_extradata, preceding_bytes; const char *extradata = NULL; int body_len, offset = 0; #ifdef CONFIG_NETCONSOLE_DYNAMIC extradata = nt->extradata_complete; #endif /* body_len represents the number of bytes that will be sent. This is * bigger than MAX_PRINT_CHUNK, thus, it will be split in multiple * packets */ body_len = msgbody_len + extradata_len; /* In each iteration of the while loop below, we send a packet * containing the header and a portion of the body. The body is * composed of two parts: msgbody and extradata. We keep track of how * many bytes have been sent so far using the offset variable, which * ranges from 0 to the total length of the body. */ while (offset < body_len) { int this_header = header_len; bool msgbody_written = false; int this_offset = 0; int this_chunk = 0; this_header += scnprintf(nt->buf + this_header, MAX_PRINT_CHUNK - this_header, ",ncfrag=%d/%d;", offset, body_len); /* Not all msgbody data has been written yet */ if (offset < msgbody_len) { this_chunk = min(msgbody_len - offset, MAX_PRINT_CHUNK - this_header); if (WARN_ON_ONCE(this_chunk <= 0)) return; memcpy(nt->buf + this_header, msgbody + offset, this_chunk); this_offset += this_chunk; } /* msgbody was finally written, either in the previous * messages and/or in the current buf. Time to write * the extradata. */ msgbody_written |= offset + this_offset >= msgbody_len; /* Msg body is fully written and there is pending extradata to * write, append extradata in this chunk */ if (msgbody_written && offset + this_offset < body_len) { /* Track how much user data was already sent. First * time here, sent_userdata is zero */ sent_extradata = (offset + this_offset) - msgbody_len; /* offset of bytes used in current buf */ preceding_bytes = this_chunk + this_header; if (WARN_ON_ONCE(sent_extradata < 0)) return; this_chunk = min(extradata_len - sent_extradata, MAX_PRINT_CHUNK - preceding_bytes); if (WARN_ON_ONCE(this_chunk < 0)) /* this_chunk could be zero if all the previous * message used all the buffer. This is not a * problem, extradata will be sent in the next * iteration */ return; memcpy(nt->buf + this_header + this_offset, extradata + sent_extradata, this_chunk); this_offset += this_chunk; } send_udp(nt, nt->buf, this_header + this_offset); offset += this_offset; } } static void send_msg_fragmented(struct netconsole_target *nt, const char *msg, int msg_len, int release_len, int extradata_len) { int header_len, msgbody_len; const char *msgbody; /* need to insert extra header fields, detect header and msgbody */ msgbody = memchr(msg, ';', msg_len); if (WARN_ON_ONCE(!msgbody)) return; header_len = msgbody - msg; msgbody_len = msg_len - header_len - 1; msgbody++; /* * Transfer multiple chunks with the following extra header. * "ncfrag=<byte-offset>/<total-bytes>" */ if (release_len) append_release(nt->buf); /* Copy the header into the buffer */ memcpy(nt->buf + release_len, msg, header_len); header_len += release_len; /* for now on, the header will be persisted, and the msgbody * will be replaced */ send_fragmented_body(nt, msgbody, header_len, msgbody_len, extradata_len); } /** * send_ext_msg_udp - send extended log message to target * @nt: target to send message to * @msg: extended log message to send * @msg_len: length of message * * Transfer extended log @msg to @nt. If @msg is longer than * MAX_PRINT_CHUNK, it'll be split and transmitted in multiple chunks with * ncfrag header field added to identify them. */ static void send_ext_msg_udp(struct netconsole_target *nt, const char *msg, int msg_len) { int release_len = 0; int extradata_len; extradata_len = prepare_extradata(nt); if (nt->release) release_len = strlen(init_utsname()->release) + 1; if (msg_len + release_len + extradata_len <= MAX_PRINT_CHUNK) return send_msg_no_fragmentation(nt, msg, msg_len, release_len); return send_msg_fragmented(nt, msg, msg_len, release_len, extradata_len); } static void write_ext_msg(struct console *con, const char *msg, unsigned int len) { struct netconsole_target *nt; unsigned long flags; if ((oops_only && !oops_in_progress) || list_empty(&target_list)) return; spin_lock_irqsave(&target_list_lock, flags); list_for_each_entry(nt, &target_list, list) if (nt->extended && nt->enabled && netif_running(nt->np.dev)) send_ext_msg_udp(nt, msg, len); spin_unlock_irqrestore(&target_list_lock, flags); } static void write_msg(struct console *con, const char *msg, unsigned int len) { int frag, left; unsigned long flags; struct netconsole_target *nt; const char *tmp; if (oops_only && !oops_in_progress) return; /* Avoid taking lock and disabling interrupts unnecessarily */ if (list_empty(&target_list)) return; spin_lock_irqsave(&target_list_lock, flags); list_for_each_entry(nt, &target_list, list) { if (!nt->extended && nt->enabled && netif_running(nt->np.dev)) { /* * We nest this inside the for-each-target loop above * so that we're able to get as much logging out to * at least one target if we die inside here, instead * of unnecessarily keeping all targets in lock-step. */ tmp = msg; for (left = len; left;) { frag = min(left, MAX_PRINT_CHUNK); send_udp(nt, tmp, frag); tmp += frag; left -= frag; } } } spin_unlock_irqrestore(&target_list_lock, flags); } /* Allocate new target (from boot/module param) and setup netpoll for it */ static struct netconsole_target *alloc_param_target(char *target_config, int cmdline_count) { struct netconsole_target *nt; int err; nt = alloc_and_init(); if (!nt) { err = -ENOMEM; goto fail; } if (*target_config == '+') { nt->extended = true; target_config++; } if (*target_config == 'r') { if (!nt->extended) { pr_err("Netconsole configuration error. Release feature requires extended log message"); err = -EINVAL; goto fail; } nt->release = true; target_config++; } /* Parse parameters and setup netpoll */ err = netpoll_parse_options(&nt->np, target_config); if (err) goto fail; err = netpoll_setup(&nt->np); if (err) { pr_err("Not enabling netconsole for %s%d. Netpoll setup failed\n", NETCONSOLE_PARAM_TARGET_PREFIX, cmdline_count); if (!IS_ENABLED(CONFIG_NETCONSOLE_DYNAMIC)) /* only fail if dynamic reconfiguration is set, * otherwise, keep the target in the list, but disabled. */ goto fail; } else { nt->enabled = true; } populate_configfs_item(nt, cmdline_count); return nt; fail: kfree(nt); return ERR_PTR(err); } /* Cleanup netpoll for given target (from boot/module param) and free it */ static void free_param_target(struct netconsole_target *nt) { netpoll_cleanup(&nt->np); kfree(nt); } static struct console netconsole_ext = { .name = "netcon_ext", .flags = CON_ENABLED | CON_EXTENDED, .write = write_ext_msg, }; static struct console netconsole = { .name = "netcon", .flags = CON_ENABLED, .write = write_msg, }; static int __init init_netconsole(void) { int err; struct netconsole_target *nt, *tmp; unsigned int count = 0; bool extended = false; unsigned long flags; char *target_config; char *input = config; if (strnlen(input, MAX_PARAM_LENGTH)) { while ((target_config = strsep(&input, ";"))) { nt = alloc_param_target(target_config, count); if (IS_ERR(nt)) { if (IS_ENABLED(CONFIG_NETCONSOLE_DYNAMIC)) continue; err = PTR_ERR(nt); goto fail; } /* Dump existing printks when we register */ if (nt->extended) { extended = true; netconsole_ext.flags |= CON_PRINTBUFFER; } else { netconsole.flags |= CON_PRINTBUFFER; } spin_lock_irqsave(&target_list_lock, flags); list_add(&nt->list, &target_list); spin_unlock_irqrestore(&target_list_lock, flags); count++; } } err = register_netdevice_notifier(&netconsole_netdev_notifier); if (err) goto fail; err = dynamic_netconsole_init(); if (err) goto undonotifier; if (extended) register_console(&netconsole_ext); register_console(&netconsole); pr_info("network logging started\n"); return err; undonotifier: unregister_netdevice_notifier(&netconsole_netdev_notifier); fail: pr_err("cleaning up\n"); /* * Remove all targets and destroy them (only targets created * from the boot/module option exist here). Skipping the list * lock is safe here, and netpoll_cleanup() will sleep. */ list_for_each_entry_safe(nt, tmp, &target_list, list) { list_del(&nt->list); free_param_target(nt); } return err; } static void __exit cleanup_netconsole(void) { struct netconsole_target *nt, *tmp; if (console_is_registered(&netconsole_ext)) unregister_console(&netconsole_ext); unregister_console(&netconsole); dynamic_netconsole_exit(); unregister_netdevice_notifier(&netconsole_netdev_notifier); /* * Targets created via configfs pin references on our module * and would first be rmdir(2)'ed from userspace. We reach * here only when they are already destroyed, and only those * created from the boot/module option are left, so remove and * destroy them. Skipping the list lock is safe here, and * netpoll_cleanup() will sleep. */ list_for_each_entry_safe(nt, tmp, &target_list, list) { list_del(&nt->list); free_param_target(nt); } } /* * Use late_initcall to ensure netconsole is * initialized after network device driver if built-in. * * late_initcall() and module_init() are identical if built as module. */ late_initcall(init_netconsole); module_exit(cleanup_netconsole);
27 2 1 1 2 1 1 1 4 14 5 9 3 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2022 Pablo Neira Ayuso <pablo@netfilter.org> */ #include <linux/kernel.h> #include <linux/if_vlan.h> #include <linux/init.h> #include <linux/module.h> #include <linux/netlink.h> #include <linux/netfilter.h> #include <linux/netfilter/nf_tables.h> #include <net/netfilter/nf_tables_core.h> #include <net/netfilter/nf_tables.h> #include <net/netfilter/nft_meta.h> #include <net/netfilter/nf_tables_offload.h> #include <linux/tcp.h> #include <linux/udp.h> #include <net/gre.h> #include <net/geneve.h> #include <net/ip.h> #include <linux/icmpv6.h> #include <linux/ip.h> #include <linux/ipv6.h> static DEFINE_PER_CPU(struct nft_inner_tun_ctx, nft_pcpu_tun_ctx); /* Same layout as nft_expr but it embeds the private expression data area. */ struct __nft_expr { const struct nft_expr_ops *ops; union { struct nft_payload payload; struct nft_meta meta; } __attribute__((aligned(__alignof__(u64)))); }; enum { NFT_INNER_EXPR_PAYLOAD, NFT_INNER_EXPR_META, }; struct nft_inner { u8 flags; u8 hdrsize; u8 type; u8 expr_type; struct __nft_expr expr; }; static int nft_inner_parse_l2l3(const struct nft_inner *priv, const struct nft_pktinfo *pkt, struct nft_inner_tun_ctx *ctx, u32 off) { __be16 llproto, outer_llproto; u32 nhoff, thoff; if (priv->flags & NFT_INNER_LL) { struct vlan_ethhdr *veth, _veth; struct ethhdr *eth, _eth; u32 hdrsize; eth = skb_header_pointer(pkt->skb, off, sizeof(_eth), &_eth); if (!eth) return -1; switch (eth->h_proto) { case htons(ETH_P_IP): case htons(ETH_P_IPV6): llproto = eth->h_proto; hdrsize = sizeof(_eth); break; case htons(ETH_P_8021Q): veth = skb_header_pointer(pkt->skb, off, sizeof(_veth), &_veth); if (!veth) return -1; outer_llproto = veth->h_vlan_encapsulated_proto; llproto = veth->h_vlan_proto; hdrsize = sizeof(_veth); break; default: return -1; } ctx->inner_lloff = off; ctx->flags |= NFT_PAYLOAD_CTX_INNER_LL; off += hdrsize; } else { struct iphdr *iph; u32 _version; iph = skb_header_pointer(pkt->skb, off, sizeof(_version), &_version); if (!iph) return -1; switch (iph->version) { case 4: llproto = htons(ETH_P_IP); break; case 6: llproto = htons(ETH_P_IPV6); break; default: return -1; } } ctx->llproto = llproto; if (llproto == htons(ETH_P_8021Q)) llproto = outer_llproto; nhoff = off; switch (llproto) { case htons(ETH_P_IP): { struct iphdr *iph, _iph; iph = skb_header_pointer(pkt->skb, nhoff, sizeof(_iph), &_iph); if (!iph) return -1; if (iph->ihl < 5 || iph->version != 4) return -1; ctx->inner_nhoff = nhoff; ctx->flags |= NFT_PAYLOAD_CTX_INNER_NH; thoff = nhoff + (iph->ihl * 4); if ((ntohs(iph->frag_off) & IP_OFFSET) == 0) { ctx->flags |= NFT_PAYLOAD_CTX_INNER_TH; ctx->inner_thoff = thoff; ctx->l4proto = iph->protocol; } } break; case htons(ETH_P_IPV6): { struct ipv6hdr *ip6h, _ip6h; int fh_flags = IP6_FH_F_AUTH; unsigned short fragoff; int l4proto; ip6h = skb_header_pointer(pkt->skb, nhoff, sizeof(_ip6h), &_ip6h); if (!ip6h) return -1; if (ip6h->version != 6) return -1; ctx->inner_nhoff = nhoff; ctx->flags |= NFT_PAYLOAD_CTX_INNER_NH; thoff = nhoff; l4proto = ipv6_find_hdr(pkt->skb, &thoff, -1, &fragoff, &fh_flags); if (l4proto < 0 || thoff > U16_MAX) return -1; if (fragoff == 0) { thoff = nhoff + sizeof(_ip6h); ctx->flags |= NFT_PAYLOAD_CTX_INNER_TH; ctx->inner_thoff = thoff; ctx->l4proto = l4proto; } } break; default: return -1; } return 0; } static int nft_inner_parse_tunhdr(const struct nft_inner *priv, const struct nft_pktinfo *pkt, struct nft_inner_tun_ctx *ctx, u32 *off) { if (pkt->tprot == IPPROTO_GRE) { ctx->inner_tunoff = pkt->thoff; ctx->flags |= NFT_PAYLOAD_CTX_INNER_TUN; return 0; } if (pkt->tprot != IPPROTO_UDP) return -1; ctx->inner_tunoff = *off; ctx->flags |= NFT_PAYLOAD_CTX_INNER_TUN; *off += priv->hdrsize; switch (priv->type) { case NFT_INNER_GENEVE: { struct genevehdr *gnvh, _gnvh; gnvh = skb_header_pointer(pkt->skb, pkt->inneroff, sizeof(_gnvh), &_gnvh); if (!gnvh) return -1; *off += gnvh->opt_len * 4; } break; default: break; } return 0; } static int nft_inner_parse(const struct nft_inner *priv, struct nft_pktinfo *pkt, struct nft_inner_tun_ctx *tun_ctx) { u32 off = pkt->inneroff; if (priv->flags & NFT_INNER_HDRSIZE && nft_inner_parse_tunhdr(priv, pkt, tun_ctx, &off) < 0) return -1; if (priv->flags & (NFT_INNER_LL | NFT_INNER_NH)) { if (nft_inner_parse_l2l3(priv, pkt, tun_ctx, off) < 0) return -1; } else if (priv->flags & NFT_INNER_TH) { tun_ctx->inner_thoff = off; tun_ctx->flags |= NFT_PAYLOAD_CTX_INNER_TH; } tun_ctx->type = priv->type; tun_ctx->cookie = (unsigned long)pkt->skb; pkt->flags |= NFT_PKTINFO_INNER_FULL; return 0; } static bool nft_inner_restore_tun_ctx(const struct nft_pktinfo *pkt, struct nft_inner_tun_ctx *tun_ctx) { struct nft_inner_tun_ctx *this_cpu_tun_ctx; local_bh_disable(); this_cpu_tun_ctx = this_cpu_ptr(&nft_pcpu_tun_ctx); if (this_cpu_tun_ctx->cookie != (unsigned long)pkt->skb) { local_bh_enable(); return false; } *tun_ctx = *this_cpu_tun_ctx; local_bh_enable(); return true; } static void nft_inner_save_tun_ctx(const struct nft_pktinfo *pkt, const struct nft_inner_tun_ctx *tun_ctx) { struct nft_inner_tun_ctx *this_cpu_tun_ctx; local_bh_disable(); this_cpu_tun_ctx = this_cpu_ptr(&nft_pcpu_tun_ctx); if (this_cpu_tun_ctx->cookie != tun_ctx->cookie) *this_cpu_tun_ctx = *tun_ctx; local_bh_enable(); } static bool nft_inner_parse_needed(const struct nft_inner *priv, const struct nft_pktinfo *pkt, struct nft_inner_tun_ctx *tun_ctx) { if (!(pkt->flags & NFT_PKTINFO_INNER_FULL)) return true; if (!nft_inner_restore_tun_ctx(pkt, tun_ctx)) return true; if (priv->type != tun_ctx->type) return true; return false; } static void nft_inner_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { const struct nft_inner *priv = nft_expr_priv(expr); struct nft_inner_tun_ctx tun_ctx = {}; if (nft_payload_inner_offset(pkt) < 0) goto err; if (nft_inner_parse_needed(priv, pkt, &tun_ctx) && nft_inner_parse(priv, (struct nft_pktinfo *)pkt, &tun_ctx) < 0) goto err; switch (priv->expr_type) { case NFT_INNER_EXPR_PAYLOAD: nft_payload_inner_eval((struct nft_expr *)&priv->expr, regs, pkt, &tun_ctx); break; case NFT_INNER_EXPR_META: nft_meta_inner_eval((struct nft_expr *)&priv->expr, regs, pkt, &tun_ctx); break; default: WARN_ON_ONCE(1); goto err; } nft_inner_save_tun_ctx(pkt, &tun_ctx); return; err: regs->verdict.code = NFT_BREAK; } static const struct nla_policy nft_inner_policy[NFTA_INNER_MAX + 1] = { [NFTA_INNER_NUM] = { .type = NLA_U32 }, [NFTA_INNER_FLAGS] = { .type = NLA_U32 }, [NFTA_INNER_HDRSIZE] = { .type = NLA_U32 }, [NFTA_INNER_TYPE] = { .type = NLA_U32 }, [NFTA_INNER_EXPR] = { .type = NLA_NESTED }, }; struct nft_expr_info { const struct nft_expr_ops *ops; const struct nlattr *attr; struct nlattr *tb[NFT_EXPR_MAXATTR + 1]; }; static int nft_inner_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { struct nft_inner *priv = nft_expr_priv(expr); u32 flags, hdrsize, type, num; struct nft_expr_info expr_info; int err; if (!tb[NFTA_INNER_FLAGS] || !tb[NFTA_INNER_NUM] || !tb[NFTA_INNER_HDRSIZE] || !tb[NFTA_INNER_TYPE] || !tb[NFTA_INNER_EXPR]) return -EINVAL; flags = ntohl(nla_get_be32(tb[NFTA_INNER_FLAGS])); if (flags & ~NFT_INNER_MASK) return -EOPNOTSUPP; num = ntohl(nla_get_be32(tb[NFTA_INNER_NUM])); if (num != 0) return -EOPNOTSUPP; hdrsize = ntohl(nla_get_be32(tb[NFTA_INNER_HDRSIZE])); type = ntohl(nla_get_be32(tb[NFTA_INNER_TYPE])); if (type > U8_MAX) return -EINVAL; if (flags & NFT_INNER_HDRSIZE) { if (hdrsize == 0 || hdrsize > 64) return -EOPNOTSUPP; } priv->flags = flags; priv->hdrsize = hdrsize; priv->type = type; err = nft_expr_inner_parse(ctx, tb[NFTA_INNER_EXPR], &expr_info); if (err < 0) return err; priv->expr.ops = expr_info.ops; if (!strcmp(expr_info.ops->type->name, "payload")) priv->expr_type = NFT_INNER_EXPR_PAYLOAD; else if (!strcmp(expr_info.ops->type->name, "meta")) priv->expr_type = NFT_INNER_EXPR_META; else return -EINVAL; err = expr_info.ops->init(ctx, (struct nft_expr *)&priv->expr, (const struct nlattr * const*)expr_info.tb); if (err < 0) return err; return 0; } static int nft_inner_dump(struct sk_buff *skb, const struct nft_expr *expr, bool reset) { const struct nft_inner *priv = nft_expr_priv(expr); if (nla_put_be32(skb, NFTA_INNER_NUM, htonl(0)) || nla_put_be32(skb, NFTA_INNER_TYPE, htonl(priv->type)) || nla_put_be32(skb, NFTA_INNER_FLAGS, htonl(priv->flags)) || nla_put_be32(skb, NFTA_INNER_HDRSIZE, htonl(priv->hdrsize))) goto nla_put_failure; if (nft_expr_dump(skb, NFTA_INNER_EXPR, (struct nft_expr *)&priv->expr, reset) < 0) goto nla_put_failure; return 0; nla_put_failure: return -1; } static const struct nft_expr_ops nft_inner_ops = { .type = &nft_inner_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_inner)), .eval = nft_inner_eval, .init = nft_inner_init, .dump = nft_inner_dump, }; struct nft_expr_type nft_inner_type __read_mostly = { .name = "inner", .ops = &nft_inner_ops, .policy = nft_inner_policy, .maxattr = NFTA_INNER_MAX, .owner = THIS_MODULE, };
376 534 5 2920 2606 4227 7619 1903 1902 894 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 /* SPDX-License-Identifier: GPL-2.0 */ /* * include/linux/writeback.h */ #ifndef WRITEBACK_H #define WRITEBACK_H #include <linux/sched.h> #include <linux/workqueue.h> #include <linux/fs.h> #include <linux/flex_proportions.h> #include <linux/backing-dev-defs.h> #include <linux/blk_types.h> #include <linux/pagevec.h> struct bio; DECLARE_PER_CPU(int, dirty_throttle_leaks); /* * The global dirty threshold is normally equal to the global dirty limit, * except when the system suddenly allocates a lot of anonymous memory and * knocks down the global dirty threshold quickly, in which case the global * dirty limit will follow down slowly to prevent livelocking all dirtier tasks. */ #define DIRTY_SCOPE 8 struct backing_dev_info; /* * fs/fs-writeback.c */ enum writeback_sync_modes { WB_SYNC_NONE, /* Don't wait on anything */ WB_SYNC_ALL, /* Wait on every mapping */ }; /* * A control structure which tells the writeback code what to do. These are * always on the stack, and hence need no locking. They are always initialised * in a manner such that unspecified fields are set to zero. */ struct writeback_control { /* public fields that can be set and/or consumed by the caller: */ long nr_to_write; /* Write this many pages, and decrement this for each page written */ long pages_skipped; /* Pages which were not written */ /* * For a_ops->writepages(): if start or end are non-zero then this is * a hint that the filesystem need only write out the pages inside that * byterange. The byte at `end' is included in the writeout request. */ loff_t range_start; loff_t range_end; enum writeback_sync_modes sync_mode; unsigned for_kupdate:1; /* A kupdate writeback */ unsigned for_background:1; /* A background writeback */ unsigned tagged_writepages:1; /* tag-and-write to avoid livelock */ unsigned for_reclaim:1; /* Invoked from the page allocator */ unsigned range_cyclic:1; /* range_start is cyclic */ unsigned for_sync:1; /* sync(2) WB_SYNC_ALL writeback */ unsigned unpinned_netfs_wb:1; /* Cleared I_PINNING_NETFS_WB */ /* * When writeback IOs are bounced through async layers, only the * initial synchronous phase should be accounted towards inode * cgroup ownership arbitration to avoid confusion. Later stages * can set the following flag to disable the accounting. */ unsigned no_cgroup_owner:1; /* To enable batching of swap writes to non-block-device backends, * "plug" can be set point to a 'struct swap_iocb *'. When all swap * writes have been submitted, if with swap_iocb is not NULL, * swap_write_unplug() should be called. */ struct swap_iocb **swap_plug; /* Target list for splitting a large folio */ struct list_head *list; /* internal fields used by the ->writepages implementation: */ struct folio_batch fbatch; pgoff_t index; int saved_err; #ifdef CONFIG_CGROUP_WRITEBACK struct bdi_writeback *wb; /* wb this writeback is issued under */ struct inode *inode; /* inode being written out */ /* foreign inode detection, see wbc_detach_inode() */ int wb_id; /* current wb id */ int wb_lcand_id; /* last foreign candidate wb id */ int wb_tcand_id; /* this foreign candidate wb id */ size_t wb_bytes; /* bytes written by current wb */ size_t wb_lcand_bytes; /* bytes written by last candidate */ size_t wb_tcand_bytes; /* bytes written by this candidate */ #endif }; static inline blk_opf_t wbc_to_write_flags(struct writeback_control *wbc) { blk_opf_t flags = 0; if (wbc->sync_mode == WB_SYNC_ALL) flags |= REQ_SYNC; else if (wbc->for_kupdate || wbc->for_background) flags |= REQ_BACKGROUND; return flags; } #ifdef CONFIG_CGROUP_WRITEBACK #define wbc_blkcg_css(wbc) \ ((wbc)->wb ? (wbc)->wb->blkcg_css : blkcg_root_css) #else #define wbc_blkcg_css(wbc) (blkcg_root_css) #endif /* CONFIG_CGROUP_WRITEBACK */ /* * A wb_domain represents a domain that wb's (bdi_writeback's) belong to * and are measured against each other in. There always is one global * domain, global_wb_domain, that every wb in the system is a member of. * This allows measuring the relative bandwidth of each wb to distribute * dirtyable memory accordingly. */ struct wb_domain { spinlock_t lock; /* * Scale the writeback cache size proportional to the relative * writeout speed. * * We do this by keeping a floating proportion between BDIs, based * on page writeback completions [end_page_writeback()]. Those * devices that write out pages fastest will get the larger share, * while the slower will get a smaller share. * * We use page writeout completions because we are interested in * getting rid of dirty pages. Having them written out is the * primary goal. * * We introduce a concept of time, a period over which we measure * these events, because demand can/will vary over time. The length * of this period itself is measured in page writeback completions. */ struct fprop_global completions; struct timer_list period_timer; /* timer for aging of completions */ unsigned long period_time; /* * The dirtyable memory and dirty threshold could be suddenly * knocked down by a large amount (eg. on the startup of KVM in a * swapless system). This may throw the system into deep dirty * exceeded state and throttle heavy/light dirtiers alike. To * retain good responsiveness, maintain global_dirty_limit for * tracking slowly down to the knocked down dirty threshold. * * Both fields are protected by ->lock. */ unsigned long dirty_limit_tstamp; unsigned long dirty_limit; }; /** * wb_domain_size_changed - memory available to a wb_domain has changed * @dom: wb_domain of interest * * This function should be called when the amount of memory available to * @dom has changed. It resets @dom's dirty limit parameters to prevent * the past values which don't match the current configuration from skewing * dirty throttling. Without this, when memory size of a wb_domain is * greatly reduced, the dirty throttling logic may allow too many pages to * be dirtied leading to consecutive unnecessary OOMs and may get stuck in * that situation. */ static inline void wb_domain_size_changed(struct wb_domain *dom) { spin_lock(&dom->lock); dom->dirty_limit_tstamp = jiffies; dom->dirty_limit = 0; spin_unlock(&dom->lock); } /* * fs/fs-writeback.c */ struct bdi_writeback; void writeback_inodes_sb(struct super_block *, enum wb_reason reason); void writeback_inodes_sb_nr(struct super_block *, unsigned long nr, enum wb_reason reason); void try_to_writeback_inodes_sb(struct super_block *sb, enum wb_reason reason); void sync_inodes_sb(struct super_block *); void wakeup_flusher_threads(enum wb_reason reason); void wakeup_flusher_threads_bdi(struct backing_dev_info *bdi, enum wb_reason reason); void inode_wait_for_writeback(struct inode *inode); void inode_io_list_del(struct inode *inode); /* writeback.h requires fs.h; it, too, is not included from here. */ static inline void wait_on_inode(struct inode *inode) { wait_var_event(inode_state_wait_address(inode, __I_NEW), !(READ_ONCE(inode->i_state) & I_NEW)); } #ifdef CONFIG_CGROUP_WRITEBACK #include <linux/cgroup.h> #include <linux/bio.h> void __inode_attach_wb(struct inode *inode, struct folio *folio); void wbc_detach_inode(struct writeback_control *wbc); void wbc_account_cgroup_owner(struct writeback_control *wbc, struct folio *folio, size_t bytes); int cgroup_writeback_by_id(u64 bdi_id, int memcg_id, enum wb_reason reason, struct wb_completion *done); void cgroup_writeback_umount(struct super_block *sb); bool cleanup_offline_cgwb(struct bdi_writeback *wb); /** * inode_attach_wb - associate an inode with its wb * @inode: inode of interest * @folio: folio being dirtied (may be NULL) * * If @inode doesn't have its wb, associate it with the wb matching the * memcg of @folio or, if @folio is NULL, %current. May be called w/ or w/o * @inode->i_lock. */ static inline void inode_attach_wb(struct inode *inode, struct folio *folio) { if (!inode->i_wb) __inode_attach_wb(inode, folio); } /** * inode_detach_wb - disassociate an inode from its wb * @inode: inode of interest * * @inode is being freed. Detach from its wb. */ static inline void inode_detach_wb(struct inode *inode) { if (inode->i_wb) { WARN_ON_ONCE(!(inode->i_state & I_CLEAR)); wb_put(inode->i_wb); inode->i_wb = NULL; } } void wbc_attach_fdatawrite_inode(struct writeback_control *wbc, struct inode *inode); /** * wbc_init_bio - writeback specific initializtion of bio * @wbc: writeback_control for the writeback in progress * @bio: bio to be initialized * * @bio is a part of the writeback in progress controlled by @wbc. Perform * writeback specific initialization. This is used to apply the cgroup * writeback context. Must be called after the bio has been associated with * a device. */ static inline void wbc_init_bio(struct writeback_control *wbc, struct bio *bio) { /* * pageout() path doesn't attach @wbc to the inode being written * out. This is intentional as we don't want the function to block * behind a slow cgroup. Ultimately, we want pageout() to kick off * regular writeback instead of writing things out itself. */ if (wbc->wb) bio_associate_blkg_from_css(bio, wbc->wb->blkcg_css); } #else /* CONFIG_CGROUP_WRITEBACK */ static inline void inode_attach_wb(struct inode *inode, struct folio *folio) { } static inline void inode_detach_wb(struct inode *inode) { } static inline void wbc_attach_fdatawrite_inode(struct writeback_control *wbc, struct inode *inode) { } static inline void wbc_detach_inode(struct writeback_control *wbc) { } static inline void wbc_init_bio(struct writeback_control *wbc, struct bio *bio) { } static inline void wbc_account_cgroup_owner(struct writeback_control *wbc, struct folio *folio, size_t bytes) { } static inline void cgroup_writeback_umount(struct super_block *sb) { } #endif /* CONFIG_CGROUP_WRITEBACK */ /* * mm/page-writeback.c */ /* consolidated parameters for balance_dirty_pages() and its subroutines */ struct dirty_throttle_control { #ifdef CONFIG_CGROUP_WRITEBACK struct wb_domain *dom; struct dirty_throttle_control *gdtc; /* only set in memcg dtc's */ #endif struct bdi_writeback *wb; struct fprop_local_percpu *wb_completions; unsigned long avail; /* dirtyable */ unsigned long dirty; /* file_dirty + write + nfs */ unsigned long thresh; /* dirty threshold */ unsigned long bg_thresh; /* dirty background threshold */ unsigned long limit; /* hard dirty limit */ unsigned long wb_dirty; /* per-wb counterparts */ unsigned long wb_thresh; unsigned long wb_bg_thresh; unsigned long pos_ratio; bool freerun; bool dirty_exceeded; }; void laptop_io_completion(struct backing_dev_info *info); void laptop_sync_completion(void); void laptop_mode_timer_fn(struct timer_list *t); bool node_dirty_ok(struct pglist_data *pgdat); int wb_domain_init(struct wb_domain *dom, gfp_t gfp); #ifdef CONFIG_CGROUP_WRITEBACK void wb_domain_exit(struct wb_domain *dom); #endif extern struct wb_domain global_wb_domain; /* These are exported to sysctl. */ extern unsigned int dirty_writeback_interval; extern unsigned int dirty_expire_interval; extern int laptop_mode; void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty); unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh); unsigned long cgwb_calc_thresh(struct bdi_writeback *wb); void wb_update_bandwidth(struct bdi_writeback *wb); /* Invoke balance dirty pages in async mode. */ #define BDP_ASYNC 0x0001 void balance_dirty_pages_ratelimited(struct address_space *mapping); int balance_dirty_pages_ratelimited_flags(struct address_space *mapping, unsigned int flags); bool wb_over_bg_thresh(struct bdi_writeback *wb); struct folio *writeback_iter(struct address_space *mapping, struct writeback_control *wbc, struct folio *folio, int *error); typedef int (*writepage_t)(struct folio *folio, struct writeback_control *wbc, void *data); int write_cache_pages(struct address_space *mapping, struct writeback_control *wbc, writepage_t writepage, void *data); int do_writepages(struct address_space *mapping, struct writeback_control *wbc); void writeback_set_ratelimit(void); void tag_pages_for_writeback(struct address_space *mapping, pgoff_t start, pgoff_t end); bool filemap_dirty_folio(struct address_space *mapping, struct folio *folio); bool folio_redirty_for_writepage(struct writeback_control *, struct folio *); bool redirty_page_for_writepage(struct writeback_control *, struct page *); void sb_mark_inode_writeback(struct inode *inode); void sb_clear_inode_writeback(struct inode *inode); #endif /* WRITEBACK_H */
53 40 159 79 17 17 214 42 535 435 41 103 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * Hash: Hash algorithms under the crypto API * * Copyright (c) 2008 Herbert Xu <herbert@gondor.apana.org.au> */ #ifndef _CRYPTO_HASH_H #define _CRYPTO_HASH_H #include <linux/atomic.h> #include <linux/crypto.h> #include <linux/slab.h> #include <linux/string.h> /* Set this bit for virtual address instead of SG list. */ #define CRYPTO_AHASH_REQ_VIRT 0x00000001 struct crypto_ahash; /** * DOC: Message Digest Algorithm Definitions * * These data structures define modular message digest algorithm * implementations, managed via crypto_register_ahash(), * crypto_register_shash(), crypto_unregister_ahash() and * crypto_unregister_shash(). */ /* * struct hash_alg_common - define properties of message digest * @digestsize: Size of the result of the transformation. A buffer of this size * must be available to the @final and @finup calls, so they can * store the resulting hash into it. For various predefined sizes, * search include/crypto/ using * git grep _DIGEST_SIZE include/crypto. * @statesize: Size of the block for partial state of the transformation. A * buffer of this size must be passed to the @export function as it * will save the partial state of the transformation into it. On the * other side, the @import function will load the state from a * buffer of this size as well. * @base: Start of data structure of cipher algorithm. The common data * structure of crypto_alg contains information common to all ciphers. * The hash_alg_common data structure now adds the hash-specific * information. */ #define HASH_ALG_COMMON { \ unsigned int digestsize; \ unsigned int statesize; \ \ struct crypto_alg base; \ } struct hash_alg_common HASH_ALG_COMMON; struct ahash_request { struct crypto_async_request base; unsigned int nbytes; union { struct scatterlist *src; const u8 *svirt; }; u8 *result; void *__ctx[] CRYPTO_MINALIGN_ATTR; }; /** * struct ahash_alg - asynchronous message digest definition * @init: **[mandatory]** Initialize the transformation context. Intended only to initialize the * state of the HASH transformation at the beginning. This shall fill in * the internal structures used during the entire duration of the whole * transformation. No data processing happens at this point. Driver code * implementation must not use req->result. * @update: **[mandatory]** Push a chunk of data into the driver for transformation. This * function actually pushes blocks of data from upper layers into the * driver, which then passes those to the hardware as seen fit. This * function must not finalize the HASH transformation by calculating the * final message digest as this only adds more data into the * transformation. This function shall not modify the transformation * context, as this function may be called in parallel with the same * transformation object. Data processing can happen synchronously * [SHASH] or asynchronously [AHASH] at this point. Driver must not use * req->result. * @final: **[mandatory]** Retrieve result from the driver. This function finalizes the * transformation and retrieves the resulting hash from the driver and * pushes it back to upper layers. No data processing happens at this * point unless hardware requires it to finish the transformation * (then the data buffered by the device driver is processed). * @finup: **[optional]** Combination of @update and @final. This function is effectively a * combination of @update and @final calls issued in sequence. As some * hardware cannot do @update and @final separately, this callback was * added to allow such hardware to be used at least by IPsec. Data * processing can happen synchronously [SHASH] or asynchronously [AHASH] * at this point. * @digest: Combination of @init and @update and @final. This function * effectively behaves as the entire chain of operations, @init, * @update and @final issued in sequence. Just like @finup, this was * added for hardware which cannot do even the @finup, but can only do * the whole transformation in one run. Data processing can happen * synchronously [SHASH] or asynchronously [AHASH] at this point. * @setkey: Set optional key used by the hashing algorithm. Intended to push * optional key used by the hashing algorithm from upper layers into * the driver. This function can store the key in the transformation * context or can outright program it into the hardware. In the former * case, one must be careful to program the key into the hardware at * appropriate time and one must be careful that .setkey() can be * called multiple times during the existence of the transformation * object. Not all hashing algorithms do implement this function as it * is only needed for keyed message digests. SHAx/MDx/CRCx do NOT * implement this function. HMAC(MDx)/HMAC(SHAx)/CMAC(AES) do implement * this function. This function must be called before any other of the * @init, @update, @final, @finup, @digest is called. No data * processing happens at this point. * @export: Export partial state of the transformation. This function dumps the * entire state of the ongoing transformation into a provided block of * data so it can be @import 'ed back later on. This is useful in case * you want to save partial result of the transformation after * processing certain amount of data and reload this partial result * multiple times later on for multiple re-use. No data processing * happens at this point. Driver must not use req->result. * @import: Import partial state of the transformation. This function loads the * entire state of the ongoing transformation from a provided block of * data so the transformation can continue from this point onward. No * data processing happens at this point. Driver must not use * req->result. * @init_tfm: Initialize the cryptographic transformation object. * This function is called only once at the instantiation * time, right after the transformation context was * allocated. In case the cryptographic hardware has * some special requirements which need to be handled * by software, this function shall check for the precise * requirement of the transformation and put any software * fallbacks in place. * @exit_tfm: Deinitialize the cryptographic transformation object. * This is a counterpart to @init_tfm, used to remove * various changes set in @init_tfm. * @clone_tfm: Copy transform into new object, may allocate memory. * @reqsize: Size of the request context. * @halg: see struct hash_alg_common */ struct ahash_alg { int (*init)(struct ahash_request *req); int (*update)(struct ahash_request *req); int (*final)(struct ahash_request *req); int (*finup)(struct ahash_request *req); int (*digest)(struct ahash_request *req); int (*export)(struct ahash_request *req, void *out); int (*import)(struct ahash_request *req, const void *in); int (*setkey)(struct crypto_ahash *tfm, const u8 *key, unsigned int keylen); int (*init_tfm)(struct crypto_ahash *tfm); void (*exit_tfm)(struct crypto_ahash *tfm); int (*clone_tfm)(struct crypto_ahash *dst, struct crypto_ahash *src); unsigned int reqsize; struct hash_alg_common halg; }; struct shash_desc { struct crypto_shash *tfm; void *__ctx[] __aligned(ARCH_SLAB_MINALIGN); }; #define HASH_MAX_DIGESTSIZE 64 /* * Worst case is hmac(sha3-224-generic). Its context is a nested 'shash_desc' * containing a 'struct sha3_state'. */ #define HASH_MAX_DESCSIZE (sizeof(struct shash_desc) + 360) #define SHASH_DESC_ON_STACK(shash, ctx) \ char __##shash##_desc[sizeof(struct shash_desc) + HASH_MAX_DESCSIZE] \ __aligned(__alignof__(struct shash_desc)); \ struct shash_desc *shash = (struct shash_desc *)__##shash##_desc /** * struct shash_alg - synchronous message digest definition * @init: see struct ahash_alg * @update: see struct ahash_alg * @final: see struct ahash_alg * @finup: see struct ahash_alg * @digest: see struct ahash_alg * @export: see struct ahash_alg * @import: see struct ahash_alg * @setkey: see struct ahash_alg * @init_tfm: Initialize the cryptographic transformation object. * This function is called only once at the instantiation * time, right after the transformation context was * allocated. In case the cryptographic hardware has * some special requirements which need to be handled * by software, this function shall check for the precise * requirement of the transformation and put any software * fallbacks in place. * @exit_tfm: Deinitialize the cryptographic transformation object. * This is a counterpart to @init_tfm, used to remove * various changes set in @init_tfm. * @clone_tfm: Copy transform into new object, may allocate memory. * @descsize: Size of the operational state for the message digest. This state * size is the memory size that needs to be allocated for * shash_desc.__ctx * @halg: see struct hash_alg_common * @HASH_ALG_COMMON: see struct hash_alg_common */ struct shash_alg { int (*init)(struct shash_desc *desc); int (*update)(struct shash_desc *desc, const u8 *data, unsigned int len); int (*final)(struct shash_desc *desc, u8 *out); int (*finup)(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *out); int (*digest)(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *out); int (*export)(struct shash_desc *desc, void *out); int (*import)(struct shash_desc *desc, const void *in); int (*setkey)(struct crypto_shash *tfm, const u8 *key, unsigned int keylen); int (*init_tfm)(struct crypto_shash *tfm); void (*exit_tfm)(struct crypto_shash *tfm); int (*clone_tfm)(struct crypto_shash *dst, struct crypto_shash *src); unsigned int descsize; union { struct HASH_ALG_COMMON; struct hash_alg_common halg; }; }; #undef HASH_ALG_COMMON struct crypto_ahash { bool using_shash; /* Underlying algorithm is shash, not ahash */ unsigned int statesize; unsigned int reqsize; struct crypto_tfm base; }; struct crypto_shash { unsigned int descsize; struct crypto_tfm base; }; /** * DOC: Asynchronous Message Digest API * * The asynchronous message digest API is used with the ciphers of type * CRYPTO_ALG_TYPE_AHASH (listed as type "ahash" in /proc/crypto) * * The asynchronous cipher operation discussion provided for the * CRYPTO_ALG_TYPE_SKCIPHER API applies here as well. */ static inline struct crypto_ahash *__crypto_ahash_cast(struct crypto_tfm *tfm) { return container_of(tfm, struct crypto_ahash, base); } /** * crypto_alloc_ahash() - allocate ahash cipher handle * @alg_name: is the cra_name / name or cra_driver_name / driver name of the * ahash cipher * @type: specifies the type of the cipher * @mask: specifies the mask for the cipher * * Allocate a cipher handle for an ahash. The returned struct * crypto_ahash is the cipher handle that is required for any subsequent * API invocation for that ahash. * * Return: allocated cipher handle in case of success; IS_ERR() is true in case * of an error, PTR_ERR() returns the error code. */ struct crypto_ahash *crypto_alloc_ahash(const char *alg_name, u32 type, u32 mask); struct crypto_ahash *crypto_clone_ahash(struct crypto_ahash *tfm); static inline struct crypto_tfm *crypto_ahash_tfm(struct crypto_ahash *tfm) { return &tfm->base; } /** * crypto_free_ahash() - zeroize and free the ahash handle * @tfm: cipher handle to be freed * * If @tfm is a NULL or error pointer, this function does nothing. */ static inline void crypto_free_ahash(struct crypto_ahash *tfm) { crypto_destroy_tfm(tfm, crypto_ahash_tfm(tfm)); } /** * crypto_has_ahash() - Search for the availability of an ahash. * @alg_name: is the cra_name / name or cra_driver_name / driver name of the * ahash * @type: specifies the type of the ahash * @mask: specifies the mask for the ahash * * Return: true when the ahash is known to the kernel crypto API; false * otherwise */ int crypto_has_ahash(const char *alg_name, u32 type, u32 mask); static inline const char *crypto_ahash_alg_name(struct crypto_ahash *tfm) { return crypto_tfm_alg_name(crypto_ahash_tfm(tfm)); } static inline const char *crypto_ahash_driver_name(struct crypto_ahash *tfm) { return crypto_tfm_alg_driver_name(crypto_ahash_tfm(tfm)); } /** * crypto_ahash_blocksize() - obtain block size for cipher * @tfm: cipher handle * * The block size for the message digest cipher referenced with the cipher * handle is returned. * * Return: block size of cipher */ static inline unsigned int crypto_ahash_blocksize(struct crypto_ahash *tfm) { return crypto_tfm_alg_blocksize(crypto_ahash_tfm(tfm)); } static inline struct hash_alg_common *__crypto_hash_alg_common( struct crypto_alg *alg) { return container_of(alg, struct hash_alg_common, base); } static inline struct hash_alg_common *crypto_hash_alg_common( struct crypto_ahash *tfm) { return __crypto_hash_alg_common(crypto_ahash_tfm(tfm)->__crt_alg); } /** * crypto_ahash_digestsize() - obtain message digest size * @tfm: cipher handle * * The size for the message digest created by the message digest cipher * referenced with the cipher handle is returned. * * * Return: message digest size of cipher */ static inline unsigned int crypto_ahash_digestsize(struct crypto_ahash *tfm) { return crypto_hash_alg_common(tfm)->digestsize; } /** * crypto_ahash_statesize() - obtain size of the ahash state * @tfm: cipher handle * * Return the size of the ahash state. With the crypto_ahash_export() * function, the caller can export the state into a buffer whose size is * defined with this function. * * Return: size of the ahash state */ static inline unsigned int crypto_ahash_statesize(struct crypto_ahash *tfm) { return tfm->statesize; } static inline u32 crypto_ahash_get_flags(struct crypto_ahash *tfm) { return crypto_tfm_get_flags(crypto_ahash_tfm(tfm)); } static inline void crypto_ahash_set_flags(struct crypto_ahash *tfm, u32 flags) { crypto_tfm_set_flags(crypto_ahash_tfm(tfm), flags); } static inline void crypto_ahash_clear_flags(struct crypto_ahash *tfm, u32 flags) { crypto_tfm_clear_flags(crypto_ahash_tfm(tfm), flags); } /** * crypto_ahash_reqtfm() - obtain cipher handle from request * @req: asynchronous request handle that contains the reference to the ahash * cipher handle * * Return the ahash cipher handle that is registered with the asynchronous * request handle ahash_request. * * Return: ahash cipher handle */ static inline struct crypto_ahash *crypto_ahash_reqtfm( struct ahash_request *req) { return __crypto_ahash_cast(req->base.tfm); } /** * crypto_ahash_reqsize() - obtain size of the request data structure * @tfm: cipher handle * * Return: size of the request data */ static inline unsigned int crypto_ahash_reqsize(struct crypto_ahash *tfm) { return tfm->reqsize; } static inline void *ahash_request_ctx(struct ahash_request *req) { return req->__ctx; } /** * crypto_ahash_setkey - set key for cipher handle * @tfm: cipher handle * @key: buffer holding the key * @keylen: length of the key in bytes * * The caller provided key is set for the ahash cipher. The cipher * handle must point to a keyed hash in order for this function to succeed. * * Return: 0 if the setting of the key was successful; < 0 if an error occurred */ int crypto_ahash_setkey(struct crypto_ahash *tfm, const u8 *key, unsigned int keylen); /** * crypto_ahash_finup() - update and finalize message digest * @req: reference to the ahash_request handle that holds all information * needed to perform the cipher operation * * This function is a "short-hand" for the function calls of * crypto_ahash_update and crypto_ahash_final. The parameters have the same * meaning as discussed for those separate functions. * * Return: see crypto_ahash_final() */ int crypto_ahash_finup(struct ahash_request *req); /** * crypto_ahash_final() - calculate message digest * @req: reference to the ahash_request handle that holds all information * needed to perform the cipher operation * * Finalize the message digest operation and create the message digest * based on all data added to the cipher handle. The message digest is placed * into the output buffer registered with the ahash_request handle. * * Return: * 0 if the message digest was successfully calculated; * -EINPROGRESS if data is fed into hardware (DMA) or queued for later; * -EBUSY if queue is full and request should be resubmitted later; * other < 0 if an error occurred */ int crypto_ahash_final(struct ahash_request *req); /** * crypto_ahash_digest() - calculate message digest for a buffer * @req: reference to the ahash_request handle that holds all information * needed to perform the cipher operation * * This function is a "short-hand" for the function calls of crypto_ahash_init, * crypto_ahash_update and crypto_ahash_final. The parameters have the same * meaning as discussed for those separate three functions. * * Return: see crypto_ahash_final() */ int crypto_ahash_digest(struct ahash_request *req); /** * crypto_ahash_export() - extract current message digest state * @req: reference to the ahash_request handle whose state is exported * @out: output buffer of sufficient size that can hold the hash state * * This function exports the hash state of the ahash_request handle into the * caller-allocated output buffer out which must have sufficient size (e.g. by * calling crypto_ahash_statesize()). * * Return: 0 if the export was successful; < 0 if an error occurred */ int crypto_ahash_export(struct ahash_request *req, void *out); /** * crypto_ahash_import() - import message digest state * @req: reference to ahash_request handle the state is imported into * @in: buffer holding the state * * This function imports the hash state into the ahash_request handle from the * input buffer. That buffer should have been generated with the * crypto_ahash_export function. * * Return: 0 if the import was successful; < 0 if an error occurred */ int crypto_ahash_import(struct ahash_request *req, const void *in); /** * crypto_ahash_init() - (re)initialize message digest handle * @req: ahash_request handle that already is initialized with all necessary * data using the ahash_request_* API functions * * The call (re-)initializes the message digest referenced by the ahash_request * handle. Any potentially existing state created by previous operations is * discarded. * * Return: see crypto_ahash_final() */ int crypto_ahash_init(struct ahash_request *req); /** * crypto_ahash_update() - add data to message digest for processing * @req: ahash_request handle that was previously initialized with the * crypto_ahash_init call. * * Updates the message digest state of the &ahash_request handle. The input data * is pointed to by the scatter/gather list registered in the &ahash_request * handle * * Return: see crypto_ahash_final() */ int crypto_ahash_update(struct ahash_request *req); /** * DOC: Asynchronous Hash Request Handle * * The &ahash_request data structure contains all pointers to data * required for the asynchronous cipher operation. This includes the cipher * handle (which can be used by multiple &ahash_request instances), pointer * to plaintext and the message digest output buffer, asynchronous callback * function, etc. It acts as a handle to the ahash_request_* API calls in a * similar way as ahash handle to the crypto_ahash_* API calls. */ /** * ahash_request_set_tfm() - update cipher handle reference in request * @req: request handle to be modified * @tfm: cipher handle that shall be added to the request handle * * Allow the caller to replace the existing ahash handle in the request * data structure with a different one. */ static inline void ahash_request_set_tfm(struct ahash_request *req, struct crypto_ahash *tfm) { req->base.tfm = crypto_ahash_tfm(tfm); } /** * ahash_request_alloc() - allocate request data structure * @tfm: cipher handle to be registered with the request * @gfp: memory allocation flag that is handed to kmalloc by the API call. * * Allocate the request data structure that must be used with the ahash * message digest API calls. During * the allocation, the provided ahash handle * is registered in the request data structure. * * Return: allocated request handle in case of success, or NULL if out of memory */ static inline struct ahash_request *ahash_request_alloc_noprof( struct crypto_ahash *tfm, gfp_t gfp) { struct ahash_request *req; req = kmalloc_noprof(sizeof(struct ahash_request) + crypto_ahash_reqsize(tfm), gfp); if (likely(req)) ahash_request_set_tfm(req, tfm); return req; } #define ahash_request_alloc(...) alloc_hooks(ahash_request_alloc_noprof(__VA_ARGS__)) /** * ahash_request_free() - zeroize and free the request data structure * @req: request data structure cipher handle to be freed */ static inline void ahash_request_free(struct ahash_request *req) { kfree_sensitive(req); } static inline struct ahash_request *ahash_request_cast( struct crypto_async_request *req) { return container_of(req, struct ahash_request, base); } /** * ahash_request_set_callback() - set asynchronous callback function * @req: request handle * @flags: specify zero or an ORing of the flags * CRYPTO_TFM_REQ_MAY_BACKLOG the request queue may back log and * increase the wait queue beyond the initial maximum size; * CRYPTO_TFM_REQ_MAY_SLEEP the request processing may sleep * @compl: callback function pointer to be registered with the request handle * @data: The data pointer refers to memory that is not used by the kernel * crypto API, but provided to the callback function for it to use. Here, * the caller can provide a reference to memory the callback function can * operate on. As the callback function is invoked asynchronously to the * related functionality, it may need to access data structures of the * related functionality which can be referenced using this pointer. The * callback function can access the memory via the "data" field in the * &crypto_async_request data structure provided to the callback function. * * This function allows setting the callback function that is triggered once * the cipher operation completes. * * The callback function is registered with the &ahash_request handle and * must comply with the following template:: * * void callback_function(struct crypto_async_request *req, int error) */ static inline void ahash_request_set_callback(struct ahash_request *req, u32 flags, crypto_completion_t compl, void *data) { u32 keep = CRYPTO_AHASH_REQ_VIRT; req->base.complete = compl; req->base.data = data; flags &= ~keep; req->base.flags &= keep; req->base.flags |= flags; crypto_reqchain_init(&req->base); } /** * ahash_request_set_crypt() - set data buffers * @req: ahash_request handle to be updated * @src: source scatter/gather list * @result: buffer that is filled with the message digest -- the caller must * ensure that the buffer has sufficient space by, for example, calling * crypto_ahash_digestsize() * @nbytes: number of bytes to process from the source scatter/gather list * * By using this call, the caller references the source scatter/gather list. * The source scatter/gather list points to the data the message digest is to * be calculated for. */ static inline void ahash_request_set_crypt(struct ahash_request *req, struct scatterlist *src, u8 *result, unsigned int nbytes) { req->src = src; req->nbytes = nbytes; req->result = result; req->base.flags &= ~CRYPTO_AHASH_REQ_VIRT; } /** * ahash_request_set_virt() - set virtual address data buffers * @req: ahash_request handle to be updated * @src: source virtual address * @result: buffer that is filled with the message digest -- the caller must * ensure that the buffer has sufficient space by, for example, calling * crypto_ahash_digestsize() * @nbytes: number of bytes to process from the source virtual address * * By using this call, the caller references the source virtual address. * The source virtual address points to the data the message digest is to * be calculated for. */ static inline void ahash_request_set_virt(struct ahash_request *req, const u8 *src, u8 *result, unsigned int nbytes) { req->svirt = src; req->nbytes = nbytes; req->result = result; req->base.flags |= CRYPTO_AHASH_REQ_VIRT; } static inline void ahash_request_chain(struct ahash_request *req, struct ahash_request *head) { crypto_request_chain(&req->base, &head->base); } /** * DOC: Synchronous Message Digest API * * The synchronous message digest API is used with the ciphers of type * CRYPTO_ALG_TYPE_SHASH (listed as type "shash" in /proc/crypto) * * The message digest API is able to maintain state information for the * caller. * * The synchronous message digest API can store user-related context in its * shash_desc request data structure. */ /** * crypto_alloc_shash() - allocate message digest handle * @alg_name: is the cra_name / name or cra_driver_name / driver name of the * message digest cipher * @type: specifies the type of the cipher * @mask: specifies the mask for the cipher * * Allocate a cipher handle for a message digest. The returned &struct * crypto_shash is the cipher handle that is required for any subsequent * API invocation for that message digest. * * Return: allocated cipher handle in case of success; IS_ERR() is true in case * of an error, PTR_ERR() returns the error code. */ struct crypto_shash *crypto_alloc_shash(const char *alg_name, u32 type, u32 mask); struct crypto_shash *crypto_clone_shash(struct crypto_shash *tfm); int crypto_has_shash(const char *alg_name, u32 type, u32 mask); static inline struct crypto_tfm *crypto_shash_tfm(struct crypto_shash *tfm) { return &tfm->base; } /** * crypto_free_shash() - zeroize and free the message digest handle * @tfm: cipher handle to be freed * * If @tfm is a NULL or error pointer, this function does nothing. */ static inline void crypto_free_shash(struct crypto_shash *tfm) { crypto_destroy_tfm(tfm, crypto_shash_tfm(tfm)); } static inline const char *crypto_shash_alg_name(struct crypto_shash *tfm) { return crypto_tfm_alg_name(crypto_shash_tfm(tfm)); } static inline const char *crypto_shash_driver_name(struct crypto_shash *tfm) { return crypto_tfm_alg_driver_name(crypto_shash_tfm(tfm)); } /** * crypto_shash_blocksize() - obtain block size for cipher * @tfm: cipher handle * * The block size for the message digest cipher referenced with the cipher * handle is returned. * * Return: block size of cipher */ static inline unsigned int crypto_shash_blocksize(struct crypto_shash *tfm) { return crypto_tfm_alg_blocksize(crypto_shash_tfm(tfm)); } static inline struct shash_alg *__crypto_shash_alg(struct crypto_alg *alg) { return container_of(alg, struct shash_alg, base); } static inline struct shash_alg *crypto_shash_alg(struct crypto_shash *tfm) { return __crypto_shash_alg(crypto_shash_tfm(tfm)->__crt_alg); } /** * crypto_shash_digestsize() - obtain message digest size * @tfm: cipher handle * * The size for the message digest created by the message digest cipher * referenced with the cipher handle is returned. * * Return: digest size of cipher */ static inline unsigned int crypto_shash_digestsize(struct crypto_shash *tfm) { return crypto_shash_alg(tfm)->digestsize; } static inline unsigned int crypto_shash_statesize(struct crypto_shash *tfm) { return crypto_shash_alg(tfm)->statesize; } static inline u32 crypto_shash_get_flags(struct crypto_shash *tfm) { return crypto_tfm_get_flags(crypto_shash_tfm(tfm)); } static inline void crypto_shash_set_flags(struct crypto_shash *tfm, u32 flags) { crypto_tfm_set_flags(crypto_shash_tfm(tfm), flags); } static inline void crypto_shash_clear_flags(struct crypto_shash *tfm, u32 flags) { crypto_tfm_clear_flags(crypto_shash_tfm(tfm), flags); } /** * crypto_shash_descsize() - obtain the operational state size * @tfm: cipher handle * * The size of the operational state the cipher needs during operation is * returned for the hash referenced with the cipher handle. This size is * required to calculate the memory requirements to allow the caller allocating * sufficient memory for operational state. * * The operational state is defined with struct shash_desc where the size of * that data structure is to be calculated as * sizeof(struct shash_desc) + crypto_shash_descsize(alg) * * Return: size of the operational state */ static inline unsigned int crypto_shash_descsize(struct crypto_shash *tfm) { return tfm->descsize; } static inline void *shash_desc_ctx(struct shash_desc *desc) { return desc->__ctx; } /** * crypto_shash_setkey() - set key for message digest * @tfm: cipher handle * @key: buffer holding the key * @keylen: length of the key in bytes * * The caller provided key is set for the keyed message digest cipher. The * cipher handle must point to a keyed message digest cipher in order for this * function to succeed. * * Context: Any context. * Return: 0 if the setting of the key was successful; < 0 if an error occurred */ int crypto_shash_setkey(struct crypto_shash *tfm, const u8 *key, unsigned int keylen); /** * crypto_shash_digest() - calculate message digest for buffer * @desc: see crypto_shash_final() * @data: see crypto_shash_update() * @len: see crypto_shash_update() * @out: see crypto_shash_final() * * This function is a "short-hand" for the function calls of crypto_shash_init, * crypto_shash_update and crypto_shash_final. The parameters have the same * meaning as discussed for those separate three functions. * * Context: Any context. * Return: 0 if the message digest creation was successful; < 0 if an error * occurred */ int crypto_shash_digest(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *out); /** * crypto_shash_tfm_digest() - calculate message digest for buffer * @tfm: hash transformation object * @data: see crypto_shash_update() * @len: see crypto_shash_update() * @out: see crypto_shash_final() * * This is a simplified version of crypto_shash_digest() for users who don't * want to allocate their own hash descriptor (shash_desc). Instead, * crypto_shash_tfm_digest() takes a hash transformation object (crypto_shash) * directly, and it allocates a hash descriptor on the stack internally. * Note that this stack allocation may be fairly large. * * Context: Any context. * Return: 0 on success; < 0 if an error occurred. */ int crypto_shash_tfm_digest(struct crypto_shash *tfm, const u8 *data, unsigned int len, u8 *out); /** * crypto_shash_export() - extract operational state for message digest * @desc: reference to the operational state handle whose state is exported * @out: output buffer of sufficient size that can hold the hash state * * This function exports the hash state of the operational state handle into the * caller-allocated output buffer out which must have sufficient size (e.g. by * calling crypto_shash_descsize). * * Context: Any context. * Return: 0 if the export creation was successful; < 0 if an error occurred */ int crypto_shash_export(struct shash_desc *desc, void *out); /** * crypto_shash_import() - import operational state * @desc: reference to the operational state handle the state imported into * @in: buffer holding the state * * This function imports the hash state into the operational state handle from * the input buffer. That buffer should have been generated with the * crypto_ahash_export function. * * Context: Any context. * Return: 0 if the import was successful; < 0 if an error occurred */ int crypto_shash_import(struct shash_desc *desc, const void *in); /** * crypto_shash_init() - (re)initialize message digest * @desc: operational state handle that is already filled * * The call (re-)initializes the message digest referenced by the * operational state handle. Any potentially existing state created by * previous operations is discarded. * * Context: Any context. * Return: 0 if the message digest initialization was successful; < 0 if an * error occurred */ static inline int crypto_shash_init(struct shash_desc *desc) { struct crypto_shash *tfm = desc->tfm; if (crypto_shash_get_flags(tfm) & CRYPTO_TFM_NEED_KEY) return -ENOKEY; return crypto_shash_alg(tfm)->init(desc); } /** * crypto_shash_update() - add data to message digest for processing * @desc: operational state handle that is already initialized * @data: input data to be added to the message digest * @len: length of the input data * * Updates the message digest state of the operational state handle. * * Context: Any context. * Return: 0 if the message digest update was successful; < 0 if an error * occurred */ int crypto_shash_update(struct shash_desc *desc, const u8 *data, unsigned int len); /** * crypto_shash_final() - calculate message digest * @desc: operational state handle that is already filled with data * @out: output buffer filled with the message digest * * Finalize the message digest operation and create the message digest * based on all data added to the cipher handle. The message digest is placed * into the output buffer. The caller must ensure that the output buffer is * large enough by using crypto_shash_digestsize. * * Context: Any context. * Return: 0 if the message digest creation was successful; < 0 if an error * occurred */ int crypto_shash_final(struct shash_desc *desc, u8 *out); /** * crypto_shash_finup() - calculate message digest of buffer * @desc: see crypto_shash_final() * @data: see crypto_shash_update() * @len: see crypto_shash_update() * @out: see crypto_shash_final() * * This function is a "short-hand" for the function calls of * crypto_shash_update and crypto_shash_final. The parameters have the same * meaning as discussed for those separate functions. * * Context: Any context. * Return: 0 if the message digest creation was successful; < 0 if an error * occurred */ int crypto_shash_finup(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *out); static inline void shash_desc_zero(struct shash_desc *desc) { memzero_explicit(desc, sizeof(*desc) + crypto_shash_descsize(desc->tfm)); } static inline int ahash_request_err(struct ahash_request *req) { return req->base.err; } static inline bool ahash_is_async(struct crypto_ahash *tfm) { return crypto_tfm_is_async(&tfm->base); } #endif /* _CRYPTO_HASH_H */
93 93 93 93 93 93 299 299 298 140 140 141 93 93 93 288 297 288 55 279 279 1 1 1 1 1 9 176 9 169 55 55 55 133 238 133 171 434 282 240 238 418 418 226 342 152 418 170 170 29 122 55 169 171 171 24 102 107 171 151 151 25 151 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 // SPDX-License-Identifier: GPL-2.0-or-later /* * net/switchdev/switchdev.c - Switch device API * Copyright (c) 2014-2015 Jiri Pirko <jiri@resnulli.us> * Copyright (c) 2014-2015 Scott Feldman <sfeldma@gmail.com> */ #include <linux/kernel.h> #include <linux/types.h> #include <linux/init.h> #include <linux/mutex.h> #include <linux/notifier.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/if_bridge.h> #include <linux/list.h> #include <linux/workqueue.h> #include <linux/if_vlan.h> #include <linux/rtnetlink.h> #include <net/switchdev.h> static bool switchdev_obj_eq(const struct switchdev_obj *a, const struct switchdev_obj *b) { const struct switchdev_obj_port_vlan *va, *vb; const struct switchdev_obj_port_mdb *ma, *mb; if (a->id != b->id || a->orig_dev != b->orig_dev) return false; switch (a->id) { case SWITCHDEV_OBJ_ID_PORT_VLAN: va = SWITCHDEV_OBJ_PORT_VLAN(a); vb = SWITCHDEV_OBJ_PORT_VLAN(b); return va->flags == vb->flags && va->vid == vb->vid && va->changed == vb->changed; case SWITCHDEV_OBJ_ID_PORT_MDB: case SWITCHDEV_OBJ_ID_HOST_MDB: ma = SWITCHDEV_OBJ_PORT_MDB(a); mb = SWITCHDEV_OBJ_PORT_MDB(b); return ma->vid == mb->vid && ether_addr_equal(ma->addr, mb->addr); default: break; } BUG(); } static LIST_HEAD(deferred); static DEFINE_SPINLOCK(deferred_lock); typedef void switchdev_deferred_func_t(struct net_device *dev, const void *data); struct switchdev_deferred_item { struct list_head list; struct net_device *dev; netdevice_tracker dev_tracker; switchdev_deferred_func_t *func; unsigned long data[]; }; static struct switchdev_deferred_item *switchdev_deferred_dequeue(void) { struct switchdev_deferred_item *dfitem; spin_lock_bh(&deferred_lock); if (list_empty(&deferred)) { dfitem = NULL; goto unlock; } dfitem = list_first_entry(&deferred, struct switchdev_deferred_item, list); list_del(&dfitem->list); unlock: spin_unlock_bh(&deferred_lock); return dfitem; } /** * switchdev_deferred_process - Process ops in deferred queue * * Called to flush the ops currently queued in deferred ops queue. * rtnl_lock must be held. */ void switchdev_deferred_process(void) { struct switchdev_deferred_item *dfitem; ASSERT_RTNL(); while ((dfitem = switchdev_deferred_dequeue())) { dfitem->func(dfitem->dev, dfitem->data); netdev_put(dfitem->dev, &dfitem->dev_tracker); kfree(dfitem); } } EXPORT_SYMBOL_GPL(switchdev_deferred_process); static void switchdev_deferred_process_work(struct work_struct *work) { rtnl_lock(); switchdev_deferred_process(); rtnl_unlock(); } static DECLARE_WORK(deferred_process_work, switchdev_deferred_process_work); static int switchdev_deferred_enqueue(struct net_device *dev, const void *data, size_t data_len, switchdev_deferred_func_t *func) { struct switchdev_deferred_item *dfitem; dfitem = kmalloc(struct_size(dfitem, data, data_len), GFP_ATOMIC); if (!dfitem) return -ENOMEM; dfitem->dev = dev; dfitem->func = func; memcpy(dfitem->data, data, data_len); netdev_hold(dev, &dfitem->dev_tracker, GFP_ATOMIC); spin_lock_bh(&deferred_lock); list_add_tail(&dfitem->list, &deferred); spin_unlock_bh(&deferred_lock); schedule_work(&deferred_process_work); return 0; } static int switchdev_port_attr_notify(enum switchdev_notifier_type nt, struct net_device *dev, const struct switchdev_attr *attr, struct netlink_ext_ack *extack) { int err; int rc; struct switchdev_notifier_port_attr_info attr_info = { .attr = attr, .handled = false, }; rc = call_switchdev_blocking_notifiers(nt, dev, &attr_info.info, extack); err = notifier_to_errno(rc); if (err) { WARN_ON(!attr_info.handled); return err; } if (!attr_info.handled) return -EOPNOTSUPP; return 0; } static int switchdev_port_attr_set_now(struct net_device *dev, const struct switchdev_attr *attr, struct netlink_ext_ack *extack) { return switchdev_port_attr_notify(SWITCHDEV_PORT_ATTR_SET, dev, attr, extack); } static void switchdev_port_attr_set_deferred(struct net_device *dev, const void *data) { const struct switchdev_attr *attr = data; int err; err = switchdev_port_attr_set_now(dev, attr, NULL); if (err && err != -EOPNOTSUPP) netdev_err(dev, "failed (err=%d) to set attribute (id=%d)\n", err, attr->id); if (attr->complete) attr->complete(dev, err, attr->complete_priv); } static int switchdev_port_attr_set_defer(struct net_device *dev, const struct switchdev_attr *attr) { return switchdev_deferred_enqueue(dev, attr, sizeof(*attr), switchdev_port_attr_set_deferred); } /** * switchdev_port_attr_set - Set port attribute * * @dev: port device * @attr: attribute to set * @extack: netlink extended ack, for error message propagation * * rtnl_lock must be held and must not be in atomic section, * in case SWITCHDEV_F_DEFER flag is not set. */ int switchdev_port_attr_set(struct net_device *dev, const struct switchdev_attr *attr, struct netlink_ext_ack *extack) { if (attr->flags & SWITCHDEV_F_DEFER) return switchdev_port_attr_set_defer(dev, attr); ASSERT_RTNL(); return switchdev_port_attr_set_now(dev, attr, extack); } EXPORT_SYMBOL_GPL(switchdev_port_attr_set); static size_t switchdev_obj_size(const struct switchdev_obj *obj) { switch (obj->id) { case SWITCHDEV_OBJ_ID_PORT_VLAN: return sizeof(struct switchdev_obj_port_vlan); case SWITCHDEV_OBJ_ID_PORT_MDB: return sizeof(struct switchdev_obj_port_mdb); case SWITCHDEV_OBJ_ID_HOST_MDB: return sizeof(struct switchdev_obj_port_mdb); default: BUG(); } return 0; } static int switchdev_port_obj_notify(enum switchdev_notifier_type nt, struct net_device *dev, const struct switchdev_obj *obj, struct netlink_ext_ack *extack) { int rc; int err; struct switchdev_notifier_port_obj_info obj_info = { .obj = obj, .handled = false, }; rc = call_switchdev_blocking_notifiers(nt, dev, &obj_info.info, extack); err = notifier_to_errno(rc); if (err) { WARN_ON(!obj_info.handled); return err; } if (!obj_info.handled) return -EOPNOTSUPP; return 0; } static void switchdev_obj_id_to_helpful_msg(struct net_device *dev, enum switchdev_obj_id obj_id, int err, bool add) { const char *action = add ? "add" : "del"; const char *reason = ""; const char *problem; const char *obj_str; switch (obj_id) { case SWITCHDEV_OBJ_ID_UNDEFINED: obj_str = "Undefined object"; problem = "Attempted operation is undefined, indicating a possible programming\n" "error.\n"; break; case SWITCHDEV_OBJ_ID_PORT_VLAN: obj_str = "VLAN entry"; problem = "Failure in VLAN settings on this port might disrupt network\n" "segmentation or traffic isolation, affecting network partitioning.\n"; break; case SWITCHDEV_OBJ_ID_PORT_MDB: obj_str = "Port Multicast Database entry"; problem = "Failure in updating the port's Multicast Database could lead to\n" "multicast forwarding issues.\n"; break; case SWITCHDEV_OBJ_ID_HOST_MDB: obj_str = "Host Multicast Database entry"; problem = "Failure in updating the host's Multicast Database may impact multicast\n" "group memberships or traffic delivery, affecting multicast\n" "communication.\n"; break; case SWITCHDEV_OBJ_ID_MRP: obj_str = "Media Redundancy Protocol configuration for port"; problem = "Failure to set MRP ring ID on this port prevents communication with\n" "the specified redundancy ring, resulting in an inability to engage\n" "in MRP-based network operations.\n"; break; case SWITCHDEV_OBJ_ID_RING_TEST_MRP: obj_str = "MRP Test Frame Operations for port"; problem = "Failure to generate/monitor MRP test frames may lead to inability to\n" "assess the ring's operational integrity and fault response, hindering\n" "proactive network management.\n"; break; case SWITCHDEV_OBJ_ID_RING_ROLE_MRP: obj_str = "MRP Ring Role Configuration"; problem = "Improper MRP ring role configuration may create conflicts in the ring,\n" "disrupting communication for all participants, or isolate the local\n" "system from the ring, hindering its ability to communicate with other\n" "participants.\n"; break; case SWITCHDEV_OBJ_ID_RING_STATE_MRP: obj_str = "MRP Ring State Configuration"; problem = "Failure to correctly set the MRP ring state can result in network\n" "loops or leave segments without communication. In a Closed state,\n" "it maintains loop prevention by blocking one MRM port, while an Open\n" "state activates in response to failures, changing port states to\n" "preserve network connectivity.\n"; break; case SWITCHDEV_OBJ_ID_IN_TEST_MRP: obj_str = "MRP_InTest Frame Generation Configuration"; problem = "Failure in managing MRP_InTest frame generation can misjudge the\n" "interconnection ring's state, leading to incorrect blocking or\n" "unblocking of the I/C port. This misconfiguration might result\n" "in unintended network loops or isolate critical network segments,\n" "compromising network integrity and reliability.\n"; break; case SWITCHDEV_OBJ_ID_IN_ROLE_MRP: obj_str = "Interconnection Ring Role Configuration"; problem = "Failure in incorrect assignment of interconnection ring roles\n" "(MIM/MIC) can impair the formation of the interconnection rings.\n"; break; case SWITCHDEV_OBJ_ID_IN_STATE_MRP: obj_str = "Interconnection Ring State Configuration"; problem = "Failure in updating the interconnection ring state can lead in\n" "case of Open state to incorrect blocking or unblocking of the\n" "I/C port, resulting in unintended network loops or isolation\n" "of critical network\n"; break; default: obj_str = "Unknown object"; problem = "Indicating a possible programming error.\n"; } switch (err) { case -ENOSPC: reason = "Current HW/SW setup lacks sufficient resources.\n"; break; } netdev_err(dev, "Failed to %s %s (object id=%d) with error: %pe (%d).\n%s%s\n", action, obj_str, obj_id, ERR_PTR(err), err, problem, reason); } static void switchdev_port_obj_add_deferred(struct net_device *dev, const void *data) { const struct switchdev_obj *obj = data; int err; ASSERT_RTNL(); err = switchdev_port_obj_notify(SWITCHDEV_PORT_OBJ_ADD, dev, obj, NULL); if (err && err != -EOPNOTSUPP) switchdev_obj_id_to_helpful_msg(dev, obj->id, err, true); if (obj->complete) obj->complete(dev, err, obj->complete_priv); } static int switchdev_port_obj_add_defer(struct net_device *dev, const struct switchdev_obj *obj) { return switchdev_deferred_enqueue(dev, obj, switchdev_obj_size(obj), switchdev_port_obj_add_deferred); } /** * switchdev_port_obj_add - Add port object * * @dev: port device * @obj: object to add * @extack: netlink extended ack * * rtnl_lock must be held and must not be in atomic section, * in case SWITCHDEV_F_DEFER flag is not set. */ int switchdev_port_obj_add(struct net_device *dev, const struct switchdev_obj *obj, struct netlink_ext_ack *extack) { if (obj->flags & SWITCHDEV_F_DEFER) return switchdev_port_obj_add_defer(dev, obj); ASSERT_RTNL(); return switchdev_port_obj_notify(SWITCHDEV_PORT_OBJ_ADD, dev, obj, extack); } EXPORT_SYMBOL_GPL(switchdev_port_obj_add); static int switchdev_port_obj_del_now(struct net_device *dev, const struct switchdev_obj *obj) { return switchdev_port_obj_notify(SWITCHDEV_PORT_OBJ_DEL, dev, obj, NULL); } static void switchdev_port_obj_del_deferred(struct net_device *dev, const void *data) { const struct switchdev_obj *obj = data; int err; err = switchdev_port_obj_del_now(dev, obj); if (err && err != -EOPNOTSUPP) switchdev_obj_id_to_helpful_msg(dev, obj->id, err, false); if (obj->complete) obj->complete(dev, err, obj->complete_priv); } static int switchdev_port_obj_del_defer(struct net_device *dev, const struct switchdev_obj *obj) { return switchdev_deferred_enqueue(dev, obj, switchdev_obj_size(obj), switchdev_port_obj_del_deferred); } /** * switchdev_port_obj_del - Delete port object * * @dev: port device * @obj: object to delete * * rtnl_lock must be held and must not be in atomic section, * in case SWITCHDEV_F_DEFER flag is not set. */ int switchdev_port_obj_del(struct net_device *dev, const struct switchdev_obj *obj) { if (obj->flags & SWITCHDEV_F_DEFER) return switchdev_port_obj_del_defer(dev, obj); ASSERT_RTNL(); return switchdev_port_obj_del_now(dev, obj); } EXPORT_SYMBOL_GPL(switchdev_port_obj_del); /** * switchdev_port_obj_act_is_deferred - Is object action pending? * * @dev: port device * @nt: type of action; add or delete * @obj: object to test * * Returns true if a deferred item is pending, which is * equivalent to the action @nt on an object @obj. * * rtnl_lock must be held. */ bool switchdev_port_obj_act_is_deferred(struct net_device *dev, enum switchdev_notifier_type nt, const struct switchdev_obj *obj) { struct switchdev_deferred_item *dfitem; bool found = false; ASSERT_RTNL(); spin_lock_bh(&deferred_lock); list_for_each_entry(dfitem, &deferred, list) { if (dfitem->dev != dev) continue; if ((dfitem->func == switchdev_port_obj_add_deferred && nt == SWITCHDEV_PORT_OBJ_ADD) || (dfitem->func == switchdev_port_obj_del_deferred && nt == SWITCHDEV_PORT_OBJ_DEL)) { if (switchdev_obj_eq((const void *)dfitem->data, obj)) { found = true; break; } } } spin_unlock_bh(&deferred_lock); return found; } EXPORT_SYMBOL_GPL(switchdev_port_obj_act_is_deferred); static ATOMIC_NOTIFIER_HEAD(switchdev_notif_chain); static RAW_NOTIFIER_HEAD(switchdev_blocking_notif_chain); /** * register_switchdev_notifier - Register notifier * @nb: notifier_block * * Register switch device notifier. */ int register_switchdev_notifier(struct notifier_block *nb) { return atomic_notifier_chain_register(&switchdev_notif_chain, nb); } EXPORT_SYMBOL_GPL(register_switchdev_notifier); /** * unregister_switchdev_notifier - Unregister notifier * @nb: notifier_block * * Unregister switch device notifier. */ int unregister_switchdev_notifier(struct notifier_block *nb) { return atomic_notifier_chain_unregister(&switchdev_notif_chain, nb); } EXPORT_SYMBOL_GPL(unregister_switchdev_notifier); /** * call_switchdev_notifiers - Call notifiers * @val: value passed unmodified to notifier function * @dev: port device * @info: notifier information data * @extack: netlink extended ack * Call all network notifier blocks. */ int call_switchdev_notifiers(unsigned long val, struct net_device *dev, struct switchdev_notifier_info *info, struct netlink_ext_ack *extack) { info->dev = dev; info->extack = extack; return atomic_notifier_call_chain(&switchdev_notif_chain, val, info); } EXPORT_SYMBOL_GPL(call_switchdev_notifiers); int register_switchdev_blocking_notifier(struct notifier_block *nb) { struct raw_notifier_head *chain = &switchdev_blocking_notif_chain; int err; rtnl_lock(); err = raw_notifier_chain_register(chain, nb); rtnl_unlock(); return err; } EXPORT_SYMBOL_GPL(register_switchdev_blocking_notifier); int unregister_switchdev_blocking_notifier(struct notifier_block *nb) { struct raw_notifier_head *chain = &switchdev_blocking_notif_chain; int err; rtnl_lock(); err = raw_notifier_chain_unregister(chain, nb); rtnl_unlock(); return err; } EXPORT_SYMBOL_GPL(unregister_switchdev_blocking_notifier); int call_switchdev_blocking_notifiers(unsigned long val, struct net_device *dev, struct switchdev_notifier_info *info, struct netlink_ext_ack *extack) { ASSERT_RTNL(); info->dev = dev; info->extack = extack; return raw_notifier_call_chain(&switchdev_blocking_notif_chain, val, info); } EXPORT_SYMBOL_GPL(call_switchdev_blocking_notifiers); struct switchdev_nested_priv { bool (*check_cb)(const struct net_device *dev); bool (*foreign_dev_check_cb)(const struct net_device *dev, const struct net_device *foreign_dev); const struct net_device *dev; struct net_device *lower_dev; }; static int switchdev_lower_dev_walk(struct net_device *lower_dev, struct netdev_nested_priv *priv) { struct switchdev_nested_priv *switchdev_priv = priv->data; bool (*foreign_dev_check_cb)(const struct net_device *dev, const struct net_device *foreign_dev); bool (*check_cb)(const struct net_device *dev); const struct net_device *dev; check_cb = switchdev_priv->check_cb; foreign_dev_check_cb = switchdev_priv->foreign_dev_check_cb; dev = switchdev_priv->dev; if (check_cb(lower_dev) && !foreign_dev_check_cb(lower_dev, dev)) { switchdev_priv->lower_dev = lower_dev; return 1; } return 0; } static struct net_device * switchdev_lower_dev_find_rcu(struct net_device *dev, bool (*check_cb)(const struct net_device *dev), bool (*foreign_dev_check_cb)(const struct net_device *dev, const struct net_device *foreign_dev)) { struct switchdev_nested_priv switchdev_priv = { .check_cb = check_cb, .foreign_dev_check_cb = foreign_dev_check_cb, .dev = dev, .lower_dev = NULL, }; struct netdev_nested_priv priv = { .data = &switchdev_priv, }; netdev_walk_all_lower_dev_rcu(dev, switchdev_lower_dev_walk, &priv); return switchdev_priv.lower_dev; } static struct net_device * switchdev_lower_dev_find(struct net_device *dev, bool (*check_cb)(const struct net_device *dev), bool (*foreign_dev_check_cb)(const struct net_device *dev, const struct net_device *foreign_dev)) { struct switchdev_nested_priv switchdev_priv = { .check_cb = check_cb, .foreign_dev_check_cb = foreign_dev_check_cb, .dev = dev, .lower_dev = NULL, }; struct netdev_nested_priv priv = { .data = &switchdev_priv, }; netdev_walk_all_lower_dev(dev, switchdev_lower_dev_walk, &priv); return switchdev_priv.lower_dev; } static int __switchdev_handle_fdb_event_to_device(struct net_device *dev, struct net_device *orig_dev, unsigned long event, const struct switchdev_notifier_fdb_info *fdb_info, bool (*check_cb)(const struct net_device *dev), bool (*foreign_dev_check_cb)(const struct net_device *dev, const struct net_device *foreign_dev), int (*mod_cb)(struct net_device *dev, struct net_device *orig_dev, unsigned long event, const void *ctx, const struct switchdev_notifier_fdb_info *fdb_info)) { const struct switchdev_notifier_info *info = &fdb_info->info; struct net_device *br, *lower_dev, *switchdev; struct list_head *iter; int err = -EOPNOTSUPP; if (check_cb(dev)) return mod_cb(dev, orig_dev, event, info->ctx, fdb_info); /* Recurse through lower interfaces in case the FDB entry is pointing * towards a bridge or a LAG device. */ netdev_for_each_lower_dev(dev, lower_dev, iter) { /* Do not propagate FDB entries across bridges */ if (netif_is_bridge_master(lower_dev)) continue; /* Bridge ports might be either us, or LAG interfaces * that we offload. */ if (!check_cb(lower_dev) && !switchdev_lower_dev_find_rcu(lower_dev, check_cb, foreign_dev_check_cb)) continue; err = __switchdev_handle_fdb_event_to_device(lower_dev, orig_dev, event, fdb_info, check_cb, foreign_dev_check_cb, mod_cb); if (err && err != -EOPNOTSUPP) return err; } /* Event is neither on a bridge nor a LAG. Check whether it is on an * interface that is in a bridge with us. */ br = netdev_master_upper_dev_get_rcu(dev); if (!br || !netif_is_bridge_master(br)) return 0; switchdev = switchdev_lower_dev_find_rcu(br, check_cb, foreign_dev_check_cb); if (!switchdev) return 0; if (!foreign_dev_check_cb(switchdev, dev)) return err; return __switchdev_handle_fdb_event_to_device(br, orig_dev, event, fdb_info, check_cb, foreign_dev_check_cb, mod_cb); } int switchdev_handle_fdb_event_to_device(struct net_device *dev, unsigned long event, const struct switchdev_notifier_fdb_info *fdb_info, bool (*check_cb)(const struct net_device *dev), bool (*foreign_dev_check_cb)(const struct net_device *dev, const struct net_device *foreign_dev), int (*mod_cb)(struct net_device *dev, struct net_device *orig_dev, unsigned long event, const void *ctx, const struct switchdev_notifier_fdb_info *fdb_info)) { int err; err = __switchdev_handle_fdb_event_to_device(dev, dev, event, fdb_info, check_cb, foreign_dev_check_cb, mod_cb); if (err == -EOPNOTSUPP) err = 0; return err; } EXPORT_SYMBOL_GPL(switchdev_handle_fdb_event_to_device); static int __switchdev_handle_port_obj_add(struct net_device *dev, struct switchdev_notifier_port_obj_info *port_obj_info, bool (*check_cb)(const struct net_device *dev), bool (*foreign_dev_check_cb)(const struct net_device *dev, const struct net_device *foreign_dev), int (*add_cb)(struct net_device *dev, const void *ctx, const struct switchdev_obj *obj, struct netlink_ext_ack *extack)) { struct switchdev_notifier_info *info = &port_obj_info->info; struct net_device *br, *lower_dev, *switchdev; struct netlink_ext_ack *extack; struct list_head *iter; int err = -EOPNOTSUPP; extack = switchdev_notifier_info_to_extack(info); if (check_cb(dev)) { err = add_cb(dev, info->ctx, port_obj_info->obj, extack); if (err != -EOPNOTSUPP) port_obj_info->handled = true; return err; } /* Switch ports might be stacked under e.g. a LAG. Ignore the * unsupported devices, another driver might be able to handle them. But * propagate to the callers any hard errors. * * If the driver does its own bookkeeping of stacked ports, it's not * necessary to go through this helper. */ netdev_for_each_lower_dev(dev, lower_dev, iter) { if (netif_is_bridge_master(lower_dev)) continue; /* When searching for switchdev interfaces that are neighbors * of foreign ones, and @dev is a bridge, do not recurse on the * foreign interface again, it was already visited. */ if (foreign_dev_check_cb && !check_cb(lower_dev) && !switchdev_lower_dev_find(lower_dev, check_cb, foreign_dev_check_cb)) continue; err = __switchdev_handle_port_obj_add(lower_dev, port_obj_info, check_cb, foreign_dev_check_cb, add_cb); if (err && err != -EOPNOTSUPP) return err; } /* Event is neither on a bridge nor a LAG. Check whether it is on an * interface that is in a bridge with us. */ if (!foreign_dev_check_cb) return err; br = netdev_master_upper_dev_get(dev); if (!br || !netif_is_bridge_master(br)) return err; switchdev = switchdev_lower_dev_find(br, check_cb, foreign_dev_check_cb); if (!switchdev) return err; if (!foreign_dev_check_cb(switchdev, dev)) return err; return __switchdev_handle_port_obj_add(br, port_obj_info, check_cb, foreign_dev_check_cb, add_cb); } /* Pass through a port object addition, if @dev passes @check_cb, or replicate * it towards all lower interfaces of @dev that pass @check_cb, if @dev is a * bridge or a LAG. */ int switchdev_handle_port_obj_add(struct net_device *dev, struct switchdev_notifier_port_obj_info *port_obj_info, bool (*check_cb)(const struct net_device *dev), int (*add_cb)(struct net_device *dev, const void *ctx, const struct switchdev_obj *obj, struct netlink_ext_ack *extack)) { int err; err = __switchdev_handle_port_obj_add(dev, port_obj_info, check_cb, NULL, add_cb); if (err == -EOPNOTSUPP) err = 0; return err; } EXPORT_SYMBOL_GPL(switchdev_handle_port_obj_add); /* Same as switchdev_handle_port_obj_add(), except if object is notified on a * @dev that passes @foreign_dev_check_cb, it is replicated towards all devices * that pass @check_cb and are in the same bridge as @dev. */ int switchdev_handle_port_obj_add_foreign(struct net_device *dev, struct switchdev_notifier_port_obj_info *port_obj_info, bool (*check_cb)(const struct net_device *dev), bool (*foreign_dev_check_cb)(const struct net_device *dev, const struct net_device *foreign_dev), int (*add_cb)(struct net_device *dev, const void *ctx, const struct switchdev_obj *obj, struct netlink_ext_ack *extack)) { int err; err = __switchdev_handle_port_obj_add(dev, port_obj_info, check_cb, foreign_dev_check_cb, add_cb); if (err == -EOPNOTSUPP) err = 0; return err; } EXPORT_SYMBOL_GPL(switchdev_handle_port_obj_add_foreign); static int __switchdev_handle_port_obj_del(struct net_device *dev, struct switchdev_notifier_port_obj_info *port_obj_info, bool (*check_cb)(const struct net_device *dev), bool (*foreign_dev_check_cb)(const struct net_device *dev, const struct net_device *foreign_dev), int (*del_cb)(struct net_device *dev, const void *ctx, const struct switchdev_obj *obj)) { struct switchdev_notifier_info *info = &port_obj_info->info; struct net_device *br, *lower_dev, *switchdev; struct list_head *iter; int err = -EOPNOTSUPP; if (check_cb(dev)) { err = del_cb(dev, info->ctx, port_obj_info->obj); if (err != -EOPNOTSUPP) port_obj_info->handled = true; return err; } /* Switch ports might be stacked under e.g. a LAG. Ignore the * unsupported devices, another driver might be able to handle them. But * propagate to the callers any hard errors. * * If the driver does its own bookkeeping of stacked ports, it's not * necessary to go through this helper. */ netdev_for_each_lower_dev(dev, lower_dev, iter) { if (netif_is_bridge_master(lower_dev)) continue; /* When searching for switchdev interfaces that are neighbors * of foreign ones, and @dev is a bridge, do not recurse on the * foreign interface again, it was already visited. */ if (foreign_dev_check_cb && !check_cb(lower_dev) && !switchdev_lower_dev_find(lower_dev, check_cb, foreign_dev_check_cb)) continue; err = __switchdev_handle_port_obj_del(lower_dev, port_obj_info, check_cb, foreign_dev_check_cb, del_cb); if (err && err != -EOPNOTSUPP) return err; } /* Event is neither on a bridge nor a LAG. Check whether it is on an * interface that is in a bridge with us. */ if (!foreign_dev_check_cb) return err; br = netdev_master_upper_dev_get(dev); if (!br || !netif_is_bridge_master(br)) return err; switchdev = switchdev_lower_dev_find(br, check_cb, foreign_dev_check_cb); if (!switchdev) return err; if (!foreign_dev_check_cb(switchdev, dev)) return err; return __switchdev_handle_port_obj_del(br, port_obj_info, check_cb, foreign_dev_check_cb, del_cb); } /* Pass through a port object deletion, if @dev passes @check_cb, or replicate * it towards all lower interfaces of @dev that pass @check_cb, if @dev is a * bridge or a LAG. */ int switchdev_handle_port_obj_del(struct net_device *dev, struct switchdev_notifier_port_obj_info *port_obj_info, bool (*check_cb)(const struct net_device *dev), int (*del_cb)(struct net_device *dev, const void *ctx, const struct switchdev_obj *obj)) { int err; err = __switchdev_handle_port_obj_del(dev, port_obj_info, check_cb, NULL, del_cb); if (err == -EOPNOTSUPP) err = 0; return err; } EXPORT_SYMBOL_GPL(switchdev_handle_port_obj_del); /* Same as switchdev_handle_port_obj_del(), except if object is notified on a * @dev that passes @foreign_dev_check_cb, it is replicated towards all devices * that pass @check_cb and are in the same bridge as @dev. */ int switchdev_handle_port_obj_del_foreign(struct net_device *dev, struct switchdev_notifier_port_obj_info *port_obj_info, bool (*check_cb)(const struct net_device *dev), bool (*foreign_dev_check_cb)(const struct net_device *dev, const struct net_device *foreign_dev), int (*del_cb)(struct net_device *dev, const void *ctx, const struct switchdev_obj *obj)) { int err; err = __switchdev_handle_port_obj_del(dev, port_obj_info, check_cb, foreign_dev_check_cb, del_cb); if (err == -EOPNOTSUPP) err = 0; return err; } EXPORT_SYMBOL_GPL(switchdev_handle_port_obj_del_foreign); static int __switchdev_handle_port_attr_set(struct net_device *dev, struct switchdev_notifier_port_attr_info *port_attr_info, bool (*check_cb)(const struct net_device *dev), int (*set_cb)(struct net_device *dev, const void *ctx, const struct switchdev_attr *attr, struct netlink_ext_ack *extack)) { struct switchdev_notifier_info *info = &port_attr_info->info; struct netlink_ext_ack *extack; struct net_device *lower_dev; struct list_head *iter; int err = -EOPNOTSUPP; extack = switchdev_notifier_info_to_extack(info); if (check_cb(dev)) { err = set_cb(dev, info->ctx, port_attr_info->attr, extack); if (err != -EOPNOTSUPP) port_attr_info->handled = true; return err; } /* Switch ports might be stacked under e.g. a LAG. Ignore the * unsupported devices, another driver might be able to handle them. But * propagate to the callers any hard errors. * * If the driver does its own bookkeeping of stacked ports, it's not * necessary to go through this helper. */ netdev_for_each_lower_dev(dev, lower_dev, iter) { if (netif_is_bridge_master(lower_dev)) continue; err = __switchdev_handle_port_attr_set(lower_dev, port_attr_info, check_cb, set_cb); if (err && err != -EOPNOTSUPP) return err; } return err; } int switchdev_handle_port_attr_set(struct net_device *dev, struct switchdev_notifier_port_attr_info *port_attr_info, bool (*check_cb)(const struct net_device *dev), int (*set_cb)(struct net_device *dev, const void *ctx, const struct switchdev_attr *attr, struct netlink_ext_ack *extack)) { int err; err = __switchdev_handle_port_attr_set(dev, port_attr_info, check_cb, set_cb); if (err == -EOPNOTSUPP) err = 0; return err; } EXPORT_SYMBOL_GPL(switchdev_handle_port_attr_set); int switchdev_bridge_port_offload(struct net_device *brport_dev, struct net_device *dev, const void *ctx, struct notifier_block *atomic_nb, struct notifier_block *blocking_nb, bool tx_fwd_offload, struct netlink_ext_ack *extack) { struct switchdev_notifier_brport_info brport_info = { .brport = { .dev = dev, .ctx = ctx, .atomic_nb = atomic_nb, .blocking_nb = blocking_nb, .tx_fwd_offload = tx_fwd_offload, }, }; int err; ASSERT_RTNL(); err = call_switchdev_blocking_notifiers(SWITCHDEV_BRPORT_OFFLOADED, brport_dev, &brport_info.info, extack); return notifier_to_errno(err); } EXPORT_SYMBOL_GPL(switchdev_bridge_port_offload); void switchdev_bridge_port_unoffload(struct net_device *brport_dev, const void *ctx, struct notifier_block *atomic_nb, struct notifier_block *blocking_nb) { struct switchdev_notifier_brport_info brport_info = { .brport = { .ctx = ctx, .atomic_nb = atomic_nb, .blocking_nb = blocking_nb, }, }; ASSERT_RTNL(); call_switchdev_blocking_notifiers(SWITCHDEV_BRPORT_UNOFFLOADED, brport_dev, &brport_info.info, NULL); } EXPORT_SYMBOL_GPL(switchdev_bridge_port_unoffload); int switchdev_bridge_port_replay(struct net_device *brport_dev, struct net_device *dev, const void *ctx, struct notifier_block *atomic_nb, struct notifier_block *blocking_nb, struct netlink_ext_ack *extack) { struct switchdev_notifier_brport_info brport_info = { .brport = { .dev = dev, .ctx = ctx, .atomic_nb = atomic_nb, .blocking_nb = blocking_nb, }, }; int err; ASSERT_RTNL(); err = call_switchdev_blocking_notifiers(SWITCHDEV_BRPORT_REPLAY, brport_dev, &brport_info.info, extack); return notifier_to_errno(err); } EXPORT_SYMBOL_GPL(switchdev_bridge_port_replay);
69 6 69 7 36 56 194 2 2 2 162 112 9 128 112 4 151 2 35 561 478 478 477 264 343 344 178 326 179 29 102 297 363 222 85 210 298 89 82 297 195 195 120 41 120 120 120 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 // SPDX-License-Identifier: GPL-2.0-only /* * linux/fs/fat/misc.c * * Written 1992,1993 by Werner Almesberger * 22/11/2000 - Fixed fat_date_unix2dos for dates earlier than 01/01/1980 * and date_dos2unix for date==0 by Igor Zhbanov(bsg@uniyar.ac.ru) */ #include "fat.h" #include <linux/iversion.h> /* * fat_fs_error reports a file system problem that might indicate fa data * corruption/inconsistency. Depending on 'errors' mount option the * panic() is called, or error message is printed FAT and nothing is done, * or filesystem is remounted read-only (default behavior). * In case the file system is remounted read-only, it can be made writable * again by remounting it. */ void __fat_fs_error(struct super_block *sb, int report, const char *fmt, ...) { struct fat_mount_options *opts = &MSDOS_SB(sb)->options; va_list args; struct va_format vaf; if (report) { va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; fat_msg(sb, KERN_ERR, "error, %pV", &vaf); va_end(args); } if (opts->errors == FAT_ERRORS_PANIC) panic("FAT-fs (%s): fs panic from previous error\n", sb->s_id); else if (opts->errors == FAT_ERRORS_RO && !sb_rdonly(sb)) { sb->s_flags |= SB_RDONLY; fat_msg(sb, KERN_ERR, "Filesystem has been set read-only"); } } EXPORT_SYMBOL_GPL(__fat_fs_error); /** * _fat_msg() - Print a preformatted FAT message based on a superblock. * @sb: A pointer to a &struct super_block * @level: A Kernel printk level constant * @fmt: The printf-style format string to print. * * Everything that is not fat_fs_error() should be fat_msg(). * * fat_msg() wraps _fat_msg() for printk indexing. */ void _fat_msg(struct super_block *sb, const char *level, const char *fmt, ...) { struct va_format vaf; va_list args; va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; _printk(FAT_PRINTK_PREFIX "%pV\n", level, sb->s_id, &vaf); va_end(args); } /* Flushes the number of free clusters on FAT32 */ /* XXX: Need to write one per FSINFO block. Currently only writes 1 */ int fat_clusters_flush(struct super_block *sb) { struct msdos_sb_info *sbi = MSDOS_SB(sb); struct buffer_head *bh; struct fat_boot_fsinfo *fsinfo; if (!is_fat32(sbi)) return 0; bh = sb_bread(sb, sbi->fsinfo_sector); if (bh == NULL) { fat_msg(sb, KERN_ERR, "bread failed in fat_clusters_flush"); return -EIO; } fsinfo = (struct fat_boot_fsinfo *)bh->b_data; /* Sanity check */ if (!IS_FSINFO(fsinfo)) { fat_msg(sb, KERN_ERR, "Invalid FSINFO signature: " "0x%08x, 0x%08x (sector = %lu)", le32_to_cpu(fsinfo->signature1), le32_to_cpu(fsinfo->signature2), sbi->fsinfo_sector); } else { if (sbi->free_clusters != -1) fsinfo->free_clusters = cpu_to_le32(sbi->free_clusters); if (sbi->prev_free != -1) fsinfo->next_cluster = cpu_to_le32(sbi->prev_free); mark_buffer_dirty(bh); } brelse(bh); return 0; } /* * fat_chain_add() adds a new cluster to the chain of clusters represented * by inode. */ int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster) { struct super_block *sb = inode->i_sb; struct msdos_sb_info *sbi = MSDOS_SB(sb); int ret, new_fclus, last; /* * We must locate the last cluster of the file to add this new * one (new_dclus) to the end of the link list (the FAT). */ last = new_fclus = 0; if (MSDOS_I(inode)->i_start) { int fclus, dclus; ret = fat_get_cluster(inode, FAT_ENT_EOF, &fclus, &dclus); if (ret < 0) return ret; new_fclus = fclus + 1; last = dclus; } /* add new one to the last of the cluster chain */ if (last) { struct fat_entry fatent; fatent_init(&fatent); ret = fat_ent_read(inode, &fatent, last); if (ret >= 0) { int wait = inode_needs_sync(inode); ret = fat_ent_write(inode, &fatent, new_dclus, wait); fatent_brelse(&fatent); } if (ret < 0) return ret; /* * FIXME:Although we can add this cache, fat_cache_add() is * assuming to be called after linear search with fat_cache_id. */ // fat_cache_add(inode, new_fclus, new_dclus); } else { MSDOS_I(inode)->i_start = new_dclus; MSDOS_I(inode)->i_logstart = new_dclus; /* * Since generic_write_sync() synchronizes regular files later, * we sync here only directories. */ if (S_ISDIR(inode->i_mode) && IS_DIRSYNC(inode)) { ret = fat_sync_inode(inode); if (ret) return ret; } else mark_inode_dirty(inode); } if (new_fclus != (inode->i_blocks >> (sbi->cluster_bits - 9))) { fat_fs_error(sb, "clusters badly computed (%d != %llu)", new_fclus, (llu)(inode->i_blocks >> (sbi->cluster_bits - 9))); fat_cache_inval_inode(inode); } inode->i_blocks += nr_cluster << (sbi->cluster_bits - 9); return 0; } /* * The epoch of FAT timestamp is 1980. * : bits : value * date: 0 - 4: day (1 - 31) * date: 5 - 8: month (1 - 12) * date: 9 - 15: year (0 - 127) from 1980 * time: 0 - 4: sec (0 - 29) 2sec counts * time: 5 - 10: min (0 - 59) * time: 11 - 15: hour (0 - 23) */ #define SECS_PER_MIN 60 #define SECS_PER_HOUR (60 * 60) #define SECS_PER_DAY (SECS_PER_HOUR * 24) /* days between 1.1.70 and 1.1.80 (2 leap days) */ #define DAYS_DELTA (365 * 10 + 2) /* 120 (2100 - 1980) isn't leap year */ #define YEAR_2100 120 #define IS_LEAP_YEAR(y) (!((y) & 3) && (y) != YEAR_2100) /* Linear day numbers of the respective 1sts in non-leap years. */ static long days_in_year[] = { /* Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec */ 0, 0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334, 0, 0, 0, }; static inline int fat_tz_offset(const struct msdos_sb_info *sbi) { return (sbi->options.tz_set ? -sbi->options.time_offset : sys_tz.tz_minuteswest) * SECS_PER_MIN; } /* Convert a FAT time/date pair to a UNIX date (seconds since 1 1 70). */ void fat_time_fat2unix(struct msdos_sb_info *sbi, struct timespec64 *ts, __le16 __time, __le16 __date, u8 time_cs) { u16 time = le16_to_cpu(__time), date = le16_to_cpu(__date); time64_t second; long day, leap_day, month, year; year = date >> 9; month = max(1, (date >> 5) & 0xf); day = max(1, date & 0x1f) - 1; leap_day = (year + 3) / 4; if (year > YEAR_2100) /* 2100 isn't leap year */ leap_day--; if (IS_LEAP_YEAR(year) && month > 2) leap_day++; second = (time & 0x1f) << 1; second += ((time >> 5) & 0x3f) * SECS_PER_MIN; second += (time >> 11) * SECS_PER_HOUR; second += (time64_t)(year * 365 + leap_day + days_in_year[month] + day + DAYS_DELTA) * SECS_PER_DAY; second += fat_tz_offset(sbi); if (time_cs) { ts->tv_sec = second + (time_cs / 100); ts->tv_nsec = (time_cs % 100) * 10000000; } else { ts->tv_sec = second; ts->tv_nsec = 0; } } /* Export fat_time_fat2unix() for the fat_test KUnit tests. */ EXPORT_SYMBOL_GPL(fat_time_fat2unix); /* Convert linear UNIX date to a FAT time/date pair. */ void fat_time_unix2fat(struct msdos_sb_info *sbi, struct timespec64 *ts, __le16 *time, __le16 *date, u8 *time_cs) { struct tm tm; time64_to_tm(ts->tv_sec, -fat_tz_offset(sbi), &tm); /* FAT can only support year between 1980 to 2107 */ if (tm.tm_year < 1980 - 1900) { *time = 0; *date = cpu_to_le16((0 << 9) | (1 << 5) | 1); if (time_cs) *time_cs = 0; return; } if (tm.tm_year > 2107 - 1900) { *time = cpu_to_le16((23 << 11) | (59 << 5) | 29); *date = cpu_to_le16((127 << 9) | (12 << 5) | 31); if (time_cs) *time_cs = 199; return; } /* from 1900 -> from 1980 */ tm.tm_year -= 80; /* 0~11 -> 1~12 */ tm.tm_mon++; /* 0~59 -> 0~29(2sec counts) */ tm.tm_sec >>= 1; *time = cpu_to_le16(tm.tm_hour << 11 | tm.tm_min << 5 | tm.tm_sec); *date = cpu_to_le16(tm.tm_year << 9 | tm.tm_mon << 5 | tm.tm_mday); if (time_cs) *time_cs = (ts->tv_sec & 1) * 100 + ts->tv_nsec / 10000000; } EXPORT_SYMBOL_GPL(fat_time_unix2fat); static inline struct timespec64 fat_timespec64_trunc_2secs(struct timespec64 ts) { return (struct timespec64){ ts.tv_sec & ~1ULL, 0 }; } /* * truncate atime to 24 hour granularity (00:00:00 in local timezone) */ struct timespec64 fat_truncate_atime(const struct msdos_sb_info *sbi, const struct timespec64 *ts) { /* to localtime */ time64_t seconds = ts->tv_sec - fat_tz_offset(sbi); s32 remainder; div_s64_rem(seconds, SECS_PER_DAY, &remainder); /* to day boundary, and back to unix time */ seconds = seconds + fat_tz_offset(sbi) - remainder; return (struct timespec64){ seconds, 0 }; } /* * truncate mtime to 2 second granularity */ struct timespec64 fat_truncate_mtime(const struct msdos_sb_info *sbi, const struct timespec64 *ts) { return fat_timespec64_trunc_2secs(*ts); } /* * truncate the various times with appropriate granularity: * all times in root node are always 0 */ int fat_truncate_time(struct inode *inode, struct timespec64 *now, int flags) { struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb); struct timespec64 ts; if (inode->i_ino == MSDOS_ROOT_INO) return 0; if (now == NULL) { now = &ts; ts = current_time(inode); } if (flags & S_ATIME) inode_set_atime_to_ts(inode, fat_truncate_atime(sbi, now)); /* * ctime and mtime share the same on-disk field, and should be * identical in memory. all mtime updates will be applied to ctime, * but ctime updates are ignored. */ if (flags & S_MTIME) inode_set_mtime_to_ts(inode, inode_set_ctime_to_ts(inode, fat_truncate_mtime(sbi, now))); return 0; } EXPORT_SYMBOL_GPL(fat_truncate_time); int fat_update_time(struct inode *inode, int flags) { int dirty_flags = 0; if (inode->i_ino == MSDOS_ROOT_INO) return 0; if (flags & (S_ATIME | S_CTIME | S_MTIME)) { fat_truncate_time(inode, NULL, flags); if (inode->i_sb->s_flags & SB_LAZYTIME) dirty_flags |= I_DIRTY_TIME; else dirty_flags |= I_DIRTY_SYNC; } __mark_inode_dirty(inode, dirty_flags); return 0; } EXPORT_SYMBOL_GPL(fat_update_time); int fat_sync_bhs(struct buffer_head **bhs, int nr_bhs) { int i, err = 0; for (i = 0; i < nr_bhs; i++) write_dirty_buffer(bhs[i], 0); for (i = 0; i < nr_bhs; i++) { wait_on_buffer(bhs[i]); if (!err && !buffer_uptodate(bhs[i])) err = -EIO; } return err; }
92 92 182 47 47 47 92 1 86 220 22 41 49 92 130 41 161 2 23 10 20 64 110 138 138 138 54 4 47 5 7 48 1 49 49 49 49 54 11 49 49 160 161 161 24 156 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 // SPDX-License-Identifier: GPL-2.0 /* * Tty port functions */ #include <linux/types.h> #include <linux/errno.h> #include <linux/tty.h> #include <linux/tty_driver.h> #include <linux/tty_flip.h> #include <linux/serial.h> #include <linux/timer.h> #include <linux/string.h> #include <linux/slab.h> #include <linux/sched/signal.h> #include <linux/wait.h> #include <linux/bitops.h> #include <linux/delay.h> #include <linux/module.h> #include <linux/serdev.h> #include "tty.h" static size_t tty_port_default_receive_buf(struct tty_port *port, const u8 *p, const u8 *f, size_t count) { struct tty_struct *tty; struct tty_ldisc *ld; tty = READ_ONCE(port->itty); if (!tty) return 0; ld = tty_ldisc_ref(tty); if (!ld) return 0; count = tty_ldisc_receive_buf(ld, p, f, count); tty_ldisc_deref(ld); return count; } static void tty_port_default_lookahead_buf(struct tty_port *port, const u8 *p, const u8 *f, size_t count) { struct tty_struct *tty; struct tty_ldisc *ld; tty = READ_ONCE(port->itty); if (!tty) return; ld = tty_ldisc_ref(tty); if (!ld) return; if (ld->ops->lookahead_buf) ld->ops->lookahead_buf(ld->tty, p, f, count); tty_ldisc_deref(ld); } static void tty_port_default_wakeup(struct tty_port *port) { struct tty_struct *tty = tty_port_tty_get(port); if (tty) { tty_wakeup(tty); tty_kref_put(tty); } } const struct tty_port_client_operations tty_port_default_client_ops = { .receive_buf = tty_port_default_receive_buf, .lookahead_buf = tty_port_default_lookahead_buf, .write_wakeup = tty_port_default_wakeup, }; EXPORT_SYMBOL_GPL(tty_port_default_client_ops); /** * tty_port_init - initialize tty_port * @port: tty_port to initialize * * Initializes the state of struct tty_port. When a port was initialized using * this function, one has to destroy the port by tty_port_destroy(). Either * indirectly by using &tty_port refcounting (tty_port_put()) or directly if * refcounting is not used. */ void tty_port_init(struct tty_port *port) { memset(port, 0, sizeof(*port)); tty_buffer_init(port); init_waitqueue_head(&port->open_wait); init_waitqueue_head(&port->delta_msr_wait); mutex_init(&port->mutex); mutex_init(&port->buf_mutex); spin_lock_init(&port->lock); port->close_delay = (50 * HZ) / 100; port->closing_wait = (3000 * HZ) / 100; port->client_ops = &tty_port_default_client_ops; kref_init(&port->kref); } EXPORT_SYMBOL(tty_port_init); /** * tty_port_link_device - link tty and tty_port * @port: tty_port of the device * @driver: tty_driver for this device * @index: index of the tty * * Provide the tty layer with a link from a tty (specified by @index) to a * tty_port (@port). Use this only if neither tty_port_register_device() nor * tty_port_install() is used in the driver. If used, this has to be called * before tty_register_driver(). */ void tty_port_link_device(struct tty_port *port, struct tty_driver *driver, unsigned index) { if (WARN_ON(index >= driver->num)) return; driver->ports[index] = port; } EXPORT_SYMBOL_GPL(tty_port_link_device); /** * tty_port_register_device - register tty device * @port: tty_port of the device * @driver: tty_driver for this device * @index: index of the tty * @device: parent if exists, otherwise NULL * * It is the same as tty_register_device() except the provided @port is linked * to a concrete tty specified by @index. Use this or tty_port_install() (or * both). Call tty_port_link_device() as a last resort. */ struct device *tty_port_register_device(struct tty_port *port, struct tty_driver *driver, unsigned index, struct device *device) { return tty_port_register_device_attr(port, driver, index, device, NULL, NULL); } EXPORT_SYMBOL_GPL(tty_port_register_device); /** * tty_port_register_device_attr - register tty device * @port: tty_port of the device * @driver: tty_driver for this device * @index: index of the tty * @device: parent if exists, otherwise NULL * @drvdata: Driver data to be set to device. * @attr_grp: Attribute group to be set on device. * * It is the same as tty_register_device_attr() except the provided @port is * linked to a concrete tty specified by @index. Use this or tty_port_install() * (or both). Call tty_port_link_device() as a last resort. */ struct device *tty_port_register_device_attr(struct tty_port *port, struct tty_driver *driver, unsigned index, struct device *device, void *drvdata, const struct attribute_group **attr_grp) { tty_port_link_device(port, driver, index); return tty_register_device_attr(driver, index, device, drvdata, attr_grp); } EXPORT_SYMBOL_GPL(tty_port_register_device_attr); /** * tty_port_register_device_attr_serdev - register tty or serdev device * @port: tty_port of the device * @driver: tty_driver for this device * @index: index of the tty * @host: serial port hardware device * @parent: parent if exists, otherwise NULL * @drvdata: driver data for the device * @attr_grp: attribute group for the device * * Register a serdev or tty device depending on if the parent device has any * defined serdev clients or not. */ struct device *tty_port_register_device_attr_serdev(struct tty_port *port, struct tty_driver *driver, unsigned index, struct device *host, struct device *parent, void *drvdata, const struct attribute_group **attr_grp) { struct device *dev; tty_port_link_device(port, driver, index); dev = serdev_tty_port_register(port, host, parent, driver, index); if (PTR_ERR(dev) != -ENODEV) { /* Skip creating cdev if we registered a serdev device */ return dev; } return tty_register_device_attr(driver, index, parent, drvdata, attr_grp); } EXPORT_SYMBOL_GPL(tty_port_register_device_attr_serdev); /** * tty_port_register_device_serdev - register tty or serdev device * @port: tty_port of the device * @driver: tty_driver for this device * @index: index of the tty * @host: serial port hardware controller device * @parent: parent if exists, otherwise NULL * * Register a serdev or tty device depending on if the parent device has any * defined serdev clients or not. */ struct device *tty_port_register_device_serdev(struct tty_port *port, struct tty_driver *driver, unsigned index, struct device *host, struct device *parent) { return tty_port_register_device_attr_serdev(port, driver, index, host, parent, NULL, NULL); } EXPORT_SYMBOL_GPL(tty_port_register_device_serdev); /** * tty_port_unregister_device - deregister a tty or serdev device * @port: tty_port of the device * @driver: tty_driver for this device * @index: index of the tty * * If a tty or serdev device is registered with a call to * tty_port_register_device_serdev() then this function must be called when * the device is gone. */ void tty_port_unregister_device(struct tty_port *port, struct tty_driver *driver, unsigned index) { int ret; ret = serdev_tty_port_unregister(port); if (ret == 0) return; tty_unregister_device(driver, index); } EXPORT_SYMBOL_GPL(tty_port_unregister_device); int tty_port_alloc_xmit_buf(struct tty_port *port) { /* We may sleep in get_zeroed_page() */ mutex_lock(&port->buf_mutex); if (port->xmit_buf == NULL) { port->xmit_buf = (u8 *)get_zeroed_page(GFP_KERNEL); if (port->xmit_buf) kfifo_init(&port->xmit_fifo, port->xmit_buf, PAGE_SIZE); } mutex_unlock(&port->buf_mutex); if (port->xmit_buf == NULL) return -ENOMEM; return 0; } EXPORT_SYMBOL(tty_port_alloc_xmit_buf); void tty_port_free_xmit_buf(struct tty_port *port) { mutex_lock(&port->buf_mutex); free_page((unsigned long)port->xmit_buf); port->xmit_buf = NULL; INIT_KFIFO(port->xmit_fifo); mutex_unlock(&port->buf_mutex); } EXPORT_SYMBOL(tty_port_free_xmit_buf); /** * tty_port_destroy - destroy inited port * @port: tty port to be destroyed * * When a port was initialized using tty_port_init(), one has to destroy the * port by this function. Either indirectly by using &tty_port refcounting * (tty_port_put()) or directly if refcounting is not used. */ void tty_port_destroy(struct tty_port *port) { tty_buffer_cancel_work(port); tty_buffer_free_all(port); } EXPORT_SYMBOL(tty_port_destroy); static void tty_port_destructor(struct kref *kref) { struct tty_port *port = container_of(kref, struct tty_port, kref); /* check if last port ref was dropped before tty release */ if (WARN_ON(port->itty)) return; free_page((unsigned long)port->xmit_buf); tty_port_destroy(port); if (port->ops && port->ops->destruct) port->ops->destruct(port); else kfree(port); } /** * tty_port_put - drop a reference to tty_port * @port: port to drop a reference of (can be NULL) * * The final put will destroy and free up the @port using * @port->ops->destruct() hook, or using kfree() if not provided. */ void tty_port_put(struct tty_port *port) { if (port) kref_put(&port->kref, tty_port_destructor); } EXPORT_SYMBOL(tty_port_put); /** * tty_port_tty_get - get a tty reference * @port: tty port * * Return a refcount protected tty instance or %NULL if the port is not * associated with a tty (eg due to close or hangup). */ struct tty_struct *tty_port_tty_get(struct tty_port *port) { unsigned long flags; struct tty_struct *tty; spin_lock_irqsave(&port->lock, flags); tty = tty_kref_get(port->tty); spin_unlock_irqrestore(&port->lock, flags); return tty; } EXPORT_SYMBOL(tty_port_tty_get); /** * tty_port_tty_set - set the tty of a port * @port: tty port * @tty: the tty * * Associate the port and tty pair. Manages any internal refcounts. Pass %NULL * to deassociate a port. */ void tty_port_tty_set(struct tty_port *port, struct tty_struct *tty) { unsigned long flags; spin_lock_irqsave(&port->lock, flags); tty_kref_put(port->tty); port->tty = tty_kref_get(tty); spin_unlock_irqrestore(&port->lock, flags); } EXPORT_SYMBOL(tty_port_tty_set); /** * tty_port_shutdown - internal helper to shutdown the device * @port: tty port to be shut down * @tty: the associated tty * * It is used by tty_port_hangup() and tty_port_close(). Its task is to * shutdown the device if it was initialized (note consoles remain * functioning). It lowers DTR/RTS (if @tty has HUPCL set) and invokes * @port->ops->shutdown(). */ static void tty_port_shutdown(struct tty_port *port, struct tty_struct *tty) { mutex_lock(&port->mutex); if (port->console) goto out; if (tty_port_initialized(port)) { tty_port_set_initialized(port, false); /* * Drop DTR/RTS if HUPCL is set. This causes any attached * modem to hang up the line. */ if (tty && C_HUPCL(tty)) tty_port_lower_dtr_rts(port); if (port->ops->shutdown) port->ops->shutdown(port); } out: mutex_unlock(&port->mutex); } /** * tty_port_hangup - hangup helper * @port: tty port * * Perform port level tty hangup flag and count changes. Drop the tty * reference. * * Caller holds tty lock. */ void tty_port_hangup(struct tty_port *port) { struct tty_struct *tty; unsigned long flags; spin_lock_irqsave(&port->lock, flags); port->count = 0; tty = port->tty; if (tty) set_bit(TTY_IO_ERROR, &tty->flags); port->tty = NULL; spin_unlock_irqrestore(&port->lock, flags); tty_port_set_active(port, false); tty_port_shutdown(port, tty); tty_kref_put(tty); wake_up_interruptible(&port->open_wait); wake_up_interruptible(&port->delta_msr_wait); } EXPORT_SYMBOL(tty_port_hangup); /** * tty_port_tty_hangup - helper to hang up a tty * @port: tty port * @check_clocal: hang only ttys with %CLOCAL unset? */ void tty_port_tty_hangup(struct tty_port *port, bool check_clocal) { struct tty_struct *tty = tty_port_tty_get(port); if (tty && (!check_clocal || !C_CLOCAL(tty))) tty_hangup(tty); tty_kref_put(tty); } EXPORT_SYMBOL_GPL(tty_port_tty_hangup); /** * tty_port_tty_wakeup - helper to wake up a tty * @port: tty port */ void tty_port_tty_wakeup(struct tty_port *port) { port->client_ops->write_wakeup(port); } EXPORT_SYMBOL_GPL(tty_port_tty_wakeup); /** * tty_port_carrier_raised - carrier raised check * @port: tty port * * Wrapper for the carrier detect logic. For the moment this is used * to hide some internal details. This will eventually become entirely * internal to the tty port. */ bool tty_port_carrier_raised(struct tty_port *port) { if (port->ops->carrier_raised == NULL) return true; return port->ops->carrier_raised(port); } EXPORT_SYMBOL(tty_port_carrier_raised); /** * tty_port_raise_dtr_rts - Raise DTR/RTS * @port: tty port * * Wrapper for the DTR/RTS raise logic. For the moment this is used to hide * some internal details. This will eventually become entirely internal to the * tty port. */ void tty_port_raise_dtr_rts(struct tty_port *port) { if (port->ops->dtr_rts) port->ops->dtr_rts(port, true); } EXPORT_SYMBOL(tty_port_raise_dtr_rts); /** * tty_port_lower_dtr_rts - Lower DTR/RTS * @port: tty port * * Wrapper for the DTR/RTS raise logic. For the moment this is used to hide * some internal details. This will eventually become entirely internal to the * tty port. */ void tty_port_lower_dtr_rts(struct tty_port *port) { if (port->ops->dtr_rts) port->ops->dtr_rts(port, false); } EXPORT_SYMBOL(tty_port_lower_dtr_rts); /** * tty_port_block_til_ready - Waiting logic for tty open * @port: the tty port being opened * @tty: the tty device being bound * @filp: the file pointer of the opener or %NULL * * Implement the core POSIX/SuS tty behaviour when opening a tty device. * Handles: * * - hangup (both before and during) * - non blocking open * - rts/dtr/dcd * - signals * - port flags and counts * * The passed @port must implement the @port->ops->carrier_raised method if it * can do carrier detect and the @port->ops->dtr_rts method if it supports * software management of these lines. Note that the dtr/rts raise is done each * iteration as a hangup may have previously dropped them while we wait. * * Caller holds tty lock. * * Note: May drop and reacquire tty lock when blocking, so @tty and @port may * have changed state (eg., may have been hung up). */ int tty_port_block_til_ready(struct tty_port *port, struct tty_struct *tty, struct file *filp) { int do_clocal = 0, retval; unsigned long flags; DEFINE_WAIT(wait); /* if non-blocking mode is set we can pass directly to open unless * the port has just hung up or is in another error state. */ if (tty_io_error(tty)) { tty_port_set_active(port, true); return 0; } if (filp == NULL || (filp->f_flags & O_NONBLOCK)) { /* Indicate we are open */ if (C_BAUD(tty)) tty_port_raise_dtr_rts(port); tty_port_set_active(port, true); return 0; } if (C_CLOCAL(tty)) do_clocal = 1; /* Block waiting until we can proceed. We may need to wait for the * carrier, but we must also wait for any close that is in progress * before the next open may complete. */ retval = 0; /* The port lock protects the port counts */ spin_lock_irqsave(&port->lock, flags); port->count--; port->blocked_open++; spin_unlock_irqrestore(&port->lock, flags); while (1) { /* Indicate we are open */ if (C_BAUD(tty) && tty_port_initialized(port)) tty_port_raise_dtr_rts(port); prepare_to_wait(&port->open_wait, &wait, TASK_INTERRUPTIBLE); /* Check for a hangup or uninitialised port. * Return accordingly. */ if (tty_hung_up_p(filp) || !tty_port_initialized(port)) { if (port->flags & ASYNC_HUP_NOTIFY) retval = -EAGAIN; else retval = -ERESTARTSYS; break; } /* * Probe the carrier. For devices with no carrier detect * tty_port_carrier_raised will always return true. * Never ask drivers if CLOCAL is set, this causes troubles * on some hardware. */ if (do_clocal || tty_port_carrier_raised(port)) break; if (signal_pending(current)) { retval = -ERESTARTSYS; break; } tty_unlock(tty); schedule(); tty_lock(tty); } finish_wait(&port->open_wait, &wait); /* Update counts. A parallel hangup will have set count to zero and * we must not mess that up further. */ spin_lock_irqsave(&port->lock, flags); if (!tty_hung_up_p(filp)) port->count++; port->blocked_open--; spin_unlock_irqrestore(&port->lock, flags); if (retval == 0) tty_port_set_active(port, true); return retval; } EXPORT_SYMBOL(tty_port_block_til_ready); static void tty_port_drain_delay(struct tty_port *port, struct tty_struct *tty) { unsigned int bps = tty_get_baud_rate(tty); long timeout; if (bps > 1200) { timeout = (HZ * 10 * port->drain_delay) / bps; timeout = max_t(long, timeout, HZ / 10); } else { timeout = 2 * HZ; } schedule_timeout_interruptible(timeout); } /** * tty_port_close_start - helper for tty->ops->close, part 1/2 * @port: tty_port of the device * @tty: tty being closed * @filp: passed file pointer * * Decrements and checks open count. Flushes the port if this is the last * close. That means, dropping the data from the outpu buffer on the device and * waiting for sending logic to finish. The rest of close handling is performed * in tty_port_close_end(). * * Locking: Caller holds tty lock. * * Return: 1 if this is the last close, otherwise 0 */ int tty_port_close_start(struct tty_port *port, struct tty_struct *tty, struct file *filp) { unsigned long flags; if (tty_hung_up_p(filp)) return 0; spin_lock_irqsave(&port->lock, flags); if (tty->count == 1 && port->count != 1) { tty_warn(tty, "%s: tty->count = 1 port count = %d\n", __func__, port->count); port->count = 1; } if (--port->count < 0) { tty_warn(tty, "%s: bad port count (%d)\n", __func__, port->count); port->count = 0; } if (port->count) { spin_unlock_irqrestore(&port->lock, flags); return 0; } spin_unlock_irqrestore(&port->lock, flags); tty->closing = 1; if (tty_port_initialized(port)) { /* Don't block on a stalled port, just pull the chain */ if (tty->flow.tco_stopped) tty_driver_flush_buffer(tty); if (port->closing_wait != ASYNC_CLOSING_WAIT_NONE) tty_wait_until_sent(tty, port->closing_wait); if (port->drain_delay) tty_port_drain_delay(port, tty); } /* Flush the ldisc buffering */ tty_ldisc_flush(tty); /* Report to caller this is the last port reference */ return 1; } EXPORT_SYMBOL(tty_port_close_start); /** * tty_port_close_end - helper for tty->ops->close, part 2/2 * @port: tty_port of the device * @tty: tty being closed * * This is a continuation of the first part: tty_port_close_start(). This * should be called after turning off the device. It flushes the data from the * line discipline and delays the close by @port->close_delay. * * Locking: Caller holds tty lock. */ void tty_port_close_end(struct tty_port *port, struct tty_struct *tty) { unsigned long flags; tty_ldisc_flush(tty); tty->closing = 0; spin_lock_irqsave(&port->lock, flags); if (port->blocked_open) { spin_unlock_irqrestore(&port->lock, flags); if (port->close_delay) msleep_interruptible(jiffies_to_msecs(port->close_delay)); spin_lock_irqsave(&port->lock, flags); wake_up_interruptible(&port->open_wait); } spin_unlock_irqrestore(&port->lock, flags); tty_port_set_active(port, false); } EXPORT_SYMBOL(tty_port_close_end); /** * tty_port_close - generic tty->ops->close handler * @port: tty_port of the device * @tty: tty being closed * @filp: passed file pointer * * It is a generic helper to be used in driver's @tty->ops->close. It wraps a * sequence of tty_port_close_start(), tty_port_shutdown(), and * tty_port_close_end(). The latter two are called only if this is the last * close. See the respective functions for the details. * * Locking: Caller holds tty lock */ void tty_port_close(struct tty_port *port, struct tty_struct *tty, struct file *filp) { if (tty_port_close_start(port, tty, filp) == 0) return; tty_port_shutdown(port, tty); if (!port->console) set_bit(TTY_IO_ERROR, &tty->flags); tty_port_close_end(port, tty); tty_port_tty_set(port, NULL); } EXPORT_SYMBOL(tty_port_close); /** * tty_port_install - generic tty->ops->install handler * @port: tty_port of the device * @driver: tty_driver for this device * @tty: tty to be installed * * It is the same as tty_standard_install() except the provided @port is linked * to a concrete tty specified by @tty. Use this or tty_port_register_device() * (or both). Call tty_port_link_device() as a last resort. */ int tty_port_install(struct tty_port *port, struct tty_driver *driver, struct tty_struct *tty) { tty->port = port; return tty_standard_install(driver, tty); } EXPORT_SYMBOL_GPL(tty_port_install); /** * tty_port_open - generic tty->ops->open handler * @port: tty_port of the device * @tty: tty to be opened * @filp: passed file pointer * * It is a generic helper to be used in driver's @tty->ops->open. It activates * the devices using @port->ops->activate if not active already. And waits for * the device to be ready using tty_port_block_til_ready() (e.g. raises * DTR/CTS and waits for carrier). * * Note that @port->ops->shutdown is not called when @port->ops->activate * returns an error (on the contrary, @tty->ops->close is). * * Locking: Caller holds tty lock. * * Note: may drop and reacquire tty lock (in tty_port_block_til_ready()) so * @tty and @port may have changed state (eg., may be hung up now). */ int tty_port_open(struct tty_port *port, struct tty_struct *tty, struct file *filp) { spin_lock_irq(&port->lock); ++port->count; spin_unlock_irq(&port->lock); tty_port_tty_set(port, tty); /* * Do the device-specific open only if the hardware isn't * already initialized. Serialize open and shutdown using the * port mutex. */ mutex_lock(&port->mutex); if (!tty_port_initialized(port)) { clear_bit(TTY_IO_ERROR, &tty->flags); if (port->ops->activate) { int retval = port->ops->activate(port, tty); if (retval) { mutex_unlock(&port->mutex); return retval; } } tty_port_set_initialized(port, true); } mutex_unlock(&port->mutex); return tty_port_block_til_ready(port, tty, filp); } EXPORT_SYMBOL(tty_port_open);
46 34 47 13987 2178 2149 31 1469 1439 31 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 // SPDX-License-Identifier: GPL-2.0 #include <linux/err.h> #include <linux/bug.h> #include <linux/atomic.h> #include <linux/errseq.h> #include <linux/log2.h> /* * An errseq_t is a way of recording errors in one place, and allowing any * number of "subscribers" to tell whether it has changed since a previous * point where it was sampled. * * It's implemented as an unsigned 32-bit value. The low order bits are * designated to hold an error code (between 0 and -MAX_ERRNO). The upper bits * are used as a counter. This is done with atomics instead of locking so that * these functions can be called from any context. * * The general idea is for consumers to sample an errseq_t value. That value * can later be used to tell whether any new errors have occurred since that * sampling was done. * * Note that there is a risk of collisions if new errors are being recorded * frequently, since we have so few bits to use as a counter. * * To mitigate this, one bit is used as a flag to tell whether the value has * been sampled since a new value was recorded. That allows us to avoid bumping * the counter if no one has sampled it since the last time an error was * recorded. * * A new errseq_t should always be zeroed out. A errseq_t value of all zeroes * is the special (but common) case where there has never been an error. An all * zero value thus serves as the "epoch" if one wishes to know whether there * has ever been an error set since it was first initialized. */ /* The low bits are designated for error code (max of MAX_ERRNO) */ #define ERRSEQ_SHIFT ilog2(MAX_ERRNO + 1) /* This bit is used as a flag to indicate whether the value has been seen */ #define ERRSEQ_SEEN (1 << ERRSEQ_SHIFT) /* The lowest bit of the counter */ #define ERRSEQ_CTR_INC (1 << (ERRSEQ_SHIFT + 1)) /** * errseq_set - set a errseq_t for later reporting * @eseq: errseq_t field that should be set * @err: error to set (must be between -1 and -MAX_ERRNO) * * This function sets the error in @eseq, and increments the sequence counter * if the last sequence was sampled at some point in the past. * * Any error set will always overwrite an existing error. * * Return: The previous value, primarily for debugging purposes. The * return value should not be used as a previously sampled value in later * calls as it will not have the SEEN flag set. */ errseq_t errseq_set(errseq_t *eseq, int err) { errseq_t cur, old; /* MAX_ERRNO must be able to serve as a mask */ BUILD_BUG_ON_NOT_POWER_OF_2(MAX_ERRNO + 1); /* * Ensure the error code actually fits where we want it to go. If it * doesn't then just throw a warning and don't record anything. We * also don't accept zero here as that would effectively clear a * previous error. */ old = READ_ONCE(*eseq); if (WARN(unlikely(err == 0 || (unsigned int)-err > MAX_ERRNO), "err = %d\n", err)) return old; for (;;) { errseq_t new; /* Clear out error bits and set new error */ new = (old & ~(MAX_ERRNO|ERRSEQ_SEEN)) | -err; /* Only increment if someone has looked at it */ if (old & ERRSEQ_SEEN) new += ERRSEQ_CTR_INC; /* If there would be no change, then call it done */ if (new == old) { cur = new; break; } /* Try to swap the new value into place */ cur = cmpxchg(eseq, old, new); /* * Call it success if we did the swap or someone else beat us * to it for the same value. */ if (likely(cur == old || cur == new)) break; /* Raced with an update, try again */ old = cur; } return cur; } EXPORT_SYMBOL(errseq_set); /** * errseq_sample() - Grab current errseq_t value. * @eseq: Pointer to errseq_t to be sampled. * * This function allows callers to initialise their errseq_t variable. * If the error has been "seen", new callers will not see an old error. * If there is an unseen error in @eseq, the caller of this function will * see it the next time it checks for an error. * * Context: Any context. * Return: The current errseq value. */ errseq_t errseq_sample(errseq_t *eseq) { errseq_t old = READ_ONCE(*eseq); /* If nobody has seen this error yet, then we can be the first. */ if (!(old & ERRSEQ_SEEN)) old = 0; return old; } EXPORT_SYMBOL(errseq_sample); /** * errseq_check() - Has an error occurred since a particular sample point? * @eseq: Pointer to errseq_t value to be checked. * @since: Previously-sampled errseq_t from which to check. * * Grab the value that eseq points to, and see if it has changed @since * the given value was sampled. The @since value is not advanced, so there * is no need to mark the value as seen. * * Return: The latest error set in the errseq_t or 0 if it hasn't changed. */ int errseq_check(errseq_t *eseq, errseq_t since) { errseq_t cur = READ_ONCE(*eseq); if (likely(cur == since)) return 0; return -(cur & MAX_ERRNO); } EXPORT_SYMBOL(errseq_check); /** * errseq_check_and_advance() - Check an errseq_t and advance to current value. * @eseq: Pointer to value being checked and reported. * @since: Pointer to previously-sampled errseq_t to check against and advance. * * Grab the eseq value, and see whether it matches the value that @since * points to. If it does, then just return 0. * * If it doesn't, then the value has changed. Set the "seen" flag, and try to * swap it into place as the new eseq value. Then, set that value as the new * "since" value, and return whatever the error portion is set to. * * Note that no locking is provided here for concurrent updates to the "since" * value. The caller must provide that if necessary. Because of this, callers * may want to do a lockless errseq_check before taking the lock and calling * this. * * Return: Negative errno if one has been stored, or 0 if no new error has * occurred. */ int errseq_check_and_advance(errseq_t *eseq, errseq_t *since) { int err = 0; errseq_t old, new; /* * Most callers will want to use the inline wrapper to check this, * so that the common case of no error is handled without needing * to take the lock that protects the "since" value. */ old = READ_ONCE(*eseq); if (old != *since) { /* * Set the flag and try to swap it into place if it has * changed. * * We don't care about the outcome of the swap here. If the * swap doesn't occur, then it has either been updated by a * writer who is altering the value in some way (updating * counter or resetting the error), or another reader who is * just setting the "seen" flag. Either outcome is OK, and we * can advance "since" and return an error based on what we * have. */ new = old | ERRSEQ_SEEN; if (new != old) cmpxchg(eseq, old, new); *since = new; err = -(new & MAX_ERRNO); } return err; } EXPORT_SYMBOL(errseq_check_and_advance);
45 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 // SPDX-License-Identifier: GPL-2.0-only /* * bitmap.c two-level bitmap (C) Peter T. Breuer (ptb@ot.uc3m.es) 2003 * * bitmap_create - sets up the bitmap structure * bitmap_destroy - destroys the bitmap structure * * additions, Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc.: * - added disk storage for bitmap * - changes to allow various bitmap chunk sizes */ /* * Still to do: * * flush after percent set rather than just time based. (maybe both). */ #include <linux/blkdev.h> #include <linux/module.h> #include <linux/errno.h> #include <linux/slab.h> #include <linux/init.h> #include <linux/timer.h> #include <linux/sched.h> #include <linux/list.h> #include <linux/file.h> #include <linux/mount.h> #include <linux/buffer_head.h> #include <linux/seq_file.h> #include <trace/events/block.h> #include "md.h" #include "md-bitmap.h" #include "md-cluster.h" #define BITMAP_MAJOR_LO 3 /* version 4 insists the bitmap is in little-endian order * with version 3, it is host-endian which is non-portable * Version 5 is currently set only for clustered devices */ #define BITMAP_MAJOR_HI 4 #define BITMAP_MAJOR_CLUSTERED 5 #define BITMAP_MAJOR_HOSTENDIAN 3 /* * in-memory bitmap: * * Use 16 bit block counters to track pending writes to each "chunk". * The 2 high order bits are special-purpose, the first is a flag indicating * whether a resync is needed. The second is a flag indicating whether a * resync is active. * This means that the counter is actually 14 bits: * * +--------+--------+------------------------------------------------+ * | resync | resync | counter | * | needed | active | | * | (0-1) | (0-1) | (0-16383) | * +--------+--------+------------------------------------------------+ * * The "resync needed" bit is set when: * a '1' bit is read from storage at startup. * a write request fails on some drives * a resync is aborted on a chunk with 'resync active' set * It is cleared (and resync-active set) when a resync starts across all drives * of the chunk. * * * The "resync active" bit is set when: * a resync is started on all drives, and resync_needed is set. * resync_needed will be cleared (as long as resync_active wasn't already set). * It is cleared when a resync completes. * * The counter counts pending write requests, plus the on-disk bit. * When the counter is '1' and the resync bits are clear, the on-disk * bit can be cleared as well, thus setting the counter to 0. * When we set a bit, or in the counter (to start a write), if the fields is * 0, we first set the disk bit and set the counter to 1. * * If the counter is 0, the on-disk bit is clear and the stripe is clean * Anything that dirties the stripe pushes the counter to 2 (at least) * and sets the on-disk bit (lazily). * If a periodic sweep find the counter at 2, it is decremented to 1. * If the sweep find the counter at 1, the on-disk bit is cleared and the * counter goes to zero. * * Also, we'll hijack the "map" pointer itself and use it as two 16 bit block * counters as a fallback when "page" memory cannot be allocated: * * Normal case (page memory allocated): * * page pointer (32-bit) * * [ ] ------+ * | * +-------> [ ][ ]..[ ] (4096 byte page == 2048 counters) * c1 c2 c2048 * * Hijacked case (page memory allocation failed): * * hijacked page pointer (32-bit) * * [ ][ ] (no page memory allocated) * counter #1 (16-bit) counter #2 (16-bit) * */ #define PAGE_BITS (PAGE_SIZE << 3) #define PAGE_BIT_SHIFT (PAGE_SHIFT + 3) #define NEEDED(x) (((bitmap_counter_t) x) & NEEDED_MASK) #define RESYNC(x) (((bitmap_counter_t) x) & RESYNC_MASK) #define COUNTER(x) (((bitmap_counter_t) x) & COUNTER_MAX) /* how many counters per page? */ #define PAGE_COUNTER_RATIO (PAGE_BITS / COUNTER_BITS) /* same, except a shift value for more efficient bitops */ #define PAGE_COUNTER_SHIFT (PAGE_BIT_SHIFT - COUNTER_BIT_SHIFT) /* same, except a mask value for more efficient bitops */ #define PAGE_COUNTER_MASK (PAGE_COUNTER_RATIO - 1) #define BITMAP_BLOCK_SHIFT 9 /* * bitmap structures: */ /* the in-memory bitmap is represented by bitmap_pages */ struct bitmap_page { /* * map points to the actual memory page */ char *map; /* * in emergencies (when map cannot be alloced), hijack the map * pointer and use it as two counters itself */ unsigned int hijacked:1; /* * If any counter in this page is '1' or '2' - and so could be * cleared then that page is marked as 'pending' */ unsigned int pending:1; /* * count of dirty bits on the page */ unsigned int count:30; }; /* the main bitmap structure - one per mddev */ struct bitmap { struct bitmap_counts { spinlock_t lock; struct bitmap_page *bp; /* total number of pages in the bitmap */ unsigned long pages; /* number of pages not yet allocated */ unsigned long missing_pages; /* chunksize = 2^chunkshift (for bitops) */ unsigned long chunkshift; /* total number of data chunks for the array */ unsigned long chunks; } counts; struct mddev *mddev; /* the md device that the bitmap is for */ __u64 events_cleared; int need_sync; struct bitmap_storage { /* backing disk file */ struct file *file; /* cached copy of the bitmap file superblock */ struct page *sb_page; unsigned long sb_index; /* list of cache pages for the file */ struct page **filemap; /* attributes associated filemap pages */ unsigned long *filemap_attr; /* number of pages in the file */ unsigned long file_pages; /* total bytes in the bitmap */ unsigned long bytes; } storage; unsigned long flags; int allclean; atomic_t behind_writes; /* highest actual value at runtime */ unsigned long behind_writes_used; /* * the bitmap daemon - periodically wakes up and sweeps the bitmap * file, cleaning up bits and flushing out pages to disk as necessary */ unsigned long daemon_lastrun; /* jiffies of last run */ /* * when we lasted called end_sync to update bitmap with resync * progress. */ unsigned long last_end_sync; /* pending writes to the bitmap file */ atomic_t pending_writes; wait_queue_head_t write_wait; wait_queue_head_t overflow_wait; wait_queue_head_t behind_wait; struct kernfs_node *sysfs_can_clear; /* slot offset for clustered env */ int cluster_slot; }; static int __bitmap_resize(struct bitmap *bitmap, sector_t blocks, int chunksize, bool init); static inline char *bmname(struct bitmap *bitmap) { return bitmap->mddev ? mdname(bitmap->mddev) : "mdX"; } static bool __bitmap_enabled(struct bitmap *bitmap) { return bitmap->storage.filemap && !test_bit(BITMAP_STALE, &bitmap->flags); } static bool bitmap_enabled(struct mddev *mddev) { struct bitmap *bitmap = mddev->bitmap; if (!bitmap) return false; return __bitmap_enabled(bitmap); } /* * check a page and, if necessary, allocate it (or hijack it if the alloc fails) * * 1) check to see if this page is allocated, if it's not then try to alloc * 2) if the alloc fails, set the page's hijacked flag so we'll use the * page pointer directly as a counter * * if we find our page, we increment the page's refcount so that it stays * allocated while we're using it */ static int md_bitmap_checkpage(struct bitmap_counts *bitmap, unsigned long page, int create, int no_hijack) __releases(bitmap->lock) __acquires(bitmap->lock) { unsigned char *mappage; WARN_ON_ONCE(page >= bitmap->pages); if (bitmap->bp[page].hijacked) /* it's hijacked, don't try to alloc */ return 0; if (bitmap->bp[page].map) /* page is already allocated, just return */ return 0; if (!create) return -ENOENT; /* this page has not been allocated yet */ spin_unlock_irq(&bitmap->lock); /* It is possible that this is being called inside a * prepare_to_wait/finish_wait loop from raid5c:make_request(). * In general it is not permitted to sleep in that context as it * can cause the loop to spin freely. * That doesn't apply here as we can only reach this point * once with any loop. * When this function completes, either bp[page].map or * bp[page].hijacked. In either case, this function will * abort before getting to this point again. So there is * no risk of a free-spin, and so it is safe to assert * that sleeping here is allowed. */ sched_annotate_sleep(); mappage = kzalloc(PAGE_SIZE, GFP_NOIO); spin_lock_irq(&bitmap->lock); if (mappage == NULL) { pr_debug("md/bitmap: map page allocation failed, hijacking\n"); /* We don't support hijack for cluster raid */ if (no_hijack) return -ENOMEM; /* failed - set the hijacked flag so that we can use the * pointer as a counter */ if (!bitmap->bp[page].map) bitmap->bp[page].hijacked = 1; } else if (bitmap->bp[page].map || bitmap->bp[page].hijacked) { /* somebody beat us to getting the page */ kfree(mappage); } else { /* no page was in place and we have one, so install it */ bitmap->bp[page].map = mappage; bitmap->missing_pages--; } return 0; } /* if page is completely empty, put it back on the free list, or dealloc it */ /* if page was hijacked, unmark the flag so it might get alloced next time */ /* Note: lock should be held when calling this */ static void md_bitmap_checkfree(struct bitmap_counts *bitmap, unsigned long page) { char *ptr; if (bitmap->bp[page].count) /* page is still busy */ return; /* page is no longer in use, it can be released */ if (bitmap->bp[page].hijacked) { /* page was hijacked, undo this now */ bitmap->bp[page].hijacked = 0; bitmap->bp[page].map = NULL; } else { /* normal case, free the page */ ptr = bitmap->bp[page].map; bitmap->bp[page].map = NULL; bitmap->missing_pages++; kfree(ptr); } } /* * bitmap file handling - read and write the bitmap file and its superblock */ /* * basic page I/O operations */ /* IO operations when bitmap is stored near all superblocks */ /* choose a good rdev and read the page from there */ static int read_sb_page(struct mddev *mddev, loff_t offset, struct page *page, unsigned long index, int size) { sector_t sector = mddev->bitmap_info.offset + offset + index * (PAGE_SIZE / SECTOR_SIZE); struct md_rdev *rdev; rdev_for_each(rdev, mddev) { u32 iosize = roundup(size, bdev_logical_block_size(rdev->bdev)); if (!test_bit(In_sync, &rdev->flags) || test_bit(Faulty, &rdev->flags) || test_bit(Bitmap_sync, &rdev->flags)) continue; if (sync_page_io(rdev, sector, iosize, page, REQ_OP_READ, true)) return 0; } return -EIO; } static struct md_rdev *next_active_rdev(struct md_rdev *rdev, struct mddev *mddev) { /* Iterate the disks of an mddev, using rcu to protect access to the * linked list, and raising the refcount of devices we return to ensure * they don't disappear while in use. * As devices are only added or removed when raid_disk is < 0 and * nr_pending is 0 and In_sync is clear, the entries we return will * still be in the same position on the list when we re-enter * list_for_each_entry_continue_rcu. * * Note that if entered with 'rdev == NULL' to start at the * beginning, we temporarily assign 'rdev' to an address which * isn't really an rdev, but which can be used by * list_for_each_entry_continue_rcu() to find the first entry. */ rcu_read_lock(); if (rdev == NULL) /* start at the beginning */ rdev = list_entry(&mddev->disks, struct md_rdev, same_set); else { /* release the previous rdev and start from there. */ rdev_dec_pending(rdev, mddev); } list_for_each_entry_continue_rcu(rdev, &mddev->disks, same_set) { if (rdev->raid_disk >= 0 && !test_bit(Faulty, &rdev->flags)) { /* this is a usable devices */ atomic_inc(&rdev->nr_pending); rcu_read_unlock(); return rdev; } } rcu_read_unlock(); return NULL; } static unsigned int optimal_io_size(struct block_device *bdev, unsigned int last_page_size, unsigned int io_size) { if (bdev_io_opt(bdev) > bdev_logical_block_size(bdev)) return roundup(last_page_size, bdev_io_opt(bdev)); return io_size; } static unsigned int bitmap_io_size(unsigned int io_size, unsigned int opt_size, loff_t start, loff_t boundary) { if (io_size != opt_size && start + opt_size / SECTOR_SIZE <= boundary) return opt_size; if (start + io_size / SECTOR_SIZE <= boundary) return io_size; /* Overflows boundary */ return 0; } static int __write_sb_page(struct md_rdev *rdev, struct bitmap *bitmap, unsigned long pg_index, struct page *page) { struct block_device *bdev; struct mddev *mddev = bitmap->mddev; struct bitmap_storage *store = &bitmap->storage; unsigned long num_pages = bitmap->storage.file_pages; unsigned int bitmap_limit = (num_pages - pg_index % num_pages) << PAGE_SHIFT; loff_t sboff, offset = mddev->bitmap_info.offset; sector_t ps = pg_index * PAGE_SIZE / SECTOR_SIZE; unsigned int size = PAGE_SIZE; unsigned int opt_size = PAGE_SIZE; sector_t doff; bdev = (rdev->meta_bdev) ? rdev->meta_bdev : rdev->bdev; /* we compare length (page numbers), not page offset. */ if ((pg_index - store->sb_index) == num_pages - 1) { unsigned int last_page_size = store->bytes & (PAGE_SIZE - 1); if (last_page_size == 0) last_page_size = PAGE_SIZE; size = roundup(last_page_size, bdev_logical_block_size(bdev)); opt_size = optimal_io_size(bdev, last_page_size, size); } sboff = rdev->sb_start + offset; doff = rdev->data_offset; /* Just make sure we aren't corrupting data or metadata */ if (mddev->external) { /* Bitmap could be anywhere. */ if (sboff + ps > doff && sboff < (doff + mddev->dev_sectors + PAGE_SIZE / SECTOR_SIZE)) return -EINVAL; } else if (offset < 0) { /* DATA BITMAP METADATA */ size = bitmap_io_size(size, opt_size, offset + ps, 0); if (size == 0) /* bitmap runs in to metadata */ return -EINVAL; if (doff + mddev->dev_sectors > sboff) /* data runs in to bitmap */ return -EINVAL; } else if (rdev->sb_start < rdev->data_offset) { /* METADATA BITMAP DATA */ size = bitmap_io_size(size, opt_size, sboff + ps, doff); if (size == 0) /* bitmap runs in to data */ return -EINVAL; } md_super_write(mddev, rdev, sboff + ps, (int)min(size, bitmap_limit), page); return 0; } static void write_sb_page(struct bitmap *bitmap, unsigned long pg_index, struct page *page, bool wait) { struct mddev *mddev = bitmap->mddev; do { struct md_rdev *rdev = NULL; while ((rdev = next_active_rdev(rdev, mddev)) != NULL) { if (__write_sb_page(rdev, bitmap, pg_index, page) < 0) { set_bit(BITMAP_WRITE_ERROR, &bitmap->flags); return; } } } while (wait && md_super_wait(mddev) < 0); } static void md_bitmap_file_kick(struct bitmap *bitmap); #ifdef CONFIG_MD_BITMAP_FILE static void write_file_page(struct bitmap *bitmap, struct page *page, int wait) { struct buffer_head *bh = page_buffers(page); while (bh && bh->b_blocknr) { atomic_inc(&bitmap->pending_writes); set_buffer_locked(bh); set_buffer_mapped(bh); submit_bh(REQ_OP_WRITE | REQ_SYNC, bh); bh = bh->b_this_page; } if (wait) wait_event(bitmap->write_wait, atomic_read(&bitmap->pending_writes) == 0); } static void end_bitmap_write(struct buffer_head *bh, int uptodate) { struct bitmap *bitmap = bh->b_private; if (!uptodate) set_bit(BITMAP_WRITE_ERROR, &bitmap->flags); if (atomic_dec_and_test(&bitmap->pending_writes)) wake_up(&bitmap->write_wait); } static void free_buffers(struct page *page) { struct buffer_head *bh; if (!PagePrivate(page)) return; bh = page_buffers(page); while (bh) { struct buffer_head *next = bh->b_this_page; free_buffer_head(bh); bh = next; } detach_page_private(page); put_page(page); } /* read a page from a file. * We both read the page, and attach buffers to the page to record the * address of each block (using bmap). These addresses will be used * to write the block later, completely bypassing the filesystem. * This usage is similar to how swap files are handled, and allows us * to write to a file with no concerns of memory allocation failing. */ static int read_file_page(struct file *file, unsigned long index, struct bitmap *bitmap, unsigned long count, struct page *page) { int ret = 0; struct inode *inode = file_inode(file); struct buffer_head *bh; sector_t block, blk_cur; unsigned long blocksize = i_blocksize(inode); pr_debug("read bitmap file (%dB @ %llu)\n", (int)PAGE_SIZE, (unsigned long long)index << PAGE_SHIFT); bh = alloc_page_buffers(page, blocksize); if (!bh) { ret = -ENOMEM; goto out; } attach_page_private(page, bh); blk_cur = index << (PAGE_SHIFT - inode->i_blkbits); while (bh) { block = blk_cur; if (count == 0) bh->b_blocknr = 0; else { ret = bmap(inode, &block); if (ret || !block) { ret = -EINVAL; bh->b_blocknr = 0; goto out; } bh->b_blocknr = block; bh->b_bdev = inode->i_sb->s_bdev; if (count < blocksize) count = 0; else count -= blocksize; bh->b_end_io = end_bitmap_write; bh->b_private = bitmap; atomic_inc(&bitmap->pending_writes); set_buffer_locked(bh); set_buffer_mapped(bh); submit_bh(REQ_OP_READ, bh); } blk_cur++; bh = bh->b_this_page; } wait_event(bitmap->write_wait, atomic_read(&bitmap->pending_writes)==0); if (test_bit(BITMAP_WRITE_ERROR, &bitmap->flags)) ret = -EIO; out: if (ret) pr_err("md: bitmap read error: (%dB @ %llu): %d\n", (int)PAGE_SIZE, (unsigned long long)index << PAGE_SHIFT, ret); return ret; } #else /* CONFIG_MD_BITMAP_FILE */ static void write_file_page(struct bitmap *bitmap, struct page *page, int wait) { } static int read_file_page(struct file *file, unsigned long index, struct bitmap *bitmap, unsigned long count, struct page *page) { return -EIO; } static void free_buffers(struct page *page) { put_page(page); } #endif /* CONFIG_MD_BITMAP_FILE */ /* * bitmap file superblock operations */ /* * write out a page to a file */ static void filemap_write_page(struct bitmap *bitmap, unsigned long pg_index, bool wait) { struct bitmap_storage *store = &bitmap->storage; struct page *page = store->filemap[pg_index]; if (mddev_is_clustered(bitmap->mddev)) { /* go to node bitmap area starting point */ pg_index += store->sb_index; } if (store->file) write_file_page(bitmap, page, wait); else write_sb_page(bitmap, pg_index, page, wait); } /* * md_bitmap_wait_writes() should be called before writing any bitmap * blocks, to ensure previous writes, particularly from * md_bitmap_daemon_work(), have completed. */ static void md_bitmap_wait_writes(struct bitmap *bitmap) { if (bitmap->storage.file) wait_event(bitmap->write_wait, atomic_read(&bitmap->pending_writes)==0); else /* Note that we ignore the return value. The writes * might have failed, but that would just mean that * some bits which should be cleared haven't been, * which is safe. The relevant bitmap blocks will * probably get written again, but there is no great * loss if they aren't. */ md_super_wait(bitmap->mddev); } /* update the event counter and sync the superblock to disk */ static void bitmap_update_sb(void *data) { bitmap_super_t *sb; struct bitmap *bitmap = data; if (!bitmap || !bitmap->mddev) /* no bitmap for this array */ return; if (bitmap->mddev->bitmap_info.external) return; if (!bitmap->storage.sb_page) /* no superblock */ return; sb = kmap_local_page(bitmap->storage.sb_page); sb->events = cpu_to_le64(bitmap->mddev->events); if (bitmap->mddev->events < bitmap->events_cleared) /* rocking back to read-only */ bitmap->events_cleared = bitmap->mddev->events; sb->events_cleared = cpu_to_le64(bitmap->events_cleared); /* * clear BITMAP_WRITE_ERROR bit to protect against the case that * a bitmap write error occurred but the later writes succeeded. */ sb->state = cpu_to_le32(bitmap->flags & ~BIT(BITMAP_WRITE_ERROR)); /* Just in case these have been changed via sysfs: */ sb->daemon_sleep = cpu_to_le32(bitmap->mddev->bitmap_info.daemon_sleep/HZ); sb->write_behind = cpu_to_le32(bitmap->mddev->bitmap_info.max_write_behind); /* This might have been changed by a reshape */ sb->sync_size = cpu_to_le64(bitmap->mddev->resync_max_sectors); sb->chunksize = cpu_to_le32(bitmap->mddev->bitmap_info.chunksize); sb->nodes = cpu_to_le32(bitmap->mddev->bitmap_info.nodes); sb->sectors_reserved = cpu_to_le32(bitmap->mddev-> bitmap_info.space); kunmap_local(sb); if (bitmap->storage.file) write_file_page(bitmap, bitmap->storage.sb_page, 1); else write_sb_page(bitmap, bitmap->storage.sb_index, bitmap->storage.sb_page, 1); } static void bitmap_print_sb(struct bitmap *bitmap) { bitmap_super_t *sb; if (!bitmap || !bitmap->storage.sb_page) return; sb = kmap_local_page(bitmap->storage.sb_page); pr_debug("%s: bitmap file superblock:\n", bmname(bitmap)); pr_debug(" magic: %08x\n", le32_to_cpu(sb->magic)); pr_debug(" version: %u\n", le32_to_cpu(sb->version)); pr_debug(" uuid: %08x.%08x.%08x.%08x\n", le32_to_cpu(*(__le32 *)(sb->uuid+0)), le32_to_cpu(*(__le32 *)(sb->uuid+4)), le32_to_cpu(*(__le32 *)(sb->uuid+8)), le32_to_cpu(*(__le32 *)(sb->uuid+12))); pr_debug(" events: %llu\n", (unsigned long long) le64_to_cpu(sb->events)); pr_debug("events cleared: %llu\n", (unsigned long long) le64_to_cpu(sb->events_cleared)); pr_debug(" state: %08x\n", le32_to_cpu(sb->state)); pr_debug(" chunksize: %u B\n", le32_to_cpu(sb->chunksize)); pr_debug(" daemon sleep: %us\n", le32_to_cpu(sb->daemon_sleep)); pr_debug(" sync size: %llu KB\n", (unsigned long long)le64_to_cpu(sb->sync_size)/2); pr_debug("max write behind: %u\n", le32_to_cpu(sb->write_behind)); kunmap_local(sb); } /* * bitmap_new_disk_sb * @bitmap * * This function is somewhat the reverse of bitmap_read_sb. bitmap_read_sb * reads and verifies the on-disk bitmap superblock and populates bitmap_info. * This function verifies 'bitmap_info' and populates the on-disk bitmap * structure, which is to be written to disk. * * Returns: 0 on success, -Exxx on error */ static int md_bitmap_new_disk_sb(struct bitmap *bitmap) { bitmap_super_t *sb; unsigned long chunksize, daemon_sleep, write_behind; bitmap->storage.sb_page = alloc_page(GFP_KERNEL | __GFP_ZERO); if (bitmap->storage.sb_page == NULL) return -ENOMEM; bitmap->storage.sb_index = 0; sb = kmap_local_page(bitmap->storage.sb_page); sb->magic = cpu_to_le32(BITMAP_MAGIC); sb->version = cpu_to_le32(BITMAP_MAJOR_HI); chunksize = bitmap->mddev->bitmap_info.chunksize; BUG_ON(!chunksize); if (!is_power_of_2(chunksize)) { kunmap_local(sb); pr_warn("bitmap chunksize not a power of 2\n"); return -EINVAL; } sb->chunksize = cpu_to_le32(chunksize); daemon_sleep = bitmap->mddev->bitmap_info.daemon_sleep; if (!daemon_sleep || (daemon_sleep > MAX_SCHEDULE_TIMEOUT)) { pr_debug("Choosing daemon_sleep default (5 sec)\n"); daemon_sleep = 5 * HZ; } sb->daemon_sleep = cpu_to_le32(daemon_sleep); bitmap->mddev->bitmap_info.daemon_sleep = daemon_sleep; /* * FIXME: write_behind for RAID1. If not specified, what * is a good choice? We choose COUNTER_MAX / 2 arbitrarily. */ write_behind = bitmap->mddev->bitmap_info.max_write_behind; if (write_behind > COUNTER_MAX) write_behind = COUNTER_MAX / 2; sb->write_behind = cpu_to_le32(write_behind); bitmap->mddev->bitmap_info.max_write_behind = write_behind; /* keep the array size field of the bitmap superblock up to date */ sb->sync_size = cpu_to_le64(bitmap->mddev->resync_max_sectors); memcpy(sb->uuid, bitmap->mddev->uuid, 16); set_bit(BITMAP_STALE, &bitmap->flags); sb->state = cpu_to_le32(bitmap->flags); bitmap->events_cleared = bitmap->mddev->events; sb->events_cleared = cpu_to_le64(bitmap->mddev->events); bitmap->mddev->bitmap_info.nodes = 0; kunmap_local(sb); return 0; } /* read the superblock from the bitmap file and initialize some bitmap fields */ static int md_bitmap_read_sb(struct bitmap *bitmap) { char *reason = NULL; bitmap_super_t *sb; unsigned long chunksize, daemon_sleep, write_behind; unsigned long long events; int nodes = 0; unsigned long sectors_reserved = 0; int err = -EINVAL; struct page *sb_page; loff_t offset = 0; if (!bitmap->storage.file && !bitmap->mddev->bitmap_info.offset) { chunksize = 128 * 1024 * 1024; daemon_sleep = 5 * HZ; write_behind = 0; set_bit(BITMAP_STALE, &bitmap->flags); err = 0; goto out_no_sb; } /* page 0 is the superblock, read it... */ sb_page = alloc_page(GFP_KERNEL); if (!sb_page) return -ENOMEM; bitmap->storage.sb_page = sb_page; re_read: /* If cluster_slot is set, the cluster is setup */ if (bitmap->cluster_slot >= 0) { sector_t bm_blocks = bitmap->mddev->resync_max_sectors; bm_blocks = DIV_ROUND_UP_SECTOR_T(bm_blocks, (bitmap->mddev->bitmap_info.chunksize >> 9)); /* bits to bytes */ bm_blocks = ((bm_blocks+7) >> 3) + sizeof(bitmap_super_t); /* to 4k blocks */ bm_blocks = DIV_ROUND_UP_SECTOR_T(bm_blocks, 4096); offset = bitmap->cluster_slot * (bm_blocks << 3); pr_debug("%s:%d bm slot: %d offset: %llu\n", __func__, __LINE__, bitmap->cluster_slot, offset); } if (bitmap->storage.file) { loff_t isize = i_size_read(bitmap->storage.file->f_mapping->host); int bytes = isize > PAGE_SIZE ? PAGE_SIZE : isize; err = read_file_page(bitmap->storage.file, 0, bitmap, bytes, sb_page); } else { err = read_sb_page(bitmap->mddev, offset, sb_page, 0, sizeof(bitmap_super_t)); } if (err) return err; err = -EINVAL; sb = kmap_local_page(sb_page); chunksize = le32_to_cpu(sb->chunksize); daemon_sleep = le32_to_cpu(sb->daemon_sleep) * HZ; write_behind = le32_to_cpu(sb->write_behind); sectors_reserved = le32_to_cpu(sb->sectors_reserved); /* verify that the bitmap-specific fields are valid */ if (sb->magic != cpu_to_le32(BITMAP_MAGIC)) reason = "bad magic"; else if (le32_to_cpu(sb->version) < BITMAP_MAJOR_LO || le32_to_cpu(sb->version) > BITMAP_MAJOR_CLUSTERED) reason = "unrecognized superblock version"; else if (chunksize < 512) reason = "bitmap chunksize too small"; else if (!is_power_of_2(chunksize)) reason = "bitmap chunksize not a power of 2"; else if (daemon_sleep < 1 || daemon_sleep > MAX_SCHEDULE_TIMEOUT) reason = "daemon sleep period out of range"; else if (write_behind > COUNTER_MAX) reason = "write-behind limit out of range (0 - 16383)"; if (reason) { pr_warn("%s: invalid bitmap file superblock: %s\n", bmname(bitmap), reason); goto out; } /* * Setup nodes/clustername only if bitmap version is * cluster-compatible */ if (sb->version == cpu_to_le32(BITMAP_MAJOR_CLUSTERED)) { nodes = le32_to_cpu(sb->nodes); strscpy(bitmap->mddev->bitmap_info.cluster_name, sb->cluster_name, 64); } /* keep the array size field of the bitmap superblock up to date */ sb->sync_size = cpu_to_le64(bitmap->mddev->resync_max_sectors); if (bitmap->mddev->persistent) { /* * We have a persistent array superblock, so compare the * bitmap's UUID and event counter to the mddev's */ if (memcmp(sb->uuid, bitmap->mddev->uuid, 16)) { pr_warn("%s: bitmap superblock UUID mismatch\n", bmname(bitmap)); goto out; } events = le64_to_cpu(sb->events); if (!nodes && (events < bitmap->mddev->events)) { pr_warn("%s: bitmap file is out of date (%llu < %llu) -- forcing full recovery\n", bmname(bitmap), events, (unsigned long long) bitmap->mddev->events); set_bit(BITMAP_STALE, &bitmap->flags); } } /* assign fields using values from superblock */ bitmap->flags |= le32_to_cpu(sb->state); if (le32_to_cpu(sb->version) == BITMAP_MAJOR_HOSTENDIAN) set_bit(BITMAP_HOSTENDIAN, &bitmap->flags); bitmap->events_cleared = le64_to_cpu(sb->events_cleared); err = 0; out: kunmap_local(sb); if (err == 0 && nodes && (bitmap->cluster_slot < 0)) { /* Assigning chunksize is required for "re_read" */ bitmap->mddev->bitmap_info.chunksize = chunksize; err = md_setup_cluster(bitmap->mddev, nodes); if (err) { pr_warn("%s: Could not setup cluster service (%d)\n", bmname(bitmap), err); goto out_no_sb; } bitmap->cluster_slot = bitmap->mddev->cluster_ops->slot_number(bitmap->mddev); goto re_read; } out_no_sb: if (err == 0) { if (test_bit(BITMAP_STALE, &bitmap->flags)) bitmap->events_cleared = bitmap->mddev->events; bitmap->mddev->bitmap_info.chunksize = chunksize; bitmap->mddev->bitmap_info.daemon_sleep = daemon_sleep; bitmap->mddev->bitmap_info.max_write_behind = write_behind; bitmap->mddev->bitmap_info.nodes = nodes; if (bitmap->mddev->bitmap_info.space == 0 || bitmap->mddev->bitmap_info.space > sectors_reserved) bitmap->mddev->bitmap_info.space = sectors_reserved; } else { bitmap_print_sb(bitmap); if (bitmap->cluster_slot < 0) md_cluster_stop(bitmap->mddev); } return err; } /* * general bitmap file operations */ /* * on-disk bitmap: * * Use one bit per "chunk" (block set). We do the disk I/O on the bitmap * file a page at a time. There's a superblock at the start of the file. */ /* calculate the index of the page that contains this bit */ static inline unsigned long file_page_index(struct bitmap_storage *store, unsigned long chunk) { if (store->sb_page) chunk += sizeof(bitmap_super_t) << 3; return chunk >> PAGE_BIT_SHIFT; } /* calculate the (bit) offset of this bit within a page */ static inline unsigned long file_page_offset(struct bitmap_storage *store, unsigned long chunk) { if (store->sb_page) chunk += sizeof(bitmap_super_t) << 3; return chunk & (PAGE_BITS - 1); } /* * return a pointer to the page in the filemap that contains the given bit * */ static inline struct page *filemap_get_page(struct bitmap_storage *store, unsigned long chunk) { if (file_page_index(store, chunk) >= store->file_pages) return NULL; return store->filemap[file_page_index(store, chunk)]; } static int md_bitmap_storage_alloc(struct bitmap_storage *store, unsigned long chunks, int with_super, int slot_number) { int pnum, offset = 0; unsigned long num_pages; unsigned long bytes; bytes = DIV_ROUND_UP(chunks, 8); if (with_super) bytes += sizeof(bitmap_super_t); num_pages = DIV_ROUND_UP(bytes, PAGE_SIZE); offset = slot_number * num_pages; store->filemap = kmalloc_array(num_pages, sizeof(struct page *), GFP_KERNEL); if (!store->filemap) return -ENOMEM; if (with_super && !store->sb_page) { store->sb_page = alloc_page(GFP_KERNEL|__GFP_ZERO); if (store->sb_page == NULL) return -ENOMEM; } pnum = 0; if (store->sb_page) { store->filemap[0] = store->sb_page; pnum = 1; store->sb_index = offset; } for ( ; pnum < num_pages; pnum++) { store->filemap[pnum] = alloc_page(GFP_KERNEL|__GFP_ZERO); if (!store->filemap[pnum]) { store->file_pages = pnum; return -ENOMEM; } } store->file_pages = pnum; /* We need 4 bits per page, rounded up to a multiple * of sizeof(unsigned long) */ store->filemap_attr = kzalloc( roundup(DIV_ROUND_UP(num_pages*4, 8), sizeof(unsigned long)), GFP_KERNEL); if (!store->filemap_attr) return -ENOMEM; store->bytes = bytes; return 0; } static void md_bitmap_file_unmap(struct bitmap_storage *store) { struct file *file = store->file; struct page *sb_page = store->sb_page; struct page **map = store->filemap; int pages = store->file_pages; while (pages--) if (map[pages] != sb_page) /* 0 is sb_page, release it below */ free_buffers(map[pages]); kfree(map); kfree(store->filemap_attr); if (sb_page) free_buffers(sb_page); if (file) { struct inode *inode = file_inode(file); invalidate_mapping_pages(inode->i_mapping, 0, -1); fput(file); } } /* * bitmap_file_kick - if an error occurs while manipulating the bitmap file * then it is no longer reliable, so we stop using it and we mark the file * as failed in the superblock */ static void md_bitmap_file_kick(struct bitmap *bitmap) { if (!test_and_set_bit(BITMAP_STALE, &bitmap->flags)) { bitmap_update_sb(bitmap); if (bitmap->storage.file) { pr_warn("%s: kicking failed bitmap file %pD4 from array!\n", bmname(bitmap), bitmap->storage.file); } else pr_warn("%s: disabling internal bitmap due to errors\n", bmname(bitmap)); } } enum bitmap_page_attr { BITMAP_PAGE_DIRTY = 0, /* there are set bits that need to be synced */ BITMAP_PAGE_PENDING = 1, /* there are bits that are being cleaned. * i.e. counter is 1 or 2. */ BITMAP_PAGE_NEEDWRITE = 2, /* there are cleared bits that need to be synced */ }; static inline void set_page_attr(struct bitmap *bitmap, int pnum, enum bitmap_page_attr attr) { set_bit((pnum<<2) + attr, bitmap->storage.filemap_attr); } static inline void clear_page_attr(struct bitmap *bitmap, int pnum, enum bitmap_page_attr attr) { clear_bit((pnum<<2) + attr, bitmap->storage.filemap_attr); } static inline int test_page_attr(struct bitmap *bitmap, int pnum, enum bitmap_page_attr attr) { return test_bit((pnum<<2) + attr, bitmap->storage.filemap_attr); } static inline int test_and_clear_page_attr(struct bitmap *bitmap, int pnum, enum bitmap_page_attr attr) { return test_and_clear_bit((pnum<<2) + attr, bitmap->storage.filemap_attr); } /* * bitmap_file_set_bit -- called before performing a write to the md device * to set (and eventually sync) a particular bit in the bitmap file * * we set the bit immediately, then we record the page number so that * when an unplug occurs, we can flush the dirty pages out to disk */ static void md_bitmap_file_set_bit(struct bitmap *bitmap, sector_t block) { unsigned long bit; struct page *page; void *kaddr; unsigned long chunk = block >> bitmap->counts.chunkshift; struct bitmap_storage *store = &bitmap->storage; unsigned long index = file_page_index(store, chunk); unsigned long node_offset = 0; index += store->sb_index; if (mddev_is_clustered(bitmap->mddev)) node_offset = bitmap->cluster_slot * store->file_pages; page = filemap_get_page(&bitmap->storage, chunk); if (!page) return; bit = file_page_offset(&bitmap->storage, chunk); /* set the bit */ kaddr = kmap_local_page(page); if (test_bit(BITMAP_HOSTENDIAN, &bitmap->flags)) set_bit(bit, kaddr); else set_bit_le(bit, kaddr); kunmap_local(kaddr); pr_debug("set file bit %lu page %lu\n", bit, index); /* record page number so it gets flushed to disk when unplug occurs */ set_page_attr(bitmap, index - node_offset, BITMAP_PAGE_DIRTY); } static void md_bitmap_file_clear_bit(struct bitmap *bitmap, sector_t block) { unsigned long bit; struct page *page; void *paddr; unsigned long chunk = block >> bitmap->counts.chunkshift; struct bitmap_storage *store = &bitmap->storage; unsigned long index = file_page_index(store, chunk); unsigned long node_offset = 0; index += store->sb_index; if (mddev_is_clustered(bitmap->mddev)) node_offset = bitmap->cluster_slot * store->file_pages; page = filemap_get_page(&bitmap->storage, chunk); if (!page) return; bit = file_page_offset(&bitmap->storage, chunk); paddr = kmap_local_page(page); if (test_bit(BITMAP_HOSTENDIAN, &bitmap->flags)) clear_bit(bit, paddr); else clear_bit_le(bit, paddr); kunmap_local(paddr); if (!test_page_attr(bitmap, index - node_offset, BITMAP_PAGE_NEEDWRITE)) { set_page_attr(bitmap, index - node_offset, BITMAP_PAGE_PENDING); bitmap->allclean = 0; } } static int md_bitmap_file_test_bit(struct bitmap *bitmap, sector_t block) { unsigned long bit; struct page *page; void *paddr; unsigned long chunk = block >> bitmap->counts.chunkshift; int set = 0; page = filemap_get_page(&bitmap->storage, chunk); if (!page) return -EINVAL; bit = file_page_offset(&bitmap->storage, chunk); paddr = kmap_local_page(page); if (test_bit(BITMAP_HOSTENDIAN, &bitmap->flags)) set = test_bit(bit, paddr); else set = test_bit_le(bit, paddr); kunmap_local(paddr); return set; } /* this gets called when the md device is ready to unplug its underlying * (slave) device queues -- before we let any writes go down, we need to * sync the dirty pages of the bitmap file to disk */ static void __bitmap_unplug(struct bitmap *bitmap) { unsigned long i; int dirty, need_write; int writing = 0; if (!__bitmap_enabled(bitmap)) return; /* look at each page to see if there are any set bits that need to be * flushed out to disk */ for (i = 0; i < bitmap->storage.file_pages; i++) { dirty = test_and_clear_page_attr(bitmap, i, BITMAP_PAGE_DIRTY); need_write = test_and_clear_page_attr(bitmap, i, BITMAP_PAGE_NEEDWRITE); if (dirty || need_write) { if (!writing) { md_bitmap_wait_writes(bitmap); mddev_add_trace_msg(bitmap->mddev, "md bitmap_unplug"); } clear_page_attr(bitmap, i, BITMAP_PAGE_PENDING); filemap_write_page(bitmap, i, false); writing = 1; } } if (writing) md_bitmap_wait_writes(bitmap); if (test_bit(BITMAP_WRITE_ERROR, &bitmap->flags)) md_bitmap_file_kick(bitmap); } struct bitmap_unplug_work { struct work_struct work; struct bitmap *bitmap; struct completion *done; }; static void md_bitmap_unplug_fn(struct work_struct *work) { struct bitmap_unplug_work *unplug_work = container_of(work, struct bitmap_unplug_work, work); __bitmap_unplug(unplug_work->bitmap); complete(unplug_work->done); } static void bitmap_unplug_async(struct bitmap *bitmap) { DECLARE_COMPLETION_ONSTACK(done); struct bitmap_unplug_work unplug_work; INIT_WORK_ONSTACK(&unplug_work.work, md_bitmap_unplug_fn); unplug_work.bitmap = bitmap; unplug_work.done = &done; queue_work(md_bitmap_wq, &unplug_work.work); wait_for_completion(&done); destroy_work_on_stack(&unplug_work.work); } static void bitmap_unplug(struct mddev *mddev, bool sync) { struct bitmap *bitmap = mddev->bitmap; if (!bitmap) return; if (sync) __bitmap_unplug(bitmap); else bitmap_unplug_async(bitmap); } static void md_bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int needed); /* * Initialize the in-memory bitmap from the on-disk bitmap and set up the memory * mapping of the bitmap file. * * Special case: If there's no bitmap file, or if the bitmap file had been * previously kicked from the array, we mark all the bits as 1's in order to * cause a full resync. * * We ignore all bits for sectors that end earlier than 'start'. * This is used when reading an out-of-date bitmap. */ static int md_bitmap_init_from_disk(struct bitmap *bitmap, sector_t start) { bool outofdate = test_bit(BITMAP_STALE, &bitmap->flags); struct mddev *mddev = bitmap->mddev; unsigned long chunks = bitmap->counts.chunks; struct bitmap_storage *store = &bitmap->storage; struct file *file = store->file; unsigned long node_offset = 0; unsigned long bit_cnt = 0; unsigned long i; int ret; if (!file && !mddev->bitmap_info.offset) { /* No permanent bitmap - fill with '1s'. */ store->filemap = NULL; store->file_pages = 0; for (i = 0; i < chunks ; i++) { /* if the disk bit is set, set the memory bit */ int needed = ((sector_t)(i+1) << (bitmap->counts.chunkshift) >= start); md_bitmap_set_memory_bits(bitmap, (sector_t)i << bitmap->counts.chunkshift, needed); } return 0; } if (file && i_size_read(file->f_mapping->host) < store->bytes) { pr_warn("%s: bitmap file too short %lu < %lu\n", bmname(bitmap), (unsigned long) i_size_read(file->f_mapping->host), store->bytes); ret = -ENOSPC; goto err; } if (mddev_is_clustered(mddev)) node_offset = bitmap->cluster_slot * (DIV_ROUND_UP(store->bytes, PAGE_SIZE)); for (i = 0; i < store->file_pages; i++) { struct page *page = store->filemap[i]; int count; /* unmap the old page, we're done with it */ if (i == store->file_pages - 1) count = store->bytes - i * PAGE_SIZE; else count = PAGE_SIZE; if (file) ret = read_file_page(file, i, bitmap, count, page); else ret = read_sb_page(mddev, 0, page, i + node_offset, count); if (ret) goto err; } if (outofdate) { pr_warn("%s: bitmap file is out of date, doing full recovery\n", bmname(bitmap)); for (i = 0; i < store->file_pages; i++) { struct page *page = store->filemap[i]; unsigned long offset = 0; void *paddr; if (i == 0 && !mddev->bitmap_info.external) offset = sizeof(bitmap_super_t); /* * If the bitmap is out of date, dirty the whole page * and write it out */ paddr = kmap_local_page(page); memset(paddr + offset, 0xff, PAGE_SIZE - offset); kunmap_local(paddr); filemap_write_page(bitmap, i, true); if (test_bit(BITMAP_WRITE_ERROR, &bitmap->flags)) { ret = -EIO; goto err; } } } for (i = 0; i < chunks; i++) { struct page *page = filemap_get_page(&bitmap->storage, i); unsigned long bit = file_page_offset(&bitmap->storage, i); void *paddr; bool was_set; paddr = kmap_local_page(page); if (test_bit(BITMAP_HOSTENDIAN, &bitmap->flags)) was_set = test_bit(bit, paddr); else was_set = test_bit_le(bit, paddr); kunmap_local(paddr); if (was_set) { /* if the disk bit is set, set the memory bit */ int needed = ((sector_t)(i+1) << bitmap->counts.chunkshift >= start); md_bitmap_set_memory_bits(bitmap, (sector_t)i << bitmap->counts.chunkshift, needed); bit_cnt++; } } pr_debug("%s: bitmap initialized from disk: read %lu pages, set %lu of %lu bits\n", bmname(bitmap), store->file_pages, bit_cnt, chunks); return 0; err: pr_warn("%s: bitmap initialisation failed: %d\n", bmname(bitmap), ret); return ret; } /* just flag bitmap pages as needing to be written. */ static void bitmap_write_all(struct mddev *mddev) { int i; struct bitmap *bitmap = mddev->bitmap; if (!bitmap || !bitmap->storage.filemap) return; /* Only one copy, so nothing needed */ if (bitmap->storage.file) return; for (i = 0; i < bitmap->storage.file_pages; i++) set_page_attr(bitmap, i, BITMAP_PAGE_NEEDWRITE); bitmap->allclean = 0; } static void md_bitmap_count_page(struct bitmap_counts *bitmap, sector_t offset, int inc) { sector_t chunk = offset >> bitmap->chunkshift; unsigned long page = chunk >> PAGE_COUNTER_SHIFT; bitmap->bp[page].count += inc; md_bitmap_checkfree(bitmap, page); } static void md_bitmap_set_pending(struct bitmap_counts *bitmap, sector_t offset) { sector_t chunk = offset >> bitmap->chunkshift; unsigned long page = chunk >> PAGE_COUNTER_SHIFT; struct bitmap_page *bp = &bitmap->bp[page]; if (!bp->pending) bp->pending = 1; } static bitmap_counter_t *md_bitmap_get_counter(struct bitmap_counts *bitmap, sector_t offset, sector_t *blocks, int create); static void mddev_set_timeout(struct mddev *mddev, unsigned long timeout, bool force) { struct md_thread *thread; rcu_read_lock(); thread = rcu_dereference(mddev->thread); if (!thread) goto out; if (force || thread->timeout < MAX_SCHEDULE_TIMEOUT) thread->timeout = timeout; out: rcu_read_unlock(); } /* * bitmap daemon -- periodically wakes up to clean bits and flush pages * out to disk */ static void bitmap_daemon_work(struct mddev *mddev) { struct bitmap *bitmap; unsigned long j; unsigned long nextpage; sector_t blocks; struct bitmap_counts *counts; /* Use a mutex to guard daemon_work against * bitmap_destroy. */ mutex_lock(&mddev->bitmap_info.mutex); bitmap = mddev->bitmap; if (bitmap == NULL) { mutex_unlock(&mddev->bitmap_info.mutex); return; } if (time_before(jiffies, bitmap->daemon_lastrun + mddev->bitmap_info.daemon_sleep)) goto done; bitmap->daemon_lastrun = jiffies; if (bitmap->allclean) { mddev_set_timeout(mddev, MAX_SCHEDULE_TIMEOUT, true); goto done; } bitmap->allclean = 1; mddev_add_trace_msg(bitmap->mddev, "md bitmap_daemon_work"); /* Any file-page which is PENDING now needs to be written. * So set NEEDWRITE now, then after we make any last-minute changes * we will write it. */ for (j = 0; j < bitmap->storage.file_pages; j++) if (test_and_clear_page_attr(bitmap, j, BITMAP_PAGE_PENDING)) set_page_attr(bitmap, j, BITMAP_PAGE_NEEDWRITE); if (bitmap->need_sync && mddev->bitmap_info.external == 0) { /* Arrange for superblock update as well as * other changes */ bitmap_super_t *sb; bitmap->need_sync = 0; if (bitmap->storage.filemap) { sb = kmap_local_page(bitmap->storage.sb_page); sb->events_cleared = cpu_to_le64(bitmap->events_cleared); kunmap_local(sb); set_page_attr(bitmap, 0, BITMAP_PAGE_NEEDWRITE); } } /* Now look at the bitmap counters and if any are '2' or '1', * decrement and handle accordingly. */ counts = &bitmap->counts; spin_lock_irq(&counts->lock); nextpage = 0; for (j = 0; j < counts->chunks; j++) { bitmap_counter_t *bmc; sector_t block = (sector_t)j << counts->chunkshift; if (j == nextpage) { nextpage += PAGE_COUNTER_RATIO; if (!counts->bp[j >> PAGE_COUNTER_SHIFT].pending) { j |= PAGE_COUNTER_MASK; continue; } counts->bp[j >> PAGE_COUNTER_SHIFT].pending = 0; } bmc = md_bitmap_get_counter(counts, block, &blocks, 0); if (!bmc) { j |= PAGE_COUNTER_MASK; continue; } if (*bmc == 1 && !bitmap->need_sync) { /* We can clear the bit */ *bmc = 0; md_bitmap_count_page(counts, block, -1); md_bitmap_file_clear_bit(bitmap, block); } else if (*bmc && *bmc <= 2) { *bmc = 1; md_bitmap_set_pending(counts, block); bitmap->allclean = 0; } } spin_unlock_irq(&counts->lock); md_bitmap_wait_writes(bitmap); /* Now start writeout on any page in NEEDWRITE that isn't DIRTY. * DIRTY pages need to be written by bitmap_unplug so it can wait * for them. * If we find any DIRTY page we stop there and let bitmap_unplug * handle all the rest. This is important in the case where * the first blocking holds the superblock and it has been updated. * We mustn't write any other blocks before the superblock. */ for (j = 0; j < bitmap->storage.file_pages && !test_bit(BITMAP_STALE, &bitmap->flags); j++) { if (test_page_attr(bitmap, j, BITMAP_PAGE_DIRTY)) /* bitmap_unplug will handle the rest */ break; if (bitmap->storage.filemap && test_and_clear_page_attr(bitmap, j, BITMAP_PAGE_NEEDWRITE)) filemap_write_page(bitmap, j, false); } done: if (bitmap->allclean == 0) mddev_set_timeout(mddev, mddev->bitmap_info.daemon_sleep, true); mutex_unlock(&mddev->bitmap_info.mutex); } static bitmap_counter_t *md_bitmap_get_counter(struct bitmap_counts *bitmap, sector_t offset, sector_t *blocks, int create) __releases(bitmap->lock) __acquires(bitmap->lock) { /* If 'create', we might release the lock and reclaim it. * The lock must have been taken with interrupts enabled. * If !create, we don't release the lock. */ sector_t chunk = offset >> bitmap->chunkshift; unsigned long page = chunk >> PAGE_COUNTER_SHIFT; unsigned long pageoff = (chunk & PAGE_COUNTER_MASK) << COUNTER_BYTE_SHIFT; sector_t csize = ((sector_t)1) << bitmap->chunkshift; int err; if (page >= bitmap->pages) { /* * This can happen if bitmap_start_sync goes beyond * End-of-device while looking for a whole page or * user set a huge number to sysfs bitmap_set_bits. */ *blocks = csize - (offset & (csize - 1)); return NULL; } err = md_bitmap_checkpage(bitmap, page, create, 0); if (bitmap->bp[page].hijacked || bitmap->bp[page].map == NULL) csize = ((sector_t)1) << (bitmap->chunkshift + PAGE_COUNTER_SHIFT); *blocks = csize - (offset & (csize - 1)); if (err < 0) return NULL; /* now locked ... */ if (bitmap->bp[page].hijacked) { /* hijacked pointer */ /* should we use the first or second counter field * of the hijacked pointer? */ int hi = (pageoff > PAGE_COUNTER_MASK); return &((bitmap_counter_t *) &bitmap->bp[page].map)[hi]; } else /* page is allocated */ return (bitmap_counter_t *) &(bitmap->bp[page].map[pageoff]); } static int bitmap_startwrite(struct mddev *mddev, sector_t offset, unsigned long sectors) { struct bitmap *bitmap = mddev->bitmap; if (!bitmap) return 0; while (sectors) { sector_t blocks; bitmap_counter_t *bmc; spin_lock_irq(&bitmap->counts.lock); bmc = md_bitmap_get_counter(&bitmap->counts, offset, &blocks, 1); if (!bmc) { spin_unlock_irq(&bitmap->counts.lock); return 0; } if (unlikely(COUNTER(*bmc) == COUNTER_MAX)) { DEFINE_WAIT(__wait); /* note that it is safe to do the prepare_to_wait * after the test as long as we do it before dropping * the spinlock. */ prepare_to_wait(&bitmap->overflow_wait, &__wait, TASK_UNINTERRUPTIBLE); spin_unlock_irq(&bitmap->counts.lock); schedule(); finish_wait(&bitmap->overflow_wait, &__wait); continue; } switch (*bmc) { case 0: md_bitmap_file_set_bit(bitmap, offset); md_bitmap_count_page(&bitmap->counts, offset, 1); fallthrough; case 1: *bmc = 2; } (*bmc)++; spin_unlock_irq(&bitmap->counts.lock); offset += blocks; if (sectors > blocks) sectors -= blocks; else sectors = 0; } return 0; } static void bitmap_endwrite(struct mddev *mddev, sector_t offset, unsigned long sectors) { struct bitmap *bitmap = mddev->bitmap; if (!bitmap) return; while (sectors) { sector_t blocks; unsigned long flags; bitmap_counter_t *bmc; spin_lock_irqsave(&bitmap->counts.lock, flags); bmc = md_bitmap_get_counter(&bitmap->counts, offset, &blocks, 0); if (!bmc) { spin_unlock_irqrestore(&bitmap->counts.lock, flags); return; } if (!bitmap->mddev->degraded) { if (bitmap->events_cleared < bitmap->mddev->events) { bitmap->events_cleared = bitmap->mddev->events; bitmap->need_sync = 1; sysfs_notify_dirent_safe( bitmap->sysfs_can_clear); } } else if (!NEEDED(*bmc)) { *bmc |= NEEDED_MASK; } if (COUNTER(*bmc) == COUNTER_MAX) wake_up(&bitmap->overflow_wait); (*bmc)--; if (*bmc <= 2) { md_bitmap_set_pending(&bitmap->counts, offset); bitmap->allclean = 0; } spin_unlock_irqrestore(&bitmap->counts.lock, flags); offset += blocks; if (sectors > blocks) sectors -= blocks; else sectors = 0; } } static bool __bitmap_start_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, bool degraded) { bitmap_counter_t *bmc; bool rv; if (bitmap == NULL) {/* FIXME or bitmap set as 'failed' */ *blocks = 1024; return true; /* always resync if no bitmap */ } spin_lock_irq(&bitmap->counts.lock); rv = false; bmc = md_bitmap_get_counter(&bitmap->counts, offset, blocks, 0); if (bmc) { /* locked */ if (RESYNC(*bmc)) { rv = true; } else if (NEEDED(*bmc)) { rv = true; if (!degraded) { /* don't set/clear bits if degraded */ *bmc |= RESYNC_MASK; *bmc &= ~NEEDED_MASK; } } } spin_unlock_irq(&bitmap->counts.lock); return rv; } static bool bitmap_start_sync(struct mddev *mddev, sector_t offset, sector_t *blocks, bool degraded) { /* bitmap_start_sync must always report on multiples of whole * pages, otherwise resync (which is very PAGE_SIZE based) will * get confused. * So call __bitmap_start_sync repeatedly (if needed) until * At least PAGE_SIZE>>9 blocks are covered. * Return the 'or' of the result. */ bool rv = false; sector_t blocks1; *blocks = 0; while (*blocks < (PAGE_SIZE>>9)) { rv |= __bitmap_start_sync(mddev->bitmap, offset, &blocks1, degraded); offset += blocks1; *blocks += blocks1; } return rv; } static void __bitmap_end_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, bool aborted) { bitmap_counter_t *bmc; unsigned long flags; if (bitmap == NULL) { *blocks = 1024; return; } spin_lock_irqsave(&bitmap->counts.lock, flags); bmc = md_bitmap_get_counter(&bitmap->counts, offset, blocks, 0); if (bmc == NULL) goto unlock; /* locked */ if (RESYNC(*bmc)) { *bmc &= ~RESYNC_MASK; if (!NEEDED(*bmc) && aborted) *bmc |= NEEDED_MASK; else { if (*bmc <= 2) { md_bitmap_set_pending(&bitmap->counts, offset); bitmap->allclean = 0; } } } unlock: spin_unlock_irqrestore(&bitmap->counts.lock, flags); } static void bitmap_end_sync(struct mddev *mddev, sector_t offset, sector_t *blocks) { __bitmap_end_sync(mddev->bitmap, offset, blocks, true); } static void bitmap_close_sync(struct mddev *mddev) { /* Sync has finished, and any bitmap chunks that weren't synced * properly have been aborted. It remains to us to clear the * RESYNC bit wherever it is still on */ sector_t sector = 0; sector_t blocks; struct bitmap *bitmap = mddev->bitmap; if (!bitmap) return; while (sector < bitmap->mddev->resync_max_sectors) { __bitmap_end_sync(bitmap, sector, &blocks, false); sector += blocks; } } static void bitmap_cond_end_sync(struct mddev *mddev, sector_t sector, bool force) { sector_t s = 0; sector_t blocks; struct bitmap *bitmap = mddev->bitmap; if (!bitmap) return; if (sector == 0) { bitmap->last_end_sync = jiffies; return; } if (!force && time_before(jiffies, (bitmap->last_end_sync + bitmap->mddev->bitmap_info.daemon_sleep))) return; wait_event(bitmap->mddev->recovery_wait, atomic_read(&bitmap->mddev->recovery_active) == 0); bitmap->mddev->curr_resync_completed = sector; set_bit(MD_SB_CHANGE_CLEAN, &bitmap->mddev->sb_flags); sector &= ~((1ULL << bitmap->counts.chunkshift) - 1); s = 0; while (s < sector && s < bitmap->mddev->resync_max_sectors) { __bitmap_end_sync(bitmap, s, &blocks, false); s += blocks; } bitmap->last_end_sync = jiffies; sysfs_notify_dirent_safe(bitmap->mddev->sysfs_completed); } static void bitmap_sync_with_cluster(struct mddev *mddev, sector_t old_lo, sector_t old_hi, sector_t new_lo, sector_t new_hi) { struct bitmap *bitmap = mddev->bitmap; sector_t sector, blocks = 0; for (sector = old_lo; sector < new_lo; ) { __bitmap_end_sync(bitmap, sector, &blocks, false); sector += blocks; } WARN((blocks > new_lo) && old_lo, "alignment is not correct for lo\n"); for (sector = old_hi; sector < new_hi; ) { bitmap_start_sync(mddev, sector, &blocks, false); sector += blocks; } WARN((blocks > new_hi) && old_hi, "alignment is not correct for hi\n"); } static void md_bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int needed) { /* For each chunk covered by any of these sectors, set the * counter to 2 and possibly set resync_needed. They should all * be 0 at this point */ sector_t secs; bitmap_counter_t *bmc; spin_lock_irq(&bitmap->counts.lock); bmc = md_bitmap_get_counter(&bitmap->counts, offset, &secs, 1); if (!bmc) { spin_unlock_irq(&bitmap->counts.lock); return; } if (!*bmc) { *bmc = 2; md_bitmap_count_page(&bitmap->counts, offset, 1); md_bitmap_set_pending(&bitmap->counts, offset); bitmap->allclean = 0; } if (needed) *bmc |= NEEDED_MASK; spin_unlock_irq(&bitmap->counts.lock); } /* dirty the memory and file bits for bitmap chunks "s" to "e" */ static void bitmap_dirty_bits(struct mddev *mddev, unsigned long s, unsigned long e) { unsigned long chunk; struct bitmap *bitmap = mddev->bitmap; if (!bitmap) return; for (chunk = s; chunk <= e; chunk++) { sector_t sec = (sector_t)chunk << bitmap->counts.chunkshift; md_bitmap_set_memory_bits(bitmap, sec, 1); md_bitmap_file_set_bit(bitmap, sec); if (sec < bitmap->mddev->recovery_cp) /* We are asserting that the array is dirty, * so move the recovery_cp address back so * that it is obvious that it is dirty */ bitmap->mddev->recovery_cp = sec; } } static void bitmap_flush(struct mddev *mddev) { struct bitmap *bitmap = mddev->bitmap; long sleep; if (!bitmap) /* there was no bitmap */ return; /* run the daemon_work three time to ensure everything is flushed * that can be */ sleep = mddev->bitmap_info.daemon_sleep * 2; bitmap->daemon_lastrun -= sleep; bitmap_daemon_work(mddev); bitmap->daemon_lastrun -= sleep; bitmap_daemon_work(mddev); bitmap->daemon_lastrun -= sleep; bitmap_daemon_work(mddev); if (mddev->bitmap_info.external) md_super_wait(mddev); bitmap_update_sb(bitmap); } static void md_bitmap_free(void *data) { unsigned long k, pages; struct bitmap_page *bp; struct bitmap *bitmap = data; if (!bitmap) /* there was no bitmap */ return; if (bitmap->sysfs_can_clear) sysfs_put(bitmap->sysfs_can_clear); if (mddev_is_clustered(bitmap->mddev) && bitmap->mddev->cluster_info && bitmap->cluster_slot == bitmap->mddev->cluster_ops->slot_number(bitmap->mddev)) md_cluster_stop(bitmap->mddev); /* Shouldn't be needed - but just in case.... */ wait_event(bitmap->write_wait, atomic_read(&bitmap->pending_writes) == 0); /* release the bitmap file */ md_bitmap_file_unmap(&bitmap->storage); bp = bitmap->counts.bp; pages = bitmap->counts.pages; /* free all allocated memory */ if (bp) /* deallocate the page memory */ for (k = 0; k < pages; k++) if (bp[k].map && !bp[k].hijacked) kfree(bp[k].map); kfree(bp); kfree(bitmap); } static void bitmap_start_behind_write(struct mddev *mddev) { struct bitmap *bitmap = mddev->bitmap; int bw; if (!bitmap) return; atomic_inc(&bitmap->behind_writes); bw = atomic_read(&bitmap->behind_writes); if (bw > bitmap->behind_writes_used) bitmap->behind_writes_used = bw; pr_debug("inc write-behind count %d/%lu\n", bw, bitmap->mddev->bitmap_info.max_write_behind); } static void bitmap_end_behind_write(struct mddev *mddev) { struct bitmap *bitmap = mddev->bitmap; if (!bitmap) return; if (atomic_dec_and_test(&bitmap->behind_writes)) wake_up(&bitmap->behind_wait); pr_debug("dec write-behind count %d/%lu\n", atomic_read(&bitmap->behind_writes), bitmap->mddev->bitmap_info.max_write_behind); } static void bitmap_wait_behind_writes(struct mddev *mddev) { struct bitmap *bitmap = mddev->bitmap; /* wait for behind writes to complete */ if (bitmap && atomic_read(&bitmap->behind_writes) > 0) { pr_debug("md:%s: behind writes in progress - waiting to stop.\n", mdname(mddev)); /* need to kick something here to make sure I/O goes? */ wait_event(bitmap->behind_wait, atomic_read(&bitmap->behind_writes) == 0); } } static void bitmap_destroy(struct mddev *mddev) { struct bitmap *bitmap = mddev->bitmap; if (!bitmap) /* there was no bitmap */ return; bitmap_wait_behind_writes(mddev); if (!mddev->serialize_policy) mddev_destroy_serial_pool(mddev, NULL); mutex_lock(&mddev->bitmap_info.mutex); spin_lock(&mddev->lock); mddev->bitmap = NULL; /* disconnect from the md device */ spin_unlock(&mddev->lock); mutex_unlock(&mddev->bitmap_info.mutex); mddev_set_timeout(mddev, MAX_SCHEDULE_TIMEOUT, true); md_bitmap_free(bitmap); } /* * initialize the bitmap structure * if this returns an error, bitmap_destroy must be called to do clean up * once mddev->bitmap is set */ static struct bitmap *__bitmap_create(struct mddev *mddev, int slot) { struct bitmap *bitmap; sector_t blocks = mddev->resync_max_sectors; struct file *file = mddev->bitmap_info.file; int err; struct kernfs_node *bm = NULL; BUILD_BUG_ON(sizeof(bitmap_super_t) != 256); BUG_ON(file && mddev->bitmap_info.offset); if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) { pr_notice("md/raid:%s: array with journal cannot have bitmap\n", mdname(mddev)); return ERR_PTR(-EBUSY); } bitmap = kzalloc(sizeof(*bitmap), GFP_KERNEL); if (!bitmap) return ERR_PTR(-ENOMEM); spin_lock_init(&bitmap->counts.lock); atomic_set(&bitmap->pending_writes, 0); init_waitqueue_head(&bitmap->write_wait); init_waitqueue_head(&bitmap->overflow_wait); init_waitqueue_head(&bitmap->behind_wait); bitmap->mddev = mddev; bitmap->cluster_slot = slot; if (mddev->kobj.sd) bm = sysfs_get_dirent(mddev->kobj.sd, "bitmap"); if (bm) { bitmap->sysfs_can_clear = sysfs_get_dirent(bm, "can_clear"); sysfs_put(bm); } else bitmap->sysfs_can_clear = NULL; bitmap->storage.file = file; if (file) { get_file(file); /* As future accesses to this file will use bmap, * and bypass the page cache, we must sync the file * first. */ vfs_fsync(file, 1); } /* read superblock from bitmap file (this sets mddev->bitmap_info.chunksize) */ if (!mddev->bitmap_info.external) { /* * If 'MD_ARRAY_FIRST_USE' is set, then device-mapper is * instructing us to create a new on-disk bitmap instance. */ if (test_and_clear_bit(MD_ARRAY_FIRST_USE, &mddev->flags)) err = md_bitmap_new_disk_sb(bitmap); else err = md_bitmap_read_sb(bitmap); } else { err = 0; if (mddev->bitmap_info.chunksize == 0 || mddev->bitmap_info.daemon_sleep == 0) /* chunksize and time_base need to be * set first. */ err = -EINVAL; } if (err) goto error; bitmap->daemon_lastrun = jiffies; err = __bitmap_resize(bitmap, blocks, mddev->bitmap_info.chunksize, true); if (err) goto error; pr_debug("created bitmap (%lu pages) for device %s\n", bitmap->counts.pages, bmname(bitmap)); err = test_bit(BITMAP_WRITE_ERROR, &bitmap->flags) ? -EIO : 0; if (err) goto error; return bitmap; error: md_bitmap_free(bitmap); return ERR_PTR(err); } static int bitmap_create(struct mddev *mddev, int slot) { struct bitmap *bitmap = __bitmap_create(mddev, slot); if (IS_ERR(bitmap)) return PTR_ERR(bitmap); mddev->bitmap = bitmap; return 0; } static int bitmap_load(struct mddev *mddev) { int err = 0; sector_t start = 0; sector_t sector = 0; struct bitmap *bitmap = mddev->bitmap; struct md_rdev *rdev; if (!bitmap) goto out; rdev_for_each(rdev, mddev) mddev_create_serial_pool(mddev, rdev); if (mddev_is_clustered(mddev)) mddev->cluster_ops->load_bitmaps(mddev, mddev->bitmap_info.nodes); /* Clear out old bitmap info first: Either there is none, or we * are resuming after someone else has possibly changed things, * so we should forget old cached info. * All chunks should be clean, but some might need_sync. */ while (sector < mddev->resync_max_sectors) { sector_t blocks; bitmap_start_sync(mddev, sector, &blocks, false); sector += blocks; } bitmap_close_sync(mddev); if (mddev->degraded == 0 || bitmap->events_cleared == mddev->events) /* no need to keep dirty bits to optimise a * re-add of a missing device */ start = mddev->recovery_cp; mutex_lock(&mddev->bitmap_info.mutex); err = md_bitmap_init_from_disk(bitmap, start); mutex_unlock(&mddev->bitmap_info.mutex); if (err) goto out; clear_bit(BITMAP_STALE, &bitmap->flags); /* Kick recovery in case any bits were set */ set_bit(MD_RECOVERY_NEEDED, &bitmap->mddev->recovery); mddev_set_timeout(mddev, mddev->bitmap_info.daemon_sleep, true); md_wakeup_thread(mddev->thread); bitmap_update_sb(bitmap); if (test_bit(BITMAP_WRITE_ERROR, &bitmap->flags)) err = -EIO; out: return err; } /* caller need to free returned bitmap with md_bitmap_free() */ static void *bitmap_get_from_slot(struct mddev *mddev, int slot) { int rv = 0; struct bitmap *bitmap; bitmap = __bitmap_create(mddev, slot); if (IS_ERR(bitmap)) { rv = PTR_ERR(bitmap); return ERR_PTR(rv); } rv = md_bitmap_init_from_disk(bitmap, 0); if (rv) { md_bitmap_free(bitmap); return ERR_PTR(rv); } return bitmap; } /* Loads the bitmap associated with slot and copies the resync information * to our bitmap */ static int bitmap_copy_from_slot(struct mddev *mddev, int slot, sector_t *low, sector_t *high, bool clear_bits) { int rv = 0, i, j; sector_t block, lo = 0, hi = 0; struct bitmap_counts *counts; struct bitmap *bitmap; bitmap = bitmap_get_from_slot(mddev, slot); if (IS_ERR(bitmap)) { pr_err("%s can't get bitmap from slot %d\n", __func__, slot); return -1; } counts = &bitmap->counts; for (j = 0; j < counts->chunks; j++) { block = (sector_t)j << counts->chunkshift; if (md_bitmap_file_test_bit(bitmap, block)) { if (!lo) lo = block; hi = block; md_bitmap_file_clear_bit(bitmap, block); md_bitmap_set_memory_bits(mddev->bitmap, block, 1); md_bitmap_file_set_bit(mddev->bitmap, block); } } if (clear_bits) { bitmap_update_sb(bitmap); /* BITMAP_PAGE_PENDING is set, but bitmap_unplug needs * BITMAP_PAGE_DIRTY or _NEEDWRITE to write ... */ for (i = 0; i < bitmap->storage.file_pages; i++) if (test_page_attr(bitmap, i, BITMAP_PAGE_PENDING)) set_page_attr(bitmap, i, BITMAP_PAGE_NEEDWRITE); __bitmap_unplug(bitmap); } __bitmap_unplug(mddev->bitmap); *low = lo; *high = hi; md_bitmap_free(bitmap); return rv; } static void bitmap_set_pages(void *data, unsigned long pages) { struct bitmap *bitmap = data; bitmap->counts.pages = pages; } static int bitmap_get_stats(void *data, struct md_bitmap_stats *stats) { struct bitmap_storage *storage; struct bitmap_counts *counts; struct bitmap *bitmap = data; bitmap_super_t *sb; if (!bitmap) return -ENOENT; if (!bitmap->mddev->bitmap_info.external && !bitmap->storage.sb_page) return -EINVAL; sb = kmap_local_page(bitmap->storage.sb_page); stats->sync_size = le64_to_cpu(sb->sync_size); kunmap_local(sb); counts = &bitmap->counts; stats->missing_pages = counts->missing_pages; stats->pages = counts->pages; storage = &bitmap->storage; stats->file_pages = storage->file_pages; stats->file = storage->file; stats->behind_writes = atomic_read(&bitmap->behind_writes); stats->behind_wait = wq_has_sleeper(&bitmap->behind_wait); stats->events_cleared = bitmap->events_cleared; return 0; } static int __bitmap_resize(struct bitmap *bitmap, sector_t blocks, int chunksize, bool init) { /* If chunk_size is 0, choose an appropriate chunk size. * Then possibly allocate new storage space. * Then quiesce, copy bits, replace bitmap, and re-start * * This function is called both to set up the initial bitmap * and to resize the bitmap while the array is active. * If this happens as a result of the array being resized, * chunksize will be zero, and we need to choose a suitable * chunksize, otherwise we use what we are given. */ struct bitmap_storage store; struct bitmap_counts old_counts; unsigned long chunks; sector_t block; sector_t old_blocks, new_blocks; int chunkshift; int ret = 0; long pages; struct bitmap_page *new_bp; if (bitmap->storage.file && !init) { pr_info("md: cannot resize file-based bitmap\n"); return -EINVAL; } if (chunksize == 0) { /* If there is enough space, leave the chunk size unchanged, * else increase by factor of two until there is enough space. */ long bytes; long space = bitmap->mddev->bitmap_info.space; if (space == 0) { /* We don't know how much space there is, so limit * to current size - in sectors. */ bytes = DIV_ROUND_UP(bitmap->counts.chunks, 8); if (!bitmap->mddev->bitmap_info.external) bytes += sizeof(bitmap_super_t); space = DIV_ROUND_UP(bytes, 512); bitmap->mddev->bitmap_info.space = space; } chunkshift = bitmap->counts.chunkshift; chunkshift--; do { /* 'chunkshift' is shift from block size to chunk size */ chunkshift++; chunks = DIV_ROUND_UP_SECTOR_T(blocks, 1 << chunkshift); bytes = DIV_ROUND_UP(chunks, 8); if (!bitmap->mddev->bitmap_info.external) bytes += sizeof(bitmap_super_t); } while (bytes > (space << 9) && (chunkshift + BITMAP_BLOCK_SHIFT) < (BITS_PER_BYTE * sizeof(((bitmap_super_t *)0)->chunksize) - 1)); } else chunkshift = ffz(~chunksize) - BITMAP_BLOCK_SHIFT; chunks = DIV_ROUND_UP_SECTOR_T(blocks, 1 << chunkshift); memset(&store, 0, sizeof(store)); if (bitmap->mddev->bitmap_info.offset || bitmap->mddev->bitmap_info.file) ret = md_bitmap_storage_alloc(&store, chunks, !bitmap->mddev->bitmap_info.external, mddev_is_clustered(bitmap->mddev) ? bitmap->cluster_slot : 0); if (ret) { md_bitmap_file_unmap(&store); goto err; } pages = DIV_ROUND_UP(chunks, PAGE_COUNTER_RATIO); new_bp = kcalloc(pages, sizeof(*new_bp), GFP_KERNEL); ret = -ENOMEM; if (!new_bp) { md_bitmap_file_unmap(&store); goto err; } if (!init) bitmap->mddev->pers->quiesce(bitmap->mddev, 1); store.file = bitmap->storage.file; bitmap->storage.file = NULL; if (store.sb_page && bitmap->storage.sb_page) memcpy(page_address(store.sb_page), page_address(bitmap->storage.sb_page), sizeof(bitmap_super_t)); spin_lock_irq(&bitmap->counts.lock); md_bitmap_file_unmap(&bitmap->storage); bitmap->storage = store; old_counts = bitmap->counts; bitmap->counts.bp = new_bp; bitmap->counts.pages = pages; bitmap->counts.missing_pages = pages; bitmap->counts.chunkshift = chunkshift; bitmap->counts.chunks = chunks; bitmap->mddev->bitmap_info.chunksize = 1UL << (chunkshift + BITMAP_BLOCK_SHIFT); blocks = min(old_counts.chunks << old_counts.chunkshift, chunks << chunkshift); /* For cluster raid, need to pre-allocate bitmap */ if (mddev_is_clustered(bitmap->mddev)) { unsigned long page; for (page = 0; page < pages; page++) { ret = md_bitmap_checkpage(&bitmap->counts, page, 1, 1); if (ret) { unsigned long k; /* deallocate the page memory */ for (k = 0; k < page; k++) { kfree(new_bp[k].map); } kfree(new_bp); /* restore some fields from old_counts */ bitmap->counts.bp = old_counts.bp; bitmap->counts.pages = old_counts.pages; bitmap->counts.missing_pages = old_counts.pages; bitmap->counts.chunkshift = old_counts.chunkshift; bitmap->counts.chunks = old_counts.chunks; bitmap->mddev->bitmap_info.chunksize = 1UL << (old_counts.chunkshift + BITMAP_BLOCK_SHIFT); blocks = old_counts.chunks << old_counts.chunkshift; pr_warn("Could not pre-allocate in-memory bitmap for cluster raid\n"); break; } else bitmap->counts.bp[page].count += 1; } } for (block = 0; block < blocks; ) { bitmap_counter_t *bmc_old, *bmc_new; int set; bmc_old = md_bitmap_get_counter(&old_counts, block, &old_blocks, 0); set = bmc_old && NEEDED(*bmc_old); if (set) { bmc_new = md_bitmap_get_counter(&bitmap->counts, block, &new_blocks, 1); if (bmc_new) { if (*bmc_new == 0) { /* need to set on-disk bits too. */ sector_t end = block + new_blocks; sector_t start = block >> chunkshift; start <<= chunkshift; while (start < end) { md_bitmap_file_set_bit(bitmap, block); start += 1 << chunkshift; } *bmc_new = 2; md_bitmap_count_page(&bitmap->counts, block, 1); md_bitmap_set_pending(&bitmap->counts, block); } *bmc_new |= NEEDED_MASK; } if (new_blocks < old_blocks) old_blocks = new_blocks; } block += old_blocks; } if (bitmap->counts.bp != old_counts.bp) { unsigned long k; for (k = 0; k < old_counts.pages; k++) if (!old_counts.bp[k].hijacked) kfree(old_counts.bp[k].map); kfree(old_counts.bp); } if (!init) { int i; while (block < (chunks << chunkshift)) { bitmap_counter_t *bmc; bmc = md_bitmap_get_counter(&bitmap->counts, block, &new_blocks, 1); if (bmc) { /* new space. It needs to be resynced, so * we set NEEDED_MASK. */ if (*bmc == 0) { *bmc = NEEDED_MASK | 2; md_bitmap_count_page(&bitmap->counts, block, 1); md_bitmap_set_pending(&bitmap->counts, block); } } block += new_blocks; } for (i = 0; i < bitmap->storage.file_pages; i++) set_page_attr(bitmap, i, BITMAP_PAGE_DIRTY); } spin_unlock_irq(&bitmap->counts.lock); if (!init) { __bitmap_unplug(bitmap); bitmap->mddev->pers->quiesce(bitmap->mddev, 0); } ret = 0; err: return ret; } static int bitmap_resize(struct mddev *mddev, sector_t blocks, int chunksize, bool init) { struct bitmap *bitmap = mddev->bitmap; if (!bitmap) return 0; return __bitmap_resize(bitmap, blocks, chunksize, init); } static ssize_t location_show(struct mddev *mddev, char *page) { ssize_t len; if (mddev->bitmap_info.file) len = sprintf(page, "file"); else if (mddev->bitmap_info.offset) len = sprintf(page, "%+lld", (long long)mddev->bitmap_info.offset); else len = sprintf(page, "none"); len += sprintf(page+len, "\n"); return len; } static ssize_t location_store(struct mddev *mddev, const char *buf, size_t len) { int rv; rv = mddev_suspend_and_lock(mddev); if (rv) return rv; if (mddev->pers) { if (mddev->recovery || mddev->sync_thread) { rv = -EBUSY; goto out; } } if (mddev->bitmap || mddev->bitmap_info.file || mddev->bitmap_info.offset) { /* bitmap already configured. Only option is to clear it */ if (strncmp(buf, "none", 4) != 0) { rv = -EBUSY; goto out; } bitmap_destroy(mddev); mddev->bitmap_info.offset = 0; if (mddev->bitmap_info.file) { struct file *f = mddev->bitmap_info.file; mddev->bitmap_info.file = NULL; fput(f); } } else { /* No bitmap, OK to set a location */ long long offset; if (strncmp(buf, "none", 4) == 0) /* nothing to be done */; else if (strncmp(buf, "file:", 5) == 0) { /* Not supported yet */ rv = -EINVAL; goto out; } else { if (buf[0] == '+') rv = kstrtoll(buf+1, 10, &offset); else rv = kstrtoll(buf, 10, &offset); if (rv) goto out; if (offset == 0) { rv = -EINVAL; goto out; } if (mddev->bitmap_info.external == 0 && mddev->major_version == 0 && offset != mddev->bitmap_info.default_offset) { rv = -EINVAL; goto out; } mddev->bitmap_info.offset = offset; rv = bitmap_create(mddev, -1); if (rv) goto out; rv = bitmap_load(mddev); if (rv) { mddev->bitmap_info.offset = 0; bitmap_destroy(mddev); goto out; } } } if (!mddev->external) { /* Ensure new bitmap info is stored in * metadata promptly. */ set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); md_wakeup_thread(mddev->thread); } rv = 0; out: mddev_unlock_and_resume(mddev); if (rv) return rv; return len; } static struct md_sysfs_entry bitmap_location = __ATTR(location, S_IRUGO|S_IWUSR, location_show, location_store); /* 'bitmap/space' is the space available at 'location' for the * bitmap. This allows the kernel to know when it is safe to * resize the bitmap to match a resized array. */ static ssize_t space_show(struct mddev *mddev, char *page) { return sprintf(page, "%lu\n", mddev->bitmap_info.space); } static ssize_t space_store(struct mddev *mddev, const char *buf, size_t len) { struct bitmap *bitmap; unsigned long sectors; int rv; rv = kstrtoul(buf, 10, &sectors); if (rv) return rv; if (sectors == 0) return -EINVAL; bitmap = mddev->bitmap; if (bitmap && sectors < (bitmap->storage.bytes + 511) >> 9) return -EFBIG; /* Bitmap is too big for this small space */ /* could make sure it isn't too big, but that isn't really * needed - user-space should be careful. */ mddev->bitmap_info.space = sectors; return len; } static struct md_sysfs_entry bitmap_space = __ATTR(space, S_IRUGO|S_IWUSR, space_show, space_store); static ssize_t timeout_show(struct mddev *mddev, char *page) { ssize_t len; unsigned long secs = mddev->bitmap_info.daemon_sleep / HZ; unsigned long jifs = mddev->bitmap_info.daemon_sleep % HZ; len = sprintf(page, "%lu", secs); if (jifs) len += sprintf(page+len, ".%03u", jiffies_to_msecs(jifs)); len += sprintf(page+len, "\n"); return len; } static ssize_t timeout_store(struct mddev *mddev, const char *buf, size_t len) { /* timeout can be set at any time */ unsigned long timeout; int rv = strict_strtoul_scaled(buf, &timeout, 4); if (rv) return rv; /* just to make sure we don't overflow... */ if (timeout >= LONG_MAX / HZ) return -EINVAL; timeout = timeout * HZ / 10000; if (timeout >= MAX_SCHEDULE_TIMEOUT) timeout = MAX_SCHEDULE_TIMEOUT-1; if (timeout < 1) timeout = 1; mddev->bitmap_info.daemon_sleep = timeout; mddev_set_timeout(mddev, timeout, false); md_wakeup_thread(mddev->thread); return len; } static struct md_sysfs_entry bitmap_timeout = __ATTR(time_base, S_IRUGO|S_IWUSR, timeout_show, timeout_store); static ssize_t backlog_show(struct mddev *mddev, char *page) { return sprintf(page, "%lu\n", mddev->bitmap_info.max_write_behind); } static ssize_t backlog_store(struct mddev *mddev, const char *buf, size_t len) { unsigned long backlog; unsigned long old_mwb = mddev->bitmap_info.max_write_behind; struct md_rdev *rdev; bool has_write_mostly = false; int rv = kstrtoul(buf, 10, &backlog); if (rv) return rv; if (backlog > COUNTER_MAX) return -EINVAL; rv = mddev_suspend_and_lock(mddev); if (rv) return rv; /* * Without write mostly device, it doesn't make sense to set * backlog for max_write_behind. */ rdev_for_each(rdev, mddev) { if (test_bit(WriteMostly, &rdev->flags)) { has_write_mostly = true; break; } } if (!has_write_mostly) { pr_warn_ratelimited("%s: can't set backlog, no write mostly device available\n", mdname(mddev)); mddev_unlock(mddev); return -EINVAL; } mddev->bitmap_info.max_write_behind = backlog; if (!backlog && mddev->serial_info_pool) { /* serial_info_pool is not needed if backlog is zero */ if (!mddev->serialize_policy) mddev_destroy_serial_pool(mddev, NULL); } else if (backlog && !mddev->serial_info_pool) { /* serial_info_pool is needed since backlog is not zero */ rdev_for_each(rdev, mddev) mddev_create_serial_pool(mddev, rdev); } if (old_mwb != backlog) bitmap_update_sb(mddev->bitmap); mddev_unlock_and_resume(mddev); return len; } static struct md_sysfs_entry bitmap_backlog = __ATTR(backlog, S_IRUGO|S_IWUSR, backlog_show, backlog_store); static ssize_t chunksize_show(struct mddev *mddev, char *page) { return sprintf(page, "%lu\n", mddev->bitmap_info.chunksize); } static ssize_t chunksize_store(struct mddev *mddev, const char *buf, size_t len) { /* Can only be changed when no bitmap is active */ int rv; unsigned long csize; if (mddev->bitmap) return -EBUSY; rv = kstrtoul(buf, 10, &csize); if (rv) return rv; if (csize < 512 || !is_power_of_2(csize)) return -EINVAL; if (BITS_PER_LONG > 32 && csize >= (1ULL << (BITS_PER_BYTE * sizeof(((bitmap_super_t *)0)->chunksize)))) return -EOVERFLOW; mddev->bitmap_info.chunksize = csize; return len; } static struct md_sysfs_entry bitmap_chunksize = __ATTR(chunksize, S_IRUGO|S_IWUSR, chunksize_show, chunksize_store); static ssize_t metadata_show(struct mddev *mddev, char *page) { if (mddev_is_clustered(mddev)) return sprintf(page, "clustered\n"); return sprintf(page, "%s\n", (mddev->bitmap_info.external ? "external" : "internal")); } static ssize_t metadata_store(struct mddev *mddev, const char *buf, size_t len) { if (mddev->bitmap || mddev->bitmap_info.file || mddev->bitmap_info.offset) return -EBUSY; if (strncmp(buf, "external", 8) == 0) mddev->bitmap_info.external = 1; else if ((strncmp(buf, "internal", 8) == 0) || (strncmp(buf, "clustered", 9) == 0)) mddev->bitmap_info.external = 0; else return -EINVAL; return len; } static struct md_sysfs_entry bitmap_metadata = __ATTR(metadata, S_IRUGO|S_IWUSR, metadata_show, metadata_store); static ssize_t can_clear_show(struct mddev *mddev, char *page) { int len; struct bitmap *bitmap; spin_lock(&mddev->lock); bitmap = mddev->bitmap; if (bitmap) len = sprintf(page, "%s\n", (bitmap->need_sync ? "false" : "true")); else len = sprintf(page, "\n"); spin_unlock(&mddev->lock); return len; } static ssize_t can_clear_store(struct mddev *mddev, const char *buf, size_t len) { struct bitmap *bitmap = mddev->bitmap; if (!bitmap) return -ENOENT; if (strncmp(buf, "false", 5) == 0) { bitmap->need_sync = 1; return len; } if (strncmp(buf, "true", 4) == 0) { if (mddev->degraded) return -EBUSY; bitmap->need_sync = 0; return len; } return -EINVAL; } static struct md_sysfs_entry bitmap_can_clear = __ATTR(can_clear, S_IRUGO|S_IWUSR, can_clear_show, can_clear_store); static ssize_t behind_writes_used_show(struct mddev *mddev, char *page) { ssize_t ret; struct bitmap *bitmap; spin_lock(&mddev->lock); bitmap = mddev->bitmap; if (!bitmap) ret = sprintf(page, "0\n"); else ret = sprintf(page, "%lu\n", bitmap->behind_writes_used); spin_unlock(&mddev->lock); return ret; } static ssize_t behind_writes_used_reset(struct mddev *mddev, const char *buf, size_t len) { struct bitmap *bitmap = mddev->bitmap; if (bitmap) bitmap->behind_writes_used = 0; return len; } static struct md_sysfs_entry max_backlog_used = __ATTR(max_backlog_used, S_IRUGO | S_IWUSR, behind_writes_used_show, behind_writes_used_reset); static struct attribute *md_bitmap_attrs[] = { &bitmap_location.attr, &bitmap_space.attr, &bitmap_timeout.attr, &bitmap_backlog.attr, &bitmap_chunksize.attr, &bitmap_metadata.attr, &bitmap_can_clear.attr, &max_backlog_used.attr, NULL }; const struct attribute_group md_bitmap_group = { .name = "bitmap", .attrs = md_bitmap_attrs, }; static struct bitmap_operations bitmap_ops = { .enabled = bitmap_enabled, .create = bitmap_create, .resize = bitmap_resize, .load = bitmap_load, .destroy = bitmap_destroy, .flush = bitmap_flush, .write_all = bitmap_write_all, .dirty_bits = bitmap_dirty_bits, .unplug = bitmap_unplug, .daemon_work = bitmap_daemon_work, .start_behind_write = bitmap_start_behind_write, .end_behind_write = bitmap_end_behind_write, .wait_behind_writes = bitmap_wait_behind_writes, .startwrite = bitmap_startwrite, .endwrite = bitmap_endwrite, .start_sync = bitmap_start_sync, .end_sync = bitmap_end_sync, .cond_end_sync = bitmap_cond_end_sync, .close_sync = bitmap_close_sync, .update_sb = bitmap_update_sb, .get_stats = bitmap_get_stats, .sync_with_cluster = bitmap_sync_with_cluster, .get_from_slot = bitmap_get_from_slot, .copy_from_slot = bitmap_copy_from_slot, .set_pages = bitmap_set_pages, .free = md_bitmap_free, }; void mddev_set_bitmap_ops(struct mddev *mddev) { mddev->bitmap_ops = &bitmap_ops; }
1 4 8 5 31 2 10 4 33 33 26 8 2 31 32 23 23 32 33 4 29 33 1 1 33 33 33 6 1 5 3 2 18 18 18 11 3 1 8 1 6 7 2 6 1 2 4 5 12 7 7 3 25 6 13 5 1 9 59 10 1 1 1 1 9 2 2 7 7 4 2 6 2 18 9 11 9 3 17 4 13 3 3 3 1 2 5 1 4 40 40 28 2 8 13 29 40 40 27 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 // SPDX-License-Identifier: GPL-2.0 /* * The USB Monitor, inspired by Dave Harding's USBMon. * * This is a binary format reader. * * Copyright (C) 2006 Paolo Abeni (paolo.abeni@email.it) * Copyright (C) 2006,2007 Pete Zaitcev (zaitcev@redhat.com) */ #include <linux/kernel.h> #include <linux/sched/signal.h> #include <linux/types.h> #include <linux/fs.h> #include <linux/cdev.h> #include <linux/export.h> #include <linux/usb.h> #include <linux/poll.h> #include <linux/compat.h> #include <linux/mm.h> #include <linux/scatterlist.h> #include <linux/slab.h> #include <linux/time64.h> #include <linux/uaccess.h> #include "usb_mon.h" /* * Defined by USB 2.0 clause 9.3, table 9.2. */ #define SETUP_LEN 8 /* ioctl macros */ #define MON_IOC_MAGIC 0x92 #define MON_IOCQ_URB_LEN _IO(MON_IOC_MAGIC, 1) /* #2 used to be MON_IOCX_URB, removed before it got into Linus tree */ #define MON_IOCG_STATS _IOR(MON_IOC_MAGIC, 3, struct mon_bin_stats) #define MON_IOCT_RING_SIZE _IO(MON_IOC_MAGIC, 4) #define MON_IOCQ_RING_SIZE _IO(MON_IOC_MAGIC, 5) #define MON_IOCX_GET _IOW(MON_IOC_MAGIC, 6, struct mon_bin_get) #define MON_IOCX_MFETCH _IOWR(MON_IOC_MAGIC, 7, struct mon_bin_mfetch) #define MON_IOCH_MFLUSH _IO(MON_IOC_MAGIC, 8) /* #9 was MON_IOCT_SETAPI */ #define MON_IOCX_GETX _IOW(MON_IOC_MAGIC, 10, struct mon_bin_get) #ifdef CONFIG_COMPAT #define MON_IOCX_GET32 _IOW(MON_IOC_MAGIC, 6, struct mon_bin_get32) #define MON_IOCX_MFETCH32 _IOWR(MON_IOC_MAGIC, 7, struct mon_bin_mfetch32) #define MON_IOCX_GETX32 _IOW(MON_IOC_MAGIC, 10, struct mon_bin_get32) #endif /* * Some architectures have enormous basic pages (16KB for ia64, 64KB for ppc). * But it's all right. Just use a simple way to make sure the chunk is never * smaller than a page. * * N.B. An application does not know our chunk size. * * Woops, get_zeroed_page() returns a single page. I guess we're stuck with * page-sized chunks for the time being. */ #define CHUNK_SIZE PAGE_SIZE #define CHUNK_ALIGN(x) (((x)+CHUNK_SIZE-1) & ~(CHUNK_SIZE-1)) /* * The magic limit was calculated so that it allows the monitoring * application to pick data once in two ticks. This way, another application, * which presumably drives the bus, gets to hog CPU, yet we collect our data. * If HZ is 100, a 480 mbit/s bus drives 614 KB every jiffy. USB has an * enormous overhead built into the bus protocol, so we need about 1000 KB. * * This is still too much for most cases, where we just snoop a few * descriptor fetches for enumeration. So, the default is a "reasonable" * amount for systems with HZ=250 and incomplete bus saturation. * * XXX What about multi-megabyte URBs which take minutes to transfer? */ #define BUFF_MAX CHUNK_ALIGN(1200*1024) #define BUFF_DFL CHUNK_ALIGN(300*1024) #define BUFF_MIN CHUNK_ALIGN(8*1024) /* * The per-event API header (2 per URB). * * This structure is seen in userland as defined by the documentation. */ struct mon_bin_hdr { u64 id; /* URB ID - from submission to callback */ unsigned char type; /* Same as in text API; extensible. */ unsigned char xfer_type; /* ISO, Intr, Control, Bulk */ unsigned char epnum; /* Endpoint number and transfer direction */ unsigned char devnum; /* Device address */ unsigned short busnum; /* Bus number */ char flag_setup; char flag_data; s64 ts_sec; /* ktime_get_real_ts64 */ s32 ts_usec; /* ktime_get_real_ts64 */ int status; unsigned int len_urb; /* Length of data (submitted or actual) */ unsigned int len_cap; /* Delivered length */ union { unsigned char setup[SETUP_LEN]; /* Only for Control S-type */ struct iso_rec { int error_count; int numdesc; } iso; } s; int interval; int start_frame; unsigned int xfer_flags; unsigned int ndesc; /* Actual number of ISO descriptors */ }; /* * ISO vector, packed into the head of data stream. * This has to take 16 bytes to make sure that the end of buffer * wrap is not happening in the middle of a descriptor. */ struct mon_bin_isodesc { int iso_status; unsigned int iso_off; unsigned int iso_len; u32 _pad; }; /* per file statistic */ struct mon_bin_stats { u32 queued; u32 dropped; }; struct mon_bin_get { struct mon_bin_hdr __user *hdr; /* Can be 48 bytes or 64. */ void __user *data; size_t alloc; /* Length of data (can be zero) */ }; struct mon_bin_mfetch { u32 __user *offvec; /* Vector of events fetched */ u32 nfetch; /* Number of events to fetch (out: fetched) */ u32 nflush; /* Number of events to flush */ }; #ifdef CONFIG_COMPAT struct mon_bin_get32 { u32 hdr32; u32 data32; u32 alloc32; }; struct mon_bin_mfetch32 { u32 offvec32; u32 nfetch32; u32 nflush32; }; #endif /* Having these two values same prevents wrapping of the mon_bin_hdr */ #define PKT_ALIGN 64 #define PKT_SIZE 64 #define PKT_SZ_API0 48 /* API 0 (2.6.20) size */ #define PKT_SZ_API1 64 /* API 1 size: extra fields */ #define ISODESC_MAX 128 /* Same number as usbfs allows, 2048 bytes. */ /* max number of USB bus supported */ #define MON_BIN_MAX_MINOR 128 /* * The buffer: map of used pages. */ struct mon_pgmap { struct page *pg; unsigned char *ptr; /* XXX just use page_to_virt everywhere? */ }; /* * This gets associated with an open file struct. */ struct mon_reader_bin { /* The buffer: one per open. */ spinlock_t b_lock; /* Protect b_cnt, b_in */ unsigned int b_size; /* Current size of the buffer - bytes */ unsigned int b_cnt; /* Bytes used */ unsigned int b_in, b_out; /* Offsets into buffer - bytes */ unsigned int b_read; /* Amount of read data in curr. pkt. */ struct mon_pgmap *b_vec; /* The map array */ wait_queue_head_t b_wait; /* Wait for data here */ struct mutex fetch_lock; /* Protect b_read, b_out */ int mmap_active; /* A list of these is needed for "bus 0". Some time later. */ struct mon_reader r; /* Stats */ unsigned int cnt_lost; }; static inline struct mon_bin_hdr *MON_OFF2HDR(const struct mon_reader_bin *rp, unsigned int offset) { return (struct mon_bin_hdr *) (rp->b_vec[offset / CHUNK_SIZE].ptr + offset % CHUNK_SIZE); } #define MON_RING_EMPTY(rp) ((rp)->b_cnt == 0) static unsigned char xfer_to_pipe[4] = { PIPE_CONTROL, PIPE_ISOCHRONOUS, PIPE_BULK, PIPE_INTERRUPT }; static const struct class mon_bin_class = { .name = "usbmon", }; static dev_t mon_bin_dev0; static struct cdev mon_bin_cdev; static void mon_buff_area_fill(const struct mon_reader_bin *rp, unsigned int offset, unsigned int size); static int mon_bin_wait_event(struct file *file, struct mon_reader_bin *rp); static int mon_alloc_buff(struct mon_pgmap *map, int npages); static void mon_free_buff(struct mon_pgmap *map, int npages); /* * This is a "chunked memcpy". It does not manipulate any counters. */ static unsigned int mon_copy_to_buff(const struct mon_reader_bin *this, unsigned int off, const unsigned char *from, unsigned int length) { unsigned int step_len; unsigned char *buf; unsigned int in_page; while (length) { /* * Determine step_len. */ step_len = length; in_page = CHUNK_SIZE - (off & (CHUNK_SIZE-1)); if (in_page < step_len) step_len = in_page; /* * Copy data and advance pointers. */ buf = this->b_vec[off / CHUNK_SIZE].ptr + off % CHUNK_SIZE; memcpy(buf, from, step_len); if ((off += step_len) >= this->b_size) off = 0; from += step_len; length -= step_len; } return off; } /* * This is a little worse than the above because it's "chunked copy_to_user". * The return value is an error code, not an offset. */ static int copy_from_buf(const struct mon_reader_bin *this, unsigned int off, char __user *to, int length) { unsigned int step_len; unsigned char *buf; unsigned int in_page; while (length) { /* * Determine step_len. */ step_len = length; in_page = CHUNK_SIZE - (off & (CHUNK_SIZE-1)); if (in_page < step_len) step_len = in_page; /* * Copy data and advance pointers. */ buf = this->b_vec[off / CHUNK_SIZE].ptr + off % CHUNK_SIZE; if (copy_to_user(to, buf, step_len)) return -EINVAL; if ((off += step_len) >= this->b_size) off = 0; to += step_len; length -= step_len; } return 0; } /* * Allocate an (aligned) area in the buffer. * This is called under b_lock. * Returns ~0 on failure. */ static unsigned int mon_buff_area_alloc(struct mon_reader_bin *rp, unsigned int size) { unsigned int offset; size = (size + PKT_ALIGN-1) & ~(PKT_ALIGN-1); if (rp->b_cnt + size > rp->b_size) return ~0; offset = rp->b_in; rp->b_cnt += size; if ((rp->b_in += size) >= rp->b_size) rp->b_in -= rp->b_size; return offset; } /* * This is the same thing as mon_buff_area_alloc, only it does not allow * buffers to wrap. This is needed by applications which pass references * into mmap-ed buffers up their stacks (libpcap can do that). * * Currently, we always have the header stuck with the data, although * it is not strictly speaking necessary. * * When a buffer would wrap, we place a filler packet to mark the space. */ static unsigned int mon_buff_area_alloc_contiguous(struct mon_reader_bin *rp, unsigned int size) { unsigned int offset; unsigned int fill_size; size = (size + PKT_ALIGN-1) & ~(PKT_ALIGN-1); if (rp->b_cnt + size > rp->b_size) return ~0; if (rp->b_in + size > rp->b_size) { /* * This would wrap. Find if we still have space after * skipping to the end of the buffer. If we do, place * a filler packet and allocate a new packet. */ fill_size = rp->b_size - rp->b_in; if (rp->b_cnt + size + fill_size > rp->b_size) return ~0; mon_buff_area_fill(rp, rp->b_in, fill_size); offset = 0; rp->b_in = size; rp->b_cnt += size + fill_size; } else if (rp->b_in + size == rp->b_size) { offset = rp->b_in; rp->b_in = 0; rp->b_cnt += size; } else { offset = rp->b_in; rp->b_in += size; rp->b_cnt += size; } return offset; } /* * Return a few (kilo-)bytes to the head of the buffer. * This is used if a data fetch fails. */ static void mon_buff_area_shrink(struct mon_reader_bin *rp, unsigned int size) { /* size &= ~(PKT_ALIGN-1); -- we're called with aligned size */ rp->b_cnt -= size; if (rp->b_in < size) rp->b_in += rp->b_size; rp->b_in -= size; } /* * This has to be called under both b_lock and fetch_lock, because * it accesses both b_cnt and b_out. */ static void mon_buff_area_free(struct mon_reader_bin *rp, unsigned int size) { size = (size + PKT_ALIGN-1) & ~(PKT_ALIGN-1); rp->b_cnt -= size; if ((rp->b_out += size) >= rp->b_size) rp->b_out -= rp->b_size; } static void mon_buff_area_fill(const struct mon_reader_bin *rp, unsigned int offset, unsigned int size) { struct mon_bin_hdr *ep; ep = MON_OFF2HDR(rp, offset); memset(ep, 0, PKT_SIZE); ep->type = '@'; ep->len_cap = size - PKT_SIZE; } static inline char mon_bin_get_setup(unsigned char *setupb, const struct urb *urb, char ev_type) { if (urb->setup_packet == NULL) return 'Z'; memcpy(setupb, urb->setup_packet, SETUP_LEN); return 0; } static unsigned int mon_bin_get_data(const struct mon_reader_bin *rp, unsigned int offset, struct urb *urb, unsigned int length, char *flag) { int i; struct scatterlist *sg; unsigned int this_len; *flag = 0; if (urb->num_sgs == 0) { if (urb->transfer_buffer == NULL) { *flag = 'Z'; return length; } mon_copy_to_buff(rp, offset, urb->transfer_buffer, length); length = 0; } else { /* If IOMMU coalescing occurred, we cannot trust sg_page */ if (urb->transfer_flags & URB_DMA_SG_COMBINED) { *flag = 'D'; return length; } /* Copy up to the first non-addressable segment */ for_each_sg(urb->sg, sg, urb->num_sgs, i) { if (length == 0 || PageHighMem(sg_page(sg))) break; this_len = min_t(unsigned int, sg->length, length); offset = mon_copy_to_buff(rp, offset, sg_virt(sg), this_len); length -= this_len; } if (i == 0) *flag = 'D'; } return length; } /* * This is the look-ahead pass in case of 'C Zi', when actual_length cannot * be used to determine the length of the whole contiguous buffer. */ static unsigned int mon_bin_collate_isodesc(const struct mon_reader_bin *rp, struct urb *urb, unsigned int ndesc) { struct usb_iso_packet_descriptor *fp; unsigned int length; length = 0; fp = urb->iso_frame_desc; while (ndesc-- != 0) { if (fp->actual_length != 0) { if (fp->offset + fp->actual_length > length) length = fp->offset + fp->actual_length; } fp++; } return length; } static void mon_bin_get_isodesc(const struct mon_reader_bin *rp, unsigned int offset, struct urb *urb, char ev_type, unsigned int ndesc) { struct mon_bin_isodesc *dp; struct usb_iso_packet_descriptor *fp; fp = urb->iso_frame_desc; while (ndesc-- != 0) { dp = (struct mon_bin_isodesc *) (rp->b_vec[offset / CHUNK_SIZE].ptr + offset % CHUNK_SIZE); dp->iso_status = fp->status; dp->iso_off = fp->offset; dp->iso_len = (ev_type == 'S') ? fp->length : fp->actual_length; dp->_pad = 0; if ((offset += sizeof(struct mon_bin_isodesc)) >= rp->b_size) offset = 0; fp++; } } static void mon_bin_event(struct mon_reader_bin *rp, struct urb *urb, char ev_type, int status) { const struct usb_endpoint_descriptor *epd = &urb->ep->desc; struct timespec64 ts; unsigned long flags; unsigned int urb_length; unsigned int offset; unsigned int length; unsigned int delta; unsigned int ndesc, lendesc; unsigned char dir; struct mon_bin_hdr *ep; char data_tag = 0; ktime_get_real_ts64(&ts); spin_lock_irqsave(&rp->b_lock, flags); /* * Find the maximum allowable length, then allocate space. */ urb_length = (ev_type == 'S') ? urb->transfer_buffer_length : urb->actual_length; length = urb_length; if (usb_endpoint_xfer_isoc(epd)) { if (urb->number_of_packets < 0) { ndesc = 0; } else if (urb->number_of_packets >= ISODESC_MAX) { ndesc = ISODESC_MAX; } else { ndesc = urb->number_of_packets; } if (ev_type == 'C' && usb_urb_dir_in(urb)) length = mon_bin_collate_isodesc(rp, urb, ndesc); } else { ndesc = 0; } lendesc = ndesc*sizeof(struct mon_bin_isodesc); /* not an issue unless there's a subtle bug in a HCD somewhere */ if (length >= urb->transfer_buffer_length) length = urb->transfer_buffer_length; if (length >= rp->b_size/5) length = rp->b_size/5; if (usb_urb_dir_in(urb)) { if (ev_type == 'S') { length = 0; data_tag = '<'; } /* Cannot rely on endpoint number in case of control ep.0 */ dir = USB_DIR_IN; } else { if (ev_type == 'C') { length = 0; data_tag = '>'; } dir = 0; } if (rp->mmap_active) { offset = mon_buff_area_alloc_contiguous(rp, length + PKT_SIZE + lendesc); } else { offset = mon_buff_area_alloc(rp, length + PKT_SIZE + lendesc); } if (offset == ~0) { rp->cnt_lost++; spin_unlock_irqrestore(&rp->b_lock, flags); return; } ep = MON_OFF2HDR(rp, offset); if ((offset += PKT_SIZE) >= rp->b_size) offset = 0; /* * Fill the allocated area. */ memset(ep, 0, PKT_SIZE); ep->type = ev_type; ep->xfer_type = xfer_to_pipe[usb_endpoint_type(epd)]; ep->epnum = dir | usb_endpoint_num(epd); ep->devnum = urb->dev->devnum; ep->busnum = urb->dev->bus->busnum; ep->id = (unsigned long) urb; ep->ts_sec = ts.tv_sec; ep->ts_usec = ts.tv_nsec / NSEC_PER_USEC; ep->status = status; ep->len_urb = urb_length; ep->len_cap = length + lendesc; ep->xfer_flags = urb->transfer_flags; if (usb_endpoint_xfer_int(epd)) { ep->interval = urb->interval; } else if (usb_endpoint_xfer_isoc(epd)) { ep->interval = urb->interval; ep->start_frame = urb->start_frame; ep->s.iso.error_count = urb->error_count; ep->s.iso.numdesc = urb->number_of_packets; } if (usb_endpoint_xfer_control(epd) && ev_type == 'S') { ep->flag_setup = mon_bin_get_setup(ep->s.setup, urb, ev_type); } else { ep->flag_setup = '-'; } if (ndesc != 0) { ep->ndesc = ndesc; mon_bin_get_isodesc(rp, offset, urb, ev_type, ndesc); if ((offset += lendesc) >= rp->b_size) offset -= rp->b_size; } if (length != 0) { length = mon_bin_get_data(rp, offset, urb, length, &ep->flag_data); if (length > 0) { delta = (ep->len_cap + PKT_ALIGN-1) & ~(PKT_ALIGN-1); ep->len_cap -= length; delta -= (ep->len_cap + PKT_ALIGN-1) & ~(PKT_ALIGN-1); mon_buff_area_shrink(rp, delta); } } else { ep->flag_data = data_tag; } spin_unlock_irqrestore(&rp->b_lock, flags); wake_up(&rp->b_wait); } static void mon_bin_submit(void *data, struct urb *urb) { struct mon_reader_bin *rp = data; mon_bin_event(rp, urb, 'S', -EINPROGRESS); } static void mon_bin_complete(void *data, struct urb *urb, int status) { struct mon_reader_bin *rp = data; mon_bin_event(rp, urb, 'C', status); } static void mon_bin_error(void *data, struct urb *urb, int error) { struct mon_reader_bin *rp = data; struct timespec64 ts; unsigned long flags; unsigned int offset; struct mon_bin_hdr *ep; ktime_get_real_ts64(&ts); spin_lock_irqsave(&rp->b_lock, flags); offset = mon_buff_area_alloc(rp, PKT_SIZE); if (offset == ~0) { /* Not incrementing cnt_lost. Just because. */ spin_unlock_irqrestore(&rp->b_lock, flags); return; } ep = MON_OFF2HDR(rp, offset); memset(ep, 0, PKT_SIZE); ep->type = 'E'; ep->xfer_type = xfer_to_pipe[usb_endpoint_type(&urb->ep->desc)]; ep->epnum = usb_urb_dir_in(urb) ? USB_DIR_IN : 0; ep->epnum |= usb_endpoint_num(&urb->ep->desc); ep->devnum = urb->dev->devnum; ep->busnum = urb->dev->bus->busnum; ep->id = (unsigned long) urb; ep->ts_sec = ts.tv_sec; ep->ts_usec = ts.tv_nsec / NSEC_PER_USEC; ep->status = error; ep->flag_setup = '-'; ep->flag_data = 'E'; spin_unlock_irqrestore(&rp->b_lock, flags); wake_up(&rp->b_wait); } static int mon_bin_open(struct inode *inode, struct file *file) { struct mon_bus *mbus; struct mon_reader_bin *rp; size_t size; int rc; mutex_lock(&mon_lock); mbus = mon_bus_lookup(iminor(inode)); if (mbus == NULL) { mutex_unlock(&mon_lock); return -ENODEV; } if (mbus != &mon_bus0 && mbus->u_bus == NULL) { printk(KERN_ERR TAG ": consistency error on open\n"); mutex_unlock(&mon_lock); return -ENODEV; } rp = kzalloc(sizeof(struct mon_reader_bin), GFP_KERNEL); if (rp == NULL) { rc = -ENOMEM; goto err_alloc; } spin_lock_init(&rp->b_lock); init_waitqueue_head(&rp->b_wait); mutex_init(&rp->fetch_lock); rp->b_size = BUFF_DFL; size = sizeof(struct mon_pgmap) * (rp->b_size/CHUNK_SIZE); if ((rp->b_vec = kzalloc(size, GFP_KERNEL)) == NULL) { rc = -ENOMEM; goto err_allocvec; } if ((rc = mon_alloc_buff(rp->b_vec, rp->b_size/CHUNK_SIZE)) < 0) goto err_allocbuff; rp->r.m_bus = mbus; rp->r.r_data = rp; rp->r.rnf_submit = mon_bin_submit; rp->r.rnf_error = mon_bin_error; rp->r.rnf_complete = mon_bin_complete; mon_reader_add(mbus, &rp->r); file->private_data = rp; mutex_unlock(&mon_lock); return 0; err_allocbuff: kfree(rp->b_vec); err_allocvec: kfree(rp); err_alloc: mutex_unlock(&mon_lock); return rc; } /* * Extract an event from buffer and copy it to user space. * Wait if there is no event ready. * Returns zero or error. */ static int mon_bin_get_event(struct file *file, struct mon_reader_bin *rp, struct mon_bin_hdr __user *hdr, unsigned int hdrbytes, void __user *data, unsigned int nbytes) { unsigned long flags; struct mon_bin_hdr *ep; size_t step_len; unsigned int offset; int rc; mutex_lock(&rp->fetch_lock); if ((rc = mon_bin_wait_event(file, rp)) < 0) { mutex_unlock(&rp->fetch_lock); return rc; } ep = MON_OFF2HDR(rp, rp->b_out); if (copy_to_user(hdr, ep, hdrbytes)) { mutex_unlock(&rp->fetch_lock); return -EFAULT; } step_len = min(ep->len_cap, nbytes); if ((offset = rp->b_out + PKT_SIZE) >= rp->b_size) offset = 0; if (copy_from_buf(rp, offset, data, step_len)) { mutex_unlock(&rp->fetch_lock); return -EFAULT; } spin_lock_irqsave(&rp->b_lock, flags); mon_buff_area_free(rp, PKT_SIZE + ep->len_cap); spin_unlock_irqrestore(&rp->b_lock, flags); rp->b_read = 0; mutex_unlock(&rp->fetch_lock); return 0; } static int mon_bin_release(struct inode *inode, struct file *file) { struct mon_reader_bin *rp = file->private_data; struct mon_bus* mbus = rp->r.m_bus; mutex_lock(&mon_lock); if (mbus->nreaders <= 0) { printk(KERN_ERR TAG ": consistency error on close\n"); mutex_unlock(&mon_lock); return 0; } mon_reader_del(mbus, &rp->r); mon_free_buff(rp->b_vec, rp->b_size/CHUNK_SIZE); kfree(rp->b_vec); kfree(rp); mutex_unlock(&mon_lock); return 0; } static ssize_t mon_bin_read(struct file *file, char __user *buf, size_t nbytes, loff_t *ppos) { struct mon_reader_bin *rp = file->private_data; unsigned int hdrbytes = PKT_SZ_API0; unsigned long flags; struct mon_bin_hdr *ep; unsigned int offset; size_t step_len; char *ptr; ssize_t done = 0; int rc; mutex_lock(&rp->fetch_lock); if ((rc = mon_bin_wait_event(file, rp)) < 0) { mutex_unlock(&rp->fetch_lock); return rc; } ep = MON_OFF2HDR(rp, rp->b_out); if (rp->b_read < hdrbytes) { step_len = min_t(size_t, nbytes, hdrbytes - rp->b_read); ptr = ((char *)ep) + rp->b_read; if (step_len && copy_to_user(buf, ptr, step_len)) { mutex_unlock(&rp->fetch_lock); return -EFAULT; } nbytes -= step_len; buf += step_len; rp->b_read += step_len; done += step_len; } if (rp->b_read >= hdrbytes) { step_len = ep->len_cap; step_len -= rp->b_read - hdrbytes; if (step_len > nbytes) step_len = nbytes; offset = rp->b_out + PKT_SIZE; offset += rp->b_read - hdrbytes; if (offset >= rp->b_size) offset -= rp->b_size; if (copy_from_buf(rp, offset, buf, step_len)) { mutex_unlock(&rp->fetch_lock); return -EFAULT; } nbytes -= step_len; buf += step_len; rp->b_read += step_len; done += step_len; } /* * Check if whole packet was read, and if so, jump to the next one. */ if (rp->b_read >= hdrbytes + ep->len_cap) { spin_lock_irqsave(&rp->b_lock, flags); mon_buff_area_free(rp, PKT_SIZE + ep->len_cap); spin_unlock_irqrestore(&rp->b_lock, flags); rp->b_read = 0; } mutex_unlock(&rp->fetch_lock); return done; } /* * Remove at most nevents from chunked buffer. * Returns the number of removed events. */ static int mon_bin_flush(struct mon_reader_bin *rp, unsigned nevents) { unsigned long flags; struct mon_bin_hdr *ep; int i; mutex_lock(&rp->fetch_lock); spin_lock_irqsave(&rp->b_lock, flags); for (i = 0; i < nevents; ++i) { if (MON_RING_EMPTY(rp)) break; ep = MON_OFF2HDR(rp, rp->b_out); mon_buff_area_free(rp, PKT_SIZE + ep->len_cap); } spin_unlock_irqrestore(&rp->b_lock, flags); rp->b_read = 0; mutex_unlock(&rp->fetch_lock); return i; } /* * Fetch at most max event offsets into the buffer and put them into vec. * The events are usually freed later with mon_bin_flush. * Return the effective number of events fetched. */ static int mon_bin_fetch(struct file *file, struct mon_reader_bin *rp, u32 __user *vec, unsigned int max) { unsigned int cur_out; unsigned int bytes, avail; unsigned int size; unsigned int nevents; struct mon_bin_hdr *ep; unsigned long flags; int rc; mutex_lock(&rp->fetch_lock); if ((rc = mon_bin_wait_event(file, rp)) < 0) { mutex_unlock(&rp->fetch_lock); return rc; } spin_lock_irqsave(&rp->b_lock, flags); avail = rp->b_cnt; spin_unlock_irqrestore(&rp->b_lock, flags); cur_out = rp->b_out; nevents = 0; bytes = 0; while (bytes < avail) { if (nevents >= max) break; ep = MON_OFF2HDR(rp, cur_out); if (put_user(cur_out, &vec[nevents])) { mutex_unlock(&rp->fetch_lock); return -EFAULT; } nevents++; size = ep->len_cap + PKT_SIZE; size = (size + PKT_ALIGN-1) & ~(PKT_ALIGN-1); if ((cur_out += size) >= rp->b_size) cur_out -= rp->b_size; bytes += size; } mutex_unlock(&rp->fetch_lock); return nevents; } /* * Count events. This is almost the same as the above mon_bin_fetch, * only we do not store offsets into user vector, and we have no limit. */ static int mon_bin_queued(struct mon_reader_bin *rp) { unsigned int cur_out; unsigned int bytes, avail; unsigned int size; unsigned int nevents; struct mon_bin_hdr *ep; unsigned long flags; mutex_lock(&rp->fetch_lock); spin_lock_irqsave(&rp->b_lock, flags); avail = rp->b_cnt; spin_unlock_irqrestore(&rp->b_lock, flags); cur_out = rp->b_out; nevents = 0; bytes = 0; while (bytes < avail) { ep = MON_OFF2HDR(rp, cur_out); nevents++; size = ep->len_cap + PKT_SIZE; size = (size + PKT_ALIGN-1) & ~(PKT_ALIGN-1); if ((cur_out += size) >= rp->b_size) cur_out -= rp->b_size; bytes += size; } mutex_unlock(&rp->fetch_lock); return nevents; } /* */ static long mon_bin_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct mon_reader_bin *rp = file->private_data; // struct mon_bus* mbus = rp->r.m_bus; int ret = 0; struct mon_bin_hdr *ep; unsigned long flags; switch (cmd) { case MON_IOCQ_URB_LEN: /* * N.B. This only returns the size of data, without the header. */ spin_lock_irqsave(&rp->b_lock, flags); if (!MON_RING_EMPTY(rp)) { ep = MON_OFF2HDR(rp, rp->b_out); ret = ep->len_cap; } spin_unlock_irqrestore(&rp->b_lock, flags); break; case MON_IOCQ_RING_SIZE: mutex_lock(&rp->fetch_lock); ret = rp->b_size; mutex_unlock(&rp->fetch_lock); break; case MON_IOCT_RING_SIZE: /* * Changing the buffer size will flush it's contents; the new * buffer is allocated before releasing the old one to be sure * the device will stay functional also in case of memory * pressure. */ { int size; struct mon_pgmap *vec; if (arg < BUFF_MIN || arg > BUFF_MAX) return -EINVAL; size = CHUNK_ALIGN(arg); vec = kcalloc(size / CHUNK_SIZE, sizeof(struct mon_pgmap), GFP_KERNEL); if (vec == NULL) { ret = -ENOMEM; break; } ret = mon_alloc_buff(vec, size/CHUNK_SIZE); if (ret < 0) { kfree(vec); break; } mutex_lock(&rp->fetch_lock); spin_lock_irqsave(&rp->b_lock, flags); if (rp->mmap_active) { mon_free_buff(vec, size/CHUNK_SIZE); kfree(vec); ret = -EBUSY; } else { mon_free_buff(rp->b_vec, rp->b_size/CHUNK_SIZE); kfree(rp->b_vec); rp->b_vec = vec; rp->b_size = size; rp->b_read = rp->b_in = rp->b_out = rp->b_cnt = 0; rp->cnt_lost = 0; } spin_unlock_irqrestore(&rp->b_lock, flags); mutex_unlock(&rp->fetch_lock); } break; case MON_IOCH_MFLUSH: ret = mon_bin_flush(rp, arg); break; case MON_IOCX_GET: case MON_IOCX_GETX: { struct mon_bin_get getb; if (copy_from_user(&getb, (void __user *)arg, sizeof(struct mon_bin_get))) return -EFAULT; if (getb.alloc > 0x10000000) /* Want to cast to u32 */ return -EINVAL; ret = mon_bin_get_event(file, rp, getb.hdr, (cmd == MON_IOCX_GET)? PKT_SZ_API0: PKT_SZ_API1, getb.data, (unsigned int)getb.alloc); } break; case MON_IOCX_MFETCH: { struct mon_bin_mfetch mfetch; struct mon_bin_mfetch __user *uptr; uptr = (struct mon_bin_mfetch __user *)arg; if (copy_from_user(&mfetch, uptr, sizeof(mfetch))) return -EFAULT; if (mfetch.nflush) { ret = mon_bin_flush(rp, mfetch.nflush); if (ret < 0) return ret; if (put_user(ret, &uptr->nflush)) return -EFAULT; } ret = mon_bin_fetch(file, rp, mfetch.offvec, mfetch.nfetch); if (ret < 0) return ret; if (put_user(ret, &uptr->nfetch)) return -EFAULT; ret = 0; } break; case MON_IOCG_STATS: { struct mon_bin_stats __user *sp; unsigned int nevents; unsigned int ndropped; spin_lock_irqsave(&rp->b_lock, flags); ndropped = rp->cnt_lost; rp->cnt_lost = 0; spin_unlock_irqrestore(&rp->b_lock, flags); nevents = mon_bin_queued(rp); sp = (struct mon_bin_stats __user *)arg; if (put_user(ndropped, &sp->dropped)) return -EFAULT; if (put_user(nevents, &sp->queued)) return -EFAULT; } break; default: return -ENOTTY; } return ret; } #ifdef CONFIG_COMPAT static long mon_bin_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct mon_reader_bin *rp = file->private_data; int ret; switch (cmd) { case MON_IOCX_GET32: case MON_IOCX_GETX32: { struct mon_bin_get32 getb; if (copy_from_user(&getb, (void __user *)arg, sizeof(struct mon_bin_get32))) return -EFAULT; ret = mon_bin_get_event(file, rp, compat_ptr(getb.hdr32), (cmd == MON_IOCX_GET32)? PKT_SZ_API0: PKT_SZ_API1, compat_ptr(getb.data32), getb.alloc32); if (ret < 0) return ret; } return 0; case MON_IOCX_MFETCH32: { struct mon_bin_mfetch32 mfetch; struct mon_bin_mfetch32 __user *uptr; uptr = (struct mon_bin_mfetch32 __user *) compat_ptr(arg); if (copy_from_user(&mfetch, uptr, sizeof(mfetch))) return -EFAULT; if (mfetch.nflush32) { ret = mon_bin_flush(rp, mfetch.nflush32); if (ret < 0) return ret; if (put_user(ret, &uptr->nflush32)) return -EFAULT; } ret = mon_bin_fetch(file, rp, compat_ptr(mfetch.offvec32), mfetch.nfetch32); if (ret < 0) return ret; if (put_user(ret, &uptr->nfetch32)) return -EFAULT; } return 0; case MON_IOCG_STATS: return mon_bin_ioctl(file, cmd, (unsigned long) compat_ptr(arg)); case MON_IOCQ_URB_LEN: case MON_IOCQ_RING_SIZE: case MON_IOCT_RING_SIZE: case MON_IOCH_MFLUSH: return mon_bin_ioctl(file, cmd, arg); default: ; } return -ENOTTY; } #endif /* CONFIG_COMPAT */ static __poll_t mon_bin_poll(struct file *file, struct poll_table_struct *wait) { struct mon_reader_bin *rp = file->private_data; __poll_t mask = 0; unsigned long flags; if (file->f_mode & FMODE_READ) poll_wait(file, &rp->b_wait, wait); spin_lock_irqsave(&rp->b_lock, flags); if (!MON_RING_EMPTY(rp)) mask |= EPOLLIN | EPOLLRDNORM; /* readable */ spin_unlock_irqrestore(&rp->b_lock, flags); return mask; } /* * open and close: just keep track of how many times the device is * mapped, to use the proper memory allocation function. */ static void mon_bin_vma_open(struct vm_area_struct *vma) { struct mon_reader_bin *rp = vma->vm_private_data; unsigned long flags; spin_lock_irqsave(&rp->b_lock, flags); rp->mmap_active++; spin_unlock_irqrestore(&rp->b_lock, flags); } static void mon_bin_vma_close(struct vm_area_struct *vma) { unsigned long flags; struct mon_reader_bin *rp = vma->vm_private_data; spin_lock_irqsave(&rp->b_lock, flags); rp->mmap_active--; spin_unlock_irqrestore(&rp->b_lock, flags); } /* * Map ring pages to user space. */ static vm_fault_t mon_bin_vma_fault(struct vm_fault *vmf) { struct mon_reader_bin *rp = vmf->vma->vm_private_data; unsigned long offset, chunk_idx; struct page *pageptr; unsigned long flags; spin_lock_irqsave(&rp->b_lock, flags); offset = vmf->pgoff << PAGE_SHIFT; if (offset >= rp->b_size) { spin_unlock_irqrestore(&rp->b_lock, flags); return VM_FAULT_SIGBUS; } chunk_idx = offset / CHUNK_SIZE; pageptr = rp->b_vec[chunk_idx].pg; get_page(pageptr); vmf->page = pageptr; spin_unlock_irqrestore(&rp->b_lock, flags); return 0; } static const struct vm_operations_struct mon_bin_vm_ops = { .open = mon_bin_vma_open, .close = mon_bin_vma_close, .fault = mon_bin_vma_fault, }; static int mon_bin_mmap(struct file *filp, struct vm_area_struct *vma) { /* don't do anything here: "fault" will set up page table entries */ vma->vm_ops = &mon_bin_vm_ops; if (vma->vm_flags & VM_WRITE) return -EPERM; vm_flags_mod(vma, VM_DONTEXPAND | VM_DONTDUMP, VM_MAYWRITE); vma->vm_private_data = filp->private_data; mon_bin_vma_open(vma); return 0; } static const struct file_operations mon_fops_binary = { .owner = THIS_MODULE, .open = mon_bin_open, .read = mon_bin_read, /* .write = mon_text_write, */ .poll = mon_bin_poll, .unlocked_ioctl = mon_bin_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = mon_bin_compat_ioctl, #endif .release = mon_bin_release, .mmap = mon_bin_mmap, }; static int mon_bin_wait_event(struct file *file, struct mon_reader_bin *rp) { DECLARE_WAITQUEUE(waita, current); unsigned long flags; add_wait_queue(&rp->b_wait, &waita); set_current_state(TASK_INTERRUPTIBLE); spin_lock_irqsave(&rp->b_lock, flags); while (MON_RING_EMPTY(rp)) { spin_unlock_irqrestore(&rp->b_lock, flags); if (file->f_flags & O_NONBLOCK) { set_current_state(TASK_RUNNING); remove_wait_queue(&rp->b_wait, &waita); return -EWOULDBLOCK; /* Same as EAGAIN in Linux */ } schedule(); if (signal_pending(current)) { remove_wait_queue(&rp->b_wait, &waita); return -EINTR; } set_current_state(TASK_INTERRUPTIBLE); spin_lock_irqsave(&rp->b_lock, flags); } spin_unlock_irqrestore(&rp->b_lock, flags); set_current_state(TASK_RUNNING); remove_wait_queue(&rp->b_wait, &waita); return 0; } static int mon_alloc_buff(struct mon_pgmap *map, int npages) { int n; unsigned long vaddr; for (n = 0; n < npages; n++) { vaddr = get_zeroed_page(GFP_KERNEL); if (vaddr == 0) { while (n-- != 0) free_page((unsigned long) map[n].ptr); return -ENOMEM; } map[n].ptr = (unsigned char *) vaddr; map[n].pg = virt_to_page((void *) vaddr); } return 0; } static void mon_free_buff(struct mon_pgmap *map, int npages) { int n; for (n = 0; n < npages; n++) free_page((unsigned long) map[n].ptr); } int mon_bin_add(struct mon_bus *mbus, const struct usb_bus *ubus) { struct device *dev; unsigned minor = ubus? ubus->busnum: 0; if (minor >= MON_BIN_MAX_MINOR) return 0; dev = device_create(&mon_bin_class, ubus ? ubus->controller : NULL, MKDEV(MAJOR(mon_bin_dev0), minor), NULL, "usbmon%d", minor); if (IS_ERR(dev)) return 0; mbus->classdev = dev; return 1; } void mon_bin_del(struct mon_bus *mbus) { device_destroy(&mon_bin_class, mbus->classdev->devt); } int __init mon_bin_init(void) { int rc; rc = class_register(&mon_bin_class); if (rc) goto err_class; rc = alloc_chrdev_region(&mon_bin_dev0, 0, MON_BIN_MAX_MINOR, "usbmon"); if (rc < 0) goto err_dev; cdev_init(&mon_bin_cdev, &mon_fops_binary); mon_bin_cdev.owner = THIS_MODULE; rc = cdev_add(&mon_bin_cdev, mon_bin_dev0, MON_BIN_MAX_MINOR); if (rc < 0) goto err_add; return 0; err_add: unregister_chrdev_region(mon_bin_dev0, MON_BIN_MAX_MINOR); err_dev: class_unregister(&mon_bin_class); err_class: return rc; } void mon_bin_exit(void) { cdev_del(&mon_bin_cdev); unregister_chrdev_region(mon_bin_dev0, MON_BIN_MAX_MINOR); class_unregister(&mon_bin_class); }
708 4 165 1 7 37 2 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_SIGNAL_H #define _LINUX_SIGNAL_H #include <linux/bug.h> #include <linux/list.h> #include <linux/signal_types.h> #include <linux/string.h> struct task_struct; /* for sysctl */ extern int print_fatal_signals; static inline void copy_siginfo(kernel_siginfo_t *to, const kernel_siginfo_t *from) { memcpy(to, from, sizeof(*to)); } static inline void clear_siginfo(kernel_siginfo_t *info) { memset(info, 0, sizeof(*info)); } #define SI_EXPANSION_SIZE (sizeof(struct siginfo) - sizeof(struct kernel_siginfo)) static inline void copy_siginfo_to_external(siginfo_t *to, const kernel_siginfo_t *from) { memcpy(to, from, sizeof(*from)); memset(((char *)to) + sizeof(struct kernel_siginfo), 0, SI_EXPANSION_SIZE); } int copy_siginfo_to_user(siginfo_t __user *to, const kernel_siginfo_t *from); int copy_siginfo_from_user(kernel_siginfo_t *to, const siginfo_t __user *from); enum siginfo_layout { SIL_KILL, SIL_TIMER, SIL_POLL, SIL_FAULT, SIL_FAULT_TRAPNO, SIL_FAULT_MCEERR, SIL_FAULT_BNDERR, SIL_FAULT_PKUERR, SIL_FAULT_PERF_EVENT, SIL_CHLD, SIL_RT, SIL_SYS, }; enum siginfo_layout siginfo_layout(unsigned sig, int si_code); /* * Define some primitives to manipulate sigset_t. */ #ifndef __HAVE_ARCH_SIG_BITOPS #include <linux/bitops.h> /* We don't use <linux/bitops.h> for these because there is no need to be atomic. */ static inline void sigaddset(sigset_t *set, int _sig) { unsigned long sig = _sig - 1; if (_NSIG_WORDS == 1) set->sig[0] |= 1UL << sig; else set->sig[sig / _NSIG_BPW] |= 1UL << (sig % _NSIG_BPW); } static inline void sigdelset(sigset_t *set, int _sig) { unsigned long sig = _sig - 1; if (_NSIG_WORDS == 1) set->sig[0] &= ~(1UL << sig); else set->sig[sig / _NSIG_BPW] &= ~(1UL << (sig % _NSIG_BPW)); } static inline int sigismember(sigset_t *set, int _sig) { unsigned long sig = _sig - 1; if (_NSIG_WORDS == 1) return 1 & (set->sig[0] >> sig); else return 1 & (set->sig[sig / _NSIG_BPW] >> (sig % _NSIG_BPW)); } #endif /* __HAVE_ARCH_SIG_BITOPS */ static inline int sigisemptyset(sigset_t *set) { switch (_NSIG_WORDS) { case 4: return (set->sig[3] | set->sig[2] | set->sig[1] | set->sig[0]) == 0; case 2: return (set->sig[1] | set->sig[0]) == 0; case 1: return set->sig[0] == 0; default: BUILD_BUG(); return 0; } } static inline int sigequalsets(const sigset_t *set1, const sigset_t *set2) { switch (_NSIG_WORDS) { case 4: return (set1->sig[3] == set2->sig[3]) && (set1->sig[2] == set2->sig[2]) && (set1->sig[1] == set2->sig[1]) && (set1->sig[0] == set2->sig[0]); case 2: return (set1->sig[1] == set2->sig[1]) && (set1->sig[0] == set2->sig[0]); case 1: return set1->sig[0] == set2->sig[0]; } return 0; } #define sigmask(sig) (1UL << ((sig) - 1)) #ifndef __HAVE_ARCH_SIG_SETOPS #define _SIG_SET_BINOP(name, op) \ static inline void name(sigset_t *r, const sigset_t *a, const sigset_t *b) \ { \ unsigned long a0, a1, a2, a3, b0, b1, b2, b3; \ \ switch (_NSIG_WORDS) { \ case 4: \ a3 = a->sig[3]; a2 = a->sig[2]; \ b3 = b->sig[3]; b2 = b->sig[2]; \ r->sig[3] = op(a3, b3); \ r->sig[2] = op(a2, b2); \ fallthrough; \ case 2: \ a1 = a->sig[1]; b1 = b->sig[1]; \ r->sig[1] = op(a1, b1); \ fallthrough; \ case 1: \ a0 = a->sig[0]; b0 = b->sig[0]; \ r->sig[0] = op(a0, b0); \ break; \ default: \ BUILD_BUG(); \ } \ } #define _sig_or(x,y) ((x) | (y)) _SIG_SET_BINOP(sigorsets, _sig_or) #define _sig_and(x,y) ((x) & (y)) _SIG_SET_BINOP(sigandsets, _sig_and) #define _sig_andn(x,y) ((x) & ~(y)) _SIG_SET_BINOP(sigandnsets, _sig_andn) #undef _SIG_SET_BINOP #undef _sig_or #undef _sig_and #undef _sig_andn #define _SIG_SET_OP(name, op) \ static inline void name(sigset_t *set) \ { \ switch (_NSIG_WORDS) { \ case 4: set->sig[3] = op(set->sig[3]); \ set->sig[2] = op(set->sig[2]); \ fallthrough; \ case 2: set->sig[1] = op(set->sig[1]); \ fallthrough; \ case 1: set->sig[0] = op(set->sig[0]); \ break; \ default: \ BUILD_BUG(); \ } \ } #define _sig_not(x) (~(x)) _SIG_SET_OP(signotset, _sig_not) #undef _SIG_SET_OP #undef _sig_not static inline void sigemptyset(sigset_t *set) { switch (_NSIG_WORDS) { default: memset(set, 0, sizeof(sigset_t)); break; case 2: set->sig[1] = 0; fallthrough; case 1: set->sig[0] = 0; break; } } static inline void sigfillset(sigset_t *set) { switch (_NSIG_WORDS) { default: memset(set, -1, sizeof(sigset_t)); break; case 2: set->sig[1] = -1; fallthrough; case 1: set->sig[0] = -1; break; } } /* Some extensions for manipulating the low 32 signals in particular. */ static inline void sigaddsetmask(sigset_t *set, unsigned long mask) { set->sig[0] |= mask; } static inline void sigdelsetmask(sigset_t *set, unsigned long mask) { set->sig[0] &= ~mask; } static inline int sigtestsetmask(sigset_t *set, unsigned long mask) { return (set->sig[0] & mask) != 0; } static inline void siginitset(sigset_t *set, unsigned long mask) { set->sig[0] = mask; switch (_NSIG_WORDS) { default: memset(&set->sig[1], 0, sizeof(long)*(_NSIG_WORDS-1)); break; case 2: set->sig[1] = 0; break; case 1: ; } } static inline void siginitsetinv(sigset_t *set, unsigned long mask) { set->sig[0] = ~mask; switch (_NSIG_WORDS) { default: memset(&set->sig[1], -1, sizeof(long)*(_NSIG_WORDS-1)); break; case 2: set->sig[1] = -1; break; case 1: ; } } #endif /* __HAVE_ARCH_SIG_SETOPS */ static inline void init_sigpending(struct sigpending *sig) { sigemptyset(&sig->signal); INIT_LIST_HEAD(&sig->list); } extern void flush_sigqueue(struct sigpending *queue); /* Test if 'sig' is valid signal. Use this instead of testing _NSIG directly */ static inline int valid_signal(unsigned long sig) { return sig <= _NSIG ? 1 : 0; } struct timespec; struct pt_regs; enum pid_type; extern int next_signal(struct sigpending *pending, sigset_t *mask); extern int do_send_sig_info(int sig, struct kernel_siginfo *info, struct task_struct *p, enum pid_type type); extern int group_send_sig_info(int sig, struct kernel_siginfo *info, struct task_struct *p, enum pid_type type); extern int send_signal_locked(int sig, struct kernel_siginfo *info, struct task_struct *p, enum pid_type type); extern int sigprocmask(int, sigset_t *, sigset_t *); extern void set_current_blocked(sigset_t *); extern void __set_current_blocked(const sigset_t *); extern int show_unhandled_signals; extern bool get_signal(struct ksignal *ksig); extern void signal_setup_done(int failed, struct ksignal *ksig, int stepping); extern void exit_signals(struct task_struct *tsk); extern void kernel_sigaction(int, __sighandler_t); #define SIG_KTHREAD ((__force __sighandler_t)2) #define SIG_KTHREAD_KERNEL ((__force __sighandler_t)3) static inline void allow_signal(int sig) { /* * Kernel threads handle their own signals. Let the signal code * know it'll be handled, so that they don't get converted to * SIGKILL or just silently dropped. */ kernel_sigaction(sig, SIG_KTHREAD); } static inline void allow_kernel_signal(int sig) { /* * Kernel threads handle their own signals. Let the signal code * know signals sent by the kernel will be handled, so that they * don't get silently dropped. */ kernel_sigaction(sig, SIG_KTHREAD_KERNEL); } static inline void disallow_signal(int sig) { kernel_sigaction(sig, SIG_IGN); } extern struct kmem_cache *sighand_cachep; extern bool unhandled_signal(struct task_struct *tsk, int sig); /* * In POSIX a signal is sent either to a specific thread (Linux task) * or to the process as a whole (Linux thread group). How the signal * is sent determines whether it's to one thread or the whole group, * which determines which signal mask(s) are involved in blocking it * from being delivered until later. When the signal is delivered, * either it's caught or ignored by a user handler or it has a default * effect that applies to the whole thread group (POSIX process). * * The possible effects an unblocked signal set to SIG_DFL can have are: * ignore - Nothing Happens * terminate - kill the process, i.e. all threads in the group, * similar to exit_group. The group leader (only) reports * WIFSIGNALED status to its parent. * coredump - write a core dump file describing all threads using * the same mm and then kill all those threads * stop - stop all the threads in the group, i.e. TASK_STOPPED state * * SIGKILL and SIGSTOP cannot be caught, blocked, or ignored. * Other signals when not blocked and set to SIG_DFL behaves as follows. * The job control signals also have other special effects. * * +--------------------+------------------+ * | POSIX signal | default action | * +--------------------+------------------+ * | SIGHUP | terminate | * | SIGINT | terminate | * | SIGQUIT | coredump | * | SIGILL | coredump | * | SIGTRAP | coredump | * | SIGABRT/SIGIOT | coredump | * | SIGBUS | coredump | * | SIGFPE | coredump | * | SIGKILL | terminate(+) | * | SIGUSR1 | terminate | * | SIGSEGV | coredump | * | SIGUSR2 | terminate | * | SIGPIPE | terminate | * | SIGALRM | terminate | * | SIGTERM | terminate | * | SIGCHLD | ignore | * | SIGCONT | ignore(*) | * | SIGSTOP | stop(*)(+) | * | SIGTSTP | stop(*) | * | SIGTTIN | stop(*) | * | SIGTTOU | stop(*) | * | SIGURG | ignore | * | SIGXCPU | coredump | * | SIGXFSZ | coredump | * | SIGVTALRM | terminate | * | SIGPROF | terminate | * | SIGPOLL/SIGIO | terminate | * | SIGSYS/SIGUNUSED | coredump | * | SIGSTKFLT | terminate | * | SIGWINCH | ignore | * | SIGPWR | terminate | * | SIGRTMIN-SIGRTMAX | terminate | * +--------------------+------------------+ * | non-POSIX signal | default action | * +--------------------+------------------+ * | SIGEMT | coredump | * +--------------------+------------------+ * * (+) For SIGKILL and SIGSTOP the action is "always", not just "default". * (*) Special job control effects: * When SIGCONT is sent, it resumes the process (all threads in the group) * from TASK_STOPPED state and also clears any pending/queued stop signals * (any of those marked with "stop(*)"). This happens regardless of blocking, * catching, or ignoring SIGCONT. When any stop signal is sent, it clears * any pending/queued SIGCONT signals; this happens regardless of blocking, * catching, or ignored the stop signal, though (except for SIGSTOP) the * default action of stopping the process may happen later or never. */ #ifdef SIGEMT #define SIGEMT_MASK rt_sigmask(SIGEMT) #else #define SIGEMT_MASK 0 #endif #if SIGRTMIN > BITS_PER_LONG #define rt_sigmask(sig) (1ULL << ((sig)-1)) #else #define rt_sigmask(sig) sigmask(sig) #endif #define siginmask(sig, mask) \ ((sig) > 0 && (sig) < SIGRTMIN && (rt_sigmask(sig) & (mask))) #define SIG_KERNEL_ONLY_MASK (\ rt_sigmask(SIGKILL) | rt_sigmask(SIGSTOP)) #define SIG_KERNEL_STOP_MASK (\ rt_sigmask(SIGSTOP) | rt_sigmask(SIGTSTP) | \ rt_sigmask(SIGTTIN) | rt_sigmask(SIGTTOU) ) #define SIG_KERNEL_COREDUMP_MASK (\ rt_sigmask(SIGQUIT) | rt_sigmask(SIGILL) | \ rt_sigmask(SIGTRAP) | rt_sigmask(SIGABRT) | \ rt_sigmask(SIGFPE) | rt_sigmask(SIGSEGV) | \ rt_sigmask(SIGBUS) | rt_sigmask(SIGSYS) | \ rt_sigmask(SIGXCPU) | rt_sigmask(SIGXFSZ) | \ SIGEMT_MASK ) #define SIG_KERNEL_IGNORE_MASK (\ rt_sigmask(SIGCONT) | rt_sigmask(SIGCHLD) | \ rt_sigmask(SIGWINCH) | rt_sigmask(SIGURG) ) #define SIG_SPECIFIC_SICODES_MASK (\ rt_sigmask(SIGILL) | rt_sigmask(SIGFPE) | \ rt_sigmask(SIGSEGV) | rt_sigmask(SIGBUS) | \ rt_sigmask(SIGTRAP) | rt_sigmask(SIGCHLD) | \ rt_sigmask(SIGPOLL) | rt_sigmask(SIGSYS) | \ SIGEMT_MASK ) #define sig_kernel_only(sig) siginmask(sig, SIG_KERNEL_ONLY_MASK) #define sig_kernel_coredump(sig) siginmask(sig, SIG_KERNEL_COREDUMP_MASK) #define sig_kernel_ignore(sig) siginmask(sig, SIG_KERNEL_IGNORE_MASK) #define sig_kernel_stop(sig) siginmask(sig, SIG_KERNEL_STOP_MASK) #define sig_specific_sicodes(sig) siginmask(sig, SIG_SPECIFIC_SICODES_MASK) #define sig_fatal(t, signr) \ (!siginmask(signr, SIG_KERNEL_IGNORE_MASK|SIG_KERNEL_STOP_MASK) && \ (t)->sighand->action[(signr)-1].sa.sa_handler == SIG_DFL) void signals_init(void); int restore_altstack(const stack_t __user *); int __save_altstack(stack_t __user *, unsigned long); #define unsafe_save_altstack(uss, sp, label) do { \ stack_t __user *__uss = uss; \ struct task_struct *t = current; \ unsafe_put_user((void __user *)t->sas_ss_sp, &__uss->ss_sp, label); \ unsafe_put_user(t->sas_ss_flags, &__uss->ss_flags, label); \ unsafe_put_user(t->sas_ss_size, &__uss->ss_size, label); \ } while (0); #ifdef CONFIG_DYNAMIC_SIGFRAME bool sigaltstack_size_valid(size_t ss_size); #else static inline bool sigaltstack_size_valid(size_t size) { return true; } #endif /* !CONFIG_DYNAMIC_SIGFRAME */ #ifdef CONFIG_PROC_FS struct seq_file; extern void render_sigset_t(struct seq_file *, const char *, sigset_t *); #endif #ifndef arch_untagged_si_addr /* * Given a fault address and a signal and si_code which correspond to the * _sigfault union member, returns the address that must appear in si_addr if * the signal handler does not have SA_EXPOSE_TAGBITS enabled in sa_flags. */ static inline void __user *arch_untagged_si_addr(void __user *addr, unsigned long sig, unsigned long si_code) { return addr; } #endif #endif /* _LINUX_SIGNAL_H */
5 1 1 2 1 2 6 1 1 3 1 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 // SPDX-License-Identifier: GPL-2.0-only /* * Xtables module for matching the value of the IPv4/IPv6 and TCP ECN bits * * (C) 2002 by Harald Welte <laforge@gnumonks.org> * (C) 2011 Patrick McHardy <kaber@trash.net> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/in.h> #include <linux/ip.h> #include <net/ip.h> #include <linux/module.h> #include <linux/skbuff.h> #include <linux/tcp.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter/xt_ecn.h> #include <linux/netfilter_ipv4/ip_tables.h> #include <linux/netfilter_ipv6/ip6_tables.h> MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); MODULE_DESCRIPTION("Xtables: Explicit Congestion Notification (ECN) flag match"); MODULE_LICENSE("GPL"); MODULE_ALIAS("ipt_ecn"); MODULE_ALIAS("ip6t_ecn"); static bool match_tcp(const struct sk_buff *skb, struct xt_action_param *par) { const struct xt_ecn_info *einfo = par->matchinfo; struct tcphdr _tcph; const struct tcphdr *th; /* In practice, TCP match does this, so can't fail. But let's * be good citizens. */ th = skb_header_pointer(skb, par->thoff, sizeof(_tcph), &_tcph); if (th == NULL) return false; if (einfo->operation & XT_ECN_OP_MATCH_ECE) { if (einfo->invert & XT_ECN_OP_MATCH_ECE) { if (th->ece == 1) return false; } else { if (th->ece == 0) return false; } } if (einfo->operation & XT_ECN_OP_MATCH_CWR) { if (einfo->invert & XT_ECN_OP_MATCH_CWR) { if (th->cwr == 1) return false; } else { if (th->cwr == 0) return false; } } return true; } static inline bool match_ip(const struct sk_buff *skb, const struct xt_ecn_info *einfo) { return ((ip_hdr(skb)->tos & XT_ECN_IP_MASK) == einfo->ip_ect) ^ !!(einfo->invert & XT_ECN_OP_MATCH_IP); } static bool ecn_mt4(const struct sk_buff *skb, struct xt_action_param *par) { const struct xt_ecn_info *info = par->matchinfo; if (info->operation & XT_ECN_OP_MATCH_IP && !match_ip(skb, info)) return false; if (info->operation & (XT_ECN_OP_MATCH_ECE | XT_ECN_OP_MATCH_CWR) && !match_tcp(skb, par)) return false; return true; } static int ecn_mt_check4(const struct xt_mtchk_param *par) { const struct xt_ecn_info *info = par->matchinfo; const struct ipt_ip *ip = par->entryinfo; if (info->operation & XT_ECN_OP_MATCH_MASK) return -EINVAL; if (info->invert & XT_ECN_OP_MATCH_MASK) return -EINVAL; if (info->operation & (XT_ECN_OP_MATCH_ECE | XT_ECN_OP_MATCH_CWR) && (ip->proto != IPPROTO_TCP || ip->invflags & IPT_INV_PROTO)) { pr_info_ratelimited("cannot match TCP bits for non-tcp packets\n"); return -EINVAL; } return 0; } static inline bool match_ipv6(const struct sk_buff *skb, const struct xt_ecn_info *einfo) { return (((ipv6_hdr(skb)->flow_lbl[0] >> 4) & XT_ECN_IP_MASK) == einfo->ip_ect) ^ !!(einfo->invert & XT_ECN_OP_MATCH_IP); } static bool ecn_mt6(const struct sk_buff *skb, struct xt_action_param *par) { const struct xt_ecn_info *info = par->matchinfo; if (info->operation & XT_ECN_OP_MATCH_IP && !match_ipv6(skb, info)) return false; if (info->operation & (XT_ECN_OP_MATCH_ECE | XT_ECN_OP_MATCH_CWR) && !match_tcp(skb, par)) return false; return true; } static int ecn_mt_check6(const struct xt_mtchk_param *par) { const struct xt_ecn_info *info = par->matchinfo; const struct ip6t_ip6 *ip = par->entryinfo; if (info->operation & XT_ECN_OP_MATCH_MASK) return -EINVAL; if (info->invert & XT_ECN_OP_MATCH_MASK) return -EINVAL; if (info->operation & (XT_ECN_OP_MATCH_ECE | XT_ECN_OP_MATCH_CWR) && (ip->proto != IPPROTO_TCP || ip->invflags & IP6T_INV_PROTO)) { pr_info_ratelimited("cannot match TCP bits for non-tcp packets\n"); return -EINVAL; } return 0; } static struct xt_match ecn_mt_reg[] __read_mostly = { { .name = "ecn", .family = NFPROTO_IPV4, .match = ecn_mt4, .matchsize = sizeof(struct xt_ecn_info), .checkentry = ecn_mt_check4, .me = THIS_MODULE, }, { .name = "ecn", .family = NFPROTO_IPV6, .match = ecn_mt6, .matchsize = sizeof(struct xt_ecn_info), .checkentry = ecn_mt_check6, .me = THIS_MODULE, }, }; static int __init ecn_mt_init(void) { return xt_register_matches(ecn_mt_reg, ARRAY_SIZE(ecn_mt_reg)); } static void __exit ecn_mt_exit(void) { xt_unregister_matches(ecn_mt_reg, ARRAY_SIZE(ecn_mt_reg)); } module_init(ecn_mt_init); module_exit(ecn_mt_exit);
14 2 7 2 1 2 2 2 5 5 83 20 21 10 2 8 1 1 1 1 8 3 5 16 16 3 3 3 10 10 1 5 4 1 1 1 54 1 53 1 3 51 51 50 5 1 1 1 1 7 7 2 2 7 2 10 11 1 6 5 3 3 5 1 1 1 3 7 7 2 3 1 29 1 1 1 23 2 2 21 4 1 1 22 1 2 20 4 12 9 2 9 6 3 1 8 8 1 1 1 1 2 1 1 1 5 11 11 1 1 1 4 1 1 1 3 1 1 1 5 26 1 8 17 23 7 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 // SPDX-License-Identifier: GPL-2.0-only /* * IEEE802154.4 socket interface * * Copyright 2007, 2008 Siemens AG * * Written by: * Sergey Lapin <slapin@ossfans.org> * Maxim Gorbachyov <maxim.gorbachev@siemens.com> */ #include <linux/net.h> #include <linux/capability.h> #include <linux/module.h> #include <linux/if_arp.h> #include <linux/if.h> #include <linux/termios.h> /* For TIOCOUTQ/INQ */ #include <linux/list.h> #include <linux/slab.h> #include <linux/socket.h> #include <net/datalink.h> #include <net/psnap.h> #include <net/sock.h> #include <net/tcp_states.h> #include <net/route.h> #include <net/af_ieee802154.h> #include <net/ieee802154_netdev.h> /* Utility function for families */ static struct net_device* ieee802154_get_dev(struct net *net, const struct ieee802154_addr *addr) { struct net_device *dev = NULL; struct net_device *tmp; __le16 pan_id, short_addr; u8 hwaddr[IEEE802154_ADDR_LEN]; switch (addr->mode) { case IEEE802154_ADDR_LONG: ieee802154_devaddr_to_raw(hwaddr, addr->extended_addr); rcu_read_lock(); dev = dev_getbyhwaddr_rcu(net, ARPHRD_IEEE802154, hwaddr); dev_hold(dev); rcu_read_unlock(); break; case IEEE802154_ADDR_SHORT: if (addr->pan_id == cpu_to_le16(IEEE802154_PANID_BROADCAST) || addr->short_addr == cpu_to_le16(IEEE802154_ADDR_UNDEF) || addr->short_addr == cpu_to_le16(IEEE802154_ADDR_BROADCAST)) break; rtnl_lock(); for_each_netdev(net, tmp) { if (tmp->type != ARPHRD_IEEE802154) continue; pan_id = tmp->ieee802154_ptr->pan_id; short_addr = tmp->ieee802154_ptr->short_addr; if (pan_id == addr->pan_id && short_addr == addr->short_addr) { dev = tmp; dev_hold(dev); break; } } rtnl_unlock(); break; default: pr_warn("Unsupported ieee802154 address type: %d\n", addr->mode); break; } return dev; } static int ieee802154_sock_release(struct socket *sock) { struct sock *sk = sock->sk; if (sk) { sock->sk = NULL; sk->sk_prot->close(sk, 0); } return 0; } static int ieee802154_sock_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { struct sock *sk = sock->sk; return sk->sk_prot->sendmsg(sk, msg, len); } static int ieee802154_sock_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) { struct sock *sk = sock->sk; if (sk->sk_prot->bind) return sk->sk_prot->bind(sk, uaddr, addr_len); return sock_no_bind(sock, uaddr, addr_len); } static int ieee802154_sock_connect(struct socket *sock, struct sockaddr *uaddr, int addr_len, int flags) { struct sock *sk = sock->sk; if (addr_len < sizeof(uaddr->sa_family)) return -EINVAL; if (uaddr->sa_family == AF_UNSPEC) return sk->sk_prot->disconnect(sk, flags); return sk->sk_prot->connect(sk, uaddr, addr_len); } static int ieee802154_dev_ioctl(struct sock *sk, struct ifreq __user *arg, unsigned int cmd) { struct ifreq ifr; int ret = -ENOIOCTLCMD; struct net_device *dev; if (get_user_ifreq(&ifr, NULL, arg)) return -EFAULT; ifr.ifr_name[IFNAMSIZ-1] = 0; dev_load(sock_net(sk), ifr.ifr_name); dev = dev_get_by_name(sock_net(sk), ifr.ifr_name); if (!dev) return -ENODEV; if (dev->type == ARPHRD_IEEE802154 && dev->netdev_ops->ndo_do_ioctl) ret = dev->netdev_ops->ndo_do_ioctl(dev, &ifr, cmd); if (!ret && put_user_ifreq(&ifr, arg)) ret = -EFAULT; dev_put(dev); return ret; } static int ieee802154_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { struct sock *sk = sock->sk; switch (cmd) { case SIOCGIFADDR: case SIOCSIFADDR: return ieee802154_dev_ioctl(sk, (struct ifreq __user *)arg, cmd); default: if (!sk->sk_prot->ioctl) return -ENOIOCTLCMD; return sk_ioctl(sk, cmd, (void __user *)arg); } } /* RAW Sockets (802.15.4 created in userspace) */ static HLIST_HEAD(raw_head); static DEFINE_RWLOCK(raw_lock); static int raw_hash(struct sock *sk) { write_lock_bh(&raw_lock); sk_add_node(sk, &raw_head); write_unlock_bh(&raw_lock); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); return 0; } static void raw_unhash(struct sock *sk) { write_lock_bh(&raw_lock); if (sk_del_node_init(sk)) sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); write_unlock_bh(&raw_lock); } static void raw_close(struct sock *sk, long timeout) { sk_common_release(sk); } static int raw_bind(struct sock *sk, struct sockaddr *_uaddr, int len) { struct ieee802154_addr addr; struct sockaddr_ieee802154 *uaddr = (struct sockaddr_ieee802154 *)_uaddr; int err = 0; struct net_device *dev = NULL; err = ieee802154_sockaddr_check_size(uaddr, len); if (err < 0) return err; uaddr = (struct sockaddr_ieee802154 *)_uaddr; if (uaddr->family != AF_IEEE802154) return -EINVAL; lock_sock(sk); ieee802154_addr_from_sa(&addr, &uaddr->addr); dev = ieee802154_get_dev(sock_net(sk), &addr); if (!dev) { err = -ENODEV; goto out; } sk->sk_bound_dev_if = dev->ifindex; sk_dst_reset(sk); dev_put(dev); out: release_sock(sk); return err; } static int raw_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { return -ENOTSUPP; } static int raw_disconnect(struct sock *sk, int flags) { return 0; } static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) { struct net_device *dev; unsigned int mtu; struct sk_buff *skb; int hlen, tlen; int err; if (msg->msg_flags & MSG_OOB) { pr_debug("msg->msg_flags = 0x%x\n", msg->msg_flags); return -EOPNOTSUPP; } lock_sock(sk); if (!sk->sk_bound_dev_if) dev = dev_getfirstbyhwtype(sock_net(sk), ARPHRD_IEEE802154); else dev = dev_get_by_index(sock_net(sk), sk->sk_bound_dev_if); release_sock(sk); if (!dev) { pr_debug("no dev\n"); err = -ENXIO; goto out; } mtu = IEEE802154_MTU; pr_debug("name = %s, mtu = %u\n", dev->name, mtu); if (size > mtu) { pr_debug("size = %zu, mtu = %u\n", size, mtu); err = -EMSGSIZE; goto out_dev; } if (!size) { err = 0; goto out_dev; } hlen = LL_RESERVED_SPACE(dev); tlen = dev->needed_tailroom; skb = sock_alloc_send_skb(sk, hlen + tlen + size, msg->msg_flags & MSG_DONTWAIT, &err); if (!skb) goto out_dev; skb_reserve(skb, hlen); skb_reset_mac_header(skb); skb_reset_network_header(skb); err = memcpy_from_msg(skb_put(skb, size), msg, size); if (err < 0) goto out_skb; skb->dev = dev; skb->protocol = htons(ETH_P_IEEE802154); err = dev_queue_xmit(skb); if (err > 0) err = net_xmit_errno(err); dev_put(dev); return err ?: size; out_skb: kfree_skb(skb); out_dev: dev_put(dev); out: return err; } static int raw_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, int *addr_len) { size_t copied = 0; int err = -EOPNOTSUPP; struct sk_buff *skb; skb = skb_recv_datagram(sk, flags, &err); if (!skb) goto out; copied = skb->len; if (len < copied) { msg->msg_flags |= MSG_TRUNC; copied = len; } err = skb_copy_datagram_msg(skb, 0, msg, copied); if (err) goto done; sock_recv_cmsgs(msg, sk, skb); if (flags & MSG_TRUNC) copied = skb->len; done: skb_free_datagram(sk, skb); out: if (err) return err; return copied; } static int raw_rcv_skb(struct sock *sk, struct sk_buff *skb) { skb = skb_share_check(skb, GFP_ATOMIC); if (!skb) return NET_RX_DROP; if (sock_queue_rcv_skb(sk, skb) < 0) { kfree_skb(skb); return NET_RX_DROP; } return NET_RX_SUCCESS; } static void ieee802154_raw_deliver(struct net_device *dev, struct sk_buff *skb) { struct sock *sk; read_lock(&raw_lock); sk_for_each(sk, &raw_head) { bh_lock_sock(sk); if (!sk->sk_bound_dev_if || sk->sk_bound_dev_if == dev->ifindex) { struct sk_buff *clone; clone = skb_clone(skb, GFP_ATOMIC); if (clone) raw_rcv_skb(sk, clone); } bh_unlock_sock(sk); } read_unlock(&raw_lock); } static int raw_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) { return -EOPNOTSUPP; } static int raw_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen) { return -EOPNOTSUPP; } static struct proto ieee802154_raw_prot = { .name = "IEEE-802.15.4-RAW", .owner = THIS_MODULE, .obj_size = sizeof(struct sock), .close = raw_close, .bind = raw_bind, .sendmsg = raw_sendmsg, .recvmsg = raw_recvmsg, .hash = raw_hash, .unhash = raw_unhash, .connect = raw_connect, .disconnect = raw_disconnect, .getsockopt = raw_getsockopt, .setsockopt = raw_setsockopt, }; static const struct proto_ops ieee802154_raw_ops = { .family = PF_IEEE802154, .owner = THIS_MODULE, .release = ieee802154_sock_release, .bind = ieee802154_sock_bind, .connect = ieee802154_sock_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = sock_no_getname, .poll = datagram_poll, .ioctl = ieee802154_sock_ioctl, .gettstamp = sock_gettstamp, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .setsockopt = sock_common_setsockopt, .getsockopt = sock_common_getsockopt, .sendmsg = ieee802154_sock_sendmsg, .recvmsg = sock_common_recvmsg, .mmap = sock_no_mmap, }; /* DGRAM Sockets (802.15.4 dataframes) */ static HLIST_HEAD(dgram_head); static DEFINE_RWLOCK(dgram_lock); struct dgram_sock { struct sock sk; struct ieee802154_addr src_addr; struct ieee802154_addr dst_addr; unsigned int bound:1; unsigned int connected:1; unsigned int want_ack:1; unsigned int want_lqi:1; unsigned int secen:1; unsigned int secen_override:1; unsigned int seclevel:3; unsigned int seclevel_override:1; }; static inline struct dgram_sock *dgram_sk(const struct sock *sk) { return container_of(sk, struct dgram_sock, sk); } static int dgram_hash(struct sock *sk) { write_lock_bh(&dgram_lock); sk_add_node(sk, &dgram_head); write_unlock_bh(&dgram_lock); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); return 0; } static void dgram_unhash(struct sock *sk) { write_lock_bh(&dgram_lock); if (sk_del_node_init(sk)) sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); write_unlock_bh(&dgram_lock); } static int dgram_init(struct sock *sk) { struct dgram_sock *ro = dgram_sk(sk); ro->want_ack = 1; ro->want_lqi = 0; return 0; } static void dgram_close(struct sock *sk, long timeout) { sk_common_release(sk); } static int dgram_bind(struct sock *sk, struct sockaddr *uaddr, int len) { struct sockaddr_ieee802154 *addr = (struct sockaddr_ieee802154 *)uaddr; struct ieee802154_addr haddr; struct dgram_sock *ro = dgram_sk(sk); int err = -EINVAL; struct net_device *dev; lock_sock(sk); ro->bound = 0; err = ieee802154_sockaddr_check_size(addr, len); if (err < 0) goto out; if (addr->family != AF_IEEE802154) { err = -EINVAL; goto out; } ieee802154_addr_from_sa(&haddr, &addr->addr); dev = ieee802154_get_dev(sock_net(sk), &haddr); if (!dev) { err = -ENODEV; goto out; } if (dev->type != ARPHRD_IEEE802154) { err = -ENODEV; goto out_put; } ro->src_addr = haddr; ro->bound = 1; err = 0; out_put: dev_put(dev); out: release_sock(sk); return err; } static int dgram_ioctl(struct sock *sk, int cmd, int *karg) { switch (cmd) { case SIOCOUTQ: { *karg = sk_wmem_alloc_get(sk); return 0; } case SIOCINQ: { struct sk_buff *skb; *karg = 0; spin_lock_bh(&sk->sk_receive_queue.lock); skb = skb_peek(&sk->sk_receive_queue); if (skb) { /* We will only return the amount * of this packet since that is all * that will be read. */ *karg = skb->len - ieee802154_hdr_length(skb); } spin_unlock_bh(&sk->sk_receive_queue.lock); return 0; } } return -ENOIOCTLCMD; } /* FIXME: autobind */ static int dgram_connect(struct sock *sk, struct sockaddr *uaddr, int len) { struct sockaddr_ieee802154 *addr = (struct sockaddr_ieee802154 *)uaddr; struct dgram_sock *ro = dgram_sk(sk); int err = 0; err = ieee802154_sockaddr_check_size(addr, len); if (err < 0) return err; if (addr->family != AF_IEEE802154) return -EINVAL; lock_sock(sk); if (!ro->bound) { err = -ENETUNREACH; goto out; } ieee802154_addr_from_sa(&ro->dst_addr, &addr->addr); ro->connected = 1; out: release_sock(sk); return err; } static int dgram_disconnect(struct sock *sk, int flags) { struct dgram_sock *ro = dgram_sk(sk); lock_sock(sk); ro->connected = 0; release_sock(sk); return 0; } static int dgram_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) { struct net_device *dev; unsigned int mtu; struct sk_buff *skb; struct ieee802154_mac_cb *cb; struct dgram_sock *ro = dgram_sk(sk); struct ieee802154_addr dst_addr; DECLARE_SOCKADDR(struct sockaddr_ieee802154*, daddr, msg->msg_name); int hlen, tlen; int err; if (msg->msg_flags & MSG_OOB) { pr_debug("msg->msg_flags = 0x%x\n", msg->msg_flags); return -EOPNOTSUPP; } if (msg->msg_name) { if (ro->connected) return -EISCONN; if (msg->msg_namelen < IEEE802154_MIN_NAMELEN) return -EINVAL; err = ieee802154_sockaddr_check_size(daddr, msg->msg_namelen); if (err < 0) return err; ieee802154_addr_from_sa(&dst_addr, &daddr->addr); } else { if (!ro->connected) return -EDESTADDRREQ; dst_addr = ro->dst_addr; } if (!ro->bound) dev = dev_getfirstbyhwtype(sock_net(sk), ARPHRD_IEEE802154); else dev = ieee802154_get_dev(sock_net(sk), &ro->src_addr); if (!dev) { pr_debug("no dev\n"); err = -ENXIO; goto out; } mtu = IEEE802154_MTU; pr_debug("name = %s, mtu = %u\n", dev->name, mtu); if (size > mtu) { pr_debug("size = %zu, mtu = %u\n", size, mtu); err = -EMSGSIZE; goto out_dev; } hlen = LL_RESERVED_SPACE(dev); tlen = dev->needed_tailroom; skb = sock_alloc_send_skb(sk, hlen + tlen + size, msg->msg_flags & MSG_DONTWAIT, &err); if (!skb) goto out_dev; skb_reserve(skb, hlen); skb_reset_network_header(skb); cb = mac_cb_init(skb); cb->type = IEEE802154_FC_TYPE_DATA; cb->ackreq = ro->want_ack; cb->secen = ro->secen; cb->secen_override = ro->secen_override; cb->seclevel = ro->seclevel; cb->seclevel_override = ro->seclevel_override; err = wpan_dev_hard_header(skb, dev, &dst_addr, ro->bound ? &ro->src_addr : NULL, size); if (err < 0) goto out_skb; err = memcpy_from_msg(skb_put(skb, size), msg, size); if (err < 0) goto out_skb; skb->dev = dev; skb->protocol = htons(ETH_P_IEEE802154); err = dev_queue_xmit(skb); if (err > 0) err = net_xmit_errno(err); dev_put(dev); return err ?: size; out_skb: kfree_skb(skb); out_dev: dev_put(dev); out: return err; } static int dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, int *addr_len) { size_t copied = 0; int err = -EOPNOTSUPP; struct sk_buff *skb; struct dgram_sock *ro = dgram_sk(sk); DECLARE_SOCKADDR(struct sockaddr_ieee802154 *, saddr, msg->msg_name); skb = skb_recv_datagram(sk, flags, &err); if (!skb) goto out; copied = skb->len; if (len < copied) { msg->msg_flags |= MSG_TRUNC; copied = len; } /* FIXME: skip headers if necessary ?! */ err = skb_copy_datagram_msg(skb, 0, msg, copied); if (err) goto done; sock_recv_cmsgs(msg, sk, skb); if (saddr) { /* Clear the implicit padding in struct sockaddr_ieee802154 * (16 bits between 'family' and 'addr') and in struct * ieee802154_addr_sa (16 bits at the end of the structure). */ memset(saddr, 0, sizeof(*saddr)); saddr->family = AF_IEEE802154; ieee802154_addr_to_sa(&saddr->addr, &mac_cb(skb)->source); *addr_len = sizeof(*saddr); } if (ro->want_lqi) { err = put_cmsg(msg, SOL_IEEE802154, WPAN_WANTLQI, sizeof(uint8_t), &(mac_cb(skb)->lqi)); if (err) goto done; } if (flags & MSG_TRUNC) copied = skb->len; done: skb_free_datagram(sk, skb); out: if (err) return err; return copied; } static int dgram_rcv_skb(struct sock *sk, struct sk_buff *skb) { skb = skb_share_check(skb, GFP_ATOMIC); if (!skb) return NET_RX_DROP; if (sock_queue_rcv_skb(sk, skb) < 0) { kfree_skb(skb); return NET_RX_DROP; } return NET_RX_SUCCESS; } static inline bool ieee802154_match_sock(__le64 hw_addr, __le16 pan_id, __le16 short_addr, struct dgram_sock *ro) { if (!ro->bound) return true; if (ro->src_addr.mode == IEEE802154_ADDR_LONG && hw_addr == ro->src_addr.extended_addr) return true; if (ro->src_addr.mode == IEEE802154_ADDR_SHORT && pan_id == ro->src_addr.pan_id && short_addr == ro->src_addr.short_addr) return true; return false; } static int ieee802154_dgram_deliver(struct net_device *dev, struct sk_buff *skb) { struct sock *sk, *prev = NULL; int ret = NET_RX_SUCCESS; __le16 pan_id, short_addr; __le64 hw_addr; /* Data frame processing */ BUG_ON(dev->type != ARPHRD_IEEE802154); pan_id = dev->ieee802154_ptr->pan_id; short_addr = dev->ieee802154_ptr->short_addr; hw_addr = dev->ieee802154_ptr->extended_addr; read_lock(&dgram_lock); sk_for_each(sk, &dgram_head) { if (ieee802154_match_sock(hw_addr, pan_id, short_addr, dgram_sk(sk))) { if (prev) { struct sk_buff *clone; clone = skb_clone(skb, GFP_ATOMIC); if (clone) dgram_rcv_skb(prev, clone); } prev = sk; } } if (prev) { dgram_rcv_skb(prev, skb); } else { kfree_skb(skb); ret = NET_RX_DROP; } read_unlock(&dgram_lock); return ret; } static int dgram_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) { struct dgram_sock *ro = dgram_sk(sk); int val, len; if (level != SOL_IEEE802154) return -EOPNOTSUPP; if (get_user(len, optlen)) return -EFAULT; len = min_t(unsigned int, len, sizeof(int)); switch (optname) { case WPAN_WANTACK: val = ro->want_ack; break; case WPAN_WANTLQI: val = ro->want_lqi; break; case WPAN_SECURITY: if (!ro->secen_override) val = WPAN_SECURITY_DEFAULT; else if (ro->secen) val = WPAN_SECURITY_ON; else val = WPAN_SECURITY_OFF; break; case WPAN_SECURITY_LEVEL: if (!ro->seclevel_override) val = WPAN_SECURITY_LEVEL_DEFAULT; else val = ro->seclevel; break; default: return -ENOPROTOOPT; } if (put_user(len, optlen)) return -EFAULT; if (copy_to_user(optval, &val, len)) return -EFAULT; return 0; } static int dgram_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen) { struct dgram_sock *ro = dgram_sk(sk); struct net *net = sock_net(sk); int val; int err = 0; if (optlen < sizeof(int)) return -EINVAL; if (copy_from_sockptr(&val, optval, sizeof(int))) return -EFAULT; lock_sock(sk); switch (optname) { case WPAN_WANTACK: ro->want_ack = !!val; break; case WPAN_WANTLQI: ro->want_lqi = !!val; break; case WPAN_SECURITY: if (!ns_capable(net->user_ns, CAP_NET_ADMIN) && !ns_capable(net->user_ns, CAP_NET_RAW)) { err = -EPERM; break; } switch (val) { case WPAN_SECURITY_DEFAULT: ro->secen_override = 0; break; case WPAN_SECURITY_ON: ro->secen_override = 1; ro->secen = 1; break; case WPAN_SECURITY_OFF: ro->secen_override = 1; ro->secen = 0; break; default: err = -EINVAL; break; } break; case WPAN_SECURITY_LEVEL: if (!ns_capable(net->user_ns, CAP_NET_ADMIN) && !ns_capable(net->user_ns, CAP_NET_RAW)) { err = -EPERM; break; } if (val < WPAN_SECURITY_LEVEL_DEFAULT || val > IEEE802154_SCF_SECLEVEL_ENC_MIC128) { err = -EINVAL; } else if (val == WPAN_SECURITY_LEVEL_DEFAULT) { ro->seclevel_override = 0; } else { ro->seclevel_override = 1; ro->seclevel = val; } break; default: err = -ENOPROTOOPT; break; } release_sock(sk); return err; } static struct proto ieee802154_dgram_prot = { .name = "IEEE-802.15.4-MAC", .owner = THIS_MODULE, .obj_size = sizeof(struct dgram_sock), .init = dgram_init, .close = dgram_close, .bind = dgram_bind, .sendmsg = dgram_sendmsg, .recvmsg = dgram_recvmsg, .hash = dgram_hash, .unhash = dgram_unhash, .connect = dgram_connect, .disconnect = dgram_disconnect, .ioctl = dgram_ioctl, .getsockopt = dgram_getsockopt, .setsockopt = dgram_setsockopt, }; static const struct proto_ops ieee802154_dgram_ops = { .family = PF_IEEE802154, .owner = THIS_MODULE, .release = ieee802154_sock_release, .bind = ieee802154_sock_bind, .connect = ieee802154_sock_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = sock_no_getname, .poll = datagram_poll, .ioctl = ieee802154_sock_ioctl, .gettstamp = sock_gettstamp, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .setsockopt = sock_common_setsockopt, .getsockopt = sock_common_getsockopt, .sendmsg = ieee802154_sock_sendmsg, .recvmsg = sock_common_recvmsg, .mmap = sock_no_mmap, }; static void ieee802154_sock_destruct(struct sock *sk) { skb_queue_purge(&sk->sk_receive_queue); } /* Create a socket. Initialise the socket, blank the addresses * set the state. */ static int ieee802154_create(struct net *net, struct socket *sock, int protocol, int kern) { struct sock *sk; int rc; struct proto *proto; const struct proto_ops *ops; if (!net_eq(net, &init_net)) return -EAFNOSUPPORT; switch (sock->type) { case SOCK_RAW: rc = -EPERM; if (!capable(CAP_NET_RAW)) goto out; proto = &ieee802154_raw_prot; ops = &ieee802154_raw_ops; break; case SOCK_DGRAM: proto = &ieee802154_dgram_prot; ops = &ieee802154_dgram_ops; break; default: rc = -ESOCKTNOSUPPORT; goto out; } rc = -ENOMEM; sk = sk_alloc(net, PF_IEEE802154, GFP_KERNEL, proto, kern); if (!sk) goto out; rc = 0; sock->ops = ops; sock_init_data(sock, sk); sk->sk_destruct = ieee802154_sock_destruct; sk->sk_family = PF_IEEE802154; /* Checksums on by default */ sock_set_flag(sk, SOCK_ZAPPED); if (sk->sk_prot->hash) { rc = sk->sk_prot->hash(sk); if (rc) goto out_sk_release; } if (sk->sk_prot->init) { rc = sk->sk_prot->init(sk); if (rc) goto out_sk_release; } out: return rc; out_sk_release: sk_common_release(sk); sock->sk = NULL; goto out; } static const struct net_proto_family ieee802154_family_ops = { .family = PF_IEEE802154, .create = ieee802154_create, .owner = THIS_MODULE, }; static int ieee802154_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { if (!netif_running(dev)) goto drop; pr_debug("got frame, type %d, dev %p\n", dev->type, dev); #ifdef DEBUG print_hex_dump_bytes("ieee802154_rcv ", DUMP_PREFIX_NONE, skb->data, skb->len); #endif if (!net_eq(dev_net(dev), &init_net)) goto drop; ieee802154_raw_deliver(dev, skb); if (dev->type != ARPHRD_IEEE802154) goto drop; if (skb->pkt_type != PACKET_OTHERHOST) return ieee802154_dgram_deliver(dev, skb); drop: kfree_skb(skb); return NET_RX_DROP; } static struct packet_type ieee802154_packet_type = { .type = htons(ETH_P_IEEE802154), .func = ieee802154_rcv, }; static int __init af_ieee802154_init(void) { int rc; rc = proto_register(&ieee802154_raw_prot, 1); if (rc) goto out; rc = proto_register(&ieee802154_dgram_prot, 1); if (rc) goto err_dgram; /* Tell SOCKET that we are alive */ rc = sock_register(&ieee802154_family_ops); if (rc) goto err_sock; dev_add_pack(&ieee802154_packet_type); rc = 0; goto out; err_sock: proto_unregister(&ieee802154_dgram_prot); err_dgram: proto_unregister(&ieee802154_raw_prot); out: return rc; } static void __exit af_ieee802154_remove(void) { dev_remove_pack(&ieee802154_packet_type); sock_unregister(PF_IEEE802154); proto_unregister(&ieee802154_dgram_prot); proto_unregister(&ieee802154_raw_prot); } module_init(af_ieee802154_init); module_exit(af_ieee802154_remove); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("IEEE 802.15.4 socket interface"); MODULE_ALIAS_NETPROTO(PF_IEEE802154);
3744 4 4 3552 146 3411 3408 2412 120 2293 68 69 4296 211 4328 4333 209 4292 4290 4293 4293 124 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 // SPDX-License-Identifier: GPL-2.0 /* * drivers/base/power/main.c - Where the driver meets power management. * * Copyright (c) 2003 Patrick Mochel * Copyright (c) 2003 Open Source Development Lab * * The driver model core calls device_pm_add() when a device is registered. * This will initialize the embedded device_pm_info object in the device * and add it to the list of power-controlled devices. sysfs entries for * controlling device power management will also be added. * * A separate list is used for keeping track of power info, because the power * domain dependencies may differ from the ancestral dependencies that the * subsystem list maintains. */ #define pr_fmt(fmt) "PM: " fmt #define dev_fmt pr_fmt #include <linux/device.h> #include <linux/export.h> #include <linux/mutex.h> #include <linux/pm.h> #include <linux/pm_runtime.h> #include <linux/pm-trace.h> #include <linux/pm_wakeirq.h> #include <linux/interrupt.h> #include <linux/sched.h> #include <linux/sched/debug.h> #include <linux/async.h> #include <linux/suspend.h> #include <trace/events/power.h> #include <linux/cpufreq.h> #include <linux/devfreq.h> #include <linux/timer.h> #include "../base.h" #include "power.h" typedef int (*pm_callback_t)(struct device *); #define list_for_each_entry_rcu_locked(pos, head, member) \ list_for_each_entry_rcu(pos, head, member, \ device_links_read_lock_held()) /* * The entries in the dpm_list list are in a depth first order, simply * because children are guaranteed to be discovered after parents, and * are inserted at the back of the list on discovery. * * Since device_pm_add() may be called with a device lock held, * we must never try to acquire a device lock while holding * dpm_list_mutex. */ LIST_HEAD(dpm_list); static LIST_HEAD(dpm_prepared_list); static LIST_HEAD(dpm_suspended_list); static LIST_HEAD(dpm_late_early_list); static LIST_HEAD(dpm_noirq_list); static DEFINE_MUTEX(dpm_list_mtx); static pm_message_t pm_transition; static int async_error; static const char *pm_verb(int event) { switch (event) { case PM_EVENT_SUSPEND: return "suspend"; case PM_EVENT_RESUME: return "resume"; case PM_EVENT_FREEZE: return "freeze"; case PM_EVENT_QUIESCE: return "quiesce"; case PM_EVENT_HIBERNATE: return "hibernate"; case PM_EVENT_THAW: return "thaw"; case PM_EVENT_RESTORE: return "restore"; case PM_EVENT_RECOVER: return "recover"; default: return "(unknown PM event)"; } } /** * device_pm_sleep_init - Initialize system suspend-related device fields. * @dev: Device object being initialized. */ void device_pm_sleep_init(struct device *dev) { dev->power.is_prepared = false; dev->power.is_suspended = false; dev->power.is_noirq_suspended = false; dev->power.is_late_suspended = false; init_completion(&dev->power.completion); complete_all(&dev->power.completion); dev->power.wakeup = NULL; INIT_LIST_HEAD(&dev->power.entry); } /** * device_pm_lock - Lock the list of active devices used by the PM core. */ void device_pm_lock(void) { mutex_lock(&dpm_list_mtx); } /** * device_pm_unlock - Unlock the list of active devices used by the PM core. */ void device_pm_unlock(void) { mutex_unlock(&dpm_list_mtx); } /** * device_pm_add - Add a device to the PM core's list of active devices. * @dev: Device to add to the list. */ void device_pm_add(struct device *dev) { /* Skip PM setup/initialization. */ if (device_pm_not_required(dev)) return; pr_debug("Adding info for %s:%s\n", dev->bus ? dev->bus->name : "No Bus", dev_name(dev)); device_pm_check_callbacks(dev); mutex_lock(&dpm_list_mtx); if (dev->parent && dev->parent->power.is_prepared) dev_warn(dev, "parent %s should not be sleeping\n", dev_name(dev->parent)); list_add_tail(&dev->power.entry, &dpm_list); dev->power.in_dpm_list = true; mutex_unlock(&dpm_list_mtx); } /** * device_pm_remove - Remove a device from the PM core's list of active devices. * @dev: Device to be removed from the list. */ void device_pm_remove(struct device *dev) { if (device_pm_not_required(dev)) return; pr_debug("Removing info for %s:%s\n", dev->bus ? dev->bus->name : "No Bus", dev_name(dev)); complete_all(&dev->power.completion); mutex_lock(&dpm_list_mtx); list_del_init(&dev->power.entry); dev->power.in_dpm_list = false; mutex_unlock(&dpm_list_mtx); device_wakeup_disable(dev); pm_runtime_remove(dev); device_pm_check_callbacks(dev); } /** * device_pm_move_before - Move device in the PM core's list of active devices. * @deva: Device to move in dpm_list. * @devb: Device @deva should come before. */ void device_pm_move_before(struct device *deva, struct device *devb) { pr_debug("Moving %s:%s before %s:%s\n", deva->bus ? deva->bus->name : "No Bus", dev_name(deva), devb->bus ? devb->bus->name : "No Bus", dev_name(devb)); /* Delete deva from dpm_list and reinsert before devb. */ list_move_tail(&deva->power.entry, &devb->power.entry); } /** * device_pm_move_after - Move device in the PM core's list of active devices. * @deva: Device to move in dpm_list. * @devb: Device @deva should come after. */ void device_pm_move_after(struct device *deva, struct device *devb) { pr_debug("Moving %s:%s after %s:%s\n", deva->bus ? deva->bus->name : "No Bus", dev_name(deva), devb->bus ? devb->bus->name : "No Bus", dev_name(devb)); /* Delete deva from dpm_list and reinsert after devb. */ list_move(&deva->power.entry, &devb->power.entry); } /** * device_pm_move_last - Move device to end of the PM core's list of devices. * @dev: Device to move in dpm_list. */ void device_pm_move_last(struct device *dev) { pr_debug("Moving %s:%s to end of list\n", dev->bus ? dev->bus->name : "No Bus", dev_name(dev)); list_move_tail(&dev->power.entry, &dpm_list); } static ktime_t initcall_debug_start(struct device *dev, void *cb) { if (!pm_print_times_enabled) return 0; dev_info(dev, "calling %ps @ %i, parent: %s\n", cb, task_pid_nr(current), dev->parent ? dev_name(dev->parent) : "none"); return ktime_get(); } static void initcall_debug_report(struct device *dev, ktime_t calltime, void *cb, int error) { ktime_t rettime; if (!pm_print_times_enabled) return; rettime = ktime_get(); dev_info(dev, "%ps returned %d after %Ld usecs\n", cb, error, (unsigned long long)ktime_us_delta(rettime, calltime)); } /** * dpm_wait - Wait for a PM operation to complete. * @dev: Device to wait for. * @async: If unset, wait only if the device's power.async_suspend flag is set. */ static void dpm_wait(struct device *dev, bool async) { if (!dev) return; if (async || (pm_async_enabled && dev->power.async_suspend)) wait_for_completion(&dev->power.completion); } static int dpm_wait_fn(struct device *dev, void *async_ptr) { dpm_wait(dev, *((bool *)async_ptr)); return 0; } static void dpm_wait_for_children(struct device *dev, bool async) { device_for_each_child(dev, &async, dpm_wait_fn); } static void dpm_wait_for_suppliers(struct device *dev, bool async) { struct device_link *link; int idx; idx = device_links_read_lock(); /* * If the supplier goes away right after we've checked the link to it, * we'll wait for its completion to change the state, but that's fine, * because the only things that will block as a result are the SRCU * callbacks freeing the link objects for the links in the list we're * walking. */ list_for_each_entry_rcu_locked(link, &dev->links.suppliers, c_node) if (READ_ONCE(link->status) != DL_STATE_DORMANT) dpm_wait(link->supplier, async); device_links_read_unlock(idx); } static bool dpm_wait_for_superior(struct device *dev, bool async) { struct device *parent; /* * If the device is resumed asynchronously and the parent's callback * deletes both the device and the parent itself, the parent object may * be freed while this function is running, so avoid that by reference * counting the parent once more unless the device has been deleted * already (in which case return right away). */ mutex_lock(&dpm_list_mtx); if (!device_pm_initialized(dev)) { mutex_unlock(&dpm_list_mtx); return false; } parent = get_device(dev->parent); mutex_unlock(&dpm_list_mtx); dpm_wait(parent, async); put_device(parent); dpm_wait_for_suppliers(dev, async); /* * If the parent's callback has deleted the device, attempting to resume * it would be invalid, so avoid doing that then. */ return device_pm_initialized(dev); } static void dpm_wait_for_consumers(struct device *dev, bool async) { struct device_link *link; int idx; idx = device_links_read_lock(); /* * The status of a device link can only be changed from "dormant" by a * probe, but that cannot happen during system suspend/resume. In * theory it can change to "dormant" at that time, but then it is * reasonable to wait for the target device anyway (eg. if it goes * away, it's better to wait for it to go away completely and then * continue instead of trying to continue in parallel with its * unregistration). */ list_for_each_entry_rcu_locked(link, &dev->links.consumers, s_node) if (READ_ONCE(link->status) != DL_STATE_DORMANT) dpm_wait(link->consumer, async); device_links_read_unlock(idx); } static void dpm_wait_for_subordinate(struct device *dev, bool async) { dpm_wait_for_children(dev, async); dpm_wait_for_consumers(dev, async); } /** * pm_op - Return the PM operation appropriate for given PM event. * @ops: PM operations to choose from. * @state: PM transition of the system being carried out. */ static pm_callback_t pm_op(const struct dev_pm_ops *ops, pm_message_t state) { switch (state.event) { #ifdef CONFIG_SUSPEND case PM_EVENT_SUSPEND: return ops->suspend; case PM_EVENT_RESUME: return ops->resume; #endif /* CONFIG_SUSPEND */ #ifdef CONFIG_HIBERNATE_CALLBACKS case PM_EVENT_FREEZE: case PM_EVENT_QUIESCE: return ops->freeze; case PM_EVENT_HIBERNATE: return ops->poweroff; case PM_EVENT_THAW: case PM_EVENT_RECOVER: return ops->thaw; case PM_EVENT_RESTORE: return ops->restore; #endif /* CONFIG_HIBERNATE_CALLBACKS */ } return NULL; } /** * pm_late_early_op - Return the PM operation appropriate for given PM event. * @ops: PM operations to choose from. * @state: PM transition of the system being carried out. * * Runtime PM is disabled for @dev while this function is being executed. */ static pm_callback_t pm_late_early_op(const struct dev_pm_ops *ops, pm_message_t state) { switch (state.event) { #ifdef CONFIG_SUSPEND case PM_EVENT_SUSPEND: return ops->suspend_late; case PM_EVENT_RESUME: return ops->resume_early; #endif /* CONFIG_SUSPEND */ #ifdef CONFIG_HIBERNATE_CALLBACKS case PM_EVENT_FREEZE: case PM_EVENT_QUIESCE: return ops->freeze_late; case PM_EVENT_HIBERNATE: return ops->poweroff_late; case PM_EVENT_THAW: case PM_EVENT_RECOVER: return ops->thaw_early; case PM_EVENT_RESTORE: return ops->restore_early; #endif /* CONFIG_HIBERNATE_CALLBACKS */ } return NULL; } /** * pm_noirq_op - Return the PM operation appropriate for given PM event. * @ops: PM operations to choose from. * @state: PM transition of the system being carried out. * * The driver of @dev will not receive interrupts while this function is being * executed. */ static pm_callback_t pm_noirq_op(const struct dev_pm_ops *ops, pm_message_t state) { switch (state.event) { #ifdef CONFIG_SUSPEND case PM_EVENT_SUSPEND: return ops->suspend_noirq; case PM_EVENT_RESUME: return ops->resume_noirq; #endif /* CONFIG_SUSPEND */ #ifdef CONFIG_HIBERNATE_CALLBACKS case PM_EVENT_FREEZE: case PM_EVENT_QUIESCE: return ops->freeze_noirq; case PM_EVENT_HIBERNATE: return ops->poweroff_noirq; case PM_EVENT_THAW: case PM_EVENT_RECOVER: return ops->thaw_noirq; case PM_EVENT_RESTORE: return ops->restore_noirq; #endif /* CONFIG_HIBERNATE_CALLBACKS */ } return NULL; } static void pm_dev_dbg(struct device *dev, pm_message_t state, const char *info) { dev_dbg(dev, "%s%s%s driver flags: %x\n", info, pm_verb(state.event), ((state.event & PM_EVENT_SLEEP) && device_may_wakeup(dev)) ? ", may wakeup" : "", dev->power.driver_flags); } static void pm_dev_err(struct device *dev, pm_message_t state, const char *info, int error) { dev_err(dev, "failed to %s%s: error %d\n", pm_verb(state.event), info, error); } static void dpm_show_time(ktime_t starttime, pm_message_t state, int error, const char *info) { ktime_t calltime; u64 usecs64; int usecs; calltime = ktime_get(); usecs64 = ktime_to_ns(ktime_sub(calltime, starttime)); do_div(usecs64, NSEC_PER_USEC); usecs = usecs64; if (usecs == 0) usecs = 1; pm_pr_dbg("%s%s%s of devices %s after %ld.%03ld msecs\n", info ?: "", info ? " " : "", pm_verb(state.event), error ? "aborted" : "complete", usecs / USEC_PER_MSEC, usecs % USEC_PER_MSEC); } static int dpm_run_callback(pm_callback_t cb, struct device *dev, pm_message_t state, const char *info) { ktime_t calltime; int error; if (!cb) return 0; calltime = initcall_debug_start(dev, cb); pm_dev_dbg(dev, state, info); trace_device_pm_callback_start(dev, info, state.event); error = cb(dev); trace_device_pm_callback_end(dev, error); suspend_report_result(dev, cb, error); initcall_debug_report(dev, calltime, cb, error); return error; } #ifdef CONFIG_DPM_WATCHDOG struct dpm_watchdog { struct device *dev; struct task_struct *tsk; struct timer_list timer; bool fatal; }; #define DECLARE_DPM_WATCHDOG_ON_STACK(wd) \ struct dpm_watchdog wd /** * dpm_watchdog_handler - Driver suspend / resume watchdog handler. * @t: The timer that PM watchdog depends on. * * Called when a driver has timed out suspending or resuming. * There's not much we can do here to recover so panic() to * capture a crash-dump in pstore. */ static void dpm_watchdog_handler(struct timer_list *t) { struct dpm_watchdog *wd = from_timer(wd, t, timer); struct timer_list *timer = &wd->timer; unsigned int time_left; if (wd->fatal) { dev_emerg(wd->dev, "**** DPM device timeout ****\n"); show_stack(wd->tsk, NULL, KERN_EMERG); panic("%s %s: unrecoverable failure\n", dev_driver_string(wd->dev), dev_name(wd->dev)); } time_left = CONFIG_DPM_WATCHDOG_TIMEOUT - CONFIG_DPM_WATCHDOG_WARNING_TIMEOUT; dev_warn(wd->dev, "**** DPM device timeout after %u seconds; %u seconds until panic ****\n", CONFIG_DPM_WATCHDOG_WARNING_TIMEOUT, time_left); show_stack(wd->tsk, NULL, KERN_WARNING); wd->fatal = true; mod_timer(timer, jiffies + HZ * time_left); } /** * dpm_watchdog_set - Enable pm watchdog for given device. * @wd: Watchdog. Must be allocated on the stack. * @dev: Device to handle. */ static void dpm_watchdog_set(struct dpm_watchdog *wd, struct device *dev) { struct timer_list *timer = &wd->timer; wd->dev = dev; wd->tsk = current; wd->fatal = CONFIG_DPM_WATCHDOG_TIMEOUT == CONFIG_DPM_WATCHDOG_WARNING_TIMEOUT; timer_setup_on_stack(timer, dpm_watchdog_handler, 0); /* use same timeout value for both suspend and resume */ timer->expires = jiffies + HZ * CONFIG_DPM_WATCHDOG_WARNING_TIMEOUT; add_timer(timer); } /** * dpm_watchdog_clear - Disable suspend/resume watchdog. * @wd: Watchdog to disable. */ static void dpm_watchdog_clear(struct dpm_watchdog *wd) { struct timer_list *timer = &wd->timer; timer_delete_sync(timer); destroy_timer_on_stack(timer); } #else #define DECLARE_DPM_WATCHDOG_ON_STACK(wd) #define dpm_watchdog_set(x, y) #define dpm_watchdog_clear(x) #endif /*------------------------- Resume routines -------------------------*/ /** * dev_pm_skip_resume - System-wide device resume optimization check. * @dev: Target device. * * Return: * - %false if the transition under way is RESTORE. * - Return value of dev_pm_skip_suspend() if the transition under way is THAW. * - The logical negation of %power.must_resume otherwise (that is, when the * transition under way is RESUME). */ bool dev_pm_skip_resume(struct device *dev) { if (pm_transition.event == PM_EVENT_RESTORE) return false; if (pm_transition.event == PM_EVENT_THAW) return dev_pm_skip_suspend(dev); return !dev->power.must_resume; } static bool is_async(struct device *dev) { return dev->power.async_suspend && pm_async_enabled && !pm_trace_is_enabled(); } static bool dpm_async_fn(struct device *dev, async_func_t func) { if (!is_async(dev)) return false; dev->power.work_in_progress = true; get_device(dev); if (async_schedule_dev_nocall(func, dev)) return true; put_device(dev); /* * async_schedule_dev_nocall() above has returned false, so func() is * not running and it is safe to update power.work_in_progress without * extra synchronization. */ dev->power.work_in_progress = false; return false; } static void dpm_clear_async_state(struct device *dev) { reinit_completion(&dev->power.completion); dev->power.work_in_progress = false; } /** * device_resume_noirq - Execute a "noirq resume" callback for given device. * @dev: Device to handle. * @state: PM transition of the system being carried out. * @async: If true, the device is being resumed asynchronously. * * The driver of @dev will not receive interrupts while this function is being * executed. */ static void device_resume_noirq(struct device *dev, pm_message_t state, bool async) { pm_callback_t callback = NULL; const char *info = NULL; bool skip_resume; int error = 0; TRACE_DEVICE(dev); TRACE_RESUME(0); if (dev->power.syscore || dev->power.direct_complete) goto Out; if (!dev->power.is_noirq_suspended) goto Out; if (!dpm_wait_for_superior(dev, async)) goto Out; skip_resume = dev_pm_skip_resume(dev); /* * If the driver callback is skipped below or by the middle layer * callback and device_resume_early() also skips the driver callback for * this device later, it needs to appear as "suspended" to PM-runtime, * so change its status accordingly. * * Otherwise, the device is going to be resumed, so set its PM-runtime * status to "active" unless its power.smart_suspend flag is clear, in * which case it is not necessary to update its PM-runtime status. */ if (skip_resume) pm_runtime_set_suspended(dev); else if (dev_pm_smart_suspend(dev)) pm_runtime_set_active(dev); if (dev->pm_domain) { info = "noirq power domain "; callback = pm_noirq_op(&dev->pm_domain->ops, state); } else if (dev->type && dev->type->pm) { info = "noirq type "; callback = pm_noirq_op(dev->type->pm, state); } else if (dev->class && dev->class->pm) { info = "noirq class "; callback = pm_noirq_op(dev->class->pm, state); } else if (dev->bus && dev->bus->pm) { info = "noirq bus "; callback = pm_noirq_op(dev->bus->pm, state); } if (callback) goto Run; if (skip_resume) goto Skip; if (dev->driver && dev->driver->pm) { info = "noirq driver "; callback = pm_noirq_op(dev->driver->pm, state); } Run: error = dpm_run_callback(callback, dev, state, info); Skip: dev->power.is_noirq_suspended = false; Out: complete_all(&dev->power.completion); TRACE_RESUME(error); if (error) { async_error = error; dpm_save_failed_dev(dev_name(dev)); pm_dev_err(dev, state, async ? " async noirq" : " noirq", error); } } static void async_resume_noirq(void *data, async_cookie_t cookie) { struct device *dev = data; device_resume_noirq(dev, pm_transition, true); put_device(dev); } static void dpm_noirq_resume_devices(pm_message_t state) { struct device *dev; ktime_t starttime = ktime_get(); trace_suspend_resume(TPS("dpm_resume_noirq"), state.event, true); async_error = 0; pm_transition = state; mutex_lock(&dpm_list_mtx); /* * Trigger the resume of "async" devices upfront so they don't have to * wait for the "non-async" ones they don't depend on. */ list_for_each_entry(dev, &dpm_noirq_list, power.entry) { dpm_clear_async_state(dev); dpm_async_fn(dev, async_resume_noirq); } while (!list_empty(&dpm_noirq_list)) { dev = to_device(dpm_noirq_list.next); list_move_tail(&dev->power.entry, &dpm_late_early_list); if (!dev->power.work_in_progress) { get_device(dev); mutex_unlock(&dpm_list_mtx); device_resume_noirq(dev, state, false); put_device(dev); mutex_lock(&dpm_list_mtx); } } mutex_unlock(&dpm_list_mtx); async_synchronize_full(); dpm_show_time(starttime, state, 0, "noirq"); if (async_error) dpm_save_failed_step(SUSPEND_RESUME_NOIRQ); trace_suspend_resume(TPS("dpm_resume_noirq"), state.event, false); } /** * dpm_resume_noirq - Execute "noirq resume" callbacks for all devices. * @state: PM transition of the system being carried out. * * Invoke the "noirq" resume callbacks for all devices in dpm_noirq_list and * allow device drivers' interrupt handlers to be called. */ void dpm_resume_noirq(pm_message_t state) { dpm_noirq_resume_devices(state); resume_device_irqs(); device_wakeup_disarm_wake_irqs(); } /** * device_resume_early - Execute an "early resume" callback for given device. * @dev: Device to handle. * @state: PM transition of the system being carried out. * @async: If true, the device is being resumed asynchronously. * * Runtime PM is disabled for @dev while this function is being executed. */ static void device_resume_early(struct device *dev, pm_message_t state, bool async) { pm_callback_t callback = NULL; const char *info = NULL; int error = 0; TRACE_DEVICE(dev); TRACE_RESUME(0); if (dev->power.syscore || dev->power.direct_complete) goto Out; if (!dev->power.is_late_suspended) goto Out; if (!dpm_wait_for_superior(dev, async)) goto Out; if (dev->pm_domain) { info = "early power domain "; callback = pm_late_early_op(&dev->pm_domain->ops, state); } else if (dev->type && dev->type->pm) { info = "early type "; callback = pm_late_early_op(dev->type->pm, state); } else if (dev->class && dev->class->pm) { info = "early class "; callback = pm_late_early_op(dev->class->pm, state); } else if (dev->bus && dev->bus->pm) { info = "early bus "; callback = pm_late_early_op(dev->bus->pm, state); } if (callback) goto Run; if (dev_pm_skip_resume(dev)) goto Skip; if (dev->driver && dev->driver->pm) { info = "early driver "; callback = pm_late_early_op(dev->driver->pm, state); } Run: error = dpm_run_callback(callback, dev, state, info); Skip: dev->power.is_late_suspended = false; Out: TRACE_RESUME(error); pm_runtime_enable(dev); complete_all(&dev->power.completion); if (error) { async_error = error; dpm_save_failed_dev(dev_name(dev)); pm_dev_err(dev, state, async ? " async early" : " early", error); } } static void async_resume_early(void *data, async_cookie_t cookie) { struct device *dev = data; device_resume_early(dev, pm_transition, true); put_device(dev); } /** * dpm_resume_early - Execute "early resume" callbacks for all devices. * @state: PM transition of the system being carried out. */ void dpm_resume_early(pm_message_t state) { struct device *dev; ktime_t starttime = ktime_get(); trace_suspend_resume(TPS("dpm_resume_early"), state.event, true); async_error = 0; pm_transition = state; mutex_lock(&dpm_list_mtx); /* * Trigger the resume of "async" devices upfront so they don't have to * wait for the "non-async" ones they don't depend on. */ list_for_each_entry(dev, &dpm_late_early_list, power.entry) { dpm_clear_async_state(dev); dpm_async_fn(dev, async_resume_early); } while (!list_empty(&dpm_late_early_list)) { dev = to_device(dpm_late_early_list.next); list_move_tail(&dev->power.entry, &dpm_suspended_list); if (!dev->power.work_in_progress) { get_device(dev); mutex_unlock(&dpm_list_mtx); device_resume_early(dev, state, false); put_device(dev); mutex_lock(&dpm_list_mtx); } } mutex_unlock(&dpm_list_mtx); async_synchronize_full(); dpm_show_time(starttime, state, 0, "early"); if (async_error) dpm_save_failed_step(SUSPEND_RESUME_EARLY); trace_suspend_resume(TPS("dpm_resume_early"), state.event, false); } /** * dpm_resume_start - Execute "noirq" and "early" device callbacks. * @state: PM transition of the system being carried out. */ void dpm_resume_start(pm_message_t state) { dpm_resume_noirq(state); dpm_resume_early(state); } EXPORT_SYMBOL_GPL(dpm_resume_start); /** * device_resume - Execute "resume" callbacks for given device. * @dev: Device to handle. * @state: PM transition of the system being carried out. * @async: If true, the device is being resumed asynchronously. */ static void device_resume(struct device *dev, pm_message_t state, bool async) { pm_callback_t callback = NULL; const char *info = NULL; int error = 0; DECLARE_DPM_WATCHDOG_ON_STACK(wd); TRACE_DEVICE(dev); TRACE_RESUME(0); if (dev->power.syscore) goto Complete; if (!dev->power.is_suspended) goto Complete; if (dev->power.direct_complete) { /* * Allow new children to be added under the device after this * point if it has no PM callbacks. */ if (dev->power.no_pm_callbacks) dev->power.is_prepared = false; /* Match the pm_runtime_disable() in device_suspend(). */ pm_runtime_enable(dev); goto Complete; } if (!dpm_wait_for_superior(dev, async)) goto Complete; dpm_watchdog_set(&wd, dev); device_lock(dev); /* * This is a fib. But we'll allow new children to be added below * a resumed device, even if the device hasn't been completed yet. */ dev->power.is_prepared = false; if (dev->pm_domain) { info = "power domain "; callback = pm_op(&dev->pm_domain->ops, state); goto Driver; } if (dev->type && dev->type->pm) { info = "type "; callback = pm_op(dev->type->pm, state); goto Driver; } if (dev->class && dev->class->pm) { info = "class "; callback = pm_op(dev->class->pm, state); goto Driver; } if (dev->bus) { if (dev->bus->pm) { info = "bus "; callback = pm_op(dev->bus->pm, state); } else if (dev->bus->resume) { info = "legacy bus "; callback = dev->bus->resume; goto End; } } Driver: if (!callback && dev->driver && dev->driver->pm) { info = "driver "; callback = pm_op(dev->driver->pm, state); } End: error = dpm_run_callback(callback, dev, state, info); dev->power.is_suspended = false; device_unlock(dev); dpm_watchdog_clear(&wd); Complete: complete_all(&dev->power.completion); TRACE_RESUME(error); if (error) { async_error = error; dpm_save_failed_dev(dev_name(dev)); pm_dev_err(dev, state, async ? " async" : "", error); } } static void async_resume(void *data, async_cookie_t cookie) { struct device *dev = data; device_resume(dev, pm_transition, true); put_device(dev); } /** * dpm_resume - Execute "resume" callbacks for non-sysdev devices. * @state: PM transition of the system being carried out. * * Execute the appropriate "resume" callback for all devices whose status * indicates that they are suspended. */ void dpm_resume(pm_message_t state) { struct device *dev; ktime_t starttime = ktime_get(); trace_suspend_resume(TPS("dpm_resume"), state.event, true); might_sleep(); pm_transition = state; async_error = 0; mutex_lock(&dpm_list_mtx); /* * Trigger the resume of "async" devices upfront so they don't have to * wait for the "non-async" ones they don't depend on. */ list_for_each_entry(dev, &dpm_suspended_list, power.entry) { dpm_clear_async_state(dev); dpm_async_fn(dev, async_resume); } while (!list_empty(&dpm_suspended_list)) { dev = to_device(dpm_suspended_list.next); list_move_tail(&dev->power.entry, &dpm_prepared_list); if (!dev->power.work_in_progress) { get_device(dev); mutex_unlock(&dpm_list_mtx); device_resume(dev, state, false); put_device(dev); mutex_lock(&dpm_list_mtx); } } mutex_unlock(&dpm_list_mtx); async_synchronize_full(); dpm_show_time(starttime, state, 0, NULL); if (async_error) dpm_save_failed_step(SUSPEND_RESUME); cpufreq_resume(); devfreq_resume(); trace_suspend_resume(TPS("dpm_resume"), state.event, false); } /** * device_complete - Complete a PM transition for given device. * @dev: Device to handle. * @state: PM transition of the system being carried out. */ static void device_complete(struct device *dev, pm_message_t state) { void (*callback)(struct device *) = NULL; const char *info = NULL; if (dev->power.syscore) goto out; device_lock(dev); if (dev->pm_domain) { info = "completing power domain "; callback = dev->pm_domain->ops.complete; } else if (dev->type && dev->type->pm) { info = "completing type "; callback = dev->type->pm->complete; } else if (dev->class && dev->class->pm) { info = "completing class "; callback = dev->class->pm->complete; } else if (dev->bus && dev->bus->pm) { info = "completing bus "; callback = dev->bus->pm->complete; } if (!callback && dev->driver && dev->driver->pm) { info = "completing driver "; callback = dev->driver->pm->complete; } if (callback) { pm_dev_dbg(dev, state, info); callback(dev); } device_unlock(dev); out: /* If enabling runtime PM for the device is blocked, unblock it. */ pm_runtime_unblock(dev); pm_runtime_put(dev); } /** * dpm_complete - Complete a PM transition for all non-sysdev devices. * @state: PM transition of the system being carried out. * * Execute the ->complete() callbacks for all devices whose PM status is not * DPM_ON (this allows new devices to be registered). */ void dpm_complete(pm_message_t state) { struct list_head list; trace_suspend_resume(TPS("dpm_complete"), state.event, true); might_sleep(); INIT_LIST_HEAD(&list); mutex_lock(&dpm_list_mtx); while (!list_empty(&dpm_prepared_list)) { struct device *dev = to_device(dpm_prepared_list.prev); get_device(dev); dev->power.is_prepared = false; list_move(&dev->power.entry, &list); mutex_unlock(&dpm_list_mtx); trace_device_pm_callback_start(dev, "", state.event); device_complete(dev, state); trace_device_pm_callback_end(dev, 0); put_device(dev); mutex_lock(&dpm_list_mtx); } list_splice(&list, &dpm_list); mutex_unlock(&dpm_list_mtx); /* Allow device probing and trigger re-probing of deferred devices */ device_unblock_probing(); trace_suspend_resume(TPS("dpm_complete"), state.event, false); } /** * dpm_resume_end - Execute "resume" callbacks and complete system transition. * @state: PM transition of the system being carried out. * * Execute "resume" callbacks for all devices and complete the PM transition of * the system. */ void dpm_resume_end(pm_message_t state) { dpm_resume(state); dpm_complete(state); } EXPORT_SYMBOL_GPL(dpm_resume_end); /*------------------------- Suspend routines -------------------------*/ /** * resume_event - Return a "resume" message for given "suspend" sleep state. * @sleep_state: PM message representing a sleep state. * * Return a PM message representing the resume event corresponding to given * sleep state. */ static pm_message_t resume_event(pm_message_t sleep_state) { switch (sleep_state.event) { case PM_EVENT_SUSPEND: return PMSG_RESUME; case PM_EVENT_FREEZE: case PM_EVENT_QUIESCE: return PMSG_RECOVER; case PM_EVENT_HIBERNATE: return PMSG_RESTORE; } return PMSG_ON; } static void dpm_superior_set_must_resume(struct device *dev) { struct device_link *link; int idx; if (dev->parent) dev->parent->power.must_resume = true; idx = device_links_read_lock(); list_for_each_entry_rcu_locked(link, &dev->links.suppliers, c_node) link->supplier->power.must_resume = true; device_links_read_unlock(idx); } /** * device_suspend_noirq - Execute a "noirq suspend" callback for given device. * @dev: Device to handle. * @state: PM transition of the system being carried out. * @async: If true, the device is being suspended asynchronously. * * The driver of @dev will not receive interrupts while this function is being * executed. */ static int device_suspend_noirq(struct device *dev, pm_message_t state, bool async) { pm_callback_t callback = NULL; const char *info = NULL; int error = 0; TRACE_DEVICE(dev); TRACE_SUSPEND(0); dpm_wait_for_subordinate(dev, async); if (async_error) goto Complete; if (dev->power.syscore || dev->power.direct_complete) goto Complete; if (dev->pm_domain) { info = "noirq power domain "; callback = pm_noirq_op(&dev->pm_domain->ops, state); } else if (dev->type && dev->type->pm) { info = "noirq type "; callback = pm_noirq_op(dev->type->pm, state); } else if (dev->class && dev->class->pm) { info = "noirq class "; callback = pm_noirq_op(dev->class->pm, state); } else if (dev->bus && dev->bus->pm) { info = "noirq bus "; callback = pm_noirq_op(dev->bus->pm, state); } if (callback) goto Run; if (dev_pm_skip_suspend(dev)) goto Skip; if (dev->driver && dev->driver->pm) { info = "noirq driver "; callback = pm_noirq_op(dev->driver->pm, state); } Run: error = dpm_run_callback(callback, dev, state, info); if (error) { async_error = error; dpm_save_failed_dev(dev_name(dev)); pm_dev_err(dev, state, async ? " async noirq" : " noirq", error); goto Complete; } Skip: dev->power.is_noirq_suspended = true; /* * Devices must be resumed unless they are explicitly allowed to be left * in suspend, but even in that case skipping the resume of devices that * were in use right before the system suspend (as indicated by their * runtime PM usage counters and child counters) would be suboptimal. */ if (!(dev_pm_test_driver_flags(dev, DPM_FLAG_MAY_SKIP_RESUME) && dev->power.may_skip_resume) || !pm_runtime_need_not_resume(dev)) dev->power.must_resume = true; if (dev->power.must_resume) dpm_superior_set_must_resume(dev); Complete: complete_all(&dev->power.completion); TRACE_SUSPEND(error); return error; } static void async_suspend_noirq(void *data, async_cookie_t cookie) { struct device *dev = data; device_suspend_noirq(dev, pm_transition, true); put_device(dev); } static int dpm_noirq_suspend_devices(pm_message_t state) { ktime_t starttime = ktime_get(); int error = 0; trace_suspend_resume(TPS("dpm_suspend_noirq"), state.event, true); pm_transition = state; async_error = 0; mutex_lock(&dpm_list_mtx); while (!list_empty(&dpm_late_early_list)) { struct device *dev = to_device(dpm_late_early_list.prev); list_move(&dev->power.entry, &dpm_noirq_list); dpm_clear_async_state(dev); if (dpm_async_fn(dev, async_suspend_noirq)) continue; get_device(dev); mutex_unlock(&dpm_list_mtx); error = device_suspend_noirq(dev, state, false); put_device(dev); mutex_lock(&dpm_list_mtx); if (error || async_error) break; } mutex_unlock(&dpm_list_mtx); async_synchronize_full(); if (!error) error = async_error; if (error) dpm_save_failed_step(SUSPEND_SUSPEND_NOIRQ); dpm_show_time(starttime, state, error, "noirq"); trace_suspend_resume(TPS("dpm_suspend_noirq"), state.event, false); return error; } /** * dpm_suspend_noirq - Execute "noirq suspend" callbacks for all devices. * @state: PM transition of the system being carried out. * * Prevent device drivers' interrupt handlers from being called and invoke * "noirq" suspend callbacks for all non-sysdev devices. */ int dpm_suspend_noirq(pm_message_t state) { int ret; device_wakeup_arm_wake_irqs(); suspend_device_irqs(); ret = dpm_noirq_suspend_devices(state); if (ret) dpm_resume_noirq(resume_event(state)); return ret; } static void dpm_propagate_wakeup_to_parent(struct device *dev) { struct device *parent = dev->parent; if (!parent) return; spin_lock_irq(&parent->power.lock); if (device_wakeup_path(dev) && !parent->power.ignore_children) parent->power.wakeup_path = true; spin_unlock_irq(&parent->power.lock); } /** * device_suspend_late - Execute a "late suspend" callback for given device. * @dev: Device to handle. * @state: PM transition of the system being carried out. * @async: If true, the device is being suspended asynchronously. * * Runtime PM is disabled for @dev while this function is being executed. */ static int device_suspend_late(struct device *dev, pm_message_t state, bool async) { pm_callback_t callback = NULL; const char *info = NULL; int error = 0; TRACE_DEVICE(dev); TRACE_SUSPEND(0); /* * Disable runtime PM for the device without checking if there is a * pending resume request for it. */ __pm_runtime_disable(dev, false); dpm_wait_for_subordinate(dev, async); if (async_error) goto Complete; if (pm_wakeup_pending()) { async_error = -EBUSY; goto Complete; } if (dev->power.syscore || dev->power.direct_complete) goto Complete; if (dev->pm_domain) { info = "late power domain "; callback = pm_late_early_op(&dev->pm_domain->ops, state); } else if (dev->type && dev->type->pm) { info = "late type "; callback = pm_late_early_op(dev->type->pm, state); } else if (dev->class && dev->class->pm) { info = "late class "; callback = pm_late_early_op(dev->class->pm, state); } else if (dev->bus && dev->bus->pm) { info = "late bus "; callback = pm_late_early_op(dev->bus->pm, state); } if (callback) goto Run; if (dev_pm_skip_suspend(dev)) goto Skip; if (dev->driver && dev->driver->pm) { info = "late driver "; callback = pm_late_early_op(dev->driver->pm, state); } Run: error = dpm_run_callback(callback, dev, state, info); if (error) { async_error = error; dpm_save_failed_dev(dev_name(dev)); pm_dev_err(dev, state, async ? " async late" : " late", error); goto Complete; } dpm_propagate_wakeup_to_parent(dev); Skip: dev->power.is_late_suspended = true; Complete: TRACE_SUSPEND(error); complete_all(&dev->power.completion); return error; } static void async_suspend_late(void *data, async_cookie_t cookie) { struct device *dev = data; device_suspend_late(dev, pm_transition, true); put_device(dev); } /** * dpm_suspend_late - Execute "late suspend" callbacks for all devices. * @state: PM transition of the system being carried out. */ int dpm_suspend_late(pm_message_t state) { ktime_t starttime = ktime_get(); int error = 0; trace_suspend_resume(TPS("dpm_suspend_late"), state.event, true); pm_transition = state; async_error = 0; wake_up_all_idle_cpus(); mutex_lock(&dpm_list_mtx); while (!list_empty(&dpm_suspended_list)) { struct device *dev = to_device(dpm_suspended_list.prev); list_move(&dev->power.entry, &dpm_late_early_list); dpm_clear_async_state(dev); if (dpm_async_fn(dev, async_suspend_late)) continue; get_device(dev); mutex_unlock(&dpm_list_mtx); error = device_suspend_late(dev, state, false); put_device(dev); mutex_lock(&dpm_list_mtx); if (error || async_error) break; } mutex_unlock(&dpm_list_mtx); async_synchronize_full(); if (!error) error = async_error; if (error) { dpm_save_failed_step(SUSPEND_SUSPEND_LATE); dpm_resume_early(resume_event(state)); } dpm_show_time(starttime, state, error, "late"); trace_suspend_resume(TPS("dpm_suspend_late"), state.event, false); return error; } /** * dpm_suspend_end - Execute "late" and "noirq" device suspend callbacks. * @state: PM transition of the system being carried out. */ int dpm_suspend_end(pm_message_t state) { ktime_t starttime = ktime_get(); int error; error = dpm_suspend_late(state); if (error) goto out; error = dpm_suspend_noirq(state); if (error) dpm_resume_early(resume_event(state)); out: dpm_show_time(starttime, state, error, "end"); return error; } EXPORT_SYMBOL_GPL(dpm_suspend_end); /** * legacy_suspend - Execute a legacy (bus or class) suspend callback for device. * @dev: Device to suspend. * @state: PM transition of the system being carried out. * @cb: Suspend callback to execute. * @info: string description of caller. */ static int legacy_suspend(struct device *dev, pm_message_t state, int (*cb)(struct device *dev, pm_message_t state), const char *info) { int error; ktime_t calltime; calltime = initcall_debug_start(dev, cb); trace_device_pm_callback_start(dev, info, state.event); error = cb(dev, state); trace_device_pm_callback_end(dev, error); suspend_report_result(dev, cb, error); initcall_debug_report(dev, calltime, cb, error); return error; } static void dpm_clear_superiors_direct_complete(struct device *dev) { struct device_link *link; int idx; if (dev->parent) { spin_lock_irq(&dev->parent->power.lock); dev->parent->power.direct_complete = false; spin_unlock_irq(&dev->parent->power.lock); } idx = device_links_read_lock(); list_for_each_entry_rcu_locked(link, &dev->links.suppliers, c_node) { spin_lock_irq(&link->supplier->power.lock); link->supplier->power.direct_complete = false; spin_unlock_irq(&link->supplier->power.lock); } device_links_read_unlock(idx); } /** * device_suspend - Execute "suspend" callbacks for given device. * @dev: Device to handle. * @state: PM transition of the system being carried out. * @async: If true, the device is being suspended asynchronously. */ static int device_suspend(struct device *dev, pm_message_t state, bool async) { pm_callback_t callback = NULL; const char *info = NULL; int error = 0; DECLARE_DPM_WATCHDOG_ON_STACK(wd); TRACE_DEVICE(dev); TRACE_SUSPEND(0); dpm_wait_for_subordinate(dev, async); if (async_error) { dev->power.direct_complete = false; goto Complete; } /* * Wait for possible runtime PM transitions of the device in progress * to complete and if there's a runtime resume request pending for it, * resume it before proceeding with invoking the system-wide suspend * callbacks for it. * * If the system-wide suspend callbacks below change the configuration * of the device, they must disable runtime PM for it or otherwise * ensure that its runtime-resume callbacks will not be confused by that * change in case they are invoked going forward. */ pm_runtime_barrier(dev); if (pm_wakeup_pending()) { dev->power.direct_complete = false; async_error = -EBUSY; goto Complete; } if (dev->power.syscore) goto Complete; /* Avoid direct_complete to let wakeup_path propagate. */ if (device_may_wakeup(dev) || device_wakeup_path(dev)) dev->power.direct_complete = false; if (dev->power.direct_complete) { if (pm_runtime_status_suspended(dev)) { pm_runtime_disable(dev); if (pm_runtime_status_suspended(dev)) { pm_dev_dbg(dev, state, "direct-complete "); dev->power.is_suspended = true; goto Complete; } pm_runtime_enable(dev); } dev->power.direct_complete = false; } dev->power.may_skip_resume = true; dev->power.must_resume = !dev_pm_test_driver_flags(dev, DPM_FLAG_MAY_SKIP_RESUME); dpm_watchdog_set(&wd, dev); device_lock(dev); if (dev->pm_domain) { info = "power domain "; callback = pm_op(&dev->pm_domain->ops, state); goto Run; } if (dev->type && dev->type->pm) { info = "type "; callback = pm_op(dev->type->pm, state); goto Run; } if (dev->class && dev->class->pm) { info = "class "; callback = pm_op(dev->class->pm, state); goto Run; } if (dev->bus) { if (dev->bus->pm) { info = "bus "; callback = pm_op(dev->bus->pm, state); } else if (dev->bus->suspend) { pm_dev_dbg(dev, state, "legacy bus "); error = legacy_suspend(dev, state, dev->bus->suspend, "legacy bus "); goto End; } } Run: if (!callback && dev->driver && dev->driver->pm) { info = "driver "; callback = pm_op(dev->driver->pm, state); } error = dpm_run_callback(callback, dev, state, info); End: if (!error) { dev->power.is_suspended = true; if (device_may_wakeup(dev)) dev->power.wakeup_path = true; dpm_propagate_wakeup_to_parent(dev); dpm_clear_superiors_direct_complete(dev); } device_unlock(dev); dpm_watchdog_clear(&wd); Complete: if (error) { async_error = error; dpm_save_failed_dev(dev_name(dev)); pm_dev_err(dev, state, async ? " async" : "", error); } complete_all(&dev->power.completion); TRACE_SUSPEND(error); return error; } static void async_suspend(void *data, async_cookie_t cookie) { struct device *dev = data; device_suspend(dev, pm_transition, true); put_device(dev); } /** * dpm_suspend - Execute "suspend" callbacks for all non-sysdev devices. * @state: PM transition of the system being carried out. */ int dpm_suspend(pm_message_t state) { ktime_t starttime = ktime_get(); int error = 0; trace_suspend_resume(TPS("dpm_suspend"), state.event, true); might_sleep(); devfreq_suspend(); cpufreq_suspend(); pm_transition = state; async_error = 0; mutex_lock(&dpm_list_mtx); while (!list_empty(&dpm_prepared_list)) { struct device *dev = to_device(dpm_prepared_list.prev); list_move(&dev->power.entry, &dpm_suspended_list); dpm_clear_async_state(dev); if (dpm_async_fn(dev, async_suspend)) continue; get_device(dev); mutex_unlock(&dpm_list_mtx); error = device_suspend(dev, state, false); put_device(dev); mutex_lock(&dpm_list_mtx); if (error || async_error) break; } mutex_unlock(&dpm_list_mtx); async_synchronize_full(); if (!error) error = async_error; if (error) dpm_save_failed_step(SUSPEND_SUSPEND); dpm_show_time(starttime, state, error, NULL); trace_suspend_resume(TPS("dpm_suspend"), state.event, false); return error; } static bool device_prepare_smart_suspend(struct device *dev) { struct device_link *link; bool ret = true; int idx; /* * The "smart suspend" feature is enabled for devices whose drivers ask * for it and for devices without PM callbacks. * * However, if "smart suspend" is not enabled for the device's parent * or any of its suppliers that take runtime PM into account, it cannot * be enabled for the device either. */ if (!dev->power.no_pm_callbacks && !dev_pm_test_driver_flags(dev, DPM_FLAG_SMART_SUSPEND)) return false; if (dev->parent && !dev_pm_smart_suspend(dev->parent) && !dev->parent->power.ignore_children && !pm_runtime_blocked(dev->parent)) return false; idx = device_links_read_lock(); list_for_each_entry_rcu_locked(link, &dev->links.suppliers, c_node) { if (!(link->flags & DL_FLAG_PM_RUNTIME)) continue; if (!dev_pm_smart_suspend(link->supplier) && !pm_runtime_blocked(link->supplier)) { ret = false; break; } } device_links_read_unlock(idx); return ret; } /** * device_prepare - Prepare a device for system power transition. * @dev: Device to handle. * @state: PM transition of the system being carried out. * * Execute the ->prepare() callback(s) for given device. No new children of the * device may be registered after this function has returned. */ static int device_prepare(struct device *dev, pm_message_t state) { int (*callback)(struct device *) = NULL; bool smart_suspend; int ret = 0; /* * If a device's parent goes into runtime suspend at the wrong time, * it won't be possible to resume the device. To prevent this we * block runtime suspend here, during the prepare phase, and allow * it again during the complete phase. */ pm_runtime_get_noresume(dev); /* * If runtime PM is disabled for the device at this point and it has * never been enabled so far, it should not be enabled until this system * suspend-resume cycle is complete, so prepare to trigger a warning on * subsequent attempts to enable it. */ smart_suspend = !pm_runtime_block_if_disabled(dev); if (dev->power.syscore) return 0; device_lock(dev); dev->power.wakeup_path = false; if (dev->power.no_pm_callbacks) goto unlock; if (dev->pm_domain) callback = dev->pm_domain->ops.prepare; else if (dev->type && dev->type->pm) callback = dev->type->pm->prepare; else if (dev->class && dev->class->pm) callback = dev->class->pm->prepare; else if (dev->bus && dev->bus->pm) callback = dev->bus->pm->prepare; if (!callback && dev->driver && dev->driver->pm) callback = dev->driver->pm->prepare; if (callback) ret = callback(dev); unlock: device_unlock(dev); if (ret < 0) { suspend_report_result(dev, callback, ret); pm_runtime_put(dev); return ret; } /* Do not enable "smart suspend" for devices with disabled runtime PM. */ if (smart_suspend) smart_suspend = device_prepare_smart_suspend(dev); spin_lock_irq(&dev->power.lock); dev->power.smart_suspend = smart_suspend; /* * A positive return value from ->prepare() means "this device appears * to be runtime-suspended and its state is fine, so if it really is * runtime-suspended, you can leave it in that state provided that you * will do the same thing with all of its descendants". This only * applies to suspend transitions, however. */ dev->power.direct_complete = state.event == PM_EVENT_SUSPEND && (ret > 0 || dev->power.no_pm_callbacks) && !dev_pm_test_driver_flags(dev, DPM_FLAG_NO_DIRECT_COMPLETE); spin_unlock_irq(&dev->power.lock); return 0; } /** * dpm_prepare - Prepare all non-sysdev devices for a system PM transition. * @state: PM transition of the system being carried out. * * Execute the ->prepare() callback(s) for all devices. */ int dpm_prepare(pm_message_t state) { int error = 0; trace_suspend_resume(TPS("dpm_prepare"), state.event, true); might_sleep(); /* * Give a chance for the known devices to complete their probes, before * disable probing of devices. This sync point is important at least * at boot time + hibernation restore. */ wait_for_device_probe(); /* * It is unsafe if probing of devices will happen during suspend or * hibernation and system behavior will be unpredictable in this case. * So, let's prohibit device's probing here and defer their probes * instead. The normal behavior will be restored in dpm_complete(). */ device_block_probing(); mutex_lock(&dpm_list_mtx); while (!list_empty(&dpm_list) && !error) { struct device *dev = to_device(dpm_list.next); get_device(dev); mutex_unlock(&dpm_list_mtx); trace_device_pm_callback_start(dev, "", state.event); error = device_prepare(dev, state); trace_device_pm_callback_end(dev, error); mutex_lock(&dpm_list_mtx); if (!error) { dev->power.is_prepared = true; if (!list_empty(&dev->power.entry)) list_move_tail(&dev->power.entry, &dpm_prepared_list); } else if (error == -EAGAIN) { error = 0; } else { dev_info(dev, "not prepared for power transition: code %d\n", error); } mutex_unlock(&dpm_list_mtx); put_device(dev); mutex_lock(&dpm_list_mtx); } mutex_unlock(&dpm_list_mtx); trace_suspend_resume(TPS("dpm_prepare"), state.event, false); return error; } /** * dpm_suspend_start - Prepare devices for PM transition and suspend them. * @state: PM transition of the system being carried out. * * Prepare all non-sysdev devices for system PM transition and execute "suspend" * callbacks for them. */ int dpm_suspend_start(pm_message_t state) { ktime_t starttime = ktime_get(); int error; error = dpm_prepare(state); if (error) dpm_save_failed_step(SUSPEND_PREPARE); else error = dpm_suspend(state); dpm_show_time(starttime, state, error, "start"); return error; } EXPORT_SYMBOL_GPL(dpm_suspend_start); void __suspend_report_result(const char *function, struct device *dev, void *fn, int ret) { if (ret) dev_err(dev, "%s(): %ps returns %d\n", function, fn, ret); } EXPORT_SYMBOL_GPL(__suspend_report_result); /** * device_pm_wait_for_dev - Wait for suspend/resume of a device to complete. * @subordinate: Device that needs to wait for @dev. * @dev: Device to wait for. */ int device_pm_wait_for_dev(struct device *subordinate, struct device *dev) { dpm_wait(dev, subordinate->power.async_suspend); return async_error; } EXPORT_SYMBOL_GPL(device_pm_wait_for_dev); /** * dpm_for_each_dev - device iterator. * @data: data for the callback. * @fn: function to be called for each device. * * Iterate over devices in dpm_list, and call @fn for each device, * passing it @data. */ void dpm_for_each_dev(void *data, void (*fn)(struct device *, void *)) { struct device *dev; if (!fn) return; device_pm_lock(); list_for_each_entry(dev, &dpm_list, power.entry) fn(dev, data); device_pm_unlock(); } EXPORT_SYMBOL_GPL(dpm_for_each_dev); static bool pm_ops_is_empty(const struct dev_pm_ops *ops) { if (!ops) return true; return !ops->prepare && !ops->suspend && !ops->suspend_late && !ops->suspend_noirq && !ops->resume_noirq && !ops->resume_early && !ops->resume && !ops->complete; } void device_pm_check_callbacks(struct device *dev) { unsigned long flags; spin_lock_irqsave(&dev->power.lock, flags); dev->power.no_pm_callbacks = (!dev->bus || (pm_ops_is_empty(dev->bus->pm) && !dev->bus->suspend && !dev->bus->resume)) && (!dev->class || pm_ops_is_empty(dev->class->pm)) && (!dev->type || pm_ops_is_empty(dev->type->pm)) && (!dev->pm_domain || pm_ops_is_empty(&dev->pm_domain->ops)) && (!dev->driver || (pm_ops_is_empty(dev->driver->pm) && !dev->driver->suspend && !dev->driver->resume)); spin_unlock_irqrestore(&dev->power.lock, flags); } bool dev_pm_skip_suspend(struct device *dev) { return dev_pm_smart_suspend(dev) && pm_runtime_status_suspended(dev); }
132 534 9 9 9 9 574 574 574 1903 1533 568 1519 595 1527 1526 1523 139 139 139 139 138 1 1 1 1 10 2 4 3 6 2 6 1 2 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 // SPDX-License-Identifier: GPL-2.0 #include <linux/kernel.h> #include <linux/errno.h> #include <linux/file.h> #include <linux/mm.h> #include <linux/slab.h> #include <linux/nospec.h> #include <linux/io_uring.h> #include <uapi/linux/io_uring.h> #include "io_uring.h" #include "tctx.h" static struct io_wq *io_init_wq_offload(struct io_ring_ctx *ctx, struct task_struct *task) { struct io_wq_hash *hash; struct io_wq_data data; unsigned int concurrency; mutex_lock(&ctx->uring_lock); hash = ctx->hash_map; if (!hash) { hash = kzalloc(sizeof(*hash), GFP_KERNEL); if (!hash) { mutex_unlock(&ctx->uring_lock); return ERR_PTR(-ENOMEM); } refcount_set(&hash->refs, 1); init_waitqueue_head(&hash->wait); ctx->hash_map = hash; } mutex_unlock(&ctx->uring_lock); data.hash = hash; data.task = task; data.free_work = io_wq_free_work; data.do_work = io_wq_submit_work; /* Do QD, or 4 * CPUS, whatever is smallest */ concurrency = min(ctx->sq_entries, 4 * num_online_cpus()); return io_wq_create(concurrency, &data); } void __io_uring_free(struct task_struct *tsk) { struct io_uring_task *tctx = tsk->io_uring; struct io_tctx_node *node; unsigned long index; /* * Fault injection forcing allocation errors in the xa_store() path * can lead to xa_empty() returning false, even though no actual * node is stored in the xarray. Until that gets sorted out, attempt * an iteration here and warn if any entries are found. */ xa_for_each(&tctx->xa, index, node) { WARN_ON_ONCE(1); break; } WARN_ON_ONCE(tctx->io_wq); WARN_ON_ONCE(tctx->cached_refs); percpu_counter_destroy(&tctx->inflight); kfree(tctx); tsk->io_uring = NULL; } __cold int io_uring_alloc_task_context(struct task_struct *task, struct io_ring_ctx *ctx) { struct io_uring_task *tctx; int ret; tctx = kzalloc(sizeof(*tctx), GFP_KERNEL); if (unlikely(!tctx)) return -ENOMEM; ret = percpu_counter_init(&tctx->inflight, 0, GFP_KERNEL); if (unlikely(ret)) { kfree(tctx); return ret; } tctx->io_wq = io_init_wq_offload(ctx, task); if (IS_ERR(tctx->io_wq)) { ret = PTR_ERR(tctx->io_wq); percpu_counter_destroy(&tctx->inflight); kfree(tctx); return ret; } tctx->task = task; xa_init(&tctx->xa); init_waitqueue_head(&tctx->wait); atomic_set(&tctx->in_cancel, 0); atomic_set(&tctx->inflight_tracked, 0); task->io_uring = tctx; init_llist_head(&tctx->task_list); init_task_work(&tctx->task_work, tctx_task_work); return 0; } int __io_uring_add_tctx_node(struct io_ring_ctx *ctx) { struct io_uring_task *tctx = current->io_uring; struct io_tctx_node *node; int ret; if (unlikely(!tctx)) { ret = io_uring_alloc_task_context(current, ctx); if (unlikely(ret)) return ret; tctx = current->io_uring; if (ctx->iowq_limits_set) { unsigned int limits[2] = { ctx->iowq_limits[0], ctx->iowq_limits[1], }; ret = io_wq_max_workers(tctx->io_wq, limits); if (ret) return ret; } } if (!xa_load(&tctx->xa, (unsigned long)ctx)) { node = kmalloc(sizeof(*node), GFP_KERNEL); if (!node) return -ENOMEM; node->ctx = ctx; node->task = current; ret = xa_err(xa_store(&tctx->xa, (unsigned long)ctx, node, GFP_KERNEL)); if (ret) { kfree(node); return ret; } mutex_lock(&ctx->uring_lock); list_add(&node->ctx_node, &ctx->tctx_list); mutex_unlock(&ctx->uring_lock); } return 0; } int __io_uring_add_tctx_node_from_submit(struct io_ring_ctx *ctx) { int ret; if (ctx->flags & IORING_SETUP_SINGLE_ISSUER && ctx->submitter_task != current) return -EEXIST; ret = __io_uring_add_tctx_node(ctx); if (ret) return ret; current->io_uring->last = ctx; return 0; } /* * Remove this io_uring_file -> task mapping. */ __cold void io_uring_del_tctx_node(unsigned long index) { struct io_uring_task *tctx = current->io_uring; struct io_tctx_node *node; if (!tctx) return; node = xa_erase(&tctx->xa, index); if (!node) return; WARN_ON_ONCE(current != node->task); WARN_ON_ONCE(list_empty(&node->ctx_node)); mutex_lock(&node->ctx->uring_lock); list_del(&node->ctx_node); mutex_unlock(&node->ctx->uring_lock); if (tctx->last == node->ctx) tctx->last = NULL; kfree(node); } __cold void io_uring_clean_tctx(struct io_uring_task *tctx) { struct io_wq *wq = tctx->io_wq; struct io_tctx_node *node; unsigned long index; xa_for_each(&tctx->xa, index, node) { io_uring_del_tctx_node(index); cond_resched(); } if (wq) { /* * Must be after io_uring_del_tctx_node() (removes nodes under * uring_lock) to avoid race with io_uring_try_cancel_iowq(). */ io_wq_put_and_exit(wq); tctx->io_wq = NULL; } } void io_uring_unreg_ringfd(void) { struct io_uring_task *tctx = current->io_uring; int i; for (i = 0; i < IO_RINGFD_REG_MAX; i++) { if (tctx->registered_rings[i]) { fput(tctx->registered_rings[i]); tctx->registered_rings[i] = NULL; } } } int io_ring_add_registered_file(struct io_uring_task *tctx, struct file *file, int start, int end) { int offset; for (offset = start; offset < end; offset++) { offset = array_index_nospec(offset, IO_RINGFD_REG_MAX); if (tctx->registered_rings[offset]) continue; tctx->registered_rings[offset] = file; return offset; } return -EBUSY; } static int io_ring_add_registered_fd(struct io_uring_task *tctx, int fd, int start, int end) { struct file *file; int offset; file = fget(fd); if (!file) { return -EBADF; } else if (!io_is_uring_fops(file)) { fput(file); return -EOPNOTSUPP; } offset = io_ring_add_registered_file(tctx, file, start, end); if (offset < 0) fput(file); return offset; } /* * Register a ring fd to avoid fdget/fdput for each io_uring_enter() * invocation. User passes in an array of struct io_uring_rsrc_update * with ->data set to the ring_fd, and ->offset given for the desired * index. If no index is desired, application may set ->offset == -1U * and we'll find an available index. Returns number of entries * successfully processed, or < 0 on error if none were processed. */ int io_ringfd_register(struct io_ring_ctx *ctx, void __user *__arg, unsigned nr_args) { struct io_uring_rsrc_update __user *arg = __arg; struct io_uring_rsrc_update reg; struct io_uring_task *tctx; int ret, i; if (!nr_args || nr_args > IO_RINGFD_REG_MAX) return -EINVAL; mutex_unlock(&ctx->uring_lock); ret = __io_uring_add_tctx_node(ctx); mutex_lock(&ctx->uring_lock); if (ret) return ret; tctx = current->io_uring; for (i = 0; i < nr_args; i++) { int start, end; if (copy_from_user(&reg, &arg[i], sizeof(reg))) { ret = -EFAULT; break; } if (reg.resv) { ret = -EINVAL; break; } if (reg.offset == -1U) { start = 0; end = IO_RINGFD_REG_MAX; } else { if (reg.offset >= IO_RINGFD_REG_MAX) { ret = -EINVAL; break; } start = reg.offset; end = start + 1; } ret = io_ring_add_registered_fd(tctx, reg.data, start, end); if (ret < 0) break; reg.offset = ret; if (copy_to_user(&arg[i], &reg, sizeof(reg))) { fput(tctx->registered_rings[reg.offset]); tctx->registered_rings[reg.offset] = NULL; ret = -EFAULT; break; } } return i ? i : ret; } int io_ringfd_unregister(struct io_ring_ctx *ctx, void __user *__arg, unsigned nr_args) { struct io_uring_rsrc_update __user *arg = __arg; struct io_uring_task *tctx = current->io_uring; struct io_uring_rsrc_update reg; int ret = 0, i; if (!nr_args || nr_args > IO_RINGFD_REG_MAX) return -EINVAL; if (!tctx) return 0; for (i = 0; i < nr_args; i++) { if (copy_from_user(&reg, &arg[i], sizeof(reg))) { ret = -EFAULT; break; } if (reg.resv || reg.data || reg.offset >= IO_RINGFD_REG_MAX) { ret = -EINVAL; break; } reg.offset = array_index_nospec(reg.offset, IO_RINGFD_REG_MAX); if (tctx->registered_rings[reg.offset]) { fput(tctx->registered_rings[reg.offset]); tctx->registered_rings[reg.offset] = NULL; } } return i ? i : ret; }
471 471 4 4 525 29 500 1298 206 1123 1 1 1 1 1 1 471 471 471 471 4 4 4 4 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 // SPDX-License-Identifier: GPL-2.0-or-later /* * ip_vs_proto.c: transport protocol load balancing support for IPVS * * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> * Julian Anastasov <ja@ssi.bg> * * Changes: */ #define KMSG_COMPONENT "IPVS" #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt #include <linux/module.h> #include <linux/kernel.h> #include <linux/skbuff.h> #include <linux/gfp.h> #include <linux/in.h> #include <linux/ip.h> #include <net/protocol.h> #include <net/tcp.h> #include <net/udp.h> #include <linux/stat.h> #include <linux/proc_fs.h> #include <net/ip_vs.h> /* * IPVS protocols can only be registered/unregistered when the ipvs * module is loaded/unloaded, so no lock is needed in accessing the * ipvs protocol table. */ #define IP_VS_PROTO_TAB_SIZE 32 /* must be power of 2 */ #define IP_VS_PROTO_HASH(proto) ((proto) & (IP_VS_PROTO_TAB_SIZE-1)) static struct ip_vs_protocol *ip_vs_proto_table[IP_VS_PROTO_TAB_SIZE]; /* States for conn templates: NONE or words separated with ",", max 15 chars */ static const char *ip_vs_ctpl_state_name_table[IP_VS_CTPL_S_LAST] = { [IP_VS_CTPL_S_NONE] = "NONE", [IP_VS_CTPL_S_ASSURED] = "ASSURED", }; /* * register an ipvs protocol */ static int __used __init register_ip_vs_protocol(struct ip_vs_protocol *pp) { unsigned int hash = IP_VS_PROTO_HASH(pp->protocol); pp->next = ip_vs_proto_table[hash]; ip_vs_proto_table[hash] = pp; if (pp->init != NULL) pp->init(pp); return 0; } /* * register an ipvs protocols netns related data */ static int register_ip_vs_proto_netns(struct netns_ipvs *ipvs, struct ip_vs_protocol *pp) { unsigned int hash = IP_VS_PROTO_HASH(pp->protocol); struct ip_vs_proto_data *pd = kzalloc(sizeof(struct ip_vs_proto_data), GFP_KERNEL); if (!pd) return -ENOMEM; pd->pp = pp; /* For speed issues */ pd->next = ipvs->proto_data_table[hash]; ipvs->proto_data_table[hash] = pd; atomic_set(&pd->appcnt, 0); /* Init app counter */ if (pp->init_netns != NULL) { int ret = pp->init_netns(ipvs, pd); if (ret) { /* unlink an free proto data */ ipvs->proto_data_table[hash] = pd->next; kfree(pd); return ret; } } return 0; } /* * unregister an ipvs protocol */ static int unregister_ip_vs_protocol(struct ip_vs_protocol *pp) { struct ip_vs_protocol **pp_p; unsigned int hash = IP_VS_PROTO_HASH(pp->protocol); pp_p = &ip_vs_proto_table[hash]; for (; *pp_p; pp_p = &(*pp_p)->next) { if (*pp_p == pp) { *pp_p = pp->next; if (pp->exit != NULL) pp->exit(pp); return 0; } } return -ESRCH; } /* * unregister an ipvs protocols netns data */ static int unregister_ip_vs_proto_netns(struct netns_ipvs *ipvs, struct ip_vs_proto_data *pd) { struct ip_vs_proto_data **pd_p; unsigned int hash = IP_VS_PROTO_HASH(pd->pp->protocol); pd_p = &ipvs->proto_data_table[hash]; for (; *pd_p; pd_p = &(*pd_p)->next) { if (*pd_p == pd) { *pd_p = pd->next; if (pd->pp->exit_netns != NULL) pd->pp->exit_netns(ipvs, pd); kfree(pd); return 0; } } return -ESRCH; } /* * get ip_vs_protocol object by its proto. */ struct ip_vs_protocol * ip_vs_proto_get(unsigned short proto) { struct ip_vs_protocol *pp; unsigned int hash = IP_VS_PROTO_HASH(proto); for (pp = ip_vs_proto_table[hash]; pp; pp = pp->next) { if (pp->protocol == proto) return pp; } return NULL; } EXPORT_SYMBOL(ip_vs_proto_get); /* * get ip_vs_protocol object data by netns and proto */ struct ip_vs_proto_data * ip_vs_proto_data_get(struct netns_ipvs *ipvs, unsigned short proto) { struct ip_vs_proto_data *pd; unsigned int hash = IP_VS_PROTO_HASH(proto); for (pd = ipvs->proto_data_table[hash]; pd; pd = pd->next) { if (pd->pp->protocol == proto) return pd; } return NULL; } EXPORT_SYMBOL(ip_vs_proto_data_get); /* * Propagate event for state change to all protocols */ void ip_vs_protocol_timeout_change(struct netns_ipvs *ipvs, int flags) { struct ip_vs_proto_data *pd; int i; for (i = 0; i < IP_VS_PROTO_TAB_SIZE; i++) { for (pd = ipvs->proto_data_table[i]; pd; pd = pd->next) { if (pd->pp->timeout_change) pd->pp->timeout_change(pd, flags); } } } int * ip_vs_create_timeout_table(int *table, int size) { return kmemdup(table, size, GFP_KERNEL); } const char *ip_vs_state_name(const struct ip_vs_conn *cp) { unsigned int state = cp->state; struct ip_vs_protocol *pp; if (cp->flags & IP_VS_CONN_F_TEMPLATE) { if (state >= IP_VS_CTPL_S_LAST) return "ERR!"; return ip_vs_ctpl_state_name_table[state] ? : "?"; } pp = ip_vs_proto_get(cp->protocol); if (pp == NULL || pp->state_name == NULL) return (cp->protocol == IPPROTO_IP) ? "NONE" : "ERR!"; return pp->state_name(state); } static void ip_vs_tcpudp_debug_packet_v4(struct ip_vs_protocol *pp, const struct sk_buff *skb, int offset, const char *msg) { char buf[128]; struct iphdr _iph, *ih; ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph); if (ih == NULL) sprintf(buf, "TRUNCATED"); else if (ih->frag_off & htons(IP_OFFSET)) sprintf(buf, "%pI4->%pI4 frag", &ih->saddr, &ih->daddr); else { __be16 _ports[2], *pptr; pptr = skb_header_pointer(skb, offset + ih->ihl*4, sizeof(_ports), _ports); if (pptr == NULL) sprintf(buf, "TRUNCATED %pI4->%pI4", &ih->saddr, &ih->daddr); else sprintf(buf, "%pI4:%u->%pI4:%u", &ih->saddr, ntohs(pptr[0]), &ih->daddr, ntohs(pptr[1])); } pr_debug("%s: %s %s\n", msg, pp->name, buf); } #ifdef CONFIG_IP_VS_IPV6 static void ip_vs_tcpudp_debug_packet_v6(struct ip_vs_protocol *pp, const struct sk_buff *skb, int offset, const char *msg) { char buf[192]; struct ipv6hdr _iph, *ih; ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph); if (ih == NULL) sprintf(buf, "TRUNCATED"); else if (ih->nexthdr == IPPROTO_FRAGMENT) sprintf(buf, "%pI6c->%pI6c frag", &ih->saddr, &ih->daddr); else { __be16 _ports[2], *pptr; pptr = skb_header_pointer(skb, offset + sizeof(struct ipv6hdr), sizeof(_ports), _ports); if (pptr == NULL) sprintf(buf, "TRUNCATED %pI6c->%pI6c", &ih->saddr, &ih->daddr); else sprintf(buf, "%pI6c:%u->%pI6c:%u", &ih->saddr, ntohs(pptr[0]), &ih->daddr, ntohs(pptr[1])); } pr_debug("%s: %s %s\n", msg, pp->name, buf); } #endif void ip_vs_tcpudp_debug_packet(int af, struct ip_vs_protocol *pp, const struct sk_buff *skb, int offset, const char *msg) { #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) ip_vs_tcpudp_debug_packet_v6(pp, skb, offset, msg); else #endif ip_vs_tcpudp_debug_packet_v4(pp, skb, offset, msg); } /* * per network name-space init */ int __net_init ip_vs_protocol_net_init(struct netns_ipvs *ipvs) { int i, ret; static struct ip_vs_protocol *protos[] = { #ifdef CONFIG_IP_VS_PROTO_TCP &ip_vs_protocol_tcp, #endif #ifdef CONFIG_IP_VS_PROTO_UDP &ip_vs_protocol_udp, #endif #ifdef CONFIG_IP_VS_PROTO_SCTP &ip_vs_protocol_sctp, #endif #ifdef CONFIG_IP_VS_PROTO_AH &ip_vs_protocol_ah, #endif #ifdef CONFIG_IP_VS_PROTO_ESP &ip_vs_protocol_esp, #endif }; for (i = 0; i < ARRAY_SIZE(protos); i++) { ret = register_ip_vs_proto_netns(ipvs, protos[i]); if (ret < 0) goto cleanup; } return 0; cleanup: ip_vs_protocol_net_cleanup(ipvs); return ret; } void __net_exit ip_vs_protocol_net_cleanup(struct netns_ipvs *ipvs) { struct ip_vs_proto_data *pd; int i; /* unregister all the ipvs proto data for this netns */ for (i = 0; i < IP_VS_PROTO_TAB_SIZE; i++) { while ((pd = ipvs->proto_data_table[i]) != NULL) unregister_ip_vs_proto_netns(ipvs, pd); } } int __init ip_vs_protocol_init(void) { char protocols[64] = { 0 }; #define REGISTER_PROTOCOL(p) \ do { \ register_ip_vs_protocol(p); \ strcat(protocols, ", "); \ strcat(protocols, (p)->name); \ } while (0) #ifdef CONFIG_IP_VS_PROTO_TCP REGISTER_PROTOCOL(&ip_vs_protocol_tcp); #endif #ifdef CONFIG_IP_VS_PROTO_UDP REGISTER_PROTOCOL(&ip_vs_protocol_udp); #endif #ifdef CONFIG_IP_VS_PROTO_SCTP REGISTER_PROTOCOL(&ip_vs_protocol_sctp); #endif #ifdef CONFIG_IP_VS_PROTO_AH REGISTER_PROTOCOL(&ip_vs_protocol_ah); #endif #ifdef CONFIG_IP_VS_PROTO_ESP REGISTER_PROTOCOL(&ip_vs_protocol_esp); #endif pr_info("Registered protocols (%s)\n", &protocols[2]); return 0; } void ip_vs_protocol_cleanup(void) { struct ip_vs_protocol *pp; int i; /* unregister all the ipvs protocols */ for (i = 0; i < IP_VS_PROTO_TAB_SIZE; i++) { while ((pp = ip_vs_proto_table[i]) != NULL) unregister_ip_vs_protocol(pp); } }
46 46 23 25 2 23 17 17 17 14 5 14 14 17 17 14 14 1 14 1 14 14 6 6 6 13 12 34 34 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 // SPDX-License-Identifier: GPL-2.0-only /* * net/dccp/ackvec.c * * An implementation of Ack Vectors for the DCCP protocol * Copyright (c) 2007 University of Aberdeen, Scotland, UK * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@ghostprotocols.net> */ #include "dccp.h" #include <linux/kernel.h> #include <linux/slab.h> #include <linux/export.h> static struct kmem_cache *dccp_ackvec_slab; static struct kmem_cache *dccp_ackvec_record_slab; struct dccp_ackvec *dccp_ackvec_alloc(const gfp_t priority) { struct dccp_ackvec *av = kmem_cache_zalloc(dccp_ackvec_slab, priority); if (av != NULL) { av->av_buf_head = av->av_buf_tail = DCCPAV_MAX_ACKVEC_LEN - 1; INIT_LIST_HEAD(&av->av_records); } return av; } static void dccp_ackvec_purge_records(struct dccp_ackvec *av) { struct dccp_ackvec_record *cur, *next; list_for_each_entry_safe(cur, next, &av->av_records, avr_node) kmem_cache_free(dccp_ackvec_record_slab, cur); INIT_LIST_HEAD(&av->av_records); } void dccp_ackvec_free(struct dccp_ackvec *av) { if (likely(av != NULL)) { dccp_ackvec_purge_records(av); kmem_cache_free(dccp_ackvec_slab, av); } } /** * dccp_ackvec_update_records - Record information about sent Ack Vectors * @av: Ack Vector records to update * @seqno: Sequence number of the packet carrying the Ack Vector just sent * @nonce_sum: The sum of all buffer nonces contained in the Ack Vector */ int dccp_ackvec_update_records(struct dccp_ackvec *av, u64 seqno, u8 nonce_sum) { struct dccp_ackvec_record *avr; avr = kmem_cache_alloc(dccp_ackvec_record_slab, GFP_ATOMIC); if (avr == NULL) return -ENOBUFS; avr->avr_ack_seqno = seqno; avr->avr_ack_ptr = av->av_buf_head; avr->avr_ack_ackno = av->av_buf_ackno; avr->avr_ack_nonce = nonce_sum; avr->avr_ack_runlen = dccp_ackvec_runlen(av->av_buf + av->av_buf_head); /* * When the buffer overflows, we keep no more than one record. This is * the simplest way of disambiguating sender-Acks dating from before the * overflow from sender-Acks which refer to after the overflow; a simple * solution is preferable here since we are handling an exception. */ if (av->av_overflow) dccp_ackvec_purge_records(av); /* * Since GSS is incremented for each packet, the list is automatically * arranged in descending order of @ack_seqno. */ list_add(&avr->avr_node, &av->av_records); dccp_pr_debug("Added Vector, ack_seqno=%llu, ack_ackno=%llu (rl=%u)\n", (unsigned long long)avr->avr_ack_seqno, (unsigned long long)avr->avr_ack_ackno, avr->avr_ack_runlen); return 0; } static struct dccp_ackvec_record *dccp_ackvec_lookup(struct list_head *av_list, const u64 ackno) { struct dccp_ackvec_record *avr; /* * Exploit that records are inserted in descending order of sequence * number, start with the oldest record first. If @ackno is `before' * the earliest ack_ackno, the packet is too old to be considered. */ list_for_each_entry_reverse(avr, av_list, avr_node) { if (avr->avr_ack_seqno == ackno) return avr; if (before48(ackno, avr->avr_ack_seqno)) break; } return NULL; } /* * Buffer index and length computation using modulo-buffersize arithmetic. * Note that, as pointers move from right to left, head is `before' tail. */ static inline u16 __ackvec_idx_add(const u16 a, const u16 b) { return (a + b) % DCCPAV_MAX_ACKVEC_LEN; } static inline u16 __ackvec_idx_sub(const u16 a, const u16 b) { return __ackvec_idx_add(a, DCCPAV_MAX_ACKVEC_LEN - b); } u16 dccp_ackvec_buflen(const struct dccp_ackvec *av) { if (unlikely(av->av_overflow)) return DCCPAV_MAX_ACKVEC_LEN; return __ackvec_idx_sub(av->av_buf_tail, av->av_buf_head); } /** * dccp_ackvec_update_old - Update previous state as per RFC 4340, 11.4.1 * @av: non-empty buffer to update * @distance: negative or zero distance of @seqno from buf_ackno downward * @seqno: the (old) sequence number whose record is to be updated * @state: state in which packet carrying @seqno was received */ static void dccp_ackvec_update_old(struct dccp_ackvec *av, s64 distance, u64 seqno, enum dccp_ackvec_states state) { u16 ptr = av->av_buf_head; BUG_ON(distance > 0); if (unlikely(dccp_ackvec_is_empty(av))) return; do { u8 runlen = dccp_ackvec_runlen(av->av_buf + ptr); if (distance + runlen >= 0) { /* * Only update the state if packet has not been received * yet. This is OK as per the second table in RFC 4340, * 11.4.1; i.e. here we are using the following table: * RECEIVED * 0 1 3 * S +---+---+---+ * T 0 | 0 | 0 | 0 | * O +---+---+---+ * R 1 | 1 | 1 | 1 | * E +---+---+---+ * D 3 | 0 | 1 | 3 | * +---+---+---+ * The "Not Received" state was set by reserve_seats(). */ if (av->av_buf[ptr] == DCCPAV_NOT_RECEIVED) av->av_buf[ptr] = state; else dccp_pr_debug("Not changing %llu state to %u\n", (unsigned long long)seqno, state); break; } distance += runlen + 1; ptr = __ackvec_idx_add(ptr, 1); } while (ptr != av->av_buf_tail); } /* Mark @num entries after buf_head as "Not yet received". */ static void dccp_ackvec_reserve_seats(struct dccp_ackvec *av, u16 num) { u16 start = __ackvec_idx_add(av->av_buf_head, 1), len = DCCPAV_MAX_ACKVEC_LEN - start; /* check for buffer wrap-around */ if (num > len) { memset(av->av_buf + start, DCCPAV_NOT_RECEIVED, len); start = 0; num -= len; } if (num) memset(av->av_buf + start, DCCPAV_NOT_RECEIVED, num); } /** * dccp_ackvec_add_new - Record one or more new entries in Ack Vector buffer * @av: container of buffer to update (can be empty or non-empty) * @num_packets: number of packets to register (must be >= 1) * @seqno: sequence number of the first packet in @num_packets * @state: state in which packet carrying @seqno was received */ static void dccp_ackvec_add_new(struct dccp_ackvec *av, u32 num_packets, u64 seqno, enum dccp_ackvec_states state) { u32 num_cells = num_packets; if (num_packets > DCCPAV_BURST_THRESH) { u32 lost_packets = num_packets - 1; DCCP_WARN("Warning: large burst loss (%u)\n", lost_packets); /* * We received 1 packet and have a loss of size "num_packets-1" * which we squeeze into num_cells-1 rather than reserving an * entire byte for each lost packet. * The reason is that the vector grows in O(burst_length); when * it grows too large there will no room left for the payload. * This is a trade-off: if a few packets out of the burst show * up later, their state will not be changed; it is simply too * costly to reshuffle/reallocate/copy the buffer each time. * Should such problems persist, we will need to switch to a * different underlying data structure. */ for (num_packets = num_cells = 1; lost_packets; ++num_cells) { u8 len = min_t(u32, lost_packets, DCCPAV_MAX_RUNLEN); av->av_buf_head = __ackvec_idx_sub(av->av_buf_head, 1); av->av_buf[av->av_buf_head] = DCCPAV_NOT_RECEIVED | len; lost_packets -= len; } } if (num_cells + dccp_ackvec_buflen(av) >= DCCPAV_MAX_ACKVEC_LEN) { DCCP_CRIT("Ack Vector buffer overflow: dropping old entries"); av->av_overflow = true; } av->av_buf_head = __ackvec_idx_sub(av->av_buf_head, num_packets); if (av->av_overflow) av->av_buf_tail = av->av_buf_head; av->av_buf[av->av_buf_head] = state; av->av_buf_ackno = seqno; if (num_packets > 1) dccp_ackvec_reserve_seats(av, num_packets - 1); } /** * dccp_ackvec_input - Register incoming packet in the buffer * @av: Ack Vector to register packet to * @skb: Packet to register */ void dccp_ackvec_input(struct dccp_ackvec *av, struct sk_buff *skb) { u64 seqno = DCCP_SKB_CB(skb)->dccpd_seq; enum dccp_ackvec_states state = DCCPAV_RECEIVED; if (dccp_ackvec_is_empty(av)) { dccp_ackvec_add_new(av, 1, seqno, state); av->av_tail_ackno = seqno; } else { s64 num_packets = dccp_delta_seqno(av->av_buf_ackno, seqno); u8 *current_head = av->av_buf + av->av_buf_head; if (num_packets == 1 && dccp_ackvec_state(current_head) == state && dccp_ackvec_runlen(current_head) < DCCPAV_MAX_RUNLEN) { *current_head += 1; av->av_buf_ackno = seqno; } else if (num_packets > 0) { dccp_ackvec_add_new(av, num_packets, seqno, state); } else { dccp_ackvec_update_old(av, num_packets, seqno, state); } } } /** * dccp_ackvec_clear_state - Perform house-keeping / garbage-collection * @av: Ack Vector record to clean * @ackno: last Ack Vector which has been acknowledged * * This routine is called when the peer acknowledges the receipt of Ack Vectors * up to and including @ackno. While based on section A.3 of RFC 4340, here * are additional precautions to prevent corrupted buffer state. In particular, * we use tail_ackno to identify outdated records; it always marks the earliest * packet of group (2) in 11.4.2. */ void dccp_ackvec_clear_state(struct dccp_ackvec *av, const u64 ackno) { struct dccp_ackvec_record *avr, *next; u8 runlen_now, eff_runlen; s64 delta; avr = dccp_ackvec_lookup(&av->av_records, ackno); if (avr == NULL) return; /* * Deal with outdated acknowledgments: this arises when e.g. there are * several old records and the acks from the peer come in slowly. In * that case we may still have records that pre-date tail_ackno. */ delta = dccp_delta_seqno(av->av_tail_ackno, avr->avr_ack_ackno); if (delta < 0) goto free_records; /* * Deal with overlapping Ack Vectors: don't subtract more than the * number of packets between tail_ackno and ack_ackno. */ eff_runlen = delta < avr->avr_ack_runlen ? delta : avr->avr_ack_runlen; runlen_now = dccp_ackvec_runlen(av->av_buf + avr->avr_ack_ptr); /* * The run length of Ack Vector cells does not decrease over time. If * the run length is the same as at the time the Ack Vector was sent, we * free the ack_ptr cell. That cell can however not be freed if the run * length has increased: in this case we need to move the tail pointer * backwards (towards higher indices), to its next-oldest neighbour. */ if (runlen_now > eff_runlen) { av->av_buf[avr->avr_ack_ptr] -= eff_runlen + 1; av->av_buf_tail = __ackvec_idx_add(avr->avr_ack_ptr, 1); /* This move may not have cleared the overflow flag. */ if (av->av_overflow) av->av_overflow = (av->av_buf_head == av->av_buf_tail); } else { av->av_buf_tail = avr->avr_ack_ptr; /* * We have made sure that avr points to a valid cell within the * buffer. This cell is either older than head, or equals head * (empty buffer): in both cases we no longer have any overflow. */ av->av_overflow = 0; } /* * The peer has acknowledged up to and including ack_ackno. Hence the * first packet in group (2) of 11.4.2 is the successor of ack_ackno. */ av->av_tail_ackno = ADD48(avr->avr_ack_ackno, 1); free_records: list_for_each_entry_safe_from(avr, next, &av->av_records, avr_node) { list_del(&avr->avr_node); kmem_cache_free(dccp_ackvec_record_slab, avr); } } /* * Routines to keep track of Ack Vectors received in an skb */ int dccp_ackvec_parsed_add(struct list_head *head, u8 *vec, u8 len, u8 nonce) { struct dccp_ackvec_parsed *new = kmalloc(sizeof(*new), GFP_ATOMIC); if (new == NULL) return -ENOBUFS; new->vec = vec; new->len = len; new->nonce = nonce; list_add_tail(&new->node, head); return 0; } EXPORT_SYMBOL_GPL(dccp_ackvec_parsed_add); void dccp_ackvec_parsed_cleanup(struct list_head *parsed_chunks) { struct dccp_ackvec_parsed *cur, *next; list_for_each_entry_safe(cur, next, parsed_chunks, node) kfree(cur); INIT_LIST_HEAD(parsed_chunks); } EXPORT_SYMBOL_GPL(dccp_ackvec_parsed_cleanup); int __init dccp_ackvec_init(void) { dccp_ackvec_slab = KMEM_CACHE(dccp_ackvec, SLAB_HWCACHE_ALIGN); if (dccp_ackvec_slab == NULL) goto out_err; dccp_ackvec_record_slab = KMEM_CACHE(dccp_ackvec_record, SLAB_HWCACHE_ALIGN); if (dccp_ackvec_record_slab == NULL) goto out_destroy_slab; return 0; out_destroy_slab: kmem_cache_destroy(dccp_ackvec_slab); dccp_ackvec_slab = NULL; out_err: DCCP_CRIT("Unable to create Ack Vector slab cache"); return -ENOBUFS; } void dccp_ackvec_exit(void) { kmem_cache_destroy(dccp_ackvec_slab); dccp_ackvec_slab = NULL; kmem_cache_destroy(dccp_ackvec_record_slab); dccp_ackvec_record_slab = NULL; }
103 103 103 103 103 103 103 103 103 103 103 103 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 /* mpiutil.ac - Utility functions for MPI * Copyright (C) 1998, 1999 Free Software Foundation, Inc. * * This file is part of GnuPG. * * GnuPG is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * GnuPG is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA */ #include "mpi-internal.h" /**************** * Note: It was a bad idea to use the number of limbs to allocate * because on a alpha the limbs are large but we normally need * integers of n bits - So we should change this to bits (or bytes). * * But mpi_alloc is used in a lot of places :-) */ MPI mpi_alloc(unsigned nlimbs) { MPI a; a = kmalloc(sizeof *a, GFP_KERNEL); if (!a) return a; if (nlimbs) { a->d = mpi_alloc_limb_space(nlimbs); if (!a->d) { kfree(a); return NULL; } } else { a->d = NULL; } a->alloced = nlimbs; a->nlimbs = 0; a->sign = 0; a->flags = 0; a->nbits = 0; return a; } EXPORT_SYMBOL_GPL(mpi_alloc); mpi_ptr_t mpi_alloc_limb_space(unsigned nlimbs) { size_t len = nlimbs * sizeof(mpi_limb_t); if (!len) return NULL; return kmalloc(len, GFP_KERNEL); } void mpi_free_limb_space(mpi_ptr_t a) { if (!a) return; kfree_sensitive(a); } void mpi_assign_limb_space(MPI a, mpi_ptr_t ap, unsigned nlimbs) { mpi_free_limb_space(a->d); a->d = ap; a->alloced = nlimbs; } /**************** * Resize the array of A to NLIMBS. the additional space is cleared * (set to 0) [done by m_realloc()] */ int mpi_resize(MPI a, unsigned nlimbs) { void *p; if (nlimbs <= a->alloced) return 0; /* no need to do it */ if (a->d) { p = kcalloc(nlimbs, sizeof(mpi_limb_t), GFP_KERNEL); if (!p) return -ENOMEM; memcpy(p, a->d, a->alloced * sizeof(mpi_limb_t)); kfree_sensitive(a->d); a->d = p; } else { a->d = kcalloc(nlimbs, sizeof(mpi_limb_t), GFP_KERNEL); if (!a->d) return -ENOMEM; } a->alloced = nlimbs; return 0; } void mpi_free(MPI a) { if (!a) return; if (a->flags & 4) kfree_sensitive(a->d); else mpi_free_limb_space(a->d); if (a->flags & ~7) pr_info("invalid flag value in mpi\n"); kfree(a); } EXPORT_SYMBOL_GPL(mpi_free); /**************** * Note: This copy function should not interpret the MPI * but copy it transparently. */ MPI mpi_copy(MPI a) { int i; MPI b; if (a) { b = mpi_alloc(a->nlimbs); if (!b) return NULL; b->nlimbs = a->nlimbs; b->sign = a->sign; b->flags = a->flags; b->flags &= ~(16|32); /* Reset the immutable and constant flags. */ for (i = 0; i < b->nlimbs; i++) b->d[i] = a->d[i]; } else b = NULL; return b; } MODULE_DESCRIPTION("Multiprecision maths library"); MODULE_LICENSE("GPL");
103 2259 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_NETLINK_H #define __LINUX_NETLINK_H #include <linux/capability.h> #include <linux/skbuff.h> #include <linux/export.h> #include <net/scm.h> #include <uapi/linux/netlink.h> struct net; void do_trace_netlink_extack(const char *msg); static inline struct nlmsghdr *nlmsg_hdr(const struct sk_buff *skb) { return (struct nlmsghdr *)skb->data; } enum netlink_skb_flags { NETLINK_SKB_DST = 0x8, /* Dst set in sendto or sendmsg */ }; struct netlink_skb_parms { struct scm_creds creds; /* Skb credentials */ __u32 portid; __u32 dst_group; __u32 flags; struct sock *sk; bool nsid_is_set; int nsid; }; #define NETLINK_CB(skb) (*(struct netlink_skb_parms*)&((skb)->cb)) #define NETLINK_CREDS(skb) (&NETLINK_CB((skb)).creds) #define NETLINK_CTX_SIZE 48 void netlink_table_grab(void); void netlink_table_ungrab(void); #define NL_CFG_F_NONROOT_RECV (1 << 0) #define NL_CFG_F_NONROOT_SEND (1 << 1) /* optional Netlink kernel configuration parameters */ struct netlink_kernel_cfg { unsigned int groups; unsigned int flags; void (*input)(struct sk_buff *skb); int (*bind)(struct net *net, int group); void (*unbind)(struct net *net, int group); void (*release) (struct sock *sk, unsigned long *groups); }; struct sock *__netlink_kernel_create(struct net *net, int unit, struct module *module, struct netlink_kernel_cfg *cfg); static inline struct sock * netlink_kernel_create(struct net *net, int unit, struct netlink_kernel_cfg *cfg) { return __netlink_kernel_create(net, unit, THIS_MODULE, cfg); } /* this can be increased when necessary - don't expose to userland */ #define NETLINK_MAX_COOKIE_LEN 20 #define NETLINK_MAX_FMTMSG_LEN 80 /** * struct netlink_ext_ack - netlink extended ACK report struct * @_msg: message string to report - don't access directly, use * %NL_SET_ERR_MSG * @bad_attr: attribute with error * @policy: policy for a bad attribute * @miss_type: attribute type which was missing * @miss_nest: nest missing an attribute (%NULL if missing top level attr) * @cookie: cookie data to return to userspace (for success) * @cookie_len: actual cookie data length * @_msg_buf: output buffer for formatted message strings - don't access * directly, use %NL_SET_ERR_MSG_FMT */ struct netlink_ext_ack { const char *_msg; const struct nlattr *bad_attr; const struct nla_policy *policy; const struct nlattr *miss_nest; u16 miss_type; u8 cookie[NETLINK_MAX_COOKIE_LEN]; u8 cookie_len; char _msg_buf[NETLINK_MAX_FMTMSG_LEN]; }; /* Always use this macro, this allows later putting the * message into a separate section or such for things * like translation or listing all possible messages. * If string formatting is needed use NL_SET_ERR_MSG_FMT. */ #define NL_SET_ERR_MSG(extack, msg) do { \ static const char __msg[] = msg; \ struct netlink_ext_ack *__extack = (extack); \ \ do_trace_netlink_extack(__msg); \ \ if (__extack) \ __extack->_msg = __msg; \ } while (0) /* We splice fmt with %s at each end even in the snprintf so that both calls * can use the same string constant, avoiding its duplication in .ro */ #define NL_SET_ERR_MSG_FMT(extack, fmt, args...) do { \ struct netlink_ext_ack *__extack = (extack); \ \ if (!__extack) \ break; \ if (snprintf(__extack->_msg_buf, NETLINK_MAX_FMTMSG_LEN, \ "%s" fmt "%s", "", ##args, "") >= \ NETLINK_MAX_FMTMSG_LEN) \ net_warn_ratelimited("%s" fmt "%s", "truncated extack: ", \ ##args, "\n"); \ \ do_trace_netlink_extack(__extack->_msg_buf); \ \ __extack->_msg = __extack->_msg_buf; \ } while (0) #define NL_SET_ERR_MSG_MOD(extack, msg) \ NL_SET_ERR_MSG((extack), KBUILD_MODNAME ": " msg) #define NL_SET_ERR_MSG_FMT_MOD(extack, fmt, args...) \ NL_SET_ERR_MSG_FMT((extack), KBUILD_MODNAME ": " fmt, ##args) #define NL_SET_ERR_MSG_WEAK(extack, msg) do { \ if ((extack) && !(extack)->_msg) \ NL_SET_ERR_MSG((extack), msg); \ } while (0) #define NL_SET_ERR_MSG_WEAK_MOD(extack, msg) do { \ if ((extack) && !(extack)->_msg) \ NL_SET_ERR_MSG_MOD((extack), msg); \ } while (0) #define NL_SET_BAD_ATTR_POLICY(extack, attr, pol) do { \ if ((extack)) { \ (extack)->bad_attr = (attr); \ (extack)->policy = (pol); \ } \ } while (0) #define NL_SET_BAD_ATTR(extack, attr) NL_SET_BAD_ATTR_POLICY(extack, attr, NULL) #define NL_SET_ERR_MSG_ATTR_POL(extack, attr, pol, msg) do { \ static const char __msg[] = msg; \ struct netlink_ext_ack *__extack = (extack); \ \ do_trace_netlink_extack(__msg); \ \ if (__extack) { \ __extack->_msg = __msg; \ __extack->bad_attr = (attr); \ __extack->policy = (pol); \ } \ } while (0) #define NL_SET_ERR_MSG_ATTR_POL_FMT(extack, attr, pol, fmt, args...) do { \ struct netlink_ext_ack *__extack = (extack); \ \ if (!__extack) \ break; \ \ if (snprintf(__extack->_msg_buf, NETLINK_MAX_FMTMSG_LEN, \ "%s" fmt "%s", "", ##args, "") >= \ NETLINK_MAX_FMTMSG_LEN) \ net_warn_ratelimited("%s" fmt "%s", "truncated extack: ", \ ##args, "\n"); \ \ do_trace_netlink_extack(__extack->_msg_buf); \ \ __extack->_msg = __extack->_msg_buf; \ __extack->bad_attr = (attr); \ __extack->policy = (pol); \ } while (0) #define NL_SET_ERR_MSG_ATTR(extack, attr, msg) \ NL_SET_ERR_MSG_ATTR_POL(extack, attr, NULL, msg) #define NL_SET_ERR_MSG_ATTR_FMT(extack, attr, msg, args...) \ NL_SET_ERR_MSG_ATTR_POL_FMT(extack, attr, NULL, msg, ##args) #define NL_SET_ERR_ATTR_MISS(extack, nest, type) do { \ struct netlink_ext_ack *__extack = (extack); \ \ if (__extack) { \ __extack->miss_nest = (nest); \ __extack->miss_type = (type); \ } \ } while (0) #define NL_REQ_ATTR_CHECK(extack, nest, tb, type) ({ \ struct nlattr **__tb = (tb); \ u32 __attr = (type); \ int __retval; \ \ __retval = !__tb[__attr]; \ if (__retval) \ NL_SET_ERR_ATTR_MISS((extack), (nest), __attr); \ __retval; \ }) static inline void nl_set_extack_cookie_u64(struct netlink_ext_ack *extack, u64 cookie) { if (!extack) return; memcpy(extack->cookie, &cookie, sizeof(cookie)); extack->cookie_len = sizeof(cookie); } void netlink_kernel_release(struct sock *sk); int __netlink_change_ngroups(struct sock *sk, unsigned int groups); int netlink_change_ngroups(struct sock *sk, unsigned int groups); void __netlink_clear_multicast_users(struct sock *sk, unsigned int group); void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err, const struct netlink_ext_ack *extack); int netlink_has_listeners(struct sock *sk, unsigned int group); bool netlink_strict_get_check(struct sk_buff *skb); int netlink_unicast(struct sock *ssk, struct sk_buff *skb, __u32 portid, int nonblock); int netlink_broadcast(struct sock *ssk, struct sk_buff *skb, __u32 portid, __u32 group, gfp_t allocation); typedef int (*netlink_filter_fn)(struct sock *dsk, struct sk_buff *skb, void *data); int netlink_broadcast_filtered(struct sock *ssk, struct sk_buff *skb, __u32 portid, __u32 group, gfp_t allocation, netlink_filter_fn filter, void *filter_data); int netlink_set_err(struct sock *ssk, __u32 portid, __u32 group, int code); int netlink_register_notifier(struct notifier_block *nb); int netlink_unregister_notifier(struct notifier_block *nb); /* finegrained unicast helpers: */ struct sock *netlink_getsockbyfd(int fd); int netlink_attachskb(struct sock *sk, struct sk_buff *skb, long *timeo, struct sock *ssk); void netlink_detachskb(struct sock *sk, struct sk_buff *skb); int netlink_sendskb(struct sock *sk, struct sk_buff *skb); static inline struct sk_buff * netlink_skb_clone(struct sk_buff *skb, gfp_t gfp_mask) { struct sk_buff *nskb; nskb = skb_clone(skb, gfp_mask); if (!nskb) return NULL; /* This is a large skb, set destructor callback to release head */ if (is_vmalloc_addr(skb->head)) nskb->destructor = skb->destructor; return nskb; } /* * skb should fit one page. This choice is good for headerless malloc. * But we should limit to 8K so that userspace does not have to * use enormous buffer sizes on recvmsg() calls just to avoid * MSG_TRUNC when PAGE_SIZE is very large. */ #if PAGE_SIZE < 8192UL #define NLMSG_GOODSIZE SKB_WITH_OVERHEAD(PAGE_SIZE) #else #define NLMSG_GOODSIZE SKB_WITH_OVERHEAD(8192UL) #endif #define NLMSG_DEFAULT_SIZE (NLMSG_GOODSIZE - NLMSG_HDRLEN) struct netlink_callback { struct sk_buff *skb; const struct nlmsghdr *nlh; int (*dump)(struct sk_buff * skb, struct netlink_callback *cb); int (*done)(struct netlink_callback *cb); void *data; /* the module that dump function belong to */ struct module *module; struct netlink_ext_ack *extack; u16 family; u16 answer_flags; u32 min_dump_alloc; unsigned int prev_seq, seq; int flags; bool strict_check; union { u8 ctx[NETLINK_CTX_SIZE]; /* args is deprecated. Cast a struct over ctx instead * for proper type safety. */ long args[6]; }; }; #define NL_ASSERT_CTX_FITS(type_name) \ BUILD_BUG_ON(sizeof(type_name) > \ sizeof_field(struct netlink_callback, ctx)) struct netlink_notify { struct net *net; u32 portid; int protocol; }; struct nlmsghdr * __nlmsg_put(struct sk_buff *skb, u32 portid, u32 seq, int type, int len, int flags); struct netlink_dump_control { int (*start)(struct netlink_callback *); int (*dump)(struct sk_buff *skb, struct netlink_callback *); int (*done)(struct netlink_callback *); struct netlink_ext_ack *extack; void *data; struct module *module; u32 min_dump_alloc; int flags; }; int __netlink_dump_start(struct sock *ssk, struct sk_buff *skb, const struct nlmsghdr *nlh, struct netlink_dump_control *control); static inline int netlink_dump_start(struct sock *ssk, struct sk_buff *skb, const struct nlmsghdr *nlh, struct netlink_dump_control *control) { if (!control->module) control->module = THIS_MODULE; return __netlink_dump_start(ssk, skb, nlh, control); } struct netlink_tap { struct net_device *dev; struct module *module; struct list_head list; }; int netlink_add_tap(struct netlink_tap *nt); int netlink_remove_tap(struct netlink_tap *nt); bool __netlink_ns_capable(const struct netlink_skb_parms *nsp, struct user_namespace *ns, int cap); bool netlink_ns_capable(const struct sk_buff *skb, struct user_namespace *ns, int cap); bool netlink_capable(const struct sk_buff *skb, int cap); bool netlink_net_capable(const struct sk_buff *skb, int cap); struct sk_buff *netlink_alloc_large_skb(unsigned int size, int broadcast); #endif /* __LINUX_NETLINK_H */
30830 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_JUMP_LABEL_H #define _LINUX_JUMP_LABEL_H /* * Jump label support * * Copyright (C) 2009-2012 Jason Baron <jbaron@redhat.com> * Copyright (C) 2011-2012 Red Hat, Inc., Peter Zijlstra * * DEPRECATED API: * * The use of 'struct static_key' directly, is now DEPRECATED. In addition * static_key_{true,false}() is also DEPRECATED. IE DO NOT use the following: * * struct static_key false = STATIC_KEY_INIT_FALSE; * struct static_key true = STATIC_KEY_INIT_TRUE; * static_key_true() * static_key_false() * * The updated API replacements are: * * DEFINE_STATIC_KEY_TRUE(key); * DEFINE_STATIC_KEY_FALSE(key); * DEFINE_STATIC_KEY_ARRAY_TRUE(keys, count); * DEFINE_STATIC_KEY_ARRAY_FALSE(keys, count); * static_branch_likely() * static_branch_unlikely() * * Jump labels provide an interface to generate dynamic branches using * self-modifying code. Assuming toolchain and architecture support, if we * define a "key" that is initially false via "DEFINE_STATIC_KEY_FALSE(key)", * an "if (static_branch_unlikely(&key))" statement is an unconditional branch * (which defaults to false - and the true block is placed out of line). * Similarly, we can define an initially true key via * "DEFINE_STATIC_KEY_TRUE(key)", and use it in the same * "if (static_branch_unlikely(&key))", in which case we will generate an * unconditional branch to the out-of-line true branch. Keys that are * initially true or false can be using in both static_branch_unlikely() * and static_branch_likely() statements. * * At runtime we can change the branch target by setting the key * to true via a call to static_branch_enable(), or false using * static_branch_disable(). If the direction of the branch is switched by * these calls then we run-time modify the branch target via a * no-op -> jump or jump -> no-op conversion. For example, for an * initially false key that is used in an "if (static_branch_unlikely(&key))" * statement, setting the key to true requires us to patch in a jump * to the out-of-line of true branch. * * In addition to static_branch_{enable,disable}, we can also reference count * the key or branch direction via static_branch_{inc,dec}. Thus, * static_branch_inc() can be thought of as a 'make more true' and * static_branch_dec() as a 'make more false'. * * Since this relies on modifying code, the branch modifying functions * must be considered absolute slow paths (machine wide synchronization etc.). * OTOH, since the affected branches are unconditional, their runtime overhead * will be absolutely minimal, esp. in the default (off) case where the total * effect is a single NOP of appropriate size. The on case will patch in a jump * to the out-of-line block. * * When the control is directly exposed to userspace, it is prudent to delay the * decrement to avoid high frequency code modifications which can (and do) * cause significant performance degradation. Struct static_key_deferred and * static_key_slow_dec_deferred() provide for this. * * Lacking toolchain and or architecture support, static keys fall back to a * simple conditional branch. * * Additional babbling in: Documentation/staging/static-keys.rst */ #ifndef __ASSEMBLY__ #include <linux/types.h> #include <linux/compiler.h> #include <linux/cleanup.h> extern bool static_key_initialized; #define STATIC_KEY_CHECK_USE(key) WARN(!static_key_initialized, \ "%s(): static key '%pS' used before call to jump_label_init()", \ __func__, (key)) struct static_key { atomic_t enabled; #ifdef CONFIG_JUMP_LABEL /* * Note: * To make anonymous unions work with old compilers, the static * initialization of them requires brackets. This creates a dependency * on the order of the struct with the initializers. If any fields * are added, STATIC_KEY_INIT_TRUE and STATIC_KEY_INIT_FALSE may need * to be modified. * * bit 0 => 1 if key is initially true * 0 if initially false * bit 1 => 1 if points to struct static_key_mod * 0 if points to struct jump_entry */ union { unsigned long type; struct jump_entry *entries; struct static_key_mod *next; }; #endif /* CONFIG_JUMP_LABEL */ }; #endif /* __ASSEMBLY__ */ #ifdef CONFIG_JUMP_LABEL #include <asm/jump_label.h> #ifndef __ASSEMBLY__ #ifdef CONFIG_HAVE_ARCH_JUMP_LABEL_RELATIVE struct jump_entry { s32 code; s32 target; long key; // key may be far away from the core kernel under KASLR }; static inline unsigned long jump_entry_code(const struct jump_entry *entry) { return (unsigned long)&entry->code + entry->code; } static inline unsigned long jump_entry_target(const struct jump_entry *entry) { return (unsigned long)&entry->target + entry->target; } static inline struct static_key *jump_entry_key(const struct jump_entry *entry) { long offset = entry->key & ~3L; return (struct static_key *)((unsigned long)&entry->key + offset); } #else static inline unsigned long jump_entry_code(const struct jump_entry *entry) { return entry->code; } static inline unsigned long jump_entry_target(const struct jump_entry *entry) { return entry->target; } static inline struct static_key *jump_entry_key(const struct jump_entry *entry) { return (struct static_key *)((unsigned long)entry->key & ~3UL); } #endif static inline bool jump_entry_is_branch(const struct jump_entry *entry) { return (unsigned long)entry->key & 1UL; } static inline bool jump_entry_is_init(const struct jump_entry *entry) { return (unsigned long)entry->key & 2UL; } static inline void jump_entry_set_init(struct jump_entry *entry, bool set) { if (set) entry->key |= 2; else entry->key &= ~2; } static inline int jump_entry_size(struct jump_entry *entry) { #ifdef JUMP_LABEL_NOP_SIZE return JUMP_LABEL_NOP_SIZE; #else return arch_jump_entry_size(entry); #endif } #endif #endif #ifndef __ASSEMBLY__ enum jump_label_type { JUMP_LABEL_NOP = 0, JUMP_LABEL_JMP, }; struct module; #ifdef CONFIG_JUMP_LABEL #define JUMP_TYPE_FALSE 0UL #define JUMP_TYPE_TRUE 1UL #define JUMP_TYPE_LINKED 2UL #define JUMP_TYPE_MASK 3UL static __always_inline bool static_key_false(struct static_key *key) { return arch_static_branch(key, false); } static __always_inline bool static_key_true(struct static_key *key) { return !arch_static_branch(key, true); } extern struct jump_entry __start___jump_table[]; extern struct jump_entry __stop___jump_table[]; extern void jump_label_init(void); extern void jump_label_init_ro(void); extern void jump_label_lock(void); extern void jump_label_unlock(void); extern void arch_jump_label_transform(struct jump_entry *entry, enum jump_label_type type); extern bool arch_jump_label_transform_queue(struct jump_entry *entry, enum jump_label_type type); extern void arch_jump_label_transform_apply(void); extern int jump_label_text_reserved(void *start, void *end); extern bool static_key_slow_inc(struct static_key *key); extern bool static_key_fast_inc_not_disabled(struct static_key *key); extern void static_key_slow_dec(struct static_key *key); extern bool static_key_slow_inc_cpuslocked(struct static_key *key); extern void static_key_slow_dec_cpuslocked(struct static_key *key); extern int static_key_count(struct static_key *key); extern void static_key_enable(struct static_key *key); extern void static_key_disable(struct static_key *key); extern void static_key_enable_cpuslocked(struct static_key *key); extern void static_key_disable_cpuslocked(struct static_key *key); extern enum jump_label_type jump_label_init_type(struct jump_entry *entry); /* * We should be using ATOMIC_INIT() for initializing .enabled, but * the inclusion of atomic.h is problematic for inclusion of jump_label.h * in 'low-level' headers. Thus, we are initializing .enabled with a * raw value, but have added a BUILD_BUG_ON() to catch any issues in * jump_label_init() see: kernel/jump_label.c. */ #define STATIC_KEY_INIT_TRUE \ { .enabled = { 1 }, \ { .type = JUMP_TYPE_TRUE } } #define STATIC_KEY_INIT_FALSE \ { .enabled = { 0 }, \ { .type = JUMP_TYPE_FALSE } } #else /* !CONFIG_JUMP_LABEL */ #include <linux/atomic.h> #include <linux/bug.h> static __always_inline int static_key_count(struct static_key *key) { return raw_atomic_read(&key->enabled); } static __always_inline void jump_label_init(void) { static_key_initialized = true; } static __always_inline void jump_label_init_ro(void) { } static __always_inline bool static_key_false(struct static_key *key) { if (unlikely_notrace(static_key_count(key) > 0)) return true; return false; } static __always_inline bool static_key_true(struct static_key *key) { if (likely_notrace(static_key_count(key) > 0)) return true; return false; } static inline bool static_key_fast_inc_not_disabled(struct static_key *key) { int v; STATIC_KEY_CHECK_USE(key); /* * Prevent key->enabled getting negative to follow the same semantics * as for CONFIG_JUMP_LABEL=y, see kernel/jump_label.c comment. */ v = atomic_read(&key->enabled); do { if (v < 0 || (v + 1) < 0) return false; } while (!likely(atomic_try_cmpxchg(&key->enabled, &v, v + 1))); return true; } #define static_key_slow_inc(key) static_key_fast_inc_not_disabled(key) static inline void static_key_slow_dec(struct static_key *key) { STATIC_KEY_CHECK_USE(key); atomic_dec(&key->enabled); } #define static_key_slow_inc_cpuslocked(key) static_key_slow_inc(key) #define static_key_slow_dec_cpuslocked(key) static_key_slow_dec(key) static inline int jump_label_text_reserved(void *start, void *end) { return 0; } static inline void jump_label_lock(void) {} static inline void jump_label_unlock(void) {} static inline void static_key_enable(struct static_key *key) { STATIC_KEY_CHECK_USE(key); if (atomic_read(&key->enabled) != 0) { WARN_ON_ONCE(atomic_read(&key->enabled) != 1); return; } atomic_set(&key->enabled, 1); } static inline void static_key_disable(struct static_key *key) { STATIC_KEY_CHECK_USE(key); if (atomic_read(&key->enabled) != 1) { WARN_ON_ONCE(atomic_read(&key->enabled) != 0); return; } atomic_set(&key->enabled, 0); } #define static_key_enable_cpuslocked(k) static_key_enable((k)) #define static_key_disable_cpuslocked(k) static_key_disable((k)) #define STATIC_KEY_INIT_TRUE { .enabled = ATOMIC_INIT(1) } #define STATIC_KEY_INIT_FALSE { .enabled = ATOMIC_INIT(0) } #endif /* CONFIG_JUMP_LABEL */ DEFINE_LOCK_GUARD_0(jump_label_lock, jump_label_lock(), jump_label_unlock()) #define STATIC_KEY_INIT STATIC_KEY_INIT_FALSE #define jump_label_enabled static_key_enabled /* -------------------------------------------------------------------------- */ /* * Two type wrappers around static_key, such that we can use compile time * type differentiation to emit the right code. * * All the below code is macros in order to play type games. */ struct static_key_true { struct static_key key; }; struct static_key_false { struct static_key key; }; #define STATIC_KEY_TRUE_INIT (struct static_key_true) { .key = STATIC_KEY_INIT_TRUE, } #define STATIC_KEY_FALSE_INIT (struct static_key_false){ .key = STATIC_KEY_INIT_FALSE, } #define DEFINE_STATIC_KEY_TRUE(name) \ struct static_key_true name = STATIC_KEY_TRUE_INIT #define DEFINE_STATIC_KEY_TRUE_RO(name) \ struct static_key_true name __ro_after_init = STATIC_KEY_TRUE_INIT #define DECLARE_STATIC_KEY_TRUE(name) \ extern struct static_key_true name #define DEFINE_STATIC_KEY_FALSE(name) \ struct static_key_false name = STATIC_KEY_FALSE_INIT #define DEFINE_STATIC_KEY_FALSE_RO(name) \ struct static_key_false name __ro_after_init = STATIC_KEY_FALSE_INIT #define DECLARE_STATIC_KEY_FALSE(name) \ extern struct static_key_false name #define DEFINE_STATIC_KEY_ARRAY_TRUE(name, count) \ struct static_key_true name[count] = { \ [0 ... (count) - 1] = STATIC_KEY_TRUE_INIT, \ } #define DEFINE_STATIC_KEY_ARRAY_FALSE(name, count) \ struct static_key_false name[count] = { \ [0 ... (count) - 1] = STATIC_KEY_FALSE_INIT, \ } #define _DEFINE_STATIC_KEY_1(name) DEFINE_STATIC_KEY_TRUE(name) #define _DEFINE_STATIC_KEY_0(name) DEFINE_STATIC_KEY_FALSE(name) #define DEFINE_STATIC_KEY_MAYBE(cfg, name) \ __PASTE(_DEFINE_STATIC_KEY_, IS_ENABLED(cfg))(name) #define _DEFINE_STATIC_KEY_RO_1(name) DEFINE_STATIC_KEY_TRUE_RO(name) #define _DEFINE_STATIC_KEY_RO_0(name) DEFINE_STATIC_KEY_FALSE_RO(name) #define DEFINE_STATIC_KEY_MAYBE_RO(cfg, name) \ __PASTE(_DEFINE_STATIC_KEY_RO_, IS_ENABLED(cfg))(name) #define _DECLARE_STATIC_KEY_1(name) DECLARE_STATIC_KEY_TRUE(name) #define _DECLARE_STATIC_KEY_0(name) DECLARE_STATIC_KEY_FALSE(name) #define DECLARE_STATIC_KEY_MAYBE(cfg, name) \ __PASTE(_DECLARE_STATIC_KEY_, IS_ENABLED(cfg))(name) extern bool ____wrong_branch_error(void); #define static_key_enabled(x) \ ({ \ if (!__builtin_types_compatible_p(typeof(*x), struct static_key) && \ !__builtin_types_compatible_p(typeof(*x), struct static_key_true) &&\ !__builtin_types_compatible_p(typeof(*x), struct static_key_false)) \ ____wrong_branch_error(); \ static_key_count((struct static_key *)x) > 0; \ }) #ifdef CONFIG_JUMP_LABEL /* * Combine the right initial value (type) with the right branch order * to generate the desired result. * * * type\branch| likely (1) | unlikely (0) * -----------+-----------------------+------------------ * | | * true (1) | ... | ... * | NOP | JMP L * | <br-stmts> | 1: ... * | L: ... | * | | * | | L: <br-stmts> * | | jmp 1b * | | * -----------+-----------------------+------------------ * | | * false (0) | ... | ... * | JMP L | NOP * | <br-stmts> | 1: ... * | L: ... | * | | * | | L: <br-stmts> * | | jmp 1b * | | * -----------+-----------------------+------------------ * * The initial value is encoded in the LSB of static_key::entries, * type: 0 = false, 1 = true. * * The branch type is encoded in the LSB of jump_entry::key, * branch: 0 = unlikely, 1 = likely. * * This gives the following logic table: * * enabled type branch instuction * -----------------------------+----------- * 0 0 0 | NOP * 0 0 1 | JMP * 0 1 0 | NOP * 0 1 1 | JMP * * 1 0 0 | JMP * 1 0 1 | NOP * 1 1 0 | JMP * 1 1 1 | NOP * * Which gives the following functions: * * dynamic: instruction = enabled ^ branch * static: instruction = type ^ branch * * See jump_label_type() / jump_label_init_type(). */ #define static_branch_likely(x) \ ({ \ bool branch; \ if (__builtin_types_compatible_p(typeof(*x), struct static_key_true)) \ branch = !arch_static_branch(&(x)->key, true); \ else if (__builtin_types_compatible_p(typeof(*x), struct static_key_false)) \ branch = !arch_static_branch_jump(&(x)->key, true); \ else \ branch = ____wrong_branch_error(); \ likely_notrace(branch); \ }) #define static_branch_unlikely(x) \ ({ \ bool branch; \ if (__builtin_types_compatible_p(typeof(*x), struct static_key_true)) \ branch = arch_static_branch_jump(&(x)->key, false); \ else if (__builtin_types_compatible_p(typeof(*x), struct static_key_false)) \ branch = arch_static_branch(&(x)->key, false); \ else \ branch = ____wrong_branch_error(); \ unlikely_notrace(branch); \ }) #else /* !CONFIG_JUMP_LABEL */ #define static_branch_likely(x) likely_notrace(static_key_enabled(&(x)->key)) #define static_branch_unlikely(x) unlikely_notrace(static_key_enabled(&(x)->key)) #endif /* CONFIG_JUMP_LABEL */ #define static_branch_maybe(config, x) \ (IS_ENABLED(config) ? static_branch_likely(x) \ : static_branch_unlikely(x)) /* * Advanced usage; refcount, branch is enabled when: count != 0 */ #define static_branch_inc(x) static_key_slow_inc(&(x)->key) #define static_branch_dec(x) static_key_slow_dec(&(x)->key) #define static_branch_inc_cpuslocked(x) static_key_slow_inc_cpuslocked(&(x)->key) #define static_branch_dec_cpuslocked(x) static_key_slow_dec_cpuslocked(&(x)->key) /* * Normal usage; boolean enable/disable. */ #define static_branch_enable(x) static_key_enable(&(x)->key) #define static_branch_disable(x) static_key_disable(&(x)->key) #define static_branch_enable_cpuslocked(x) static_key_enable_cpuslocked(&(x)->key) #define static_branch_disable_cpuslocked(x) static_key_disable_cpuslocked(&(x)->key) #endif /* __ASSEMBLY__ */ #endif /* _LINUX_JUMP_LABEL_H */
2250 147 2125 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright(c) 2017 Intel Corporation. All rights reserved. * * This code is based in part on work published here: * * https://github.com/IAIK/KAISER * * The original work was written by and signed off by for the Linux * kernel by: * * Signed-off-by: Richard Fellner <richard.fellner@student.tugraz.at> * Signed-off-by: Moritz Lipp <moritz.lipp@iaik.tugraz.at> * Signed-off-by: Daniel Gruss <daniel.gruss@iaik.tugraz.at> * Signed-off-by: Michael Schwarz <michael.schwarz@iaik.tugraz.at> * * Major changes to the original code by: Dave Hansen <dave.hansen@intel.com> * Mostly rewritten by Thomas Gleixner <tglx@linutronix.de> and * Andy Lutomirsky <luto@amacapital.net> */ #include <linux/kernel.h> #include <linux/errno.h> #include <linux/string.h> #include <linux/types.h> #include <linux/bug.h> #include <linux/init.h> #include <linux/spinlock.h> #include <linux/mm.h> #include <linux/uaccess.h> #include <linux/cpu.h> #include <asm/cpufeature.h> #include <asm/hypervisor.h> #include <asm/vsyscall.h> #include <asm/cmdline.h> #include <asm/pti.h> #include <asm/tlbflush.h> #include <asm/desc.h> #include <asm/sections.h> #include <asm/set_memory.h> #undef pr_fmt #define pr_fmt(fmt) "Kernel/User page tables isolation: " fmt /* Backporting helper */ #ifndef __GFP_NOTRACK #define __GFP_NOTRACK 0 #endif /* * Define the page-table levels we clone for user-space on 32 * and 64 bit. */ #ifdef CONFIG_X86_64 #define PTI_LEVEL_KERNEL_IMAGE PTI_CLONE_PMD #else #define PTI_LEVEL_KERNEL_IMAGE PTI_CLONE_PTE #endif static void __init pti_print_if_insecure(const char *reason) { if (boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN)) pr_info("%s\n", reason); } static void __init pti_print_if_secure(const char *reason) { if (!boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN)) pr_info("%s\n", reason); } /* Assume mode is auto unless overridden via cmdline below. */ static enum pti_mode { PTI_AUTO = 0, PTI_FORCE_OFF, PTI_FORCE_ON } pti_mode; void __init pti_check_boottime_disable(void) { if (hypervisor_is_type(X86_HYPER_XEN_PV)) { pti_mode = PTI_FORCE_OFF; pti_print_if_insecure("disabled on XEN PV."); return; } if (cpu_mitigations_off()) pti_mode = PTI_FORCE_OFF; if (pti_mode == PTI_FORCE_OFF) { pti_print_if_insecure("disabled on command line."); return; } if (pti_mode == PTI_FORCE_ON) pti_print_if_secure("force enabled on command line."); if (pti_mode == PTI_AUTO && !boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN)) return; setup_force_cpu_cap(X86_FEATURE_PTI); } static int __init pti_parse_cmdline(char *arg) { if (!strcmp(arg, "off")) pti_mode = PTI_FORCE_OFF; else if (!strcmp(arg, "on")) pti_mode = PTI_FORCE_ON; else if (!strcmp(arg, "auto")) pti_mode = PTI_AUTO; else return -EINVAL; return 0; } early_param("pti", pti_parse_cmdline); static int __init pti_parse_cmdline_nopti(char *arg) { pti_mode = PTI_FORCE_OFF; return 0; } early_param("nopti", pti_parse_cmdline_nopti); pgd_t __pti_set_user_pgtbl(pgd_t *pgdp, pgd_t pgd) { /* * Changes to the high (kernel) portion of the kernelmode page * tables are not automatically propagated to the usermode tables. * * Users should keep in mind that, unlike the kernelmode tables, * there is no vmalloc_fault equivalent for the usermode tables. * Top-level entries added to init_mm's usermode pgd after boot * will not be automatically propagated to other mms. */ if (!pgdp_maps_userspace(pgdp) || (pgd.pgd & _PAGE_NOPTISHADOW)) return pgd; /* * The user page tables get the full PGD, accessible from * userspace: */ kernel_to_user_pgdp(pgdp)->pgd = pgd.pgd; /* * If this is normal user memory, make it NX in the kernel * pagetables so that, if we somehow screw up and return to * usermode with the kernel CR3 loaded, we'll get a page fault * instead of allowing user code to execute with the wrong CR3. * * As exceptions, we don't set NX if: * - _PAGE_USER is not set. This could be an executable * EFI runtime mapping or something similar, and the kernel * may execute from it * - we don't have NX support * - we're clearing the PGD (i.e. the new pgd is not present). */ if ((pgd.pgd & (_PAGE_USER|_PAGE_PRESENT)) == (_PAGE_USER|_PAGE_PRESENT) && (__supported_pte_mask & _PAGE_NX)) pgd.pgd |= _PAGE_NX; /* return the copy of the PGD we want the kernel to use: */ return pgd; } /* * Walk the user copy of the page tables (optionally) trying to allocate * page table pages on the way down. * * Returns a pointer to a P4D on success, or NULL on failure. */ static p4d_t *pti_user_pagetable_walk_p4d(unsigned long address) { pgd_t *pgd = kernel_to_user_pgdp(pgd_offset_k(address)); gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO); if (address < PAGE_OFFSET) { WARN_ONCE(1, "attempt to walk user address\n"); return NULL; } if (pgd_none(*pgd)) { unsigned long new_p4d_page = __get_free_page(gfp); if (WARN_ON_ONCE(!new_p4d_page)) return NULL; set_pgd(pgd, __pgd(_KERNPG_TABLE | __pa(new_p4d_page))); } BUILD_BUG_ON(pgd_leaf(*pgd) != 0); return p4d_offset(pgd, address); } /* * Walk the user copy of the page tables (optionally) trying to allocate * page table pages on the way down. * * Returns a pointer to a PMD on success, or NULL on failure. */ static pmd_t *pti_user_pagetable_walk_pmd(unsigned long address) { gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO); p4d_t *p4d; pud_t *pud; p4d = pti_user_pagetable_walk_p4d(address); if (!p4d) return NULL; BUILD_BUG_ON(p4d_leaf(*p4d) != 0); if (p4d_none(*p4d)) { unsigned long new_pud_page = __get_free_page(gfp); if (WARN_ON_ONCE(!new_pud_page)) return NULL; set_p4d(p4d, __p4d(_KERNPG_TABLE | __pa(new_pud_page))); } pud = pud_offset(p4d, address); /* The user page tables do not use large mappings: */ if (pud_leaf(*pud)) { WARN_ON(1); return NULL; } if (pud_none(*pud)) { unsigned long new_pmd_page = __get_free_page(gfp); if (WARN_ON_ONCE(!new_pmd_page)) return NULL; set_pud(pud, __pud(_KERNPG_TABLE | __pa(new_pmd_page))); } return pmd_offset(pud, address); } /* * Walk the shadow copy of the page tables (optionally) trying to allocate * page table pages on the way down. Does not support large pages. * * Note: this is only used when mapping *new* kernel data into the * user/shadow page tables. It is never used for userspace data. * * Returns a pointer to a PTE on success, or NULL on failure. */ static pte_t *pti_user_pagetable_walk_pte(unsigned long address, bool late_text) { gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO); pmd_t *pmd; pte_t *pte; pmd = pti_user_pagetable_walk_pmd(address); if (!pmd) return NULL; /* Large PMD mapping found */ if (pmd_leaf(*pmd)) { /* Clear the PMD if we hit a large mapping from the first round */ if (late_text) { set_pmd(pmd, __pmd(0)); } else { WARN_ON_ONCE(1); return NULL; } } if (pmd_none(*pmd)) { unsigned long new_pte_page = __get_free_page(gfp); if (!new_pte_page) return NULL; set_pmd(pmd, __pmd(_KERNPG_TABLE | __pa(new_pte_page))); } pte = pte_offset_kernel(pmd, address); if (pte_flags(*pte) & _PAGE_USER) { WARN_ONCE(1, "attempt to walk to user pte\n"); return NULL; } return pte; } #ifdef CONFIG_X86_VSYSCALL_EMULATION static void __init pti_setup_vsyscall(void) { pte_t *pte, *target_pte; unsigned int level; pte = lookup_address(VSYSCALL_ADDR, &level); if (!pte || WARN_ON(level != PG_LEVEL_4K) || pte_none(*pte)) return; target_pte = pti_user_pagetable_walk_pte(VSYSCALL_ADDR, false); if (WARN_ON(!target_pte)) return; *target_pte = *pte; set_vsyscall_pgtable_user_bits(kernel_to_user_pgdp(swapper_pg_dir)); } #else static void __init pti_setup_vsyscall(void) { } #endif enum pti_clone_level { PTI_CLONE_PMD, PTI_CLONE_PTE, }; static void pti_clone_pgtable(unsigned long start, unsigned long end, enum pti_clone_level level, bool late_text) { unsigned long addr; /* * Clone the populated PMDs which cover start to end. These PMD areas * can have holes. */ for (addr = start; addr < end;) { pte_t *pte, *target_pte; pmd_t *pmd, *target_pmd; pgd_t *pgd; p4d_t *p4d; pud_t *pud; /* Overflow check */ if (addr < start) break; pgd = pgd_offset_k(addr); if (WARN_ON(pgd_none(*pgd))) return; p4d = p4d_offset(pgd, addr); if (WARN_ON(p4d_none(*p4d))) return; pud = pud_offset(p4d, addr); if (pud_none(*pud)) { WARN_ON_ONCE(addr & ~PUD_MASK); addr = round_up(addr + 1, PUD_SIZE); continue; } pmd = pmd_offset(pud, addr); if (pmd_none(*pmd)) { WARN_ON_ONCE(addr & ~PMD_MASK); addr = round_up(addr + 1, PMD_SIZE); continue; } if (pmd_leaf(*pmd) || level == PTI_CLONE_PMD) { target_pmd = pti_user_pagetable_walk_pmd(addr); if (WARN_ON(!target_pmd)) return; /* * Only clone present PMDs. This ensures only setting * _PAGE_GLOBAL on present PMDs. This should only be * called on well-known addresses anyway, so a non- * present PMD would be a surprise. */ if (WARN_ON(!(pmd_flags(*pmd) & _PAGE_PRESENT))) return; /* * Setting 'target_pmd' below creates a mapping in both * the user and kernel page tables. It is effectively * global, so set it as global in both copies. Note: * the X86_FEATURE_PGE check is not _required_ because * the CPU ignores _PAGE_GLOBAL when PGE is not * supported. The check keeps consistency with * code that only set this bit when supported. */ if (boot_cpu_has(X86_FEATURE_PGE)) *pmd = pmd_set_flags(*pmd, _PAGE_GLOBAL); /* * Copy the PMD. That is, the kernelmode and usermode * tables will share the last-level page tables of this * address range */ *target_pmd = *pmd; addr = round_up(addr + 1, PMD_SIZE); } else if (level == PTI_CLONE_PTE) { /* Walk the page-table down to the pte level */ pte = pte_offset_kernel(pmd, addr); if (pte_none(*pte)) { addr = round_up(addr + 1, PAGE_SIZE); continue; } /* Only clone present PTEs */ if (WARN_ON(!(pte_flags(*pte) & _PAGE_PRESENT))) return; /* Allocate PTE in the user page-table */ target_pte = pti_user_pagetable_walk_pte(addr, late_text); if (WARN_ON(!target_pte)) return; /* Set GLOBAL bit in both PTEs */ if (boot_cpu_has(X86_FEATURE_PGE)) *pte = pte_set_flags(*pte, _PAGE_GLOBAL); /* Clone the PTE */ *target_pte = *pte; addr = round_up(addr + 1, PAGE_SIZE); } else { BUG(); } } } #ifdef CONFIG_X86_64 /* * Clone a single p4d (i.e. a top-level entry on 4-level systems and a * next-level entry on 5-level systems. */ static void __init pti_clone_p4d(unsigned long addr) { p4d_t *kernel_p4d, *user_p4d; pgd_t *kernel_pgd; user_p4d = pti_user_pagetable_walk_p4d(addr); if (!user_p4d) return; kernel_pgd = pgd_offset_k(addr); kernel_p4d = p4d_offset(kernel_pgd, addr); *user_p4d = *kernel_p4d; } /* * Clone the CPU_ENTRY_AREA and associated data into the user space visible * page table. */ static void __init pti_clone_user_shared(void) { unsigned int cpu; pti_clone_p4d(CPU_ENTRY_AREA_BASE); for_each_possible_cpu(cpu) { /* * The SYSCALL64 entry code needs one word of scratch space * in which to spill a register. It lives in the sp2 slot * of the CPU's TSS. * * This is done for all possible CPUs during boot to ensure * that it's propagated to all mms. */ unsigned long va = (unsigned long)&per_cpu(cpu_tss_rw, cpu); phys_addr_t pa = per_cpu_ptr_to_phys((void *)va); pte_t *target_pte; target_pte = pti_user_pagetable_walk_pte(va, false); if (WARN_ON(!target_pte)) return; *target_pte = pfn_pte(pa >> PAGE_SHIFT, PAGE_KERNEL); } } #else /* CONFIG_X86_64 */ /* * On 32 bit PAE systems with 1GB of Kernel address space there is only * one pgd/p4d for the whole kernel. Cloning that would map the whole * address space into the user page-tables, making PTI useless. So clone * the page-table on the PMD level to prevent that. */ static void __init pti_clone_user_shared(void) { unsigned long start, end; start = CPU_ENTRY_AREA_BASE; end = start + (PAGE_SIZE * CPU_ENTRY_AREA_PAGES); pti_clone_pgtable(start, end, PTI_CLONE_PMD, false); } #endif /* CONFIG_X86_64 */ /* * Clone the ESPFIX P4D into the user space visible page table */ static void __init pti_setup_espfix64(void) { #ifdef CONFIG_X86_ESPFIX64 pti_clone_p4d(ESPFIX_BASE_ADDR); #endif } /* * Clone the populated PMDs of the entry text and force it RO. */ static void pti_clone_entry_text(bool late) { pti_clone_pgtable((unsigned long) __entry_text_start, (unsigned long) __entry_text_end, PTI_LEVEL_KERNEL_IMAGE, late); } /* * Global pages and PCIDs are both ways to make kernel TLB entries * live longer, reduce TLB misses and improve kernel performance. * But, leaving all kernel text Global makes it potentially accessible * to Meltdown-style attacks which make it trivial to find gadgets or * defeat KASLR. * * Only use global pages when it is really worth it. */ static inline bool pti_kernel_image_global_ok(void) { /* * Systems with PCIDs get little benefit from global * kernel text and are not worth the downsides. */ if (cpu_feature_enabled(X86_FEATURE_PCID)) return false; /* * Only do global kernel image for pti=auto. Do the most * secure thing (not global) if pti=on specified. */ if (pti_mode != PTI_AUTO) return false; /* * K8 may not tolerate the cleared _PAGE_RW on the userspace * global kernel image pages. Do the safe thing (disable * global kernel image). This is unlikely to ever be * noticed because PTI is disabled by default on AMD CPUs. */ if (boot_cpu_has(X86_FEATURE_K8)) return false; /* * RANDSTRUCT derives its hardening benefits from the * attacker's lack of knowledge about the layout of kernel * data structures. Keep the kernel image non-global in * cases where RANDSTRUCT is in use to help keep the layout a * secret. */ if (IS_ENABLED(CONFIG_RANDSTRUCT)) return false; return true; } /* * For some configurations, map all of kernel text into the user page * tables. This reduces TLB misses, especially on non-PCID systems. */ static void pti_clone_kernel_text(void) { /* * rodata is part of the kernel image and is normally * readable on the filesystem or on the web. But, do not * clone the areas past rodata, they might contain secrets. */ unsigned long start = PFN_ALIGN(_text); unsigned long end_clone = (unsigned long)__end_rodata_aligned; unsigned long end_global = PFN_ALIGN((unsigned long)_etext); if (!pti_kernel_image_global_ok()) return; pr_debug("mapping partial kernel image into user address space\n"); /* * Note that this will undo _some_ of the work that * pti_set_kernel_image_nonglobal() did to clear the * global bit. */ pti_clone_pgtable(start, end_clone, PTI_LEVEL_KERNEL_IMAGE, false); /* * pti_clone_pgtable() will set the global bit in any PMDs * that it clones, but we also need to get any PTEs in * the last level for areas that are not huge-page-aligned. */ /* Set the global bit for normal non-__init kernel text: */ set_memory_global(start, (end_global - start) >> PAGE_SHIFT); } static void pti_set_kernel_image_nonglobal(void) { /* * The identity map is created with PMDs, regardless of the * actual length of the kernel. We need to clear * _PAGE_GLOBAL up to a PMD boundary, not just to the end * of the image. */ unsigned long start = PFN_ALIGN(_text); unsigned long end = ALIGN((unsigned long)_end, PMD_SIZE); /* * This clears _PAGE_GLOBAL from the entire kernel image. * pti_clone_kernel_text() map put _PAGE_GLOBAL back for * areas that are mapped to userspace. */ set_memory_nonglobal(start, (end - start) >> PAGE_SHIFT); } /* * Initialize kernel page table isolation */ void __init pti_init(void) { if (!boot_cpu_has(X86_FEATURE_PTI)) return; pr_info("enabled\n"); #ifdef CONFIG_X86_32 /* * We check for X86_FEATURE_PCID here. But the init-code will * clear the feature flag on 32 bit because the feature is not * supported on 32 bit anyway. To print the warning we need to * check with cpuid directly again. */ if (cpuid_ecx(0x1) & BIT(17)) { /* Use printk to work around pr_fmt() */ printk(KERN_WARNING "\n"); printk(KERN_WARNING "************************************************************\n"); printk(KERN_WARNING "** WARNING! WARNING! WARNING! WARNING! WARNING! WARNING! **\n"); printk(KERN_WARNING "** **\n"); printk(KERN_WARNING "** You are using 32-bit PTI on a 64-bit PCID-capable CPU. **\n"); printk(KERN_WARNING "** Your performance will increase dramatically if you **\n"); printk(KERN_WARNING "** switch to a 64-bit kernel! **\n"); printk(KERN_WARNING "** **\n"); printk(KERN_WARNING "** WARNING! WARNING! WARNING! WARNING! WARNING! WARNING! **\n"); printk(KERN_WARNING "************************************************************\n"); } #endif pti_clone_user_shared(); /* Undo all global bits from the init pagetables in head_64.S: */ pti_set_kernel_image_nonglobal(); /* Replace some of the global bits just for shared entry text: */ /* * This is very early in boot. Device and Late initcalls can do * modprobe before free_initmem() and mark_readonly(). This * pti_clone_entry_text() allows those user-mode-helpers to function, * but notably the text is still RW. */ pti_clone_entry_text(false); pti_setup_espfix64(); pti_setup_vsyscall(); } /* * Finalize the kernel mappings in the userspace page-table. Some of the * mappings for the kernel image might have changed since pti_init() * cloned them. This is because parts of the kernel image have been * mapped RO and/or NX. These changes need to be cloned again to the * userspace page-table. */ void pti_finalize(void) { if (!boot_cpu_has(X86_FEATURE_PTI)) return; /* * This is after free_initmem() (all initcalls are done) and we've done * mark_readonly(). Text is now NX which might've split some PMDs * relative to the early clone. */ pti_clone_entry_text(true); pti_clone_kernel_text(); debug_checkwx_user(); }
51 51 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB /* * Copyright (c) 2017 Mellanox Technologies Ltd. All rights reserved. */ #include "rxe.h" #include "rxe_hw_counters.h" static const struct rdma_stat_desc rxe_counter_descs[] = { [RXE_CNT_SENT_PKTS].name = "sent_pkts", [RXE_CNT_RCVD_PKTS].name = "rcvd_pkts", [RXE_CNT_DUP_REQ].name = "duplicate_request", [RXE_CNT_OUT_OF_SEQ_REQ].name = "out_of_seq_request", [RXE_CNT_RCV_RNR].name = "rcvd_rnr_err", [RXE_CNT_SND_RNR].name = "send_rnr_err", [RXE_CNT_RCV_SEQ_ERR].name = "rcvd_seq_err", [RXE_CNT_SENDER_SCHED].name = "ack_deferred", [RXE_CNT_RETRY_EXCEEDED].name = "retry_exceeded_err", [RXE_CNT_RNR_RETRY_EXCEEDED].name = "retry_rnr_exceeded_err", [RXE_CNT_COMP_RETRY].name = "completer_retry_err", [RXE_CNT_SEND_ERR].name = "send_err", [RXE_CNT_LINK_DOWNED].name = "link_downed", [RXE_CNT_RDMA_SEND].name = "rdma_sends", [RXE_CNT_RDMA_RECV].name = "rdma_recvs", }; int rxe_ib_get_hw_stats(struct ib_device *ibdev, struct rdma_hw_stats *stats, u32 port, int index) { struct rxe_dev *dev = to_rdev(ibdev); unsigned int cnt; if (!port || !stats) return -EINVAL; for (cnt = 0; cnt < ARRAY_SIZE(rxe_counter_descs); cnt++) stats->value[cnt] = atomic64_read(&dev->stats_counters[cnt]); return ARRAY_SIZE(rxe_counter_descs); } struct rdma_hw_stats *rxe_ib_alloc_hw_port_stats(struct ib_device *ibdev, u32 port_num) { BUILD_BUG_ON(ARRAY_SIZE(rxe_counter_descs) != RXE_NUM_OF_COUNTERS); return rdma_alloc_hw_stats_struct(rxe_counter_descs, ARRAY_SIZE(rxe_counter_descs), RDMA_HW_STATS_DEFAULT_LIFESPAN); }
115 115 1 4 61 4 26 4 1 1 2 25 6 3 26 1 25 1 67 6 3 61 6 3 6 6 6 6 25 26 25 6 22 11 1 2 8 1 1 1 1 1 1 1 7 3 2 2 16 6 10 1 1 10 33 33 35 10 4 3 1 2 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 // SPDX-License-Identifier: GPL-2.0 /* * To speed up listener socket lookup, create an array to store all sockets * listening on the same port. This allows a decision to be made after finding * the first socket. An optional BPF program can also be configured for * selecting the socket index from the array of available sockets. */ #include <net/ip.h> #include <net/sock_reuseport.h> #include <linux/bpf.h> #include <linux/idr.h> #include <linux/filter.h> #include <linux/rcupdate.h> #define INIT_SOCKS 128 DEFINE_SPINLOCK(reuseport_lock); static DEFINE_IDA(reuseport_ida); static int reuseport_resurrect(struct sock *sk, struct sock_reuseport *old_reuse, struct sock_reuseport *reuse, bool bind_inany); void reuseport_has_conns_set(struct sock *sk) { struct sock_reuseport *reuse; if (!rcu_access_pointer(sk->sk_reuseport_cb)) return; spin_lock_bh(&reuseport_lock); reuse = rcu_dereference_protected(sk->sk_reuseport_cb, lockdep_is_held(&reuseport_lock)); if (likely(reuse)) reuse->has_conns = 1; spin_unlock_bh(&reuseport_lock); } EXPORT_SYMBOL(reuseport_has_conns_set); static void __reuseport_get_incoming_cpu(struct sock_reuseport *reuse) { /* Paired with READ_ONCE() in reuseport_select_sock_by_hash(). */ WRITE_ONCE(reuse->incoming_cpu, reuse->incoming_cpu + 1); } static void __reuseport_put_incoming_cpu(struct sock_reuseport *reuse) { /* Paired with READ_ONCE() in reuseport_select_sock_by_hash(). */ WRITE_ONCE(reuse->incoming_cpu, reuse->incoming_cpu - 1); } static void reuseport_get_incoming_cpu(struct sock *sk, struct sock_reuseport *reuse) { if (sk->sk_incoming_cpu >= 0) __reuseport_get_incoming_cpu(reuse); } static void reuseport_put_incoming_cpu(struct sock *sk, struct sock_reuseport *reuse) { if (sk->sk_incoming_cpu >= 0) __reuseport_put_incoming_cpu(reuse); } void reuseport_update_incoming_cpu(struct sock *sk, int val) { struct sock_reuseport *reuse; int old_sk_incoming_cpu; if (unlikely(!rcu_access_pointer(sk->sk_reuseport_cb))) { /* Paired with REAE_ONCE() in sk_incoming_cpu_update() * and compute_score(). */ WRITE_ONCE(sk->sk_incoming_cpu, val); return; } spin_lock_bh(&reuseport_lock); /* This must be done under reuseport_lock to avoid a race with * reuseport_grow(), which accesses sk->sk_incoming_cpu without * lock_sock() when detaching a shutdown()ed sk. * * Paired with READ_ONCE() in reuseport_select_sock_by_hash(). */ old_sk_incoming_cpu = sk->sk_incoming_cpu; WRITE_ONCE(sk->sk_incoming_cpu, val); reuse = rcu_dereference_protected(sk->sk_reuseport_cb, lockdep_is_held(&reuseport_lock)); /* reuseport_grow() has detached a closed sk. */ if (!reuse) goto out; if (old_sk_incoming_cpu < 0 && val >= 0) __reuseport_get_incoming_cpu(reuse); else if (old_sk_incoming_cpu >= 0 && val < 0) __reuseport_put_incoming_cpu(reuse); out: spin_unlock_bh(&reuseport_lock); } static int reuseport_sock_index(struct sock *sk, const struct sock_reuseport *reuse, bool closed) { int left, right; if (!closed) { left = 0; right = reuse->num_socks; } else { left = reuse->max_socks - reuse->num_closed_socks; right = reuse->max_socks; } for (; left < right; left++) if (reuse->socks[left] == sk) return left; return -1; } static void __reuseport_add_sock(struct sock *sk, struct sock_reuseport *reuse) { reuse->socks[reuse->num_socks] = sk; /* paired with smp_rmb() in reuseport_(select|migrate)_sock() */ smp_wmb(); reuse->num_socks++; reuseport_get_incoming_cpu(sk, reuse); } static bool __reuseport_detach_sock(struct sock *sk, struct sock_reuseport *reuse) { int i = reuseport_sock_index(sk, reuse, false); if (i == -1) return false; reuse->socks[i] = reuse->socks[reuse->num_socks - 1]; reuse->num_socks--; reuseport_put_incoming_cpu(sk, reuse); return true; } static void __reuseport_add_closed_sock(struct sock *sk, struct sock_reuseport *reuse) { reuse->socks[reuse->max_socks - reuse->num_closed_socks - 1] = sk; /* paired with READ_ONCE() in inet_csk_bind_conflict() */ WRITE_ONCE(reuse->num_closed_socks, reuse->num_closed_socks + 1); reuseport_get_incoming_cpu(sk, reuse); } static bool __reuseport_detach_closed_sock(struct sock *sk, struct sock_reuseport *reuse) { int i = reuseport_sock_index(sk, reuse, true); if (i == -1) return false; reuse->socks[i] = reuse->socks[reuse->max_socks - reuse->num_closed_socks]; /* paired with READ_ONCE() in inet_csk_bind_conflict() */ WRITE_ONCE(reuse->num_closed_socks, reuse->num_closed_socks - 1); reuseport_put_incoming_cpu(sk, reuse); return true; } static struct sock_reuseport *__reuseport_alloc(unsigned int max_socks) { struct sock_reuseport *reuse; reuse = kzalloc(struct_size(reuse, socks, max_socks), GFP_ATOMIC); if (!reuse) return NULL; reuse->max_socks = max_socks; RCU_INIT_POINTER(reuse->prog, NULL); return reuse; } int reuseport_alloc(struct sock *sk, bool bind_inany) { struct sock_reuseport *reuse; int id, ret = 0; /* bh lock used since this function call may precede hlist lock in * soft irq of receive path or setsockopt from process context */ spin_lock_bh(&reuseport_lock); /* Allocation attempts can occur concurrently via the setsockopt path * and the bind/hash path. Nothing to do when we lose the race. */ reuse = rcu_dereference_protected(sk->sk_reuseport_cb, lockdep_is_held(&reuseport_lock)); if (reuse) { if (reuse->num_closed_socks) { /* sk was shutdown()ed before */ ret = reuseport_resurrect(sk, reuse, NULL, bind_inany); goto out; } /* Only set reuse->bind_inany if the bind_inany is true. * Otherwise, it will overwrite the reuse->bind_inany * which was set by the bind/hash path. */ if (bind_inany) reuse->bind_inany = bind_inany; goto out; } reuse = __reuseport_alloc(INIT_SOCKS); if (!reuse) { ret = -ENOMEM; goto out; } id = ida_alloc(&reuseport_ida, GFP_ATOMIC); if (id < 0) { kfree(reuse); ret = id; goto out; } reuse->reuseport_id = id; reuse->bind_inany = bind_inany; reuse->socks[0] = sk; reuse->num_socks = 1; reuseport_get_incoming_cpu(sk, reuse); rcu_assign_pointer(sk->sk_reuseport_cb, reuse); out: spin_unlock_bh(&reuseport_lock); return ret; } EXPORT_SYMBOL(reuseport_alloc); static struct sock_reuseport *reuseport_grow(struct sock_reuseport *reuse) { struct sock_reuseport *more_reuse; u32 more_socks_size, i; more_socks_size = reuse->max_socks * 2U; if (more_socks_size > U16_MAX) { if (reuse->num_closed_socks) { /* Make room by removing a closed sk. * The child has already been migrated. * Only reqsk left at this point. */ struct sock *sk; sk = reuse->socks[reuse->max_socks - reuse->num_closed_socks]; RCU_INIT_POINTER(sk->sk_reuseport_cb, NULL); __reuseport_detach_closed_sock(sk, reuse); return reuse; } return NULL; } more_reuse = __reuseport_alloc(more_socks_size); if (!more_reuse) return NULL; more_reuse->num_socks = reuse->num_socks; more_reuse->num_closed_socks = reuse->num_closed_socks; more_reuse->prog = reuse->prog; more_reuse->reuseport_id = reuse->reuseport_id; more_reuse->bind_inany = reuse->bind_inany; more_reuse->has_conns = reuse->has_conns; more_reuse->incoming_cpu = reuse->incoming_cpu; memcpy(more_reuse->socks, reuse->socks, reuse->num_socks * sizeof(struct sock *)); memcpy(more_reuse->socks + (more_reuse->max_socks - more_reuse->num_closed_socks), reuse->socks + (reuse->max_socks - reuse->num_closed_socks), reuse->num_closed_socks * sizeof(struct sock *)); more_reuse->synq_overflow_ts = READ_ONCE(reuse->synq_overflow_ts); for (i = 0; i < reuse->max_socks; ++i) rcu_assign_pointer(reuse->socks[i]->sk_reuseport_cb, more_reuse); /* Note: we use kfree_rcu here instead of reuseport_free_rcu so * that reuse and more_reuse can temporarily share a reference * to prog. */ kfree_rcu(reuse, rcu); return more_reuse; } static void reuseport_free_rcu(struct rcu_head *head) { struct sock_reuseport *reuse; reuse = container_of(head, struct sock_reuseport, rcu); sk_reuseport_prog_free(rcu_dereference_protected(reuse->prog, 1)); ida_free(&reuseport_ida, reuse->reuseport_id); kfree(reuse); } /** * reuseport_add_sock - Add a socket to the reuseport group of another. * @sk: New socket to add to the group. * @sk2: Socket belonging to the existing reuseport group. * @bind_inany: Whether or not the group is bound to a local INANY address. * * May return ENOMEM and not add socket to group under memory pressure. */ int reuseport_add_sock(struct sock *sk, struct sock *sk2, bool bind_inany) { struct sock_reuseport *old_reuse, *reuse; if (!rcu_access_pointer(sk2->sk_reuseport_cb)) { int err = reuseport_alloc(sk2, bind_inany); if (err) return err; } spin_lock_bh(&reuseport_lock); reuse = rcu_dereference_protected(sk2->sk_reuseport_cb, lockdep_is_held(&reuseport_lock)); old_reuse = rcu_dereference_protected(sk->sk_reuseport_cb, lockdep_is_held(&reuseport_lock)); if (old_reuse && old_reuse->num_closed_socks) { /* sk was shutdown()ed before */ int err = reuseport_resurrect(sk, old_reuse, reuse, reuse->bind_inany); spin_unlock_bh(&reuseport_lock); return err; } if (old_reuse && old_reuse->num_socks != 1) { spin_unlock_bh(&reuseport_lock); return -EBUSY; } if (reuse->num_socks + reuse->num_closed_socks == reuse->max_socks) { reuse = reuseport_grow(reuse); if (!reuse) { spin_unlock_bh(&reuseport_lock); return -ENOMEM; } } __reuseport_add_sock(sk, reuse); rcu_assign_pointer(sk->sk_reuseport_cb, reuse); spin_unlock_bh(&reuseport_lock); if (old_reuse) call_rcu(&old_reuse->rcu, reuseport_free_rcu); return 0; } EXPORT_SYMBOL(reuseport_add_sock); static int reuseport_resurrect(struct sock *sk, struct sock_reuseport *old_reuse, struct sock_reuseport *reuse, bool bind_inany) { if (old_reuse == reuse) { /* If sk was in the same reuseport group, just pop sk out of * the closed section and push sk into the listening section. */ __reuseport_detach_closed_sock(sk, old_reuse); __reuseport_add_sock(sk, old_reuse); return 0; } if (!reuse) { /* In bind()/listen() path, we cannot carry over the eBPF prog * for the shutdown()ed socket. In setsockopt() path, we should * not change the eBPF prog of listening sockets by attaching a * prog to the shutdown()ed socket. Thus, we will allocate a new * reuseport group and detach sk from the old group. */ int id; reuse = __reuseport_alloc(INIT_SOCKS); if (!reuse) return -ENOMEM; id = ida_alloc(&reuseport_ida, GFP_ATOMIC); if (id < 0) { kfree(reuse); return id; } reuse->reuseport_id = id; reuse->bind_inany = bind_inany; } else { /* Move sk from the old group to the new one if * - all the other listeners in the old group were close()d or * shutdown()ed, and then sk2 has listen()ed on the same port * OR * - sk listen()ed without bind() (or with autobind), was * shutdown()ed, and then listen()s on another port which * sk2 listen()s on. */ if (reuse->num_socks + reuse->num_closed_socks == reuse->max_socks) { reuse = reuseport_grow(reuse); if (!reuse) return -ENOMEM; } } __reuseport_detach_closed_sock(sk, old_reuse); __reuseport_add_sock(sk, reuse); rcu_assign_pointer(sk->sk_reuseport_cb, reuse); if (old_reuse->num_socks + old_reuse->num_closed_socks == 0) call_rcu(&old_reuse->rcu, reuseport_free_rcu); return 0; } void reuseport_detach_sock(struct sock *sk) { struct sock_reuseport *reuse; spin_lock_bh(&reuseport_lock); reuse = rcu_dereference_protected(sk->sk_reuseport_cb, lockdep_is_held(&reuseport_lock)); /* reuseport_grow() has detached a closed sk */ if (!reuse) goto out; /* Notify the bpf side. The sk may be added to a sockarray * map. If so, sockarray logic will remove it from the map. * * Other bpf map types that work with reuseport, like sockmap, * don't need an explicit callback from here. They override sk * unhash/close ops to remove the sk from the map before we * get to this point. */ bpf_sk_reuseport_detach(sk); rcu_assign_pointer(sk->sk_reuseport_cb, NULL); if (!__reuseport_detach_closed_sock(sk, reuse)) __reuseport_detach_sock(sk, reuse); if (reuse->num_socks + reuse->num_closed_socks == 0) call_rcu(&reuse->rcu, reuseport_free_rcu); out: spin_unlock_bh(&reuseport_lock); } EXPORT_SYMBOL(reuseport_detach_sock); void reuseport_stop_listen_sock(struct sock *sk) { if (sk->sk_protocol == IPPROTO_TCP) { struct sock_reuseport *reuse; struct bpf_prog *prog; spin_lock_bh(&reuseport_lock); reuse = rcu_dereference_protected(sk->sk_reuseport_cb, lockdep_is_held(&reuseport_lock)); prog = rcu_dereference_protected(reuse->prog, lockdep_is_held(&reuseport_lock)); if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_migrate_req) || (prog && prog->expected_attach_type == BPF_SK_REUSEPORT_SELECT_OR_MIGRATE)) { /* Migration capable, move sk from the listening section * to the closed section. */ bpf_sk_reuseport_detach(sk); __reuseport_detach_sock(sk, reuse); __reuseport_add_closed_sock(sk, reuse); spin_unlock_bh(&reuseport_lock); return; } spin_unlock_bh(&reuseport_lock); } /* Not capable to do migration, detach immediately */ reuseport_detach_sock(sk); } EXPORT_SYMBOL(reuseport_stop_listen_sock); static struct sock *run_bpf_filter(struct sock_reuseport *reuse, u16 socks, struct bpf_prog *prog, struct sk_buff *skb, int hdr_len) { struct sk_buff *nskb = NULL; u32 index; if (skb_shared(skb)) { nskb = skb_clone(skb, GFP_ATOMIC); if (!nskb) return NULL; skb = nskb; } /* temporarily advance data past protocol header */ if (!pskb_pull(skb, hdr_len)) { kfree_skb(nskb); return NULL; } index = bpf_prog_run_save_cb(prog, skb); __skb_push(skb, hdr_len); consume_skb(nskb); if (index >= socks) return NULL; return reuse->socks[index]; } static struct sock *reuseport_select_sock_by_hash(struct sock_reuseport *reuse, u32 hash, u16 num_socks) { struct sock *first_valid_sk = NULL; int i, j; i = j = reciprocal_scale(hash, num_socks); do { struct sock *sk = reuse->socks[i]; if (sk->sk_state != TCP_ESTABLISHED) { /* Paired with WRITE_ONCE() in __reuseport_(get|put)_incoming_cpu(). */ if (!READ_ONCE(reuse->incoming_cpu)) return sk; /* Paired with WRITE_ONCE() in reuseport_update_incoming_cpu(). */ if (READ_ONCE(sk->sk_incoming_cpu) == raw_smp_processor_id()) return sk; if (!first_valid_sk) first_valid_sk = sk; } i++; if (i >= num_socks) i = 0; } while (i != j); return first_valid_sk; } /** * reuseport_select_sock - Select a socket from an SO_REUSEPORT group. * @sk: First socket in the group. * @hash: When no BPF filter is available, use this hash to select. * @skb: skb to run through BPF filter. * @hdr_len: BPF filter expects skb data pointer at payload data. If * the skb does not yet point at the payload, this parameter represents * how far the pointer needs to advance to reach the payload. * Returns a socket that should receive the packet (or NULL on error). */ struct sock *reuseport_select_sock(struct sock *sk, u32 hash, struct sk_buff *skb, int hdr_len) { struct sock_reuseport *reuse; struct bpf_prog *prog; struct sock *sk2 = NULL; u16 socks; rcu_read_lock(); reuse = rcu_dereference(sk->sk_reuseport_cb); /* if memory allocation failed or add call is not yet complete */ if (!reuse) goto out; prog = rcu_dereference(reuse->prog); socks = READ_ONCE(reuse->num_socks); if (likely(socks)) { /* paired with smp_wmb() in __reuseport_add_sock() */ smp_rmb(); if (!prog || !skb) goto select_by_hash; if (prog->type == BPF_PROG_TYPE_SK_REUSEPORT) sk2 = bpf_run_sk_reuseport(reuse, sk, prog, skb, NULL, hash); else sk2 = run_bpf_filter(reuse, socks, prog, skb, hdr_len); select_by_hash: /* no bpf or invalid bpf result: fall back to hash usage */ if (!sk2) sk2 = reuseport_select_sock_by_hash(reuse, hash, socks); } out: rcu_read_unlock(); return sk2; } EXPORT_SYMBOL(reuseport_select_sock); /** * reuseport_migrate_sock - Select a socket from an SO_REUSEPORT group. * @sk: close()ed or shutdown()ed socket in the group. * @migrating_sk: ESTABLISHED/SYN_RECV full socket in the accept queue or * NEW_SYN_RECV request socket during 3WHS. * @skb: skb to run through BPF filter. * Returns a socket (with sk_refcnt +1) that should accept the child socket * (or NULL on error). */ struct sock *reuseport_migrate_sock(struct sock *sk, struct sock *migrating_sk, struct sk_buff *skb) { struct sock_reuseport *reuse; struct sock *nsk = NULL; bool allocated = false; struct bpf_prog *prog; u16 socks; u32 hash; rcu_read_lock(); reuse = rcu_dereference(sk->sk_reuseport_cb); if (!reuse) goto out; socks = READ_ONCE(reuse->num_socks); if (unlikely(!socks)) goto failure; /* paired with smp_wmb() in __reuseport_add_sock() */ smp_rmb(); hash = migrating_sk->sk_hash; prog = rcu_dereference(reuse->prog); if (!prog || prog->expected_attach_type != BPF_SK_REUSEPORT_SELECT_OR_MIGRATE) { if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_migrate_req)) goto select_by_hash; goto failure; } if (!skb) { skb = alloc_skb(0, GFP_ATOMIC); if (!skb) goto failure; allocated = true; } nsk = bpf_run_sk_reuseport(reuse, sk, prog, skb, migrating_sk, hash); if (allocated) kfree_skb(skb); select_by_hash: if (!nsk) nsk = reuseport_select_sock_by_hash(reuse, hash, socks); if (IS_ERR_OR_NULL(nsk) || unlikely(!refcount_inc_not_zero(&nsk->sk_refcnt))) { nsk = NULL; goto failure; } out: rcu_read_unlock(); return nsk; failure: __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMIGRATEREQFAILURE); goto out; } EXPORT_SYMBOL(reuseport_migrate_sock); int reuseport_attach_prog(struct sock *sk, struct bpf_prog *prog) { struct sock_reuseport *reuse; struct bpf_prog *old_prog; if (sk_unhashed(sk)) { int err; if (!sk->sk_reuseport) return -EINVAL; err = reuseport_alloc(sk, false); if (err) return err; } else if (!rcu_access_pointer(sk->sk_reuseport_cb)) { /* The socket wasn't bound with SO_REUSEPORT */ return -EINVAL; } spin_lock_bh(&reuseport_lock); reuse = rcu_dereference_protected(sk->sk_reuseport_cb, lockdep_is_held(&reuseport_lock)); old_prog = rcu_dereference_protected(reuse->prog, lockdep_is_held(&reuseport_lock)); rcu_assign_pointer(reuse->prog, prog); spin_unlock_bh(&reuseport_lock); sk_reuseport_prog_free(old_prog); return 0; } EXPORT_SYMBOL(reuseport_attach_prog); int reuseport_detach_prog(struct sock *sk) { struct sock_reuseport *reuse; struct bpf_prog *old_prog; old_prog = NULL; spin_lock_bh(&reuseport_lock); reuse = rcu_dereference_protected(sk->sk_reuseport_cb, lockdep_is_held(&reuseport_lock)); /* reuse must be checked after acquiring the reuseport_lock * because reuseport_grow() can detach a closed sk. */ if (!reuse) { spin_unlock_bh(&reuseport_lock); return sk->sk_reuseport ? -ENOENT : -EINVAL; } if (sk_unhashed(sk) && reuse->num_closed_socks) { spin_unlock_bh(&reuseport_lock); return -ENOENT; } old_prog = rcu_replace_pointer(reuse->prog, old_prog, lockdep_is_held(&reuseport_lock)); spin_unlock_bh(&reuseport_lock); if (!old_prog) return -ENOENT; sk_reuseport_prog_free(old_prog); return 0; } EXPORT_SYMBOL(reuseport_detach_prog);
17925 17917 46 2 44 15 6 27 4 4 40 40 40 4 2 1 1 1 1 1 1 2559 1735 115 4097 4094 2355 306 1840 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 // SPDX-License-Identifier: GPL-2.0-or-later /* * xfrm_device.c - IPsec device offloading code. * * Copyright (c) 2015 secunet Security Networks AG * * Author: * Steffen Klassert <steffen.klassert@secunet.com> */ #include <linux/errno.h> #include <linux/module.h> #include <linux/netdevice.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <net/dst.h> #include <net/gso.h> #include <net/xfrm.h> #include <linux/notifier.h> #ifdef CONFIG_XFRM_OFFLOAD static void __xfrm_transport_prep(struct xfrm_state *x, struct sk_buff *skb, unsigned int hsize) { struct xfrm_offload *xo = xfrm_offload(skb); skb_reset_mac_len(skb); if (xo->flags & XFRM_GSO_SEGMENT) skb->transport_header -= x->props.header_len; pskb_pull(skb, skb_transport_offset(skb) + x->props.header_len); } static void __xfrm_mode_tunnel_prep(struct xfrm_state *x, struct sk_buff *skb, unsigned int hsize) { struct xfrm_offload *xo = xfrm_offload(skb); if (xo->flags & XFRM_GSO_SEGMENT) skb->transport_header = skb->network_header + hsize; skb_reset_mac_len(skb); pskb_pull(skb, skb->mac_len + x->props.header_len - x->props.enc_hdr_len); } static void __xfrm_mode_beet_prep(struct xfrm_state *x, struct sk_buff *skb, unsigned int hsize) { struct xfrm_offload *xo = xfrm_offload(skb); int phlen = 0; if (xo->flags & XFRM_GSO_SEGMENT) skb->transport_header = skb->network_header + hsize; skb_reset_mac_len(skb); if (x->sel.family != AF_INET6) { phlen = IPV4_BEET_PHMAXLEN; if (x->outer_mode.family == AF_INET6) phlen += sizeof(struct ipv6hdr) - sizeof(struct iphdr); } pskb_pull(skb, skb->mac_len + hsize + (x->props.header_len - phlen)); } /* Adjust pointers into the packet when IPsec is done at layer2 */ static void xfrm_outer_mode_prep(struct xfrm_state *x, struct sk_buff *skb) { switch (x->outer_mode.encap) { case XFRM_MODE_IPTFS: case XFRM_MODE_TUNNEL: if (x->outer_mode.family == AF_INET) return __xfrm_mode_tunnel_prep(x, skb, sizeof(struct iphdr)); if (x->outer_mode.family == AF_INET6) return __xfrm_mode_tunnel_prep(x, skb, sizeof(struct ipv6hdr)); break; case XFRM_MODE_TRANSPORT: if (x->outer_mode.family == AF_INET) return __xfrm_transport_prep(x, skb, sizeof(struct iphdr)); if (x->outer_mode.family == AF_INET6) return __xfrm_transport_prep(x, skb, sizeof(struct ipv6hdr)); break; case XFRM_MODE_BEET: if (x->outer_mode.family == AF_INET) return __xfrm_mode_beet_prep(x, skb, sizeof(struct iphdr)); if (x->outer_mode.family == AF_INET6) return __xfrm_mode_beet_prep(x, skb, sizeof(struct ipv6hdr)); break; case XFRM_MODE_ROUTEOPTIMIZATION: case XFRM_MODE_IN_TRIGGER: break; } } static inline bool xmit_xfrm_check_overflow(struct sk_buff *skb) { struct xfrm_offload *xo = xfrm_offload(skb); __u32 seq = xo->seq.low; seq += skb_shinfo(skb)->gso_segs; if (unlikely(seq < xo->seq.low)) return true; return false; } struct sk_buff *validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t features, bool *again) { int err; unsigned long flags; struct xfrm_state *x; struct softnet_data *sd; struct sk_buff *skb2, *nskb, *pskb = NULL; netdev_features_t esp_features = features; struct xfrm_offload *xo = xfrm_offload(skb); struct net_device *dev = skb->dev; struct sec_path *sp; if (!xo || (xo->flags & XFRM_XMIT)) return skb; if (!(features & NETIF_F_HW_ESP)) esp_features = features & ~(NETIF_F_SG | NETIF_F_CSUM_MASK); sp = skb_sec_path(skb); x = sp->xvec[sp->len - 1]; if (xo->flags & XFRM_GRO || x->xso.dir == XFRM_DEV_OFFLOAD_IN) return skb; /* The packet was sent to HW IPsec packet offload engine, * but to wrong device. Drop the packet, so it won't skip * XFRM stack. */ if (x->xso.type == XFRM_DEV_OFFLOAD_PACKET && x->xso.dev != dev) { kfree_skb(skb); dev_core_stats_tx_dropped_inc(dev); return NULL; } /* This skb was already validated on the upper/virtual dev */ if ((x->xso.dev != dev) && (x->xso.real_dev == dev)) return skb; local_irq_save(flags); sd = this_cpu_ptr(&softnet_data); err = !skb_queue_empty(&sd->xfrm_backlog); local_irq_restore(flags); if (err) { *again = true; return skb; } if (skb_is_gso(skb) && (unlikely(x->xso.dev != dev) || unlikely(xmit_xfrm_check_overflow(skb)))) { struct sk_buff *segs; /* Packet got rerouted, fixup features and segment it. */ esp_features = esp_features & ~(NETIF_F_HW_ESP | NETIF_F_GSO_ESP); segs = skb_gso_segment(skb, esp_features); if (IS_ERR(segs)) { kfree_skb(skb); dev_core_stats_tx_dropped_inc(dev); return NULL; } else { consume_skb(skb); skb = segs; } } if (!skb->next) { esp_features |= skb->dev->gso_partial_features; xfrm_outer_mode_prep(x, skb); xo->flags |= XFRM_DEV_RESUME; err = x->type_offload->xmit(x, skb, esp_features); if (err) { if (err == -EINPROGRESS) return NULL; XFRM_INC_STATS(xs_net(x), LINUX_MIB_XFRMOUTSTATEPROTOERROR); kfree_skb(skb); return NULL; } skb_push(skb, skb->data - skb_mac_header(skb)); return skb; } skb_list_walk_safe(skb, skb2, nskb) { esp_features |= skb->dev->gso_partial_features; skb_mark_not_on_list(skb2); xo = xfrm_offload(skb2); xo->flags |= XFRM_DEV_RESUME; xfrm_outer_mode_prep(x, skb2); err = x->type_offload->xmit(x, skb2, esp_features); if (!err) { skb2->next = nskb; } else if (err != -EINPROGRESS) { XFRM_INC_STATS(xs_net(x), LINUX_MIB_XFRMOUTSTATEPROTOERROR); skb2->next = nskb; kfree_skb_list(skb2); return NULL; } else { if (skb == skb2) skb = nskb; else pskb->next = nskb; continue; } skb_push(skb2, skb2->data - skb_mac_header(skb2)); pskb = skb2; } return skb; } EXPORT_SYMBOL_GPL(validate_xmit_xfrm); int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, struct xfrm_user_offload *xuo, struct netlink_ext_ack *extack) { int err; struct dst_entry *dst; struct net_device *dev; struct xfrm_dev_offload *xso = &x->xso; xfrm_address_t *saddr; xfrm_address_t *daddr; bool is_packet_offload; if (xuo->flags & ~(XFRM_OFFLOAD_IPV6 | XFRM_OFFLOAD_INBOUND | XFRM_OFFLOAD_PACKET)) { NL_SET_ERR_MSG(extack, "Unrecognized flags in offload request"); return -EINVAL; } if ((xuo->flags & XFRM_OFFLOAD_INBOUND && x->dir == XFRM_SA_DIR_OUT) || (!(xuo->flags & XFRM_OFFLOAD_INBOUND) && x->dir == XFRM_SA_DIR_IN)) { NL_SET_ERR_MSG(extack, "Mismatched SA and offload direction"); return -EINVAL; } is_packet_offload = xuo->flags & XFRM_OFFLOAD_PACKET; /* We don't yet support TFC padding. */ if (x->tfcpad) { NL_SET_ERR_MSG(extack, "TFC padding can't be offloaded"); return -EINVAL; } dev = dev_get_by_index(net, xuo->ifindex); if (!dev) { struct xfrm_dst_lookup_params params; if (!(xuo->flags & XFRM_OFFLOAD_INBOUND)) { saddr = &x->props.saddr; daddr = &x->id.daddr; } else { saddr = &x->id.daddr; daddr = &x->props.saddr; } memset(&params, 0, sizeof(params)); params.net = net; params.saddr = saddr; params.daddr = daddr; params.mark = xfrm_smark_get(0, x); dst = __xfrm_dst_lookup(x->props.family, &params); if (IS_ERR(dst)) return (is_packet_offload) ? -EINVAL : 0; dev = dst->dev; dev_hold(dev); dst_release(dst); } if (!dev->xfrmdev_ops || !dev->xfrmdev_ops->xdo_dev_state_add) { xso->dev = NULL; dev_put(dev); return (is_packet_offload) ? -EINVAL : 0; } if (!is_packet_offload && x->props.flags & XFRM_STATE_ESN && !dev->xfrmdev_ops->xdo_dev_state_advance_esn) { NL_SET_ERR_MSG(extack, "Device doesn't support offload with ESN"); xso->dev = NULL; dev_put(dev); return -EINVAL; } xfrm_set_type_offload(x); if (!x->type_offload) { NL_SET_ERR_MSG(extack, "Type doesn't support offload"); dev_put(dev); return -EINVAL; } xso->dev = dev; netdev_tracker_alloc(dev, &xso->dev_tracker, GFP_ATOMIC); xso->real_dev = dev; if (xuo->flags & XFRM_OFFLOAD_INBOUND) xso->dir = XFRM_DEV_OFFLOAD_IN; else xso->dir = XFRM_DEV_OFFLOAD_OUT; if (is_packet_offload) xso->type = XFRM_DEV_OFFLOAD_PACKET; else xso->type = XFRM_DEV_OFFLOAD_CRYPTO; err = dev->xfrmdev_ops->xdo_dev_state_add(x, extack); if (err) { xso->dev = NULL; xso->dir = 0; xso->real_dev = NULL; netdev_put(dev, &xso->dev_tracker); xso->type = XFRM_DEV_OFFLOAD_UNSPECIFIED; xfrm_unset_type_offload(x); /* User explicitly requested packet offload mode and configured * policy in addition to the XFRM state. So be civil to users, * and return an error instead of taking fallback path. */ if ((err != -EOPNOTSUPP && !is_packet_offload) || is_packet_offload) { NL_SET_ERR_MSG_WEAK(extack, "Device failed to offload this state"); return err; } } return 0; } EXPORT_SYMBOL_GPL(xfrm_dev_state_add); int xfrm_dev_policy_add(struct net *net, struct xfrm_policy *xp, struct xfrm_user_offload *xuo, u8 dir, struct netlink_ext_ack *extack) { struct xfrm_dev_offload *xdo = &xp->xdo; struct net_device *dev; int err; if (!xuo->flags || xuo->flags & ~XFRM_OFFLOAD_PACKET) { /* We support only packet offload mode and it means * that user must set XFRM_OFFLOAD_PACKET bit. */ NL_SET_ERR_MSG(extack, "Unrecognized flags in offload request"); return -EINVAL; } dev = dev_get_by_index(net, xuo->ifindex); if (!dev) return -EINVAL; if (!dev->xfrmdev_ops || !dev->xfrmdev_ops->xdo_dev_policy_add) { xdo->dev = NULL; dev_put(dev); NL_SET_ERR_MSG(extack, "Policy offload is not supported"); return -EINVAL; } xdo->dev = dev; netdev_tracker_alloc(dev, &xdo->dev_tracker, GFP_ATOMIC); xdo->real_dev = dev; xdo->type = XFRM_DEV_OFFLOAD_PACKET; switch (dir) { case XFRM_POLICY_IN: xdo->dir = XFRM_DEV_OFFLOAD_IN; break; case XFRM_POLICY_OUT: xdo->dir = XFRM_DEV_OFFLOAD_OUT; break; case XFRM_POLICY_FWD: xdo->dir = XFRM_DEV_OFFLOAD_FWD; break; default: xdo->dev = NULL; netdev_put(dev, &xdo->dev_tracker); NL_SET_ERR_MSG(extack, "Unrecognized offload direction"); return -EINVAL; } err = dev->xfrmdev_ops->xdo_dev_policy_add(xp, extack); if (err) { xdo->dev = NULL; xdo->real_dev = NULL; xdo->type = XFRM_DEV_OFFLOAD_UNSPECIFIED; xdo->dir = 0; netdev_put(dev, &xdo->dev_tracker); NL_SET_ERR_MSG_WEAK(extack, "Device failed to offload this policy"); return err; } return 0; } EXPORT_SYMBOL_GPL(xfrm_dev_policy_add); bool xfrm_dev_offload_ok(struct sk_buff *skb, struct xfrm_state *x) { int mtu; struct dst_entry *dst = skb_dst(skb); struct xfrm_dst *xdst = (struct xfrm_dst *)dst; struct net_device *dev = x->xso.dev; bool check_tunnel_size; if (x->xso.type == XFRM_DEV_OFFLOAD_UNSPECIFIED) return false; if ((dev == xfrm_dst_path(dst)->dev) && !xdst->child->xfrm) { mtu = xfrm_state_mtu(x, xdst->child_mtu_cached); if (skb->len <= mtu) goto ok; if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu)) goto ok; } return false; ok: check_tunnel_size = x->xso.type == XFRM_DEV_OFFLOAD_PACKET && x->props.mode == XFRM_MODE_TUNNEL; switch (x->props.family) { case AF_INET: /* Check for IPv4 options */ if (ip_hdr(skb)->ihl != 5) return false; if (check_tunnel_size && xfrm4_tunnel_check_size(skb)) return false; break; case AF_INET6: /* Check for IPv6 extensions */ if (ipv6_ext_hdr(ipv6_hdr(skb)->nexthdr)) return false; if (check_tunnel_size && xfrm6_tunnel_check_size(skb)) return false; break; default: break; } if (dev->xfrmdev_ops->xdo_dev_offload_ok) return dev->xfrmdev_ops->xdo_dev_offload_ok(skb, x); return true; } EXPORT_SYMBOL_GPL(xfrm_dev_offload_ok); void xfrm_dev_resume(struct sk_buff *skb) { struct net_device *dev = skb->dev; int ret = NETDEV_TX_BUSY; struct netdev_queue *txq; struct softnet_data *sd; unsigned long flags; rcu_read_lock(); txq = netdev_core_pick_tx(dev, skb, NULL); HARD_TX_LOCK(dev, txq, smp_processor_id()); if (!netif_xmit_frozen_or_stopped(txq)) skb = dev_hard_start_xmit(skb, dev, txq, &ret); HARD_TX_UNLOCK(dev, txq); if (!dev_xmit_complete(ret)) { local_irq_save(flags); sd = this_cpu_ptr(&softnet_data); skb_queue_tail(&sd->xfrm_backlog, skb); raise_softirq_irqoff(NET_TX_SOFTIRQ); local_irq_restore(flags); } rcu_read_unlock(); } EXPORT_SYMBOL_GPL(xfrm_dev_resume); void xfrm_dev_backlog(struct softnet_data *sd) { struct sk_buff_head *xfrm_backlog = &sd->xfrm_backlog; struct sk_buff_head list; struct sk_buff *skb; if (skb_queue_empty(xfrm_backlog)) return; __skb_queue_head_init(&list); spin_lock(&xfrm_backlog->lock); skb_queue_splice_init(xfrm_backlog, &list); spin_unlock(&xfrm_backlog->lock); while (!skb_queue_empty(&list)) { skb = __skb_dequeue(&list); xfrm_dev_resume(skb); } } #endif static int xfrm_api_check(struct net_device *dev) { #ifdef CONFIG_XFRM_OFFLOAD if ((dev->features & NETIF_F_HW_ESP_TX_CSUM) && !(dev->features & NETIF_F_HW_ESP)) return NOTIFY_BAD; if ((dev->features & NETIF_F_HW_ESP) && (!(dev->xfrmdev_ops && dev->xfrmdev_ops->xdo_dev_state_add && dev->xfrmdev_ops->xdo_dev_state_delete))) return NOTIFY_BAD; #else if (dev->features & (NETIF_F_HW_ESP | NETIF_F_HW_ESP_TX_CSUM)) return NOTIFY_BAD; #endif return NOTIFY_DONE; } static int xfrm_dev_down(struct net_device *dev) { if (dev->features & NETIF_F_HW_ESP) { xfrm_dev_state_flush(dev_net(dev), dev, true); xfrm_dev_policy_flush(dev_net(dev), dev, true); } return NOTIFY_DONE; } static int xfrm_dev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); switch (event) { case NETDEV_REGISTER: return xfrm_api_check(dev); case NETDEV_FEAT_CHANGE: return xfrm_api_check(dev); case NETDEV_DOWN: case NETDEV_UNREGISTER: return xfrm_dev_down(dev); } return NOTIFY_DONE; } static struct notifier_block xfrm_dev_notifier = { .notifier_call = xfrm_dev_event, }; void __init xfrm_dev_init(void) { register_netdevice_notifier(&xfrm_dev_notifier); }
607 1428 725 965 599 1 6 13 268 44 42 1 105 22 86 34 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Authors: Lotsa people, from code originally in tcp */ #ifndef _INET_HASHTABLES_H #define _INET_HASHTABLES_H #include <linux/interrupt.h> #include <linux/ip.h> #include <linux/ipv6.h> #include <linux/list.h> #include <linux/slab.h> #include <linux/socket.h> #include <linux/spinlock.h> #include <linux/types.h> #include <linux/wait.h> #include <net/inet_connection_sock.h> #include <net/inet_sock.h> #include <net/ip.h> #include <net/sock.h> #include <net/route.h> #include <net/tcp_states.h> #include <net/netns/hash.h> #include <linux/refcount.h> #include <asm/byteorder.h> /* This is for all connections with a full identity, no wildcards. * The 'e' prefix stands for Establish, but we really put all sockets * but LISTEN ones. */ struct inet_ehash_bucket { struct hlist_nulls_head chain; }; /* There are a few simple rules, which allow for local port reuse by * an application. In essence: * * 1) Sockets bound to different interfaces may share a local port. * Failing that, goto test 2. * 2) If all sockets have sk->sk_reuse set, and none of them are in * TCP_LISTEN state, the port may be shared. * Failing that, goto test 3. * 3) If all sockets are bound to a specific inet_sk(sk)->rcv_saddr local * address, and none of them are the same, the port may be * shared. * Failing this, the port cannot be shared. * * The interesting point, is test #2. This is what an FTP server does * all day. To optimize this case we use a specific flag bit defined * below. As we add sockets to a bind bucket list, we perform a * check of: (newsk->sk_reuse && (newsk->sk_state != TCP_LISTEN)) * As long as all sockets added to a bind bucket pass this test, * the flag bit will be set. * The resulting situation is that tcp_v[46]_verify_bind() can just check * for this flag bit, if it is set and the socket trying to bind has * sk->sk_reuse set, we don't even have to walk the owners list at all, * we return that it is ok to bind this socket to the requested local port. * * Sounds like a lot of work, but it is worth it. In a more naive * implementation (ie. current FreeBSD etc.) the entire list of ports * must be walked for each data port opened by an ftp server. Needless * to say, this does not scale at all. With a couple thousand FTP * users logged onto your box, isn't it nice to know that new data * ports are created in O(1) time? I thought so. ;-) -DaveM */ #define FASTREUSEPORT_ANY 1 #define FASTREUSEPORT_STRICT 2 struct inet_bind_bucket { possible_net_t ib_net; int l3mdev; unsigned short port; signed char fastreuse; signed char fastreuseport; kuid_t fastuid; #if IS_ENABLED(CONFIG_IPV6) struct in6_addr fast_v6_rcv_saddr; #endif __be32 fast_rcv_saddr; unsigned short fast_sk_family; bool fast_ipv6_only; struct hlist_node node; struct hlist_head bhash2; struct rcu_head rcu; }; struct inet_bind2_bucket { possible_net_t ib_net; int l3mdev; unsigned short port; #if IS_ENABLED(CONFIG_IPV6) unsigned short addr_type; struct in6_addr v6_rcv_saddr; #define rcv_saddr v6_rcv_saddr.s6_addr32[3] #else __be32 rcv_saddr; #endif /* Node in the bhash2 inet_bind_hashbucket chain */ struct hlist_node node; struct hlist_node bhash_node; /* List of sockets hashed to this bucket */ struct hlist_head owners; }; static inline struct net *ib_net(const struct inet_bind_bucket *ib) { return read_pnet(&ib->ib_net); } static inline struct net *ib2_net(const struct inet_bind2_bucket *ib) { return read_pnet(&ib->ib_net); } #define inet_bind_bucket_for_each(tb, head) \ hlist_for_each_entry(tb, head, node) struct inet_bind_hashbucket { spinlock_t lock; struct hlist_head chain; }; /* Sockets can be hashed in established or listening table. * We must use different 'nulls' end-of-chain value for all hash buckets : * A socket might transition from ESTABLISH to LISTEN state without * RCU grace period. A lookup in ehash table needs to handle this case. */ #define LISTENING_NULLS_BASE (1U << 29) struct inet_listen_hashbucket { spinlock_t lock; struct hlist_nulls_head nulls_head; }; /* This is for listening sockets, thus all sockets which possess wildcards. */ #define INET_LHTABLE_SIZE 32 /* Yes, really, this is all you need. */ struct inet_hashinfo { /* This is for sockets with full identity only. Sockets here will * always be without wildcards and will have the following invariant: * * TCP_ESTABLISHED <= sk->sk_state < TCP_CLOSE * */ struct inet_ehash_bucket *ehash; spinlock_t *ehash_locks; unsigned int ehash_mask; unsigned int ehash_locks_mask; /* Ok, let's try this, I give up, we do need a local binding * TCP hash as well as the others for fast bind/connect. */ struct kmem_cache *bind_bucket_cachep; /* This bind table is hashed by local port */ struct inet_bind_hashbucket *bhash; struct kmem_cache *bind2_bucket_cachep; /* This bind table is hashed by local port and sk->sk_rcv_saddr (ipv4) * or sk->sk_v6_rcv_saddr (ipv6). This 2nd bind table is used * primarily for expediting bind conflict resolution. */ struct inet_bind_hashbucket *bhash2; unsigned int bhash_size; /* The 2nd listener table hashed by local port and address */ unsigned int lhash2_mask; struct inet_listen_hashbucket *lhash2; bool pernet; } ____cacheline_aligned_in_smp; static inline struct inet_hashinfo *tcp_or_dccp_get_hashinfo(const struct sock *sk) { #if IS_ENABLED(CONFIG_IP_DCCP) return sk->sk_prot->h.hashinfo ? : sock_net(sk)->ipv4.tcp_death_row.hashinfo; #else return sock_net(sk)->ipv4.tcp_death_row.hashinfo; #endif } static inline struct inet_listen_hashbucket * inet_lhash2_bucket(struct inet_hashinfo *h, u32 hash) { return &h->lhash2[hash & h->lhash2_mask]; } static inline struct inet_ehash_bucket *inet_ehash_bucket( struct inet_hashinfo *hashinfo, unsigned int hash) { return &hashinfo->ehash[hash & hashinfo->ehash_mask]; } static inline spinlock_t *inet_ehash_lockp( struct inet_hashinfo *hashinfo, unsigned int hash) { return &hashinfo->ehash_locks[hash & hashinfo->ehash_locks_mask]; } int inet_ehash_locks_alloc(struct inet_hashinfo *hashinfo); static inline void inet_hashinfo2_free_mod(struct inet_hashinfo *h) { kfree(h->lhash2); h->lhash2 = NULL; } static inline void inet_ehash_locks_free(struct inet_hashinfo *hashinfo) { kvfree(hashinfo->ehash_locks); hashinfo->ehash_locks = NULL; } struct inet_hashinfo *inet_pernet_hashinfo_alloc(struct inet_hashinfo *hashinfo, unsigned int ehash_entries); void inet_pernet_hashinfo_free(struct inet_hashinfo *hashinfo); struct inet_bind_bucket * inet_bind_bucket_create(struct kmem_cache *cachep, struct net *net, struct inet_bind_hashbucket *head, const unsigned short snum, int l3mdev); void inet_bind_bucket_destroy(struct inet_bind_bucket *tb); bool inet_bind_bucket_match(const struct inet_bind_bucket *tb, const struct net *net, unsigned short port, int l3mdev); struct inet_bind2_bucket * inet_bind2_bucket_create(struct kmem_cache *cachep, struct net *net, struct inet_bind_hashbucket *head, struct inet_bind_bucket *tb, const struct sock *sk); void inet_bind2_bucket_destroy(struct kmem_cache *cachep, struct inet_bind2_bucket *tb); struct inet_bind2_bucket * inet_bind2_bucket_find(const struct inet_bind_hashbucket *head, const struct net *net, unsigned short port, int l3mdev, const struct sock *sk); bool inet_bind2_bucket_match_addr_any(const struct inet_bind2_bucket *tb, const struct net *net, unsigned short port, int l3mdev, const struct sock *sk); static inline u32 inet_bhashfn(const struct net *net, const __u16 lport, const u32 bhash_size) { return (lport + net_hash_mix(net)) & (bhash_size - 1); } static inline struct inet_bind_hashbucket * inet_bhashfn_portaddr(const struct inet_hashinfo *hinfo, const struct sock *sk, const struct net *net, unsigned short port) { u32 hash; #if IS_ENABLED(CONFIG_IPV6) if (sk->sk_family == AF_INET6) hash = ipv6_portaddr_hash(net, &sk->sk_v6_rcv_saddr, port); else #endif hash = ipv4_portaddr_hash(net, sk->sk_rcv_saddr, port); return &hinfo->bhash2[hash & (hinfo->bhash_size - 1)]; } struct inet_bind_hashbucket * inet_bhash2_addr_any_hashbucket(const struct sock *sk, const struct net *net, int port); /* This should be called whenever a socket's sk_rcv_saddr (ipv4) or * sk_v6_rcv_saddr (ipv6) changes after it has been binded. The socket's * rcv_saddr field should already have been updated when this is called. */ int inet_bhash2_update_saddr(struct sock *sk, void *saddr, int family); void inet_bhash2_reset_saddr(struct sock *sk); void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb, struct inet_bind2_bucket *tb2, unsigned short port); /* Caller must disable local BH processing. */ int __inet_inherit_port(const struct sock *sk, struct sock *child); void inet_put_port(struct sock *sk); void inet_hashinfo2_init(struct inet_hashinfo *h, const char *name, unsigned long numentries, int scale, unsigned long low_limit, unsigned long high_limit); int inet_hashinfo2_init_mod(struct inet_hashinfo *h); bool inet_ehash_insert(struct sock *sk, struct sock *osk, bool *found_dup_sk); bool inet_ehash_nolisten(struct sock *sk, struct sock *osk, bool *found_dup_sk); int __inet_hash(struct sock *sk, struct sock *osk); int inet_hash(struct sock *sk); void inet_unhash(struct sock *sk); struct sock *__inet_lookup_listener(const struct net *net, struct inet_hashinfo *hashinfo, struct sk_buff *skb, int doff, const __be32 saddr, const __be16 sport, const __be32 daddr, const unsigned short hnum, const int dif, const int sdif); static inline struct sock *inet_lookup_listener(struct net *net, struct inet_hashinfo *hashinfo, struct sk_buff *skb, int doff, __be32 saddr, __be16 sport, __be32 daddr, __be16 dport, int dif, int sdif) { return __inet_lookup_listener(net, hashinfo, skb, doff, saddr, sport, daddr, ntohs(dport), dif, sdif); } /* Socket demux engine toys. */ /* What happens here is ugly; there's a pair of adjacent fields in struct inet_sock; __be16 dport followed by __u16 num. We want to search by pair, so we combine the keys into a single 32bit value and compare with 32bit value read from &...->dport. Let's at least make sure that it's not mixed with anything else... On 64bit targets we combine comparisons with pair of adjacent __be32 fields in the same way. */ #ifdef __BIG_ENDIAN #define INET_COMBINED_PORTS(__sport, __dport) \ ((__force __portpair)(((__force __u32)(__be16)(__sport) << 16) | (__u32)(__dport))) #else /* __LITTLE_ENDIAN */ #define INET_COMBINED_PORTS(__sport, __dport) \ ((__force __portpair)(((__u32)(__dport) << 16) | (__force __u32)(__be16)(__sport))) #endif #ifdef __BIG_ENDIAN #define INET_ADDR_COOKIE(__name, __saddr, __daddr) \ const __addrpair __name = (__force __addrpair) ( \ (((__force __u64)(__be32)(__saddr)) << 32) | \ ((__force __u64)(__be32)(__daddr))) #else /* __LITTLE_ENDIAN */ #define INET_ADDR_COOKIE(__name, __saddr, __daddr) \ const __addrpair __name = (__force __addrpair) ( \ (((__force __u64)(__be32)(__daddr)) << 32) | \ ((__force __u64)(__be32)(__saddr))) #endif /* __BIG_ENDIAN */ static inline bool inet_match(const struct net *net, const struct sock *sk, const __addrpair cookie, const __portpair ports, int dif, int sdif) { if (!net_eq(sock_net(sk), net) || sk->sk_portpair != ports || sk->sk_addrpair != cookie) return false; /* READ_ONCE() paired with WRITE_ONCE() in sock_bindtoindex_locked() */ return inet_sk_bound_dev_eq(net, READ_ONCE(sk->sk_bound_dev_if), dif, sdif); } /* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so we need * not check it for lookups anymore, thanks Alexey. -DaveM */ struct sock *__inet_lookup_established(const struct net *net, struct inet_hashinfo *hashinfo, const __be32 saddr, const __be16 sport, const __be32 daddr, const u16 hnum, const int dif, const int sdif); typedef u32 (inet_ehashfn_t)(const struct net *net, const __be32 laddr, const __u16 lport, const __be32 faddr, const __be16 fport); inet_ehashfn_t inet_ehashfn; INDIRECT_CALLABLE_DECLARE(inet_ehashfn_t udp_ehashfn); struct sock *inet_lookup_reuseport(const struct net *net, struct sock *sk, struct sk_buff *skb, int doff, __be32 saddr, __be16 sport, __be32 daddr, unsigned short hnum, inet_ehashfn_t *ehashfn); struct sock *inet_lookup_run_sk_lookup(const struct net *net, int protocol, struct sk_buff *skb, int doff, __be32 saddr, __be16 sport, __be32 daddr, u16 hnum, const int dif, inet_ehashfn_t *ehashfn); static inline struct sock * inet_lookup_established(struct net *net, struct inet_hashinfo *hashinfo, const __be32 saddr, const __be16 sport, const __be32 daddr, const __be16 dport, const int dif) { return __inet_lookup_established(net, hashinfo, saddr, sport, daddr, ntohs(dport), dif, 0); } static inline struct sock *__inet_lookup(struct net *net, struct inet_hashinfo *hashinfo, struct sk_buff *skb, int doff, const __be32 saddr, const __be16 sport, const __be32 daddr, const __be16 dport, const int dif, const int sdif, bool *refcounted) { u16 hnum = ntohs(dport); struct sock *sk; sk = __inet_lookup_established(net, hashinfo, saddr, sport, daddr, hnum, dif, sdif); *refcounted = true; if (sk) return sk; *refcounted = false; return __inet_lookup_listener(net, hashinfo, skb, doff, saddr, sport, daddr, hnum, dif, sdif); } static inline struct sock *inet_lookup(struct net *net, struct inet_hashinfo *hashinfo, struct sk_buff *skb, int doff, const __be32 saddr, const __be16 sport, const __be32 daddr, const __be16 dport, const int dif) { struct sock *sk; bool refcounted; sk = __inet_lookup(net, hashinfo, skb, doff, saddr, sport, daddr, dport, dif, 0, &refcounted); if (sk && !refcounted && !refcount_inc_not_zero(&sk->sk_refcnt)) sk = NULL; return sk; } static inline struct sock *inet_steal_sock(struct net *net, struct sk_buff *skb, int doff, const __be32 saddr, const __be16 sport, const __be32 daddr, const __be16 dport, bool *refcounted, inet_ehashfn_t *ehashfn) { struct sock *sk, *reuse_sk; bool prefetched; sk = skb_steal_sock(skb, refcounted, &prefetched); if (!sk) return NULL; if (!prefetched || !sk_fullsock(sk)) return sk; if (sk->sk_protocol == IPPROTO_TCP) { if (sk->sk_state != TCP_LISTEN) return sk; } else if (sk->sk_protocol == IPPROTO_UDP) { if (sk->sk_state != TCP_CLOSE) return sk; } else { return sk; } reuse_sk = inet_lookup_reuseport(net, sk, skb, doff, saddr, sport, daddr, ntohs(dport), ehashfn); if (!reuse_sk) return sk; /* We've chosen a new reuseport sock which is never refcounted. This * implies that sk also isn't refcounted. */ WARN_ON_ONCE(*refcounted); return reuse_sk; } static inline struct sock *__inet_lookup_skb(struct inet_hashinfo *hashinfo, struct sk_buff *skb, int doff, const __be16 sport, const __be16 dport, const int sdif, bool *refcounted) { struct net *net = dev_net_rcu(skb_dst(skb)->dev); const struct iphdr *iph = ip_hdr(skb); struct sock *sk; sk = inet_steal_sock(net, skb, doff, iph->saddr, sport, iph->daddr, dport, refcounted, inet_ehashfn); if (IS_ERR(sk)) return NULL; if (sk) return sk; return __inet_lookup(net, hashinfo, skb, doff, iph->saddr, sport, iph->daddr, dport, inet_iif(skb), sdif, refcounted); } static inline void sk_daddr_set(struct sock *sk, __be32 addr) { sk->sk_daddr = addr; /* alias of inet_daddr */ #if IS_ENABLED(CONFIG_IPV6) ipv6_addr_set_v4mapped(addr, &sk->sk_v6_daddr); #endif } static inline void sk_rcv_saddr_set(struct sock *sk, __be32 addr) { sk->sk_rcv_saddr = addr; /* alias of inet_rcv_saddr */ #if IS_ENABLED(CONFIG_IPV6) ipv6_addr_set_v4mapped(addr, &sk->sk_v6_rcv_saddr); #endif } int __inet_hash_connect(struct inet_timewait_death_row *death_row, struct sock *sk, u64 port_offset, u32 hash_port0, int (*check_established)(struct inet_timewait_death_row *, struct sock *, __u16, struct inet_timewait_sock **, bool rcu_lookup, u32 hash)); int inet_hash_connect(struct inet_timewait_death_row *death_row, struct sock *sk); #endif /* _INET_HASHTABLES_H */
116 46 105 84 57 44 44 51 57 94 14 162 162 162 161 15 162 162 162 62 76 43 3 76 10 141 49 154 3 3 2 1 2 7 7 17 37 17 37 4 25 28 199 165 147 15 3 162 3 28 139 138 1 105 193 7 70 60 10 11 106 189 189 152 1 152 61 54 54 7 60 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 // SPDX-License-Identifier: GPL-2.0 /* * Functions related to mapping data to requests */ #include <linux/kernel.h> #include <linux/sched/task_stack.h> #include <linux/module.h> #include <linux/bio.h> #include <linux/blkdev.h> #include <linux/uio.h> #include "blk.h" struct bio_map_data { bool is_our_pages : 1; bool is_null_mapped : 1; struct iov_iter iter; struct iovec iov[]; }; static struct bio_map_data *bio_alloc_map_data(struct iov_iter *data, gfp_t gfp_mask) { struct bio_map_data *bmd; if (data->nr_segs > UIO_MAXIOV) return NULL; bmd = kmalloc(struct_size(bmd, iov, data->nr_segs), gfp_mask); if (!bmd) return NULL; bmd->iter = *data; if (iter_is_iovec(data)) { memcpy(bmd->iov, iter_iov(data), sizeof(struct iovec) * data->nr_segs); bmd->iter.__iov = bmd->iov; } return bmd; } /** * bio_copy_from_iter - copy all pages from iov_iter to bio * @bio: The &struct bio which describes the I/O as destination * @iter: iov_iter as source * * Copy all pages from iov_iter to bio. * Returns 0 on success, or error on failure. */ static int bio_copy_from_iter(struct bio *bio, struct iov_iter *iter) { struct bio_vec *bvec; struct bvec_iter_all iter_all; bio_for_each_segment_all(bvec, bio, iter_all) { ssize_t ret; ret = copy_page_from_iter(bvec->bv_page, bvec->bv_offset, bvec->bv_len, iter); if (!iov_iter_count(iter)) break; if (ret < bvec->bv_len) return -EFAULT; } return 0; } /** * bio_copy_to_iter - copy all pages from bio to iov_iter * @bio: The &struct bio which describes the I/O as source * @iter: iov_iter as destination * * Copy all pages from bio to iov_iter. * Returns 0 on success, or error on failure. */ static int bio_copy_to_iter(struct bio *bio, struct iov_iter iter) { struct bio_vec *bvec; struct bvec_iter_all iter_all; bio_for_each_segment_all(bvec, bio, iter_all) { ssize_t ret; ret = copy_page_to_iter(bvec->bv_page, bvec->bv_offset, bvec->bv_len, &iter); if (!iov_iter_count(&iter)) break; if (ret < bvec->bv_len) return -EFAULT; } return 0; } /** * bio_uncopy_user - finish previously mapped bio * @bio: bio being terminated * * Free pages allocated from bio_copy_user_iov() and write back data * to user space in case of a read. */ static int bio_uncopy_user(struct bio *bio) { struct bio_map_data *bmd = bio->bi_private; int ret = 0; if (!bmd->is_null_mapped) { /* * if we're in a workqueue, the request is orphaned, so * don't copy into a random user address space, just free * and return -EINTR so user space doesn't expect any data. */ if (!current->mm) ret = -EINTR; else if (bio_data_dir(bio) == READ) ret = bio_copy_to_iter(bio, bmd->iter); if (bmd->is_our_pages) bio_free_pages(bio); } kfree(bmd); return ret; } static int bio_copy_user_iov(struct request *rq, struct rq_map_data *map_data, struct iov_iter *iter, gfp_t gfp_mask) { struct bio_map_data *bmd; struct page *page; struct bio *bio; int i = 0, ret; int nr_pages; unsigned int len = iter->count; unsigned int offset = map_data ? offset_in_page(map_data->offset) : 0; bmd = bio_alloc_map_data(iter, gfp_mask); if (!bmd) return -ENOMEM; /* * We need to do a deep copy of the iov_iter including the iovecs. * The caller provided iov might point to an on-stack or otherwise * shortlived one. */ bmd->is_our_pages = !map_data; bmd->is_null_mapped = (map_data && map_data->null_mapped); nr_pages = bio_max_segs(DIV_ROUND_UP(offset + len, PAGE_SIZE)); ret = -ENOMEM; bio = bio_kmalloc(nr_pages, gfp_mask); if (!bio) goto out_bmd; bio_init(bio, NULL, bio->bi_inline_vecs, nr_pages, req_op(rq)); if (map_data) { nr_pages = 1U << map_data->page_order; i = map_data->offset / PAGE_SIZE; } while (len) { unsigned int bytes = PAGE_SIZE; bytes -= offset; if (bytes > len) bytes = len; if (map_data) { if (i == map_data->nr_entries * nr_pages) { ret = -ENOMEM; goto cleanup; } page = map_data->pages[i / nr_pages]; page += (i % nr_pages); i++; } else { page = alloc_page(GFP_NOIO | gfp_mask); if (!page) { ret = -ENOMEM; goto cleanup; } } if (bio_add_page(bio, page, bytes, offset) < bytes) { if (!map_data) __free_page(page); break; } len -= bytes; offset = 0; } if (map_data) map_data->offset += bio->bi_iter.bi_size; /* * success */ if (iov_iter_rw(iter) == WRITE && (!map_data || !map_data->null_mapped)) { ret = bio_copy_from_iter(bio, iter); if (ret) goto cleanup; } else if (map_data && map_data->from_user) { struct iov_iter iter2 = *iter; /* This is the copy-in part of SG_DXFER_TO_FROM_DEV. */ iter2.data_source = ITER_SOURCE; ret = bio_copy_from_iter(bio, &iter2); if (ret) goto cleanup; } else { if (bmd->is_our_pages) zero_fill_bio(bio); iov_iter_advance(iter, bio->bi_iter.bi_size); } bio->bi_private = bmd; ret = blk_rq_append_bio(rq, bio); if (ret) goto cleanup; return 0; cleanup: if (!map_data) bio_free_pages(bio); bio_uninit(bio); kfree(bio); out_bmd: kfree(bmd); return ret; } static void blk_mq_map_bio_put(struct bio *bio) { if (bio->bi_opf & REQ_ALLOC_CACHE) { bio_put(bio); } else { bio_uninit(bio); kfree(bio); } } static struct bio *blk_rq_map_bio_alloc(struct request *rq, unsigned int nr_vecs, gfp_t gfp_mask) { struct bio *bio; if (rq->cmd_flags & REQ_ALLOC_CACHE && (nr_vecs <= BIO_INLINE_VECS)) { bio = bio_alloc_bioset(NULL, nr_vecs, rq->cmd_flags, gfp_mask, &fs_bio_set); if (!bio) return NULL; } else { bio = bio_kmalloc(nr_vecs, gfp_mask); if (!bio) return NULL; bio_init(bio, NULL, bio->bi_inline_vecs, nr_vecs, req_op(rq)); } return bio; } static int bio_map_user_iov(struct request *rq, struct iov_iter *iter, gfp_t gfp_mask) { unsigned int nr_vecs = iov_iter_npages(iter, BIO_MAX_VECS); struct bio *bio; int ret; if (!iov_iter_count(iter)) return -EINVAL; bio = blk_rq_map_bio_alloc(rq, nr_vecs, gfp_mask); if (!bio) return -ENOMEM; ret = bio_iov_iter_get_pages(bio, iter); if (ret) goto out_put; ret = blk_rq_append_bio(rq, bio); if (ret) goto out_release; return 0; out_release: bio_release_pages(bio, false); out_put: blk_mq_map_bio_put(bio); return ret; } static void bio_invalidate_vmalloc_pages(struct bio *bio) { #ifdef ARCH_IMPLEMENTS_FLUSH_KERNEL_VMAP_RANGE if (bio->bi_private && !op_is_write(bio_op(bio))) { unsigned long i, len = 0; for (i = 0; i < bio->bi_vcnt; i++) len += bio->bi_io_vec[i].bv_len; invalidate_kernel_vmap_range(bio->bi_private, len); } #endif } static void bio_map_kern_endio(struct bio *bio) { bio_invalidate_vmalloc_pages(bio); bio_uninit(bio); kfree(bio); } /** * bio_map_kern - map kernel address into bio * @q: the struct request_queue for the bio * @data: pointer to buffer to map * @len: length in bytes * @gfp_mask: allocation flags for bio allocation * * Map the kernel address into a bio suitable for io to a block * device. Returns an error pointer in case of error. */ static struct bio *bio_map_kern(struct request_queue *q, void *data, unsigned int len, gfp_t gfp_mask) { unsigned long kaddr = (unsigned long)data; unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; unsigned long start = kaddr >> PAGE_SHIFT; const int nr_pages = end - start; bool is_vmalloc = is_vmalloc_addr(data); struct page *page; int offset, i; struct bio *bio; bio = bio_kmalloc(nr_pages, gfp_mask); if (!bio) return ERR_PTR(-ENOMEM); bio_init(bio, NULL, bio->bi_inline_vecs, nr_pages, 0); if (is_vmalloc) { flush_kernel_vmap_range(data, len); bio->bi_private = data; } offset = offset_in_page(kaddr); for (i = 0; i < nr_pages; i++) { unsigned int bytes = PAGE_SIZE - offset; if (len <= 0) break; if (bytes > len) bytes = len; if (!is_vmalloc) page = virt_to_page(data); else page = vmalloc_to_page(data); if (bio_add_page(bio, page, bytes, offset) < bytes) { /* we don't support partial mappings */ bio_uninit(bio); kfree(bio); return ERR_PTR(-EINVAL); } data += bytes; len -= bytes; offset = 0; } bio->bi_end_io = bio_map_kern_endio; return bio; } static void bio_copy_kern_endio(struct bio *bio) { bio_free_pages(bio); bio_uninit(bio); kfree(bio); } static void bio_copy_kern_endio_read(struct bio *bio) { char *p = bio->bi_private; struct bio_vec *bvec; struct bvec_iter_all iter_all; bio_for_each_segment_all(bvec, bio, iter_all) { memcpy_from_bvec(p, bvec); p += bvec->bv_len; } bio_copy_kern_endio(bio); } /** * bio_copy_kern - copy kernel address into bio * @q: the struct request_queue for the bio * @data: pointer to buffer to copy * @len: length in bytes * @gfp_mask: allocation flags for bio and page allocation * @reading: data direction is READ * * copy the kernel address into a bio suitable for io to a block * device. Returns an error pointer in case of error. */ static struct bio *bio_copy_kern(struct request_queue *q, void *data, unsigned int len, gfp_t gfp_mask, int reading) { unsigned long kaddr = (unsigned long)data; unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; unsigned long start = kaddr >> PAGE_SHIFT; struct bio *bio; void *p = data; int nr_pages = 0; /* * Overflow, abort */ if (end < start) return ERR_PTR(-EINVAL); nr_pages = end - start; bio = bio_kmalloc(nr_pages, gfp_mask); if (!bio) return ERR_PTR(-ENOMEM); bio_init(bio, NULL, bio->bi_inline_vecs, nr_pages, 0); while (len) { struct page *page; unsigned int bytes = PAGE_SIZE; if (bytes > len) bytes = len; page = alloc_page(GFP_NOIO | __GFP_ZERO | gfp_mask); if (!page) goto cleanup; if (!reading) memcpy(page_address(page), p, bytes); if (bio_add_page(bio, page, bytes, 0) < bytes) break; len -= bytes; p += bytes; } if (reading) { bio->bi_end_io = bio_copy_kern_endio_read; bio->bi_private = data; } else { bio->bi_end_io = bio_copy_kern_endio; } return bio; cleanup: bio_free_pages(bio); bio_uninit(bio); kfree(bio); return ERR_PTR(-ENOMEM); } /* * Append a bio to a passthrough request. Only works if the bio can be merged * into the request based on the driver constraints. */ int blk_rq_append_bio(struct request *rq, struct bio *bio) { const struct queue_limits *lim = &rq->q->limits; unsigned int max_bytes = lim->max_hw_sectors << SECTOR_SHIFT; unsigned int nr_segs = 0; int ret; /* check that the data layout matches the hardware restrictions */ ret = bio_split_rw_at(bio, lim, &nr_segs, max_bytes); if (ret) { /* if we would have to split the bio, copy instead */ if (ret > 0) ret = -EREMOTEIO; return ret; } if (rq->bio) { if (!ll_back_merge_fn(rq, bio, nr_segs)) return -EINVAL; rq->biotail->bi_next = bio; rq->biotail = bio; rq->__data_len += bio->bi_iter.bi_size; bio_crypt_free_ctx(bio); return 0; } rq->nr_phys_segments = nr_segs; rq->bio = rq->biotail = bio; rq->__data_len = bio->bi_iter.bi_size; return 0; } EXPORT_SYMBOL(blk_rq_append_bio); /* Prepare bio for passthrough IO given ITER_BVEC iter */ static int blk_rq_map_user_bvec(struct request *rq, const struct iov_iter *iter) { unsigned int max_bytes = rq->q->limits.max_hw_sectors << SECTOR_SHIFT; struct bio *bio; int ret; if (!iov_iter_count(iter) || iov_iter_count(iter) > max_bytes) return -EINVAL; /* reuse the bvecs from the iterator instead of allocating new ones */ bio = blk_rq_map_bio_alloc(rq, 0, GFP_KERNEL); if (!bio) return -ENOMEM; bio_iov_bvec_set(bio, iter); ret = blk_rq_append_bio(rq, bio); if (ret) blk_mq_map_bio_put(bio); return ret; } /** * blk_rq_map_user_iov - map user data to a request, for passthrough requests * @q: request queue where request should be inserted * @rq: request to map data to * @map_data: pointer to the rq_map_data holding pages (if necessary) * @iter: iovec iterator * @gfp_mask: memory allocation flags * * Description: * Data will be mapped directly for zero copy I/O, if possible. Otherwise * a kernel bounce buffer is used. * * A matching blk_rq_unmap_user() must be issued at the end of I/O, while * still in process context. */ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq, struct rq_map_data *map_data, const struct iov_iter *iter, gfp_t gfp_mask) { bool copy = false, map_bvec = false; unsigned long align = blk_lim_dma_alignment_and_pad(&q->limits); struct bio *bio = NULL; struct iov_iter i; int ret = -EINVAL; if (map_data) copy = true; else if (blk_queue_may_bounce(q)) copy = true; else if (iov_iter_alignment(iter) & align) copy = true; else if (iov_iter_is_bvec(iter)) map_bvec = true; else if (!user_backed_iter(iter)) copy = true; else if (queue_virt_boundary(q)) copy = queue_virt_boundary(q) & iov_iter_gap_alignment(iter); if (map_bvec) { ret = blk_rq_map_user_bvec(rq, iter); if (!ret) return 0; if (ret != -EREMOTEIO) goto fail; /* fall back to copying the data on limits mismatches */ copy = true; } i = *iter; do { if (copy) ret = bio_copy_user_iov(rq, map_data, &i, gfp_mask); else ret = bio_map_user_iov(rq, &i, gfp_mask); if (ret) { if (ret == -EREMOTEIO) ret = -EINVAL; goto unmap_rq; } if (!bio) bio = rq->bio; } while (iov_iter_count(&i)); return 0; unmap_rq: blk_rq_unmap_user(bio); fail: rq->bio = NULL; return ret; } EXPORT_SYMBOL(blk_rq_map_user_iov); int blk_rq_map_user(struct request_queue *q, struct request *rq, struct rq_map_data *map_data, void __user *ubuf, unsigned long len, gfp_t gfp_mask) { struct iov_iter i; int ret = import_ubuf(rq_data_dir(rq), ubuf, len, &i); if (unlikely(ret < 0)) return ret; return blk_rq_map_user_iov(q, rq, map_data, &i, gfp_mask); } EXPORT_SYMBOL(blk_rq_map_user); int blk_rq_map_user_io(struct request *req, struct rq_map_data *map_data, void __user *ubuf, unsigned long buf_len, gfp_t gfp_mask, bool vec, int iov_count, bool check_iter_count, int rw) { int ret = 0; if (vec) { struct iovec fast_iov[UIO_FASTIOV]; struct iovec *iov = fast_iov; struct iov_iter iter; ret = import_iovec(rw, ubuf, iov_count ? iov_count : buf_len, UIO_FASTIOV, &iov, &iter); if (ret < 0) return ret; if (iov_count) { /* SG_IO howto says that the shorter of the two wins */ iov_iter_truncate(&iter, buf_len); if (check_iter_count && !iov_iter_count(&iter)) { kfree(iov); return -EINVAL; } } ret = blk_rq_map_user_iov(req->q, req, map_data, &iter, gfp_mask); kfree(iov); } else if (buf_len) { ret = blk_rq_map_user(req->q, req, map_data, ubuf, buf_len, gfp_mask); } return ret; } EXPORT_SYMBOL(blk_rq_map_user_io); /** * blk_rq_unmap_user - unmap a request with user data * @bio: start of bio list * * Description: * Unmap a rq previously mapped by blk_rq_map_user(). The caller must * supply the original rq->bio from the blk_rq_map_user() return, since * the I/O completion may have changed rq->bio. */ int blk_rq_unmap_user(struct bio *bio) { struct bio *next_bio; int ret = 0, ret2; while (bio) { if (bio->bi_private) { ret2 = bio_uncopy_user(bio); if (ret2 && !ret) ret = ret2; } else { bio_release_pages(bio, bio_data_dir(bio) == READ); } if (bio_integrity(bio)) bio_integrity_unmap_user(bio); next_bio = bio; bio = bio->bi_next; blk_mq_map_bio_put(next_bio); } return ret; } EXPORT_SYMBOL(blk_rq_unmap_user); /** * blk_rq_map_kern - map kernel data to a request, for passthrough requests * @q: request queue where request should be inserted * @rq: request to fill * @kbuf: the kernel buffer * @len: length of user data * @gfp_mask: memory allocation flags * * Description: * Data will be mapped directly if possible. Otherwise a bounce * buffer is used. Can be called multiple times to append multiple * buffers. */ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf, unsigned int len, gfp_t gfp_mask) { int reading = rq_data_dir(rq) == READ; unsigned long addr = (unsigned long) kbuf; struct bio *bio; int ret; if (len > (queue_max_hw_sectors(q) << 9)) return -EINVAL; if (!len || !kbuf) return -EINVAL; if (!blk_rq_aligned(q, addr, len) || object_is_on_stack(kbuf) || blk_queue_may_bounce(q)) bio = bio_copy_kern(q, kbuf, len, gfp_mask, reading); else bio = bio_map_kern(q, kbuf, len, gfp_mask); if (IS_ERR(bio)) return PTR_ERR(bio); bio->bi_opf &= ~REQ_OP_MASK; bio->bi_opf |= req_op(rq); ret = blk_rq_append_bio(rq, bio); if (unlikely(ret)) { bio_uninit(bio); kfree(bio); } return ret; } EXPORT_SYMBOL(blk_rq_map_kern);
226 2449 23 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 /* SPDX-License-Identifier: GPL-2.0 */ /* * sysfs.h - definitions for the device driver filesystem * * Copyright (c) 2001,2002 Patrick Mochel * Copyright (c) 2004 Silicon Graphics, Inc. * Copyright (c) 2007 SUSE Linux Products GmbH * Copyright (c) 2007 Tejun Heo <teheo@suse.de> * * Please see Documentation/filesystems/sysfs.rst for more information. */ #ifndef _SYSFS_H_ #define _SYSFS_H_ #include <linux/kernfs.h> #include <linux/compiler.h> #include <linux/errno.h> #include <linux/list.h> #include <linux/lockdep.h> #include <linux/kobject_ns.h> #include <linux/stat.h> #include <linux/atomic.h> struct kobject; struct module; struct bin_attribute; enum kobj_ns_type; struct attribute { const char *name; umode_t mode; #ifdef CONFIG_DEBUG_LOCK_ALLOC bool ignore_lockdep:1; struct lock_class_key *key; struct lock_class_key skey; #endif }; /** * sysfs_attr_init - initialize a dynamically allocated sysfs attribute * @attr: struct attribute to initialize * * Initialize a dynamically allocated struct attribute so we can * make lockdep happy. This is a new requirement for attributes * and initially this is only needed when lockdep is enabled. * Lockdep gives a nice error when your attribute is added to * sysfs if you don't have this. */ #ifdef CONFIG_DEBUG_LOCK_ALLOC #define sysfs_attr_init(attr) \ do { \ static struct lock_class_key __key; \ \ (attr)->key = &__key; \ } while (0) #else #define sysfs_attr_init(attr) do {} while (0) #endif /** * struct attribute_group - data structure used to declare an attribute group. * @name: Optional: Attribute group name * If specified, the attribute group will be created in a * new subdirectory with this name. Additionally when a * group is named, @is_visible and @is_bin_visible may * return SYSFS_GROUP_INVISIBLE to control visibility of * the directory itself. * @is_visible: Optional: Function to return permissions associated with an * attribute of the group. Will be called repeatedly for * each non-binary attribute in the group. Only read/write * permissions as well as SYSFS_PREALLOC are accepted. Must * return 0 if an attribute is not visible. The returned * value will replace static permissions defined in struct * attribute. Use SYSFS_GROUP_VISIBLE() when assigning this * callback to specify separate _group_visible() and * _attr_visible() handlers. * @is_bin_visible: * Optional: Function to return permissions associated with a * binary attribute of the group. Will be called repeatedly * for each binary attribute in the group. Only read/write * permissions as well as SYSFS_PREALLOC (and the * visibility flags for named groups) are accepted. Must * return 0 if a binary attribute is not visible. The * returned value will replace static permissions defined * in struct bin_attribute. If @is_visible is not set, Use * SYSFS_GROUP_VISIBLE() when assigning this callback to * specify separate _group_visible() and _attr_visible() * handlers. * @bin_size: * Optional: Function to return the size of a binary attribute * of the group. Will be called repeatedly for each binary * attribute in the group. Overwrites the size field embedded * inside the attribute itself. * @attrs: Pointer to NULL terminated list of attributes. * @bin_attrs: Pointer to NULL terminated list of binary attributes. * Either attrs or bin_attrs or both must be provided. */ struct attribute_group { const char *name; umode_t (*is_visible)(struct kobject *, struct attribute *, int); umode_t (*is_bin_visible)(struct kobject *, const struct bin_attribute *, int); size_t (*bin_size)(struct kobject *, const struct bin_attribute *, int); struct attribute **attrs; union { struct bin_attribute **bin_attrs; const struct bin_attribute *const *bin_attrs_new; }; }; #define SYSFS_PREALLOC 010000 #define SYSFS_GROUP_INVISIBLE 020000 /* * DEFINE_SYSFS_GROUP_VISIBLE(name): * A helper macro to pair with the assignment of ".is_visible = * SYSFS_GROUP_VISIBLE(name)", that arranges for the directory * associated with a named attribute_group to optionally be hidden. * This allows for static declaration of attribute_groups, and the * simplification of attribute visibility lifetime that implies, * without polluting sysfs with empty attribute directories. * Ex. * * static umode_t example_attr_visible(struct kobject *kobj, * struct attribute *attr, int n) * { * if (example_attr_condition) * return 0; * else if (ro_attr_condition) * return 0444; * return a->mode; * } * * static bool example_group_visible(struct kobject *kobj) * { * if (example_group_condition) * return false; * return true; * } * * DEFINE_SYSFS_GROUP_VISIBLE(example); * * static struct attribute_group example_group = { * .name = "example", * .is_visible = SYSFS_GROUP_VISIBLE(example), * .attrs = &example_attrs, * }; * * Note that it expects <name>_attr_visible and <name>_group_visible to * be defined. For cases where individual attributes do not need * separate visibility consideration, only entire group visibility at * once, see DEFINE_SIMPLE_SYSFS_GROUP_VISIBLE(). */ #define DEFINE_SYSFS_GROUP_VISIBLE(name) \ static inline umode_t sysfs_group_visible_##name( \ struct kobject *kobj, struct attribute *attr, int n) \ { \ if (n == 0 && !name##_group_visible(kobj)) \ return SYSFS_GROUP_INVISIBLE; \ return name##_attr_visible(kobj, attr, n); \ } /* * DEFINE_SIMPLE_SYSFS_GROUP_VISIBLE(name): * A helper macro to pair with SYSFS_GROUP_VISIBLE() that like * DEFINE_SYSFS_GROUP_VISIBLE() controls group visibility, but does * not require the implementation of a per-attribute visibility * callback. * Ex. * * static bool example_group_visible(struct kobject *kobj) * { * if (example_group_condition) * return false; * return true; * } * * DEFINE_SIMPLE_SYSFS_GROUP_VISIBLE(example); * * static struct attribute_group example_group = { * .name = "example", * .is_visible = SYSFS_GROUP_VISIBLE(example), * .attrs = &example_attrs, * }; */ #define DEFINE_SIMPLE_SYSFS_GROUP_VISIBLE(name) \ static inline umode_t sysfs_group_visible_##name( \ struct kobject *kobj, struct attribute *a, int n) \ { \ if (n == 0 && !name##_group_visible(kobj)) \ return SYSFS_GROUP_INVISIBLE; \ return a->mode; \ } /* * Same as DEFINE_SYSFS_GROUP_VISIBLE, but for groups with only binary * attributes. If an attribute_group defines both text and binary * attributes, the group visibility is determined by the function * specified to is_visible() not is_bin_visible() */ #define DEFINE_SYSFS_BIN_GROUP_VISIBLE(name) \ static inline umode_t sysfs_group_visible_##name( \ struct kobject *kobj, const struct bin_attribute *attr, int n) \ { \ if (n == 0 && !name##_group_visible(kobj)) \ return SYSFS_GROUP_INVISIBLE; \ return name##_attr_visible(kobj, attr, n); \ } #define DEFINE_SIMPLE_SYSFS_BIN_GROUP_VISIBLE(name) \ static inline umode_t sysfs_group_visible_##name( \ struct kobject *kobj, const struct bin_attribute *a, int n) \ { \ if (n == 0 && !name##_group_visible(kobj)) \ return SYSFS_GROUP_INVISIBLE; \ return a->mode; \ } #define SYSFS_GROUP_VISIBLE(fn) sysfs_group_visible_##fn /* * Use these macros to make defining attributes easier. * See include/linux/device.h for examples.. */ #define __ATTR(_name, _mode, _show, _store) { \ .attr = {.name = __stringify(_name), \ .mode = VERIFY_OCTAL_PERMISSIONS(_mode) }, \ .show = _show, \ .store = _store, \ } #define __ATTR_PREALLOC(_name, _mode, _show, _store) { \ .attr = {.name = __stringify(_name), \ .mode = SYSFS_PREALLOC | VERIFY_OCTAL_PERMISSIONS(_mode) },\ .show = _show, \ .store = _store, \ } #define __ATTR_RO(_name) { \ .attr = { .name = __stringify(_name), .mode = 0444 }, \ .show = _name##_show, \ } #define __ATTR_RO_MODE(_name, _mode) { \ .attr = { .name = __stringify(_name), \ .mode = VERIFY_OCTAL_PERMISSIONS(_mode) }, \ .show = _name##_show, \ } #define __ATTR_RW_MODE(_name, _mode) { \ .attr = { .name = __stringify(_name), \ .mode = VERIFY_OCTAL_PERMISSIONS(_mode) }, \ .show = _name##_show, \ .store = _name##_store, \ } #define __ATTR_WO(_name) { \ .attr = { .name = __stringify(_name), .mode = 0200 }, \ .store = _name##_store, \ } #define __ATTR_RW(_name) __ATTR(_name, 0644, _name##_show, _name##_store) #define __ATTR_NULL { .attr = { .name = NULL } } #ifdef CONFIG_DEBUG_LOCK_ALLOC #define __ATTR_IGNORE_LOCKDEP(_name, _mode, _show, _store) { \ .attr = {.name = __stringify(_name), .mode = _mode, \ .ignore_lockdep = true }, \ .show = _show, \ .store = _store, \ } #else #define __ATTR_IGNORE_LOCKDEP __ATTR #endif #define __ATTRIBUTE_GROUPS(_name) \ static const struct attribute_group *_name##_groups[] = { \ &_name##_group, \ NULL, \ } #define ATTRIBUTE_GROUPS(_name) \ static const struct attribute_group _name##_group = { \ .attrs = _name##_attrs, \ }; \ __ATTRIBUTE_GROUPS(_name) #define BIN_ATTRIBUTE_GROUPS(_name) \ static const struct attribute_group _name##_group = { \ .bin_attrs_new = _name##_attrs, \ }; \ __ATTRIBUTE_GROUPS(_name) struct file; struct vm_area_struct; struct address_space; struct bin_attribute { struct attribute attr; size_t size; void *private; struct address_space *(*f_mapping)(void); ssize_t (*read)(struct file *, struct kobject *, struct bin_attribute *, char *, loff_t, size_t); ssize_t (*read_new)(struct file *, struct kobject *, const struct bin_attribute *, char *, loff_t, size_t); ssize_t (*write)(struct file *, struct kobject *, struct bin_attribute *, char *, loff_t, size_t); ssize_t (*write_new)(struct file *, struct kobject *, const struct bin_attribute *, char *, loff_t, size_t); loff_t (*llseek)(struct file *, struct kobject *, const struct bin_attribute *, loff_t, int); int (*mmap)(struct file *, struct kobject *, const struct bin_attribute *attr, struct vm_area_struct *vma); }; /** * sysfs_bin_attr_init - initialize a dynamically allocated bin_attribute * @attr: struct bin_attribute to initialize * * Initialize a dynamically allocated struct bin_attribute so we * can make lockdep happy. This is a new requirement for * attributes and initially this is only needed when lockdep is * enabled. Lockdep gives a nice error when your attribute is * added to sysfs if you don't have this. */ #define sysfs_bin_attr_init(bin_attr) sysfs_attr_init(&(bin_attr)->attr) typedef ssize_t __sysfs_bin_rw_handler_new(struct file *, struct kobject *, const struct bin_attribute *, char *, loff_t, size_t); /* macros to create static binary attributes easier */ #define __BIN_ATTR(_name, _mode, _read, _write, _size) { \ .attr = { .name = __stringify(_name), .mode = _mode }, \ .read = _Generic(_read, \ __sysfs_bin_rw_handler_new * : NULL, \ default : _read \ ), \ .read_new = _Generic(_read, \ __sysfs_bin_rw_handler_new * : _read, \ default : NULL \ ), \ .write = _Generic(_write, \ __sysfs_bin_rw_handler_new * : NULL, \ default : _write \ ), \ .write_new = _Generic(_write, \ __sysfs_bin_rw_handler_new * : _write, \ default : NULL \ ), \ .size = _size, \ } #define __BIN_ATTR_RO(_name, _size) \ __BIN_ATTR(_name, 0444, _name##_read, NULL, _size) #define __BIN_ATTR_WO(_name, _size) \ __BIN_ATTR(_name, 0200, NULL, _name##_write, _size) #define __BIN_ATTR_RW(_name, _size) \ __BIN_ATTR(_name, 0644, _name##_read, _name##_write, _size) #define __BIN_ATTR_NULL __ATTR_NULL #define BIN_ATTR(_name, _mode, _read, _write, _size) \ struct bin_attribute bin_attr_##_name = __BIN_ATTR(_name, _mode, _read, \ _write, _size) #define BIN_ATTR_RO(_name, _size) \ struct bin_attribute bin_attr_##_name = __BIN_ATTR_RO(_name, _size) #define BIN_ATTR_WO(_name, _size) \ struct bin_attribute bin_attr_##_name = __BIN_ATTR_WO(_name, _size) #define BIN_ATTR_RW(_name, _size) \ struct bin_attribute bin_attr_##_name = __BIN_ATTR_RW(_name, _size) #define __BIN_ATTR_ADMIN_RO(_name, _size) \ __BIN_ATTR(_name, 0400, _name##_read, NULL, _size) #define __BIN_ATTR_ADMIN_RW(_name, _size) \ __BIN_ATTR(_name, 0600, _name##_read, _name##_write, _size) #define BIN_ATTR_ADMIN_RO(_name, _size) \ struct bin_attribute bin_attr_##_name = __BIN_ATTR_ADMIN_RO(_name, _size) #define BIN_ATTR_ADMIN_RW(_name, _size) \ struct bin_attribute bin_attr_##_name = __BIN_ATTR_ADMIN_RW(_name, _size) #define __BIN_ATTR_SIMPLE_RO(_name, _mode) \ __BIN_ATTR(_name, _mode, sysfs_bin_attr_simple_read, NULL, 0) #define BIN_ATTR_SIMPLE_RO(_name) \ struct bin_attribute bin_attr_##_name = __BIN_ATTR_SIMPLE_RO(_name, 0444) #define BIN_ATTR_SIMPLE_ADMIN_RO(_name) \ struct bin_attribute bin_attr_##_name = __BIN_ATTR_SIMPLE_RO(_name, 0400) struct sysfs_ops { ssize_t (*show)(struct kobject *, struct attribute *, char *); ssize_t (*store)(struct kobject *, struct attribute *, const char *, size_t); }; #ifdef CONFIG_SYSFS int __must_check sysfs_create_dir_ns(struct kobject *kobj, const void *ns); void sysfs_remove_dir(struct kobject *kobj); int __must_check sysfs_rename_dir_ns(struct kobject *kobj, const char *new_name, const void *new_ns); int __must_check sysfs_move_dir_ns(struct kobject *kobj, struct kobject *new_parent_kobj, const void *new_ns); int __must_check sysfs_create_mount_point(struct kobject *parent_kobj, const char *name); void sysfs_remove_mount_point(struct kobject *parent_kobj, const char *name); int __must_check sysfs_create_file_ns(struct kobject *kobj, const struct attribute *attr, const void *ns); int __must_check sysfs_create_files(struct kobject *kobj, const struct attribute * const *attr); int __must_check sysfs_chmod_file(struct kobject *kobj, const struct attribute *attr, umode_t mode); struct kernfs_node *sysfs_break_active_protection(struct kobject *kobj, const struct attribute *attr); void sysfs_unbreak_active_protection(struct kernfs_node *kn); void sysfs_remove_file_ns(struct kobject *kobj, const struct attribute *attr, const void *ns); bool sysfs_remove_file_self(struct kobject *kobj, const struct attribute *attr); void sysfs_remove_files(struct kobject *kobj, const struct attribute * const *attr); int __must_check sysfs_create_bin_file(struct kobject *kobj, const struct bin_attribute *attr); void sysfs_remove_bin_file(struct kobject *kobj, const struct bin_attribute *attr); int __must_check sysfs_create_link(struct kobject *kobj, struct kobject *target, const char *name); int __must_check sysfs_create_link_nowarn(struct kobject *kobj, struct kobject *target, const char *name); void sysfs_remove_link(struct kobject *kobj, const char *name); int sysfs_rename_link_ns(struct kobject *kobj, struct kobject *target, const char *old_name, const char *new_name, const void *new_ns); void sysfs_delete_link(struct kobject *dir, struct kobject *targ, const char *name); int __must_check sysfs_create_group(struct kobject *kobj, const struct attribute_group *grp); int __must_check sysfs_create_groups(struct kobject *kobj, const struct attribute_group **groups); int __must_check sysfs_update_groups(struct kobject *kobj, const struct attribute_group **groups); int sysfs_update_group(struct kobject *kobj, const struct attribute_group *grp); void sysfs_remove_group(struct kobject *kobj, const struct attribute_group *grp); void sysfs_remove_groups(struct kobject *kobj, const struct attribute_group **groups); int sysfs_add_file_to_group(struct kobject *kobj, const struct attribute *attr, const char *group); void sysfs_remove_file_from_group(struct kobject *kobj, const struct attribute *attr, const char *group); int sysfs_merge_group(struct kobject *kobj, const struct attribute_group *grp); void sysfs_unmerge_group(struct kobject *kobj, const struct attribute_group *grp); int sysfs_add_link_to_group(struct kobject *kobj, const char *group_name, struct kobject *target, const char *link_name); void sysfs_remove_link_from_group(struct kobject *kobj, const char *group_name, const char *link_name); int compat_only_sysfs_link_entry_to_kobj(struct kobject *kobj, struct kobject *target_kobj, const char *target_name, const char *symlink_name); void sysfs_notify(struct kobject *kobj, const char *dir, const char *attr); int __must_check sysfs_init(void); static inline void sysfs_enable_ns(struct kernfs_node *kn) { return kernfs_enable_ns(kn); } int sysfs_file_change_owner(struct kobject *kobj, const char *name, kuid_t kuid, kgid_t kgid); int sysfs_change_owner(struct kobject *kobj, kuid_t kuid, kgid_t kgid); int sysfs_link_change_owner(struct kobject *kobj, struct kobject *targ, const char *name, kuid_t kuid, kgid_t kgid); int sysfs_groups_change_owner(struct kobject *kobj, const struct attribute_group **groups, kuid_t kuid, kgid_t kgid); int sysfs_group_change_owner(struct kobject *kobj, const struct attribute_group *groups, kuid_t kuid, kgid_t kgid); __printf(2, 3) int sysfs_emit(char *buf, const char *fmt, ...); __printf(3, 4) int sysfs_emit_at(char *buf, int at, const char *fmt, ...); ssize_t sysfs_bin_attr_simple_read(struct file *file, struct kobject *kobj, const struct bin_attribute *attr, char *buf, loff_t off, size_t count); #else /* CONFIG_SYSFS */ static inline int sysfs_create_dir_ns(struct kobject *kobj, const void *ns) { return 0; } static inline void sysfs_remove_dir(struct kobject *kobj) { } static inline int sysfs_rename_dir_ns(struct kobject *kobj, const char *new_name, const void *new_ns) { return 0; } static inline int sysfs_move_dir_ns(struct kobject *kobj, struct kobject *new_parent_kobj, const void *new_ns) { return 0; } static inline int sysfs_create_mount_point(struct kobject *parent_kobj, const char *name) { return 0; } static inline void sysfs_remove_mount_point(struct kobject *parent_kobj, const char *name) { } static inline int sysfs_create_file_ns(struct kobject *kobj, const struct attribute *attr, const void *ns) { return 0; } static inline int sysfs_create_files(struct kobject *kobj, const struct attribute * const *attr) { return 0; } static inline int sysfs_chmod_file(struct kobject *kobj, const struct attribute *attr, umode_t mode) { return 0; } static inline struct kernfs_node * sysfs_break_active_protection(struct kobject *kobj, const struct attribute *attr) { return NULL; } static inline void sysfs_unbreak_active_protection(struct kernfs_node *kn) { } static inline void sysfs_remove_file_ns(struct kobject *kobj, const struct attribute *attr, const void *ns) { } static inline bool sysfs_remove_file_self(struct kobject *kobj, const struct attribute *attr) { return false; } static inline void sysfs_remove_files(struct kobject *kobj, const struct attribute * const *attr) { } static inline int sysfs_create_bin_file(struct kobject *kobj, const struct bin_attribute *attr) { return 0; } static inline void sysfs_remove_bin_file(struct kobject *kobj, const struct bin_attribute *attr) { } static inline int sysfs_create_link(struct kobject *kobj, struct kobject *target, const char *name) { return 0; } static inline int sysfs_create_link_nowarn(struct kobject *kobj, struct kobject *target, const char *name) { return 0; } static inline void sysfs_remove_link(struct kobject *kobj, const char *name) { } static inline int sysfs_rename_link_ns(struct kobject *k, struct kobject *t, const char *old_name, const char *new_name, const void *ns) { return 0; } static inline void sysfs_delete_link(struct kobject *k, struct kobject *t, const char *name) { } static inline int sysfs_create_group(struct kobject *kobj, const struct attribute_group *grp) { return 0; } static inline int sysfs_create_groups(struct kobject *kobj, const struct attribute_group **groups) { return 0; } static inline int sysfs_update_groups(struct kobject *kobj, const struct attribute_group **groups) { return 0; } static inline int sysfs_update_group(struct kobject *kobj, const struct attribute_group *grp) { return 0; } static inline void sysfs_remove_group(struct kobject *kobj, const struct attribute_group *grp) { } static inline void sysfs_remove_groups(struct kobject *kobj, const struct attribute_group **groups) { } static inline int sysfs_add_file_to_group(struct kobject *kobj, const struct attribute *attr, const char *group) { return 0; } static inline void sysfs_remove_file_from_group(struct kobject *kobj, const struct attribute *attr, const char *group) { } static inline int sysfs_merge_group(struct kobject *kobj, const struct attribute_group *grp) { return 0; } static inline void sysfs_unmerge_group(struct kobject *kobj, const struct attribute_group *grp) { } static inline int sysfs_add_link_to_group(struct kobject *kobj, const char *group_name, struct kobject *target, const char *link_name) { return 0; } static inline void sysfs_remove_link_from_group(struct kobject *kobj, const char *group_name, const char *link_name) { } static inline int compat_only_sysfs_link_entry_to_kobj(struct kobject *kobj, struct kobject *target_kobj, const char *target_name, const char *symlink_name) { return 0; } static inline void sysfs_notify(struct kobject *kobj, const char *dir, const char *attr) { } static inline int __must_check sysfs_init(void) { return 0; } static inline void sysfs_enable_ns(struct kernfs_node *kn) { } static inline int sysfs_file_change_owner(struct kobject *kobj, const char *name, kuid_t kuid, kgid_t kgid) { return 0; } static inline int sysfs_link_change_owner(struct kobject *kobj, struct kobject *targ, const char *name, kuid_t kuid, kgid_t kgid) { return 0; } static inline int sysfs_change_owner(struct kobject *kobj, kuid_t kuid, kgid_t kgid) { return 0; } static inline int sysfs_groups_change_owner(struct kobject *kobj, const struct attribute_group **groups, kuid_t kuid, kgid_t kgid) { return 0; } static inline int sysfs_group_change_owner(struct kobject *kobj, const struct attribute_group *groups, kuid_t kuid, kgid_t kgid) { return 0; } __printf(2, 3) static inline int sysfs_emit(char *buf, const char *fmt, ...) { return 0; } __printf(3, 4) static inline int sysfs_emit_at(char *buf, int at, const char *fmt, ...) { return 0; } static inline ssize_t sysfs_bin_attr_simple_read(struct file *file, struct kobject *kobj, const struct bin_attribute *attr, char *buf, loff_t off, size_t count) { return 0; } #endif /* CONFIG_SYSFS */ static inline int __must_check sysfs_create_file(struct kobject *kobj, const struct attribute *attr) { return sysfs_create_file_ns(kobj, attr, NULL); } static inline void sysfs_remove_file(struct kobject *kobj, const struct attribute *attr) { sysfs_remove_file_ns(kobj, attr, NULL); } static inline int sysfs_rename_link(struct kobject *kobj, struct kobject *target, const char *old_name, const char *new_name) { return sysfs_rename_link_ns(kobj, target, old_name, new_name, NULL); } static inline void sysfs_notify_dirent(struct kernfs_node *kn) { kernfs_notify(kn); } static inline struct kernfs_node *sysfs_get_dirent(struct kernfs_node *parent, const char *name) { return kernfs_find_and_get(parent, name); } static inline struct kernfs_node *sysfs_get(struct kernfs_node *kn) { kernfs_get(kn); return kn; } static inline void sysfs_put(struct kernfs_node *kn) { kernfs_put(kn); } #endif /* _SYSFS_H_ */
16 3 13 3 6 3 1 5 5 1 4 4 1 2 2 2 2 3 1 2 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 // SPDX-License-Identifier: GPL-2.0-or-later /* DataCenter TCP (DCTCP) congestion control. * * http://simula.stanford.edu/~alizade/Site/DCTCP.html * * This is an implementation of DCTCP over Reno, an enhancement to the * TCP congestion control algorithm designed for data centers. DCTCP * leverages Explicit Congestion Notification (ECN) in the network to * provide multi-bit feedback to the end hosts. DCTCP's goal is to meet * the following three data center transport requirements: * * - High burst tolerance (incast due to partition/aggregate) * - Low latency (short flows, queries) * - High throughput (continuous data updates, large file transfers) * with commodity shallow buffered switches * * The algorithm is described in detail in the following two papers: * * 1) Mohammad Alizadeh, Albert Greenberg, David A. Maltz, Jitendra Padhye, * Parveen Patel, Balaji Prabhakar, Sudipta Sengupta, and Murari Sridharan: * "Data Center TCP (DCTCP)", Data Center Networks session * Proc. ACM SIGCOMM, New Delhi, 2010. * http://simula.stanford.edu/~alizade/Site/DCTCP_files/dctcp-final.pdf * * 2) Mohammad Alizadeh, Adel Javanmard, and Balaji Prabhakar: * "Analysis of DCTCP: Stability, Convergence, and Fairness" * Proc. ACM SIGMETRICS, San Jose, 2011. * http://simula.stanford.edu/~alizade/Site/DCTCP_files/dctcp_analysis-full.pdf * * Initial prototype from Abdul Kabbani, Masato Yasuda and Mohammad Alizadeh. * * Authors: * * Daniel Borkmann <dborkman@redhat.com> * Florian Westphal <fw@strlen.de> * Glenn Judd <glenn.judd@morganstanley.com> */ #include <linux/btf.h> #include <linux/btf_ids.h> #include <linux/module.h> #include <linux/mm.h> #include <net/tcp.h> #include <linux/inet_diag.h> #include "tcp_dctcp.h" #define DCTCP_MAX_ALPHA 1024U struct dctcp { u32 old_delivered; u32 old_delivered_ce; u32 prior_rcv_nxt; u32 dctcp_alpha; u32 next_seq; u32 ce_state; u32 loss_cwnd; struct tcp_plb_state plb; }; static unsigned int dctcp_shift_g __read_mostly = 4; /* g = 1/2^4 */ static int dctcp_shift_g_set(const char *val, const struct kernel_param *kp) { return param_set_uint_minmax(val, kp, 0, 10); } static const struct kernel_param_ops dctcp_shift_g_ops = { .set = dctcp_shift_g_set, .get = param_get_uint, }; module_param_cb(dctcp_shift_g, &dctcp_shift_g_ops, &dctcp_shift_g, 0644); MODULE_PARM_DESC(dctcp_shift_g, "parameter g for updating dctcp_alpha"); static unsigned int dctcp_alpha_on_init __read_mostly = DCTCP_MAX_ALPHA; module_param(dctcp_alpha_on_init, uint, 0644); MODULE_PARM_DESC(dctcp_alpha_on_init, "parameter for initial alpha value"); static struct tcp_congestion_ops dctcp_reno; static void dctcp_reset(const struct tcp_sock *tp, struct dctcp *ca) { ca->next_seq = tp->snd_nxt; ca->old_delivered = tp->delivered; ca->old_delivered_ce = tp->delivered_ce; } __bpf_kfunc static void dctcp_init(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); if (tcp_ecn_mode_any(tp) || (sk->sk_state == TCP_LISTEN || sk->sk_state == TCP_CLOSE)) { struct dctcp *ca = inet_csk_ca(sk); ca->prior_rcv_nxt = tp->rcv_nxt; ca->dctcp_alpha = min(dctcp_alpha_on_init, DCTCP_MAX_ALPHA); ca->loss_cwnd = 0; ca->ce_state = 0; dctcp_reset(tp, ca); tcp_plb_init(sk, &ca->plb); return; } /* No ECN support? Fall back to Reno. Also need to clear * ECT from sk since it is set during 3WHS for DCTCP. */ inet_csk(sk)->icsk_ca_ops = &dctcp_reno; INET_ECN_dontxmit(sk); } __bpf_kfunc static u32 dctcp_ssthresh(struct sock *sk) { struct dctcp *ca = inet_csk_ca(sk); struct tcp_sock *tp = tcp_sk(sk); ca->loss_cwnd = tcp_snd_cwnd(tp); return max(tcp_snd_cwnd(tp) - ((tcp_snd_cwnd(tp) * ca->dctcp_alpha) >> 11U), 2U); } __bpf_kfunc static void dctcp_update_alpha(struct sock *sk, u32 flags) { const struct tcp_sock *tp = tcp_sk(sk); struct dctcp *ca = inet_csk_ca(sk); /* Expired RTT */ if (!before(tp->snd_una, ca->next_seq)) { u32 delivered = tp->delivered - ca->old_delivered; u32 delivered_ce = tp->delivered_ce - ca->old_delivered_ce; u32 alpha = ca->dctcp_alpha; u32 ce_ratio = 0; if (delivered > 0) { /* dctcp_alpha keeps EWMA of fraction of ECN marked * packets. Because of EWMA smoothing, PLB reaction can * be slow so we use ce_ratio which is an instantaneous * measure of congestion. ce_ratio is the fraction of * ECN marked packets in the previous RTT. */ if (delivered_ce > 0) ce_ratio = (delivered_ce << TCP_PLB_SCALE) / delivered; tcp_plb_update_state(sk, &ca->plb, (int)ce_ratio); tcp_plb_check_rehash(sk, &ca->plb); } /* alpha = (1 - g) * alpha + g * F */ alpha -= min_not_zero(alpha, alpha >> dctcp_shift_g); if (delivered_ce) { /* If dctcp_shift_g == 1, a 32bit value would overflow * after 8 M packets. */ delivered_ce <<= (10 - dctcp_shift_g); delivered_ce /= max(1U, delivered); alpha = min(alpha + delivered_ce, DCTCP_MAX_ALPHA); } /* dctcp_alpha can be read from dctcp_get_info() without * synchro, so we ask compiler to not use dctcp_alpha * as a temporary variable in prior operations. */ WRITE_ONCE(ca->dctcp_alpha, alpha); dctcp_reset(tp, ca); } } static void dctcp_react_to_loss(struct sock *sk) { struct dctcp *ca = inet_csk_ca(sk); struct tcp_sock *tp = tcp_sk(sk); ca->loss_cwnd = tcp_snd_cwnd(tp); tp->snd_ssthresh = max(tcp_snd_cwnd(tp) >> 1U, 2U); } __bpf_kfunc static void dctcp_state(struct sock *sk, u8 new_state) { if (new_state == TCP_CA_Recovery && new_state != inet_csk(sk)->icsk_ca_state) dctcp_react_to_loss(sk); /* We handle RTO in dctcp_cwnd_event to ensure that we perform only * one loss-adjustment per RTT. */ } __bpf_kfunc static void dctcp_cwnd_event(struct sock *sk, enum tcp_ca_event ev) { struct dctcp *ca = inet_csk_ca(sk); switch (ev) { case CA_EVENT_ECN_IS_CE: case CA_EVENT_ECN_NO_CE: dctcp_ece_ack_update(sk, ev, &ca->prior_rcv_nxt, &ca->ce_state); break; case CA_EVENT_LOSS: tcp_plb_update_state_upon_rto(sk, &ca->plb); dctcp_react_to_loss(sk); break; case CA_EVENT_TX_START: tcp_plb_check_rehash(sk, &ca->plb); /* Maybe rehash when inflight is 0 */ break; default: /* Don't care for the rest. */ break; } } static size_t dctcp_get_info(struct sock *sk, u32 ext, int *attr, union tcp_cc_info *info) { const struct dctcp *ca = inet_csk_ca(sk); const struct tcp_sock *tp = tcp_sk(sk); /* Fill it also in case of VEGASINFO due to req struct limits. * We can still correctly retrieve it later. */ if (ext & (1 << (INET_DIAG_DCTCPINFO - 1)) || ext & (1 << (INET_DIAG_VEGASINFO - 1))) { memset(&info->dctcp, 0, sizeof(info->dctcp)); if (inet_csk(sk)->icsk_ca_ops != &dctcp_reno) { info->dctcp.dctcp_enabled = 1; info->dctcp.dctcp_ce_state = (u16) ca->ce_state; info->dctcp.dctcp_alpha = ca->dctcp_alpha; info->dctcp.dctcp_ab_ecn = tp->mss_cache * (tp->delivered_ce - ca->old_delivered_ce); info->dctcp.dctcp_ab_tot = tp->mss_cache * (tp->delivered - ca->old_delivered); } *attr = INET_DIAG_DCTCPINFO; return sizeof(info->dctcp); } return 0; } __bpf_kfunc static u32 dctcp_cwnd_undo(struct sock *sk) { const struct dctcp *ca = inet_csk_ca(sk); struct tcp_sock *tp = tcp_sk(sk); return max(tcp_snd_cwnd(tp), ca->loss_cwnd); } static struct tcp_congestion_ops dctcp __read_mostly = { .init = dctcp_init, .in_ack_event = dctcp_update_alpha, .cwnd_event = dctcp_cwnd_event, .ssthresh = dctcp_ssthresh, .cong_avoid = tcp_reno_cong_avoid, .undo_cwnd = dctcp_cwnd_undo, .set_state = dctcp_state, .get_info = dctcp_get_info, .flags = TCP_CONG_NEEDS_ECN, .owner = THIS_MODULE, .name = "dctcp", }; static struct tcp_congestion_ops dctcp_reno __read_mostly = { .ssthresh = tcp_reno_ssthresh, .cong_avoid = tcp_reno_cong_avoid, .undo_cwnd = tcp_reno_undo_cwnd, .get_info = dctcp_get_info, .owner = THIS_MODULE, .name = "dctcp-reno", }; BTF_KFUNCS_START(tcp_dctcp_check_kfunc_ids) BTF_ID_FLAGS(func, dctcp_init) BTF_ID_FLAGS(func, dctcp_update_alpha) BTF_ID_FLAGS(func, dctcp_cwnd_event) BTF_ID_FLAGS(func, dctcp_ssthresh) BTF_ID_FLAGS(func, dctcp_cwnd_undo) BTF_ID_FLAGS(func, dctcp_state) BTF_KFUNCS_END(tcp_dctcp_check_kfunc_ids) static const struct btf_kfunc_id_set tcp_dctcp_kfunc_set = { .owner = THIS_MODULE, .set = &tcp_dctcp_check_kfunc_ids, }; static int __init dctcp_register(void) { int ret; BUILD_BUG_ON(sizeof(struct dctcp) > ICSK_CA_PRIV_SIZE); ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &tcp_dctcp_kfunc_set); if (ret < 0) return ret; return tcp_register_congestion_control(&dctcp); } static void __exit dctcp_unregister(void) { tcp_unregister_congestion_control(&dctcp); } module_init(dctcp_register); module_exit(dctcp_unregister); MODULE_AUTHOR("Daniel Borkmann <dborkman@redhat.com>"); MODULE_AUTHOR("Florian Westphal <fw@strlen.de>"); MODULE_AUTHOR("Glenn Judd <glenn.judd@morganstanley.com>"); MODULE_LICENSE("GPL v2"); MODULE_DESCRIPTION("DataCenter TCP (DCTCP)");
102 5 1 1 2 1 1 1 3 1 3 1 2 2 4 4 4 56 56 56 55 135 791 851 852 4 4 5 8 8 549 549 791 3 791 791 5 790 3 791 732 102 169 741 5 741 102 102 790 789 709 790 773 23 790 549 462 87 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 // SPDX-License-Identifier: GPL-2.0 /* * inode.c - part of debugfs, a tiny little debug file system * * Copyright (C) 2004,2019 Greg Kroah-Hartman <greg@kroah.com> * Copyright (C) 2004 IBM Inc. * Copyright (C) 2019 Linux Foundation <gregkh@linuxfoundation.org> * * debugfs is for people to use instead of /proc or /sys. * See ./Documentation/core-api/kernel-api.rst for more details. */ #define pr_fmt(fmt) "debugfs: " fmt #include <linux/module.h> #include <linux/fs.h> #include <linux/fs_context.h> #include <linux/fs_parser.h> #include <linux/pagemap.h> #include <linux/init.h> #include <linux/kobject.h> #include <linux/namei.h> #include <linux/debugfs.h> #include <linux/fsnotify.h> #include <linux/string.h> #include <linux/seq_file.h> #include <linux/magic.h> #include <linux/slab.h> #include <linux/security.h> #include "internal.h" #define DEBUGFS_DEFAULT_MODE 0700 static struct vfsmount *debugfs_mount; static int debugfs_mount_count; static bool debugfs_registered; static unsigned int debugfs_allow __ro_after_init = DEFAULT_DEBUGFS_ALLOW_BITS; /* * Don't allow access attributes to be changed whilst the kernel is locked down * so that we can use the file mode as part of a heuristic to determine whether * to lock down individual files. */ static int debugfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *ia) { int ret; if (ia->ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID)) { ret = security_locked_down(LOCKDOWN_DEBUGFS); if (ret) return ret; } return simple_setattr(&nop_mnt_idmap, dentry, ia); } static const struct inode_operations debugfs_file_inode_operations = { .setattr = debugfs_setattr, }; static const struct inode_operations debugfs_dir_inode_operations = { .lookup = simple_lookup, .setattr = debugfs_setattr, }; static const struct inode_operations debugfs_symlink_inode_operations = { .get_link = simple_get_link, .setattr = debugfs_setattr, }; static struct inode *debugfs_get_inode(struct super_block *sb) { struct inode *inode = new_inode(sb); if (inode) { inode->i_ino = get_next_ino(); simple_inode_init_ts(inode); } return inode; } struct debugfs_fs_info { kuid_t uid; kgid_t gid; umode_t mode; /* Opt_* bitfield. */ unsigned int opts; }; enum { Opt_uid, Opt_gid, Opt_mode, Opt_source, }; static const struct fs_parameter_spec debugfs_param_specs[] = { fsparam_gid ("gid", Opt_gid), fsparam_u32oct ("mode", Opt_mode), fsparam_uid ("uid", Opt_uid), fsparam_string ("source", Opt_source), {} }; static int debugfs_parse_param(struct fs_context *fc, struct fs_parameter *param) { struct debugfs_fs_info *opts = fc->s_fs_info; struct fs_parse_result result; int opt; opt = fs_parse(fc, debugfs_param_specs, param, &result); if (opt < 0) { /* * We might like to report bad mount options here; but * traditionally debugfs has ignored all mount options */ if (opt == -ENOPARAM) return 0; return opt; } switch (opt) { case Opt_uid: opts->uid = result.uid; break; case Opt_gid: opts->gid = result.gid; break; case Opt_mode: opts->mode = result.uint_32 & S_IALLUGO; break; case Opt_source: if (fc->source) return invalfc(fc, "Multiple sources specified"); fc->source = param->string; param->string = NULL; break; /* * We might like to report bad mount options here; * but traditionally debugfs has ignored all mount options */ } opts->opts |= BIT(opt); return 0; } static void _debugfs_apply_options(struct super_block *sb, bool remount) { struct debugfs_fs_info *fsi = sb->s_fs_info; struct inode *inode = d_inode(sb->s_root); /* * On remount, only reset mode/uid/gid if they were provided as mount * options. */ if (!remount || fsi->opts & BIT(Opt_mode)) { inode->i_mode &= ~S_IALLUGO; inode->i_mode |= fsi->mode; } if (!remount || fsi->opts & BIT(Opt_uid)) inode->i_uid = fsi->uid; if (!remount || fsi->opts & BIT(Opt_gid)) inode->i_gid = fsi->gid; } static void debugfs_apply_options(struct super_block *sb) { _debugfs_apply_options(sb, false); } static void debugfs_apply_options_remount(struct super_block *sb) { _debugfs_apply_options(sb, true); } static int debugfs_reconfigure(struct fs_context *fc) { struct super_block *sb = fc->root->d_sb; struct debugfs_fs_info *sb_opts = sb->s_fs_info; struct debugfs_fs_info *new_opts = fc->s_fs_info; sync_filesystem(sb); /* structure copy of new mount options to sb */ *sb_opts = *new_opts; debugfs_apply_options_remount(sb); return 0; } static int debugfs_show_options(struct seq_file *m, struct dentry *root) { struct debugfs_fs_info *fsi = root->d_sb->s_fs_info; if (!uid_eq(fsi->uid, GLOBAL_ROOT_UID)) seq_printf(m, ",uid=%u", from_kuid_munged(&init_user_ns, fsi->uid)); if (!gid_eq(fsi->gid, GLOBAL_ROOT_GID)) seq_printf(m, ",gid=%u", from_kgid_munged(&init_user_ns, fsi->gid)); if (fsi->mode != DEBUGFS_DEFAULT_MODE) seq_printf(m, ",mode=%o", fsi->mode); return 0; } static struct kmem_cache *debugfs_inode_cachep __ro_after_init; static void init_once(void *foo) { struct debugfs_inode_info *info = foo; inode_init_once(&info->vfs_inode); } static struct inode *debugfs_alloc_inode(struct super_block *sb) { struct debugfs_inode_info *info; info = alloc_inode_sb(sb, debugfs_inode_cachep, GFP_KERNEL); if (!info) return NULL; return &info->vfs_inode; } static void debugfs_free_inode(struct inode *inode) { if (S_ISLNK(inode->i_mode)) kfree(inode->i_link); kmem_cache_free(debugfs_inode_cachep, DEBUGFS_I(inode)); } static const struct super_operations debugfs_super_operations = { .statfs = simple_statfs, .show_options = debugfs_show_options, .alloc_inode = debugfs_alloc_inode, .free_inode = debugfs_free_inode, }; static void debugfs_release_dentry(struct dentry *dentry) { struct debugfs_fsdata *fsd = dentry->d_fsdata; if (fsd) { WARN_ON(!list_empty(&fsd->cancellations)); mutex_destroy(&fsd->cancellations_mtx); } kfree(fsd); } static struct vfsmount *debugfs_automount(struct path *path) { struct inode *inode = path->dentry->d_inode; return DEBUGFS_I(inode)->automount(path->dentry, inode->i_private); } static const struct dentry_operations debugfs_dops = { .d_delete = always_delete_dentry, .d_release = debugfs_release_dentry, .d_automount = debugfs_automount, }; static int debugfs_fill_super(struct super_block *sb, struct fs_context *fc) { static const struct tree_descr debug_files[] = {{""}}; int err; err = simple_fill_super(sb, DEBUGFS_MAGIC, debug_files); if (err) return err; sb->s_op = &debugfs_super_operations; sb->s_d_op = &debugfs_dops; debugfs_apply_options(sb); return 0; } static int debugfs_get_tree(struct fs_context *fc) { if (!(debugfs_allow & DEBUGFS_ALLOW_API)) return -EPERM; return get_tree_single(fc, debugfs_fill_super); } static void debugfs_free_fc(struct fs_context *fc) { kfree(fc->s_fs_info); } static const struct fs_context_operations debugfs_context_ops = { .free = debugfs_free_fc, .parse_param = debugfs_parse_param, .get_tree = debugfs_get_tree, .reconfigure = debugfs_reconfigure, }; static int debugfs_init_fs_context(struct fs_context *fc) { struct debugfs_fs_info *fsi; fsi = kzalloc(sizeof(struct debugfs_fs_info), GFP_KERNEL); if (!fsi) return -ENOMEM; fsi->mode = DEBUGFS_DEFAULT_MODE; fc->s_fs_info = fsi; fc->ops = &debugfs_context_ops; return 0; } static struct file_system_type debug_fs_type = { .owner = THIS_MODULE, .name = "debugfs", .init_fs_context = debugfs_init_fs_context, .parameters = debugfs_param_specs, .kill_sb = kill_litter_super, }; MODULE_ALIAS_FS("debugfs"); /** * debugfs_lookup() - look up an existing debugfs file * @name: a pointer to a string containing the name of the file to look up. * @parent: a pointer to the parent dentry of the file. * * This function will return a pointer to a dentry if it succeeds. If the file * doesn't exist or an error occurs, %NULL will be returned. The returned * dentry must be passed to dput() when it is no longer needed. * * If debugfs is not enabled in the kernel, the value -%ENODEV will be * returned. */ struct dentry *debugfs_lookup(const char *name, struct dentry *parent) { struct dentry *dentry; if (!debugfs_initialized() || IS_ERR_OR_NULL(name) || IS_ERR(parent)) return NULL; if (!parent) parent = debugfs_mount->mnt_root; dentry = lookup_positive_unlocked(name, parent, strlen(name)); if (IS_ERR(dentry)) return NULL; return dentry; } EXPORT_SYMBOL_GPL(debugfs_lookup); static struct dentry *start_creating(const char *name, struct dentry *parent) { struct dentry *dentry; int error; if (!(debugfs_allow & DEBUGFS_ALLOW_API)) return ERR_PTR(-EPERM); if (!debugfs_initialized()) return ERR_PTR(-ENOENT); pr_debug("creating file '%s'\n", name); if (IS_ERR(parent)) return parent; error = simple_pin_fs(&debug_fs_type, &debugfs_mount, &debugfs_mount_count); if (error) { pr_err("Unable to pin filesystem for file '%s'\n", name); return ERR_PTR(error); } /* If the parent is not specified, we create it in the root. * We need the root dentry to do this, which is in the super * block. A pointer to that is in the struct vfsmount that we * have around. */ if (!parent) parent = debugfs_mount->mnt_root; inode_lock(d_inode(parent)); if (unlikely(IS_DEADDIR(d_inode(parent)))) dentry = ERR_PTR(-ENOENT); else dentry = lookup_one_len(name, parent, strlen(name)); if (!IS_ERR(dentry) && d_really_is_positive(dentry)) { if (d_is_dir(dentry)) pr_err("Directory '%s' with parent '%s' already present!\n", name, parent->d_name.name); else pr_err("File '%s' in directory '%s' already present!\n", name, parent->d_name.name); dput(dentry); dentry = ERR_PTR(-EEXIST); } if (IS_ERR(dentry)) { inode_unlock(d_inode(parent)); simple_release_fs(&debugfs_mount, &debugfs_mount_count); } return dentry; } static struct dentry *failed_creating(struct dentry *dentry) { inode_unlock(d_inode(dentry->d_parent)); dput(dentry); simple_release_fs(&debugfs_mount, &debugfs_mount_count); return ERR_PTR(-ENOMEM); } static struct dentry *end_creating(struct dentry *dentry) { inode_unlock(d_inode(dentry->d_parent)); return dentry; } static struct dentry *__debugfs_create_file(const char *name, umode_t mode, struct dentry *parent, void *data, const void *aux, const struct file_operations *proxy_fops, const void *real_fops) { struct dentry *dentry; struct inode *inode; if (!(mode & S_IFMT)) mode |= S_IFREG; BUG_ON(!S_ISREG(mode)); dentry = start_creating(name, parent); if (IS_ERR(dentry)) return dentry; if (!(debugfs_allow & DEBUGFS_ALLOW_API)) { failed_creating(dentry); return ERR_PTR(-EPERM); } inode = debugfs_get_inode(dentry->d_sb); if (unlikely(!inode)) { pr_err("out of free dentries, can not create file '%s'\n", name); return failed_creating(dentry); } inode->i_mode = mode; inode->i_private = data; inode->i_op = &debugfs_file_inode_operations; if (!real_fops) proxy_fops = &debugfs_noop_file_operations; inode->i_fop = proxy_fops; DEBUGFS_I(inode)->raw = real_fops; DEBUGFS_I(inode)->aux = aux; d_instantiate(dentry, inode); fsnotify_create(d_inode(dentry->d_parent), dentry); return end_creating(dentry); } struct dentry *debugfs_create_file_full(const char *name, umode_t mode, struct dentry *parent, void *data, const void *aux, const struct file_operations *fops) { return __debugfs_create_file(name, mode, parent, data, aux, &debugfs_full_proxy_file_operations, fops); } EXPORT_SYMBOL_GPL(debugfs_create_file_full); struct dentry *debugfs_create_file_short(const char *name, umode_t mode, struct dentry *parent, void *data, const void *aux, const struct debugfs_short_fops *fops) { return __debugfs_create_file(name, mode, parent, data, aux, &debugfs_full_short_proxy_file_operations, fops); } EXPORT_SYMBOL_GPL(debugfs_create_file_short); /** * debugfs_create_file_unsafe - create a file in the debugfs filesystem * @name: a pointer to a string containing the name of the file to create. * @mode: the permission that the file should have. * @parent: a pointer to the parent dentry for this file. This should be a * directory dentry if set. If this parameter is NULL, then the * file will be created in the root of the debugfs filesystem. * @data: a pointer to something that the caller will want to get to later * on. The inode.i_private pointer will point to this value on * the open() call. * @fops: a pointer to a struct file_operations that should be used for * this file. * * debugfs_create_file_unsafe() is completely analogous to * debugfs_create_file(), the only difference being that the fops * handed it will not get protected against file removals by the * debugfs core. * * It is your responsibility to protect your struct file_operation * methods against file removals by means of debugfs_file_get() * and debugfs_file_put(). ->open() is still protected by * debugfs though. * * Any struct file_operations defined by means of * DEFINE_DEBUGFS_ATTRIBUTE() is protected against file removals and * thus, may be used here. */ struct dentry *debugfs_create_file_unsafe(const char *name, umode_t mode, struct dentry *parent, void *data, const struct file_operations *fops) { return __debugfs_create_file(name, mode, parent, data, NULL, &debugfs_open_proxy_file_operations, fops); } EXPORT_SYMBOL_GPL(debugfs_create_file_unsafe); /** * debugfs_create_file_size - create a file in the debugfs filesystem * @name: a pointer to a string containing the name of the file to create. * @mode: the permission that the file should have. * @parent: a pointer to the parent dentry for this file. This should be a * directory dentry if set. If this parameter is NULL, then the * file will be created in the root of the debugfs filesystem. * @data: a pointer to something that the caller will want to get to later * on. The inode.i_private pointer will point to this value on * the open() call. * @fops: a pointer to a struct file_operations that should be used for * this file. * @file_size: initial file size * * This is the basic "create a file" function for debugfs. It allows for a * wide range of flexibility in creating a file, or a directory (if you want * to create a directory, the debugfs_create_dir() function is * recommended to be used instead.) */ void debugfs_create_file_size(const char *name, umode_t mode, struct dentry *parent, void *data, const struct file_operations *fops, loff_t file_size) { struct dentry *de = debugfs_create_file(name, mode, parent, data, fops); if (!IS_ERR(de)) d_inode(de)->i_size = file_size; } EXPORT_SYMBOL_GPL(debugfs_create_file_size); /** * debugfs_create_dir - create a directory in the debugfs filesystem * @name: a pointer to a string containing the name of the directory to * create. * @parent: a pointer to the parent dentry for this file. This should be a * directory dentry if set. If this parameter is NULL, then the * directory will be created in the root of the debugfs filesystem. * * This function creates a directory in debugfs with the given name. * * This function will return a pointer to a dentry if it succeeds. This * pointer must be passed to the debugfs_remove() function when the file is * to be removed (no automatic cleanup happens if your module is unloaded, * you are responsible here.) If an error occurs, ERR_PTR(-ERROR) will be * returned. * * If debugfs is not enabled in the kernel, the value -%ENODEV will be * returned. * * NOTE: it's expected that most callers should _ignore_ the errors returned * by this function. Other debugfs functions handle the fact that the "dentry" * passed to them could be an error and they don't crash in that case. * Drivers should generally work fine even if debugfs fails to init anyway. */ struct dentry *debugfs_create_dir(const char *name, struct dentry *parent) { struct dentry *dentry = start_creating(name, parent); struct inode *inode; if (IS_ERR(dentry)) return dentry; if (!(debugfs_allow & DEBUGFS_ALLOW_API)) { failed_creating(dentry); return ERR_PTR(-EPERM); } inode = debugfs_get_inode(dentry->d_sb); if (unlikely(!inode)) { pr_err("out of free dentries, can not create directory '%s'\n", name); return failed_creating(dentry); } inode->i_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO; inode->i_op = &debugfs_dir_inode_operations; inode->i_fop = &simple_dir_operations; /* directory inodes start off with i_nlink == 2 (for "." entry) */ inc_nlink(inode); d_instantiate(dentry, inode); inc_nlink(d_inode(dentry->d_parent)); fsnotify_mkdir(d_inode(dentry->d_parent), dentry); return end_creating(dentry); } EXPORT_SYMBOL_GPL(debugfs_create_dir); /** * debugfs_create_automount - create automount point in the debugfs filesystem * @name: a pointer to a string containing the name of the file to create. * @parent: a pointer to the parent dentry for this file. This should be a * directory dentry if set. If this parameter is NULL, then the * file will be created in the root of the debugfs filesystem. * @f: function to be called when pathname resolution steps on that one. * @data: opaque argument to pass to f(). * * @f should return what ->d_automount() would. */ struct dentry *debugfs_create_automount(const char *name, struct dentry *parent, debugfs_automount_t f, void *data) { struct dentry *dentry = start_creating(name, parent); struct inode *inode; if (IS_ERR(dentry)) return dentry; if (!(debugfs_allow & DEBUGFS_ALLOW_API)) { failed_creating(dentry); return ERR_PTR(-EPERM); } inode = debugfs_get_inode(dentry->d_sb); if (unlikely(!inode)) { pr_err("out of free dentries, can not create automount '%s'\n", name); return failed_creating(dentry); } make_empty_dir_inode(inode); inode->i_flags |= S_AUTOMOUNT; inode->i_private = data; DEBUGFS_I(inode)->automount = f; /* directory inodes start off with i_nlink == 2 (for "." entry) */ inc_nlink(inode); d_instantiate(dentry, inode); inc_nlink(d_inode(dentry->d_parent)); fsnotify_mkdir(d_inode(dentry->d_parent), dentry); return end_creating(dentry); } EXPORT_SYMBOL(debugfs_create_automount); /** * debugfs_create_symlink- create a symbolic link in the debugfs filesystem * @name: a pointer to a string containing the name of the symbolic link to * create. * @parent: a pointer to the parent dentry for this symbolic link. This * should be a directory dentry if set. If this parameter is NULL, * then the symbolic link will be created in the root of the debugfs * filesystem. * @target: a pointer to a string containing the path to the target of the * symbolic link. * * This function creates a symbolic link with the given name in debugfs that * links to the given target path. * * This function will return a pointer to a dentry if it succeeds. This * pointer must be passed to the debugfs_remove() function when the symbolic * link is to be removed (no automatic cleanup happens if your module is * unloaded, you are responsible here.) If an error occurs, ERR_PTR(-ERROR) * will be returned. * * If debugfs is not enabled in the kernel, the value -%ENODEV will be * returned. */ struct dentry *debugfs_create_symlink(const char *name, struct dentry *parent, const char *target) { struct dentry *dentry; struct inode *inode; char *link = kstrdup(target, GFP_KERNEL); if (!link) return ERR_PTR(-ENOMEM); dentry = start_creating(name, parent); if (IS_ERR(dentry)) { kfree(link); return dentry; } inode = debugfs_get_inode(dentry->d_sb); if (unlikely(!inode)) { pr_err("out of free dentries, can not create symlink '%s'\n", name); kfree(link); return failed_creating(dentry); } inode->i_mode = S_IFLNK | S_IRWXUGO; inode->i_op = &debugfs_symlink_inode_operations; inode->i_link = link; d_instantiate(dentry, inode); return end_creating(dentry); } EXPORT_SYMBOL_GPL(debugfs_create_symlink); static void __debugfs_file_removed(struct dentry *dentry) { struct debugfs_fsdata *fsd; /* * Paired with the closing smp_mb() implied by a successful * cmpxchg() in debugfs_file_get(): either * debugfs_file_get() must see a dead dentry or we must see a * debugfs_fsdata instance at ->d_fsdata here (or both). */ smp_mb(); fsd = READ_ONCE(dentry->d_fsdata); if (!fsd) return; /* if this was the last reference, we're done */ if (refcount_dec_and_test(&fsd->active_users)) return; /* * If there's still a reference, the code that obtained it can * be in different states: * - The common case of not using cancellations, or already * after debugfs_leave_cancellation(), where we just need * to wait for debugfs_file_put() which signals the completion; * - inside a cancellation section, i.e. between * debugfs_enter_cancellation() and debugfs_leave_cancellation(), * in which case we need to trigger the ->cancel() function, * and then wait for debugfs_file_put() just like in the * previous case; * - before debugfs_enter_cancellation() (but obviously after * debugfs_file_get()), in which case we may not see the * cancellation in the list on the first round of the loop, * but debugfs_enter_cancellation() signals the completion * after adding it, so this code gets woken up to call the * ->cancel() function. */ while (refcount_read(&fsd->active_users)) { struct debugfs_cancellation *c; /* * Lock the cancellations. Note that the cancellations * structs are meant to be on the stack, so we need to * ensure we either use them here or don't touch them, * and debugfs_leave_cancellation() will wait for this * to be finished processing before exiting one. It may * of course win and remove the cancellation, but then * chances are we never even got into this bit, we only * do if the refcount isn't zero already. */ mutex_lock(&fsd->cancellations_mtx); while ((c = list_first_entry_or_null(&fsd->cancellations, typeof(*c), list))) { list_del_init(&c->list); c->cancel(dentry, c->cancel_data); } mutex_unlock(&fsd->cancellations_mtx); wait_for_completion(&fsd->active_users_drained); } } static void remove_one(struct dentry *victim) { if (d_is_reg(victim)) __debugfs_file_removed(victim); simple_release_fs(&debugfs_mount, &debugfs_mount_count); } /** * debugfs_remove - recursively removes a directory * @dentry: a pointer to a the dentry of the directory to be removed. If this * parameter is NULL or an error value, nothing will be done. * * This function recursively removes a directory tree in debugfs that * was previously created with a call to another debugfs function * (like debugfs_create_file() or variants thereof.) * * This function is required to be called in order for the file to be * removed, no automatic cleanup of files will happen when a module is * removed, you are responsible here. */ void debugfs_remove(struct dentry *dentry) { if (IS_ERR_OR_NULL(dentry)) return; simple_pin_fs(&debug_fs_type, &debugfs_mount, &debugfs_mount_count); simple_recursive_removal(dentry, remove_one); simple_release_fs(&debugfs_mount, &debugfs_mount_count); } EXPORT_SYMBOL_GPL(debugfs_remove); /** * debugfs_lookup_and_remove - lookup a directory or file and recursively remove it * @name: a pointer to a string containing the name of the item to look up. * @parent: a pointer to the parent dentry of the item. * * This is the equlivant of doing something like * debugfs_remove(debugfs_lookup(..)) but with the proper reference counting * handled for the directory being looked up. */ void debugfs_lookup_and_remove(const char *name, struct dentry *parent) { struct dentry *dentry; dentry = debugfs_lookup(name, parent); if (!dentry) return; debugfs_remove(dentry); dput(dentry); } EXPORT_SYMBOL_GPL(debugfs_lookup_and_remove); /** * debugfs_change_name - rename a file/directory in the debugfs filesystem * @dentry: dentry of an object to be renamed. * @fmt: format for new name * * This function renames a file/directory in debugfs. The target must not * exist for rename to succeed. * * This function will return 0 on success and -E... on failure. * * If debugfs is not enabled in the kernel, the value -%ENODEV will be * returned. */ int __printf(2, 3) debugfs_change_name(struct dentry *dentry, const char *fmt, ...) { int error = 0; const char *new_name; struct name_snapshot old_name; struct dentry *parent, *target; struct inode *dir; va_list ap; if (IS_ERR_OR_NULL(dentry)) return 0; va_start(ap, fmt); new_name = kvasprintf_const(GFP_KERNEL, fmt, ap); va_end(ap); if (!new_name) return -ENOMEM; parent = dget_parent(dentry); dir = d_inode(parent); inode_lock(dir); take_dentry_name_snapshot(&old_name, dentry); if (WARN_ON_ONCE(dentry->d_parent != parent)) { error = -EINVAL; goto out; } if (strcmp(old_name.name.name, new_name) == 0) goto out; target = lookup_one_len(new_name, parent, strlen(new_name)); if (IS_ERR(target)) { error = PTR_ERR(target); goto out; } if (d_really_is_positive(target)) { dput(target); error = -EINVAL; goto out; } simple_rename_timestamp(dir, dentry, dir, target); d_move(dentry, target); dput(target); fsnotify_move(dir, dir, &old_name.name, d_is_dir(dentry), NULL, dentry); out: release_dentry_name_snapshot(&old_name); inode_unlock(dir); dput(parent); kfree_const(new_name); return error; } EXPORT_SYMBOL_GPL(debugfs_change_name); /** * debugfs_initialized - Tells whether debugfs has been registered */ bool debugfs_initialized(void) { return debugfs_registered; } EXPORT_SYMBOL_GPL(debugfs_initialized); static int __init debugfs_kernel(char *str) { if (str) { if (!strcmp(str, "on")) debugfs_allow = DEBUGFS_ALLOW_API | DEBUGFS_ALLOW_MOUNT; else if (!strcmp(str, "no-mount")) debugfs_allow = DEBUGFS_ALLOW_API; else if (!strcmp(str, "off")) debugfs_allow = 0; } return 0; } early_param("debugfs", debugfs_kernel); static int __init debugfs_init(void) { int retval; if (!(debugfs_allow & DEBUGFS_ALLOW_MOUNT)) return -EPERM; retval = sysfs_create_mount_point(kernel_kobj, "debug"); if (retval) return retval; debugfs_inode_cachep = kmem_cache_create("debugfs_inode_cache", sizeof(struct debugfs_inode_info), 0, SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT, init_once); if (debugfs_inode_cachep == NULL) { sysfs_remove_mount_point(kernel_kobj, "debug"); return -ENOMEM; } retval = register_filesystem(&debug_fs_type); if (retval) { // Really not going to happen sysfs_remove_mount_point(kernel_kobj, "debug"); kmem_cache_destroy(debugfs_inode_cachep); return retval; } debugfs_registered = true; return 0; } core_initcall(debugfs_init);
4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 // SPDX-License-Identifier: GPL-2.0-or-later /* * net/core/netclassid_cgroup.c Classid Cgroupfs Handling * * Authors: Thomas Graf <tgraf@suug.ch> */ #include <linux/slab.h> #include <linux/cgroup.h> #include <linux/fdtable.h> #include <linux/sched/task.h> #include <net/cls_cgroup.h> #include <net/sock.h> static inline struct cgroup_cls_state *css_cls_state(struct cgroup_subsys_state *css) { return css ? container_of(css, struct cgroup_cls_state, css) : NULL; } struct cgroup_cls_state *task_cls_state(struct task_struct *p) { return css_cls_state(task_css_check(p, net_cls_cgrp_id, rcu_read_lock_bh_held())); } EXPORT_SYMBOL_GPL(task_cls_state); static struct cgroup_subsys_state * cgrp_css_alloc(struct cgroup_subsys_state *parent_css) { struct cgroup_cls_state *cs; cs = kzalloc(sizeof(*cs), GFP_KERNEL); if (!cs) return ERR_PTR(-ENOMEM); return &cs->css; } static int cgrp_css_online(struct cgroup_subsys_state *css) { struct cgroup_cls_state *cs = css_cls_state(css); struct cgroup_cls_state *parent = css_cls_state(css->parent); if (parent) cs->classid = parent->classid; return 0; } static void cgrp_css_free(struct cgroup_subsys_state *css) { kfree(css_cls_state(css)); } /* * To avoid freezing of sockets creation for tasks with big number of threads * and opened sockets lets release file_lock every 1000 iterated descriptors. * New sockets will already have been created with new classid. */ struct update_classid_context { u32 classid; unsigned int batch; }; #define UPDATE_CLASSID_BATCH 1000 static int update_classid_sock(const void *v, struct file *file, unsigned int n) { struct update_classid_context *ctx = (void *)v; struct socket *sock = sock_from_file(file); if (sock) sock_cgroup_set_classid(&sock->sk->sk_cgrp_data, ctx->classid); if (--ctx->batch == 0) { ctx->batch = UPDATE_CLASSID_BATCH; return n + 1; } return 0; } static void update_classid_task(struct task_struct *p, u32 classid) { struct update_classid_context ctx = { .classid = classid, .batch = UPDATE_CLASSID_BATCH }; unsigned int fd = 0; /* Only update the leader task, when many threads in this task, * so it can avoid the useless traversal. */ if (p != p->group_leader) return; do { task_lock(p); fd = iterate_fd(p->files, fd, update_classid_sock, &ctx); task_unlock(p); cond_resched(); } while (fd); } static void cgrp_attach(struct cgroup_taskset *tset) { struct cgroup_subsys_state *css; struct task_struct *p; cgroup_taskset_for_each(p, css, tset) { update_classid_task(p, css_cls_state(css)->classid); } } static u64 read_classid(struct cgroup_subsys_state *css, struct cftype *cft) { return css_cls_state(css)->classid; } static int write_classid(struct cgroup_subsys_state *css, struct cftype *cft, u64 value) { struct cgroup_cls_state *cs = css_cls_state(css); struct css_task_iter it; struct task_struct *p; cs->classid = (u32)value; css_task_iter_start(css, 0, &it); while ((p = css_task_iter_next(&it))) update_classid_task(p, cs->classid); css_task_iter_end(&it); return 0; } static struct cftype ss_files[] = { { .name = "classid", .read_u64 = read_classid, .write_u64 = write_classid, }, { } /* terminate */ }; struct cgroup_subsys net_cls_cgrp_subsys = { .css_alloc = cgrp_css_alloc, .css_online = cgrp_css_online, .css_free = cgrp_css_free, .attach = cgrp_attach, .legacy_cftypes = ss_files, };
18 3 15 15 3 5 6 1 1 1 6 6 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 // SPDX-License-Identifier: GPL-2.0-only /* * * Generic part shared by ipv4 and ipv6 backends. */ #include <linux/kernel.h> #include <linux/init.h> #include <linux/module.h> #include <linux/netlink.h> #include <linux/netfilter.h> #include <linux/netfilter/nf_tables.h> #include <net/netfilter/nf_tables_core.h> #include <net/netfilter/nf_tables.h> #include <linux/in.h> #include <net/xfrm.h> static const struct nla_policy nft_xfrm_policy[NFTA_XFRM_MAX + 1] = { [NFTA_XFRM_KEY] = NLA_POLICY_MAX(NLA_BE32, 255), [NFTA_XFRM_DIR] = { .type = NLA_U8 }, [NFTA_XFRM_SPNUM] = NLA_POLICY_MAX(NLA_BE32, 255), [NFTA_XFRM_DREG] = { .type = NLA_U32 }, }; struct nft_xfrm { enum nft_xfrm_keys key:8; u8 dreg; u8 dir; u8 spnum; u8 len; }; static int nft_xfrm_get_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { struct nft_xfrm *priv = nft_expr_priv(expr); unsigned int len = 0; u32 spnum = 0; u8 dir; if (!tb[NFTA_XFRM_KEY] || !tb[NFTA_XFRM_DIR] || !tb[NFTA_XFRM_DREG]) return -EINVAL; switch (ctx->family) { case NFPROTO_IPV4: case NFPROTO_IPV6: case NFPROTO_INET: break; default: return -EOPNOTSUPP; } priv->key = ntohl(nla_get_be32(tb[NFTA_XFRM_KEY])); switch (priv->key) { case NFT_XFRM_KEY_REQID: case NFT_XFRM_KEY_SPI: len = sizeof(u32); break; case NFT_XFRM_KEY_DADDR_IP4: case NFT_XFRM_KEY_SADDR_IP4: len = sizeof(struct in_addr); break; case NFT_XFRM_KEY_DADDR_IP6: case NFT_XFRM_KEY_SADDR_IP6: len = sizeof(struct in6_addr); break; default: return -EINVAL; } dir = nla_get_u8(tb[NFTA_XFRM_DIR]); switch (dir) { case XFRM_POLICY_IN: case XFRM_POLICY_OUT: priv->dir = dir; break; default: return -EINVAL; } if (tb[NFTA_XFRM_SPNUM]) spnum = ntohl(nla_get_be32(tb[NFTA_XFRM_SPNUM])); if (spnum >= XFRM_MAX_DEPTH) return -ERANGE; priv->spnum = spnum; priv->len = len; return nft_parse_register_store(ctx, tb[NFTA_XFRM_DREG], &priv->dreg, NULL, NFT_DATA_VALUE, len); } /* Return true if key asks for daddr/saddr and current * state does have a valid address (BEET, TUNNEL). */ static bool xfrm_state_addr_ok(enum nft_xfrm_keys k, u8 family, u8 mode) { switch (k) { case NFT_XFRM_KEY_DADDR_IP4: case NFT_XFRM_KEY_SADDR_IP4: if (family == NFPROTO_IPV4) break; return false; case NFT_XFRM_KEY_DADDR_IP6: case NFT_XFRM_KEY_SADDR_IP6: if (family == NFPROTO_IPV6) break; return false; default: return true; } return mode == XFRM_MODE_BEET || mode == XFRM_MODE_TUNNEL || mode == XFRM_MODE_IPTFS; } static void nft_xfrm_state_get_key(const struct nft_xfrm *priv, struct nft_regs *regs, const struct xfrm_state *state) { u32 *dest = &regs->data[priv->dreg]; if (!xfrm_state_addr_ok(priv->key, state->props.family, state->props.mode)) { regs->verdict.code = NFT_BREAK; return; } switch (priv->key) { case NFT_XFRM_KEY_UNSPEC: case __NFT_XFRM_KEY_MAX: WARN_ON_ONCE(1); break; case NFT_XFRM_KEY_DADDR_IP4: *dest = (__force __u32)state->id.daddr.a4; return; case NFT_XFRM_KEY_DADDR_IP6: memcpy(dest, &state->id.daddr.in6, sizeof(struct in6_addr)); return; case NFT_XFRM_KEY_SADDR_IP4: *dest = (__force __u32)state->props.saddr.a4; return; case NFT_XFRM_KEY_SADDR_IP6: memcpy(dest, &state->props.saddr.in6, sizeof(struct in6_addr)); return; case NFT_XFRM_KEY_REQID: *dest = state->props.reqid; return; case NFT_XFRM_KEY_SPI: *dest = (__force __u32)state->id.spi; return; } regs->verdict.code = NFT_BREAK; } static void nft_xfrm_get_eval_in(const struct nft_xfrm *priv, struct nft_regs *regs, const struct nft_pktinfo *pkt) { const struct sec_path *sp = skb_sec_path(pkt->skb); const struct xfrm_state *state; if (sp == NULL || sp->len <= priv->spnum) { regs->verdict.code = NFT_BREAK; return; } state = sp->xvec[priv->spnum]; nft_xfrm_state_get_key(priv, regs, state); } static void nft_xfrm_get_eval_out(const struct nft_xfrm *priv, struct nft_regs *regs, const struct nft_pktinfo *pkt) { const struct dst_entry *dst = skb_dst(pkt->skb); int i; for (i = 0; dst && dst->xfrm; dst = ((const struct xfrm_dst *)dst)->child, i++) { if (i < priv->spnum) continue; nft_xfrm_state_get_key(priv, regs, dst->xfrm); return; } regs->verdict.code = NFT_BREAK; } static void nft_xfrm_get_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { const struct nft_xfrm *priv = nft_expr_priv(expr); switch (priv->dir) { case XFRM_POLICY_IN: nft_xfrm_get_eval_in(priv, regs, pkt); break; case XFRM_POLICY_OUT: nft_xfrm_get_eval_out(priv, regs, pkt); break; default: WARN_ON_ONCE(1); regs->verdict.code = NFT_BREAK; break; } } static int nft_xfrm_get_dump(struct sk_buff *skb, const struct nft_expr *expr, bool reset) { const struct nft_xfrm *priv = nft_expr_priv(expr); if (nft_dump_register(skb, NFTA_XFRM_DREG, priv->dreg)) return -1; if (nla_put_be32(skb, NFTA_XFRM_KEY, htonl(priv->key))) return -1; if (nla_put_u8(skb, NFTA_XFRM_DIR, priv->dir)) return -1; if (nla_put_be32(skb, NFTA_XFRM_SPNUM, htonl(priv->spnum))) return -1; return 0; } static int nft_xfrm_validate(const struct nft_ctx *ctx, const struct nft_expr *expr) { const struct nft_xfrm *priv = nft_expr_priv(expr); unsigned int hooks; if (ctx->family != NFPROTO_IPV4 && ctx->family != NFPROTO_IPV6 && ctx->family != NFPROTO_INET) return -EOPNOTSUPP; switch (priv->dir) { case XFRM_POLICY_IN: hooks = (1 << NF_INET_FORWARD) | (1 << NF_INET_LOCAL_IN) | (1 << NF_INET_PRE_ROUTING); break; case XFRM_POLICY_OUT: hooks = (1 << NF_INET_FORWARD) | (1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_POST_ROUTING); break; default: WARN_ON_ONCE(1); return -EINVAL; } return nft_chain_validate_hooks(ctx->chain, hooks); } static bool nft_xfrm_reduce(struct nft_regs_track *track, const struct nft_expr *expr) { const struct nft_xfrm *priv = nft_expr_priv(expr); const struct nft_xfrm *xfrm; if (!nft_reg_track_cmp(track, expr, priv->dreg)) { nft_reg_track_update(track, expr, priv->dreg, priv->len); return false; } xfrm = nft_expr_priv(track->regs[priv->dreg].selector); if (priv->key != xfrm->key || priv->dreg != xfrm->dreg || priv->dir != xfrm->dir || priv->spnum != xfrm->spnum) { nft_reg_track_update(track, expr, priv->dreg, priv->len); return false; } if (!track->regs[priv->dreg].bitwise) return true; return nft_expr_reduce_bitwise(track, expr); } static struct nft_expr_type nft_xfrm_type; static const struct nft_expr_ops nft_xfrm_get_ops = { .type = &nft_xfrm_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_xfrm)), .eval = nft_xfrm_get_eval, .init = nft_xfrm_get_init, .dump = nft_xfrm_get_dump, .validate = nft_xfrm_validate, .reduce = nft_xfrm_reduce, }; static struct nft_expr_type nft_xfrm_type __read_mostly = { .name = "xfrm", .ops = &nft_xfrm_get_ops, .policy = nft_xfrm_policy, .maxattr = NFTA_XFRM_MAX, .owner = THIS_MODULE, }; static int __init nft_xfrm_module_init(void) { return nft_register_expr(&nft_xfrm_type); } static void __exit nft_xfrm_module_exit(void) { nft_unregister_expr(&nft_xfrm_type); } module_init(nft_xfrm_module_init); module_exit(nft_xfrm_module_exit); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("nf_tables: xfrm/IPSec matching"); MODULE_AUTHOR("Florian Westphal <fw@strlen.de>"); MODULE_AUTHOR("Máté Eckl <ecklm94@gmail.com>"); MODULE_ALIAS_NFT_EXPR("xfrm");
4 185 145 10 135 121 93 186 40 159 7 7 9 3 6 123 40 69 17 116 39 215 56 127 128 7 121 1 47 91 2 14 77 9 68 77 77 114 114 114 127 128 128 22 5 22 22 5 22 22 22 22 5 5 5 5 1 1 1 1 1 6 6 1 1 1 1 1 1 50 93 1 93 44 63 18 18 6 6 18 18 18 77 77 77 36 56 77 36 1 7 6 1 1 1 1 1 77 77 25 25 25 5 5 2 5 77 5 5 1 1 77 76 77 77 77 77 5 5 4 3 19 19 19 19 6 6 22 18 5 5 5 131 18 131 18 125 56 56 56 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 // SPDX-License-Identifier: GPL-2.0 /* Copyright (C) B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich, Antonio Quartulli */ #include "translation-table.h" #include "main.h" #include <linux/atomic.h> #include <linux/bitops.h> #include <linux/build_bug.h> #include <linux/byteorder/generic.h> #include <linux/cache.h> #include <linux/compiler.h> #include <linux/container_of.h> #include <linux/crc32c.h> #include <linux/err.h> #include <linux/errno.h> #include <linux/etherdevice.h> #include <linux/gfp.h> #include <linux/if_ether.h> #include <linux/init.h> #include <linux/jhash.h> #include <linux/jiffies.h> #include <linux/kref.h> #include <linux/list.h> #include <linux/lockdep.h> #include <linux/net.h> #include <linux/netdevice.h> #include <linux/netlink.h> #include <linux/overflow.h> #include <linux/rculist.h> #include <linux/rcupdate.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/stddef.h> #include <linux/string.h> #include <linux/workqueue.h> #include <net/genetlink.h> #include <net/netlink.h> #include <uapi/linux/batadv_packet.h> #include <uapi/linux/batman_adv.h> #include "bridge_loop_avoidance.h" #include "hard-interface.h" #include "hash.h" #include "log.h" #include "mesh-interface.h" #include "netlink.h" #include "originator.h" #include "tvlv.h" static struct kmem_cache *batadv_tl_cache __read_mostly; static struct kmem_cache *batadv_tg_cache __read_mostly; static struct kmem_cache *batadv_tt_orig_cache __read_mostly; static struct kmem_cache *batadv_tt_change_cache __read_mostly; static struct kmem_cache *batadv_tt_req_cache __read_mostly; static struct kmem_cache *batadv_tt_roam_cache __read_mostly; /* hash class keys */ static struct lock_class_key batadv_tt_local_hash_lock_class_key; static struct lock_class_key batadv_tt_global_hash_lock_class_key; static void batadv_send_roam_adv(struct batadv_priv *bat_priv, u8 *client, unsigned short vid, struct batadv_orig_node *orig_node); static void batadv_tt_purge(struct work_struct *work); static void batadv_tt_global_del_orig_list(struct batadv_tt_global_entry *tt_global_entry); static void batadv_tt_global_del(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node, const unsigned char *addr, unsigned short vid, const char *message, bool roaming); /** * batadv_compare_tt() - check if two TT entries are the same * @node: the list element pointer of the first TT entry * @data2: pointer to the tt_common_entry of the second TT entry * * Compare the MAC address and the VLAN ID of the two TT entries and check if * they are the same TT client. * Return: true if the two TT clients are the same, false otherwise */ static bool batadv_compare_tt(const struct hlist_node *node, const void *data2) { const void *data1 = container_of(node, struct batadv_tt_common_entry, hash_entry); const struct batadv_tt_common_entry *tt1 = data1; const struct batadv_tt_common_entry *tt2 = data2; return (tt1->vid == tt2->vid) && batadv_compare_eth(data1, data2); } /** * batadv_choose_tt() - return the index of the tt entry in the hash table * @data: pointer to the tt_common_entry object to map * @size: the size of the hash table * * Return: the hash index where the object represented by 'data' should be * stored at. */ static inline u32 batadv_choose_tt(const void *data, u32 size) { const struct batadv_tt_common_entry *tt; u32 hash = 0; tt = data; hash = jhash(&tt->addr, ETH_ALEN, hash); hash = jhash(&tt->vid, sizeof(tt->vid), hash); return hash % size; } /** * batadv_tt_hash_find() - look for a client in the given hash table * @hash: the hash table to search * @addr: the mac address of the client to look for * @vid: VLAN identifier * * Return: a pointer to the tt_common struct belonging to the searched client if * found, NULL otherwise. */ static struct batadv_tt_common_entry * batadv_tt_hash_find(struct batadv_hashtable *hash, const u8 *addr, unsigned short vid) { struct hlist_head *head; struct batadv_tt_common_entry to_search, *tt, *tt_tmp = NULL; u32 index; if (!hash) return NULL; ether_addr_copy(to_search.addr, addr); to_search.vid = vid; index = batadv_choose_tt(&to_search, hash->size); head = &hash->table[index]; rcu_read_lock(); hlist_for_each_entry_rcu(tt, head, hash_entry) { if (!batadv_compare_eth(tt, addr)) continue; if (tt->vid != vid) continue; if (!kref_get_unless_zero(&tt->refcount)) continue; tt_tmp = tt; break; } rcu_read_unlock(); return tt_tmp; } /** * batadv_tt_local_hash_find() - search the local table for a given client * @bat_priv: the bat priv with all the mesh interface information * @addr: the mac address of the client to look for * @vid: VLAN identifier * * Return: a pointer to the corresponding tt_local_entry struct if the client is * found, NULL otherwise. */ static struct batadv_tt_local_entry * batadv_tt_local_hash_find(struct batadv_priv *bat_priv, const u8 *addr, unsigned short vid) { struct batadv_tt_common_entry *tt_common_entry; struct batadv_tt_local_entry *tt_local_entry = NULL; tt_common_entry = batadv_tt_hash_find(bat_priv->tt.local_hash, addr, vid); if (tt_common_entry) tt_local_entry = container_of(tt_common_entry, struct batadv_tt_local_entry, common); return tt_local_entry; } /** * batadv_tt_global_hash_find() - search the global table for a given client * @bat_priv: the bat priv with all the mesh interface information * @addr: the mac address of the client to look for * @vid: VLAN identifier * * Return: a pointer to the corresponding tt_global_entry struct if the client * is found, NULL otherwise. */ struct batadv_tt_global_entry * batadv_tt_global_hash_find(struct batadv_priv *bat_priv, const u8 *addr, unsigned short vid) { struct batadv_tt_common_entry *tt_common_entry; struct batadv_tt_global_entry *tt_global_entry = NULL; tt_common_entry = batadv_tt_hash_find(bat_priv->tt.global_hash, addr, vid); if (tt_common_entry) tt_global_entry = container_of(tt_common_entry, struct batadv_tt_global_entry, common); return tt_global_entry; } /** * batadv_tt_local_entry_release() - release tt_local_entry from lists and queue * for free after rcu grace period * @ref: kref pointer of the nc_node */ static void batadv_tt_local_entry_release(struct kref *ref) { struct batadv_tt_local_entry *tt_local_entry; tt_local_entry = container_of(ref, struct batadv_tt_local_entry, common.refcount); batadv_meshif_vlan_put(tt_local_entry->vlan); kfree_rcu(tt_local_entry, common.rcu); } /** * batadv_tt_local_entry_put() - decrement the tt_local_entry refcounter and * possibly release it * @tt_local_entry: tt_local_entry to be free'd */ static void batadv_tt_local_entry_put(struct batadv_tt_local_entry *tt_local_entry) { if (!tt_local_entry) return; kref_put(&tt_local_entry->common.refcount, batadv_tt_local_entry_release); } /** * batadv_tt_global_entry_release() - release tt_global_entry from lists and * queue for free after rcu grace period * @ref: kref pointer of the nc_node */ void batadv_tt_global_entry_release(struct kref *ref) { struct batadv_tt_global_entry *tt_global_entry; tt_global_entry = container_of(ref, struct batadv_tt_global_entry, common.refcount); batadv_tt_global_del_orig_list(tt_global_entry); kfree_rcu(tt_global_entry, common.rcu); } /** * batadv_tt_global_hash_count() - count the number of orig entries * @bat_priv: the bat priv with all the mesh interface information * @addr: the mac address of the client to count entries for * @vid: VLAN identifier * * Return: the number of originators advertising the given address/data * (excluding our self). */ int batadv_tt_global_hash_count(struct batadv_priv *bat_priv, const u8 *addr, unsigned short vid) { struct batadv_tt_global_entry *tt_global_entry; int count; tt_global_entry = batadv_tt_global_hash_find(bat_priv, addr, vid); if (!tt_global_entry) return 0; count = atomic_read(&tt_global_entry->orig_list_count); batadv_tt_global_entry_put(tt_global_entry); return count; } /** * batadv_tt_local_size_mod() - change the size by v of the local table * identified by vid * @bat_priv: the bat priv with all the mesh interface information * @vid: the VLAN identifier of the sub-table to change * @v: the amount to sum to the local table size */ static void batadv_tt_local_size_mod(struct batadv_priv *bat_priv, unsigned short vid, int v) { struct batadv_meshif_vlan *vlan; vlan = batadv_meshif_vlan_get(bat_priv, vid); if (!vlan) return; atomic_add(v, &vlan->tt.num_entries); batadv_meshif_vlan_put(vlan); } /** * batadv_tt_local_size_inc() - increase by one the local table size for the * given vid * @bat_priv: the bat priv with all the mesh interface information * @vid: the VLAN identifier */ static void batadv_tt_local_size_inc(struct batadv_priv *bat_priv, unsigned short vid) { batadv_tt_local_size_mod(bat_priv, vid, 1); } /** * batadv_tt_local_size_dec() - decrease by one the local table size for the * given vid * @bat_priv: the bat priv with all the mesh interface information * @vid: the VLAN identifier */ static void batadv_tt_local_size_dec(struct batadv_priv *bat_priv, unsigned short vid) { batadv_tt_local_size_mod(bat_priv, vid, -1); } /** * batadv_tt_global_size_mod() - change the size by v of the global table * for orig_node identified by vid * @orig_node: the originator for which the table has to be modified * @vid: the VLAN identifier * @v: the amount to sum to the global table size */ static void batadv_tt_global_size_mod(struct batadv_orig_node *orig_node, unsigned short vid, int v) { struct batadv_orig_node_vlan *vlan; vlan = batadv_orig_node_vlan_new(orig_node, vid); if (!vlan) return; if (atomic_add_return(v, &vlan->tt.num_entries) == 0) { spin_lock_bh(&orig_node->vlan_list_lock); if (!hlist_unhashed(&vlan->list)) { hlist_del_init_rcu(&vlan->list); batadv_orig_node_vlan_put(vlan); } spin_unlock_bh(&orig_node->vlan_list_lock); } batadv_orig_node_vlan_put(vlan); } /** * batadv_tt_global_size_inc() - increase by one the global table size for the * given vid * @orig_node: the originator which global table size has to be decreased * @vid: the vlan identifier */ static void batadv_tt_global_size_inc(struct batadv_orig_node *orig_node, unsigned short vid) { batadv_tt_global_size_mod(orig_node, vid, 1); } /** * batadv_tt_global_size_dec() - decrease by one the global table size for the * given vid * @orig_node: the originator which global table size has to be decreased * @vid: the vlan identifier */ static void batadv_tt_global_size_dec(struct batadv_orig_node *orig_node, unsigned short vid) { batadv_tt_global_size_mod(orig_node, vid, -1); } /** * batadv_tt_orig_list_entry_release() - release tt orig entry from lists and * queue for free after rcu grace period * @ref: kref pointer of the tt orig entry */ static void batadv_tt_orig_list_entry_release(struct kref *ref) { struct batadv_tt_orig_list_entry *orig_entry; orig_entry = container_of(ref, struct batadv_tt_orig_list_entry, refcount); batadv_orig_node_put(orig_entry->orig_node); kfree_rcu(orig_entry, rcu); } /** * batadv_tt_orig_list_entry_put() - decrement the tt orig entry refcounter and * possibly release it * @orig_entry: tt orig entry to be free'd */ static void batadv_tt_orig_list_entry_put(struct batadv_tt_orig_list_entry *orig_entry) { if (!orig_entry) return; kref_put(&orig_entry->refcount, batadv_tt_orig_list_entry_release); } /** * batadv_tt_local_event() - store a local TT event (ADD/DEL) * @bat_priv: the bat priv with all the mesh interface information * @tt_local_entry: the TT entry involved in the event * @event_flags: flags to store in the event structure */ static void batadv_tt_local_event(struct batadv_priv *bat_priv, struct batadv_tt_local_entry *tt_local_entry, u8 event_flags) { struct batadv_tt_change_node *tt_change_node, *entry, *safe; struct batadv_tt_common_entry *common = &tt_local_entry->common; u8 flags = common->flags | event_flags; bool del_op_requested, del_op_entry; size_t changes; tt_change_node = kmem_cache_alloc(batadv_tt_change_cache, GFP_ATOMIC); if (!tt_change_node) return; tt_change_node->change.flags = flags; memset(tt_change_node->change.reserved, 0, sizeof(tt_change_node->change.reserved)); ether_addr_copy(tt_change_node->change.addr, common->addr); tt_change_node->change.vid = htons(common->vid); del_op_requested = flags & BATADV_TT_CLIENT_DEL; /* check for ADD+DEL, DEL+ADD, ADD+ADD or DEL+DEL events */ spin_lock_bh(&bat_priv->tt.changes_list_lock); changes = READ_ONCE(bat_priv->tt.local_changes); list_for_each_entry_safe(entry, safe, &bat_priv->tt.changes_list, list) { if (!batadv_compare_eth(entry->change.addr, common->addr)) continue; del_op_entry = entry->change.flags & BATADV_TT_CLIENT_DEL; if (del_op_requested != del_op_entry) { /* DEL+ADD in the same orig interval have no effect and * can be removed to avoid silly behaviour on the * receiver side. The other way around (ADD+DEL) can * happen in case of roaming of a client still in the * NEW state. Roaming of NEW clients is now possible due * to automatically recognition of "temporary" clients */ list_del(&entry->list); kmem_cache_free(batadv_tt_change_cache, entry); changes--; } else { /* this is a second add or del in the same originator * interval. It could mean that flags have been changed * (e.g. double add): update them */ entry->change.flags = flags; } kmem_cache_free(batadv_tt_change_cache, tt_change_node); goto update_changes; } /* track the change in the OGMinterval list */ list_add_tail(&tt_change_node->list, &bat_priv->tt.changes_list); changes++; update_changes: WRITE_ONCE(bat_priv->tt.local_changes, changes); spin_unlock_bh(&bat_priv->tt.changes_list_lock); } /** * batadv_tt_len() - compute length in bytes of given number of tt changes * @changes_num: number of tt changes * * Return: computed length in bytes. */ static int batadv_tt_len(int changes_num) { return changes_num * sizeof(struct batadv_tvlv_tt_change); } /** * batadv_tt_entries() - compute the number of entries fitting in tt_len bytes * @tt_len: available space * * Return: the number of entries. */ static u16 batadv_tt_entries(u16 tt_len) { return tt_len / batadv_tt_len(1); } /** * batadv_tt_local_table_transmit_size() - calculates the local translation * table size when transmitted over the air * @bat_priv: the bat priv with all the mesh interface information * * Return: local translation table size in bytes. */ static int batadv_tt_local_table_transmit_size(struct batadv_priv *bat_priv) { u16 num_vlan = 0; u16 tt_local_entries = 0; struct batadv_meshif_vlan *vlan; int hdr_size; rcu_read_lock(); hlist_for_each_entry_rcu(vlan, &bat_priv->meshif_vlan_list, list) { num_vlan++; tt_local_entries += atomic_read(&vlan->tt.num_entries); } rcu_read_unlock(); /* header size of tvlv encapsulated tt response payload */ hdr_size = sizeof(struct batadv_unicast_tvlv_packet); hdr_size += sizeof(struct batadv_tvlv_hdr); hdr_size += sizeof(struct batadv_tvlv_tt_data); hdr_size += num_vlan * sizeof(struct batadv_tvlv_tt_vlan_data); return hdr_size + batadv_tt_len(tt_local_entries); } static int batadv_tt_local_init(struct batadv_priv *bat_priv) { if (bat_priv->tt.local_hash) return 0; bat_priv->tt.local_hash = batadv_hash_new(1024); if (!bat_priv->tt.local_hash) return -ENOMEM; batadv_hash_set_lock_class(bat_priv->tt.local_hash, &batadv_tt_local_hash_lock_class_key); return 0; } static void batadv_tt_global_free(struct batadv_priv *bat_priv, struct batadv_tt_global_entry *tt_global, const char *message) { struct batadv_tt_global_entry *tt_removed_entry; struct hlist_node *tt_removed_node; batadv_dbg(BATADV_DBG_TT, bat_priv, "Deleting global tt entry %pM (vid: %d): %s\n", tt_global->common.addr, batadv_print_vid(tt_global->common.vid), message); tt_removed_node = batadv_hash_remove(bat_priv->tt.global_hash, batadv_compare_tt, batadv_choose_tt, &tt_global->common); if (!tt_removed_node) return; /* drop reference of remove hash entry */ tt_removed_entry = hlist_entry(tt_removed_node, struct batadv_tt_global_entry, common.hash_entry); batadv_tt_global_entry_put(tt_removed_entry); } /** * batadv_tt_local_add() - add a new client to the local table or update an * existing client * @mesh_iface: netdev struct of the mesh interface * @addr: the mac address of the client to add * @vid: VLAN identifier * @ifindex: index of the interface where the client is connected to (useful to * identify wireless clients) * @mark: the value contained in the skb->mark field of the received packet (if * any) * * Return: true if the client was successfully added, false otherwise. */ bool batadv_tt_local_add(struct net_device *mesh_iface, const u8 *addr, unsigned short vid, int ifindex, u32 mark) { struct batadv_priv *bat_priv = netdev_priv(mesh_iface); struct batadv_tt_local_entry *tt_local; struct batadv_tt_global_entry *tt_global = NULL; struct net *net = dev_net(mesh_iface); struct batadv_meshif_vlan *vlan; struct net_device *in_dev = NULL; struct batadv_hard_iface *in_hardif = NULL; struct hlist_head *head; struct batadv_tt_orig_list_entry *orig_entry; int hash_added, table_size, packet_size_max; bool ret = false; bool roamed_back = false; u8 remote_flags; u32 match_mark; if (ifindex != BATADV_NULL_IFINDEX) in_dev = dev_get_by_index(net, ifindex); if (in_dev) in_hardif = batadv_hardif_get_by_netdev(in_dev); tt_local = batadv_tt_local_hash_find(bat_priv, addr, vid); if (!is_multicast_ether_addr(addr)) tt_global = batadv_tt_global_hash_find(bat_priv, addr, vid); if (tt_local) { tt_local->last_seen = jiffies; if (tt_local->common.flags & BATADV_TT_CLIENT_PENDING) { batadv_dbg(BATADV_DBG_TT, bat_priv, "Re-adding pending client %pM (vid: %d)\n", addr, batadv_print_vid(vid)); /* whatever the reason why the PENDING flag was set, * this is a client which was enqueued to be removed in * this orig_interval. Since it popped up again, the * flag can be reset like it was never enqueued */ tt_local->common.flags &= ~BATADV_TT_CLIENT_PENDING; goto add_event; } if (tt_local->common.flags & BATADV_TT_CLIENT_ROAM) { batadv_dbg(BATADV_DBG_TT, bat_priv, "Roaming client %pM (vid: %d) came back to its original location\n", addr, batadv_print_vid(vid)); /* the ROAM flag is set because this client roamed away * and the node got a roaming_advertisement message. Now * that the client popped up again at its original * location such flag can be unset */ tt_local->common.flags &= ~BATADV_TT_CLIENT_ROAM; roamed_back = true; } goto check_roaming; } /* Ignore the client if we cannot send it in a full table response. */ table_size = batadv_tt_local_table_transmit_size(bat_priv); table_size += batadv_tt_len(1); packet_size_max = atomic_read(&bat_priv->packet_size_max); if (table_size > packet_size_max) { net_ratelimited_function(batadv_info, mesh_iface, "Local translation table size (%i) exceeds maximum packet size (%i); Ignoring new local tt entry: %pM\n", table_size, packet_size_max, addr); goto out; } tt_local = kmem_cache_alloc(batadv_tl_cache, GFP_ATOMIC); if (!tt_local) goto out; /* increase the refcounter of the related vlan */ vlan = batadv_meshif_vlan_get(bat_priv, vid); if (!vlan) { net_ratelimited_function(batadv_info, mesh_iface, "adding TT local entry %pM to non-existent VLAN %d\n", addr, batadv_print_vid(vid)); kmem_cache_free(batadv_tl_cache, tt_local); tt_local = NULL; goto out; } batadv_dbg(BATADV_DBG_TT, bat_priv, "Creating new local tt entry: %pM (vid: %d, ttvn: %d)\n", addr, batadv_print_vid(vid), (u8)atomic_read(&bat_priv->tt.vn)); ether_addr_copy(tt_local->common.addr, addr); /* The local entry has to be marked as NEW to avoid to send it in * a full table response going out before the next ttvn increment * (consistency check) */ tt_local->common.flags = BATADV_TT_CLIENT_NEW; tt_local->common.vid = vid; if (batadv_is_wifi_hardif(in_hardif)) tt_local->common.flags |= BATADV_TT_CLIENT_WIFI; kref_init(&tt_local->common.refcount); tt_local->last_seen = jiffies; tt_local->common.added_at = tt_local->last_seen; tt_local->vlan = vlan; /* the batman interface mac and multicast addresses should never be * purged */ if (batadv_compare_eth(addr, mesh_iface->dev_addr) || is_multicast_ether_addr(addr)) tt_local->common.flags |= BATADV_TT_CLIENT_NOPURGE; kref_get(&tt_local->common.refcount); hash_added = batadv_hash_add(bat_priv->tt.local_hash, batadv_compare_tt, batadv_choose_tt, &tt_local->common, &tt_local->common.hash_entry); if (unlikely(hash_added != 0)) { /* remove the reference for the hash */ batadv_tt_local_entry_put(tt_local); goto out; } add_event: batadv_tt_local_event(bat_priv, tt_local, BATADV_NO_FLAGS); check_roaming: /* Check whether it is a roaming, but don't do anything if the roaming * process has already been handled */ if (tt_global && !(tt_global->common.flags & BATADV_TT_CLIENT_ROAM)) { /* These node are probably going to update their tt table */ head = &tt_global->orig_list; rcu_read_lock(); hlist_for_each_entry_rcu(orig_entry, head, list) { batadv_send_roam_adv(bat_priv, tt_global->common.addr, tt_global->common.vid, orig_entry->orig_node); } rcu_read_unlock(); if (roamed_back) { batadv_tt_global_free(bat_priv, tt_global, "Roaming canceled"); } else { /* The global entry has to be marked as ROAMING and * has to be kept for consistency purpose */ tt_global->common.flags |= BATADV_TT_CLIENT_ROAM; tt_global->roam_at = jiffies; } } /* store the current remote flags before altering them. This helps * understanding is flags are changing or not */ remote_flags = tt_local->common.flags & BATADV_TT_REMOTE_MASK; if (batadv_is_wifi_hardif(in_hardif)) tt_local->common.flags |= BATADV_TT_CLIENT_WIFI; else tt_local->common.flags &= ~BATADV_TT_CLIENT_WIFI; /* check the mark in the skb: if it's equal to the configured * isolation_mark, it means the packet is coming from an isolated * non-mesh client */ match_mark = (mark & bat_priv->isolation_mark_mask); if (bat_priv->isolation_mark_mask && match_mark == bat_priv->isolation_mark) tt_local->common.flags |= BATADV_TT_CLIENT_ISOLA; else tt_local->common.flags &= ~BATADV_TT_CLIENT_ISOLA; /* if any "dynamic" flag has been modified, resend an ADD event for this * entry so that all the nodes can get the new flags */ if (remote_flags ^ (tt_local->common.flags & BATADV_TT_REMOTE_MASK)) batadv_tt_local_event(bat_priv, tt_local, BATADV_NO_FLAGS); ret = true; out: batadv_hardif_put(in_hardif); dev_put(in_dev); batadv_tt_local_entry_put(tt_local); batadv_tt_global_entry_put(tt_global); return ret; } /** * batadv_tt_prepare_tvlv_global_data() - prepare the TVLV TT header to send * within a TT Response directed to another node * @orig_node: originator for which the TT data has to be prepared * @tt_data: uninitialised pointer to the address of the TVLV buffer * @tt_change: uninitialised pointer to the address of the area where the TT * changed can be stored * @tt_len: pointer to the length to reserve to the tt_change. if -1 this * function reserves the amount of space needed to send the entire global TT * table. In case of success the value is updated with the real amount of * reserved bytes * Allocate the needed amount of memory for the entire TT TVLV and write its * header made up of one tvlv_tt_data object and a series of tvlv_tt_vlan_data * objects, one per active VLAN served by the originator node. * * Return: the size of the allocated buffer or 0 in case of failure. */ static u16 batadv_tt_prepare_tvlv_global_data(struct batadv_orig_node *orig_node, struct batadv_tvlv_tt_data **tt_data, struct batadv_tvlv_tt_change **tt_change, s32 *tt_len) { u16 num_vlan = 0; u16 num_entries = 0; u16 change_offset; u16 tvlv_len; struct batadv_tvlv_tt_vlan_data *tt_vlan; struct batadv_orig_node_vlan *vlan; u8 *tt_change_ptr; spin_lock_bh(&orig_node->vlan_list_lock); hlist_for_each_entry(vlan, &orig_node->vlan_list, list) { num_vlan++; num_entries += atomic_read(&vlan->tt.num_entries); } change_offset = struct_size(*tt_data, vlan_data, num_vlan); /* if tt_len is negative, allocate the space needed by the full table */ if (*tt_len < 0) *tt_len = batadv_tt_len(num_entries); tvlv_len = *tt_len; tvlv_len += change_offset; *tt_data = kmalloc(tvlv_len, GFP_ATOMIC); if (!*tt_data) { *tt_len = 0; goto out; } (*tt_data)->flags = BATADV_NO_FLAGS; (*tt_data)->ttvn = atomic_read(&orig_node->last_ttvn); (*tt_data)->num_vlan = htons(num_vlan); tt_vlan = (*tt_data)->vlan_data; hlist_for_each_entry(vlan, &orig_node->vlan_list, list) { tt_vlan->vid = htons(vlan->vid); tt_vlan->crc = htonl(vlan->tt.crc); tt_vlan->reserved = 0; tt_vlan++; } tt_change_ptr = (u8 *)*tt_data + change_offset; *tt_change = (struct batadv_tvlv_tt_change *)tt_change_ptr; out: spin_unlock_bh(&orig_node->vlan_list_lock); return tvlv_len; } /** * batadv_tt_prepare_tvlv_local_data() - allocate and prepare the TT TVLV for * this node * @bat_priv: the bat priv with all the mesh interface information * @tt_data: uninitialised pointer to the address of the TVLV buffer * @tt_change: uninitialised pointer to the address of the area where the TT * changes can be stored * @tt_len: pointer to the length to reserve to the tt_change. if -1 this * function reserves the amount of space needed to send the entire local TT * table. In case of success the value is updated with the real amount of * reserved bytes * * Allocate the needed amount of memory for the entire TT TVLV and write its * header made up by one tvlv_tt_data object and a series of tvlv_tt_vlan_data * objects, one per active VLAN. * * Return: the size of the allocated buffer or 0 in case of failure. */ static u16 batadv_tt_prepare_tvlv_local_data(struct batadv_priv *bat_priv, struct batadv_tvlv_tt_data **tt_data, struct batadv_tvlv_tt_change **tt_change, s32 *tt_len) { struct batadv_tvlv_tt_vlan_data *tt_vlan; struct batadv_meshif_vlan *vlan; u16 num_vlan = 0; u16 vlan_entries = 0; u16 total_entries = 0; u16 tvlv_len; u8 *tt_change_ptr; int change_offset; spin_lock_bh(&bat_priv->meshif_vlan_list_lock); hlist_for_each_entry(vlan, &bat_priv->meshif_vlan_list, list) { vlan_entries = atomic_read(&vlan->tt.num_entries); if (vlan_entries < 1) continue; num_vlan++; total_entries += vlan_entries; } change_offset = struct_size(*tt_data, vlan_data, num_vlan); /* if tt_len is negative, allocate the space needed by the full table */ if (*tt_len < 0) *tt_len = batadv_tt_len(total_entries); tvlv_len = *tt_len; tvlv_len += change_offset; *tt_data = kmalloc(tvlv_len, GFP_ATOMIC); if (!*tt_data) { tvlv_len = 0; goto out; } (*tt_data)->flags = BATADV_NO_FLAGS; (*tt_data)->ttvn = atomic_read(&bat_priv->tt.vn); (*tt_data)->num_vlan = htons(num_vlan); tt_vlan = (*tt_data)->vlan_data; hlist_for_each_entry(vlan, &bat_priv->meshif_vlan_list, list) { vlan_entries = atomic_read(&vlan->tt.num_entries); if (vlan_entries < 1) continue; tt_vlan->vid = htons(vlan->vid); tt_vlan->crc = htonl(vlan->tt.crc); tt_vlan->reserved = 0; tt_vlan++; } tt_change_ptr = (u8 *)*tt_data + change_offset; *tt_change = (struct batadv_tvlv_tt_change *)tt_change_ptr; out: spin_unlock_bh(&bat_priv->meshif_vlan_list_lock); return tvlv_len; } /** * batadv_tt_tvlv_container_update() - update the translation table tvlv * container after local tt changes have been committed * @bat_priv: the bat priv with all the mesh interface information */ static void batadv_tt_tvlv_container_update(struct batadv_priv *bat_priv) { struct batadv_tt_change_node *entry, *safe; struct batadv_tvlv_tt_data *tt_data; struct batadv_tvlv_tt_change *tt_change; int tt_diff_len, tt_change_len = 0; int tt_diff_entries_num = 0; int tt_diff_entries_count = 0; bool drop_changes = false; size_t tt_extra_len = 0; u16 tvlv_len; tt_diff_entries_num = READ_ONCE(bat_priv->tt.local_changes); tt_diff_len = batadv_tt_len(tt_diff_entries_num); /* if we have too many changes for one packet don't send any * and wait for the tt table request so we can reply with the full * (fragmented) table. * * The local change history should still be cleaned up so the next * TT round can start again with a clean state. */ if (tt_diff_len > bat_priv->mesh_iface->mtu) { tt_diff_len = 0; tt_diff_entries_num = 0; drop_changes = true; } tvlv_len = batadv_tt_prepare_tvlv_local_data(bat_priv, &tt_data, &tt_change, &tt_diff_len); if (!tvlv_len) return; tt_data->flags = BATADV_TT_OGM_DIFF; if (!drop_changes && tt_diff_len == 0) goto container_register; spin_lock_bh(&bat_priv->tt.changes_list_lock); WRITE_ONCE(bat_priv->tt.local_changes, 0); list_for_each_entry_safe(entry, safe, &bat_priv->tt.changes_list, list) { if (tt_diff_entries_count < tt_diff_entries_num) { memcpy(tt_change + tt_diff_entries_count, &entry->change, sizeof(struct batadv_tvlv_tt_change)); tt_diff_entries_count++; } list_del(&entry->list); kmem_cache_free(batadv_tt_change_cache, entry); } spin_unlock_bh(&bat_priv->tt.changes_list_lock); tt_extra_len = batadv_tt_len(tt_diff_entries_num - tt_diff_entries_count); /* Keep the buffer for possible tt_request */ spin_lock_bh(&bat_priv->tt.last_changeset_lock); kfree(bat_priv->tt.last_changeset); bat_priv->tt.last_changeset_len = 0; bat_priv->tt.last_changeset = NULL; tt_change_len = batadv_tt_len(tt_diff_entries_count); /* check whether this new OGM has no changes due to size problems */ if (tt_diff_entries_count > 0) { tt_diff_len -= tt_extra_len; /* if kmalloc() fails we will reply with the full table * instead of providing the diff */ bat_priv->tt.last_changeset = kzalloc(tt_diff_len, GFP_ATOMIC); if (bat_priv->tt.last_changeset) { memcpy(bat_priv->tt.last_changeset, tt_change, tt_change_len); bat_priv->tt.last_changeset_len = tt_diff_len; } } spin_unlock_bh(&bat_priv->tt.last_changeset_lock); /* Remove extra packet space for OGM */ tvlv_len -= tt_extra_len; container_register: batadv_tvlv_container_register(bat_priv, BATADV_TVLV_TT, 1, tt_data, tvlv_len); kfree(tt_data); } /** * batadv_tt_local_dump_entry() - Dump one TT local entry into a message * @msg :Netlink message to dump into * @portid: Port making netlink request * @cb: Control block containing additional options * @bat_priv: The bat priv with all the mesh interface information * @common: tt local & tt global common data * * Return: Error code, or 0 on success */ static int batadv_tt_local_dump_entry(struct sk_buff *msg, u32 portid, struct netlink_callback *cb, struct batadv_priv *bat_priv, struct batadv_tt_common_entry *common) { void *hdr; struct batadv_meshif_vlan *vlan; struct batadv_tt_local_entry *local; unsigned int last_seen_msecs; u32 crc; local = container_of(common, struct batadv_tt_local_entry, common); last_seen_msecs = jiffies_to_msecs(jiffies - local->last_seen); vlan = batadv_meshif_vlan_get(bat_priv, common->vid); if (!vlan) return 0; crc = vlan->tt.crc; batadv_meshif_vlan_put(vlan); hdr = genlmsg_put(msg, portid, cb->nlh->nlmsg_seq, &batadv_netlink_family, NLM_F_MULTI, BATADV_CMD_GET_TRANSTABLE_LOCAL); if (!hdr) return -ENOBUFS; genl_dump_check_consistent(cb, hdr); if (nla_put(msg, BATADV_ATTR_TT_ADDRESS, ETH_ALEN, common->addr) || nla_put_u32(msg, BATADV_ATTR_TT_CRC32, crc) || nla_put_u16(msg, BATADV_ATTR_TT_VID, common->vid) || nla_put_u32(msg, BATADV_ATTR_TT_FLAGS, common->flags)) goto nla_put_failure; if (!(common->flags & BATADV_TT_CLIENT_NOPURGE) && nla_put_u32(msg, BATADV_ATTR_LAST_SEEN_MSECS, last_seen_msecs)) goto nla_put_failure; genlmsg_end(msg, hdr); return 0; nla_put_failure: genlmsg_cancel(msg, hdr); return -EMSGSIZE; } /** * batadv_tt_local_dump_bucket() - Dump one TT local bucket into a message * @msg: Netlink message to dump into * @portid: Port making netlink request * @cb: Control block containing additional options * @bat_priv: The bat priv with all the mesh interface information * @hash: hash to dump * @bucket: bucket index to dump * @idx_s: Number of entries to skip * * Return: Error code, or 0 on success */ static int batadv_tt_local_dump_bucket(struct sk_buff *msg, u32 portid, struct netlink_callback *cb, struct batadv_priv *bat_priv, struct batadv_hashtable *hash, unsigned int bucket, int *idx_s) { struct batadv_tt_common_entry *common; int idx = 0; spin_lock_bh(&hash->list_locks[bucket]); cb->seq = atomic_read(&hash->generation) << 1 | 1; hlist_for_each_entry(common, &hash->table[bucket], hash_entry) { if (idx++ < *idx_s) continue; if (batadv_tt_local_dump_entry(msg, portid, cb, bat_priv, common)) { spin_unlock_bh(&hash->list_locks[bucket]); *idx_s = idx - 1; return -EMSGSIZE; } } spin_unlock_bh(&hash->list_locks[bucket]); *idx_s = 0; return 0; } /** * batadv_tt_local_dump() - Dump TT local entries into a message * @msg: Netlink message to dump into * @cb: Parameters from query * * Return: Error code, or 0 on success */ int batadv_tt_local_dump(struct sk_buff *msg, struct netlink_callback *cb) { struct net_device *mesh_iface; struct batadv_priv *bat_priv; struct batadv_hard_iface *primary_if = NULL; struct batadv_hashtable *hash; int ret; int bucket = cb->args[0]; int idx = cb->args[1]; int portid = NETLINK_CB(cb->skb).portid; mesh_iface = batadv_netlink_get_meshif(cb); if (IS_ERR(mesh_iface)) return PTR_ERR(mesh_iface); bat_priv = netdev_priv(mesh_iface); primary_if = batadv_primary_if_get_selected(bat_priv); if (!primary_if || primary_if->if_status != BATADV_IF_ACTIVE) { ret = -ENOENT; goto out; } hash = bat_priv->tt.local_hash; while (bucket < hash->size) { if (batadv_tt_local_dump_bucket(msg, portid, cb, bat_priv, hash, bucket, &idx)) break; bucket++; } ret = msg->len; out: batadv_hardif_put(primary_if); dev_put(mesh_iface); cb->args[0] = bucket; cb->args[1] = idx; return ret; } static void batadv_tt_local_set_pending(struct batadv_priv *bat_priv, struct batadv_tt_local_entry *tt_local_entry, u16 flags, const char *message) { batadv_tt_local_event(bat_priv, tt_local_entry, flags); /* The local client has to be marked as "pending to be removed" but has * to be kept in the table in order to send it in a full table * response issued before the net ttvn increment (consistency check) */ tt_local_entry->common.flags |= BATADV_TT_CLIENT_PENDING; batadv_dbg(BATADV_DBG_TT, bat_priv, "Local tt entry (%pM, vid: %d) pending to be removed: %s\n", tt_local_entry->common.addr, batadv_print_vid(tt_local_entry->common.vid), message); } /** * batadv_tt_local_remove() - logically remove an entry from the local table * @bat_priv: the bat priv with all the mesh interface information * @addr: the MAC address of the client to remove * @vid: VLAN identifier * @message: message to append to the log on deletion * @roaming: true if the deletion is due to a roaming event * * Return: the flags assigned to the local entry before being deleted */ u16 batadv_tt_local_remove(struct batadv_priv *bat_priv, const u8 *addr, unsigned short vid, const char *message, bool roaming) { struct batadv_tt_local_entry *tt_removed_entry; struct batadv_tt_local_entry *tt_local_entry; u16 flags, curr_flags = BATADV_NO_FLAGS; struct hlist_node *tt_removed_node; tt_local_entry = batadv_tt_local_hash_find(bat_priv, addr, vid); if (!tt_local_entry) goto out; curr_flags = tt_local_entry->common.flags; flags = BATADV_TT_CLIENT_DEL; /* if this global entry addition is due to a roaming, the node has to * mark the local entry as "roamed" in order to correctly reroute * packets later */ if (roaming) { flags |= BATADV_TT_CLIENT_ROAM; /* mark the local client as ROAMed */ tt_local_entry->common.flags |= BATADV_TT_CLIENT_ROAM; } if (!(tt_local_entry->common.flags & BATADV_TT_CLIENT_NEW)) { batadv_tt_local_set_pending(bat_priv, tt_local_entry, flags, message); goto out; } /* if this client has been added right now, it is possible to * immediately purge it */ batadv_tt_local_event(bat_priv, tt_local_entry, BATADV_TT_CLIENT_DEL); tt_removed_node = batadv_hash_remove(bat_priv->tt.local_hash, batadv_compare_tt, batadv_choose_tt, &tt_local_entry->common); if (!tt_removed_node) goto out; /* drop reference of remove hash entry */ tt_removed_entry = hlist_entry(tt_removed_node, struct batadv_tt_local_entry, common.hash_entry); batadv_tt_local_entry_put(tt_removed_entry); out: batadv_tt_local_entry_put(tt_local_entry); return curr_flags; } /** * batadv_tt_local_purge_list() - purge inactive tt local entries * @bat_priv: the bat priv with all the mesh interface information * @head: pointer to the list containing the local tt entries * @timeout: parameter deciding whether a given tt local entry is considered * inactive or not */ static void batadv_tt_local_purge_list(struct batadv_priv *bat_priv, struct hlist_head *head, int timeout) { struct batadv_tt_local_entry *tt_local_entry; struct batadv_tt_common_entry *tt_common_entry; struct hlist_node *node_tmp; hlist_for_each_entry_safe(tt_common_entry, node_tmp, head, hash_entry) { tt_local_entry = container_of(tt_common_entry, struct batadv_tt_local_entry, common); if (tt_local_entry->common.flags & BATADV_TT_CLIENT_NOPURGE) continue; /* entry already marked for deletion */ if (tt_local_entry->common.flags & BATADV_TT_CLIENT_PENDING) continue; if (!batadv_has_timed_out(tt_local_entry->last_seen, timeout)) continue; batadv_tt_local_set_pending(bat_priv, tt_local_entry, BATADV_TT_CLIENT_DEL, "timed out"); } } /** * batadv_tt_local_purge() - purge inactive tt local entries * @bat_priv: the bat priv with all the mesh interface information * @timeout: parameter deciding whether a given tt local entry is considered * inactive or not */ static void batadv_tt_local_purge(struct batadv_priv *bat_priv, int timeout) { struct batadv_hashtable *hash = bat_priv->tt.local_hash; struct hlist_head *head; spinlock_t *list_lock; /* protects write access to the hash lists */ u32 i; for (i = 0; i < hash->size; i++) { head = &hash->table[i]; list_lock = &hash->list_locks[i]; spin_lock_bh(list_lock); batadv_tt_local_purge_list(bat_priv, head, timeout); spin_unlock_bh(list_lock); } } static void batadv_tt_local_table_free(struct batadv_priv *bat_priv) { struct batadv_hashtable *hash; spinlock_t *list_lock; /* protects write access to the hash lists */ struct batadv_tt_common_entry *tt_common_entry; struct batadv_tt_local_entry *tt_local; struct hlist_node *node_tmp; struct hlist_head *head; u32 i; if (!bat_priv->tt.local_hash) return; hash = bat_priv->tt.local_hash; for (i = 0; i < hash->size; i++) { head = &hash->table[i]; list_lock = &hash->list_locks[i]; spin_lock_bh(list_lock); hlist_for_each_entry_safe(tt_common_entry, node_tmp, head, hash_entry) { hlist_del_rcu(&tt_common_entry->hash_entry); tt_local = container_of(tt_common_entry, struct batadv_tt_local_entry, common); batadv_tt_local_entry_put(tt_local); } spin_unlock_bh(list_lock); } batadv_hash_destroy(hash); bat_priv->tt.local_hash = NULL; } static int batadv_tt_global_init(struct batadv_priv *bat_priv) { if (bat_priv->tt.global_hash) return 0; bat_priv->tt.global_hash = batadv_hash_new(1024); if (!bat_priv->tt.global_hash) return -ENOMEM; batadv_hash_set_lock_class(bat_priv->tt.global_hash, &batadv_tt_global_hash_lock_class_key); return 0; } static void batadv_tt_changes_list_free(struct batadv_priv *bat_priv) { struct batadv_tt_change_node *entry, *safe; spin_lock_bh(&bat_priv->tt.changes_list_lock); list_for_each_entry_safe(entry, safe, &bat_priv->tt.changes_list, list) { list_del(&entry->list); kmem_cache_free(batadv_tt_change_cache, entry); } WRITE_ONCE(bat_priv->tt.local_changes, 0); spin_unlock_bh(&bat_priv->tt.changes_list_lock); } /** * batadv_tt_global_orig_entry_find() - find a TT orig_list_entry * @entry: the TT global entry where the orig_list_entry has to be * extracted from * @orig_node: the originator for which the orig_list_entry has to be found * * retrieve the orig_tt_list_entry belonging to orig_node from the * batadv_tt_global_entry list * * Return: it with an increased refcounter, NULL if not found */ static struct batadv_tt_orig_list_entry * batadv_tt_global_orig_entry_find(const struct batadv_tt_global_entry *entry, const struct batadv_orig_node *orig_node) { struct batadv_tt_orig_list_entry *tmp_orig_entry, *orig_entry = NULL; const struct hlist_head *head; rcu_read_lock(); head = &entry->orig_list; hlist_for_each_entry_rcu(tmp_orig_entry, head, list) { if (tmp_orig_entry->orig_node != orig_node) continue; if (!kref_get_unless_zero(&tmp_orig_entry->refcount)) continue; orig_entry = tmp_orig_entry; break; } rcu_read_unlock(); return orig_entry; } /** * batadv_tt_global_entry_has_orig() - check if a TT global entry is also * handled by a given originator * @entry: the TT global entry to check * @orig_node: the originator to search in the list * @flags: a pointer to store TT flags for the given @entry received * from @orig_node * * find out if an orig_node is already in the list of a tt_global_entry. * * Return: true if found, false otherwise */ static bool batadv_tt_global_entry_has_orig(const struct batadv_tt_global_entry *entry, const struct batadv_orig_node *orig_node, u8 *flags) { struct batadv_tt_orig_list_entry *orig_entry; bool found = false; orig_entry = batadv_tt_global_orig_entry_find(entry, orig_node); if (orig_entry) { found = true; if (flags) *flags = orig_entry->flags; batadv_tt_orig_list_entry_put(orig_entry); } return found; } /** * batadv_tt_global_sync_flags() - update TT sync flags * @tt_global: the TT global entry to update sync flags in * * Updates the sync flag bits in the tt_global flag attribute with a logical * OR of all sync flags from any of its TT orig entries. */ static void batadv_tt_global_sync_flags(struct batadv_tt_global_entry *tt_global) { struct batadv_tt_orig_list_entry *orig_entry; const struct hlist_head *head; u16 flags = BATADV_NO_FLAGS; rcu_read_lock(); head = &tt_global->orig_list; hlist_for_each_entry_rcu(orig_entry, head, list) flags |= orig_entry->flags; rcu_read_unlock(); flags |= tt_global->common.flags & (~BATADV_TT_SYNC_MASK); tt_global->common.flags = flags; } /** * batadv_tt_global_orig_entry_add() - add or update a TT orig entry * @tt_global: the TT global entry to add an orig entry in * @orig_node: the originator to add an orig entry for * @ttvn: translation table version number of this changeset * @flags: TT sync flags */ static void batadv_tt_global_orig_entry_add(struct batadv_tt_global_entry *tt_global, struct batadv_orig_node *orig_node, int ttvn, u8 flags) { struct batadv_tt_orig_list_entry *orig_entry; spin_lock_bh(&tt_global->list_lock); orig_entry = batadv_tt_global_orig_entry_find(tt_global, orig_node); if (orig_entry) { /* refresh the ttvn: the current value could be a bogus one that * was added during a "temporary client detection" */ orig_entry->ttvn = ttvn; orig_entry->flags = flags; goto sync_flags; } orig_entry = kmem_cache_zalloc(batadv_tt_orig_cache, GFP_ATOMIC); if (!orig_entry) goto out; INIT_HLIST_NODE(&orig_entry->list); kref_get(&orig_node->refcount); batadv_tt_global_size_inc(orig_node, tt_global->common.vid); orig_entry->orig_node = orig_node; orig_entry->ttvn = ttvn; orig_entry->flags = flags; kref_init(&orig_entry->refcount); kref_get(&orig_entry->refcount); hlist_add_head_rcu(&orig_entry->list, &tt_global->orig_list); atomic_inc(&tt_global->orig_list_count); sync_flags: batadv_tt_global_sync_flags(tt_global); out: batadv_tt_orig_list_entry_put(orig_entry); spin_unlock_bh(&tt_global->list_lock); } /** * batadv_tt_global_add() - add a new TT global entry or update an existing one * @bat_priv: the bat priv with all the mesh interface information * @orig_node: the originator announcing the client * @tt_addr: the mac address of the non-mesh client * @vid: VLAN identifier * @flags: TT flags that have to be set for this non-mesh client * @ttvn: the tt version number ever announcing this non-mesh client * * Add a new TT global entry for the given originator. If the entry already * exists add a new reference to the given originator (a global entry can have * references to multiple originators) and adjust the flags attribute to reflect * the function argument. * If a TT local entry exists for this non-mesh client remove it. * * The caller must hold the orig_node refcount. * * Return: true if the new entry has been added, false otherwise */ static bool batadv_tt_global_add(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node, const unsigned char *tt_addr, unsigned short vid, u16 flags, u8 ttvn) { struct batadv_tt_global_entry *tt_global_entry; struct batadv_tt_local_entry *tt_local_entry; bool ret = false; int hash_added; struct batadv_tt_common_entry *common; u16 local_flags; /* ignore global entries from backbone nodes */ if (batadv_bla_is_backbone_gw_orig(bat_priv, orig_node->orig, vid)) return true; tt_global_entry = batadv_tt_global_hash_find(bat_priv, tt_addr, vid); tt_local_entry = batadv_tt_local_hash_find(bat_priv, tt_addr, vid); /* if the node already has a local client for this entry, it has to wait * for a roaming advertisement instead of manually messing up the global * table */ if ((flags & BATADV_TT_CLIENT_TEMP) && tt_local_entry && !(tt_local_entry->common.flags & BATADV_TT_CLIENT_NEW)) goto out; if (!tt_global_entry) { tt_global_entry = kmem_cache_zalloc(batadv_tg_cache, GFP_ATOMIC); if (!tt_global_entry) goto out; common = &tt_global_entry->common; ether_addr_copy(common->addr, tt_addr); common->vid = vid; if (!is_multicast_ether_addr(common->addr)) common->flags = flags & (~BATADV_TT_SYNC_MASK); tt_global_entry->roam_at = 0; /* node must store current time in case of roaming. This is * needed to purge this entry out on timeout (if nobody claims * it) */ if (flags & BATADV_TT_CLIENT_ROAM) tt_global_entry->roam_at = jiffies; kref_init(&common->refcount); common->added_at = jiffies; INIT_HLIST_HEAD(&tt_global_entry->orig_list); atomic_set(&tt_global_entry->orig_list_count, 0); spin_lock_init(&tt_global_entry->list_lock); kref_get(&common->refcount); hash_added = batadv_hash_add(bat_priv->tt.global_hash, batadv_compare_tt, batadv_choose_tt, common, &common->hash_entry); if (unlikely(hash_added != 0)) { /* remove the reference for the hash */ batadv_tt_global_entry_put(tt_global_entry); goto out_remove; } } else { common = &tt_global_entry->common; /* If there is already a global entry, we can use this one for * our processing. * But if we are trying to add a temporary client then here are * two options at this point: * 1) the global client is not a temporary client: the global * client has to be left as it is, temporary information * should never override any already known client state * 2) the global client is a temporary client: purge the * originator list and add the new one orig_entry */ if (flags & BATADV_TT_CLIENT_TEMP) { if (!(common->flags & BATADV_TT_CLIENT_TEMP)) goto out; if (batadv_tt_global_entry_has_orig(tt_global_entry, orig_node, NULL)) goto out_remove; batadv_tt_global_del_orig_list(tt_global_entry); goto add_orig_entry; } /* if the client was temporary added before receiving the first * OGM announcing it, we have to clear the TEMP flag. Also, * remove the previous temporary orig node and re-add it * if required. If the orig entry changed, the new one which * is a non-temporary entry is preferred. */ if (common->flags & BATADV_TT_CLIENT_TEMP) { batadv_tt_global_del_orig_list(tt_global_entry); common->flags &= ~BATADV_TT_CLIENT_TEMP; } /* the change can carry possible "attribute" flags like the * TT_CLIENT_TEMP, therefore they have to be copied in the * client entry */ if (!is_multicast_ether_addr(common->addr)) common->flags |= flags & (~BATADV_TT_SYNC_MASK); /* If there is the BATADV_TT_CLIENT_ROAM flag set, there is only * one originator left in the list and we previously received a * delete + roaming change for this originator. * * We should first delete the old originator before adding the * new one. */ if (common->flags & BATADV_TT_CLIENT_ROAM) { batadv_tt_global_del_orig_list(tt_global_entry); common->flags &= ~BATADV_TT_CLIENT_ROAM; tt_global_entry->roam_at = 0; } } add_orig_entry: /* add the new orig_entry (if needed) or update it */ batadv_tt_global_orig_entry_add(tt_global_entry, orig_node, ttvn, flags & BATADV_TT_SYNC_MASK); batadv_dbg(BATADV_DBG_TT, bat_priv, "Creating new global tt entry: %pM (vid: %d, via %pM)\n", common->addr, batadv_print_vid(common->vid), orig_node->orig); ret = true; out_remove: /* Do not remove multicast addresses from the local hash on * global additions */ if (is_multicast_ether_addr(tt_addr)) goto out; /* remove address from local hash if present */ local_flags = batadv_tt_local_remove(bat_priv, tt_addr, vid, "global tt received", flags & BATADV_TT_CLIENT_ROAM); tt_global_entry->common.flags |= local_flags & BATADV_TT_CLIENT_WIFI; if (!(flags & BATADV_TT_CLIENT_ROAM)) /* this is a normal global add. Therefore the client is not in a * roaming state anymore. */ tt_global_entry->common.flags &= ~BATADV_TT_CLIENT_ROAM; out: batadv_tt_global_entry_put(tt_global_entry); batadv_tt_local_entry_put(tt_local_entry); return ret; } /** * batadv_transtable_best_orig() - Get best originator list entry from tt entry * @bat_priv: the bat priv with all the mesh interface information * @tt_global_entry: global translation table entry to be analyzed * * This function assumes the caller holds rcu_read_lock(). * Return: best originator list entry or NULL on errors. */ static struct batadv_tt_orig_list_entry * batadv_transtable_best_orig(struct batadv_priv *bat_priv, struct batadv_tt_global_entry *tt_global_entry) { struct batadv_neigh_node *router, *best_router = NULL; struct batadv_algo_ops *bao = bat_priv->algo_ops; struct hlist_head *head; struct batadv_tt_orig_list_entry *orig_entry, *best_entry = NULL; head = &tt_global_entry->orig_list; hlist_for_each_entry_rcu(orig_entry, head, list) { router = batadv_orig_router_get(orig_entry->orig_node, BATADV_IF_DEFAULT); if (!router) continue; if (best_router && bao->neigh.cmp(router, BATADV_IF_DEFAULT, best_router, BATADV_IF_DEFAULT) <= 0) { batadv_neigh_node_put(router); continue; } /* release the refcount for the "old" best */ batadv_neigh_node_put(best_router); best_entry = orig_entry; best_router = router; } batadv_neigh_node_put(best_router); return best_entry; } /** * batadv_tt_global_dump_subentry() - Dump all TT local entries into a message * @msg: Netlink message to dump into * @portid: Port making netlink request * @seq: Sequence number of netlink message * @common: tt local & tt global common data * @orig: Originator node announcing a non-mesh client * @best: Is the best originator for the TT entry * * Return: Error code, or 0 on success */ static int batadv_tt_global_dump_subentry(struct sk_buff *msg, u32 portid, u32 seq, struct batadv_tt_common_entry *common, struct batadv_tt_orig_list_entry *orig, bool best) { u16 flags = (common->flags & (~BATADV_TT_SYNC_MASK)) | orig->flags; void *hdr; struct batadv_orig_node_vlan *vlan; u8 last_ttvn; u32 crc; vlan = batadv_orig_node_vlan_get(orig->orig_node, common->vid); if (!vlan) return 0; crc = vlan->tt.crc; batadv_orig_node_vlan_put(vlan); hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family, NLM_F_MULTI, BATADV_CMD_GET_TRANSTABLE_GLOBAL); if (!hdr) return -ENOBUFS; last_ttvn = atomic_read(&orig->orig_node->last_ttvn); if (nla_put(msg, BATADV_ATTR_TT_ADDRESS, ETH_ALEN, common->addr) || nla_put(msg, BATADV_ATTR_ORIG_ADDRESS, ETH_ALEN, orig->orig_node->orig) || nla_put_u8(msg, BATADV_ATTR_TT_TTVN, orig->ttvn) || nla_put_u8(msg, BATADV_ATTR_TT_LAST_TTVN, last_ttvn) || nla_put_u32(msg, BATADV_ATTR_TT_CRC32, crc) || nla_put_u16(msg, BATADV_ATTR_TT_VID, common->vid) || nla_put_u32(msg, BATADV_ATTR_TT_FLAGS, flags)) goto nla_put_failure; if (best && nla_put_flag(msg, BATADV_ATTR_FLAG_BEST)) goto nla_put_failure; genlmsg_end(msg, hdr); return 0; nla_put_failure: genlmsg_cancel(msg, hdr); return -EMSGSIZE; } /** * batadv_tt_global_dump_entry() - Dump one TT global entry into a message * @msg: Netlink message to dump into * @portid: Port making netlink request * @seq: Sequence number of netlink message * @bat_priv: The bat priv with all the mesh interface information * @common: tt local & tt global common data * @sub_s: Number of entries to skip * * This function assumes the caller holds rcu_read_lock(). * * Return: Error code, or 0 on success */ static int batadv_tt_global_dump_entry(struct sk_buff *msg, u32 portid, u32 seq, struct batadv_priv *bat_priv, struct batadv_tt_common_entry *common, int *sub_s) { struct batadv_tt_orig_list_entry *orig_entry, *best_entry; struct batadv_tt_global_entry *global; struct hlist_head *head; int sub = 0; bool best; global = container_of(common, struct batadv_tt_global_entry, common); best_entry = batadv_transtable_best_orig(bat_priv, global); head = &global->orig_list; hlist_for_each_entry_rcu(orig_entry, head, list) { if (sub++ < *sub_s) continue; best = (orig_entry == best_entry); if (batadv_tt_global_dump_subentry(msg, portid, seq, common, orig_entry, best)) { *sub_s = sub - 1; return -EMSGSIZE; } } *sub_s = 0; return 0; } /** * batadv_tt_global_dump_bucket() - Dump one TT local bucket into a message * @msg: Netlink message to dump into * @portid: Port making netlink request * @seq: Sequence number of netlink message * @bat_priv: The bat priv with all the mesh interface information * @head: Pointer to the list containing the global tt entries * @idx_s: Number of entries to skip * @sub: Number of entries to skip * * Return: Error code, or 0 on success */ static int batadv_tt_global_dump_bucket(struct sk_buff *msg, u32 portid, u32 seq, struct batadv_priv *bat_priv, struct hlist_head *head, int *idx_s, int *sub) { struct batadv_tt_common_entry *common; int idx = 0; rcu_read_lock(); hlist_for_each_entry_rcu(common, head, hash_entry) { if (idx++ < *idx_s) continue; if (batadv_tt_global_dump_entry(msg, portid, seq, bat_priv, common, sub)) { rcu_read_unlock(); *idx_s = idx - 1; return -EMSGSIZE; } } rcu_read_unlock(); *idx_s = 0; *sub = 0; return 0; } /** * batadv_tt_global_dump() - Dump TT global entries into a message * @msg: Netlink message to dump into * @cb: Parameters from query * * Return: Error code, or length of message on success */ int batadv_tt_global_dump(struct sk_buff *msg, struct netlink_callback *cb) { struct net_device *mesh_iface; struct batadv_priv *bat_priv; struct batadv_hard_iface *primary_if = NULL; struct batadv_hashtable *hash; struct hlist_head *head; int ret; int bucket = cb->args[0]; int idx = cb->args[1]; int sub = cb->args[2]; int portid = NETLINK_CB(cb->skb).portid; mesh_iface = batadv_netlink_get_meshif(cb); if (IS_ERR(mesh_iface)) return PTR_ERR(mesh_iface); bat_priv = netdev_priv(mesh_iface); primary_if = batadv_primary_if_get_selected(bat_priv); if (!primary_if || primary_if->if_status != BATADV_IF_ACTIVE) { ret = -ENOENT; goto out; } hash = bat_priv->tt.global_hash; while (bucket < hash->size) { head = &hash->table[bucket]; if (batadv_tt_global_dump_bucket(msg, portid, cb->nlh->nlmsg_seq, bat_priv, head, &idx, &sub)) break; bucket++; } ret = msg->len; out: batadv_hardif_put(primary_if); dev_put(mesh_iface); cb->args[0] = bucket; cb->args[1] = idx; cb->args[2] = sub; return ret; } /** * _batadv_tt_global_del_orig_entry() - remove and free an orig_entry * @tt_global_entry: the global entry to remove the orig_entry from * @orig_entry: the orig entry to remove and free * * Remove an orig_entry from its list in the given tt_global_entry and * free this orig_entry afterwards. * * Caller must hold tt_global_entry->list_lock and ensure orig_entry->list is * part of a list. */ static void _batadv_tt_global_del_orig_entry(struct batadv_tt_global_entry *tt_global_entry, struct batadv_tt_orig_list_entry *orig_entry) { lockdep_assert_held(&tt_global_entry->list_lock); batadv_tt_global_size_dec(orig_entry->orig_node, tt_global_entry->common.vid); atomic_dec(&tt_global_entry->orig_list_count); /* requires holding tt_global_entry->list_lock and orig_entry->list * being part of a list */ hlist_del_rcu(&orig_entry->list); batadv_tt_orig_list_entry_put(orig_entry); } /* deletes the orig list of a tt_global_entry */ static void batadv_tt_global_del_orig_list(struct batadv_tt_global_entry *tt_global_entry) { struct hlist_head *head; struct hlist_node *safe; struct batadv_tt_orig_list_entry *orig_entry; spin_lock_bh(&tt_global_entry->list_lock); head = &tt_global_entry->orig_list; hlist_for_each_entry_safe(orig_entry, safe, head, list) _batadv_tt_global_del_orig_entry(tt_global_entry, orig_entry); spin_unlock_bh(&tt_global_entry->list_lock); } /** * batadv_tt_global_del_orig_node() - remove orig_node from a global tt entry * @bat_priv: the bat priv with all the mesh interface information * @tt_global_entry: the global entry to remove the orig_node from * @orig_node: the originator announcing the client * @message: message to append to the log on deletion * * Remove the given orig_node and its according orig_entry from the given * global tt entry. */ static void batadv_tt_global_del_orig_node(struct batadv_priv *bat_priv, struct batadv_tt_global_entry *tt_global_entry, struct batadv_orig_node *orig_node, const char *message) { struct hlist_head *head; struct hlist_node *safe; struct batadv_tt_orig_list_entry *orig_entry; unsigned short vid; spin_lock_bh(&tt_global_entry->list_lock); head = &tt_global_entry->orig_list; hlist_for_each_entry_safe(orig_entry, safe, head, list) { if (orig_entry->orig_node == orig_node) { vid = tt_global_entry->common.vid; batadv_dbg(BATADV_DBG_TT, bat_priv, "Deleting %pM from global tt entry %pM (vid: %d): %s\n", orig_node->orig, tt_global_entry->common.addr, batadv_print_vid(vid), message); _batadv_tt_global_del_orig_entry(tt_global_entry, orig_entry); } } spin_unlock_bh(&tt_global_entry->list_lock); } /* If the client is to be deleted, we check if it is the last origantor entry * within tt_global entry. If yes, we set the BATADV_TT_CLIENT_ROAM flag and the * timer, otherwise we simply remove the originator scheduled for deletion. */ static void batadv_tt_global_del_roaming(struct batadv_priv *bat_priv, struct batadv_tt_global_entry *tt_global_entry, struct batadv_orig_node *orig_node, const char *message) { bool last_entry = true; struct hlist_head *head; struct batadv_tt_orig_list_entry *orig_entry; /* no local entry exists, case 1: * Check if this is the last one or if other entries exist. */ rcu_read_lock(); head = &tt_global_entry->orig_list; hlist_for_each_entry_rcu(orig_entry, head, list) { if (orig_entry->orig_node != orig_node) { last_entry = false; break; } } rcu_read_unlock(); if (last_entry) { /* its the last one, mark for roaming. */ tt_global_entry->common.flags |= BATADV_TT_CLIENT_ROAM; tt_global_entry->roam_at = jiffies; } else { /* there is another entry, we can simply delete this * one and can still use the other one. */ batadv_tt_global_del_orig_node(bat_priv, tt_global_entry, orig_node, message); } } /** * batadv_tt_global_del() - remove a client from the global table * @bat_priv: the bat priv with all the mesh interface information * @orig_node: an originator serving this client * @addr: the mac address of the client * @vid: VLAN identifier * @message: a message explaining the reason for deleting the client to print * for debugging purpose * @roaming: true if the deletion has been triggered by a roaming event */ static void batadv_tt_global_del(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node, const unsigned char *addr, unsigned short vid, const char *message, bool roaming) { struct batadv_tt_global_entry *tt_global_entry; struct batadv_tt_local_entry *local_entry = NULL; tt_global_entry = batadv_tt_global_hash_find(bat_priv, addr, vid); if (!tt_global_entry) goto out; if (!roaming) { batadv_tt_global_del_orig_node(bat_priv, tt_global_entry, orig_node, message); if (hlist_empty(&tt_global_entry->orig_list)) batadv_tt_global_free(bat_priv, tt_global_entry, message); goto out; } /* if we are deleting a global entry due to a roam * event, there are two possibilities: * 1) the client roamed from node A to node B => if there * is only one originator left for this client, we mark * it with BATADV_TT_CLIENT_ROAM, we start a timer and we * wait for node B to claim it. In case of timeout * the entry is purged. * * If there are other originators left, we directly delete * the originator. * 2) the client roamed to us => we can directly delete * the global entry, since it is useless now. */ local_entry = batadv_tt_local_hash_find(bat_priv, tt_global_entry->common.addr, vid); if (local_entry) { /* local entry exists, case 2: client roamed to us. */ batadv_tt_global_del_orig_list(tt_global_entry); batadv_tt_global_free(bat_priv, tt_global_entry, message); } else { /* no local entry exists, case 1: check for roaming */ batadv_tt_global_del_roaming(bat_priv, tt_global_entry, orig_node, message); } out: batadv_tt_global_entry_put(tt_global_entry); batadv_tt_local_entry_put(local_entry); } /** * batadv_tt_global_del_orig() - remove all the TT global entries belonging to * the given originator matching the provided vid * @bat_priv: the bat priv with all the mesh interface information * @orig_node: the originator owning the entries to remove * @match_vid: the VLAN identifier to match. If negative all the entries will be * removed * @message: debug message to print as "reason" */ void batadv_tt_global_del_orig(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node, s32 match_vid, const char *message) { struct batadv_tt_global_entry *tt_global; struct batadv_tt_common_entry *tt_common_entry; u32 i; struct batadv_hashtable *hash = bat_priv->tt.global_hash; struct hlist_node *safe; struct hlist_head *head; spinlock_t *list_lock; /* protects write access to the hash lists */ unsigned short vid; if (!hash) return; for (i = 0; i < hash->size; i++) { head = &hash->table[i]; list_lock = &hash->list_locks[i]; spin_lock_bh(list_lock); hlist_for_each_entry_safe(tt_common_entry, safe, head, hash_entry) { /* remove only matching entries */ if (match_vid >= 0 && tt_common_entry->vid != match_vid) continue; tt_global = container_of(tt_common_entry, struct batadv_tt_global_entry, common); batadv_tt_global_del_orig_node(bat_priv, tt_global, orig_node, message); if (hlist_empty(&tt_global->orig_list)) { vid = tt_global->common.vid; batadv_dbg(BATADV_DBG_TT, bat_priv, "Deleting global tt entry %pM (vid: %d): %s\n", tt_global->common.addr, batadv_print_vid(vid), message); hlist_del_rcu(&tt_common_entry->hash_entry); batadv_tt_global_entry_put(tt_global); } } spin_unlock_bh(list_lock); } clear_bit(BATADV_ORIG_CAPA_HAS_TT, &orig_node->capa_initialized); } static bool batadv_tt_global_to_purge(struct batadv_tt_global_entry *tt_global, char **msg) { bool purge = false; unsigned long roam_timeout = BATADV_TT_CLIENT_ROAM_TIMEOUT; unsigned long temp_timeout = BATADV_TT_CLIENT_TEMP_TIMEOUT; if ((tt_global->common.flags & BATADV_TT_CLIENT_ROAM) && batadv_has_timed_out(tt_global->roam_at, roam_timeout)) { purge = true; *msg = "Roaming timeout\n"; } if ((tt_global->common.flags & BATADV_TT_CLIENT_TEMP) && batadv_has_timed_out(tt_global->common.added_at, temp_timeout)) { purge = true; *msg = "Temporary client timeout\n"; } return purge; } static void batadv_tt_global_purge(struct batadv_priv *bat_priv) { struct batadv_hashtable *hash = bat_priv->tt.global_hash; struct hlist_head *head; struct hlist_node *node_tmp; spinlock_t *list_lock; /* protects write access to the hash lists */ u32 i; char *msg = NULL; struct batadv_tt_common_entry *tt_common; struct batadv_tt_global_entry *tt_global; for (i = 0; i < hash->size; i++) { head = &hash->table[i]; list_lock = &hash->list_locks[i]; spin_lock_bh(list_lock); hlist_for_each_entry_safe(tt_common, node_tmp, head, hash_entry) { tt_global = container_of(tt_common, struct batadv_tt_global_entry, common); if (!batadv_tt_global_to_purge(tt_global, &msg)) continue; batadv_dbg(BATADV_DBG_TT, bat_priv, "Deleting global tt entry %pM (vid: %d): %s\n", tt_global->common.addr, batadv_print_vid(tt_global->common.vid), msg); hlist_del_rcu(&tt_common->hash_entry); batadv_tt_global_entry_put(tt_global); } spin_unlock_bh(list_lock); } } static void batadv_tt_global_table_free(struct batadv_priv *bat_priv) { struct batadv_hashtable *hash; spinlock_t *list_lock; /* protects write access to the hash lists */ struct batadv_tt_common_entry *tt_common_entry; struct batadv_tt_global_entry *tt_global; struct hlist_node *node_tmp; struct hlist_head *head; u32 i; if (!bat_priv->tt.global_hash) return; hash = bat_priv->tt.global_hash; for (i = 0; i < hash->size; i++) { head = &hash->table[i]; list_lock = &hash->list_locks[i]; spin_lock_bh(list_lock); hlist_for_each_entry_safe(tt_common_entry, node_tmp, head, hash_entry) { hlist_del_rcu(&tt_common_entry->hash_entry); tt_global = container_of(tt_common_entry, struct batadv_tt_global_entry, common); batadv_tt_global_entry_put(tt_global); } spin_unlock_bh(list_lock); } batadv_hash_destroy(hash); bat_priv->tt.global_hash = NULL; } static bool _batadv_is_ap_isolated(struct batadv_tt_local_entry *tt_local_entry, struct batadv_tt_global_entry *tt_global_entry) { if (tt_local_entry->common.flags & BATADV_TT_CLIENT_WIFI && tt_global_entry->common.flags & BATADV_TT_CLIENT_WIFI) return true; /* check if the two clients are marked as isolated */ if (tt_local_entry->common.flags & BATADV_TT_CLIENT_ISOLA && tt_global_entry->common.flags & BATADV_TT_CLIENT_ISOLA) return true; return false; } /** * batadv_transtable_search() - get the mesh destination for a given client * @bat_priv: the bat priv with all the mesh interface information * @src: mac address of the source client * @addr: mac address of the destination client * @vid: VLAN identifier * * Return: a pointer to the originator that was selected as destination in the * mesh for contacting the client 'addr', NULL otherwise. * In case of multiple originators serving the same client, the function returns * the best one (best in terms of metric towards the destination node). * * If the two clients are AP isolated the function returns NULL. */ struct batadv_orig_node *batadv_transtable_search(struct batadv_priv *bat_priv, const u8 *src, const u8 *addr, unsigned short vid) { struct batadv_tt_local_entry *tt_local_entry = NULL; struct batadv_tt_global_entry *tt_global_entry = NULL; struct batadv_orig_node *orig_node = NULL; struct batadv_tt_orig_list_entry *best_entry; if (src && batadv_vlan_ap_isola_get(bat_priv, vid)) { tt_local_entry = batadv_tt_local_hash_find(bat_priv, src, vid); if (!tt_local_entry || (tt_local_entry->common.flags & BATADV_TT_CLIENT_PENDING)) goto out; } tt_global_entry = batadv_tt_global_hash_find(bat_priv, addr, vid); if (!tt_global_entry) goto out; /* check whether the clients should not communicate due to AP * isolation */ if (tt_local_entry && _batadv_is_ap_isolated(tt_local_entry, tt_global_entry)) goto out; rcu_read_lock(); best_entry = batadv_transtable_best_orig(bat_priv, tt_global_entry); /* found anything? */ if (best_entry) orig_node = best_entry->orig_node; if (orig_node && !kref_get_unless_zero(&orig_node->refcount)) orig_node = NULL; rcu_read_unlock(); out: batadv_tt_global_entry_put(tt_global_entry); batadv_tt_local_entry_put(tt_local_entry); return orig_node; } /** * batadv_tt_global_crc() - calculates the checksum of the local table belonging * to the given orig_node * @bat_priv: the bat priv with all the mesh interface information * @orig_node: originator for which the CRC should be computed * @vid: VLAN identifier for which the CRC32 has to be computed * * This function computes the checksum for the global table corresponding to a * specific originator. In particular, the checksum is computed as follows: For * each client connected to the originator the CRC32C of the MAC address and the * VID is computed and then all the CRC32Cs of the various clients are xor'ed * together. * * The idea behind is that CRC32C should be used as much as possible in order to * produce a unique hash of the table, but since the order which is used to feed * the CRC32C function affects the result and since every node in the network * probably sorts the clients differently, the hash function cannot be directly * computed over the entire table. Hence the CRC32C is used only on * the single client entry, while all the results are then xor'ed together * because the XOR operation can combine them all while trying to reduce the * noise as much as possible. * * Return: the checksum of the global table of a given originator. */ static u32 batadv_tt_global_crc(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node, unsigned short vid) { struct batadv_hashtable *hash = bat_priv->tt.global_hash; struct batadv_tt_orig_list_entry *tt_orig; struct batadv_tt_common_entry *tt_common; struct batadv_tt_global_entry *tt_global; struct hlist_head *head; u32 i, crc_tmp, crc = 0; u8 flags; __be16 tmp_vid; for (i = 0; i < hash->size; i++) { head = &hash->table[i]; rcu_read_lock(); hlist_for_each_entry_rcu(tt_common, head, hash_entry) { tt_global = container_of(tt_common, struct batadv_tt_global_entry, common); /* compute the CRC only for entries belonging to the * VLAN identified by the vid passed as parameter */ if (tt_common->vid != vid) continue; /* Roaming clients are in the global table for * consistency only. They don't have to be * taken into account while computing the * global crc */ if (tt_common->flags & BATADV_TT_CLIENT_ROAM) continue; /* Temporary clients have not been announced yet, so * they have to be skipped while computing the global * crc */ if (tt_common->flags & BATADV_TT_CLIENT_TEMP) continue; /* find out if this global entry is announced by this * originator */ tt_orig = batadv_tt_global_orig_entry_find(tt_global, orig_node); if (!tt_orig) continue; /* use network order to read the VID: this ensures that * every node reads the bytes in the same order. */ tmp_vid = htons(tt_common->vid); crc_tmp = crc32c(0, &tmp_vid, sizeof(tmp_vid)); /* compute the CRC on flags that have to be kept in sync * among nodes */ flags = tt_orig->flags; crc_tmp = crc32c(crc_tmp, &flags, sizeof(flags)); crc ^= crc32c(crc_tmp, tt_common->addr, ETH_ALEN); batadv_tt_orig_list_entry_put(tt_orig); } rcu_read_unlock(); } return crc; } /** * batadv_tt_local_crc() - calculates the checksum of the local table * @bat_priv: the bat priv with all the mesh interface information * @vid: VLAN identifier for which the CRC32 has to be computed * * For details about the computation, please refer to the documentation for * batadv_tt_global_crc(). * * Return: the checksum of the local table */ static u32 batadv_tt_local_crc(struct batadv_priv *bat_priv, unsigned short vid) { struct batadv_hashtable *hash = bat_priv->tt.local_hash; struct batadv_tt_common_entry *tt_common; struct hlist_head *head; u32 i, crc_tmp, crc = 0; u8 flags; __be16 tmp_vid; for (i = 0; i < hash->size; i++) { head = &hash->table[i]; rcu_read_lock(); hlist_for_each_entry_rcu(tt_common, head, hash_entry) { /* compute the CRC only for entries belonging to the * VLAN identified by vid */ if (tt_common->vid != vid) continue; /* not yet committed clients have not to be taken into * account while computing the CRC */ if (tt_common->flags & BATADV_TT_CLIENT_NEW) continue; /* use network order to read the VID: this ensures that * every node reads the bytes in the same order. */ tmp_vid = htons(tt_common->vid); crc_tmp = crc32c(0, &tmp_vid, sizeof(tmp_vid)); /* compute the CRC on flags that have to be kept in sync * among nodes */ flags = tt_common->flags & BATADV_TT_SYNC_MASK; crc_tmp = crc32c(crc_tmp, &flags, sizeof(flags)); crc ^= crc32c(crc_tmp, tt_common->addr, ETH_ALEN); } rcu_read_unlock(); } return crc; } /** * batadv_tt_req_node_release() - free tt_req node entry * @ref: kref pointer of the tt req_node entry */ static void batadv_tt_req_node_release(struct kref *ref) { struct batadv_tt_req_node *tt_req_node; tt_req_node = container_of(ref, struct batadv_tt_req_node, refcount); kmem_cache_free(batadv_tt_req_cache, tt_req_node); } /** * batadv_tt_req_node_put() - decrement the tt_req_node refcounter and * possibly release it * @tt_req_node: tt_req_node to be free'd */ static void batadv_tt_req_node_put(struct batadv_tt_req_node *tt_req_node) { if (!tt_req_node) return; kref_put(&tt_req_node->refcount, batadv_tt_req_node_release); } static void batadv_tt_req_list_free(struct batadv_priv *bat_priv) { struct batadv_tt_req_node *node; struct hlist_node *safe; spin_lock_bh(&bat_priv->tt.req_list_lock); hlist_for_each_entry_safe(node, safe, &bat_priv->tt.req_list, list) { hlist_del_init(&node->list); batadv_tt_req_node_put(node); } spin_unlock_bh(&bat_priv->tt.req_list_lock); } static void batadv_tt_save_orig_buffer(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node, const void *tt_buff, u16 tt_buff_len) { /* Replace the old buffer only if I received something in the * last OGM (the OGM could carry no changes) */ spin_lock_bh(&orig_node->tt_buff_lock); if (tt_buff_len > 0) { kfree(orig_node->tt_buff); orig_node->tt_buff_len = 0; orig_node->tt_buff = kmalloc(tt_buff_len, GFP_ATOMIC); if (orig_node->tt_buff) { memcpy(orig_node->tt_buff, tt_buff, tt_buff_len); orig_node->tt_buff_len = tt_buff_len; } } spin_unlock_bh(&orig_node->tt_buff_lock); } static void batadv_tt_req_purge(struct batadv_priv *bat_priv) { struct batadv_tt_req_node *node; struct hlist_node *safe; spin_lock_bh(&bat_priv->tt.req_list_lock); hlist_for_each_entry_safe(node, safe, &bat_priv->tt.req_list, list) { if (batadv_has_timed_out(node->issued_at, BATADV_TT_REQUEST_TIMEOUT)) { hlist_del_init(&node->list); batadv_tt_req_node_put(node); } } spin_unlock_bh(&bat_priv->tt.req_list_lock); } /** * batadv_tt_req_node_new() - search and possibly create a tt_req_node object * @bat_priv: the bat priv with all the mesh interface information * @orig_node: orig node this request is being issued for * * Return: the pointer to the new tt_req_node struct if no request * has already been issued for this orig_node, NULL otherwise. */ static struct batadv_tt_req_node * batadv_tt_req_node_new(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node) { struct batadv_tt_req_node *tt_req_node_tmp, *tt_req_node = NULL; spin_lock_bh(&bat_priv->tt.req_list_lock); hlist_for_each_entry(tt_req_node_tmp, &bat_priv->tt.req_list, list) { if (batadv_compare_eth(tt_req_node_tmp, orig_node) && !batadv_has_timed_out(tt_req_node_tmp->issued_at, BATADV_TT_REQUEST_TIMEOUT)) goto unlock; } tt_req_node = kmem_cache_alloc(batadv_tt_req_cache, GFP_ATOMIC); if (!tt_req_node) goto unlock; kref_init(&tt_req_node->refcount); ether_addr_copy(tt_req_node->addr, orig_node->orig); tt_req_node->issued_at = jiffies; kref_get(&tt_req_node->refcount); hlist_add_head(&tt_req_node->list, &bat_priv->tt.req_list); unlock: spin_unlock_bh(&bat_priv->tt.req_list_lock); return tt_req_node; } /** * batadv_tt_local_valid() - verify local tt entry and get flags * @entry_ptr: to be checked local tt entry * @data_ptr: not used but definition required to satisfy the callback prototype * @flags: a pointer to store TT flags for this client to * * Checks the validity of the given local TT entry. If it is, then the provided * flags pointer is updated. * * Return: true if the entry is a valid, false otherwise. */ static bool batadv_tt_local_valid(const void *entry_ptr, const void *data_ptr, u8 *flags) { const struct batadv_tt_common_entry *tt_common_entry = entry_ptr; if (tt_common_entry->flags & BATADV_TT_CLIENT_NEW) return false; if (flags) *flags = tt_common_entry->flags; return true; } /** * batadv_tt_global_valid() - verify global tt entry and get flags * @entry_ptr: to be checked global tt entry * @data_ptr: an orig_node object (may be NULL) * @flags: a pointer to store TT flags for this client to * * Checks the validity of the given global TT entry. If it is, then the provided * flags pointer is updated either with the common (summed) TT flags if data_ptr * is NULL or the specific, per originator TT flags otherwise. * * Return: true if the entry is a valid, false otherwise. */ static bool batadv_tt_global_valid(const void *entry_ptr, const void *data_ptr, u8 *flags) { const struct batadv_tt_common_entry *tt_common_entry = entry_ptr; const struct batadv_tt_global_entry *tt_global_entry; const struct batadv_orig_node *orig_node = data_ptr; if (tt_common_entry->flags & BATADV_TT_CLIENT_ROAM || tt_common_entry->flags & BATADV_TT_CLIENT_TEMP) return false; tt_global_entry = container_of(tt_common_entry, struct batadv_tt_global_entry, common); return batadv_tt_global_entry_has_orig(tt_global_entry, orig_node, flags); } /** * batadv_tt_tvlv_generate() - fill the tvlv buff with the tt entries from the * specified tt hash * @bat_priv: the bat priv with all the mesh interface information * @hash: hash table containing the tt entries * @tt_len: expected tvlv tt data buffer length in number of bytes * @tvlv_buff: pointer to the buffer to fill with the TT data * @valid_cb: function to filter tt change entries and to return TT flags * @cb_data: data passed to the filter function as argument * * Fills the tvlv buff with the tt entries from the specified hash. If valid_cb * is not provided then this becomes a no-op. * * Return: Remaining unused length in tvlv_buff. */ static u16 batadv_tt_tvlv_generate(struct batadv_priv *bat_priv, struct batadv_hashtable *hash, void *tvlv_buff, u16 tt_len, bool (*valid_cb)(const void *, const void *, u8 *flags), void *cb_data) { struct batadv_tt_common_entry *tt_common_entry; struct batadv_tvlv_tt_change *tt_change; struct hlist_head *head; u16 tt_tot, tt_num_entries = 0; u8 flags; bool ret; u32 i; tt_tot = batadv_tt_entries(tt_len); tt_change = tvlv_buff; if (!valid_cb) return tt_len; rcu_read_lock(); for (i = 0; i < hash->size; i++) { head = &hash->table[i]; hlist_for_each_entry_rcu(tt_common_entry, head, hash_entry) { if (tt_tot == tt_num_entries) break; ret = valid_cb(tt_common_entry, cb_data, &flags); if (!ret) continue; ether_addr_copy(tt_change->addr, tt_common_entry->addr); tt_change->flags = flags; tt_change->vid = htons(tt_common_entry->vid); memset(tt_change->reserved, 0, sizeof(tt_change->reserved)); tt_num_entries++; tt_change++; } } rcu_read_unlock(); return batadv_tt_len(tt_tot - tt_num_entries); } /** * batadv_tt_global_check_crc() - check if all the CRCs are correct * @orig_node: originator for which the CRCs have to be checked * @tt_vlan: pointer to the first tvlv VLAN entry * @num_vlan: number of tvlv VLAN entries * * Return: true if all the received CRCs match the locally stored ones, false * otherwise */ static bool batadv_tt_global_check_crc(struct batadv_orig_node *orig_node, struct batadv_tvlv_tt_vlan_data *tt_vlan, u16 num_vlan) { struct batadv_tvlv_tt_vlan_data *tt_vlan_tmp; struct batadv_orig_node_vlan *vlan; int i, orig_num_vlan; u32 crc; /* check if each received CRC matches the locally stored one */ for (i = 0; i < num_vlan; i++) { tt_vlan_tmp = tt_vlan + i; /* if orig_node is a backbone node for this VLAN, don't check * the CRC as we ignore all the global entries over it */ if (batadv_bla_is_backbone_gw_orig(orig_node->bat_priv, orig_node->orig, ntohs(tt_vlan_tmp->vid))) continue; vlan = batadv_orig_node_vlan_get(orig_node, ntohs(tt_vlan_tmp->vid)); if (!vlan) return false; crc = vlan->tt.crc; batadv_orig_node_vlan_put(vlan); if (crc != ntohl(tt_vlan_tmp->crc)) return false; } /* check if any excess VLANs exist locally for the originator * which are not mentioned in the TVLV from the originator. */ rcu_read_lock(); orig_num_vlan = 0; hlist_for_each_entry_rcu(vlan, &orig_node->vlan_list, list) orig_num_vlan++; rcu_read_unlock(); if (orig_num_vlan > num_vlan) return false; return true; } /** * batadv_tt_local_update_crc() - update all the local CRCs * @bat_priv: the bat priv with all the mesh interface information */ static void batadv_tt_local_update_crc(struct batadv_priv *bat_priv) { struct batadv_meshif_vlan *vlan; /* recompute the global CRC for each VLAN */ rcu_read_lock(); hlist_for_each_entry_rcu(vlan, &bat_priv->meshif_vlan_list, list) { vlan->tt.crc = batadv_tt_local_crc(bat_priv, vlan->vid); } rcu_read_unlock(); } /** * batadv_tt_global_update_crc() - update all the global CRCs for this orig_node * @bat_priv: the bat priv with all the mesh interface information * @orig_node: the orig_node for which the CRCs have to be updated */ static void batadv_tt_global_update_crc(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node) { struct batadv_orig_node_vlan *vlan; u32 crc; /* recompute the global CRC for each VLAN */ rcu_read_lock(); hlist_for_each_entry_rcu(vlan, &orig_node->vlan_list, list) { /* if orig_node is a backbone node for this VLAN, don't compute * the CRC as we ignore all the global entries over it */ if (batadv_bla_is_backbone_gw_orig(bat_priv, orig_node->orig, vlan->vid)) continue; crc = batadv_tt_global_crc(bat_priv, orig_node, vlan->vid); vlan->tt.crc = crc; } rcu_read_unlock(); } /** * batadv_send_tt_request() - send a TT Request message to a given node * @bat_priv: the bat priv with all the mesh interface information * @dst_orig_node: the destination of the message * @ttvn: the version number that the source of the message is looking for * @tt_vlan: pointer to the first tvlv VLAN object to request * @num_vlan: number of tvlv VLAN entries * @full_table: ask for the entire translation table if true, while only for the * last TT diff otherwise * * Return: true if the TT Request was sent, false otherwise */ static bool batadv_send_tt_request(struct batadv_priv *bat_priv, struct batadv_orig_node *dst_orig_node, u8 ttvn, struct batadv_tvlv_tt_vlan_data *tt_vlan, u16 num_vlan, bool full_table) { struct batadv_tvlv_tt_data *tvlv_tt_data = NULL; struct batadv_tt_req_node *tt_req_node = NULL; struct batadv_hard_iface *primary_if; bool ret = false; int i, size; primary_if = batadv_primary_if_get_selected(bat_priv); if (!primary_if) goto out; /* The new tt_req will be issued only if I'm not waiting for a * reply from the same orig_node yet */ tt_req_node = batadv_tt_req_node_new(bat_priv, dst_orig_node); if (!tt_req_node) goto out; size = struct_size(tvlv_tt_data, vlan_data, num_vlan); tvlv_tt_data = kzalloc(size, GFP_ATOMIC); if (!tvlv_tt_data) goto out; tvlv_tt_data->flags = BATADV_TT_REQUEST; tvlv_tt_data->ttvn = ttvn; tvlv_tt_data->num_vlan = htons(num_vlan); /* send all the CRCs within the request. This is needed by intermediate * nodes to ensure they have the correct table before replying */ for (i = 0; i < num_vlan; i++) { tvlv_tt_data->vlan_data[i].vid = tt_vlan->vid; tvlv_tt_data->vlan_data[i].crc = tt_vlan->crc; tt_vlan++; } if (full_table) tvlv_tt_data->flags |= BATADV_TT_FULL_TABLE; batadv_dbg(BATADV_DBG_TT, bat_priv, "Sending TT_REQUEST to %pM [%c]\n", dst_orig_node->orig, full_table ? 'F' : '.'); batadv_inc_counter(bat_priv, BATADV_CNT_TT_REQUEST_TX); batadv_tvlv_unicast_send(bat_priv, primary_if->net_dev->dev_addr, dst_orig_node->orig, BATADV_TVLV_TT, 1, tvlv_tt_data, size); ret = true; out: batadv_hardif_put(primary_if); if (ret && tt_req_node) { spin_lock_bh(&bat_priv->tt.req_list_lock); if (!hlist_unhashed(&tt_req_node->list)) { hlist_del_init(&tt_req_node->list); batadv_tt_req_node_put(tt_req_node); } spin_unlock_bh(&bat_priv->tt.req_list_lock); } batadv_tt_req_node_put(tt_req_node); kfree(tvlv_tt_data); return ret; } /** * batadv_send_other_tt_response() - send reply to tt request concerning another * node's translation table * @bat_priv: the bat priv with all the mesh interface information * @tt_data: tt data containing the tt request information * @req_src: mac address of tt request sender * @req_dst: mac address of tt request recipient * * Return: true if tt request reply was sent, false otherwise. */ static bool batadv_send_other_tt_response(struct batadv_priv *bat_priv, struct batadv_tvlv_tt_data *tt_data, u8 *req_src, u8 *req_dst) { struct batadv_orig_node *req_dst_orig_node; struct batadv_orig_node *res_dst_orig_node = NULL; struct batadv_tvlv_tt_change *tt_change; struct batadv_tvlv_tt_data *tvlv_tt_data = NULL; bool ret = false, full_table; u8 orig_ttvn, req_ttvn; u16 tvlv_len; s32 tt_len; batadv_dbg(BATADV_DBG_TT, bat_priv, "Received TT_REQUEST from %pM for ttvn: %u (%pM) [%c]\n", req_src, tt_data->ttvn, req_dst, ((tt_data->flags & BATADV_TT_FULL_TABLE) ? 'F' : '.')); /* Let's get the orig node of the REAL destination */ req_dst_orig_node = batadv_orig_hash_find(bat_priv, req_dst); if (!req_dst_orig_node) goto out; res_dst_orig_node = batadv_orig_hash_find(bat_priv, req_src); if (!res_dst_orig_node) goto out; orig_ttvn = (u8)atomic_read(&req_dst_orig_node->last_ttvn); req_ttvn = tt_data->ttvn; /* this node doesn't have the requested data */ if (orig_ttvn != req_ttvn || !batadv_tt_global_check_crc(req_dst_orig_node, tt_data->vlan_data, ntohs(tt_data->num_vlan))) goto out; /* If the full table has been explicitly requested */ if (tt_data->flags & BATADV_TT_FULL_TABLE || !req_dst_orig_node->tt_buff) full_table = true; else full_table = false; /* TT fragmentation hasn't been implemented yet, so send as many * TT entries fit a single packet as possible only */ if (!full_table) { spin_lock_bh(&req_dst_orig_node->tt_buff_lock); tt_len = req_dst_orig_node->tt_buff_len; tvlv_len = batadv_tt_prepare_tvlv_global_data(req_dst_orig_node, &tvlv_tt_data, &tt_change, &tt_len); if (!tt_len) goto unlock; /* Copy the last orig_node's OGM buffer */ memcpy(tt_change, req_dst_orig_node->tt_buff, req_dst_orig_node->tt_buff_len); spin_unlock_bh(&req_dst_orig_node->tt_buff_lock); } else { /* allocate the tvlv, put the tt_data and all the tt_vlan_data * in the initial part */ tt_len = -1; tvlv_len = batadv_tt_prepare_tvlv_global_data(req_dst_orig_node, &tvlv_tt_data, &tt_change, &tt_len); if (!tt_len) goto out; /* fill the rest of the tvlv with the real TT entries */ tvlv_len -= batadv_tt_tvlv_generate(bat_priv, bat_priv->tt.global_hash, tt_change, tt_len, batadv_tt_global_valid, req_dst_orig_node); } /* Don't send the response, if larger than fragmented packet. */ tt_len = sizeof(struct batadv_unicast_tvlv_packet) + tvlv_len; if (tt_len > atomic_read(&bat_priv->packet_size_max)) { net_ratelimited_function(batadv_info, bat_priv->mesh_iface, "Ignoring TT_REQUEST from %pM; Response size exceeds max packet size.\n", res_dst_orig_node->orig); goto out; } tvlv_tt_data->flags = BATADV_TT_RESPONSE; tvlv_tt_data->ttvn = req_ttvn; if (full_table) tvlv_tt_data->flags |= BATADV_TT_FULL_TABLE; batadv_dbg(BATADV_DBG_TT, bat_priv, "Sending TT_RESPONSE %pM for %pM [%c] (ttvn: %u)\n", res_dst_orig_node->orig, req_dst_orig_node->orig, full_table ? 'F' : '.', req_ttvn); batadv_inc_counter(bat_priv, BATADV_CNT_TT_RESPONSE_TX); batadv_tvlv_unicast_send(bat_priv, req_dst_orig_node->orig, req_src, BATADV_TVLV_TT, 1, tvlv_tt_data, tvlv_len); ret = true; goto out; unlock: spin_unlock_bh(&req_dst_orig_node->tt_buff_lock); out: batadv_orig_node_put(res_dst_orig_node); batadv_orig_node_put(req_dst_orig_node); kfree(tvlv_tt_data); return ret; } /** * batadv_send_my_tt_response() - send reply to tt request concerning this * node's translation table * @bat_priv: the bat priv with all the mesh interface information * @tt_data: tt data containing the tt request information * @req_src: mac address of tt request sender * * Return: true if tt request reply was sent, false otherwise. */ static bool batadv_send_my_tt_response(struct batadv_priv *bat_priv, struct batadv_tvlv_tt_data *tt_data, u8 *req_src) { struct batadv_tvlv_tt_data *tvlv_tt_data = NULL; struct batadv_hard_iface *primary_if = NULL; struct batadv_tvlv_tt_change *tt_change; struct batadv_orig_node *orig_node; u8 my_ttvn, req_ttvn; u16 tvlv_len; bool full_table; s32 tt_len; batadv_dbg(BATADV_DBG_TT, bat_priv, "Received TT_REQUEST from %pM for ttvn: %u (me) [%c]\n", req_src, tt_data->ttvn, ((tt_data->flags & BATADV_TT_FULL_TABLE) ? 'F' : '.')); spin_lock_bh(&bat_priv->tt.commit_lock); my_ttvn = (u8)atomic_read(&bat_priv->tt.vn); req_ttvn = tt_data->ttvn; orig_node = batadv_orig_hash_find(bat_priv, req_src); if (!orig_node) goto out; primary_if = batadv_primary_if_get_selected(bat_priv); if (!primary_if) goto out; /* If the full table has been explicitly requested or the gap * is too big send the whole local translation table */ if (tt_data->flags & BATADV_TT_FULL_TABLE || my_ttvn != req_ttvn || !bat_priv->tt.last_changeset) full_table = true; else full_table = false; /* TT fragmentation hasn't been implemented yet, so send as many * TT entries fit a single packet as possible only */ if (!full_table) { spin_lock_bh(&bat_priv->tt.last_changeset_lock); tt_len = bat_priv->tt.last_changeset_len; tvlv_len = batadv_tt_prepare_tvlv_local_data(bat_priv, &tvlv_tt_data, &tt_change, &tt_len); if (!tt_len || !tvlv_len) goto unlock; /* Copy the last orig_node's OGM buffer */ memcpy(tt_change, bat_priv->tt.last_changeset, bat_priv->tt.last_changeset_len); spin_unlock_bh(&bat_priv->tt.last_changeset_lock); } else { req_ttvn = (u8)atomic_read(&bat_priv->tt.vn); /* allocate the tvlv, put the tt_data and all the tt_vlan_data * in the initial part */ tt_len = -1; tvlv_len = batadv_tt_prepare_tvlv_local_data(bat_priv, &tvlv_tt_data, &tt_change, &tt_len); if (!tt_len || !tvlv_len) goto out; /* fill the rest of the tvlv with the real TT entries */ tvlv_len -= batadv_tt_tvlv_generate(bat_priv, bat_priv->tt.local_hash, tt_change, tt_len, batadv_tt_local_valid, NULL); } tvlv_tt_data->flags = BATADV_TT_RESPONSE; tvlv_tt_data->ttvn = req_ttvn; if (full_table) tvlv_tt_data->flags |= BATADV_TT_FULL_TABLE; batadv_dbg(BATADV_DBG_TT, bat_priv, "Sending TT_RESPONSE to %pM [%c] (ttvn: %u)\n", orig_node->orig, full_table ? 'F' : '.', req_ttvn); batadv_inc_counter(bat_priv, BATADV_CNT_TT_RESPONSE_TX); batadv_tvlv_unicast_send(bat_priv, primary_if->net_dev->dev_addr, req_src, BATADV_TVLV_TT, 1, tvlv_tt_data, tvlv_len); goto out; unlock: spin_unlock_bh(&bat_priv->tt.last_changeset_lock); out: spin_unlock_bh(&bat_priv->tt.commit_lock); batadv_orig_node_put(orig_node); batadv_hardif_put(primary_if); kfree(tvlv_tt_data); /* The packet was for this host, so it doesn't need to be re-routed */ return true; } /** * batadv_send_tt_response() - send reply to tt request * @bat_priv: the bat priv with all the mesh interface information * @tt_data: tt data containing the tt request information * @req_src: mac address of tt request sender * @req_dst: mac address of tt request recipient * * Return: true if tt request reply was sent, false otherwise. */ static bool batadv_send_tt_response(struct batadv_priv *bat_priv, struct batadv_tvlv_tt_data *tt_data, u8 *req_src, u8 *req_dst) { if (batadv_is_my_mac(bat_priv, req_dst)) return batadv_send_my_tt_response(bat_priv, tt_data, req_src); return batadv_send_other_tt_response(bat_priv, tt_data, req_src, req_dst); } static void _batadv_tt_update_changes(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node, struct batadv_tvlv_tt_change *tt_change, u16 tt_num_changes, u8 ttvn) { int i; int roams; for (i = 0; i < tt_num_changes; i++) { if ((tt_change + i)->flags & BATADV_TT_CLIENT_DEL) { roams = (tt_change + i)->flags & BATADV_TT_CLIENT_ROAM; batadv_tt_global_del(bat_priv, orig_node, (tt_change + i)->addr, ntohs((tt_change + i)->vid), "tt removed by changes", roams); } else { if (!batadv_tt_global_add(bat_priv, orig_node, (tt_change + i)->addr, ntohs((tt_change + i)->vid), (tt_change + i)->flags, ttvn)) /* In case of problem while storing a * global_entry, we stop the updating * procedure without committing the * ttvn change. This will avoid to send * corrupted data on tt_request */ return; } } set_bit(BATADV_ORIG_CAPA_HAS_TT, &orig_node->capa_initialized); } static void batadv_tt_fill_gtable(struct batadv_priv *bat_priv, struct batadv_tvlv_tt_change *tt_change, u8 ttvn, u8 *resp_src, u16 num_entries) { struct batadv_orig_node *orig_node; orig_node = batadv_orig_hash_find(bat_priv, resp_src); if (!orig_node) goto out; /* Purge the old table first.. */ batadv_tt_global_del_orig(bat_priv, orig_node, -1, "Received full table"); _batadv_tt_update_changes(bat_priv, orig_node, tt_change, num_entries, ttvn); spin_lock_bh(&orig_node->tt_buff_lock); kfree(orig_node->tt_buff); orig_node->tt_buff_len = 0; orig_node->tt_buff = NULL; spin_unlock_bh(&orig_node->tt_buff_lock); atomic_set(&orig_node->last_ttvn, ttvn); out: batadv_orig_node_put(orig_node); } static void batadv_tt_update_changes(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node, u16 tt_num_changes, u8 ttvn, struct batadv_tvlv_tt_change *tt_change) { _batadv_tt_update_changes(bat_priv, orig_node, tt_change, tt_num_changes, ttvn); batadv_tt_save_orig_buffer(bat_priv, orig_node, tt_change, batadv_tt_len(tt_num_changes)); atomic_set(&orig_node->last_ttvn, ttvn); } /** * batadv_is_my_client() - check if a client is served by the local node * @bat_priv: the bat priv with all the mesh interface information * @addr: the mac address of the client to check * @vid: VLAN identifier * * Return: true if the client is served by this node, false otherwise. */ bool batadv_is_my_client(struct batadv_priv *bat_priv, const u8 *addr, unsigned short vid) { struct batadv_tt_local_entry *tt_local_entry; bool ret = false; tt_local_entry = batadv_tt_local_hash_find(bat_priv, addr, vid); if (!tt_local_entry) goto out; /* Check if the client has been logically deleted (but is kept for * consistency purpose) */ if ((tt_local_entry->common.flags & BATADV_TT_CLIENT_PENDING) || (tt_local_entry->common.flags & BATADV_TT_CLIENT_ROAM)) goto out; ret = true; out: batadv_tt_local_entry_put(tt_local_entry); return ret; } /** * batadv_handle_tt_response() - process incoming tt reply * @bat_priv: the bat priv with all the mesh interface information * @tt_data: tt data containing the tt request information * @resp_src: mac address of tt reply sender * @num_entries: number of tt change entries appended to the tt data */ static void batadv_handle_tt_response(struct batadv_priv *bat_priv, struct batadv_tvlv_tt_data *tt_data, u8 *resp_src, u16 num_entries) { struct batadv_tt_req_node *node; struct hlist_node *safe; struct batadv_orig_node *orig_node = NULL; struct batadv_tvlv_tt_change *tt_change; u8 *tvlv_ptr = (u8 *)tt_data; batadv_dbg(BATADV_DBG_TT, bat_priv, "Received TT_RESPONSE from %pM for ttvn %d t_size: %d [%c]\n", resp_src, tt_data->ttvn, num_entries, ((tt_data->flags & BATADV_TT_FULL_TABLE) ? 'F' : '.')); orig_node = batadv_orig_hash_find(bat_priv, resp_src); if (!orig_node) goto out; spin_lock_bh(&orig_node->tt_lock); tvlv_ptr += struct_size(tt_data, vlan_data, ntohs(tt_data->num_vlan)); tt_change = (struct batadv_tvlv_tt_change *)tvlv_ptr; if (tt_data->flags & BATADV_TT_FULL_TABLE) { batadv_tt_fill_gtable(bat_priv, tt_change, tt_data->ttvn, resp_src, num_entries); } else { batadv_tt_update_changes(bat_priv, orig_node, num_entries, tt_data->ttvn, tt_change); } /* Recalculate the CRC for this orig_node and store it */ batadv_tt_global_update_crc(bat_priv, orig_node); spin_unlock_bh(&orig_node->tt_lock); /* Delete the tt_req_node from pending tt_requests list */ spin_lock_bh(&bat_priv->tt.req_list_lock); hlist_for_each_entry_safe(node, safe, &bat_priv->tt.req_list, list) { if (!batadv_compare_eth(node->addr, resp_src)) continue; hlist_del_init(&node->list); batadv_tt_req_node_put(node); } spin_unlock_bh(&bat_priv->tt.req_list_lock); out: batadv_orig_node_put(orig_node); } static void batadv_tt_roam_list_free(struct batadv_priv *bat_priv) { struct batadv_tt_roam_node *node, *safe; spin_lock_bh(&bat_priv->tt.roam_list_lock); list_for_each_entry_safe(node, safe, &bat_priv->tt.roam_list, list) { list_del(&node->list); kmem_cache_free(batadv_tt_roam_cache, node); } spin_unlock_bh(&bat_priv->tt.roam_list_lock); } static void batadv_tt_roam_purge(struct batadv_priv *bat_priv) { struct batadv_tt_roam_node *node, *safe; spin_lock_bh(&bat_priv->tt.roam_list_lock); list_for_each_entry_safe(node, safe, &bat_priv->tt.roam_list, list) { if (!batadv_has_timed_out(node->first_time, BATADV_ROAMING_MAX_TIME)) continue; list_del(&node->list); kmem_cache_free(batadv_tt_roam_cache, node); } spin_unlock_bh(&bat_priv->tt.roam_list_lock); } /** * batadv_tt_check_roam_count() - check if a client has roamed too frequently * @bat_priv: the bat priv with all the mesh interface information * @client: mac address of the roaming client * * This function checks whether the client already reached the * maximum number of possible roaming phases. In this case the ROAMING_ADV * will not be sent. * * Return: true if the ROAMING_ADV can be sent, false otherwise */ static bool batadv_tt_check_roam_count(struct batadv_priv *bat_priv, u8 *client) { struct batadv_tt_roam_node *tt_roam_node; bool ret = false; spin_lock_bh(&bat_priv->tt.roam_list_lock); /* The new tt_req will be issued only if I'm not waiting for a * reply from the same orig_node yet */ list_for_each_entry(tt_roam_node, &bat_priv->tt.roam_list, list) { if (!batadv_compare_eth(tt_roam_node->addr, client)) continue; if (batadv_has_timed_out(tt_roam_node->first_time, BATADV_ROAMING_MAX_TIME)) continue; if (!batadv_atomic_dec_not_zero(&tt_roam_node->counter)) /* Sorry, you roamed too many times! */ goto unlock; ret = true; break; } if (!ret) { tt_roam_node = kmem_cache_alloc(batadv_tt_roam_cache, GFP_ATOMIC); if (!tt_roam_node) goto unlock; tt_roam_node->first_time = jiffies; atomic_set(&tt_roam_node->counter, BATADV_ROAMING_MAX_COUNT - 1); ether_addr_copy(tt_roam_node->addr, client); list_add(&tt_roam_node->list, &bat_priv->tt.roam_list); ret = true; } unlock: spin_unlock_bh(&bat_priv->tt.roam_list_lock); return ret; } /** * batadv_send_roam_adv() - send a roaming advertisement message * @bat_priv: the bat priv with all the mesh interface information * @client: mac address of the roaming client * @vid: VLAN identifier * @orig_node: message destination * * Send a ROAMING_ADV message to the node which was previously serving this * client. This is done to inform the node that from now on all traffic destined * for this particular roamed client has to be forwarded to the sender of the * roaming message. */ static void batadv_send_roam_adv(struct batadv_priv *bat_priv, u8 *client, unsigned short vid, struct batadv_orig_node *orig_node) { struct batadv_hard_iface *primary_if; struct batadv_tvlv_roam_adv tvlv_roam; primary_if = batadv_primary_if_get_selected(bat_priv); if (!primary_if) goto out; /* before going on we have to check whether the client has * already roamed to us too many times */ if (!batadv_tt_check_roam_count(bat_priv, client)) goto out; batadv_dbg(BATADV_DBG_TT, bat_priv, "Sending ROAMING_ADV to %pM (client %pM, vid: %d)\n", orig_node->orig, client, batadv_print_vid(vid)); batadv_inc_counter(bat_priv, BATADV_CNT_TT_ROAM_ADV_TX); memcpy(tvlv_roam.client, client, sizeof(tvlv_roam.client)); tvlv_roam.vid = htons(vid); batadv_tvlv_unicast_send(bat_priv, primary_if->net_dev->dev_addr, orig_node->orig, BATADV_TVLV_ROAM, 1, &tvlv_roam, sizeof(tvlv_roam)); out: batadv_hardif_put(primary_if); } static void batadv_tt_purge(struct work_struct *work) { struct delayed_work *delayed_work; struct batadv_priv_tt *priv_tt; struct batadv_priv *bat_priv; delayed_work = to_delayed_work(work); priv_tt = container_of(delayed_work, struct batadv_priv_tt, work); bat_priv = container_of(priv_tt, struct batadv_priv, tt); batadv_tt_local_purge(bat_priv, BATADV_TT_LOCAL_TIMEOUT); batadv_tt_global_purge(bat_priv); batadv_tt_req_purge(bat_priv); batadv_tt_roam_purge(bat_priv); queue_delayed_work(batadv_event_workqueue, &bat_priv->tt.work, msecs_to_jiffies(BATADV_TT_WORK_PERIOD)); } /** * batadv_tt_free() - Free translation table of mesh interface * @bat_priv: the bat priv with all the mesh interface information */ void batadv_tt_free(struct batadv_priv *bat_priv) { batadv_tvlv_handler_unregister(bat_priv, BATADV_TVLV_ROAM, 1); batadv_tvlv_container_unregister(bat_priv, BATADV_TVLV_TT, 1); batadv_tvlv_handler_unregister(bat_priv, BATADV_TVLV_TT, 1); cancel_delayed_work_sync(&bat_priv->tt.work); batadv_tt_local_table_free(bat_priv); batadv_tt_global_table_free(bat_priv); batadv_tt_req_list_free(bat_priv); batadv_tt_changes_list_free(bat_priv); batadv_tt_roam_list_free(bat_priv); kfree(bat_priv->tt.last_changeset); } /** * batadv_tt_local_set_flags() - set or unset the specified flags on the local * table and possibly count them in the TT size * @bat_priv: the bat priv with all the mesh interface information * @flags: the flag to switch * @enable: whether to set or unset the flag * @count: whether to increase the TT size by the number of changed entries */ static void batadv_tt_local_set_flags(struct batadv_priv *bat_priv, u16 flags, bool enable, bool count) { struct batadv_hashtable *hash = bat_priv->tt.local_hash; struct batadv_tt_common_entry *tt_common_entry; struct hlist_head *head; u32 i; if (!hash) return; for (i = 0; i < hash->size; i++) { head = &hash->table[i]; rcu_read_lock(); hlist_for_each_entry_rcu(tt_common_entry, head, hash_entry) { if (enable) { if ((tt_common_entry->flags & flags) == flags) continue; tt_common_entry->flags |= flags; } else { if (!(tt_common_entry->flags & flags)) continue; tt_common_entry->flags &= ~flags; } if (!count) continue; batadv_tt_local_size_inc(bat_priv, tt_common_entry->vid); } rcu_read_unlock(); } } /* Purge out all the tt local entries marked with BATADV_TT_CLIENT_PENDING */ static void batadv_tt_local_purge_pending_clients(struct batadv_priv *bat_priv) { struct batadv_hashtable *hash = bat_priv->tt.local_hash; struct batadv_tt_common_entry *tt_common; struct batadv_tt_local_entry *tt_local; struct hlist_node *node_tmp; struct hlist_head *head; spinlock_t *list_lock; /* protects write access to the hash lists */ u32 i; if (!hash) return; for (i = 0; i < hash->size; i++) { head = &hash->table[i]; list_lock = &hash->list_locks[i]; spin_lock_bh(list_lock); hlist_for_each_entry_safe(tt_common, node_tmp, head, hash_entry) { if (!(tt_common->flags & BATADV_TT_CLIENT_PENDING)) continue; batadv_dbg(BATADV_DBG_TT, bat_priv, "Deleting local tt entry (%pM, vid: %d): pending\n", tt_common->addr, batadv_print_vid(tt_common->vid)); batadv_tt_local_size_dec(bat_priv, tt_common->vid); hlist_del_rcu(&tt_common->hash_entry); tt_local = container_of(tt_common, struct batadv_tt_local_entry, common); batadv_tt_local_entry_put(tt_local); } spin_unlock_bh(list_lock); } } /** * batadv_tt_local_commit_changes_nolock() - commit all pending local tt changes * which have been queued in the time since the last commit * @bat_priv: the bat priv with all the mesh interface information * * Caller must hold tt->commit_lock. */ static void batadv_tt_local_commit_changes_nolock(struct batadv_priv *bat_priv) { lockdep_assert_held(&bat_priv->tt.commit_lock); if (READ_ONCE(bat_priv->tt.local_changes) == 0) { if (!batadv_atomic_dec_not_zero(&bat_priv->tt.ogm_append_cnt)) batadv_tt_tvlv_container_update(bat_priv); return; } batadv_tt_local_set_flags(bat_priv, BATADV_TT_CLIENT_NEW, false, true); batadv_tt_local_purge_pending_clients(bat_priv); batadv_tt_local_update_crc(bat_priv); /* Increment the TTVN only once per OGM interval */ atomic_inc(&bat_priv->tt.vn); batadv_dbg(BATADV_DBG_TT, bat_priv, "Local changes committed, updating to ttvn %u\n", (u8)atomic_read(&bat_priv->tt.vn)); /* reset the sending counter */ atomic_set(&bat_priv->tt.ogm_append_cnt, BATADV_TT_OGM_APPEND_MAX); batadv_tt_tvlv_container_update(bat_priv); } /** * batadv_tt_local_commit_changes() - commit all pending local tt changes which * have been queued in the time since the last commit * @bat_priv: the bat priv with all the mesh interface information */ void batadv_tt_local_commit_changes(struct batadv_priv *bat_priv) { spin_lock_bh(&bat_priv->tt.commit_lock); batadv_tt_local_commit_changes_nolock(bat_priv); spin_unlock_bh(&bat_priv->tt.commit_lock); } /** * batadv_is_ap_isolated() - Check if packet from upper layer should be dropped * @bat_priv: the bat priv with all the mesh interface information * @src: source mac address of packet * @dst: destination mac address of packet * @vid: vlan id of packet * * Return: true when src+dst(+vid) pair should be isolated, false otherwise */ bool batadv_is_ap_isolated(struct batadv_priv *bat_priv, u8 *src, u8 *dst, unsigned short vid) { struct batadv_tt_local_entry *tt_local_entry; struct batadv_tt_global_entry *tt_global_entry; struct batadv_meshif_vlan *vlan; bool ret = false; vlan = batadv_meshif_vlan_get(bat_priv, vid); if (!vlan) return false; if (!atomic_read(&vlan->ap_isolation)) goto vlan_put; tt_local_entry = batadv_tt_local_hash_find(bat_priv, dst, vid); if (!tt_local_entry) goto vlan_put; tt_global_entry = batadv_tt_global_hash_find(bat_priv, src, vid); if (!tt_global_entry) goto local_entry_put; if (_batadv_is_ap_isolated(tt_local_entry, tt_global_entry)) ret = true; batadv_tt_global_entry_put(tt_global_entry); local_entry_put: batadv_tt_local_entry_put(tt_local_entry); vlan_put: batadv_meshif_vlan_put(vlan); return ret; } /** * batadv_tt_update_orig() - update global translation table with new tt * information received via ogms * @bat_priv: the bat priv with all the mesh interface information * @orig_node: the orig_node of the ogm * @tt_buff: pointer to the first tvlv VLAN entry * @tt_num_vlan: number of tvlv VLAN entries * @tt_change: pointer to the first entry in the TT buffer * @tt_num_changes: number of tt changes inside the tt buffer * @ttvn: translation table version number of this changeset */ static void batadv_tt_update_orig(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node, const void *tt_buff, u16 tt_num_vlan, struct batadv_tvlv_tt_change *tt_change, u16 tt_num_changes, u8 ttvn) { u8 orig_ttvn = (u8)atomic_read(&orig_node->last_ttvn); struct batadv_tvlv_tt_vlan_data *tt_vlan; bool full_table = true; bool has_tt_init; tt_vlan = (struct batadv_tvlv_tt_vlan_data *)tt_buff; has_tt_init = test_bit(BATADV_ORIG_CAPA_HAS_TT, &orig_node->capa_initialized); /* orig table not initialised AND first diff is in the OGM OR the ttvn * increased by one -> we can apply the attached changes */ if ((!has_tt_init && ttvn == 1) || ttvn - orig_ttvn == 1) { /* the OGM could not contain the changes due to their size or * because they have already been sent BATADV_TT_OGM_APPEND_MAX * times. * In this case send a tt request */ if (!tt_num_changes) { full_table = false; goto request_table; } spin_lock_bh(&orig_node->tt_lock); batadv_tt_update_changes(bat_priv, orig_node, tt_num_changes, ttvn, tt_change); /* Even if we received the precomputed crc with the OGM, we * prefer to recompute it to spot any possible inconsistency * in the global table */ batadv_tt_global_update_crc(bat_priv, orig_node); spin_unlock_bh(&orig_node->tt_lock); /* The ttvn alone is not enough to guarantee consistency * because a single value could represent different states * (due to the wrap around). Thus a node has to check whether * the resulting table (after applying the changes) is still * consistent or not. E.g. a node could disconnect while its * ttvn is X and reconnect on ttvn = X + TTVN_MAX: in this case * checking the CRC value is mandatory to detect the * inconsistency */ if (!batadv_tt_global_check_crc(orig_node, tt_vlan, tt_num_vlan)) goto request_table; } else { /* if we missed more than one change or our tables are not * in sync anymore -> request fresh tt data */ if (!has_tt_init || ttvn != orig_ttvn || !batadv_tt_global_check_crc(orig_node, tt_vlan, tt_num_vlan)) { request_table: batadv_dbg(BATADV_DBG_TT, bat_priv, "TT inconsistency for %pM. Need to retrieve the correct information (ttvn: %u last_ttvn: %u num_changes: %u)\n", orig_node->orig, ttvn, orig_ttvn, tt_num_changes); batadv_send_tt_request(bat_priv, orig_node, ttvn, tt_vlan, tt_num_vlan, full_table); return; } } } /** * batadv_tt_global_client_is_roaming() - check if a client is marked as roaming * @bat_priv: the bat priv with all the mesh interface information * @addr: the mac address of the client to check * @vid: VLAN identifier * * Return: true if we know that the client has moved from its old originator * to another one. This entry is still kept for consistency purposes and will be * deleted later by a DEL or because of timeout */ bool batadv_tt_global_client_is_roaming(struct batadv_priv *bat_priv, u8 *addr, unsigned short vid) { struct batadv_tt_global_entry *tt_global_entry; bool ret = false; tt_global_entry = batadv_tt_global_hash_find(bat_priv, addr, vid); if (!tt_global_entry) goto out; ret = tt_global_entry->common.flags & BATADV_TT_CLIENT_ROAM; batadv_tt_global_entry_put(tt_global_entry); out: return ret; } /** * batadv_tt_local_client_is_roaming() - tells whether the client is roaming * @bat_priv: the bat priv with all the mesh interface information * @addr: the mac address of the local client to query * @vid: VLAN identifier * * Return: true if the local client is known to be roaming (it is not served by * this node anymore) or not. If yes, the client is still present in the table * to keep the latter consistent with the node TTVN */ bool batadv_tt_local_client_is_roaming(struct batadv_priv *bat_priv, u8 *addr, unsigned short vid) { struct batadv_tt_local_entry *tt_local_entry; bool ret = false; tt_local_entry = batadv_tt_local_hash_find(bat_priv, addr, vid); if (!tt_local_entry) goto out; ret = tt_local_entry->common.flags & BATADV_TT_CLIENT_ROAM; batadv_tt_local_entry_put(tt_local_entry); out: return ret; } /** * batadv_tt_add_temporary_global_entry() - Add temporary entry to global TT * @bat_priv: the bat priv with all the mesh interface information * @orig_node: orig node which the temporary entry should be associated with * @addr: mac address of the client * @vid: VLAN id of the new temporary global translation table * * Return: true when temporary tt entry could be added, false otherwise */ bool batadv_tt_add_temporary_global_entry(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node, const unsigned char *addr, unsigned short vid) { /* ignore loop detect macs, they are not supposed to be in the tt local * data as well. */ if (batadv_bla_is_loopdetect_mac(addr)) return false; if (!batadv_tt_global_add(bat_priv, orig_node, addr, vid, BATADV_TT_CLIENT_TEMP, atomic_read(&orig_node->last_ttvn))) return false; batadv_dbg(BATADV_DBG_TT, bat_priv, "Added temporary global client (addr: %pM, vid: %d, orig: %pM)\n", addr, batadv_print_vid(vid), orig_node->orig); return true; } /** * batadv_tt_local_resize_to_mtu() - resize the local translation table fit the * maximum packet size that can be transported through the mesh * @mesh_iface: netdev struct of the mesh interface * * Remove entries older than 'timeout' and half timeout if more entries need * to be removed. */ void batadv_tt_local_resize_to_mtu(struct net_device *mesh_iface) { struct batadv_priv *bat_priv = netdev_priv(mesh_iface); int packet_size_max = atomic_read(&bat_priv->packet_size_max); int table_size, timeout = BATADV_TT_LOCAL_TIMEOUT / 2; bool reduced = false; spin_lock_bh(&bat_priv->tt.commit_lock); while (timeout) { table_size = batadv_tt_local_table_transmit_size(bat_priv); if (packet_size_max >= table_size) break; batadv_tt_local_purge(bat_priv, timeout); batadv_tt_local_purge_pending_clients(bat_priv); timeout /= 2; reduced = true; net_ratelimited_function(batadv_info, mesh_iface, "Forced to purge local tt entries to fit new maximum fragment MTU (%i)\n", packet_size_max); } /* commit these changes immediately, to avoid synchronization problem * with the TTVN */ if (reduced) batadv_tt_local_commit_changes_nolock(bat_priv); spin_unlock_bh(&bat_priv->tt.commit_lock); } /** * batadv_tt_tvlv_ogm_handler_v1() - process incoming tt tvlv container * @bat_priv: the bat priv with all the mesh interface information * @orig: the orig_node of the ogm * @flags: flags indicating the tvlv state (see batadv_tvlv_handler_flags) * @tvlv_value: tvlv buffer containing the gateway data * @tvlv_value_len: tvlv buffer length */ static void batadv_tt_tvlv_ogm_handler_v1(struct batadv_priv *bat_priv, struct batadv_orig_node *orig, u8 flags, void *tvlv_value, u16 tvlv_value_len) { struct batadv_tvlv_tt_change *tt_change; struct batadv_tvlv_tt_data *tt_data; u16 num_entries, num_vlan; size_t tt_data_sz; if (tvlv_value_len < sizeof(*tt_data)) return; tt_data = tvlv_value; num_vlan = ntohs(tt_data->num_vlan); tt_data_sz = struct_size(tt_data, vlan_data, num_vlan); if (tvlv_value_len < tt_data_sz) return; tt_change = (struct batadv_tvlv_tt_change *)((void *)tt_data + tt_data_sz); tvlv_value_len -= tt_data_sz; num_entries = batadv_tt_entries(tvlv_value_len); batadv_tt_update_orig(bat_priv, orig, tt_data->vlan_data, num_vlan, tt_change, num_entries, tt_data->ttvn); } /** * batadv_tt_tvlv_unicast_handler_v1() - process incoming (unicast) tt tvlv * container * @bat_priv: the bat priv with all the mesh interface information * @src: mac address of tt tvlv sender * @dst: mac address of tt tvlv recipient * @tvlv_value: tvlv buffer containing the tt data * @tvlv_value_len: tvlv buffer length * * Return: NET_RX_DROP if the tt tvlv is to be re-routed, NET_RX_SUCCESS * otherwise. */ static int batadv_tt_tvlv_unicast_handler_v1(struct batadv_priv *bat_priv, u8 *src, u8 *dst, void *tvlv_value, u16 tvlv_value_len) { struct batadv_tvlv_tt_data *tt_data; u16 tt_vlan_len, tt_num_entries; char tt_flag; bool ret; if (tvlv_value_len < sizeof(*tt_data)) return NET_RX_SUCCESS; tt_data = tvlv_value; tvlv_value_len -= sizeof(*tt_data); tt_vlan_len = flex_array_size(tt_data, vlan_data, ntohs(tt_data->num_vlan)); if (tvlv_value_len < tt_vlan_len) return NET_RX_SUCCESS; tvlv_value_len -= tt_vlan_len; tt_num_entries = batadv_tt_entries(tvlv_value_len); switch (tt_data->flags & BATADV_TT_DATA_TYPE_MASK) { case BATADV_TT_REQUEST: batadv_inc_counter(bat_priv, BATADV_CNT_TT_REQUEST_RX); /* If this node cannot provide a TT response the tt_request is * forwarded */ ret = batadv_send_tt_response(bat_priv, tt_data, src, dst); if (!ret) { if (tt_data->flags & BATADV_TT_FULL_TABLE) tt_flag = 'F'; else tt_flag = '.'; batadv_dbg(BATADV_DBG_TT, bat_priv, "Routing TT_REQUEST to %pM [%c]\n", dst, tt_flag); /* tvlv API will re-route the packet */ return NET_RX_DROP; } break; case BATADV_TT_RESPONSE: batadv_inc_counter(bat_priv, BATADV_CNT_TT_RESPONSE_RX); if (batadv_is_my_mac(bat_priv, dst)) { batadv_handle_tt_response(bat_priv, tt_data, src, tt_num_entries); return NET_RX_SUCCESS; } if (tt_data->flags & BATADV_TT_FULL_TABLE) tt_flag = 'F'; else tt_flag = '.'; batadv_dbg(BATADV_DBG_TT, bat_priv, "Routing TT_RESPONSE to %pM [%c]\n", dst, tt_flag); /* tvlv API will re-route the packet */ return NET_RX_DROP; } return NET_RX_SUCCESS; } /** * batadv_roam_tvlv_unicast_handler_v1() - process incoming tt roam tvlv * container * @bat_priv: the bat priv with all the mesh interface information * @src: mac address of tt tvlv sender * @dst: mac address of tt tvlv recipient * @tvlv_value: tvlv buffer containing the tt data * @tvlv_value_len: tvlv buffer length * * Return: NET_RX_DROP if the tt roam tvlv is to be re-routed, NET_RX_SUCCESS * otherwise. */ static int batadv_roam_tvlv_unicast_handler_v1(struct batadv_priv *bat_priv, u8 *src, u8 *dst, void *tvlv_value, u16 tvlv_value_len) { struct batadv_tvlv_roam_adv *roaming_adv; struct batadv_orig_node *orig_node = NULL; /* If this node is not the intended recipient of the * roaming advertisement the packet is forwarded * (the tvlv API will re-route the packet). */ if (!batadv_is_my_mac(bat_priv, dst)) return NET_RX_DROP; if (tvlv_value_len < sizeof(*roaming_adv)) goto out; orig_node = batadv_orig_hash_find(bat_priv, src); if (!orig_node) goto out; batadv_inc_counter(bat_priv, BATADV_CNT_TT_ROAM_ADV_RX); roaming_adv = tvlv_value; batadv_dbg(BATADV_DBG_TT, bat_priv, "Received ROAMING_ADV from %pM (client %pM)\n", src, roaming_adv->client); batadv_tt_global_add(bat_priv, orig_node, roaming_adv->client, ntohs(roaming_adv->vid), BATADV_TT_CLIENT_ROAM, atomic_read(&orig_node->last_ttvn) + 1); out: batadv_orig_node_put(orig_node); return NET_RX_SUCCESS; } /** * batadv_tt_init() - initialise the translation table internals * @bat_priv: the bat priv with all the mesh interface information * * Return: 0 on success or negative error number in case of failure. */ int batadv_tt_init(struct batadv_priv *bat_priv) { int ret; /* synchronized flags must be remote */ BUILD_BUG_ON(!(BATADV_TT_SYNC_MASK & BATADV_TT_REMOTE_MASK)); ret = batadv_tt_local_init(bat_priv); if (ret < 0) return ret; ret = batadv_tt_global_init(bat_priv); if (ret < 0) { batadv_tt_local_table_free(bat_priv); return ret; } batadv_tvlv_handler_register(bat_priv, batadv_tt_tvlv_ogm_handler_v1, batadv_tt_tvlv_unicast_handler_v1, NULL, BATADV_TVLV_TT, 1, BATADV_NO_FLAGS); batadv_tvlv_handler_register(bat_priv, NULL, batadv_roam_tvlv_unicast_handler_v1, NULL, BATADV_TVLV_ROAM, 1, BATADV_NO_FLAGS); INIT_DELAYED_WORK(&bat_priv->tt.work, batadv_tt_purge); queue_delayed_work(batadv_event_workqueue, &bat_priv->tt.work, msecs_to_jiffies(BATADV_TT_WORK_PERIOD)); return 1; } /** * batadv_tt_global_is_isolated() - check if a client is marked as isolated * @bat_priv: the bat priv with all the mesh interface information * @addr: the mac address of the client * @vid: the identifier of the VLAN where this client is connected * * Return: true if the client is marked with the TT_CLIENT_ISOLA flag, false * otherwise */ bool batadv_tt_global_is_isolated(struct batadv_priv *bat_priv, const u8 *addr, unsigned short vid) { struct batadv_tt_global_entry *tt; bool ret; tt = batadv_tt_global_hash_find(bat_priv, addr, vid); if (!tt) return false; ret = tt->common.flags & BATADV_TT_CLIENT_ISOLA; batadv_tt_global_entry_put(tt); return ret; } /** * batadv_tt_cache_init() - Initialize tt memory object cache * * Return: 0 on success or negative error number in case of failure. */ int __init batadv_tt_cache_init(void) { size_t tl_size = sizeof(struct batadv_tt_local_entry); size_t tg_size = sizeof(struct batadv_tt_global_entry); size_t tt_orig_size = sizeof(struct batadv_tt_orig_list_entry); size_t tt_change_size = sizeof(struct batadv_tt_change_node); size_t tt_req_size = sizeof(struct batadv_tt_req_node); size_t tt_roam_size = sizeof(struct batadv_tt_roam_node); batadv_tl_cache = kmem_cache_create("batadv_tl_cache", tl_size, 0, SLAB_HWCACHE_ALIGN, NULL); if (!batadv_tl_cache) return -ENOMEM; batadv_tg_cache = kmem_cache_create("batadv_tg_cache", tg_size, 0, SLAB_HWCACHE_ALIGN, NULL); if (!batadv_tg_cache) goto err_tt_tl_destroy; batadv_tt_orig_cache = kmem_cache_create("batadv_tt_orig_cache", tt_orig_size, 0, SLAB_HWCACHE_ALIGN, NULL); if (!batadv_tt_orig_cache) goto err_tt_tg_destroy; batadv_tt_change_cache = kmem_cache_create("batadv_tt_change_cache", tt_change_size, 0, SLAB_HWCACHE_ALIGN, NULL); if (!batadv_tt_change_cache) goto err_tt_orig_destroy; batadv_tt_req_cache = kmem_cache_create("batadv_tt_req_cache", tt_req_size, 0, SLAB_HWCACHE_ALIGN, NULL); if (!batadv_tt_req_cache) goto err_tt_change_destroy; batadv_tt_roam_cache = kmem_cache_create("batadv_tt_roam_cache", tt_roam_size, 0, SLAB_HWCACHE_ALIGN, NULL); if (!batadv_tt_roam_cache) goto err_tt_req_destroy; return 0; err_tt_req_destroy: kmem_cache_destroy(batadv_tt_req_cache); batadv_tt_req_cache = NULL; err_tt_change_destroy: kmem_cache_destroy(batadv_tt_change_cache); batadv_tt_change_cache = NULL; err_tt_orig_destroy: kmem_cache_destroy(batadv_tt_orig_cache); batadv_tt_orig_cache = NULL; err_tt_tg_destroy: kmem_cache_destroy(batadv_tg_cache); batadv_tg_cache = NULL; err_tt_tl_destroy: kmem_cache_destroy(batadv_tl_cache); batadv_tl_cache = NULL; return -ENOMEM; } /** * batadv_tt_cache_destroy() - Destroy tt memory object cache */ void batadv_tt_cache_destroy(void) { kmem_cache_destroy(batadv_tl_cache); kmem_cache_destroy(batadv_tg_cache); kmem_cache_destroy(batadv_tt_orig_cache); kmem_cache_destroy(batadv_tt_change_cache); kmem_cache_destroy(batadv_tt_req_cache); kmem_cache_destroy(batadv_tt_roam_cache); }
4 1 3 3 3 3 3 1 2 2 1 1 1 9 9 8 1 2 3 1 1 1 3 3 3 2 1 1 1 5 1 4 1 471 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C)2003,2004 USAGI/WIDE Project * * Author: * Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp> */ #include <linux/types.h> #include <linux/timer.h> #include <linux/module.h> #include <linux/netfilter.h> #include <linux/in6.h> #include <linux/icmpv6.h> #include <linux/ipv6.h> #include <net/ipv6.h> #include <net/ip6_checksum.h> #include <linux/seq_file.h> #include <linux/netfilter_ipv6.h> #include <net/netfilter/nf_conntrack_tuple.h> #include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/nf_conntrack_timeout.h> #include <net/netfilter/nf_conntrack_zones.h> #include <net/netfilter/nf_log.h> #include "nf_internals.h" static const unsigned int nf_ct_icmpv6_timeout = 30*HZ; bool icmpv6_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, struct net *net, struct nf_conntrack_tuple *tuple) { const struct icmp6hdr *hp; struct icmp6hdr _hdr; hp = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr); if (hp == NULL) return false; tuple->dst.u.icmp.type = hp->icmp6_type; tuple->src.u.icmp.id = hp->icmp6_identifier; tuple->dst.u.icmp.code = hp->icmp6_code; return true; } /* Add 1; spaces filled with 0. */ static const u_int8_t invmap[] = { [ICMPV6_ECHO_REQUEST - 128] = ICMPV6_ECHO_REPLY + 1, [ICMPV6_ECHO_REPLY - 128] = ICMPV6_ECHO_REQUEST + 1, [ICMPV6_NI_QUERY - 128] = ICMPV6_NI_REPLY + 1, [ICMPV6_NI_REPLY - 128] = ICMPV6_NI_QUERY + 1 }; static const u_int8_t noct_valid_new[] = { [ICMPV6_MGM_QUERY - 130] = 1, [ICMPV6_MGM_REPORT - 130] = 1, [ICMPV6_MGM_REDUCTION - 130] = 1, [NDISC_ROUTER_SOLICITATION - 130] = 1, [NDISC_ROUTER_ADVERTISEMENT - 130] = 1, [NDISC_NEIGHBOUR_SOLICITATION - 130] = 1, [NDISC_NEIGHBOUR_ADVERTISEMENT - 130] = 1, [ICMPV6_MLD2_REPORT - 130] = 1, [ICMPV6_MRDISC_ADV - 130] = 1, [ICMPV6_MRDISC_SOL - 130] = 1 }; bool nf_conntrack_invert_icmpv6_tuple(struct nf_conntrack_tuple *tuple, const struct nf_conntrack_tuple *orig) { int type = orig->dst.u.icmp.type - 128; if (type < 0 || type >= sizeof(invmap) || !invmap[type]) return false; tuple->src.u.icmp.id = orig->src.u.icmp.id; tuple->dst.u.icmp.type = invmap[type] - 1; tuple->dst.u.icmp.code = orig->dst.u.icmp.code; return true; } static unsigned int *icmpv6_get_timeouts(struct net *net) { return &nf_icmpv6_pernet(net)->timeout; } /* Returns verdict for packet, or -1 for invalid. */ int nf_conntrack_icmpv6_packet(struct nf_conn *ct, struct sk_buff *skb, enum ip_conntrack_info ctinfo, const struct nf_hook_state *state) { unsigned int *timeout = nf_ct_timeout_lookup(ct); static const u8 valid_new[] = { [ICMPV6_ECHO_REQUEST - 128] = 1, [ICMPV6_NI_QUERY - 128] = 1 }; if (state->pf != NFPROTO_IPV6) return -NF_ACCEPT; if (!nf_ct_is_confirmed(ct)) { int type = ct->tuplehash[0].tuple.dst.u.icmp.type - 128; if (type < 0 || type >= sizeof(valid_new) || !valid_new[type]) { /* Can't create a new ICMPv6 `conn' with this. */ pr_debug("icmpv6: can't create new conn with type %u\n", type + 128); nf_ct_dump_tuple_ipv6(&ct->tuplehash[0].tuple); return -NF_ACCEPT; } } if (!timeout) timeout = icmpv6_get_timeouts(nf_ct_net(ct)); /* Do not immediately delete the connection after the first successful reply to avoid excessive conntrackd traffic and also to handle correctly ICMP echo reply duplicates. */ nf_ct_refresh_acct(ct, ctinfo, skb, *timeout); return NF_ACCEPT; } static void icmpv6_error_log(const struct sk_buff *skb, const struct nf_hook_state *state, const char *msg) { nf_l4proto_log_invalid(skb, state, IPPROTO_ICMPV6, "%s", msg); } static noinline_for_stack int nf_conntrack_icmpv6_redirect(struct nf_conn *tmpl, struct sk_buff *skb, unsigned int dataoff, const struct nf_hook_state *state) { u8 hl = ipv6_hdr(skb)->hop_limit; union nf_inet_addr outer_daddr; union { struct nd_opt_hdr nd_opt; struct rd_msg rd_msg; } tmp; const struct nd_opt_hdr *nd_opt; const struct rd_msg *rd_msg; rd_msg = skb_header_pointer(skb, dataoff, sizeof(*rd_msg), &tmp.rd_msg); if (!rd_msg) { icmpv6_error_log(skb, state, "short redirect"); return -NF_ACCEPT; } if (rd_msg->icmph.icmp6_code != 0) return NF_ACCEPT; if (hl != 255 || !(ipv6_addr_type(&ipv6_hdr(skb)->saddr) & IPV6_ADDR_LINKLOCAL)) { icmpv6_error_log(skb, state, "invalid saddr or hoplimit for redirect"); return -NF_ACCEPT; } dataoff += sizeof(*rd_msg); /* warning: rd_msg no longer usable after this call */ nd_opt = skb_header_pointer(skb, dataoff, sizeof(*nd_opt), &tmp.nd_opt); if (!nd_opt || nd_opt->nd_opt_len == 0) { icmpv6_error_log(skb, state, "redirect without options"); return -NF_ACCEPT; } /* We could call ndisc_parse_options(), but it would need * skb_linearize() and a bit more work. */ if (nd_opt->nd_opt_type != ND_OPT_REDIRECT_HDR) return NF_ACCEPT; memcpy(&outer_daddr.ip6, &ipv6_hdr(skb)->daddr, sizeof(outer_daddr.ip6)); dataoff += 8; return nf_conntrack_inet_error(tmpl, skb, dataoff, state, IPPROTO_ICMPV6, &outer_daddr); } int nf_conntrack_icmpv6_error(struct nf_conn *tmpl, struct sk_buff *skb, unsigned int dataoff, const struct nf_hook_state *state) { union nf_inet_addr outer_daddr; const struct icmp6hdr *icmp6h; struct icmp6hdr _ih; int type; icmp6h = skb_header_pointer(skb, dataoff, sizeof(_ih), &_ih); if (icmp6h == NULL) { icmpv6_error_log(skb, state, "short packet"); return -NF_ACCEPT; } if (state->hook == NF_INET_PRE_ROUTING && state->net->ct.sysctl_checksum && nf_ip6_checksum(skb, state->hook, dataoff, IPPROTO_ICMPV6)) { icmpv6_error_log(skb, state, "ICMPv6 checksum failed"); return -NF_ACCEPT; } type = icmp6h->icmp6_type - 130; if (type >= 0 && type < sizeof(noct_valid_new) && noct_valid_new[type]) { nf_ct_set(skb, NULL, IP_CT_UNTRACKED); return NF_ACCEPT; } if (icmp6h->icmp6_type == NDISC_REDIRECT) return nf_conntrack_icmpv6_redirect(tmpl, skb, dataoff, state); /* is not error message ? */ if (icmp6h->icmp6_type >= 128) return NF_ACCEPT; memcpy(&outer_daddr.ip6, &ipv6_hdr(skb)->daddr, sizeof(outer_daddr.ip6)); dataoff += sizeof(*icmp6h); return nf_conntrack_inet_error(tmpl, skb, dataoff, state, IPPROTO_ICMPV6, &outer_daddr); } #if IS_ENABLED(CONFIG_NF_CT_NETLINK) #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/nfnetlink_conntrack.h> static int icmpv6_tuple_to_nlattr(struct sk_buff *skb, const struct nf_conntrack_tuple *t) { if (nla_put_be16(skb, CTA_PROTO_ICMPV6_ID, t->src.u.icmp.id) || nla_put_u8(skb, CTA_PROTO_ICMPV6_TYPE, t->dst.u.icmp.type) || nla_put_u8(skb, CTA_PROTO_ICMPV6_CODE, t->dst.u.icmp.code)) goto nla_put_failure; return 0; nla_put_failure: return -1; } static const struct nla_policy icmpv6_nla_policy[CTA_PROTO_MAX+1] = { [CTA_PROTO_ICMPV6_TYPE] = { .type = NLA_U8 }, [CTA_PROTO_ICMPV6_CODE] = { .type = NLA_U8 }, [CTA_PROTO_ICMPV6_ID] = { .type = NLA_U16 }, }; static int icmpv6_nlattr_to_tuple(struct nlattr *tb[], struct nf_conntrack_tuple *tuple, u_int32_t flags) { if (flags & CTA_FILTER_FLAG(CTA_PROTO_ICMPV6_TYPE)) { if (!tb[CTA_PROTO_ICMPV6_TYPE]) return -EINVAL; tuple->dst.u.icmp.type = nla_get_u8(tb[CTA_PROTO_ICMPV6_TYPE]); if (tuple->dst.u.icmp.type < 128 || tuple->dst.u.icmp.type - 128 >= sizeof(invmap) || !invmap[tuple->dst.u.icmp.type - 128]) return -EINVAL; } if (flags & CTA_FILTER_FLAG(CTA_PROTO_ICMPV6_CODE)) { if (!tb[CTA_PROTO_ICMPV6_CODE]) return -EINVAL; tuple->dst.u.icmp.code = nla_get_u8(tb[CTA_PROTO_ICMPV6_CODE]); } if (flags & CTA_FILTER_FLAG(CTA_PROTO_ICMPV6_ID)) { if (!tb[CTA_PROTO_ICMPV6_ID]) return -EINVAL; tuple->src.u.icmp.id = nla_get_be16(tb[CTA_PROTO_ICMPV6_ID]); } return 0; } static unsigned int icmpv6_nlattr_tuple_size(void) { static unsigned int size __read_mostly; if (!size) size = nla_policy_len(icmpv6_nla_policy, CTA_PROTO_MAX + 1); return size; } #endif #ifdef CONFIG_NF_CONNTRACK_TIMEOUT #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/nfnetlink_cttimeout.h> static int icmpv6_timeout_nlattr_to_obj(struct nlattr *tb[], struct net *net, void *data) { unsigned int *timeout = data; struct nf_icmp_net *in = nf_icmpv6_pernet(net); if (!timeout) timeout = icmpv6_get_timeouts(net); if (tb[CTA_TIMEOUT_ICMPV6_TIMEOUT]) { *timeout = ntohl(nla_get_be32(tb[CTA_TIMEOUT_ICMPV6_TIMEOUT])) * HZ; } else { /* Set default ICMPv6 timeout. */ *timeout = in->timeout; } return 0; } static int icmpv6_timeout_obj_to_nlattr(struct sk_buff *skb, const void *data) { const unsigned int *timeout = data; if (nla_put_be32(skb, CTA_TIMEOUT_ICMPV6_TIMEOUT, htonl(*timeout / HZ))) goto nla_put_failure; return 0; nla_put_failure: return -ENOSPC; } static const struct nla_policy icmpv6_timeout_nla_policy[CTA_TIMEOUT_ICMPV6_MAX+1] = { [CTA_TIMEOUT_ICMPV6_TIMEOUT] = { .type = NLA_U32 }, }; #endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ void nf_conntrack_icmpv6_init_net(struct net *net) { struct nf_icmp_net *in = nf_icmpv6_pernet(net); in->timeout = nf_ct_icmpv6_timeout; } const struct nf_conntrack_l4proto nf_conntrack_l4proto_icmpv6 = { .l4proto = IPPROTO_ICMPV6, #if IS_ENABLED(CONFIG_NF_CT_NETLINK) .tuple_to_nlattr = icmpv6_tuple_to_nlattr, .nlattr_tuple_size = icmpv6_nlattr_tuple_size, .nlattr_to_tuple = icmpv6_nlattr_to_tuple, .nla_policy = icmpv6_nla_policy, #endif #ifdef CONFIG_NF_CONNTRACK_TIMEOUT .ctnl_timeout = { .nlattr_to_obj = icmpv6_timeout_nlattr_to_obj, .obj_to_nlattr = icmpv6_timeout_obj_to_nlattr, .nlattr_max = CTA_TIMEOUT_ICMP_MAX, .obj_size = sizeof(unsigned int), .nla_policy = icmpv6_timeout_nla_policy, }, #endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ };
10067 10070 8386 265 263 182 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 // SPDX-License-Identifier: GPL-2.0 #include <linux/export.h> #include <linux/spinlock.h> #include <linux/atomic.h> /* * This is an implementation of the notion of "decrement a * reference count, and return locked if it decremented to zero". * * NOTE NOTE NOTE! This is _not_ equivalent to * * if (atomic_dec_and_test(&atomic)) { * spin_lock(&lock); * return 1; * } * return 0; * * because the spin-lock and the decrement must be * "atomic". */ int _atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock) { /* Subtract 1 from counter unless that drops it to 0 (ie. it was 1) */ if (atomic_add_unless(atomic, -1, 1)) return 0; /* Otherwise do it the slow way */ spin_lock(lock); if (atomic_dec_and_test(atomic)) return 1; spin_unlock(lock); return 0; } EXPORT_SYMBOL(_atomic_dec_and_lock); int _atomic_dec_and_lock_irqsave(atomic_t *atomic, spinlock_t *lock, unsigned long *flags) { /* Subtract 1 from counter unless that drops it to 0 (ie. it was 1) */ if (atomic_add_unless(atomic, -1, 1)) return 0; /* Otherwise do it the slow way */ spin_lock_irqsave(lock, *flags); if (atomic_dec_and_test(atomic)) return 1; spin_unlock_irqrestore(lock, *flags); return 0; } EXPORT_SYMBOL(_atomic_dec_and_lock_irqsave); int _atomic_dec_and_raw_lock(atomic_t *atomic, raw_spinlock_t *lock) { /* Subtract 1 from counter unless that drops it to 0 (ie. it was 1) */ if (atomic_add_unless(atomic, -1, 1)) return 0; /* Otherwise do it the slow way */ raw_spin_lock(lock); if (atomic_dec_and_test(atomic)) return 1; raw_spin_unlock(lock); return 0; } EXPORT_SYMBOL(_atomic_dec_and_raw_lock); int _atomic_dec_and_raw_lock_irqsave(atomic_t *atomic, raw_spinlock_t *lock, unsigned long *flags) { /* Subtract 1 from counter unless that drops it to 0 (ie. it was 1) */ if (atomic_add_unless(atomic, -1, 1)) return 0; /* Otherwise do it the slow way */ raw_spin_lock_irqsave(lock, *flags); if (atomic_dec_and_test(atomic)) return 1; raw_spin_unlock_irqrestore(lock, *flags); return 0; } EXPORT_SYMBOL(_atomic_dec_and_raw_lock_irqsave);
1 1 1 2 12 1 9 2 2 13 4 1 10 1 2 2 1 1 4 2 2 3 1 1 1 1 5 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 // SPDX-License-Identifier: GPL-2.0-only /* * kernel/ksysfs.c - sysfs attributes in /sys/kernel, which * are not related to any other subsystem * * Copyright (C) 2004 Kay Sievers <kay.sievers@vrfy.org> */ #include <asm/byteorder.h> #include <linux/kobject.h> #include <linux/string.h> #include <linux/sysfs.h> #include <linux/export.h> #include <linux/init.h> #include <linux/kexec.h> #include <linux/profile.h> #include <linux/stat.h> #include <linux/sched.h> #include <linux/capability.h> #include <linux/compiler.h> #include <linux/rcupdate.h> /* rcu_expedited and rcu_normal */ #if defined(__LITTLE_ENDIAN) #define CPU_BYTEORDER_STRING "little" #elif defined(__BIG_ENDIAN) #define CPU_BYTEORDER_STRING "big" #else #error Unknown byteorder #endif #define KERNEL_ATTR_RO(_name) \ static struct kobj_attribute _name##_attr = __ATTR_RO(_name) #define KERNEL_ATTR_RW(_name) \ static struct kobj_attribute _name##_attr = __ATTR_RW(_name) /* current uevent sequence number */ static ssize_t uevent_seqnum_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%llu\n", (u64)atomic64_read(&uevent_seqnum)); } KERNEL_ATTR_RO(uevent_seqnum); /* cpu byteorder */ static ssize_t cpu_byteorder_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%s\n", CPU_BYTEORDER_STRING); } KERNEL_ATTR_RO(cpu_byteorder); /* address bits */ static ssize_t address_bits_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%zu\n", sizeof(void *) * 8 /* CHAR_BIT */); } KERNEL_ATTR_RO(address_bits); #ifdef CONFIG_UEVENT_HELPER /* uevent helper program, used during early boot */ static ssize_t uevent_helper_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%s\n", uevent_helper); } static ssize_t uevent_helper_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { if (count+1 > UEVENT_HELPER_PATH_LEN) return -ENOENT; memcpy(uevent_helper, buf, count); uevent_helper[count] = '\0'; if (count && uevent_helper[count-1] == '\n') uevent_helper[count-1] = '\0'; return count; } KERNEL_ATTR_RW(uevent_helper); #endif #ifdef CONFIG_PROFILING static ssize_t profiling_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%d\n", prof_on); } static ssize_t profiling_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { int ret; static DEFINE_MUTEX(lock); /* * We need serialization, for profile_setup() initializes prof_on * value and profile_init() must not reallocate prof_buffer after * once allocated. */ guard(mutex)(&lock); if (prof_on) return -EEXIST; /* * This eventually calls into get_option() which * has a ton of callers and is not const. It is * easiest to cast it away here. */ profile_setup((char *)buf); ret = profile_init(); if (ret) return ret; ret = create_proc_profile(); if (ret) return ret; return count; } KERNEL_ATTR_RW(profiling); #endif #ifdef CONFIG_KEXEC_CORE static ssize_t kexec_loaded_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%d\n", !!kexec_image); } KERNEL_ATTR_RO(kexec_loaded); #ifdef CONFIG_CRASH_DUMP static ssize_t kexec_crash_loaded_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%d\n", kexec_crash_loaded()); } KERNEL_ATTR_RO(kexec_crash_loaded); static ssize_t kexec_crash_size_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { ssize_t size = crash_get_memory_size(); if (size < 0) return size; return sysfs_emit(buf, "%zd\n", size); } static ssize_t kexec_crash_size_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { unsigned long cnt; int ret; if (kstrtoul(buf, 0, &cnt)) return -EINVAL; ret = crash_shrink_memory(cnt); return ret < 0 ? ret : count; } KERNEL_ATTR_RW(kexec_crash_size); #endif /* CONFIG_CRASH_DUMP*/ #endif /* CONFIG_KEXEC_CORE */ #ifdef CONFIG_VMCORE_INFO static ssize_t vmcoreinfo_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { phys_addr_t vmcore_base = paddr_vmcoreinfo_note(); return sysfs_emit(buf, "%pa %x\n", &vmcore_base, (unsigned int)VMCOREINFO_NOTE_SIZE); } KERNEL_ATTR_RO(vmcoreinfo); #ifdef CONFIG_CRASH_HOTPLUG static ssize_t crash_elfcorehdr_size_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { unsigned int sz = crash_get_elfcorehdr_size(); return sysfs_emit(buf, "%u\n", sz); } KERNEL_ATTR_RO(crash_elfcorehdr_size); #endif #endif /* CONFIG_VMCORE_INFO */ /* whether file capabilities are enabled */ static ssize_t fscaps_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%d\n", file_caps_enabled); } KERNEL_ATTR_RO(fscaps); #ifndef CONFIG_TINY_RCU int rcu_expedited; static ssize_t rcu_expedited_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%d\n", READ_ONCE(rcu_expedited)); } static ssize_t rcu_expedited_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { if (kstrtoint(buf, 0, &rcu_expedited)) return -EINVAL; return count; } KERNEL_ATTR_RW(rcu_expedited); int rcu_normal; static ssize_t rcu_normal_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%d\n", READ_ONCE(rcu_normal)); } static ssize_t rcu_normal_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { if (kstrtoint(buf, 0, &rcu_normal)) return -EINVAL; return count; } KERNEL_ATTR_RW(rcu_normal); #endif /* #ifndef CONFIG_TINY_RCU */ /* * Make /sys/kernel/notes give the raw contents of our kernel .notes section. */ extern const void __start_notes; extern const void __stop_notes; #define notes_size (&__stop_notes - &__start_notes) static __ro_after_init BIN_ATTR_SIMPLE_RO(notes); struct kobject *kernel_kobj; EXPORT_SYMBOL_GPL(kernel_kobj); static struct attribute * kernel_attrs[] = { &fscaps_attr.attr, &uevent_seqnum_attr.attr, &cpu_byteorder_attr.attr, &address_bits_attr.attr, #ifdef CONFIG_UEVENT_HELPER &uevent_helper_attr.attr, #endif #ifdef CONFIG_PROFILING &profiling_attr.attr, #endif #ifdef CONFIG_KEXEC_CORE &kexec_loaded_attr.attr, #ifdef CONFIG_CRASH_DUMP &kexec_crash_loaded_attr.attr, &kexec_crash_size_attr.attr, #endif #endif #ifdef CONFIG_VMCORE_INFO &vmcoreinfo_attr.attr, #ifdef CONFIG_CRASH_HOTPLUG &crash_elfcorehdr_size_attr.attr, #endif #endif #ifndef CONFIG_TINY_RCU &rcu_expedited_attr.attr, &rcu_normal_attr.attr, #endif NULL }; static const struct attribute_group kernel_attr_group = { .attrs = kernel_attrs, }; static int __init ksysfs_init(void) { int error; kernel_kobj = kobject_create_and_add("kernel", NULL); if (!kernel_kobj) { error = -ENOMEM; goto exit; } error = sysfs_create_group(kernel_kobj, &kernel_attr_group); if (error) goto kset_exit; if (notes_size > 0) { bin_attr_notes.private = (void *)&__start_notes; bin_attr_notes.size = notes_size; error = sysfs_create_bin_file(kernel_kobj, &bin_attr_notes); if (error) goto group_exit; } return 0; group_exit: sysfs_remove_group(kernel_kobj, &kernel_attr_group); kset_exit: kobject_put(kernel_kobj); exit: return error; } core_initcall(ksysfs_init);
700 2702 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_TLB_H #define _ASM_X86_TLB_H #define tlb_flush tlb_flush static inline void tlb_flush(struct mmu_gather *tlb); #include <asm-generic/tlb.h> #include <linux/kernel.h> #include <vdso/bits.h> #include <vdso/page.h> static inline void tlb_flush(struct mmu_gather *tlb) { unsigned long start = 0UL, end = TLB_FLUSH_ALL; unsigned int stride_shift = tlb_get_unmap_shift(tlb); if (!tlb->fullmm && !tlb->need_flush_all) { start = tlb->start; end = tlb->end; } flush_tlb_mm_range(tlb->mm, start, end, stride_shift, tlb->freed_tables); } static inline void invlpg(unsigned long addr) { asm volatile("invlpg (%0)" ::"r" (addr) : "memory"); } enum addr_stride { PTE_STRIDE = 0, PMD_STRIDE = 1 }; /* * INVLPGB can be targeted by virtual address, PCID, ASID, or any combination * of the three. For example: * - FLAG_VA | FLAG_INCLUDE_GLOBAL: invalidate all TLB entries at the address * - FLAG_PCID: invalidate all TLB entries matching the PCID * * The first is used to invalidate (kernel) mappings at a particular * address across all processes. * * The latter invalidates all TLB entries matching a PCID. */ #define INVLPGB_FLAG_VA BIT(0) #define INVLPGB_FLAG_PCID BIT(1) #define INVLPGB_FLAG_ASID BIT(2) #define INVLPGB_FLAG_INCLUDE_GLOBAL BIT(3) #define INVLPGB_FLAG_FINAL_ONLY BIT(4) #define INVLPGB_FLAG_INCLUDE_NESTED BIT(5) /* The implied mode when all bits are clear: */ #define INVLPGB_MODE_ALL_NONGLOBALS 0UL #ifdef CONFIG_BROADCAST_TLB_FLUSH /* * INVLPGB does broadcast TLB invalidation across all the CPUs in the system. * * The INVLPGB instruction is weakly ordered, and a batch of invalidations can * be done in a parallel fashion. * * The instruction takes the number of extra pages to invalidate, beyond the * first page, while __invlpgb gets the more human readable number of pages to * invalidate. * * The bits in rax[0:2] determine respectively which components of the address * (VA, PCID, ASID) get compared when flushing. If neither bits are set, *any* * address in the specified range matches. * * Since it is desired to only flush TLB entries for the ASID that is executing * the instruction (a host/hypervisor or a guest), the ASID valid bit should * always be set. On a host/hypervisor, the hardware will use the ASID value * specified in EDX[15:0] (which should be 0). On a guest, the hardware will * use the actual ASID value of the guest. * * TLBSYNC is used to ensure that pending INVLPGB invalidations initiated from * this CPU have completed. */ static inline void __invlpgb(unsigned long asid, unsigned long pcid, unsigned long addr, u16 nr_pages, enum addr_stride stride, u8 flags) { u64 rax = addr | flags | INVLPGB_FLAG_ASID; u32 ecx = (stride << 31) | (nr_pages - 1); u32 edx = (pcid << 16) | asid; /* The low bits in rax are for flags. Verify addr is clean. */ VM_WARN_ON_ONCE(addr & ~PAGE_MASK); /* INVLPGB; supported in binutils >= 2.36. */ asm volatile(".byte 0x0f, 0x01, 0xfe" :: "a" (rax), "c" (ecx), "d" (edx)); } static inline void __invlpgb_all(unsigned long asid, unsigned long pcid, u8 flags) { __invlpgb(asid, pcid, 0, 1, 0, flags); } static inline void __tlbsync(void) { /* * TLBSYNC waits for INVLPGB instructions originating on the same CPU * to have completed. Print a warning if the task has been migrated, * and might not be waiting on all the INVLPGBs issued during this TLB * invalidation sequence. */ cant_migrate(); /* TLBSYNC: supported in binutils >= 0.36. */ asm volatile(".byte 0x0f, 0x01, 0xff" ::: "memory"); } #else /* Some compilers (I'm looking at you clang!) simply can't do DCE */ static inline void __invlpgb(unsigned long asid, unsigned long pcid, unsigned long addr, u16 nr_pages, enum addr_stride s, u8 flags) { } static inline void __invlpgb_all(unsigned long asid, unsigned long pcid, u8 flags) { } static inline void __tlbsync(void) { } #endif static inline void invlpgb_flush_user_nr_nosync(unsigned long pcid, unsigned long addr, u16 nr, bool stride) { enum addr_stride str = stride ? PMD_STRIDE : PTE_STRIDE; u8 flags = INVLPGB_FLAG_PCID | INVLPGB_FLAG_VA; __invlpgb(0, pcid, addr, nr, str, flags); } /* Flush all mappings for a given PCID, not including globals. */ static inline void invlpgb_flush_single_pcid_nosync(unsigned long pcid) { __invlpgb_all(0, pcid, INVLPGB_FLAG_PCID); } /* Flush all mappings, including globals, for all PCIDs. */ static inline void invlpgb_flush_all(void) { /* * TLBSYNC at the end needs to make sure all flushes done on the * current CPU have been executed system-wide. Therefore, make * sure nothing gets migrated in-between but disable preemption * as it is cheaper. */ guard(preempt)(); __invlpgb_all(0, 0, INVLPGB_FLAG_INCLUDE_GLOBAL); __tlbsync(); } /* Flush addr, including globals, for all PCIDs. */ static inline void invlpgb_flush_addr_nosync(unsigned long addr, u16 nr) { __invlpgb(0, 0, addr, nr, PTE_STRIDE, INVLPGB_FLAG_INCLUDE_GLOBAL); } /* Flush all mappings for all PCIDs except globals. */ static inline void invlpgb_flush_all_nonglobals(void) { guard(preempt)(); __invlpgb_all(0, 0, INVLPGB_MODE_ALL_NONGLOBALS); __tlbsync(); } #endif /* _ASM_X86_TLB_H */
7 7 7 7 7 7 7 7 7 7 80 81 81 81 80 81 80 80 81 81 81 7 7 7 81 4 4 4 4 4 4 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 // SPDX-License-Identifier: GPL-2.0-only /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Generic TIME_WAIT sockets functions * * From code orinally in TCP */ #include <linux/kernel.h> #include <linux/slab.h> #include <linux/module.h> #include <net/inet_hashtables.h> #include <net/inet_timewait_sock.h> #include <net/ip.h> /** * inet_twsk_bind_unhash - unhash a timewait socket from bind hash * @tw: timewait socket * @hashinfo: hashinfo pointer * * unhash a timewait socket from bind hash, if hashed. * bind hash lock must be held by caller. * Returns 1 if caller should call inet_twsk_put() after lock release. */ void inet_twsk_bind_unhash(struct inet_timewait_sock *tw, struct inet_hashinfo *hashinfo) { struct inet_bind2_bucket *tb2 = tw->tw_tb2; struct inet_bind_bucket *tb = tw->tw_tb; if (!tb) return; __sk_del_bind_node((struct sock *)tw); tw->tw_tb = NULL; tw->tw_tb2 = NULL; inet_bind2_bucket_destroy(hashinfo->bind2_bucket_cachep, tb2); inet_bind_bucket_destroy(tb); __sock_put((struct sock *)tw); } /* Must be called with locally disabled BHs. */ static void inet_twsk_kill(struct inet_timewait_sock *tw) { struct inet_hashinfo *hashinfo = tw->tw_dr->hashinfo; spinlock_t *lock = inet_ehash_lockp(hashinfo, tw->tw_hash); struct inet_bind_hashbucket *bhead, *bhead2; spin_lock(lock); sk_nulls_del_node_init_rcu((struct sock *)tw); spin_unlock(lock); /* Disassociate with bind bucket. */ bhead = &hashinfo->bhash[inet_bhashfn(twsk_net(tw), tw->tw_num, hashinfo->bhash_size)]; bhead2 = inet_bhashfn_portaddr(hashinfo, (struct sock *)tw, twsk_net(tw), tw->tw_num); spin_lock(&bhead->lock); spin_lock(&bhead2->lock); inet_twsk_bind_unhash(tw, hashinfo); spin_unlock(&bhead2->lock); spin_unlock(&bhead->lock); refcount_dec(&tw->tw_dr->tw_refcount); inet_twsk_put(tw); } void inet_twsk_free(struct inet_timewait_sock *tw) { struct module *owner = tw->tw_prot->owner; twsk_destructor((struct sock *)tw); kmem_cache_free(tw->tw_prot->twsk_prot->twsk_slab, tw); module_put(owner); } void inet_twsk_put(struct inet_timewait_sock *tw) { if (refcount_dec_and_test(&tw->tw_refcnt)) inet_twsk_free(tw); } EXPORT_SYMBOL_GPL(inet_twsk_put); static void inet_twsk_add_node_rcu(struct inet_timewait_sock *tw, struct hlist_nulls_head *list) { hlist_nulls_add_head_rcu(&tw->tw_node, list); } static void inet_twsk_schedule(struct inet_timewait_sock *tw, int timeo) { __inet_twsk_schedule(tw, timeo, false); } /* * Enter the time wait state. * Essentially we whip up a timewait bucket, copy the relevant info into it * from the SK, and mess with hash chains and list linkage. * * The caller must not access @tw anymore after this function returns. */ void inet_twsk_hashdance_schedule(struct inet_timewait_sock *tw, struct sock *sk, struct inet_hashinfo *hashinfo, int timeo) { const struct inet_sock *inet = inet_sk(sk); const struct inet_connection_sock *icsk = inet_csk(sk); struct inet_ehash_bucket *ehead = inet_ehash_bucket(hashinfo, sk->sk_hash); spinlock_t *lock = inet_ehash_lockp(hashinfo, sk->sk_hash); struct inet_bind_hashbucket *bhead, *bhead2; /* Step 1: Put TW into bind hash. Original socket stays there too. Note, that any socket with inet->num != 0 MUST be bound in binding cache, even if it is closed. */ bhead = &hashinfo->bhash[inet_bhashfn(twsk_net(tw), inet->inet_num, hashinfo->bhash_size)]; bhead2 = inet_bhashfn_portaddr(hashinfo, sk, twsk_net(tw), inet->inet_num); local_bh_disable(); spin_lock(&bhead->lock); spin_lock(&bhead2->lock); tw->tw_tb = icsk->icsk_bind_hash; WARN_ON(!icsk->icsk_bind_hash); tw->tw_tb2 = icsk->icsk_bind2_hash; WARN_ON(!icsk->icsk_bind2_hash); sk_add_bind_node((struct sock *)tw, &tw->tw_tb2->owners); spin_unlock(&bhead2->lock); spin_unlock(&bhead->lock); spin_lock(lock); /* Step 2: Hash TW into tcp ehash chain */ inet_twsk_add_node_rcu(tw, &ehead->chain); /* Step 3: Remove SK from hash chain */ if (__sk_nulls_del_node_init_rcu(sk)) sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); /* Ensure above writes are committed into memory before updating the * refcount. * Provides ordering vs later refcount_inc(). */ smp_wmb(); /* tw_refcnt is set to 3 because we have : * - one reference for bhash chain. * - one reference for ehash chain. * - one reference for timer. * Also note that after this point, we lost our implicit reference * so we are not allowed to use tw anymore. */ refcount_set(&tw->tw_refcnt, 3); inet_twsk_schedule(tw, timeo); spin_unlock(lock); local_bh_enable(); } EXPORT_SYMBOL_GPL(inet_twsk_hashdance_schedule); static void tw_timer_handler(struct timer_list *t) { struct inet_timewait_sock *tw = from_timer(tw, t, tw_timer); inet_twsk_kill(tw); } struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, struct inet_timewait_death_row *dr, const int state) { struct inet_timewait_sock *tw; if (refcount_read(&dr->tw_refcount) - 1 >= READ_ONCE(dr->sysctl_max_tw_buckets)) return NULL; tw = kmem_cache_alloc(sk->sk_prot_creator->twsk_prot->twsk_slab, GFP_ATOMIC); if (tw) { const struct inet_sock *inet = inet_sk(sk); tw->tw_dr = dr; /* Give us an identity. */ tw->tw_daddr = inet->inet_daddr; tw->tw_rcv_saddr = inet->inet_rcv_saddr; tw->tw_bound_dev_if = sk->sk_bound_dev_if; tw->tw_tos = inet->tos; tw->tw_num = inet->inet_num; tw->tw_state = TCP_TIME_WAIT; tw->tw_substate = state; tw->tw_sport = inet->inet_sport; tw->tw_dport = inet->inet_dport; tw->tw_family = sk->sk_family; tw->tw_reuse = sk->sk_reuse; tw->tw_reuseport = sk->sk_reuseport; tw->tw_hash = sk->sk_hash; tw->tw_ipv6only = 0; tw->tw_transparent = inet_test_bit(TRANSPARENT, sk); tw->tw_prot = sk->sk_prot_creator; atomic64_set(&tw->tw_cookie, atomic64_read(&sk->sk_cookie)); twsk_net_set(tw, sock_net(sk)); timer_setup(&tw->tw_timer, tw_timer_handler, 0); /* * Because we use RCU lookups, we should not set tw_refcnt * to a non null value before everything is setup for this * timewait socket. */ refcount_set(&tw->tw_refcnt, 0); __module_get(tw->tw_prot->owner); } return tw; } EXPORT_SYMBOL_GPL(inet_twsk_alloc); /* These are always called from BH context. See callers in * tcp_input.c to verify this. */ /* This is for handling early-kills of TIME_WAIT sockets. * Warning : consume reference. * Caller should not access tw anymore. */ void inet_twsk_deschedule_put(struct inet_timewait_sock *tw) { struct inet_hashinfo *hashinfo = tw->tw_dr->hashinfo; spinlock_t *lock = inet_ehash_lockp(hashinfo, tw->tw_hash); /* inet_twsk_purge() walks over all sockets, including tw ones, * and removes them via inet_twsk_deschedule_put() after a * refcount_inc_not_zero(). * * inet_twsk_hashdance_schedule() must (re)init the refcount before * arming the timer, i.e. inet_twsk_purge can obtain a reference to * a twsk that did not yet schedule the timer. * * The ehash lock synchronizes these two: * After acquiring the lock, the timer is always scheduled (else * timer_shutdown returns false), because hashdance_schedule releases * the ehash lock only after completing the timer initialization. * * Without grabbing the ehash lock, we get: * 1) cpu x sets twsk refcount to 3 * 2) cpu y bumps refcount to 4 * 3) cpu y calls inet_twsk_deschedule_put() and shuts timer down * 4) cpu x tries to start timer, but mod_timer is a noop post-shutdown * -> timer refcount is never decremented. */ spin_lock(lock); /* Makes sure hashdance_schedule() has completed */ spin_unlock(lock); if (timer_shutdown_sync(&tw->tw_timer)) inet_twsk_kill(tw); inet_twsk_put(tw); } EXPORT_SYMBOL(inet_twsk_deschedule_put); void __inet_twsk_schedule(struct inet_timewait_sock *tw, int timeo, bool rearm) { /* timeout := RTO * 3.5 * * 3.5 = 1+2+0.5 to wait for two retransmits. * * RATIONALE: if FIN arrived and we entered TIME-WAIT state, * our ACK acking that FIN can be lost. If N subsequent retransmitted * FINs (or previous seqments) are lost (probability of such event * is p^(N+1), where p is probability to lose single packet and * time to detect the loss is about RTO*(2^N - 1) with exponential * backoff). Normal timewait length is calculated so, that we * waited at least for one retransmitted FIN (maximal RTO is 120sec). * [ BTW Linux. following BSD, violates this requirement waiting * only for 60sec, we should wait at least for 240 secs. * Well, 240 consumes too much of resources 8) * ] * This interval is not reduced to catch old duplicate and * responces to our wandering segments living for two MSLs. * However, if we use PAWS to detect * old duplicates, we can reduce the interval to bounds required * by RTO, rather than MSL. So, if peer understands PAWS, we * kill tw bucket after 3.5*RTO (it is important that this number * is greater than TS tick!) and detect old duplicates with help * of PAWS. */ if (!rearm) { bool kill = timeo <= 4*HZ; __NET_INC_STATS(twsk_net(tw), kill ? LINUX_MIB_TIMEWAITKILLED : LINUX_MIB_TIMEWAITED); BUG_ON(mod_timer(&tw->tw_timer, jiffies + timeo)); refcount_inc(&tw->tw_dr->tw_refcount); } else { mod_timer_pending(&tw->tw_timer, jiffies + timeo); } } EXPORT_SYMBOL_GPL(__inet_twsk_schedule); /* Remove all non full sockets (TIME_WAIT and NEW_SYN_RECV) for dead netns */ void inet_twsk_purge(struct inet_hashinfo *hashinfo) { struct inet_ehash_bucket *head = &hashinfo->ehash[0]; unsigned int ehash_mask = hashinfo->ehash_mask; struct hlist_nulls_node *node; unsigned int slot; struct sock *sk; for (slot = 0; slot <= ehash_mask; slot++, head++) { if (hlist_nulls_empty(&head->chain)) continue; restart_rcu: cond_resched(); rcu_read_lock(); restart: sk_nulls_for_each_rcu(sk, node, &head->chain) { int state = inet_sk_state_load(sk); if ((1 << state) & ~(TCPF_TIME_WAIT | TCPF_NEW_SYN_RECV)) continue; if (refcount_read(&sock_net(sk)->ns.count)) continue; if (unlikely(!refcount_inc_not_zero(&sk->sk_refcnt))) continue; if (refcount_read(&sock_net(sk)->ns.count)) { sock_gen_put(sk); goto restart; } rcu_read_unlock(); local_bh_disable(); if (state == TCP_TIME_WAIT) { inet_twsk_deschedule_put(inet_twsk(sk)); } else { struct request_sock *req = inet_reqsk(sk); inet_csk_reqsk_queue_drop_and_put(req->rsk_listener, req); } local_bh_enable(); goto restart_rcu; } /* If the nulls value we got at the end of this lookup is * not the expected one, we must restart lookup. * We probably met an item that was moved to another chain. */ if (get_nulls_value(node) != slot) goto restart; rcu_read_unlock(); } } EXPORT_SYMBOL_GPL(inet_twsk_purge);
56 55 56 56 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498