/*	$NetBSD: vfs_lookup.c,v 1.234 2023/05/01 05:12:44 mlelstv Exp $	*/

/*
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_lookup.c	8.10 (Berkeley) 5/27/95
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: vfs_lookup.c,v 1.234 2023/05/01 05:12:44 mlelstv Exp $");

#ifdef _KERNEL_OPT
#include "opt_magiclinks.h"
#endif

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/syslimits.h>
#include <sys/time.h>
#include <sys/namei.h>
#include <sys/vnode.h>
#include <sys/vnode_impl.h>
#include <sys/fstrans.h>
#include <sys/mount.h>
#include <sys/errno.h>
#include <sys/filedesc.h>
#include <sys/hash.h>
#include <sys/proc.h>
#include <sys/syslog.h>
#include <sys/kauth.h>
#include <sys/ktrace.h>
#include <sys/dirent.h>

#ifndef MAGICLINKS
#define MAGICLINKS 0
#endif

int vfs_magiclinks = MAGICLINKS;

__CTASSERT(MAXNAMLEN == NAME_MAX);

/*
 * Substitute replacement text for 'magic' strings in symlinks.
 * Returns 0 if successful, and returns non-zero if an error
 * occurs.  (Currently, the only possible error is running out
 * of temporary pathname space.)
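 *
 * For example, with vfs_magiclinks enabled, a link target of
 * "/usr/pkg/@machine_arch/lib" has "@machine_arch" replaced by
 * PROC_MACHINE_ARCH(p), and "@{uid}" is replaced by the caller's
 * effective uid.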
 *
 * Looks for "@<string>" and "@<string>/", where <string> is a
 * recognized 'magic' string.  Replaces the "@<string>" with the
 * appropriate replacement text.  (Note that in some cases the
 * replacement text may have zero length.)
 *
 * This would have been table driven, but the variance in
 * replacement strings (and replacement string lengths) made
 * that impractical.
 */
#define	VNL(x)							\
	(sizeof(x) - 1)

#define	VO	'{'
#define	VC	'}'

#define	MATCH(str)						\
	((termchar == '/' && i + VNL(str) == *len) ||		\
	 (i + VNL(str) < *len &&				\
	  cp[i + VNL(str)] == termchar)) &&			\
	!strncmp((str), &cp[i], VNL(str))

#define	SUBSTITUTE(m, s, sl)					\
	if ((newlen + (sl)) >= MAXPATHLEN)			\
		return 1;					\
	i += VNL(m);						\
	if (termchar != '/')					\
		i++;						\
	(void)memcpy(&tmp[newlen], (s), (sl));			\
	newlen += (sl);						\
	change = 1;						\
	termchar = '/';

static int
symlink_magic(struct proc *p, char *cp, size_t *len)
{
	char *tmp;
	size_t change, i, newlen, slen;
	char termchar = '/';
	char idtmp[11]; /* enough for 32 bit *unsigned* integer */

	tmp = PNBUF_GET();
	for (change = i = newlen = 0; i < *len; ) {
		if (cp[i] != '@') {
			tmp[newlen++] = cp[i++];
			continue;
		}

		i++;

		/* Check for @{var} syntax. */
		if (cp[i] == VO) {
			termchar = VC;
			i++;
		}

		/*
		 * The following checks should be ordered according
		 * to frequency of use.
		 */
		if (MATCH("machine_arch")) {
			slen = strlen(PROC_MACHINE_ARCH(p));
			SUBSTITUTE("machine_arch", PROC_MACHINE_ARCH(p),
			    slen);
		} else if (MATCH("machine")) {
			slen = VNL(MACHINE);
			SUBSTITUTE("machine", MACHINE, slen);
		} else if (MATCH("hostname")) {
			SUBSTITUTE("hostname", hostname, hostnamelen);
		} else if (MATCH("osrelease")) {
			slen = strlen(osrelease);
			SUBSTITUTE("osrelease", osrelease, slen);
		} else if (MATCH("emul")) {
			slen = strlen(p->p_emul->e_name);
			SUBSTITUTE("emul", p->p_emul->e_name, slen);
		} else if (MATCH("kernel_ident")) {
			slen = strlen(kernel_ident);
			SUBSTITUTE("kernel_ident", kernel_ident, slen);
		} else if (MATCH("domainname")) {
			SUBSTITUTE("domainname", domainname, domainnamelen);
		} else if (MATCH("ostype")) {
			slen = strlen(ostype);
			SUBSTITUTE("ostype", ostype, slen);
		} else if (MATCH("uid")) {
			slen = snprintf(idtmp, sizeof(idtmp), "%u",
			    kauth_cred_geteuid(kauth_cred_get()));
			SUBSTITUTE("uid", idtmp, slen);
		} else if (MATCH("ruid")) {
			slen = snprintf(idtmp, sizeof(idtmp), "%u",
			    kauth_cred_getuid(kauth_cred_get()));
			SUBSTITUTE("ruid", idtmp, slen);
		} else if (MATCH("gid")) {
			slen = snprintf(idtmp, sizeof(idtmp), "%u",
			    kauth_cred_getegid(kauth_cred_get()));
			SUBSTITUTE("gid", idtmp, slen);
		} else if (MATCH("rgid")) {
			slen = snprintf(idtmp, sizeof(idtmp), "%u",
			    kauth_cred_getgid(kauth_cred_get()));
			SUBSTITUTE("rgid", idtmp, slen);
		} else {
			tmp[newlen++] = '@';
			if (termchar == VC)
				tmp[newlen++] = VO;
		}
	}

	if (change) {
		(void)memcpy(cp, tmp, newlen);
		*len = newlen;
	}
	PNBUF_PUT(tmp);

	return 0;
}

#undef VNL
#undef VO
#undef VC
#undef MATCH
#undef SUBSTITUTE

////////////////////////////////////////////////////////////

/*
 * Determine the namei hash (for the namecache) for name.
 * If *ep != NULL, hash from name to ep-1.
 * If *ep == NULL, hash from name until the first NUL or '/', and
 * return the location of this termination character in *ep.
 *
 * This function returns an equivalent hash to the MI hash32_strn().
 * The latter isn't used because in the *ep == NULL case, determining
 * the length of the string to the first NUL or `/' and then calling
 * hash32_strn() involves unnecessary double-handling of the data.
 */
uint32_t
namei_hash(const char *name, const char **ep)
{
	uint32_t hash;

	hash = HASH32_STR_INIT;
	if (*ep != NULL) {
		for (; name < *ep; name++)
			hash = hash * 33 + *(const uint8_t *)name;
	} else {
		for (; *name != '\0' && *name != '/'; name++)
			hash = hash * 33 + *(const uint8_t *)name;
		*ep = name;
	}
	return (hash + (hash >> 5));
}

////////////////////////////////////////////////////////////

/*
 * Sealed abstraction for pathnames.
 *
 * System-call-layer level code that is going to call namei should
 * first create a pathbuf and adjust all the bells and whistles on it
 * as needed by context.
 */

struct pathbuf {
	char *pb_path;
	char *pb_pathcopy;
	unsigned pb_pathcopyuses;
};

static struct pathbuf *
pathbuf_create_raw(void)
{
	struct pathbuf *pb;

	pb = kmem_alloc(sizeof(*pb), KM_SLEEP);
	pb->pb_path = PNBUF_GET();
	if (pb->pb_path == NULL) {
		kmem_free(pb, sizeof(*pb));
		return NULL;
	}
	pb->pb_pathcopy = NULL;
	pb->pb_pathcopyuses = 0;
	return pb;
}

void
pathbuf_destroy(struct pathbuf *pb)
{
	KASSERT(pb->pb_pathcopyuses == 0);
	KASSERT(pb->pb_pathcopy == NULL);
	PNBUF_PUT(pb->pb_path);
	kmem_free(pb, sizeof(*pb));
}

struct pathbuf *
pathbuf_assimilate(char *pnbuf)
{
	struct pathbuf *pb;

	pb = kmem_alloc(sizeof(*pb), KM_SLEEP);
	pb->pb_path = pnbuf;
	pb->pb_pathcopy = NULL;
	pb->pb_pathcopyuses = 0;
	return pb;
}

struct pathbuf *
pathbuf_create(const char *path)
{
	struct pathbuf *pb;
	int error;

	pb = pathbuf_create_raw();
	if (pb == NULL) {
		return NULL;
	}
	error = copystr(path, pb->pb_path, PATH_MAX, NULL);
	if (error != 0) {
		KASSERT(!"kernel path too long in pathbuf_create");
		/* make sure it's null-terminated, just in case */
		pb->pb_path[PATH_MAX-1] = '\0';
	}
	return pb;
}

int
pathbuf_copyin(const char *userpath, struct pathbuf **ret)
{
	struct pathbuf *pb;
	int error;

	pb = pathbuf_create_raw();
	if (pb == NULL) {
		return ENOMEM;
	}
	error = copyinstr(userpath, pb->pb_path, PATH_MAX, NULL);
	if (error) {
		pathbuf_destroy(pb);
		return error;
	}
	*ret = pb;
	return 0;
}

/*
 * XXX should not exist:
 * 1. whether a pointer is kernel or user should be statically checkable.
 * 2. copyin should be handled by the upper part of the syscall layer,
 *    not in here.
 */
int
pathbuf_maybe_copyin(const char *path, enum uio_seg seg, struct pathbuf **ret)
{
	if (seg == UIO_USERSPACE) {
		return pathbuf_copyin(path, ret);
	} else {
		*ret = pathbuf_create(path);
		if (*ret == NULL) {
			return ENOMEM;
		}
		return 0;
	}
}

/*
 * Get a copy of the path buffer as it currently exists. If this is
 * called after namei starts the results may be arbitrary.
 */
void
pathbuf_copystring(const struct pathbuf *pb, char *buf, size_t maxlen)
{
	strlcpy(buf, pb->pb_path, maxlen);
}

/*
 * These two functions allow access to a saved copy of the original
 * path string. The first copy should be gotten before namei is
 * called. Each copy that is gotten should be put back.
 */
const char *
pathbuf_stringcopy_get(struct pathbuf *pb)
{
	if (pb->pb_pathcopyuses == 0) {
		pb->pb_pathcopy = PNBUF_GET();
		strcpy(pb->pb_pathcopy, pb->pb_path);
	}
	pb->pb_pathcopyuses++;
	return pb->pb_pathcopy;
}

void
pathbuf_stringcopy_put(struct pathbuf *pb, const char *str)
{
	KASSERT(str == pb->pb_pathcopy);
	KASSERT(pb->pb_pathcopyuses > 0);
	pb->pb_pathcopyuses--;
	if (pb->pb_pathcopyuses == 0) {
		PNBUF_PUT(pb->pb_pathcopy);
		pb->pb_pathcopy = NULL;
	}
}

////////////////////////////////////////////////////////////

/*
 * namei: convert a pathname into a pointer to a (maybe-locked) vnode,
 * and maybe also its parent directory vnode, and assorted other guff.
 * See namei(9) for the interface documentation.
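 *
 * A typical system-call-layer caller does roughly the following
 * (sketch only, error handling abbreviated; NDINIT() and vput() come
 * from sys/namei.h and the vnode interface, not from this file):
 *
 *	struct pathbuf *pb;
 *	struct nameidata nd;
 *
 *	error = pathbuf_copyin(userpath, &pb);
 *	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, pb);
 *	error = namei(&nd);
 *	// ... use nd.ni_vp, then vput() it (locked because of LOCKLEAF)
 *	pathbuf_destroy(pb);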
* * * The FOLLOW flag is set when symbolic links are to be followed * when they occur at the end of the name translation process. * Symbolic links are always followed for all other pathname * components other than the last. * * The segflg defines whether the name is to be copied from user * space or kernel space. * * Overall outline of namei: * * copy in name * get starting directory * while (!done && !error) { * call lookup to search path. * if symbolic link, massage name in buffer and continue * } */ /* * Search a pathname. * This is a very central and rather complicated routine. * * The pathname is pointed to by ni_ptr and is of length ni_pathlen. * The starting directory is passed in. The pathname is descended * until done, or a symbolic link is encountered. The variable ni_more * is clear if the path is completed; it is set to one if a symbolic * link needing interpretation is encountered. * * The flag argument is LOOKUP, CREATE, RENAME, or DELETE depending on * whether the name is to be looked up, created, renamed, or deleted. * When CREATE, RENAME, or DELETE is specified, information usable in * creating, renaming, or deleting a directory entry may be calculated. * If flag has LOCKPARENT or'ed into it, the parent directory is returned * locked. Otherwise the parent directory is not returned. If the target * of the pathname exists and LOCKLEAF is or'ed into the flag the target * is returned locked, otherwise it is returned unlocked. When creating * or renaming and LOCKPARENT is specified, the target may not be ".". * When deleting and LOCKPARENT is specified, the target may be ".". * * Overall outline of lookup: * * dirloop: * identify next component of name at ndp->ni_ptr * handle degenerate case where name is null string * if .. and crossing mount points and on mounted filesys, find parent * call VOP_LOOKUP routine for next component name * directory vnode returned in ni_dvp, locked. * component vnode returned in ni_vp (if it exists), locked. * if result vnode is mounted on and crossing mount points, * find mounted on vnode * if more components of name, do next level at dirloop * return the answer in ni_vp, locked if LOCKLEAF set * if LOCKPARENT set, return locked parent in ni_dvp */ /* * Internal state for a namei operation. * * cnp is always equal to &ndp->ni_cnp. */ struct namei_state { struct nameidata *ndp; struct componentname *cnp; int docache; /* == 0 do not cache last component */ int rdonly; /* lookup read-only flag bit */ int slashes; unsigned attempt_retry:1; /* true if error allows emul retry */ unsigned root_referenced:1; /* true if ndp->ni_rootdir and ndp->ni_erootdir were referenced */ }; /* * Initialize the namei working state. */ static void namei_init(struct namei_state *state, struct nameidata *ndp) { state->ndp = ndp; state->cnp = &ndp->ni_cnd; state->docache = 0; state->rdonly = 0; state->slashes = 0; state->root_referenced = 0; KASSERTMSG((state->cnp->cn_cred != NULL), "namei: bad cred/proc"); KASSERTMSG(((state->cnp->cn_nameiop & (~OPMASK)) == 0), "namei: nameiop contaminated with flags: %08"PRIx32, state->cnp->cn_nameiop); KASSERTMSG(((state->cnp->cn_flags & OPMASK) == 0), "name: flags contaminated with nameiops: %08"PRIx32, state->cnp->cn_flags); /* * The buffer for name translation shall be the one inside the * pathbuf. */ state->ndp->ni_pnbuf = state->ndp->ni_pathbuf->pb_path; } /* * Clean up the working namei state, leaving things ready for return * from namei. 
*/ static void namei_cleanup(struct namei_state *state) { KASSERT(state->cnp == &state->ndp->ni_cnd); if (state->root_referenced) { if (state->ndp->ni_rootdir != NULL) vrele(state->ndp->ni_rootdir); if (state->ndp->ni_erootdir != NULL) vrele(state->ndp->ni_erootdir); } } ////////////////////////////// /* * Get the directory context. * Initializes the rootdir and erootdir state and returns a reference * to the starting dir. */ static struct vnode * namei_getstartdir(struct namei_state *state) { struct nameidata *ndp = state->ndp; struct componentname *cnp = state->cnp; struct cwdinfo *cwdi; /* pointer to cwd state */ struct lwp *self = curlwp; /* thread doing namei() */ struct vnode *rootdir, *erootdir, *curdir, *startdir; if (state->root_referenced) { if (state->ndp->ni_rootdir != NULL) vrele(state->ndp->ni_rootdir); if (state->ndp->ni_erootdir != NULL) vrele(state->ndp->ni_erootdir); state->root_referenced = 0; } cwdi = self->l_proc->p_cwdi; rw_enter(&cwdi->cwdi_lock, RW_READER); /* root dir */ if (cwdi->cwdi_rdir == NULL || (cnp->cn_flags & NOCHROOT)) { rootdir = rootvnode; } else { rootdir = cwdi->cwdi_rdir; } /* emulation root dir, if any */ if ((cnp->cn_flags & TRYEMULROOT) == 0) { /* if we don't want it, don't fetch it */ erootdir = NULL; } else if (cnp->cn_flags & EMULROOTSET) { /* explicitly set emulroot; "/../" doesn't override this */ erootdir = ndp->ni_erootdir; } else if (!strncmp(ndp->ni_pnbuf, "/../", 4)) { /* explicit reference to real rootdir */ erootdir = NULL; } else { /* may be null */ erootdir = cwdi->cwdi_edir; } /* current dir */ curdir = cwdi->cwdi_cdir; if (ndp->ni_pnbuf[0] != '/') { if (ndp->ni_atdir != NULL) { startdir = ndp->ni_atdir; } else { startdir = curdir; } erootdir = NULL; } else if (cnp->cn_flags & TRYEMULROOT && erootdir != NULL) { startdir = erootdir; } else { startdir = rootdir; erootdir = NULL; } state->ndp->ni_rootdir = rootdir; state->ndp->ni_erootdir = erootdir; /* * Get a reference to the start dir so we can safely unlock cwdi. * * Must hold references to rootdir and erootdir while we're running. * A multithreaded process may chroot during namei. */ if (startdir != NULL) vref(startdir); if (state->ndp->ni_rootdir != NULL) vref(state->ndp->ni_rootdir); if (state->ndp->ni_erootdir != NULL) vref(state->ndp->ni_erootdir); state->root_referenced = 1; rw_exit(&cwdi->cwdi_lock); return startdir; } /* * Get the directory context for the nfsd case, in parallel to * getstartdir. Initializes the rootdir and erootdir state and * returns a reference to the passed-in starting dir. */ static struct vnode * namei_getstartdir_for_nfsd(struct namei_state *state) { KASSERT(state->ndp->ni_atdir != NULL); /* always use the real root, and never set an emulation root */ if (rootvnode == NULL) { return NULL; } state->ndp->ni_rootdir = rootvnode; state->ndp->ni_erootdir = NULL; vref(state->ndp->ni_atdir); KASSERT(! state->root_referenced); vref(state->ndp->ni_rootdir); state->root_referenced = 1; return state->ndp->ni_atdir; } /* * Ktrace the namei operation. */ static void namei_ktrace(struct namei_state *state) { struct nameidata *ndp = state->ndp; struct componentname *cnp = state->cnp; struct lwp *self = curlwp; /* thread doing namei() */ const char *emul_path; if (ktrpoint(KTR_NAMEI)) { if (ndp->ni_erootdir != NULL) { /* * To make any sense, the trace entry need to have the * text of the emulation path prepended. 
* Usually we can get this from the current process, * but when called from emul_find_interp() it is only * in the exec_package - so we get it passed in ni_next * (this is a hack). */ if (cnp->cn_flags & EMULROOTSET) emul_path = ndp->ni_next; else emul_path = self->l_proc->p_emul->e_path; ktrnamei2(emul_path, strlen(emul_path), ndp->ni_pnbuf, ndp->ni_pathlen); } else ktrnamei(ndp->ni_pnbuf, ndp->ni_pathlen); } } /* * Start up namei. Find the root dir and cwd, establish the starting * directory for lookup, and lock it. Also calls ktrace when * appropriate. */ static int namei_start(struct namei_state *state, int isnfsd, struct vnode **startdir_ret) { struct nameidata *ndp = state->ndp; struct vnode *startdir; /* length includes null terminator (was originally from copyinstr) */ ndp->ni_pathlen = strlen(ndp->ni_pnbuf) + 1; /* * POSIX.1 requirement: "" is not a valid file name. */ if (ndp->ni_pathlen == 1) { ndp->ni_erootdir = NULL; return ENOENT; } ndp->ni_loopcnt = 0; /* Get starting directory, set up root, and ktrace. */ if (isnfsd) { startdir = namei_getstartdir_for_nfsd(state); /* no ktrace */ } else { startdir = namei_getstartdir(state); namei_ktrace(state); } if (startdir == NULL) { return ENOENT; } /* NDAT may feed us with a non directory namei_getstartdir */ if (startdir->v_type != VDIR) { vrele(startdir); return ENOTDIR; } *startdir_ret = startdir; return 0; } /* * Check for being at a symlink that we're going to follow. */ static inline int namei_atsymlink(struct namei_state *state, struct vnode *foundobj) { return (foundobj->v_type == VLNK) && (state->cnp->cn_flags & (FOLLOW|REQUIREDIR)); } /* * Follow a symlink. * * Updates searchdir. inhibitmagic causes magic symlinks to not be * interpreted; this is used by nfsd. * * Unlocks foundobj on success (ugh) */ static inline int namei_follow(struct namei_state *state, int inhibitmagic, struct vnode *searchdir, struct vnode *foundobj, struct vnode **newsearchdir_ret) { struct nameidata *ndp = state->ndp; struct componentname *cnp = state->cnp; struct lwp *self = curlwp; /* thread doing namei() */ struct iovec aiov; /* uio for reading symbolic links */ struct uio auio; char *cp; /* pointer into pathname argument */ size_t linklen; int error; if (ndp->ni_loopcnt++ >= MAXSYMLINKS) { return ELOOP; } vn_lock(foundobj, LK_EXCLUSIVE | LK_RETRY); if (foundobj->v_mount->mnt_flag & MNT_SYMPERM) { error = VOP_ACCESS(foundobj, VEXEC, cnp->cn_cred); if (error != 0) { VOP_UNLOCK(foundobj); return error; } } /* FUTURE: fix this to not use a second buffer */ cp = PNBUF_GET(); aiov.iov_base = cp; aiov.iov_len = MAXPATHLEN; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_offset = 0; auio.uio_rw = UIO_READ; auio.uio_resid = MAXPATHLEN; UIO_SETUP_SYSSPACE(&auio); error = VOP_READLINK(foundobj, &auio, cnp->cn_cred); VOP_UNLOCK(foundobj); if (error) { PNBUF_PUT(cp); return error; } linklen = MAXPATHLEN - auio.uio_resid; if (linklen == 0) { PNBUF_PUT(cp); return ENOENT; } /* * Do symlink substitution, if appropriate, and * check length for potential overflow. * * Inhibit symlink substitution for nfsd. * XXX: This is how it was before; is that a bug or a feature? 
*/ if ((!inhibitmagic && vfs_magiclinks && symlink_magic(self->l_proc, cp, &linklen)) || (linklen + ndp->ni_pathlen >= MAXPATHLEN)) { PNBUF_PUT(cp); return ENAMETOOLONG; } if (ndp->ni_pathlen > 1) { /* includes a null-terminator */ memcpy(cp + linklen, ndp->ni_next, ndp->ni_pathlen); } else { cp[linklen] = '\0'; } ndp->ni_pathlen += linklen; memcpy(ndp->ni_pnbuf, cp, ndp->ni_pathlen); PNBUF_PUT(cp); /* we're now starting from the beginning of the buffer again */ cnp->cn_nameptr = ndp->ni_pnbuf; /* * Check if root directory should replace current directory. */ if (ndp->ni_pnbuf[0] == '/') { vrele(searchdir); /* Keep absolute symbolic links inside emulation root */ searchdir = ndp->ni_erootdir; if (searchdir == NULL || (ndp->ni_pnbuf[1] == '.' && ndp->ni_pnbuf[2] == '.' && ndp->ni_pnbuf[3] == '/')) { ndp->ni_erootdir = NULL; searchdir = ndp->ni_rootdir; } vref(searchdir); while (cnp->cn_nameptr[0] == '/') { cnp->cn_nameptr++; ndp->ni_pathlen--; } } *newsearchdir_ret = searchdir; return 0; } ////////////////////////////// /* * Inspect the leading path component and update the state accordingly. */ static int lookup_parsepath(struct namei_state *state, struct vnode *searchdir) { const char *cp; /* pointer into pathname argument */ int error; struct componentname *cnp = state->cnp; struct nameidata *ndp = state->ndp; KASSERT(cnp == &ndp->ni_cnd); /* * Search a new directory. * * The last component of the filename is left accessible via * cnp->cn_nameptr for callers that need the name. Callers needing * the name set the SAVENAME flag. When done, they assume * responsibility for freeing the pathname buffer. * * At this point, our only vnode state is that the search dir * is held. */ error = VOP_PARSEPATH(searchdir, cnp->cn_nameptr, &cnp->cn_namelen); if (error) { return error; } cp = cnp->cn_nameptr + cnp->cn_namelen; if (cnp->cn_namelen > KERNEL_NAME_MAX) { return ENAMETOOLONG; } #ifdef NAMEI_DIAGNOSTIC { char c = *cp; *(char *)cp = '\0'; printf("{%s}: ", cnp->cn_nameptr); *(char *)cp = c; } #endif /* NAMEI_DIAGNOSTIC */ ndp->ni_pathlen -= cnp->cn_namelen; ndp->ni_next = cp; /* * If this component is followed by a slash, then move the pointer to * the next component forward, and remember that this component must be * a directory. */ if (*cp == '/') { do { cp++; } while (*cp == '/'); state->slashes = cp - ndp->ni_next; ndp->ni_pathlen -= state->slashes; ndp->ni_next = cp; cnp->cn_flags |= REQUIREDIR; } else { state->slashes = 0; cnp->cn_flags &= ~REQUIREDIR; } /* * We do special processing on the last component, whether or not it's * a directory. Cache all intervening lookups, but not the final one. */ if (*cp == '\0') { if (state->docache) cnp->cn_flags |= MAKEENTRY; else cnp->cn_flags &= ~MAKEENTRY; cnp->cn_flags |= ISLASTCN; } else { cnp->cn_flags |= MAKEENTRY; cnp->cn_flags &= ~ISLASTCN; } if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.' && cnp->cn_nameptr[0] == '.') cnp->cn_flags |= ISDOTDOT; else cnp->cn_flags &= ~ISDOTDOT; return 0; } /* * Take care of crossing a mounted-on vnode. On error, foundobj_ret will be * vrele'd, but searchdir is left alone. */ static int lookup_crossmount(struct namei_state *state, struct vnode **searchdir_ret, struct vnode **foundobj_ret, bool *searchdir_locked) { struct componentname *cnp = state->cnp; struct vnode *foundobj, *vp; struct vnode *searchdir; struct mount *mp; int error, lktype; searchdir = *searchdir_ret; foundobj = *foundobj_ret; error = 0; KASSERT((cnp->cn_flags & NOCROSSMOUNT) == 0); /* First, unlock searchdir (oof). 
*/ if (*searchdir_locked) { KASSERT(searchdir != NULL); lktype = VOP_ISLOCKED(searchdir); VOP_UNLOCK(searchdir); *searchdir_locked = false; } else { lktype = LK_NONE; } /* * Do an unlocked check to see if the vnode has been mounted on; if * so find the root of the mounted file system. */ while (foundobj->v_type == VDIR && (mp = foundobj->v_mountedhere) != NULL && (cnp->cn_flags & NOCROSSMOUNT) == 0) { /* * Try the namecache first. If that doesn't work, do * it the hard way. */ if (cache_lookup_mount(foundobj, &vp)) { vrele(foundobj); foundobj = vp; } else { /* First get the vnodes mount stable. */ while ((mp = foundobj->v_mountedhere) != NULL) { fstrans_start(mp); if (fstrans_held(mp) && mp == foundobj->v_mountedhere) { break; } fstrans_done(mp); } if (mp == NULL) { break; } /* * Now get a reference on the root vnode. * XXX Future - maybe allow only VDIR here. */ error = VFS_ROOT(mp, LK_NONE, &vp); /* * If successful, enter it into the cache while * holding the mount busy (competing with unmount). */ if (error == 0) { cache_enter_mount(foundobj, vp); } /* Finally, drop references to foundobj & mountpoint. */ vrele(foundobj); fstrans_done(mp); if (error) { foundobj = NULL; break; } foundobj = vp; } /* * Avoid locking vnodes from two filesystems because * it's prone to deadlock, e.g. when using puffs. * Also, it isn't a good idea to propagate slowness of * a filesystem up to the root directory. For now, * only handle the common case, where foundobj is * VDIR. * * In this case set searchdir to null to avoid using * it again. It is not correct to set searchdir == * foundobj here as that will confuse the caller. * (See PR 40740.) */ if (searchdir == NULL) { /* already been here once; do nothing further */ } else if (foundobj->v_type == VDIR) { vrele(searchdir); *searchdir_ret = searchdir = NULL; lktype = LK_NONE; } } /* If searchdir is still around, re-lock it. */ if (error == 0 && lktype != LK_NONE) { vn_lock(searchdir, lktype | LK_RETRY); *searchdir_locked = true; } *foundobj_ret = foundobj; return error; } /* * Determine the desired locking mode for the directory of a lookup. */ static int lookup_lktype(struct vnode *searchdir, struct componentname *cnp) { /* * If the file system supports VOP_LOOKUP() with a shared lock, and * we are not making any modifications (nameiop LOOKUP) or this is * not the last component then get a shared lock. Where we can't do * fast-forwarded lookups (for example with layered file systems) * then this is the fallback for reducing lock contention. */ if ((searchdir->v_mount->mnt_iflag & IMNT_SHRLOOKUP) != 0 && (cnp->cn_nameiop == LOOKUP || (cnp->cn_flags & ISLASTCN) == 0)) { return LK_SHARED; } else { return LK_EXCLUSIVE; } } /* * Call VOP_LOOKUP for a single lookup; return a new search directory * (used when crossing mountpoints up or searching union mounts down) and * the found object, which for create operations may be NULL on success. * * Note that the new search directory may be null, which means the * searchdir was unlocked and released. This happens in the common case * when crossing a mount point downwards, in order to avoid coupling * locks between different file system volumes. Importantly, this can * happen even if the call fails. (XXX: this is gross and should be * tidied somehow.) 
*/ static int lookup_once(struct namei_state *state, struct vnode *searchdir, struct vnode **newsearchdir_ret, struct vnode **foundobj_ret, bool *newsearchdir_locked_ret) { struct vnode *tmpvn; /* scratch vnode */ struct vnode *foundobj; /* result */ struct lwp *l = curlwp; bool searchdir_locked = false; int error, lktype; struct componentname *cnp = state->cnp; struct nameidata *ndp = state->ndp; KASSERT(cnp == &ndp->ni_cnd); *newsearchdir_ret = searchdir; /* * Handle "..": two special cases. * 1. If at root directory (e.g. after chroot) * or at absolute root directory * then ignore it so can't get out. * 1a. If at the root of the emulation filesystem go to the real * root. So "/../<path>" is always absolute. * 1b. If we have somehow gotten out of a jail, warn * and also ignore it so we can't get farther out. * 2. If this vnode is the root of a mounted * filesystem, then replace it with the * vnode which was mounted on so we take the * .. in the other file system. */ if (cnp->cn_flags & ISDOTDOT) { struct proc *p = l->l_proc; for (;;) { if (searchdir == ndp->ni_rootdir || searchdir == rootvnode) { foundobj = searchdir; vref(foundobj); *foundobj_ret = foundobj; if (cnp->cn_flags & LOCKPARENT) { lktype = lookup_lktype(searchdir, cnp); vn_lock(searchdir, lktype | LK_RETRY); searchdir_locked = true; } error = 0; goto done; } if (ndp->ni_rootdir != rootvnode) { int retval; retval = vn_isunder(searchdir, ndp->ni_rootdir, l); if (!retval) { /* Oops! We got out of jail! */ log(LOG_WARNING, "chrooted pid %d uid %d (%s) " "detected outside of its chroot\n", p->p_pid, kauth_cred_geteuid(l->l_cred), p->p_comm); /* Put us at the jail root. */ vrele(searchdir); searchdir = NULL; foundobj = ndp->ni_rootdir; vref(foundobj); vref(foundobj); *newsearchdir_ret = foundobj; *foundobj_ret = foundobj; error = 0; goto done; } } if ((searchdir->v_vflag & VV_ROOT) == 0 || (cnp->cn_flags & NOCROSSMOUNT)) break; tmpvn = searchdir; searchdir = searchdir->v_mount->mnt_vnodecovered; vref(searchdir); vrele(tmpvn); *newsearchdir_ret = searchdir; } } lktype = lookup_lktype(searchdir, cnp); /* * We now have a segment name to search for, and a directory to search. * Our vnode state here is that "searchdir" is held. */ unionlookup: foundobj = NULL; if (!searchdir_locked) { vn_lock(searchdir, lktype | LK_RETRY); searchdir_locked = true; } error = VOP_LOOKUP(searchdir, &foundobj, cnp); if (error != 0) { KASSERTMSG((foundobj == NULL), "leaf `%s' should be empty but is %p", cnp->cn_nameptr, foundobj); #ifdef NAMEI_DIAGNOSTIC printf("not found\n"); #endif /* NAMEI_DIAGNOSTIC */ /* * If ENOLCK, the file system needs us to retry the lookup * with an exclusive lock. It's likely nothing was found in * cache and/or modifications need to be made. */ if (error == ENOLCK) { KASSERT(VOP_ISLOCKED(searchdir) == LK_SHARED); KASSERT(searchdir_locked); if (vn_lock(searchdir, LK_UPGRADE | LK_NOWAIT)) { VOP_UNLOCK(searchdir); searchdir_locked = false; } lktype = LK_EXCLUSIVE; goto unionlookup; } if ((error == ENOENT) && (searchdir->v_vflag & VV_ROOT) && (searchdir->v_mount->mnt_flag & MNT_UNION)) { tmpvn = searchdir; searchdir = searchdir->v_mount->mnt_vnodecovered; vref(searchdir); vput(tmpvn); searchdir_locked = false; *newsearchdir_ret = searchdir; goto unionlookup; } if (error != EJUSTRETURN) goto done; /* * If this was not the last component, or there were trailing * slashes, and we are not going to create a directory, * then the name must exist. 
*/ if ((cnp->cn_flags & (REQUIREDIR | CREATEDIR)) == REQUIREDIR) { error = ENOENT; goto done; } /* * If creating and at end of pathname, then can consider * allowing file to be created. */ if (state->rdonly) { error = EROFS; goto done; } /* * We return success and a NULL foundobj to indicate * that the entry doesn't currently exist, leaving a * pointer to the (normally, locked) directory vnode * as searchdir. */ *foundobj_ret = NULL; error = 0; goto done; } #ifdef NAMEI_DIAGNOSTIC printf("found\n"); #endif /* NAMEI_DIAGNOSTIC */ /* Unlock, unless the caller needs the parent locked. */ if (searchdir != NULL) { KASSERT(searchdir_locked); if ((cnp->cn_flags & (ISLASTCN | LOCKPARENT)) != (ISLASTCN | LOCKPARENT)) { VOP_UNLOCK(searchdir); searchdir_locked = false; } } else { KASSERT(!searchdir_locked); } *foundobj_ret = foundobj; error = 0; done: *newsearchdir_locked_ret = searchdir_locked; return error; } /* * Parse out the first path name component that we need to to consider. * * While doing this, attempt to use the name cache to fast-forward through * as many "easy" to find components of the path as possible. * * We use the namecache's node locks to form a chain, and avoid as many * vnode references and locks as possible. In the ideal case, only the * final vnode will have its reference count adjusted and lock taken. */ static int lookup_fastforward(struct namei_state *state, struct vnode **searchdir_ret, struct vnode **foundobj_ret) { struct componentname *cnp = state->cnp; struct nameidata *ndp = state->ndp; krwlock_t *plock; struct vnode *foundobj, *searchdir; int error, error2; size_t oldpathlen; const char *oldnameptr; bool terminal; /* * Eat as many path name components as possible before giving up and * letting lookup_once() handle it. Remember the starting point in * case we can't get vnode references and need to roll back. */ plock = NULL; searchdir = *searchdir_ret; oldnameptr = cnp->cn_nameptr; oldpathlen = ndp->ni_pathlen; terminal = false; for (;;) { foundobj = NULL; /* * Get the next component name. There should be no slashes * here, and we shouldn't have looped around if we were * done. */ KASSERT(cnp->cn_nameptr[0] != '/'); KASSERT(cnp->cn_nameptr[0] != '\0'); if ((error = lookup_parsepath(state, searchdir)) != 0) { break; } /* * Can't deal with DOTDOT lookups if NOCROSSMOUNT or the * lookup is chrooted. */ if ((cnp->cn_flags & ISDOTDOT) != 0) { if ((searchdir->v_vflag & VV_ROOT) != 0 && (cnp->cn_flags & NOCROSSMOUNT)) { error = EOPNOTSUPP; break; } if (ndp->ni_rootdir != rootvnode) { error = EOPNOTSUPP; break; } } /* * Can't deal with last component when modifying; this needs * searchdir locked and VOP_LOOKUP() called (which can and * does modify state, despite the name). NB: this case means * terminal is never set true when LOCKPARENT. */ if ((cnp->cn_flags & ISLASTCN) != 0) { if (cnp->cn_nameiop != LOOKUP || (cnp->cn_flags & LOCKPARENT) != 0) { error = EOPNOTSUPP; break; } } /* * Good, now look for it in cache. cache_lookup_linked() * will fail if there's nothing there, or if there's no * ownership info for the directory, or if the user doesn't * have permission to look up files in this directory. */ if (!cache_lookup_linked(searchdir, cnp->cn_nameptr, cnp->cn_namelen, &foundobj, &plock, cnp->cn_cred)) { error = EOPNOTSUPP; break; } KASSERT(plock != NULL); KASSERT(rw_lock_held(plock)); /* * Scored a hit. Negative is good too (ENOENT). If there's * a '-o union' mount here, punt and let lookup_once() deal * with it. 
*/ if (foundobj == NULL) { if ((searchdir->v_vflag & VV_ROOT) != 0 && (searchdir->v_mount->mnt_flag & MNT_UNION) != 0) { error = EOPNOTSUPP; } else { error = ENOENT; terminal = ((cnp->cn_flags & ISLASTCN) != 0); } break; } /* * Stop and get a hold on the vnode if we've encountered * something other than a dirctory. */ if (foundobj->v_type != VDIR) { error = vcache_tryvget(foundobj); if (error != 0) { foundobj = NULL; error = EOPNOTSUPP; } else { terminal = (foundobj->v_type != VLNK && (cnp->cn_flags & ISLASTCN) != 0); } break; } /* * Try to cross mountpoints, bearing in mind that they can * be stacked. If at any point we can't go further, stop * and try to get a reference on the vnode. If we are able * to get a ref then lookup_crossmount() will take care of * it, otherwise we'll fall through to lookup_once(). */ if (foundobj->v_mountedhere != NULL) { while (foundobj->v_mountedhere != NULL && (cnp->cn_flags & NOCROSSMOUNT) == 0 && cache_cross_mount(&foundobj, &plock)) { KASSERT(foundobj != NULL); KASSERT(foundobj->v_type == VDIR); } if (foundobj->v_mountedhere != NULL) { error = vcache_tryvget(foundobj); if (error != 0) { foundobj = NULL; error = EOPNOTSUPP; } break; } else { searchdir = NULL; } } /* * Time to stop if we found the last component & traversed * all mounts. */ if ((cnp->cn_flags & ISLASTCN) != 0) { error = vcache_tryvget(foundobj); if (error != 0) { foundobj = NULL; error = EOPNOTSUPP; } else { terminal = (foundobj->v_type != VLNK); } break; } /* * Otherwise, we're still in business. Set the found VDIR * vnode as the search dir for the next component and * continue on to it. */ cnp->cn_nameptr = ndp->ni_next; searchdir = foundobj; } if (terminal) { /* * If we exited the loop above having successfully located * the last component with a zero error code, and it's not a * symbolic link, then the parent directory is not needed. * Release reference to the starting parent and make the * terminal parent disappear into thin air. */ KASSERT(plock != NULL); rw_exit(plock); vrele(*searchdir_ret); *searchdir_ret = NULL; } else if (searchdir != *searchdir_ret) { /* * Otherwise we need to return the parent. If we ended up * with a new search dir, ref it before dropping the * namecache's lock. The lock prevents both searchdir and * foundobj from disappearing. If we can't ref the new * searchdir, we have a bit of a problem. Roll back the * fastforward to the beginning and let lookup_once() take * care of it. */ if (searchdir == NULL) { /* * It's possible for searchdir to be NULL in the * case of a root vnode being reclaimed while * trying to cross a mount. */ error2 = EOPNOTSUPP; } else { error2 = vcache_tryvget(searchdir); } KASSERT(plock != NULL); rw_exit(plock); if (__predict_true(error2 == 0)) { /* Returning new searchdir, and maybe new foundobj. */ vrele(*searchdir_ret); *searchdir_ret = searchdir; } else { /* Returning nothing. */ if (foundobj != NULL) { vrele(foundobj); foundobj = NULL; } cnp->cn_nameptr = oldnameptr; ndp->ni_pathlen = oldpathlen; error = lookup_parsepath(state, *searchdir_ret); if (error == 0) { error = EOPNOTSUPP; } } } else if (plock != NULL) { /* Drop any namecache lock still held. */ rw_exit(plock); } KASSERT(error == 0 ? foundobj != NULL : foundobj == NULL); *foundobj_ret = foundobj; return error; } ////////////////////////////// /* * Do a complete path search from a single root directory. * (This is called up to twice if TRYEMULROOT is in effect.) 
*/ static int namei_oneroot(struct namei_state *state, int neverfollow, int inhibitmagic, int isnfsd) { struct nameidata *ndp = state->ndp; struct componentname *cnp = state->cnp; struct vnode *searchdir, *foundobj; bool searchdir_locked = false; int error; error = namei_start(state, isnfsd, &searchdir); if (error) { ndp->ni_dvp = NULL; ndp->ni_vp = NULL; return error; } KASSERT(searchdir->v_type == VDIR); /* * Setup: break out flag bits into variables. */ state->docache = (cnp->cn_flags & NOCACHE) ^ NOCACHE; if (cnp->cn_nameiop == DELETE) state->docache = 0; state->rdonly = cnp->cn_flags & RDONLY; /* * Keep going until we run out of path components. */ cnp->cn_nameptr = ndp->ni_pnbuf; /* drop leading slashes (already used them to choose startdir) */ while (cnp->cn_nameptr[0] == '/') { cnp->cn_nameptr++; ndp->ni_pathlen--; } /* was it just "/"? */ if (cnp->cn_nameptr[0] == '\0') { foundobj = searchdir; searchdir = NULL; cnp->cn_flags |= ISLASTCN; /* bleh */ goto skiploop; } for (;;) { KASSERT(searchdir != NULL); KASSERT(!searchdir_locked); /* * Parse out the first path name component that we need to * to consider. While doing this, attempt to use the name * cache to fast-forward through as many "easy" to find * components of the path as possible. */ error = lookup_fastforward(state, &searchdir, &foundobj); /* * If we didn't get a good answer from the namecache, then * go directly to the file system. */ if (error == EOPNOTSUPP) { error = lookup_once(state, searchdir, &searchdir, &foundobj, &searchdir_locked); } /* * If the vnode we found is mounted on, then cross the mount * and get the root vnode in foundobj. If this encounters * an error, it will dispose of foundobj, but searchdir is * untouched. */ if (error == 0 && foundobj != NULL && foundobj->v_type == VDIR && foundobj->v_mountedhere != NULL && (cnp->cn_flags & NOCROSSMOUNT) == 0) { error = lookup_crossmount(state, &searchdir, &foundobj, &searchdir_locked); } if (error) { if (searchdir != NULL) { if (searchdir_locked) { searchdir_locked = false; vput(searchdir); } else { vrele(searchdir); } } ndp->ni_dvp = NULL; ndp->ni_vp = NULL; /* * Note that if we're doing TRYEMULROOT we can * retry with the normal root. Where this is * currently set matches previous practice, * but the previous practice didn't make much * sense and somebody should sit down and * figure out which cases should cause retry * and which shouldn't. XXX. */ state->attempt_retry = 1; return (error); } if (foundobj == NULL) { /* * Success with no object returned means we're * creating something and it isn't already * there. Break out of the main loop now so * the code below doesn't have to test for * foundobj == NULL. */ /* lookup_once can't have dropped the searchdir */ KASSERT(searchdir != NULL || (cnp->cn_flags & ISLASTCN) != 0); break; } /* * Check for symbolic link. If we've reached one, * follow it, unless we aren't supposed to. Back up * over any slashes that we skipped, as we will need * them again. */ if (namei_atsymlink(state, foundobj)) { /* Don't need searchdir locked any more. */ if (searchdir_locked) { searchdir_locked = false; VOP_UNLOCK(searchdir); } ndp->ni_pathlen += state->slashes; ndp->ni_next -= state->slashes; if (neverfollow) { error = EINVAL; } else if (searchdir == NULL) { /* * dholland 20160410: lookup_once only * drops searchdir if it crossed a * mount point. Therefore, if we get * here it means we crossed a mount * point to a mounted filesystem whose * root vnode is a symlink. 
In theory * we could continue at this point by * using the pre-crossing searchdir * (e.g. just take out an extra * reference on it before calling * lookup_once so we still have it), * but this will make an ugly mess and * it should never happen in practice * as only badly broken filesystems * have non-directory root vnodes. (I * have seen this sort of thing with * NFS occasionally but even then it * means something's badly wrong.) */ error = ENOTDIR; } else { /* * dholland 20110410: if we're at a * union mount it might make sense to * use the top of the union stack here * rather than the layer we found the * symlink in. (FUTURE) */ error = namei_follow(state, inhibitmagic, searchdir, foundobj, &searchdir); } if (error) { KASSERT(searchdir != foundobj); if (searchdir != NULL) { vrele(searchdir); } vrele(foundobj); ndp->ni_dvp = NULL; ndp->ni_vp = NULL; return error; } vrele(foundobj); foundobj = NULL; /* * If we followed a symlink to `/' and there * are no more components after the symlink, * we're done with the loop and what we found * is the searchdir. */ if (cnp->cn_nameptr[0] == '\0') { KASSERT(searchdir != NULL); foundobj = searchdir; searchdir = NULL; cnp->cn_flags |= ISLASTCN; break; } continue; } /* * Not a symbolic link. * * Check for directory, if the component was * followed by a series of slashes. */ if ((foundobj->v_type != VDIR) && (cnp->cn_flags & REQUIREDIR)) { KASSERT(foundobj != searchdir); if (searchdir) { if (searchdir_locked) { searchdir_locked = false; vput(searchdir); } else { vrele(searchdir); } } else { KASSERT(!searchdir_locked); } vrele(foundobj); ndp->ni_dvp = NULL; ndp->ni_vp = NULL; state->attempt_retry = 1; return ENOTDIR; } /* * Stop if we've reached the last component. */ if (cnp->cn_flags & ISLASTCN) { break; } /* * Continue with the next component. */ cnp->cn_nameptr = ndp->ni_next; if (searchdir != NULL) { if (searchdir_locked) { searchdir_locked = false; vput(searchdir); } else { vrele(searchdir); } } searchdir = foundobj; foundobj = NULL; } KASSERT((cnp->cn_flags & LOCKPARENT) == 0 || searchdir == NULL || VOP_ISLOCKED(searchdir) == LK_EXCLUSIVE); skiploop: if (foundobj != NULL) { if (foundobj == ndp->ni_erootdir) { /* * We are about to return the emulation root. * This isn't a good idea because code might * repeatedly lookup ".." until the file * matches that returned for "/" and loop * forever. So convert it to the real root. */ if (searchdir != NULL) { if (searchdir_locked) { vput(searchdir); searchdir_locked = false; } else { vrele(searchdir); } searchdir = NULL; } vrele(foundobj); foundobj = ndp->ni_rootdir; vref(foundobj); } /* * If the caller requested the parent node (i.e. it's * a CREATE, DELETE, or RENAME), and we don't have one * (because this is the root directory, or we crossed * a mount point), then we must fail. * * 20210604 dholland when NONEXCLHACK is set (open * with O_CREAT but not O_EXCL) skip this logic. Since * we have a foundobj, open will not be creating, so * it doesn't actually need or use the searchdir, so * it's ok to return it even if it's on a different * volume, and it's also ok to return NULL; by setting * NONEXCLHACK the open code promises to cope with * those cases correctly. (That is, it should do what * it would do anyway, that is, just release the * searchdir, except not crash if it's null.) This is * needed because otherwise opening mountpoints with * O_CREAT but not O_EXCL fails... which is a silly * thing to do but ought to work. 
(This whole issue * came to light because 3rd party code wanted to open * certain procfs nodes with O_CREAT for some 3rd * party reason, and it failed.) * * Note that NONEXCLHACK is properly a different * nameiop (it is partway between LOOKUP and CREATE) * but it was stuffed in as a flag instead to make the * resulting patch less invasive for pullup. Blah. */ if (cnp->cn_nameiop != LOOKUP && (searchdir == NULL || searchdir->v_mount != foundobj->v_mount) && (cnp->cn_flags & NONEXCLHACK) == 0) { if (searchdir) { if (searchdir_locked) { vput(searchdir); searchdir_locked = false; } else { vrele(searchdir); } searchdir = NULL; } vrele(foundobj); foundobj = NULL; ndp->ni_dvp = NULL; ndp->ni_vp = NULL; state->attempt_retry = 1; switch (cnp->cn_nameiop) { case CREATE: return EEXIST; case DELETE: case RENAME: return EBUSY; default: break; } panic("Invalid nameiop\n"); } /* * Disallow directory write attempts on read-only lookups. * Prefers EEXIST over EROFS for the CREATE case. */ if (state->rdonly && (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) { if (searchdir) { if (searchdir_locked) { vput(searchdir); searchdir_locked = false; } else { vrele(searchdir); } searchdir = NULL; } vrele(foundobj); foundobj = NULL; ndp->ni_dvp = NULL; ndp->ni_vp = NULL; state->attempt_retry = 1; return EROFS; } /* Lock the leaf node if requested. */ if ((cnp->cn_flags & (LOCKLEAF | LOCKPARENT)) == LOCKPARENT && searchdir == foundobj) { /* * Note: if LOCKPARENT but not LOCKLEAF is * set, and searchdir == foundobj, this code * necessarily unlocks the parent as well as * the leaf. That is, just because you specify * LOCKPARENT doesn't mean you necessarily get * a locked parent vnode. The code in * vfs_syscalls.c, and possibly elsewhere, * that uses this combination "knows" this, so * it can't be safely changed. Feh. XXX */ KASSERT(searchdir_locked); VOP_UNLOCK(searchdir); searchdir_locked = false; } else if ((cnp->cn_flags & LOCKLEAF) != 0 && (searchdir != foundobj || (cnp->cn_flags & LOCKPARENT) == 0)) { const int lktype = (cnp->cn_flags & LOCKSHARED) != 0 ? LK_SHARED : LK_EXCLUSIVE; vn_lock(foundobj, lktype | LK_RETRY); } } /* * Done. */ /* * If LOCKPARENT is not set, the parent directory isn't returned. */ if ((cnp->cn_flags & LOCKPARENT) == 0 && searchdir != NULL) { vrele(searchdir); searchdir = NULL; } ndp->ni_dvp = searchdir; ndp->ni_vp = foundobj; return 0; } /* * Do namei; wrapper layer that handles TRYEMULROOT. */ static int namei_tryemulroot(struct namei_state *state, int neverfollow, int inhibitmagic, int isnfsd) { int error; struct nameidata *ndp = state->ndp; struct componentname *cnp = state->cnp; const char *savepath = NULL; KASSERT(cnp == &ndp->ni_cnd); if (cnp->cn_flags & TRYEMULROOT) { savepath = pathbuf_stringcopy_get(ndp->ni_pathbuf); } emul_retry: state->attempt_retry = 0; error = namei_oneroot(state, neverfollow, inhibitmagic, isnfsd); if (error) { /* * Once namei has started up, the existence of ni_erootdir * tells us whether we're working from an emulation root. * The TRYEMULROOT flag isn't necessarily authoritative. */ if (ndp->ni_erootdir != NULL && state->attempt_retry) { /* Retry the whole thing using the normal root */ cnp->cn_flags &= ~TRYEMULROOT; state->attempt_retry = 0; /* kinda gross */ strcpy(ndp->ni_pathbuf->pb_path, savepath); pathbuf_stringcopy_put(ndp->ni_pathbuf, savepath); savepath = NULL; goto emul_retry; } } if (savepath != NULL) { pathbuf_stringcopy_put(ndp->ni_pathbuf, savepath); } return error; } /* * External interface. 
*/ int namei(struct nameidata *ndp) { struct namei_state state; int error; namei_init(&state, ndp); error = namei_tryemulroot(&state, 0/*!neverfollow*/, 0/*!inhibitmagic*/, 0/*isnfsd*/); namei_cleanup(&state); if (error) { /* make sure no stray refs leak out */ KASSERT(ndp->ni_dvp == NULL); KASSERT(ndp->ni_vp == NULL); } return error; } //////////////////////////////////////////////////////////// /* * External interface used by nfsd. This is basically different from * namei only in that it has the ability to pass in the "current * directory", and uses an extra flag "neverfollow" for which there's * no physical flag defined in namei.h. (There used to be a cut&paste * copy of about half of namei in nfsd to allow these minor * adjustments to exist.) * * XXX: the namei interface should be adjusted so nfsd can just use * ordinary namei(). */ int lookup_for_nfsd(struct nameidata *ndp, struct vnode *forcecwd, int neverfollow) { struct namei_state state; int error; KASSERT(ndp->ni_atdir == NULL); ndp->ni_atdir = forcecwd; namei_init(&state, ndp); error = namei_tryemulroot(&state, neverfollow, 1/*inhibitmagic*/, 1/*isnfsd*/); namei_cleanup(&state); if (error) { /* make sure no stray refs leak out */ KASSERT(ndp->ni_dvp == NULL); KASSERT(ndp->ni_vp == NULL); } return error; } /* * A second external interface used by nfsd. This turns out to be a * single lookup used by the WebNFS code (ha!) to get "index.html" or * equivalent when asked for a directory. It should eventually evolve * into some kind of namei_once() call; for the time being it's kind * of a mess. XXX. * * dholland 20110109: I don't think it works, and I don't think it * worked before I started hacking and slashing either, and I doubt * anyone will ever notice. */ /* * Internals. This calls lookup_once() after setting up the assorted * pieces of state the way they ought to be. */ static int do_lookup_for_nfsd_index(struct namei_state *state) { int error; struct componentname *cnp = state->cnp; struct nameidata *ndp = state->ndp; struct vnode *startdir; struct vnode *foundobj; bool startdir_locked; const char *cp; /* pointer into pathname argument */ KASSERT(cnp == &ndp->ni_cnd); startdir = state->ndp->ni_atdir; cnp->cn_nameptr = ndp->ni_pnbuf; state->docache = 1; state->rdonly = cnp->cn_flags & RDONLY; ndp->ni_dvp = NULL; error = VOP_PARSEPATH(startdir, cnp->cn_nameptr, &cnp->cn_namelen); if (error) { return error; } cp = cnp->cn_nameptr + cnp->cn_namelen; KASSERT(cnp->cn_namelen <= KERNEL_NAME_MAX); ndp->ni_pathlen -= cnp->cn_namelen; ndp->ni_next = cp; state->slashes = 0; cnp->cn_flags &= ~REQUIREDIR; cnp->cn_flags |= MAKEENTRY|ISLASTCN; if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.' && cnp->cn_nameptr[0] == '.') cnp->cn_flags |= ISDOTDOT; else cnp->cn_flags &= ~ISDOTDOT; /* * Because lookup_once can change the startdir, we need our * own reference to it to avoid consuming the caller's. */ vref(startdir); error = lookup_once(state, startdir, &startdir, &foundobj, &startdir_locked); KASSERT((cnp->cn_flags & LOCKPARENT) == 0); if (startdir_locked) { VOP_UNLOCK(startdir); startdir_locked = false; } /* * If the vnode we found is mounted on, then cross the mount and get * the root vnode in foundobj. If this encounters an error, it will * dispose of foundobj, but searchdir is untouched. 
*/ if (error == 0 && foundobj != NULL && foundobj->v_type == VDIR && foundobj->v_mountedhere != NULL && (cnp->cn_flags & NOCROSSMOUNT) == 0) { error = lookup_crossmount(state, &startdir, &foundobj, &startdir_locked); } /* Now toss startdir and see if we have an error. */ if (startdir != NULL) vrele(startdir); if (error) foundobj = NULL; else if (foundobj != NULL && (cnp->cn_flags & LOCKLEAF) != 0) vn_lock(foundobj, LK_EXCLUSIVE | LK_RETRY); ndp->ni_vp = foundobj; return (error); } /* * External interface. The partitioning between this function and the * above isn't very clear - the above function exists mostly so code * that uses "state->" can be shuffled around without having to change * it to "state.". */ int lookup_for_nfsd_index(struct nameidata *ndp, struct vnode *startdir) { struct namei_state state; int error; KASSERT(ndp->ni_atdir == NULL); ndp->ni_atdir = startdir; /* * Note: the name sent in here (is not|should not be) allowed * to contain a slash. */ if (strlen(ndp->ni_pathbuf->pb_path) > KERNEL_NAME_MAX) { return ENAMETOOLONG; } if (strchr(ndp->ni_pathbuf->pb_path, '/')) { return EINVAL; } ndp->ni_pathlen = strlen(ndp->ni_pathbuf->pb_path) + 1; ndp->ni_pnbuf = NULL; ndp->ni_cnd.cn_nameptr = NULL; namei_init(&state, ndp); error = do_lookup_for_nfsd_index(&state); namei_cleanup(&state); return error; } //////////////////////////////////////////////////////////// /* * Reacquire a path name component. * dvp is locked on entry and exit. * *vpp is locked on exit unless it's NULL. */ int relookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, int dummy) { int rdonly; /* lookup read-only flag bit */ int error = 0; #ifdef DEBUG size_t newlen; /* DEBUG: check name len */ const char *cp; /* DEBUG: check name ptr */ #endif /* DEBUG */ (void)dummy; /* * Setup: break out flag bits into variables. */ rdonly = cnp->cn_flags & RDONLY; /* * Search a new directory. * * The cn_hash value is for use by vfs_cache. * The last component of the filename is left accessible via * cnp->cn_nameptr for callers that need the name. Callers needing * the name set the SAVENAME flag. When done, they assume * responsibility for freeing the pathname buffer. */ #ifdef DEBUG #if 0 cp = NULL; newhash = namei_hash(cnp->cn_nameptr, &cp); if ((uint32_t)newhash != (uint32_t)cnp->cn_hash) panic("relookup: bad hash"); #endif error = VOP_PARSEPATH(dvp, cnp->cn_nameptr, &newlen); if (error) { panic("relookup: parsepath failed with error %d", error); } if (cnp->cn_namelen != newlen) panic("relookup: bad len"); cp = cnp->cn_nameptr + cnp->cn_namelen; while (*cp == '/') cp++; if (*cp != 0) panic("relookup: not last component"); #endif /* DEBUG */ /* * Check for degenerate name (e.g. / or "") * which is a way of talking about a directory, * e.g. like "/." or ".". */ if (cnp->cn_nameptr[0] == '\0') panic("relookup: null name"); if (cnp->cn_flags & ISDOTDOT) panic("relookup: lookup on dot-dot"); /* * We now have a segment name to search for, and a directory to search. */ *vpp = NULL; error = VOP_LOOKUP(dvp, vpp, cnp); if ((error) != 0) { KASSERTMSG((*vpp == NULL), "leaf `%s' should be empty but is %p", cnp->cn_nameptr, *vpp); if (error != EJUSTRETURN) goto bad; } /* * Check for symbolic link */ KASSERTMSG((*vpp == NULL || (*vpp)->v_type != VLNK || (cnp->cn_flags & FOLLOW) == 0), "relookup: symlink found"); /* * Check for read-only lookups. */ if (rdonly && cnp->cn_nameiop != LOOKUP) { error = EROFS; if (*vpp) { vrele(*vpp); } goto bad; } /* * Lock result. 
*/ if (*vpp && *vpp != dvp) { error = vn_lock(*vpp, LK_EXCLUSIVE); if (error != 0) { vrele(*vpp); goto bad; } } return (0); bad: *vpp = NULL; return (error); } /* * namei_simple - simple forms of namei. * * These are wrappers to allow the simple case callers of namei to be * left alone while everything else changes under them. */ /* Flags */ struct namei_simple_flags_type { int dummy; }; static const struct namei_simple_flags_type ns_nn, ns_nt, ns_fn, ns_ft; const namei_simple_flags_t NSM_NOFOLLOW_NOEMULROOT = &ns_nn; const namei_simple_flags_t NSM_NOFOLLOW_TRYEMULROOT = &ns_nt; const namei_simple_flags_t NSM_FOLLOW_NOEMULROOT = &ns_fn; const namei_simple_flags_t NSM_FOLLOW_TRYEMULROOT = &ns_ft; static int namei_simple_convert_flags(namei_simple_flags_t sflags) { if (sflags == NSM_NOFOLLOW_NOEMULROOT) return NOFOLLOW | 0; if (sflags == NSM_NOFOLLOW_TRYEMULROOT) return NOFOLLOW | TRYEMULROOT; if (sflags == NSM_FOLLOW_NOEMULROOT) return FOLLOW | 0; if (sflags == NSM_FOLLOW_TRYEMULROOT) return FOLLOW | TRYEMULROOT; panic("namei_simple_convert_flags: bogus sflags\n"); return 0; } int namei_simple_kernel(const char *path, namei_simple_flags_t sflags, struct vnode **vp_ret) { return nameiat_simple_kernel(NULL, path, sflags, vp_ret); } int nameiat_simple_kernel(struct vnode *dvp, const char *path, namei_simple_flags_t sflags, struct vnode **vp_ret) { struct nameidata nd; struct pathbuf *pb; int err; pb = pathbuf_create(path); if (pb == NULL) { return ENOMEM; } NDINIT(&nd, LOOKUP, namei_simple_convert_flags(sflags), pb); if (dvp != NULL) NDAT(&nd, dvp); err = namei(&nd); if (err != 0) { pathbuf_destroy(pb); return err; } *vp_ret = nd.ni_vp; pathbuf_destroy(pb); return 0; } int namei_simple_user(const char *path, namei_simple_flags_t sflags, struct vnode **vp_ret) { return nameiat_simple_user(NULL, path, sflags, vp_ret); } int nameiat_simple_user(struct vnode *dvp, const char *path, namei_simple_flags_t sflags, struct vnode **vp_ret) { struct pathbuf *pb; struct nameidata nd; int err; err = pathbuf_copyin(path, &pb); if (err) { return err; } NDINIT(&nd, LOOKUP, namei_simple_convert_flags(sflags), pb); if (dvp != NULL) NDAT(&nd, dvp); err = namei(&nd); if (err != 0) { pathbuf_destroy(pb); return err; } *vp_ret = nd.ni_vp; pathbuf_destroy(pb); return 0; }
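The namei_simple wrappers above are the usual entry point for kernel code that just needs a vnode for a fixed path. Below is a minimal illustrative sketch of the expected calling pattern; it is not part of vfs_lookup.c, and the helper name and path are hypothetical.

#include <sys/namei.h>
#include <sys/vnode.h>

/* Hypothetical helper: resolve a kernel-supplied path to a vnode. */
static int
example_lookup_node(const char *path, struct vnode **vpp)
{
	int error;

	/* Follow symlinks; do not consult an emulation root. */
	error = namei_simple_kernel(path, NSM_FOLLOW_NOEMULROOT, vpp);
	if (error != 0)
		return error;

	/*
	 * *vpp is referenced but not locked; the caller must vrele()
	 * it when done (and vn_lock() it first if it needs the lock).
	 */
	return 0;
}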
/* $NetBSD: pmap_private.h,v 1.5 2023/10/04 20:28:06 ad Exp $ */ /* * Copyright (c) 1997 Charles D. Cranor and Washington University. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* * Copyright (c) 2001 Wasabi Systems, Inc. * All rights reserved. * * Written by Frank van der Linden for Wasabi Systems, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3.
All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed for the NetBSD Project by * Wasabi Systems, Inc. * 4. The name of Wasabi Systems, Inc. may not be used to endorse * or promote products derived from this software without specific prior * written permission. * * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL WASABI SYSTEMS, INC * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #ifndef _X86_PMAP_PRIVATE_H_ #define _X86_PMAP_PRIVATE_H_ #ifndef _MACHINE_PMAP_PRIVATE_H_X86 #error Include machine/pmap_private.h, not x86/pmap_private.h. #endif #ifdef _KERNEL_OPT #include "opt_svs.h" #endif #include <sys/param.h> #include <sys/types.h> #include <sys/kcpuset.h> #include <sys/mutex.h> #include <sys/pool.h> #include <sys/queue.h> #include <sys/rwlock.h> #include <machine/cpufunc.h> #include <machine/pte.h> #include <machine/vmparam.h> #include <uvm/uvm_object.h> #include <uvm/uvm_pmap.h> struct pmap; #define SLAREA_USER 0 #define SLAREA_PTE 1 #define SLAREA_MAIN 2 #define SLAREA_PCPU 3 #define SLAREA_DMAP 4 #define SLAREA_HYPV 5 #define SLAREA_ASAN 6 #define SLAREA_MSAN 7 #define SLAREA_KERN 8 #define SLSPACE_NAREAS 9 struct slotspace { struct { size_t sslot; /* start slot */ size_t nslot; /* # of slots */ bool active; /* area is active */ } area[SLSPACE_NAREAS]; }; extern struct slotspace slotspace; #include <x86/gdt.h> struct pcpu_entry { uint8_t gdt[MAXGDTSIZ]; uint8_t ldt[MAX_USERLDT_SIZE]; uint8_t idt[PAGE_SIZE]; uint8_t tss[PAGE_SIZE]; uint8_t ist0[PAGE_SIZE]; uint8_t ist1[PAGE_SIZE]; uint8_t ist2[PAGE_SIZE]; uint8_t ist3[PAGE_SIZE]; uint8_t rsp0[2 * PAGE_SIZE]; } __packed; struct pcpu_area { #ifdef SVS uint8_t utls[PAGE_SIZE]; #endif uint8_t ldt[PAGE_SIZE]; struct pcpu_entry ent[MAXCPUS]; } __packed; extern struct pcpu_area *pcpuarea; #define PMAP_PCID_KERN 0 #define PMAP_PCID_USER 1 /* * pmap data structures: see pmap.c for details of locking. */ /* * we maintain a list of all non-kernel pmaps */ LIST_HEAD(pmap_head, pmap); /* struct pmap_head: head of a pmap list */ /* * linked list of all non-kernel pmaps */ extern struct pmap_head pmaps; extern kmutex_t pmaps_lock; /* protects pmaps */ /* * pool_cache(9) that pmaps are allocated from */ extern struct pool_cache pmap_cache; /* * the pmap structure * * note that the pm_obj contains the lock pointer, the reference count, * page list, and number of PTPs within the pmap. * * pm_lock is the same as the lock for vm object 0. 
Changes to * the other objects may only be made if that lock has been taken * (the other object locks are only used when uvm_pagealloc is called) */ struct pv_page; struct pmap { struct uvm_object pm_obj[PTP_LEVELS-1];/* objects for lvl >= 1) */ LIST_ENTRY(pmap) pm_list; /* list of all pmaps */ pd_entry_t *pm_pdir; /* VA of PD */ paddr_t pm_pdirpa[PDP_SIZE]; /* PA of PDs (read-only after create) */ struct vm_page *pm_ptphint[PTP_LEVELS-1]; /* pointer to a PTP in our pmap */ struct pmap_statistics pm_stats; /* pmap stats */ struct pv_entry *pm_pve; /* spare pv_entry */ LIST_HEAD(, pv_page) pm_pvp_part; LIST_HEAD(, pv_page) pm_pvp_empty; LIST_HEAD(, pv_page) pm_pvp_full; #if !defined(__x86_64__) vaddr_t pm_hiexec; /* highest executable mapping */ #endif /* !defined(__x86_64__) */ union descriptor *pm_ldt; /* user-set LDT */ size_t pm_ldt_len; /* XXX unused, remove */ int pm_ldt_sel; /* LDT selector */ kcpuset_t *pm_cpus; /* mask of CPUs using pmap */ kcpuset_t *pm_kernel_cpus; /* mask of CPUs using kernel part of pmap */ kcpuset_t *pm_xen_ptp_cpus; /* mask of CPUs which have this pmap's ptp mapped */ long pm_pctr; /* for assertions */ LIST_HEAD(,vm_page) pm_gc_ptp; /* PTPs queued for free */ /* Used by NVMM and Xen */ int (*pm_enter)(struct pmap *, vaddr_t, paddr_t, vm_prot_t, u_int); bool (*pm_extract)(struct pmap *, vaddr_t, paddr_t *); void (*pm_remove)(struct pmap *, vaddr_t, vaddr_t); int (*pm_sync_pv)(struct vm_page *, vaddr_t, paddr_t, int, uint8_t *, pt_entry_t *); void (*pm_pp_remove_ent)(struct pmap *, struct vm_page *, pt_entry_t, vaddr_t); void (*pm_write_protect)(struct pmap *, vaddr_t, vaddr_t, vm_prot_t); void (*pm_unwire)(struct pmap *, vaddr_t); void (*pm_tlb_flush)(struct pmap *); void *pm_data; kmutex_t pm_lock /* locks for pm_objs */ __aligned(64); /* give lock own cache line */ krwlock_t pm_dummy_lock; /* ugly hack for abusing uvm_object */ }; /* macro to access pm_pdirpa slots */ #ifdef PAE #define pmap_pdirpa(pmap, index) \ ((pmap)->pm_pdirpa[l2tol3(index)] + l2tol2(index) * sizeof(pd_entry_t)) #else #define pmap_pdirpa(pmap, index) \ ((pmap)->pm_pdirpa[0] + (index) * sizeof(pd_entry_t)) #endif /* * global kernel variables */ /* * PDPpaddr is the physical address of the kernel's PDP. * - i386 non-PAE and amd64: PDPpaddr corresponds directly to the %cr3 * value associated to the kernel process, proc0. * - i386 PAE: it still represents the PA of the kernel's PDP (L2). Due to * the L3 PD, it cannot be considered as the equivalent of a %cr3 any more. * - Xen: it corresponds to the PFN of the kernel's PDP. */ extern u_long PDPpaddr; extern pd_entry_t pmap_pg_g; /* do we support PTE_G? */ extern pd_entry_t pmap_pg_nx; /* do we support PTE_NX? */ extern int pmap_largepages; extern long nkptp[PTP_LEVELS]; #define pmap_valid_entry(E) ((E) & PTE_P) /* is PDE or PTE valid? 
*/ void pmap_map_ptes(struct pmap *, struct pmap **, pd_entry_t **, pd_entry_t * const **); void pmap_unmap_ptes(struct pmap *, struct pmap *); bool pmap_pdes_valid(vaddr_t, pd_entry_t * const *, pd_entry_t *, int *lastlvl); bool pmap_is_curpmap(struct pmap *); void pmap_ept_transform(struct pmap *); #ifndef __HAVE_DIRECT_MAP void pmap_vpage_cpu_init(struct cpu_info *); #endif vaddr_t slotspace_rand(int, size_t, size_t, size_t, vaddr_t); vaddr_t reserve_dumppages(vaddr_t); /* XXX: not a pmap fn */ typedef enum tlbwhy { TLBSHOOT_REMOVE_ALL, TLBSHOOT_KENTER, TLBSHOOT_KREMOVE, TLBSHOOT_FREE_PTP, TLBSHOOT_REMOVE_PTE, TLBSHOOT_SYNC_PV, TLBSHOOT_WRITE_PROTECT, TLBSHOOT_ENTER, TLBSHOOT_NVMM, TLBSHOOT_BUS_DMA, TLBSHOOT_BUS_SPACE, TLBSHOOT__MAX, } tlbwhy_t; void pmap_tlb_init(void); void pmap_tlb_cpu_init(struct cpu_info *); void pmap_tlb_shootdown(pmap_t, vaddr_t, pt_entry_t, tlbwhy_t); void pmap_tlb_shootnow(void); void pmap_tlb_intr(void); /* * inline functions */ /* * pmap_update_pg: flush one page from the TLB (or flush the whole thing * if hardware doesn't support one-page flushing) */ __inline static void __unused pmap_update_pg(vaddr_t va) { invlpg(va); } /* * various address inlines * * vtopte: return a pointer to the PTE mapping a VA, works only for * user and PT addresses * * kvtopte: return a pointer to the PTE mapping a kernel VA */ #include <lib/libkern/libkern.h> static __inline pt_entry_t * __unused vtopte(vaddr_t va) { KASSERT(va < VM_MIN_KERNEL_ADDRESS); return (PTE_BASE + pl1_i(va)); } static __inline pt_entry_t * __unused kvtopte(vaddr_t va) { pd_entry_t *pde; KASSERT(va >= VM_MIN_KERNEL_ADDRESS); pde = L2_BASE + pl2_i(va); if (*pde & PTE_PS) return ((pt_entry_t *)pde); return (PTE_BASE + pl1_i(va)); } #ifdef XENPV #include <sys/bitops.h> #define XPTE_MASK L1_FRAME /* Selects the index of a PTE in (A)PTE_BASE */ #define XPTE_SHIFT (L1_SHIFT - ilog2(sizeof(pt_entry_t))) /* PTE access inline functions */ /* * Get the machine address of the pointed pte * We use hardware MMU to get value so works only for levels 1-3 */ static __inline paddr_t xpmap_ptetomach(pt_entry_t *pte) { pt_entry_t *up_pte; vaddr_t va = (vaddr_t) pte; va = ((va & XPTE_MASK) >> XPTE_SHIFT) | (vaddr_t) PTE_BASE; up_pte = (pt_entry_t *) va; return (paddr_t) (((*up_pte) & PTE_FRAME) + (((vaddr_t) pte) & (~PTE_FRAME & ~VA_SIGN_MASK))); } /* Xen helpers to change bits of a pte */ #define XPMAP_UPDATE_DIRECT 1 /* Update direct map entry flags too */ paddr_t vtomach(vaddr_t); #define vtomfn(va) (vtomach(va) >> PAGE_SHIFT) #endif /* XENPV */ #ifdef __HAVE_PCPU_AREA extern struct pcpu_area *pcpuarea; #define PDIR_SLOT_PCPU 510 #define PMAP_PCPU_BASE (VA_SIGN_NEG((PDIR_SLOT_PCPU * NBPD_L4))) #endif void svs_quad_copy(void *, void *, long); #ifdef _KERNEL_OPT #include "opt_efi.h" #endif #ifdef EFI_RUNTIME void * pmap_activate_sync(struct pmap *); void pmap_deactivate_sync(struct pmap *, void *); bool pmap_is_user(struct pmap *); #else static inline bool pmap_is_user(struct pmap *pmap) { KASSERT(pmap != pmap_kernel()); return true; } #endif #endif /* _X86_PMAP_PRIVATE_H_ */
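As an illustrative sketch of the inlines declared above (not part of pmap_private.h; the helper name is hypothetical), machine-dependent code could combine kvtopte() and pmap_valid_entry() to test whether a kernel virtual address currently has a valid mapping:

/* Hypothetical helper: does this kernel VA have a valid PTE/PDE? */
static bool
example_kva_mapped_p(vaddr_t va)
{
	pt_entry_t *pte;

	KASSERT(va >= VM_MIN_KERNEL_ADDRESS);

	/* kvtopte() returns the PDE itself for large-page mappings. */
	pte = kvtopte(va);
	return pmap_valid_entry(*pte);
}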
/* $NetBSD: kern_rate.c,v 1.2 2012/12/12 11:10:56 pooka Exp $ */ /*- * Copyright (c) 2000, 2004, 2005, 2007, 2008 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Christopher G. Demetriou. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: kern_rate.c,v 1.2 2012/12/12 11:10:56 pooka Exp $"); #include <sys/param.h> #include <sys/time.h> /* * ratecheck(): simple time-based rate-limit checking. see ratecheck(9) * for usage and rationale. */ int ratecheck(struct timeval *lasttime, const struct timeval *mininterval) { struct timeval tv, delta; int rv = 0; getmicrouptime(&tv); timersub(&tv, lasttime, &delta); /* * check for 0,0 is so that the message will be seen at least once, * even if interval is huge. */ if (timercmp(&delta, mininterval, >=) || (lasttime->tv_sec == 0 && lasttime->tv_usec == 0)) { *lasttime = tv; rv = 1; } return (rv); } /* * ppsratecheck(): packets (or events) per second limitation. */ int ppsratecheck(struct timeval *lasttime, int *curpps, int maxpps) { struct timeval tv, delta; int rv; getmicrouptime(&tv); timersub(&tv, lasttime, &delta); /* * check for 0,0 is so that the message will be seen at least once. * if more than one second has passed since the last update of * lasttime, reset the counter. * * we do increment *curpps even in *curpps < maxpps case, as some may * try to use *curpps for stat purposes as well. */ if ((lasttime->tv_sec == 0 && lasttime->tv_usec == 0) || delta.tv_sec >= 1) { *lasttime = tv; *curpps = 0; } if (maxpps < 0) rv = 1; else if (*curpps < maxpps) rv = 1; else rv = 0; #if 1 /*DIAGNOSTIC?*/ /* be careful about wrap-around */ if (__predict_true(*curpps != INT_MAX)) *curpps = *curpps + 1; #else /* * assume that there's not too many calls to this function. * not sure if the assumption holds, as it depends on *caller's* * behavior, not the behavior of this function.
* IMHO it is wrong to make assumptions about the caller's behavior, * so the above #if is #if 1, not #ifdef DIAGNOSTIC. */ *curpps = *curpps + 1; #endif return (rv); }
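For reference, a minimal sketch of the ratecheck(9) calling pattern the comments above allude to (illustrative only; the variable and function names are hypothetical):

/* At most one diagnostic message per 10 seconds. */
static struct timeval example_lasterr;			/* zero-initialized */
static const struct timeval example_interval = { 10, 0 };

static void
example_report_error(int code)
{
	/* ratecheck() returns nonzero when enough time has elapsed. */
	if (ratecheck(&example_lasterr, &example_interval))
		printf("example0: device error %d\n", code);
}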
/* $NetBSD: subr_lwp_specificdata.c,v 1.4 2019/05/17 03:34:26 ozaki-r Exp $ */ /*- * Copyright (c) 2006 The NetBSD Foundation, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #define _LWP_API_PRIVATE #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: subr_lwp_specificdata.c,v 1.4 2019/05/17 03:34:26 ozaki-r Exp $"); #include <sys/param.h> #include <sys/lwp.h> #include <sys/specificdata.h> static specificdata_domain_t lwp_specificdata_domain; void lwpinit_specificdata(void) { lwp_specificdata_domain = specificdata_domain_create(); KASSERT(lwp_specificdata_domain != NULL); } /* * lwp_specific_key_create -- * Create a key for subsystem lwp-specific data. */ int lwp_specific_key_create(specificdata_key_t *keyp, specificdata_dtor_t dtor) { return (specificdata_key_create(lwp_specificdata_domain, keyp, dtor)); } /* * lwp_specific_key_delete -- * Delete a key for subsystem lwp-specific data. */ void lwp_specific_key_delete(specificdata_key_t key) { specificdata_key_delete(lwp_specificdata_domain, key); } /* * lwp_initspecific -- * Initialize an LWP's specificdata container. */ void lwp_initspecific(struct lwp *l) { int error __diagused; error = specificdata_init(lwp_specificdata_domain, &l->l_specdataref); KASSERT(error == 0); } /* * lwp_finispecific -- * Finalize an LWP's specificdata container. */ void lwp_finispecific(struct lwp *l) { specificdata_fini(lwp_specificdata_domain, &l->l_specdataref); } /* * lwp_getspecific -- * Return lwp-specific data corresponding to the specified key. * * Note: LWP specific data is NOT INTERLOCKED. An LWP should access * only its OWN SPECIFIC DATA. If it is necessary to access another * LWP's specific data, care must be taken to ensure that doing so * would not cause internal data structure inconsistency (i.e.
caller * can guarantee that the target LWP is not inside an lwp_getspecific() * or lwp_setspecific() call). */ void * lwp_getspecific(specificdata_key_t key) { return (specificdata_getspecific_unlocked(lwp_specificdata_domain, &curlwp->l_specdataref, key)); } void * _lwp_getspecific_by_lwp(struct lwp *l, specificdata_key_t key) { return (specificdata_getspecific_unlocked(lwp_specificdata_domain, &l->l_specdataref, key)); } /* * lwp_setspecific -- * Set lwp-specific data corresponding to the specified key. */ void lwp_setspecific(specificdata_key_t key, void *data) { specificdata_setspecific(lwp_specificdata_domain, &curlwp->l_specdataref, key, data); } void lwp_setspecific_by_lwp(struct lwp *l, specificdata_key_t key, void *data) { specificdata_setspecific(lwp_specificdata_domain, &l->l_specdataref, key, data); }
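A minimal sketch of how a subsystem is expected to use this interface (illustrative only; every name below is hypothetical): create the key once at initialization, then have each LWP attach and read back its own data.

#include <sys/kmem.h>

struct example_state {
	int es_count;
};

static specificdata_key_t example_key;

/* Destructor: called for an LWP's datum when that LWP is destroyed. */
static void
example_dtor(void *data)
{
	kmem_free(data, sizeof(struct example_state));
}

void
example_init(void)
{
	int error __diagused;

	error = lwp_specific_key_create(&example_key, example_dtor);
	KASSERT(error == 0);
}

/* Per the note above, an LWP should only touch its own specific data. */
void
example_attach_curlwp(void)
{
	struct example_state *st;

	st = kmem_zalloc(sizeof(*st), KM_SLEEP);
	lwp_setspecific(example_key, st);
}

int
example_bump_curlwp(void)
{
	struct example_state *st = lwp_getspecific(example_key);

	return st != NULL ? ++st->es_count : -1;
}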
/* $NetBSD: uvm_object.c,v 1.25 2020/08/15 07:24:09 chs Exp $ */ /* * Copyright (c) 2006, 2010, 2019 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Mindaugas Rasiukevicius. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * uvm_object.c: operate with memory objects * * TODO: * 1. Support PG_RELEASED-using objects */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: uvm_object.c,v 1.25 2020/08/15 07:24:09 chs Exp $"); #ifdef _KERNEL_OPT #include "opt_ddb.h" #endif #include <sys/param.h> #include <sys/rwlock.h> #include <sys/queue.h> #include <uvm/uvm.h> #include <uvm/uvm_ddb.h> #include <uvm/uvm_page_array.h> /* Page count to fetch per single step.
*/ #define FETCH_PAGECOUNT 16 /* * uvm_obj_init: initialize UVM memory object. */ void uvm_obj_init(struct uvm_object *uo, const struct uvm_pagerops *ops, bool alock, u_int refs) { #if 0 /* notyet */ KASSERT(ops); #endif if (alock) { /* Allocate and assign a lock. */ uo->vmobjlock = rw_obj_alloc(); } else { /* The lock will need to be set via uvm_obj_setlock(). */ uo->vmobjlock = NULL; } uo->pgops = ops; LIST_INIT(&uo->uo_ubc); uo->uo_npages = 0; uo->uo_refs = refs; radix_tree_init_tree(&uo->uo_pages); } /* * uvm_obj_destroy: destroy UVM memory object. */ void uvm_obj_destroy(struct uvm_object *uo, bool dlock) { KASSERT(radix_tree_empty_tree_p(&uo->uo_pages)); /* Purge any UBC entries associated with this object. */ ubc_purge(uo); /* Destroy the lock, if requested. */ if (dlock) { rw_obj_free(uo->vmobjlock); } radix_tree_fini_tree(&uo->uo_pages); } /* * uvm_obj_setlock: assign a vmobjlock to the UVM object. * * => Caller is responsible to ensure that UVM objects is not use. * => Only dynamic lock may be previously set. We drop the reference then. */ void uvm_obj_setlock(struct uvm_object *uo, krwlock_t *lockptr) { krwlock_t *olockptr = uo->vmobjlock; if (olockptr) { /* Drop the reference on the old lock. */ rw_obj_free(olockptr); } if (lockptr == NULL) { /* If new lock is not passed - allocate default one. */ lockptr = rw_obj_alloc(); } uo->vmobjlock = lockptr; } /* * uvm_obj_wirepages: wire the pages of entire UVM object. * * => NOTE: this function should only be used for types of objects * where PG_RELEASED flag is never set (aobj objects) * => caller must pass page-aligned start and end values */ int uvm_obj_wirepages(struct uvm_object *uobj, off_t start, off_t end, struct pglist *list) { int i, npages, error; struct vm_page *pgs[FETCH_PAGECOUNT], *pg = NULL; off_t offset = start, left; left = (end - start) >> PAGE_SHIFT; rw_enter(uobj->vmobjlock, RW_WRITER); while (left) { npages = MIN(FETCH_PAGECOUNT, left); /* Get the pages */ memset(pgs, 0, sizeof(pgs)); error = (*uobj->pgops->pgo_get)(uobj, offset, pgs, &npages, 0, VM_PROT_READ | VM_PROT_WRITE, UVM_ADV_SEQUENTIAL, PGO_SYNCIO); if (error) goto error; rw_enter(uobj->vmobjlock, RW_WRITER); for (i = 0; i < npages; i++) { KASSERT(pgs[i] != NULL); KASSERT(!(pgs[i]->flags & PG_RELEASED)); /* * Loan break */ if (pgs[i]->loan_count) { while (pgs[i]->loan_count) { pg = uvm_loanbreak(pgs[i]); if (!pg) { rw_exit(uobj->vmobjlock); uvm_wait("uobjwirepg"); rw_enter(uobj->vmobjlock, RW_WRITER); continue; } } pgs[i] = pg; } if (pgs[i]->flags & PG_AOBJ) { uvm_pagemarkdirty(pgs[i], UVM_PAGE_STATUS_DIRTY); uao_dropswap(uobj, i); } } /* Wire the pages */ for (i = 0; i < npages; i++) { uvm_pagelock(pgs[i]); uvm_pagewire(pgs[i]); uvm_pageunlock(pgs[i]); if (list != NULL) TAILQ_INSERT_TAIL(list, pgs[i], pageq.queue); } /* Unbusy the pages */ uvm_page_unbusy(pgs, npages); left -= npages; offset += npages << PAGE_SHIFT; } rw_exit(uobj->vmobjlock); return 0; error: /* Unwire the pages which has been wired */ uvm_obj_unwirepages(uobj, start, offset); return error; } /* * uvm_obj_unwirepages: unwire the pages of entire UVM object. 
* * => NOTE: this function should only be used for types of objects * where PG_RELEASED flag is never set * => caller must pass page-aligned start and end values */ void uvm_obj_unwirepages(struct uvm_object *uobj, off_t start, off_t end) { struct vm_page *pg; off_t offset; rw_enter(uobj->vmobjlock, RW_WRITER); for (offset = start; offset < end; offset += PAGE_SIZE) { pg = uvm_pagelookup(uobj, offset); KASSERT(pg != NULL); KASSERT(!(pg->flags & PG_RELEASED)); uvm_pagelock(pg); uvm_pageunwire(pg); uvm_pageunlock(pg); } rw_exit(uobj->vmobjlock); } static inline bool uvm_obj_notag_p(struct uvm_object *uobj, int tag) { KASSERT(rw_lock_held(uobj->vmobjlock)); return radix_tree_empty_tagged_tree_p(&uobj->uo_pages, tag); } bool uvm_obj_clean_p(struct uvm_object *uobj) { return uvm_obj_notag_p(uobj, UVM_PAGE_DIRTY_TAG); } bool uvm_obj_nowriteback_p(struct uvm_object *uobj) { return uvm_obj_notag_p(uobj, UVM_PAGE_WRITEBACK_TAG); } static inline bool uvm_obj_page_tag_p(struct vm_page *pg, int tag) { struct uvm_object *uobj = pg->uobject; uint64_t pgidx = pg->offset >> PAGE_SHIFT; KASSERT(uobj != NULL); KASSERT(rw_lock_held(uobj->vmobjlock)); return radix_tree_get_tag(&uobj->uo_pages, pgidx, tag) != 0; } static inline void uvm_obj_page_set_tag(struct vm_page *pg, int tag) { struct uvm_object *uobj = pg->uobject; uint64_t pgidx = pg->offset >> PAGE_SHIFT; KASSERT(uobj != NULL); KASSERT(rw_write_held(uobj->vmobjlock)); radix_tree_set_tag(&uobj->uo_pages, pgidx, tag); } static inline void uvm_obj_page_clear_tag(struct vm_page *pg, int tag) { struct uvm_object *uobj = pg->uobject; uint64_t pgidx = pg->offset >> PAGE_SHIFT; KASSERT(uobj != NULL); KASSERT(rw_write_held(uobj->vmobjlock)); radix_tree_clear_tag(&uobj->uo_pages, pgidx, tag); } bool uvm_obj_page_dirty_p(struct vm_page *pg) { return uvm_obj_page_tag_p(pg, UVM_PAGE_DIRTY_TAG); } void uvm_obj_page_set_dirty(struct vm_page *pg) { uvm_obj_page_set_tag(pg, UVM_PAGE_DIRTY_TAG); } void uvm_obj_page_clear_dirty(struct vm_page *pg) { uvm_obj_page_clear_tag(pg, UVM_PAGE_DIRTY_TAG); } bool uvm_obj_page_writeback_p(struct vm_page *pg) { return uvm_obj_page_tag_p(pg, UVM_PAGE_WRITEBACK_TAG); } void uvm_obj_page_set_writeback(struct vm_page *pg) { uvm_obj_page_set_tag(pg, UVM_PAGE_WRITEBACK_TAG); } void uvm_obj_page_clear_writeback(struct vm_page *pg) { uvm_obj_page_clear_tag(pg, UVM_PAGE_WRITEBACK_TAG); } #if defined(DDB) || defined(DEBUGPRINT) /* * uvm_object_printit: actually prints the object */ void uvm_object_printit(struct uvm_object *uobj, bool full, void (*pr)(const char *, ...)) { struct uvm_page_array a; struct vm_page *pg; int cnt = 0; voff_t off; (*pr)("OBJECT %p: locked=%d, pgops=%p, npages=%d, ", uobj, rw_write_held(uobj->vmobjlock), uobj->pgops, uobj->uo_npages); if (UVM_OBJ_IS_KERN_OBJECT(uobj)) (*pr)("refs=<SYSTEM>\n"); else (*pr)("refs=%d\n", uobj->uo_refs); if (!full) { return; } (*pr)(" PAGES <pg,offset>:\n "); uvm_page_array_init(&a, uobj, 0); off = 0; while ((pg = uvm_page_array_fill_and_peek(&a, off, 0)) != NULL) { cnt++; (*pr)("<%p,0x%llx> ", pg, (long long)pg->offset); if ((cnt % 3) == 0) { (*pr)("\n "); } off = pg->offset + PAGE_SIZE; uvm_page_array_advance(&a); } if ((cnt % 3) != 0) { (*pr)("\n"); } uvm_page_array_fini(&a); } #endif /* DDB || DEBUGPRINT */
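To show where these routines fit, here is an illustrative sketch (not part of uvm_object.c; the structure and names are hypothetical) of a pager embedding a struct uvm_object and setting it up and tearing it down with uvm_obj_init() and uvm_obj_destroy():

/* Hypothetical pager-private object embedding a struct uvm_object. */
struct example_obj {
	struct uvm_object eo_uobj;	/* UVM object header */
	/* ... pager-private fields would follow ... */
};

static void
example_obj_setup(struct example_obj *eo, const struct uvm_pagerops *ops)
{
	/*
	 * true: allocate a dynamic vmobjlock for this object;
	 * 1: the object starts with a single reference.
	 */
	uvm_obj_init(&eo->eo_uobj, ops, true, 1);
}

static void
example_obj_teardown(struct example_obj *eo)
{
	/* true: also free the lock allocated in example_obj_setup(). */
	uvm_obj_destroy(&eo->eo_uobj, true);
}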
/* $NetBSD: nd6.c,v 1.282 2024/04/11 07:34:37 knakahara Exp $ */ /* $KAME: nd6.c,v 1.279 2002/06/08 11:16:51 itojun Exp $ */ /* * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE.
*/ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: nd6.c,v 1.282 2024/04/11 07:34:37 knakahara Exp $"); #ifdef _KERNEL_OPT #include "opt_compat_netbsd.h" #include "opt_net_mpsafe.h" #endif #include "bridge.h" #include "carp.h" #include <sys/param.h> #include <sys/systm.h> #include <sys/callout.h> #include <sys/kmem.h> #include <sys/mbuf.h> #include <sys/socket.h> #include <sys/socketvar.h> #include <sys/sockio.h> #include <sys/time.h> #include <sys/kernel.h> #include <sys/errno.h> #include <sys/ioctl.h> #include <sys/syslog.h> #include <sys/queue.h> #include <sys/cprng.h> #include <sys/workqueue.h> #include <sys/compat_stub.h> #include <net/if.h> #include <net/if_dl.h> #include <net/if_llatbl.h> #include <net/if_types.h> #include <net/nd.h> #include <net/route.h> #include <net/if_ether.h> #include <net/if_arc.h> #include <netinet/in.h> #include <netinet6/in6_var.h> #include <netinet/ip6.h> #include <netinet6/ip6_var.h> #include <netinet6/scope6_var.h> #include <netinet6/nd6.h> #include <netinet6/in6_ifattach.h> #include <netinet/icmp6.h> #include <netinet6/icmp6_private.h> #include <compat/netinet6/in6_var.h> #include <compat/netinet6/nd6.h> #define ND6_SLOWTIMER_INTERVAL (60 * 60) /* 1 hour */ #define ND6_RECALC_REACHTM_INTERVAL (60 * 120) /* 2 hours */ /* timer values */ int nd6_prune = 1; /* walk list every 1 seconds */ int nd6_useloopback = 1; /* use loopback interface for local traffic */ /* preventing too many loops in ND option parsing */ int nd6_maxndopt = 10; /* max # of ND options allowed */ #ifdef ND6_DEBUG int nd6_debug = 1; #else int nd6_debug = 0; #endif krwlock_t nd6_lock __cacheline_aligned; int nd6_recalc_reachtm_interval = ND6_RECALC_REACHTM_INTERVAL; static void nd6_slowtimo(void *); static void nd6_free(struct llentry *, int); static bool nd6_nud_enabled(struct ifnet *); static unsigned int nd6_llinfo_reachable(struct ifnet *); static unsigned int nd6_llinfo_retrans(struct ifnet *); static union l3addr *nd6_llinfo_holdsrc(struct llentry *, union l3addr *); static void nd6_llinfo_output(struct ifnet *, const union l3addr *, const union l3addr *, const uint8_t *, const union l3addr *); static void nd6_llinfo_missed(struct ifnet *, const union l3addr *, int16_t, struct mbuf *); static void nd6_timer(void *); static void nd6_timer_work(struct work *, void *); static struct nd_opt_hdr *nd6_option(union nd_opts *); static callout_t nd6_slowtimo_ch; static callout_t nd6_timer_ch; static struct workqueue *nd6_timer_wq; static struct work nd6_timer_wk; struct nd_domain nd6_nd_domain = { .nd_family = AF_INET6, .nd_delay = 5, /* delay first probe time 5 second */ .nd_mmaxtries = 3, /* maximum unicast query */ .nd_umaxtries = 3, /* maximum multicast query */ .nd_retransmultiple = BACKOFF_MULTIPLE, .nd_maxretrans = MAX_RETRANS_TIMER, .nd_maxnudhint = 0, /* max # of subsequent upper layer hints */ .nd_maxqueuelen = 1, /* max # of packets in unresolved ND entries */ .nd_nud_enabled = nd6_nud_enabled, .nd_reachable = nd6_llinfo_reachable, .nd_retrans = nd6_llinfo_retrans, .nd_holdsrc = nd6_llinfo_holdsrc, .nd_output = nd6_llinfo_output, .nd_missed = nd6_llinfo_missed, .nd_free = nd6_free, }; MALLOC_DEFINE(M_IP6NDP, "NDP", "IPv6 Neighbour Discovery"); void nd6_init(void) { int error; nd_attach_domain(&nd6_nd_domain); nd6_nbr_init(); rw_init(&nd6_lock); callout_init(&nd6_slowtimo_ch, CALLOUT_MPSAFE); callout_init(&nd6_timer_ch, CALLOUT_MPSAFE); error = workqueue_create(&nd6_timer_wq, "nd6_timer", nd6_timer_work, NULL, PRI_SOFTNET, IPL_SOFTNET, WQ_MPSAFE); if (error) panic("%s: 
workqueue_create failed (%d)\n", __func__, error); /* start timer */ callout_reset(&nd6_slowtimo_ch, ND6_SLOWTIMER_INTERVAL * hz, nd6_slowtimo, NULL); callout_reset(&nd6_timer_ch, hz, nd6_timer, NULL); } struct nd_kifinfo * nd6_ifattach(struct ifnet *ifp) { struct nd_kifinfo *nd; nd = kmem_zalloc(sizeof(*nd), KM_SLEEP); nd->chlim = IPV6_DEFHLIM; nd->basereachable = REACHABLE_TIME; nd->reachable = ND_COMPUTE_RTIME(nd->basereachable); nd->retrans = RETRANS_TIMER; nd->flags = ND6_IFF_PERFORMNUD; /* A loopback interface always has ND6_IFF_AUTO_LINKLOCAL. * A bridge interface should not have ND6_IFF_AUTO_LINKLOCAL * because one of its members should. */ if ((ip6_auto_linklocal && ifp->if_type != IFT_BRIDGE) || (ifp->if_flags & IFF_LOOPBACK)) nd->flags |= ND6_IFF_AUTO_LINKLOCAL; return nd; } void nd6_ifdetach(struct ifnet *ifp, struct in6_ifextra *ext) { /* Ensure all IPv6 addresses are purged before calling nd6_purge */ if_purgeaddrs(ifp, AF_INET6, in6_purgeaddr); nd6_purge(ifp, ext); kmem_free(ext->nd_ifinfo, sizeof(struct nd_kifinfo)); } void nd6_option_init(void *opt, int icmp6len, union nd_opts *ndopts) { memset(ndopts, 0, sizeof(*ndopts)); ndopts->nd_opts_search = (struct nd_opt_hdr *)opt; ndopts->nd_opts_last = (struct nd_opt_hdr *)(((u_char *)opt) + icmp6len); if (icmp6len == 0) { ndopts->nd_opts_done = 1; ndopts->nd_opts_search = NULL; } } /* * Take one ND option. */ static struct nd_opt_hdr * nd6_option(union nd_opts *ndopts) { struct nd_opt_hdr *nd_opt; int olen; KASSERT(ndopts != NULL); KASSERT(ndopts->nd_opts_last != NULL); if (ndopts->nd_opts_search == NULL) return NULL; if (ndopts->nd_opts_done) return NULL; nd_opt = ndopts->nd_opts_search; /* make sure nd_opt_len is inside the buffer */ if ((void *)&nd_opt->nd_opt_len >= (void *)ndopts->nd_opts_last) { memset(ndopts, 0, sizeof(*ndopts)); return NULL; } olen = nd_opt->nd_opt_len << 3; if (olen == 0) { /* * Message validation requires that all included * options have a length that is greater than zero. */ memset(ndopts, 0, sizeof(*ndopts)); return NULL; } ndopts->nd_opts_search = (struct nd_opt_hdr *)((char *)nd_opt + olen); if (ndopts->nd_opts_search > ndopts->nd_opts_last) { /* option overruns the end of buffer, invalid */ memset(ndopts, 0, sizeof(*ndopts)); return NULL; } else if (ndopts->nd_opts_search == ndopts->nd_opts_last) { /* reached the end of options chain */ ndopts->nd_opts_done = 1; ndopts->nd_opts_search = NULL; } return nd_opt; } /* * Parse multiple ND options. * This function is much easier to use, for ND routines that do not need * multiple options of the same type. */ int nd6_options(union nd_opts *ndopts) { struct nd_opt_hdr *nd_opt; int i = 0; KASSERT(ndopts != NULL); KASSERT(ndopts->nd_opts_last != NULL); if (ndopts->nd_opts_search == NULL) return 0; while (1) { nd_opt = nd6_option(ndopts); if (nd_opt == NULL && ndopts->nd_opts_last == NULL) { /* * Message validation requires that all included * options have a length that is greater than zero. */ ICMP6_STATINC(ICMP6_STAT_ND_BADOPT); memset(ndopts, 0, sizeof(*ndopts)); return -1; } if (nd_opt == NULL) goto skip1; switch (nd_opt->nd_opt_type) { case ND_OPT_SOURCE_LINKADDR: case ND_OPT_TARGET_LINKADDR: case ND_OPT_MTU: case ND_OPT_REDIRECTED_HEADER: case ND_OPT_NONCE: if (ndopts->nd_opt_array[nd_opt->nd_opt_type]) { nd6log(LOG_INFO, "duplicated ND6 option found (type=%d)\n", nd_opt->nd_opt_type); /* XXX bark? 
*/ } else { ndopts->nd_opt_array[nd_opt->nd_opt_type] = nd_opt; } break; case ND_OPT_PREFIX_INFORMATION: if (ndopts->nd_opt_array[nd_opt->nd_opt_type] == 0) { ndopts->nd_opt_array[nd_opt->nd_opt_type] = nd_opt; } ndopts->nd_opts_pi_end = (struct nd_opt_prefix_info *)nd_opt; break; default: /* * Unknown options must be silently ignored, * to accommodate future extension to the protocol. */ nd6log(LOG_DEBUG, "nd6_options: unsupported option %d - " "option ignored\n", nd_opt->nd_opt_type); } skip1: i++; if (i > nd6_maxndopt) { ICMP6_STATINC(ICMP6_STAT_ND_TOOMANYOPT); nd6log(LOG_INFO, "too many loop in nd opt\n"); break; } if (ndopts->nd_opts_done) break; } return 0; } /* * Gets source address of the first packet in hold queue * and stores it in @src. * Returns pointer to @src (if hold queue is not empty) or NULL. */ static struct in6_addr * nd6_llinfo_get_holdsrc(struct llentry *ln, struct in6_addr *src) { struct ip6_hdr *hip6; if (ln == NULL || ln->ln_hold == NULL) return NULL; /* * assuming every packet in ln_hold has the same IP header */ hip6 = mtod(ln->ln_hold, struct ip6_hdr *); /* XXX pullup? */ if (sizeof(*hip6) < ln->ln_hold->m_len) *src = hip6->ip6_src; else src = NULL; return src; } static union l3addr * nd6_llinfo_holdsrc(struct llentry *ln, union l3addr *src) { if (nd6_llinfo_get_holdsrc(ln, &src->addr6) == NULL) return NULL; return src; } static void nd6_llinfo_output(struct ifnet *ifp, const union l3addr *daddr, const union l3addr *taddr, __unused const uint8_t *tlladdr, const union l3addr *hsrc) { nd6_ns_output(ifp, daddr != NULL ? &daddr->addr6 : NULL, taddr != NULL ? &taddr->addr6 : NULL, hsrc != NULL ? &hsrc->addr6 : NULL, NULL); } static bool nd6_nud_enabled(struct ifnet *ifp) { struct nd_kifinfo *ndi = ND_IFINFO(ifp); return ndi->flags & ND6_IFF_PERFORMNUD; } static unsigned int nd6_llinfo_reachable(struct ifnet *ifp) { struct nd_kifinfo *ndi = ND_IFINFO(ifp); return ndi->reachable; } static unsigned int nd6_llinfo_retrans(struct ifnet *ifp) { struct nd_kifinfo *ndi = ND_IFINFO(ifp); return ndi->retrans; } static void nd6_llinfo_missed(struct ifnet *ifp, const union l3addr *taddr, int16_t type, struct mbuf *m) { struct in6_addr mdaddr6 = zeroin6_addr; struct sockaddr_in6 dsin6, tsin6; struct sockaddr *sa; if (m != NULL) { if (type == ND_LLINFO_PROBE) { struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); /* XXX pullup? */ if (sizeof(*ip6) < m->m_len) mdaddr6 = ip6->ip6_src; m_freem(m); } else icmp6_error2(m, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_ADDR, 0, ifp, &mdaddr6); } if (!IN6_IS_ADDR_UNSPECIFIED(&mdaddr6)) { sockaddr_in6_init(&dsin6, &mdaddr6, 0, 0, 0); sa = sin6tosa(&dsin6); } else sa = NULL; sockaddr_in6_init(&tsin6, &taddr->addr6, 0, 0, 0); rt_clonedmsg(RTM_MISS, sa, sin6tosa(&tsin6), NULL, ifp); } /* * ND6 timer routine to expire default route list and prefix list */ static void nd6_timer_work(struct work *wk, void *arg) { struct in6_ifaddr *ia6, *nia6; int s, bound; struct psref psref; callout_reset(&nd6_timer_ch, nd6_prune * hz, nd6_timer, NULL); SOFTNET_KERNEL_LOCK_UNLESS_NET_MPSAFE(); /* expire interface addresses */ bound = curlwp_bind(); s = pserialize_read_enter(); for (ia6 = IN6_ADDRLIST_READER_FIRST(); ia6; ia6 = nia6) { nia6 = IN6_ADDRLIST_READER_NEXT(ia6); ia6_acquire(ia6, &psref); pserialize_read_exit(s); /* check address lifetime */ if (IFA6_IS_INVALID(ia6)) { struct ifnet *ifp; ifp = ia6->ia_ifa.ifa_ifp; IFNET_LOCK(ifp); /* * Need to take the lock first to prevent if_detach * from running in6_purgeaddr concurrently. 
*/ if (!if_is_deactivated(ifp)) { ia6_release(ia6, &psref); in6_purgeaddr(&ia6->ia_ifa); } else { /* * ifp is being destroyed, ia6 will be destroyed * by if_detach. */ ia6_release(ia6, &psref); } ia6 = NULL; IFNET_UNLOCK(ifp); } else if (IFA6_IS_DEPRECATED(ia6)) { int oldflags = ia6->ia6_flags; if ((oldflags & IN6_IFF_DEPRECATED) == 0) { ia6->ia6_flags |= IN6_IFF_DEPRECATED; rt_addrmsg(RTM_NEWADDR, (struct ifaddr *)ia6); } } else { /* * A new RA might have made a deprecated address * preferred. */ if (ia6->ia6_flags & IN6_IFF_DEPRECATED) { ia6->ia6_flags &= ~IN6_IFF_DEPRECATED; rt_addrmsg(RTM_NEWADDR, (struct ifaddr *)ia6); } } s = pserialize_read_enter(); ia6_release(ia6, &psref); } pserialize_read_exit(s); curlwp_bindx(bound); SOFTNET_KERNEL_UNLOCK_UNLESS_NET_MPSAFE(); } static void nd6_timer(void *ignored_arg) { workqueue_enqueue(nd6_timer_wq, &nd6_timer_wk, NULL); } /* * Nuke neighbor cache/prefix/default router management table, right before * ifp goes away. */ void nd6_purge(struct ifnet *ifp, struct in6_ifextra *ext) { /* * During detach, the ND info might be already removed, but * then is explitly passed as argument. * Otherwise get it from ifp->if_afdata. */ if (ext == NULL) ext = ifp->if_afdata[AF_INET6]; if (ext == NULL) return; /* * We may not need to nuke the neighbor cache entries here * because the neighbor cache is kept in if_afdata[AF_INET6]. * nd6_purge() is invoked by in6_ifdetach() which is called * from if_detach() where everything gets purged. However * in6_ifdetach is directly called from vlan(4), so we still * need to purge entries here. */ if (ext->lltable != NULL) lltable_purge_entries(ext->lltable); } struct llentry * nd6_lookup(const struct in6_addr *addr6, const struct ifnet *ifp, bool wlock) { struct sockaddr_in6 sin6; struct llentry *ln; sockaddr_in6_init(&sin6, addr6, 0, 0, 0); IF_AFDATA_RLOCK(ifp); ln = lla_lookup(LLTABLE6(ifp), wlock ? LLE_EXCLUSIVE : 0, sin6tosa(&sin6)); IF_AFDATA_RUNLOCK(ifp); return ln; } struct llentry * nd6_create(const struct in6_addr *addr6, const struct ifnet *ifp) { struct sockaddr_in6 sin6; struct llentry *ln; struct rtentry *rt; sockaddr_in6_init(&sin6, addr6, 0, 0, 0); rt = rtalloc1(sin6tosa(&sin6), 0); IF_AFDATA_WLOCK(ifp); ln = lla_create(LLTABLE6(ifp), LLE_EXCLUSIVE, sin6tosa(&sin6), rt); IF_AFDATA_WUNLOCK(ifp); if (rt != NULL) rt_unref(rt); if (ln != NULL) ln->ln_state = ND_LLINFO_NOSTATE; return ln; } /* * Test whether a given IPv6 address is a neighbor or not, ignoring * the actual neighbor cache. The neighbor cache is ignored in order * to not reenter the routing code from within itself. */ static int nd6_is_new_addr_neighbor(const struct sockaddr_in6 *addr, struct ifnet *ifp) { struct ifaddr *dstaddr; int s; /* * A link-local address is always a neighbor. * XXX: a link does not necessarily specify a single interface. */ if (IN6_IS_ADDR_LINKLOCAL(&addr->sin6_addr)) { struct sockaddr_in6 sin6_copy; u_int32_t zone; /* * We need sin6_copy since sa6_recoverscope() may modify the * content (XXX). */ sin6_copy = *addr; if (sa6_recoverscope(&sin6_copy)) return 0; /* XXX: should be impossible */ if (in6_setscope(&sin6_copy.sin6_addr, ifp, &zone)) return 0; if (sin6_copy.sin6_scope_id == zone) return 1; else return 0; } /* * If the address is assigned on the node of the other side of * a p2p interface, the address should be a neighbor. 
*/ s = pserialize_read_enter(); dstaddr = ifa_ifwithdstaddr(sin6tocsa(addr)); if (dstaddr != NULL) { if (dstaddr->ifa_ifp == ifp) { pserialize_read_exit(s); return 1; } } pserialize_read_exit(s); return 0; } /* * Detect if a given IPv6 address identifies a neighbor on a given link. * XXX: should take care of the destination of a p2p link? */ int nd6_is_addr_neighbor(const struct sockaddr_in6 *addr, struct ifnet *ifp) { struct llentry *ln; struct rtentry *rt; /* * A link-local address is always a neighbor. * XXX: a link does not necessarily specify a single interface. */ if (IN6_IS_ADDR_LINKLOCAL(&addr->sin6_addr)) { struct sockaddr_in6 sin6_copy; u_int32_t zone; /* * We need sin6_copy since sa6_recoverscope() may modify the * content (XXX). */ sin6_copy = *addr; if (sa6_recoverscope(&sin6_copy)) return 0; /* XXX: should be impossible */ if (in6_setscope(&sin6_copy.sin6_addr, ifp, &zone)) return 0; if (sin6_copy.sin6_scope_id == zone) return 1; else return 0; } if (nd6_is_new_addr_neighbor(addr, ifp)) return 1; /* * Even if the address matches none of our addresses, it might be * in the neighbor cache or a connected route. */ ln = nd6_lookup(&addr->sin6_addr, ifp, false); if (ln != NULL) { LLE_RUNLOCK(ln); return 1; } rt = rtalloc1(sin6tocsa(addr), 0); if (rt == NULL) return 0; if ((rt->rt_flags & RTF_CONNECTED) && (rt->rt_ifp == ifp #if NBRIDGE > 0 || rt->rt_ifp->if_bridge == ifp->if_bridge #endif #if NCARP > 0 || (ifp->if_type == IFT_CARP && rt->rt_ifp == ifp->if_carpdev) || (rt->rt_ifp->if_type == IFT_CARP && rt->rt_ifp->if_carpdev == ifp)|| (ifp->if_type == IFT_CARP && rt->rt_ifp->if_type == IFT_CARP && rt->rt_ifp->if_carpdev == ifp->if_carpdev) #endif )) { rt_unref(rt); return 1; } rt_unref(rt); return 0; } /* * Free an nd6 llinfo entry. * Since the function would cause significant changes in the kernel, DO NOT * make it global, unless you have a strong reason for the change, and are sure * that the change is safe. */ static void nd6_free(struct llentry *ln, int gc) { struct ifnet *ifp; KASSERT(ln != NULL); LLE_WLOCK_ASSERT(ln); /* * If the reason for the deletion is just garbage collection, * and the neighbor is an active router, do not delete it. * Instead, reset the GC timer using the router's lifetime. * XXX: the check for ln_state should be redundant, * but we intentionally keep it just in case. */ if (!ip6_forwarding && ln->ln_router && ln->ln_state == ND_LLINFO_STALE && gc) { nd_set_timer(ln, ND_TIMER_EXPIRE); LLE_WUNLOCK(ln); return; } ifp = ln->lle_tbl->llt_ifp; if (ln->la_flags & LLE_VALID || gc) { struct sockaddr_in6 sin6; const char *lladdr; sockaddr_in6_init(&sin6, &ln->r_l3addr.addr6, 0, 0, 0); lladdr = ln->la_flags & LLE_VALID ? (const char *)&ln->ll_addr : NULL; rt_clonedmsg(RTM_DELETE, NULL, sin6tosa(&sin6), lladdr, ifp); } /* * Save to unlock. We still hold an extra reference and will not * free(9) in llentry_free() if someone else holds one as well. */ LLE_WUNLOCK(ln); IF_AFDATA_LOCK(ifp); LLE_WLOCK(ln); lltable_free_entry(LLTABLE6(ifp), ln); IF_AFDATA_UNLOCK(ifp); } /* * Upper-layer reachability hint for Neighbor Unreachability Detection. * * XXX cost-effective methods? 
*/ void nd6_nud_hint(struct rtentry *rt) { struct llentry *ln; struct ifnet *ifp; if (rt == NULL) return; ifp = rt->rt_ifp; ln = nd6_lookup(&(satocsin6(rt_getkey(rt)))->sin6_addr, ifp, true); nd_nud_hint(ln); } struct gc_args { int gc_entries; const struct in6_addr *skip_in6; }; static int nd6_purge_entry(struct lltable *llt, struct llentry *ln, void *farg) { struct gc_args *args = farg; int *n = &args->gc_entries; const struct in6_addr *skip_in6 = args->skip_in6; if (*n <= 0) return 0; if (ND_IS_LLINFO_PERMANENT(ln)) return 0; if (IN6_ARE_ADDR_EQUAL(&ln->r_l3addr.addr6, skip_in6)) return 0; LLE_WLOCK(ln); if (ln->ln_state > ND_LLINFO_INCOMPLETE) ln->ln_state = ND_LLINFO_STALE; else ln->ln_state = ND_LLINFO_PURGE; nd_set_timer(ln, ND_TIMER_IMMEDIATE); LLE_WUNLOCK(ln); (*n)--; return 0; } static void nd6_gc_neighbors(struct lltable *llt, const struct in6_addr *in6) { if (ip6_neighborgcthresh >= 0 && lltable_get_entry_count(llt) >= ip6_neighborgcthresh) { struct gc_args gc_args = {10, in6}; /* * XXX entries that are "less recently used" should be * freed first. */ lltable_foreach_lle(llt, nd6_purge_entry, &gc_args); } } void nd6_rtrequest(int req, struct rtentry *rt, const struct rt_addrinfo *info) { struct sockaddr *gate = rt->rt_gateway; struct ifnet *ifp = rt->rt_ifp; uint8_t namelen = strlen(ifp->if_xname), addrlen = ifp->if_addrlen; struct ifaddr *ifa; RT_DPRINTF("rt_getkey(rt) = %p\n", rt_getkey(rt)); if (req == RTM_LLINFO_UPD) { int rc; struct in6_addr *in6; struct in6_addr in6_all; int anycast; if ((ifa = info->rti_ifa) == NULL) return; in6 = &ifatoia6(ifa)->ia_addr.sin6_addr; anycast = ifatoia6(ifa)->ia6_flags & IN6_IFF_ANYCAST; in6_all = in6addr_linklocal_allnodes; if ((rc = in6_setscope(&in6_all, ifa->ifa_ifp, NULL)) != 0) { log(LOG_ERR, "%s: failed to set scope %s " "(errno=%d)\n", __func__, if_name(ifp), rc); return; } /* XXX don't set Override for proxy addresses */ nd6_na_output(ifa->ifa_ifp, &in6_all, in6, (anycast ? 0 : ND_NA_FLAG_OVERRIDE) #if 0 | (ip6_forwarding ? ND_NA_FLAG_ROUTER : 0) #endif , 1, NULL); return; } if ((rt->rt_flags & RTF_GATEWAY) != 0) { if (req != RTM_ADD) return; /* * linklayers with particular MTU limitation. */ switch(ifp->if_type) { #if NARCNET > 0 case IFT_ARCNET: if (rt->rt_rmx.rmx_mtu > ARC_PHDS_MAXMTU) /* RFC2497 */ rt->rt_rmx.rmx_mtu = ARC_PHDS_MAXMTU; break; #endif } return; } if (nd6_need_cache(ifp) == 0 && (rt->rt_flags & RTF_HOST) == 0) { RT_DPRINTF("rt_getkey(rt) = %p\n", rt_getkey(rt)); /* * This is probably an interface direct route for a link * which does not need neighbor caches (e.g. fe80::%lo0/64). * We do not need special treatment below for such a route. * Moreover, the RTF_LLINFO flag which would be set below * would annoy the ndp(8) command. */ return; } switch (req) { case RTM_ADD: { struct psref psref; RT_DPRINTF("rt_getkey(rt) = %p\n", rt_getkey(rt)); /* * There is no backward compatibility :) * * if ((rt->rt_flags & RTF_HOST) == 0 && * SIN(rt_mask(rt))->sin_addr.s_addr != 0xffffffff) * rt->rt_flags |= RTF_CLONING; */ /* XXX should move to route.c? */ if (rt->rt_flags & (RTF_CONNECTED | RTF_LOCAL)) { union { struct sockaddr sa; struct sockaddr_dl sdl; struct sockaddr_storage ss; } u; /* * Case 1: This route should come from a route to * interface (RTF_CLONING case) or the route should be * treated as on-link but is currently not * (RTF_LLINFO && ln == NULL case). 
*/ if (sockaddr_dl_init(&u.sdl, sizeof(u.ss), ifp->if_index, ifp->if_type, NULL, namelen, NULL, addrlen) == NULL) { printf("%s.%d: sockaddr_dl_init(, %zu, ) " "failed on %s\n", __func__, __LINE__, sizeof(u.ss), if_name(ifp)); } rt_setgate(rt, &u.sa); gate = rt->rt_gateway; RT_DPRINTF("rt_getkey(rt) = %p\n", rt_getkey(rt)); if (gate == NULL) { log(LOG_ERR, "%s: rt_setgate failed on %s\n", __func__, if_name(ifp)); break; } RT_DPRINTF("rt_getkey(rt) = %p\n", rt_getkey(rt)); if ((rt->rt_flags & RTF_CONNECTED) != 0) break; } RT_DPRINTF("rt_getkey(rt) = %p\n", rt_getkey(rt)); /* * In IPv4 code, we try to annonuce new RTF_ANNOUNCE entry here. * We don't do that here since llinfo is not ready yet. * * There are also couple of other things to be discussed: * - unsolicited NA code needs improvement beforehand * - RFC2461 says we MAY send multicast unsolicited NA * (7.2.6 paragraph 4), however, it also says that we * SHOULD provide a mechanism to prevent multicast NA storm. * we don't have anything like it right now. * note that the mechanism needs a mutual agreement * between proxies, which means that we need to implement * a new protocol, or a new kludge. * - from RFC2461 6.2.4, host MUST NOT send an unsolicited NA. * we need to check ip6forwarding before sending it. * (or should we allow proxy ND configuration only for * routers? there's no mention about proxy ND from hosts) */ #if 0 /* XXX it does not work */ if (rt->rt_flags & RTF_ANNOUNCE) nd6_na_output(ifp, &satocsin6(rt_getkey(rt))->sin6_addr, &satocsin6(rt_getkey(rt))->sin6_addr, ip6_forwarding ? ND_NA_FLAG_ROUTER : 0, 1, NULL); #endif if ((ifp->if_flags & (IFF_POINTOPOINT | IFF_LOOPBACK)) == 0) { RT_DPRINTF("rt_getkey(rt) = %p\n", rt_getkey(rt)); /* * Address resolution isn't necessary for a point to * point link, so we can skip this test for a p2p link. */ if (gate->sa_family != AF_LINK || gate->sa_len < sockaddr_dl_measure(namelen, addrlen)) { log(LOG_DEBUG, "nd6_rtrequest: bad gateway value: %s\n", if_name(ifp)); break; } satosdl(gate)->sdl_type = ifp->if_type; satosdl(gate)->sdl_index = ifp->if_index; RT_DPRINTF("rt_getkey(rt) = %p\n", rt_getkey(rt)); } RT_DPRINTF("rt_getkey(rt) = %p\n", rt_getkey(rt)); /* * When called from rt_ifa_addlocal, we cannot depend on that * the address (rt_getkey(rt)) exits in the address list of the * interface. So check RTF_LOCAL instead. */ if (rt->rt_flags & RTF_LOCAL) { if (nd6_useloopback) rt->rt_ifp = lo0ifp; /* XXX */ break; } /* * check if rt_getkey(rt) is an address assigned * to the interface. */ ifa = (struct ifaddr *)in6ifa_ifpwithaddr_psref(ifp, &satocsin6(rt_getkey(rt))->sin6_addr, &psref); if (ifa != NULL) { if (nd6_useloopback) { rt->rt_ifp = lo0ifp; /* XXX */ /* * Make sure rt_ifa be equal to the ifaddr * corresponding to the address. * We need this because when we refer * rt_ifa->ia6_flags in ip6_input, we assume * that the rt_ifa points to the address instead * of the loopback address. 
*/ if (!ISSET(info->rti_flags, RTF_DONTCHANGEIFA) && ifa != rt->rt_ifa) rt_replace_ifa(rt, ifa); } } else if (rt->rt_flags & RTF_ANNOUNCE) { /* join solicited node multicast for proxy ND */ if (ifp->if_flags & IFF_MULTICAST) { struct in6_addr llsol; int error; llsol = satocsin6(rt_getkey(rt))->sin6_addr; llsol.s6_addr32[0] = htonl(0xff020000); llsol.s6_addr32[1] = 0; llsol.s6_addr32[2] = htonl(1); llsol.s6_addr8[12] = 0xff; if (in6_setscope(&llsol, ifp, NULL)) goto out; if (!in6_addmulti(&llsol, ifp, &error, 0)) { char ip6buf[INET6_ADDRSTRLEN]; nd6log(LOG_ERR, "%s: failed to join " "%s (errno=%d)\n", if_name(ifp), IN6_PRINT(ip6buf, &llsol), error); } } } out: ifa_release(ifa, &psref); /* * If we have too many cache entries, initiate immediate * purging for some entries. */ if (rt->rt_ifp != NULL) nd6_gc_neighbors(LLTABLE6(rt->rt_ifp), NULL); break; } case RTM_DELETE: /* leave from solicited node multicast for proxy ND */ if ((rt->rt_flags & RTF_ANNOUNCE) != 0 && (ifp->if_flags & IFF_MULTICAST) != 0) { struct in6_addr llsol; llsol = satocsin6(rt_getkey(rt))->sin6_addr; llsol.s6_addr32[0] = htonl(0xff020000); llsol.s6_addr32[1] = 0; llsol.s6_addr32[2] = htonl(1); llsol.s6_addr8[12] = 0xff; if (in6_setscope(&llsol, ifp, NULL) == 0) in6_lookup_and_delete_multi(&llsol, ifp); } break; } } static void nd6_setifflags(struct ifnet *ifp, uint32_t flags) { struct nd_kifinfo *ndi = ND_IFINFO(ifp); struct ifaddr *ifa; struct in6_ifaddr *ia; int s; if (ndi->flags & ND6_IFF_IFDISABLED && !(flags & ND6_IFF_IFDISABLED)) { /* * If the interface is marked as ND6_IFF_IFDISABLED and * has a link-local address with IN6_IFF_DUPLICATED, * do not clear ND6_IFF_IFDISABLED. * See RFC 4862, section 5.4.5. */ bool duplicated_linklocal = false; s = pserialize_read_enter(); IFADDR_READER_FOREACH(ifa, ifp) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; ia = (struct in6_ifaddr *)ifa; if ((ia->ia6_flags & IN6_IFF_DUPLICATED) && IN6_IS_ADDR_LINKLOCAL(IA6_IN6(ia))) { duplicated_linklocal = true; break; } } pserialize_read_exit(s); if (duplicated_linklocal) { flags |= ND6_IFF_IFDISABLED; log(LOG_ERR, "%s: Cannot enable an interface" " with a link-local address marked" " duplicate.\n", if_name(ifp)); } else { ndi->flags &= ~ND6_IFF_IFDISABLED; if (ifp->if_flags & IFF_UP) in6_if_up(ifp); } } else if (!(ndi->flags & ND6_IFF_IFDISABLED) && (flags & ND6_IFF_IFDISABLED)) { struct psref psref; int bound = curlwp_bind(); /* Mark all IPv6 addresses as tentative. */ ndi->flags |= ND6_IFF_IFDISABLED; s = pserialize_read_enter(); IFADDR_READER_FOREACH(ifa, ifp) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; ifa_acquire(ifa, &psref); pserialize_read_exit(s); nd6_dad_stop(ifa); ia = (struct in6_ifaddr *)ifa; ia->ia6_flags |= IN6_IFF_TENTATIVE; s = pserialize_read_enter(); ifa_release(ifa, &psref); } pserialize_read_exit(s); curlwp_bindx(bound); } if (flags & ND6_IFF_AUTO_LINKLOCAL) { if (!(ndi->flags & ND6_IFF_AUTO_LINKLOCAL)) { /* auto_linklocal 0->1 transition */ ndi->flags |= ND6_IFF_AUTO_LINKLOCAL; in6_ifattach(ifp, NULL); } else if (!(flags & ND6_IFF_IFDISABLED) && ifp->if_flags & IFF_UP) { /* * When the IF already has * ND6_IFF_AUTO_LINKLOCAL, no link-local * address is assigned, and IFF_UP, try to * assign one. 
*/ bool haslinklocal = 0; s = pserialize_read_enter(); IFADDR_READER_FOREACH(ifa, ifp) { if (ifa->ifa_addr->sa_family !=AF_INET6) continue; ia = (struct in6_ifaddr *)ifa; if (IN6_IS_ADDR_LINKLOCAL(IA6_IN6(ia))){ haslinklocal = true; break; } } pserialize_read_exit(s); if (!haslinklocal) in6_ifattach(ifp, NULL); } } ndi->flags = flags; } int nd6_ioctl(u_long cmd, void *data, struct ifnet *ifp) { #ifdef OSIOCGIFINFO_IN6_90 struct in6_ndireq90 *ondi = (struct in6_ndireq90 *)data; struct in6_ndifreq90 *ndif = (struct in6_ndifreq90 *)data; #define OND ondi->ndi #endif struct in6_ndireq *ndi = (struct in6_ndireq *)data; struct in6_nbrinfo *nbi = (struct in6_nbrinfo *)data; struct nd_kifinfo *ifndi = ND_IFINFO(ifp); int error = 0; #define ND ndi->ndi switch (cmd) { #ifdef OSIOCSRTRFLUSH_IN6 case OSIOCGDRLST_IN6: /* FALLTHROUGH */ case OSIOCGPRLST_IN6: /* FALLTHROUGH */ case OSIOCSNDFLUSH_IN6: /* FALLTHROUGH */ case OSIOCSPFXFLUSH_IN6: /* FALLTHROUGH */ case OSIOCSRTRFLUSH_IN6: /* FALLTHROUGH */ break; case OSIOCGDEFIFACE_IN6: ndif->ifindex = 0; break; case OSIOCSDEFIFACE_IN6: error = ENOTSUP; break; #endif #ifdef OSIOCGIFINFO_IN6 case OSIOCGIFINFO_IN6: /* FALLTHROUGH */ #endif #ifdef OSIOCGIFINFO_IN6_90 case OSIOCGIFINFO_IN6_90: memset(&OND, 0, sizeof(OND)); OND.initialized = 1; OND.chlim = ifndi->chlim; OND.basereachable = ifndi->basereachable; OND.retrans = ifndi->retrans; OND.flags = ifndi->flags; break; case OSIOCSIFINFO_IN6_90: /* Allow userland to set Neighbor Unreachability Detection * timers. */ if (OND.chlim != 0) ifndi->chlim = OND.chlim; if (OND.basereachable != 0 && OND.basereachable != ifndi->basereachable) { ifndi->basereachable = OND.basereachable; ifndi->reachable = ND_COMPUTE_RTIME(OND.basereachable); } if (OND.retrans != 0) ifndi->retrans = OND.retrans; /* Retain the old behaviour .... */ /* FALLTHROUGH */ case OSIOCSIFINFO_FLAGS_90: nd6_setifflags(ifp, OND.flags); break; #undef OND #endif case SIOCGIFINFO_IN6: ND.chlim = ifndi->chlim; ND.basereachable = ifndi->basereachable; ND.retrans = ifndi->retrans; ND.flags = ifndi->flags; break; case SIOCSIFINFO_IN6: /* Allow userland to set Neighbor Unreachability Detection * timers. */ if (ND.chlim != 0) ifndi->chlim = ND.chlim; if (ND.basereachable != 0 && ND.basereachable != ifndi->basereachable) { ifndi->basereachable = ND.basereachable; ifndi->reachable = ND_COMPUTE_RTIME(ND.basereachable); } if (ND.retrans != 0) ifndi->retrans = ND.retrans; break; case SIOCSIFINFO_FLAGS: nd6_setifflags(ifp, ND.flags); break; #undef ND case SIOCGNBRINFO_IN6: { struct llentry *ln; struct in6_addr nb_addr = nbi->addr; /* make local for safety */ if ((error = in6_setscope(&nb_addr, ifp, NULL)) != 0) return error; ln = nd6_lookup(&nb_addr, ifp, false); if (ln == NULL) { error = EINVAL; break; } nbi->state = ln->ln_state; nbi->asked = ln->ln_asked; nbi->isrouter = ln->ln_router; nbi->expire = ln->ln_expire ? time_mono_to_wall(ln->ln_expire) : 0; LLE_RUNLOCK(ln); break; } } return error; } void nd6_llinfo_release_pkts(struct llentry *ln, struct ifnet *ifp) { struct mbuf *m_hold, *m_hold_next; struct sockaddr_in6 sin6; LLE_WLOCK_ASSERT(ln); sockaddr_in6_init(&sin6, &ln->r_l3addr.addr6, 0, 0, 0); m_hold = ln->la_hold, ln->la_hold = NULL, ln->la_numheld = 0; LLE_ADDREF(ln); LLE_WUNLOCK(ln); for (; m_hold != NULL; m_hold = m_hold_next) { m_hold_next = m_hold->m_nextpkt; m_hold->m_nextpkt = NULL; /* * we assume ifp is not a p2p here, so * just set the 2nd argument as the * 1st one. 
*/ ip6_if_output(ifp, ifp, m_hold, &sin6, NULL); } LLE_WLOCK(ln); LLE_REMREF(ln); } /* * Create neighbor cache entry and cache link-layer address, * on reception of inbound ND6 packets. (RS/RA/NS/redirect) */ void nd6_cache_lladdr( struct ifnet *ifp, struct in6_addr *from, char *lladdr, int lladdrlen, int type, /* ICMP6 type */ int code /* type dependent information */ ) { struct llentry *ln = NULL; int is_newentry; int do_update; int olladdr; int llchange; int newstate = 0; KASSERT(ifp != NULL); KASSERT(from != NULL); /* nothing must be updated for unspecified address */ if (IN6_IS_ADDR_UNSPECIFIED(from)) return; /* * Validation about ifp->if_addrlen and lladdrlen must be done in * the caller. * * XXX If the link does not have link-layer adderss, what should * we do? (ifp->if_addrlen == 0) * Spec says nothing in sections for RA, RS and NA. There's small * description on it in NS section (RFC 2461 7.2.3). */ ln = nd6_lookup(from, ifp, true); if (ln == NULL) { #if 0 /* nothing must be done if there's no lladdr */ if (!lladdr || !lladdrlen) return NULL; #endif ln = nd6_create(from, ifp); is_newentry = 1; } else { /* do nothing if static ndp is set */ if (ln->la_flags & LLE_STATIC) { LLE_WUNLOCK(ln); return; } is_newentry = 0; } if (ln == NULL) return; olladdr = (ln->la_flags & LLE_VALID) ? 1 : 0; if (olladdr && lladdr) { llchange = memcmp(lladdr, &ln->ll_addr, ifp->if_addrlen); } else llchange = 0; /* * newentry olladdr lladdr llchange (*=record) * 0 n n -- (1) * 0 y n -- (2) * 0 n y -- (3) * STALE * 0 y y n (4) * * 0 y y y (5) * STALE * 1 -- n -- (6) NOSTATE(= PASSIVE) * 1 -- y -- (7) * STALE */ if (lladdr) { /* (3-5) and (7) */ /* * Record source link-layer address * XXX is it dependent to ifp->if_type? */ memcpy(&ln->ll_addr, lladdr, ifp->if_addrlen); ln->la_flags |= LLE_VALID; } if (!is_newentry) { if ((!olladdr && lladdr) || /* (3) */ (olladdr && lladdr && llchange)) { /* (5) */ do_update = 1; newstate = ND_LLINFO_STALE; } else /* (1-2,4) */ do_update = 0; } else { do_update = 1; if (lladdr == NULL) /* (6) */ newstate = ND_LLINFO_NOSTATE; else /* (7) */ newstate = ND_LLINFO_STALE; } if (do_update) { /* * Update the state of the neighbor cache. */ ln->ln_state = newstate; if (ln->ln_state == ND_LLINFO_STALE) { /* * XXX: since nd6_output() below will cause * state tansition to DELAY and reset the timer, * we must set the timer now, although it is actually * meaningless. */ nd_set_timer(ln, ND_TIMER_GC); nd6_llinfo_release_pkts(ln, ifp); } else if (ln->ln_state == ND_LLINFO_INCOMPLETE) { /* probe right away */ nd_set_timer(ln, ND_TIMER_IMMEDIATE); } } /* * ICMP6 type dependent behavior. * * NS: clear IsRouter if new entry * RS: clear IsRouter * RA: set IsRouter if there's lladdr * redir: clear IsRouter if new entry * * RA case, (1): * The spec says that we must set IsRouter in the following cases: * - If lladdr exist, set IsRouter. This means (1-5). * - If it is old entry (!newentry), set IsRouter. This means (7). * So, based on the spec, in (1-5) and (7) cases we must set IsRouter. * A question arises for (1) case. (1) case has no lladdr in the * neighbor cache, this is similar to (6). * This case is rare but we figured that we MUST NOT set IsRouter. * * newentry olladdr lladdr llchange NS RS RA redir * D R * 0 n n -- (1) c ? s * 0 y n -- (2) c s s * 0 n y -- (3) c s s * 0 y y n (4) c s s * 0 y y y (5) c s s * 1 -- n -- (6) c c c s * 1 -- y -- (7) c c s c s * * (c=clear s=set) */ switch (type & 0xff) { case ND_NEIGHBOR_SOLICIT: /* * New entry must have is_router flag cleared. 
*/ if (is_newentry) /* (6-7) */ ln->ln_router = 0; break; case ND_REDIRECT: /* * If the icmp is a redirect to a better router, always set the * is_router flag. Otherwise, if the entry is newly created, * clear the flag. [RFC 2461, sec 8.3] */ if (code == ND_REDIRECT_ROUTER) ln->ln_router = 1; else if (is_newentry) /* (6-7) */ ln->ln_router = 0; break; case ND_ROUTER_SOLICIT: /* * is_router flag must always be cleared. */ ln->ln_router = 0; break; case ND_ROUTER_ADVERT: /* * Mark an entry with lladdr as a router. */ if ((!is_newentry && (olladdr || lladdr)) || /* (2-5) */ (is_newentry && lladdr)) { /* (7) */ ln->ln_router = 1; } break; } if (do_update && lladdr != NULL) { struct sockaddr_in6 sin6; sockaddr_in6_init(&sin6, from, 0, 0, 0); rt_clonedmsg(is_newentry ? RTM_ADD : RTM_CHANGE, NULL, sin6tosa(&sin6), lladdr, ifp); } if (ln != NULL) LLE_WUNLOCK(ln); /* * If we have too many cache entries, initiate immediate * purging for some entries. */ if (is_newentry) nd6_gc_neighbors(LLTABLE6(ifp), &ln->r_l3addr.addr6); } static void nd6_slowtimo(void *ignored_arg) { struct nd_kifinfo *ndi; struct ifnet *ifp; struct psref psref; int s; SOFTNET_KERNEL_LOCK_UNLESS_NET_MPSAFE(); callout_reset(&nd6_slowtimo_ch, ND6_SLOWTIMER_INTERVAL * hz, nd6_slowtimo, NULL); s = pserialize_read_enter(); IFNET_READER_FOREACH(ifp) { ndi = ND_IFINFO(ifp); if (ndi->basereachable && /* already initialized */ (ndi->recalctm -= ND6_SLOWTIMER_INTERVAL) <= 0) { if_acquire(ifp, &psref); pserialize_read_exit(s); /* * Since reachable time rarely changes by router * advertisements, we SHOULD insure that a new random * value gets recomputed at least once every few hours. * (RFC 2461, 6.3.4) */ ndi->recalctm = nd6_recalc_reachtm_interval; ndi->reachable = ND_COMPUTE_RTIME(ndi->basereachable); s = pserialize_read_enter(); if_release(ifp, &psref); } } pserialize_read_exit(s); SOFTNET_KERNEL_UNLOCK_UNLESS_NET_MPSAFE(); } /* * Return 0 if a neighbor cache is found. Return EWOULDBLOCK if a cache is not * found and trying to resolve a neighbor; in this case the mbuf is queued in * the list. Otherwise return errno after freeing the mbuf. */ int nd6_resolve(struct ifnet *ifp, const struct rtentry *rt, struct mbuf *m, const struct sockaddr *_dst, uint8_t *lldst, size_t dstsize) { struct llentry *ln = NULL; bool created = false; const struct sockaddr_in6 *dst = satocsin6(_dst); int error; struct nd_kifinfo *ndi = ND_IFINFO(ifp); /* discard the packet if IPv6 operation is disabled on the interface */ if (ndi->flags & ND6_IFF_IFDISABLED) { m_freem(m); return ENETDOWN; /* better error? */ } /* * Address resolution or Neighbor Unreachability Detection * for the next hop. * At this point, the destination of the packet must be a unicast * or an anycast address(i.e. not a multicast). */ /* Look up the neighbor cache for the nexthop */ ln = nd6_lookup(&dst->sin6_addr, ifp, false); if (ln != NULL && (ln->la_flags & LLE_VALID) != 0 && ln->ln_state == ND_LLINFO_REACHABLE) { /* Fast path */ memcpy(lldst, &ln->ll_addr, MIN(dstsize, ifp->if_addrlen)); LLE_RUNLOCK(ln); return 0; } if (ln != NULL) LLE_RUNLOCK(ln); /* Slow path */ ln = nd6_lookup(&dst->sin6_addr, ifp, true); if (ln == NULL && nd6_is_addr_neighbor(dst, ifp)) { /* * Since nd6_is_addr_neighbor() internally calls nd6_lookup(), * the condition below is not very efficient. But we believe * it is tolerable, because this should be a rare case. 
*/ ln = nd6_create(&dst->sin6_addr, ifp); if (ln == NULL) { char ip6buf[INET6_ADDRSTRLEN]; log(LOG_DEBUG, "%s: can't allocate llinfo for %s " "(ln=%p, rt=%p)\n", __func__, IN6_PRINT(ip6buf, &dst->sin6_addr), ln, rt); m_freem(m); return ENOBUFS; } created = true; } if (ln == NULL) { m_freem(m); return ENETDOWN; /* better error? */ } error = nd_resolve(ln, rt, m, lldst, dstsize); if (created) nd6_gc_neighbors(LLTABLE6(ifp), &dst->sin6_addr); return error; } int nd6_need_cache(struct ifnet *ifp) { /* * XXX: we currently do not make neighbor cache on any interface * other than ARCnet, Ethernet, and GIF. * * RFC2893 says: * - unidirectional tunnels needs no ND */ switch (ifp->if_type) { case IFT_ARCNET: case IFT_ETHER: case IFT_IEEE1394: case IFT_CARP: case IFT_GIF: /* XXX need more cases? */ case IFT_IPSEC: case IFT_PPP: case IFT_TUNNEL: return 1; default: return 0; } } int nd6_sysctl( int name, void *oldp, /* syscall arg, need copyout */ size_t *oldlenp, void *newp, /* syscall arg, need copyin */ size_t newlen ) { int error; if (newp) return EPERM; switch (name) { /* call the nd6 compat_90 hook to validate the nd6-related names */ case OICMPV6CTL_ND6_DRLIST: /* FALLTHROUGH */ case OICMPV6CTL_ND6_PRLIST: MODULE_HOOK_CALL(net_inet6_nd_90_hook, (name), ENOPROTOOPT, error); if (error == 0) *oldlenp = 0; return error; case ICMPV6CTL_ND6_MAXQLEN: return 0; default: return ENOPROTOOPT; } }
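/*
 * Illustrative caller sketch for nd6_resolve() above; example_ifp_output()
 * is hypothetical and not part of this file.  Per the function's own
 * description, nd6_resolve() returns 0 when the neighbor's link-layer
 * address is available (copied into lldst), EWOULDBLOCK when the mbuf has
 * been queued on the llentry while Neighbor Discovery resolves the
 * address, and another errno after freeing the mbuf.  The fixed-size
 * lladdr buffer assumes an Ethernet-style interface.
 */
static int
example_ifp_output(struct ifnet *ifp, const struct rtentry *rt,
    struct mbuf *m, const struct sockaddr *dst)
{
	uint8_t lladdr[ETHER_ADDR_LEN];	/* assumes if_addrlen == ETHER_ADDR_LEN */
	int error;

	error = nd6_resolve(ifp, rt, m, dst, lladdr, sizeof(lladdr));
	if (error == EWOULDBLOCK)
		return 0;	/* m queued; sent or dropped once ND completes */
	if (error != 0)
		return error;	/* m has already been freed by nd6_resolve() */

	/* lladdr now holds the destination link-layer address;
	 * prepend the link-layer header and hand the frame to the driver. */
	return 0;
}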
/* $NetBSD: signalvar.h,v 1.104 2021/11/01 05:07:17 thorpej Exp $ */ /* * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)signalvar.h 8.6 (Berkeley) 2/19/95 */ #ifndef _SYS_SIGNALVAR_H_ #define _SYS_SIGNALVAR_H_ #include <sys/siginfo.h> #include <sys/queue.h> #include <sys/mutex.h> #include <sys/stdbool.h> #ifndef _KERNEL #include <string.h> /* Required for memset(3) and memcpy(3) prototypes */ #endif /* _KERNEL */ /* * Kernel signal definitions and data structures, * not exported to user programs. */ /* * Queue of signals. */ typedef TAILQ_HEAD(ksiginfoq, ksiginfo) ksiginfoq_t; /* * Process signal actions, possibly shared between processes.
*/ struct sigacts { struct sigact_sigdesc { struct sigaction sd_sigact; const void *sd_tramp; int sd_vers; } sa_sigdesc[NSIG]; /* disposition of signals */ int sa_refcnt; /* reference count */ kmutex_t sa_mutex; /* lock on sa_refcnt */ }; /* * Pending signals, per LWP and per process. */ typedef struct sigpend { ksiginfoq_t sp_info; sigset_t sp_set; } sigpend_t; /* * Process signal state. */ struct sigctx { struct _ksiginfo ps_info; /* for core dump/debugger XXX */ int ps_lwp; /* for core dump/debugger XXX */ bool ps_faked; /* for core dump/debugger XXX */ void *ps_sigcode; /* address of signal trampoline */ sigset_t ps_sigignore; /* Signals being ignored. */ sigset_t ps_sigcatch; /* Signals being caught by user. */ sigset_t ps_sigpass; /* Signals evading the debugger. */ }; /* additional signal action values, used only temporarily/internally */ #define SIG_CATCH (void (*)(int))2 /* * get signal action for process and signal; currently only for current process */ #define SIGACTION(p, sig) (p->p_sigacts->sa_sigdesc[(sig)].sd_sigact) #define SIGACTION_PS(ps, sig) (ps->sa_sigdesc[(sig)].sd_sigact) /* * Copy a sigaction structure without padding. */ static __inline void sigaction_copy(struct sigaction *dst, const struct sigaction *src) { memset(dst, 0, sizeof(*dst)); dst->_sa_u._sa_handler = src->_sa_u._sa_handler; memcpy(&dst->sa_mask, &src->sa_mask, sizeof(dst->sa_mask)); dst->sa_flags = src->sa_flags; } /* * Signal properties and actions. * The array below categorizes the signals and their default actions * according to the following properties: */ #define SA_KILL 0x0001 /* terminates process by default */ #define SA_CORE 0x0002 /* ditto and coredumps */ #define SA_STOP 0x0004 /* suspend process */ #define SA_TTYSTOP 0x0008 /* ditto, from tty */ #define SA_IGNORE 0x0010 /* ignore by default */ #define SA_CONT 0x0020 /* continue if suspended */ #define SA_CANTMASK 0x0040 /* non-maskable, catchable */ #define SA_NORESET 0x0080 /* not reset when caught */ #define SA_TOLWP 0x0100 /* to LWP that generated, if local */ #define SA_TOALL 0x0200 /* always to all LWPs */ #ifdef _KERNEL #include <sys/systm.h> /* for copyin_t/copyout_t */ extern sigset_t contsigmask, stopsigmask, sigcantmask; struct vnode; struct coredump_iostate; /* * Machine-independent functions: */ int coredump_netbsd(struct lwp *, struct coredump_iostate *); int coredump_netbsd32(struct lwp *, struct coredump_iostate *); int real_coredump_netbsd(struct lwp *, struct coredump_iostate *); void execsigs(struct proc *); int issignal(struct lwp *); void pgsignal(struct pgrp *, int, int); void kpgsignal(struct pgrp *, struct ksiginfo *, void *, int); void postsig(int); void psignal(struct proc *, int); void kpsignal(struct proc *, struct ksiginfo *, void *); void child_psignal(struct proc *, int); void siginit(struct proc *); void trapsignal(struct lwp *, struct ksiginfo *); void sigexit(struct lwp *, int) __dead; void killproc(struct proc *, const char *); void setsigvec(struct proc *, int, struct sigaction *); int killpg1(struct lwp *, struct ksiginfo *, int, int); void proc_unstop(struct proc *p); void eventswitch(int, int, int); void eventswitchchild(struct proc *, int, int); int sigaction1(struct lwp *, int, const struct sigaction *, struct sigaction *, const void *, int); int sigprocmask1(struct lwp *, int, const sigset_t *, sigset_t *); void sigpending1(struct lwp *, sigset_t *); void sigsuspendsetup(struct lwp *, const sigset_t *); void sigsuspendteardown(struct lwp *); int sigsuspend1(struct lwp *, const sigset_t *); int 
sigaltstack1(struct lwp *, const stack_t *, stack_t *); int sigismasked(struct lwp *, int); int sigget(sigpend_t *, ksiginfo_t *, int, const sigset_t *); void sigclear(sigpend_t *, const sigset_t *, ksiginfoq_t *); void sigclearall(struct proc *, const sigset_t *, ksiginfoq_t *); int kpsignal2(struct proc *, ksiginfo_t *); void signal_init(void); struct sigacts *sigactsinit(struct proc *, int); void sigactsunshare(struct proc *); void sigactsfree(struct sigacts *); void kpsendsig(struct lwp *, const struct ksiginfo *, const sigset_t *); void sendsig_reset(struct lwp *, int); void sendsig(const struct ksiginfo *, const sigset_t *); ksiginfo_t *ksiginfo_alloc(struct proc *, ksiginfo_t *, int); void ksiginfo_free(ksiginfo_t *); void ksiginfo_queue_drain0(ksiginfoq_t *); struct sys_____sigtimedwait50_args; int sigtimedwait1(struct lwp *, const struct sys_____sigtimedwait50_args *, register_t *, copyin_t, copyout_t, copyin_t, copyout_t); void signotify(struct lwp *); int sigispending(struct lwp *, int); /* * Machine-dependent functions: */ void sendsig_sigcontext(const struct ksiginfo *, const sigset_t *); void sendsig_siginfo(const struct ksiginfo *, const sigset_t *); extern struct pool ksiginfo_pool; /* * firstsig: * * Return the first signal in a signal set. */ static __inline int firstsig(const sigset_t *ss) { int sig; sig = ffs(ss->__bits[0]); if (sig != 0) return (sig); #if NSIG > 33 sig = ffs(ss->__bits[1]); if (sig != 0) return (sig + 32); #endif #if NSIG > 65 sig = ffs(ss->__bits[2]); if (sig != 0) return (sig + 64); #endif #if NSIG > 97 sig = ffs(ss->__bits[3]); if (sig != 0) return (sig + 96); #endif return (0); } static __inline void ksiginfo_queue_init(ksiginfoq_t *kq) { TAILQ_INIT(kq); } static __inline void ksiginfo_queue_drain(ksiginfoq_t *kq) { if (!TAILQ_EMPTY(kq)) ksiginfo_queue_drain0(kq); } #endif /* _KERNEL */ #ifdef _KERNEL #ifdef SIGPROP const int sigprop[NSIG] = { 0, /* 0 unused */ SA_KILL, /* 1 SIGHUP */ SA_KILL, /* 2 SIGINT */ SA_KILL|SA_CORE, /* 3 SIGQUIT */ SA_KILL|SA_CORE|SA_NORESET|SA_TOLWP, /* 4 SIGILL */ SA_KILL|SA_CORE|SA_NORESET|SA_TOLWP, /* 5 SIGTRAP */ SA_KILL|SA_CORE, /* 6 SIGABRT */ SA_KILL|SA_CORE|SA_TOLWP, /* 7 SIGEMT */ SA_KILL|SA_CORE|SA_TOLWP, /* 8 SIGFPE */ SA_KILL|SA_CANTMASK|SA_TOALL, /* 9 SIGKILL */ SA_KILL|SA_CORE|SA_TOLWP, /* 10 SIGBUS */ SA_KILL|SA_CORE|SA_TOLWP, /* 11 SIGSEGV */ SA_KILL|SA_CORE|SA_TOLWP, /* 12 SIGSYS */ SA_KILL, /* 13 SIGPIPE */ SA_KILL, /* 14 SIGALRM */ SA_KILL, /* 15 SIGTERM */ SA_IGNORE, /* 16 SIGURG */ SA_STOP|SA_CANTMASK|SA_TOALL, /* 17 SIGSTOP */ SA_STOP|SA_TTYSTOP|SA_TOALL, /* 18 SIGTSTP */ SA_IGNORE|SA_CONT|SA_TOALL, /* 19 SIGCONT */ SA_IGNORE, /* 20 SIGCHLD */ SA_STOP|SA_TTYSTOP|SA_TOALL, /* 21 SIGTTIN */ SA_STOP|SA_TTYSTOP|SA_TOALL, /* 22 SIGTTOU */ SA_IGNORE, /* 23 SIGIO */ SA_KILL, /* 24 SIGXCPU */ SA_KILL, /* 25 SIGXFSZ */ SA_KILL, /* 26 SIGVTALRM */ SA_KILL, /* 27 SIGPROF */ SA_IGNORE, /* 28 SIGWINCH */ SA_IGNORE, /* 29 SIGINFO */ SA_KILL, /* 30 SIGUSR1 */ SA_KILL, /* 31 SIGUSR2 */ SA_IGNORE|SA_NORESET, /* 32 SIGPWR */ SA_KILL, /* 33 SIGRTMIN + 0 */ SA_KILL, /* 34 SIGRTMIN + 1 */ SA_KILL, /* 35 SIGRTMIN + 2 */ SA_KILL, /* 36 SIGRTMIN + 3 */ SA_KILL, /* 37 SIGRTMIN + 4 */ SA_KILL, /* 38 SIGRTMIN + 5 */ SA_KILL, /* 39 SIGRTMIN + 6 */ SA_KILL, /* 40 SIGRTMIN + 7 */ SA_KILL, /* 41 SIGRTMIN + 8 */ SA_KILL, /* 42 SIGRTMIN + 9 */ SA_KILL, /* 43 SIGRTMIN + 10 */ SA_KILL, /* 44 SIGRTMIN + 11 */ SA_KILL, /* 45 SIGRTMIN + 12 */ SA_KILL, /* 46 SIGRTMIN + 13 */ SA_KILL, /* 47 SIGRTMIN + 14 */ SA_KILL, /* 48 SIGRTMIN + 15 */ 
SA_KILL, /* 49 SIGRTMIN + 16 */ SA_KILL, /* 50 SIGRTMIN + 17 */ SA_KILL, /* 51 SIGRTMIN + 18 */ SA_KILL, /* 52 SIGRTMIN + 19 */ SA_KILL, /* 53 SIGRTMIN + 20 */ SA_KILL, /* 54 SIGRTMIN + 21 */ SA_KILL, /* 55 SIGRTMIN + 22 */ SA_KILL, /* 56 SIGRTMIN + 23 */ SA_KILL, /* 57 SIGRTMIN + 24 */ SA_KILL, /* 58 SIGRTMIN + 25 */ SA_KILL, /* 59 SIGRTMIN + 26 */ SA_KILL, /* 60 SIGRTMIN + 27 */ SA_KILL, /* 61 SIGRTMIN + 28 */ SA_KILL, /* 62 SIGRTMIN + 29 */ SA_KILL, /* 63 SIGRTMIN + 30 */ }; #undef SIGPROP #else extern const int sigprop[NSIG]; #endif /* SIGPROP */ #endif /* _KERNEL */ #endif /* !_SYS_SIGNALVAR_H_ */
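/*
 * Illustrative sketch of how firstsig() and the sigprop[] table above fit
 * together; example_lowest_killing_signal() is hypothetical and not part
 * of this header.  Signals 1-32 live in ss->__bits[0] with signal N at
 * bit N-1, so ffs() on that word yields the lowest-numbered pending
 * signal, and sigprop[] then describes its default action.  Assumes a
 * _KERNEL compilation unit where sigprop[] is declared.
 */
static int
example_lowest_killing_signal(const sigset_t *pending)
{
	/* e.g. with SIGINT (2) and SIGTERM (15) pending, firstsig() -> 2 */
	int sig = firstsig(pending);

	if (sig == 0)
		return 0;		/* nothing pending */

	/* true for e.g. SIGTERM (SA_KILL); false for SIGCHLD (SA_IGNORE) */
	return (sigprop[sig] & SA_KILL) ? sig : 0;
}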
/* $NetBSD: kern_auth.c,v 1.84 2023/10/04 22:17:09 ad Exp $ */ /*- * Copyright (c) 2005, 2006 Elad Efrat <elad@NetBSD.org> * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: kern_auth.c,v 1.84 2023/10/04 22:17:09 ad Exp $"); #include <sys/types.h> #include <sys/param.h> #include <sys/queue.h> #include <sys/proc.h> #include <sys/ucred.h> #include <sys/pool.h> #define __KAUTH_PRIVATE #include <sys/kauth.h> #include <sys/kmem.h> #include <sys/rwlock.h> #include <sys/sysctl.h> #include <sys/atomic.h> #include <sys/specificdata.h> #include <sys/vnode.h> #include <secmodel/secmodel.h> /* * Secmodel-specific credentials. */ struct kauth_key { secmodel_t ks_secmodel; /* secmodel */ specificdata_key_t ks_key; /* key */ }; /* * Listener. */ struct kauth_listener { kauth_scope_callback_t func; /* callback */ kauth_scope_t scope; /* scope backpointer */ u_int refcnt; /* reference count */ SIMPLEQ_ENTRY(kauth_listener) listener_next; /* listener list */ }; /* * Scope. */ struct kauth_scope { const char *id; /* scope name */ void *cookie; /* user cookie */ u_int nlisteners; /* # of listeners */ SIMPLEQ_HEAD(, kauth_listener) listenq; /* listener list */ SIMPLEQ_ENTRY(kauth_scope) next_scope; /* scope list */ }; static int kauth_cred_hook(kauth_cred_t, kauth_action_t, void *, void *); /* List of scopes and its lock. */ static SIMPLEQ_HEAD(, kauth_scope) scope_list = SIMPLEQ_HEAD_INITIALIZER(scope_list); /* Built-in scopes: generic, process. */ static kauth_scope_t kauth_builtin_scope_generic; static kauth_scope_t kauth_builtin_scope_system; static kauth_scope_t kauth_builtin_scope_process; static kauth_scope_t kauth_builtin_scope_network; static kauth_scope_t kauth_builtin_scope_machdep; static kauth_scope_t kauth_builtin_scope_device; static kauth_scope_t kauth_builtin_scope_cred; static kauth_scope_t kauth_builtin_scope_vnode; static specificdata_domain_t kauth_domain; static pool_cache_t kauth_cred_cache; krwlock_t kauth_lock; /* Allocate new, empty kauth credentials. */ kauth_cred_t kauth_cred_alloc(void) { kauth_cred_t cred; cred = pool_cache_get(kauth_cred_cache, PR_WAITOK); cred->cr_refcnt = 1; cred->cr_uid = 0; cred->cr_euid = 0; cred->cr_svuid = 0; cred->cr_gid = 0; cred->cr_egid = 0; cred->cr_svgid = 0; cred->cr_ngroups = 0; specificdata_init(kauth_domain, &cred->cr_sd); kauth_cred_hook(cred, KAUTH_CRED_INIT, NULL, NULL); return (cred); } /* Increment reference count to cred. */ kauth_cred_t kauth_cred_hold(kauth_cred_t cred) { KASSERT(cred != NULL); KASSERT(cred != NOCRED); KASSERT(cred != FSCRED); KASSERT(cred->cr_refcnt > 0); atomic_inc_uint(&cred->cr_refcnt); return cred; } /* Decrease reference count to cred. If reached zero, free it. 
*/ void kauth_cred_free(kauth_cred_t cred) { KASSERT(cred != NULL); KASSERT(cred != NOCRED); KASSERT(cred != FSCRED); KASSERT(cred->cr_refcnt > 0); ASSERT_SLEEPABLE(); membar_release(); if (atomic_dec_uint_nv(&cred->cr_refcnt) > 0) return; membar_acquire(); kauth_cred_hook(cred, KAUTH_CRED_FREE, NULL, NULL); specificdata_fini(kauth_domain, &cred->cr_sd); pool_cache_put(kauth_cred_cache, cred); } static void kauth_cred_clone1(kauth_cred_t from, kauth_cred_t to, bool copy_groups) { KASSERT(from != NULL); KASSERT(from != NOCRED); KASSERT(from != FSCRED); KASSERT(to != NULL); KASSERT(to != NOCRED); KASSERT(to != FSCRED); KASSERT(from->cr_refcnt > 0); to->cr_uid = from->cr_uid; to->cr_euid = from->cr_euid; to->cr_svuid = from->cr_svuid; to->cr_gid = from->cr_gid; to->cr_egid = from->cr_egid; to->cr_svgid = from->cr_svgid; if (copy_groups) { to->cr_ngroups = from->cr_ngroups; memcpy(to->cr_groups, from->cr_groups, sizeof(to->cr_groups)); } kauth_cred_hook(from, KAUTH_CRED_COPY, to, NULL); } void kauth_cred_clone(kauth_cred_t from, kauth_cred_t to) { kauth_cred_clone1(from, to, true); } /* * Duplicate cred and return a new kauth_cred_t. */ kauth_cred_t kauth_cred_dup(kauth_cred_t cred) { kauth_cred_t new_cred; KASSERT(cred != NULL); KASSERT(cred != NOCRED); KASSERT(cred != FSCRED); KASSERT(cred->cr_refcnt > 0); new_cred = kauth_cred_alloc(); kauth_cred_clone(cred, new_cred); return (new_cred); } /* * Similar to crcopy(), only on a kauth_cred_t. * XXX: Is this even needed? [kauth_cred_copy] */ kauth_cred_t kauth_cred_copy(kauth_cred_t cred) { kauth_cred_t new_cred; KASSERT(cred != NULL); KASSERT(cred != NOCRED); KASSERT(cred != FSCRED); KASSERT(cred->cr_refcnt > 0); /* If the provided credentials already have one reference, use them. */ if (cred->cr_refcnt == 1) return (cred); new_cred = kauth_cred_alloc(); kauth_cred_clone(cred, new_cred); kauth_cred_free(cred); return (new_cred); } void kauth_proc_fork(struct proc *parent, struct proc *child) { mutex_enter(parent->p_lock); child->p_cred = kauth_cred_hold(parent->p_cred); mutex_exit(parent->p_lock); /* XXX: relies on parent process stalling during fork() */ kauth_cred_hook(parent->p_cred, KAUTH_CRED_FORK, parent, child); } void kauth_proc_chroot(kauth_cred_t cred, struct cwdinfo *cwdi) { kauth_cred_hook(cred, KAUTH_CRED_CHROOT, cwdi, NULL); } uid_t kauth_cred_getuid(kauth_cred_t cred) { KASSERT(cred != NULL); KASSERT(cred != NOCRED); KASSERT(cred != FSCRED); return (cred->cr_uid); } uid_t kauth_cred_geteuid(kauth_cred_t cred) { KASSERT(cred != NULL); KASSERT(cred != NOCRED); KASSERT(cred != FSCRED); return (cred->cr_euid); } uid_t kauth_cred_getsvuid(kauth_cred_t cred) { KASSERT(cred != NULL); KASSERT(cred != NOCRED); KASSERT(cred != FSCRED); return (cred->cr_svuid); } gid_t kauth_cred_getgid(kauth_cred_t cred) { KASSERT(cred != NULL); KASSERT(cred != NOCRED); KASSERT(cred != FSCRED); return (cred->cr_gid); } gid_t kauth_cred_getegid(kauth_cred_t cred) { KASSERT(cred != NULL); KASSERT(cred != NOCRED); KASSERT(cred != FSCRED); return (cred->cr_egid); } gid_t kauth_cred_getsvgid(kauth_cred_t cred) { KASSERT(cred != NULL); KASSERT(cred != NOCRED); KASSERT(cred != FSCRED); return (cred->cr_svgid); } void kauth_cred_setuid(kauth_cred_t cred, uid_t uid) { KASSERT(cred != NULL); KASSERT(cred != NOCRED); KASSERT(cred != FSCRED); KASSERT(cred->cr_refcnt == 1); cred->cr_uid = uid; } void kauth_cred_seteuid(kauth_cred_t cred, uid_t uid) { KASSERT(cred != NULL); KASSERT(cred != NOCRED); KASSERT(cred != FSCRED); KASSERT(cred->cr_refcnt == 1); cred->cr_euid = 
uid; } void kauth_cred_setsvuid(kauth_cred_t cred, uid_t uid) { KASSERT(cred != NULL); KASSERT(cred != NOCRED); KASSERT(cred != FSCRED); KASSERT(cred->cr_refcnt == 1); cred->cr_svuid = uid; } void kauth_cred_setgid(kauth_cred_t cred, gid_t gid) { KASSERT(cred != NULL); KASSERT(cred != NOCRED); KASSERT(cred != FSCRED); KASSERT(cred->cr_refcnt == 1); cred->cr_gid = gid; } void kauth_cred_setegid(kauth_cred_t cred, gid_t gid) { KASSERT(cred != NULL); KASSERT(cred != NOCRED); KASSERT(cred != FSCRED); KASSERT(cred->cr_refcnt == 1); cred->cr_egid = gid; } void kauth_cred_setsvgid(kauth_cred_t cred, gid_t gid) { KASSERT(cred != NULL); KASSERT(cred != NOCRED); KASSERT(cred != FSCRED); KASSERT(cred->cr_refcnt == 1); cred->cr_svgid = gid; } /* Checks if gid is a member of the groups in cred. */ int kauth_cred_ismember_gid(kauth_cred_t cred, gid_t gid, int *resultp) { uint32_t i; KASSERT(cred != NULL); KASSERT(cred != NOCRED); KASSERT(cred != FSCRED); KASSERT(resultp != NULL); *resultp = 0; for (i = 0; i < cred->cr_ngroups; i++) if (cred->cr_groups[i] == gid) { *resultp = 1; break; } return (0); } int kauth_cred_groupmember(kauth_cred_t cred, gid_t gid) { int ismember, error; KASSERT(cred != NULL); KASSERT(cred != NOCRED); KASSERT(cred != FSCRED); if (kauth_cred_getegid(cred) == gid) return 0; error = kauth_cred_ismember_gid(cred, gid, &ismember); if (error) return error; return ismember ? 0 : -1; } u_int kauth_cred_ngroups(kauth_cred_t cred) { KASSERT(cred != NULL); KASSERT(cred != NOCRED); KASSERT(cred != FSCRED); return (cred->cr_ngroups); } /* * Return the group at index idx from the groups in cred. */ gid_t kauth_cred_group(kauth_cred_t cred, u_int idx) { KASSERT(cred != NULL); KASSERT(cred != NOCRED); KASSERT(cred != FSCRED); KASSERT(idx < cred->cr_ngroups); return (cred->cr_groups[idx]); } /* XXX elad: gmuid is unused for now. */ int kauth_cred_setgroups(kauth_cred_t cred, const gid_t *grbuf, size_t len, uid_t gmuid, enum uio_seg seg) { int error = 0; KASSERT(cred != NULL); KASSERT(cred != NOCRED); KASSERT(cred != FSCRED); KASSERT(cred->cr_refcnt == 1); if (len > __arraycount(cred->cr_groups)) return EINVAL; if (len) { if (seg == UIO_SYSSPACE) { memcpy(cred->cr_groups, grbuf, len * sizeof(cred->cr_groups[0])); } else { error = copyin(grbuf, cred->cr_groups, len * sizeof(cred->cr_groups[0])); if (error != 0) len = 0; } } memset(cred->cr_groups + len, 0xff, sizeof(cred->cr_groups) - (len * sizeof(cred->cr_groups[0]))); cred->cr_ngroups = len; return error; } /* This supports sys_setgroups() */ int kauth_proc_setgroups(struct lwp *l, kauth_cred_t ncred) { kauth_cred_t cred; int error; /* * At this point we could delete duplicate groups from ncred, * and plausibly sort the list - but in general the later is * a bad idea. */ proc_crmod_enter(); /* Maybe we should use curproc here ? */ cred = l->l_proc->p_cred; kauth_cred_clone1(cred, ncred, false); error = kauth_authorize_process(cred, KAUTH_PROCESS_SETID, l->l_proc, NULL, NULL, NULL); if (error != 0) { proc_crmod_leave(cred, ncred, false); return error; } /* Broadcast our credentials to the process and other LWPs. 
*/ proc_crmod_leave(ncred, cred, true); return 0; } int kauth_cred_getgroups(kauth_cred_t cred, gid_t *grbuf, size_t len, enum uio_seg seg) { KASSERT(cred != NULL); if (len > cred->cr_ngroups) return EINVAL; if (seg == UIO_USERSPACE) return copyout(cred->cr_groups, grbuf, sizeof(*grbuf) * len); memcpy(grbuf, cred->cr_groups, sizeof(*grbuf) * len); return 0; } int kauth_register_key(secmodel_t secmodel, kauth_key_t *result) { kauth_key_t k; specificdata_key_t key; int error; KASSERT(result != NULL); error = specificdata_key_create(kauth_domain, &key, NULL); if (error) return (error); k = kmem_alloc(sizeof(*k), KM_SLEEP); k->ks_secmodel = secmodel; k->ks_key = key; *result = k; return (0); } int kauth_deregister_key(kauth_key_t key) { KASSERT(key != NULL); specificdata_key_delete(kauth_domain, key->ks_key); kmem_free(key, sizeof(*key)); return (0); } void * kauth_cred_getdata(kauth_cred_t cred, kauth_key_t key) { KASSERT(cred != NULL); KASSERT(cred != NOCRED); KASSERT(cred != FSCRED); KASSERT(key != NULL); return (specificdata_getspecific(kauth_domain, &cred->cr_sd, key->ks_key)); } void kauth_cred_setdata(kauth_cred_t cred, kauth_key_t key, void *data) { KASSERT(cred != NULL); KASSERT(cred != NOCRED); KASSERT(cred != FSCRED); KASSERT(key != NULL); specificdata_setspecific(kauth_domain, &cred->cr_sd, key->ks_key, data); } /* * Match uids in two credentials. */ int kauth_cred_uidmatch(kauth_cred_t cred1, kauth_cred_t cred2) { KASSERT(cred1 != NULL); KASSERT(cred1 != NOCRED); KASSERT(cred1 != FSCRED); KASSERT(cred2 != NULL); KASSERT(cred2 != NOCRED); KASSERT(cred2 != FSCRED); if (cred1->cr_uid == cred2->cr_uid || cred1->cr_euid == cred2->cr_uid || cred1->cr_uid == cred2->cr_euid || cred1->cr_euid == cred2->cr_euid) return (1); return (0); } u_int kauth_cred_getrefcnt(kauth_cred_t cred) { KASSERT(cred != NULL); KASSERT(cred != NOCRED); KASSERT(cred != FSCRED); return (cred->cr_refcnt); } /* * Convert userland credentials (struct uucred) to kauth_cred_t. * XXX: For NFS & puffs */ void kauth_uucred_to_cred(kauth_cred_t cred, const struct uucred *uuc) { KASSERT(cred != NULL); KASSERT(cred != NOCRED); KASSERT(cred != FSCRED); KASSERT(uuc != NULL); cred->cr_refcnt = 1; cred->cr_uid = uuc->cr_uid; cred->cr_euid = uuc->cr_uid; cred->cr_svuid = uuc->cr_uid; cred->cr_gid = uuc->cr_gid; cred->cr_egid = uuc->cr_gid; cred->cr_svgid = uuc->cr_gid; cred->cr_ngroups = uimin(uuc->cr_ngroups, NGROUPS); kauth_cred_setgroups(cred, __UNCONST(uuc->cr_groups), cred->cr_ngroups, -1, UIO_SYSSPACE); } /* * Convert kauth_cred_t to userland credentials (struct uucred). * XXX: For NFS & puffs */ void kauth_cred_to_uucred(struct uucred *uuc, const kauth_cred_t cred) { KASSERT(cred != NULL); KASSERT(cred != NOCRED); KASSERT(cred != FSCRED); KASSERT(uuc != NULL); int ng; ng = uimin(cred->cr_ngroups, NGROUPS); uuc->cr_uid = cred->cr_euid; uuc->cr_gid = cred->cr_egid; uuc->cr_ngroups = ng; kauth_cred_getgroups(cred, uuc->cr_groups, ng, UIO_SYSSPACE); } /* * Compare kauth_cred_t and uucred credentials. * XXX: Modelled after crcmp() for NFS. */ int kauth_cred_uucmp(kauth_cred_t cred, const struct uucred *uuc) { KASSERT(cred != NULL); KASSERT(cred != NOCRED); KASSERT(cred != FSCRED); KASSERT(uuc != NULL); if (cred->cr_euid == uuc->cr_uid && cred->cr_egid == uuc->cr_gid && cred->cr_ngroups == (uint32_t)uuc->cr_ngroups) { int i; /* Check if all groups from uuc appear in cred. 
*/ for (i = 0; i < uuc->cr_ngroups; i++) { int ismember; ismember = 0; if (kauth_cred_ismember_gid(cred, uuc->cr_groups[i], &ismember) != 0 || !ismember) return (1); } return (0); } return (1); } /* * Make a struct ucred out of a kauth_cred_t. For compatibility. */ void kauth_cred_toucred(kauth_cred_t cred, struct ki_ucred *uc) { KASSERT(cred != NULL); KASSERT(cred != NOCRED); KASSERT(cred != FSCRED); KASSERT(uc != NULL); uc->cr_ref = cred->cr_refcnt; uc->cr_uid = cred->cr_euid; uc->cr_gid = cred->cr_egid; uc->cr_ngroups = uimin(cred->cr_ngroups, __arraycount(uc->cr_groups)); memcpy(uc->cr_groups, cred->cr_groups, uc->cr_ngroups * sizeof(uc->cr_groups[0])); } /* * Make a struct pcred out of a kauth_cred_t. For compatibility. */ void kauth_cred_topcred(kauth_cred_t cred, struct ki_pcred *pc) { KASSERT(cred != NULL); KASSERT(cred != NOCRED); KASSERT(cred != FSCRED); KASSERT(pc != NULL); pc->p_pad = NULL; pc->p_ruid = cred->cr_uid; pc->p_svuid = cred->cr_svuid; pc->p_rgid = cred->cr_gid; pc->p_svgid = cred->cr_svgid; pc->p_refcnt = cred->cr_refcnt; } /* * Return kauth_cred_t for the current LWP. */ kauth_cred_t kauth_cred_get(void) { return (curlwp->l_cred); } /* * Returns a scope matching the provided id. * Requires the scope list lock to be held by the caller. */ static kauth_scope_t kauth_ifindscope(const char *id) { kauth_scope_t scope; KASSERT(rw_lock_held(&kauth_lock)); scope = NULL; SIMPLEQ_FOREACH(scope, &scope_list, next_scope) { if (strcmp(scope->id, id) == 0) break; } return (scope); } /* * Register a new scope. * * id - identifier for the scope * callback - the scope's default listener * cookie - cookie to be passed to the listener(s) */ kauth_scope_t kauth_register_scope(const char *id, kauth_scope_callback_t callback, void *cookie) { kauth_scope_t scope; kauth_listener_t listener = NULL; /* XXX gcc */ /* Sanitize input */ if (id == NULL) return (NULL); /* Allocate space for a new scope and listener. */ scope = kmem_alloc(sizeof(*scope), KM_SLEEP); if (callback != NULL) listener = kmem_alloc(sizeof(*listener), KM_SLEEP); /* * Acquire scope list lock. */ rw_enter(&kauth_lock, RW_WRITER); /* Check we don't already have a scope with the same id */ if (kauth_ifindscope(id) != NULL) { rw_exit(&kauth_lock); kmem_free(scope, sizeof(*scope)); if (callback != NULL) kmem_free(listener, sizeof(*listener)); return (NULL); } /* Initialize new scope with parameters */ scope->id = id; scope->cookie = cookie; scope->nlisteners = 1; SIMPLEQ_INIT(&scope->listenq); /* Add default listener */ if (callback != NULL) { listener->func = callback; listener->scope = scope; listener->refcnt = 0; SIMPLEQ_INSERT_HEAD(&scope->listenq, listener, listener_next); } /* Insert scope to scopes list */ SIMPLEQ_INSERT_TAIL(&scope_list, scope, next_scope); rw_exit(&kauth_lock); return (scope); } /* * Initialize the kernel authorization subsystem. * * Initialize the scopes list lock. * Create specificdata domain. * Register the credentials scope, used in kauth(9) internally. * Register built-in scopes: generic, system, process, network, machdep, device. */ void kauth_init(void) { rw_init(&kauth_lock); kauth_cred_cache = pool_cache_init(sizeof(struct kauth_cred), coherency_unit, 0, 0, "kcredpl", NULL, IPL_NONE, NULL, NULL, NULL); /* Create specificdata domain. */ kauth_domain = specificdata_domain_create(); /* Register credentials scope. */ kauth_builtin_scope_cred = kauth_register_scope(KAUTH_SCOPE_CRED, NULL, NULL); /* Register generic scope. 
*/ kauth_builtin_scope_generic = kauth_register_scope(KAUTH_SCOPE_GENERIC, NULL, NULL); /* Register system scope. */ kauth_builtin_scope_system = kauth_register_scope(KAUTH_SCOPE_SYSTEM, NULL, NULL); /* Register process scope. */ kauth_builtin_scope_process = kauth_register_scope(KAUTH_SCOPE_PROCESS, NULL, NULL); /* Register network scope. */ kauth_builtin_scope_network = kauth_register_scope(KAUTH_SCOPE_NETWORK, NULL, NULL); /* Register machdep scope. */ kauth_builtin_scope_machdep = kauth_register_scope(KAUTH_SCOPE_MACHDEP, NULL, NULL); /* Register device scope. */ kauth_builtin_scope_device = kauth_register_scope(KAUTH_SCOPE_DEVICE, NULL, NULL); /* Register vnode scope. */ kauth_builtin_scope_vnode = kauth_register_scope(KAUTH_SCOPE_VNODE, NULL, NULL); } /* * Deregister a scope. * Requires scope list lock to be held by the caller. * * scope - the scope to deregister */ void kauth_deregister_scope(kauth_scope_t scope) { if (scope != NULL) { /* Remove scope from list */ SIMPLEQ_REMOVE(&scope_list, scope, kauth_scope, next_scope); kmem_free(scope, sizeof(*scope)); } } /* * Register a listener. * * id - scope identifier. * callback - the callback routine for the listener. * cookie - cookie to pass unmodified to the callback. */ kauth_listener_t kauth_listen_scope(const char *id, kauth_scope_callback_t callback, void *cookie) { kauth_scope_t scope; kauth_listener_t listener; listener = kmem_alloc(sizeof(*listener), KM_SLEEP); rw_enter(&kauth_lock, RW_WRITER); /* * Find scope struct. */ scope = kauth_ifindscope(id); if (scope == NULL) { rw_exit(&kauth_lock); kmem_free(listener, sizeof(*listener)); return (NULL); } /* Allocate listener */ /* Initialize listener with parameters */ listener->func = callback; listener->refcnt = 0; /* Add listener to scope */ SIMPLEQ_INSERT_TAIL(&scope->listenq, listener, listener_next); /* Raise number of listeners on scope. */ scope->nlisteners++; listener->scope = scope; rw_exit(&kauth_lock); return (listener); } /* * Deregister a listener. * * listener - listener reference as returned from kauth_listen_scope(). */ void kauth_unlisten_scope(kauth_listener_t listener) { if (listener != NULL) { rw_enter(&kauth_lock, RW_WRITER); SIMPLEQ_REMOVE(&listener->scope->listenq, listener, kauth_listener, listener_next); listener->scope->nlisteners--; rw_exit(&kauth_lock); kmem_free(listener, sizeof(*listener)); } } /* * Authorize a request. * * scope - the scope of the request as defined by KAUTH_SCOPE_* or as * returned from kauth_register_scope(). * credential - credentials of the user ("actor") making the request. * action - request identifier. * arg[0-3] - passed unmodified to listener(s). * * Returns the aggregated result: * - KAUTH_RESULT_ALLOW if there is at least one KAUTH_RESULT_ALLOW and * zero KAUTH_RESULT_DENY * - KAUTH_RESULT_DENY if there is at least one KAUTH_RESULT_DENY * - KAUTH_RESULT_DEFER if there is nothing but KAUTH_RESULT_DEFER */ static int kauth_authorize_action_internal(kauth_scope_t scope, kauth_cred_t cred, kauth_action_t action, void *arg0, void *arg1, void *arg2, void *arg3) { kauth_listener_t listener; int error, allow, fail; KASSERT(cred != NULL); KASSERT(action != 0); /* Short-circuit requests coming from the kernel. 
*/ if (cred == NOCRED || cred == FSCRED) return KAUTH_RESULT_ALLOW; KASSERT(scope != NULL); fail = 0; allow = 0; /* rw_enter(&kauth_lock, RW_READER); XXX not yet */ SIMPLEQ_FOREACH(listener, &scope->listenq, listener_next) { error = listener->func(cred, action, scope->cookie, arg0, arg1, arg2, arg3); if (error == KAUTH_RESULT_ALLOW) allow = 1; else if (error == KAUTH_RESULT_DENY) fail = 1; } /* rw_exit(&kauth_lock); */ if (fail) return (KAUTH_RESULT_DENY); if (allow) return (KAUTH_RESULT_ALLOW); return (KAUTH_RESULT_DEFER); }; int kauth_authorize_action(kauth_scope_t scope, kauth_cred_t cred, kauth_action_t action, void *arg0, void *arg1, void *arg2, void *arg3) { int r; r = kauth_authorize_action_internal(scope, cred, action, arg0, arg1, arg2, arg3); if (r == KAUTH_RESULT_DENY) return (EPERM); if (r == KAUTH_RESULT_ALLOW) return (0); if (secmodel_nsecmodels() == 0) return (0); return (EPERM); } /* * Generic scope authorization wrapper. */ int kauth_authorize_generic(kauth_cred_t cred, kauth_action_t action, void *arg0) { return (kauth_authorize_action(kauth_builtin_scope_generic, cred, action, arg0, NULL, NULL, NULL)); } /* * System scope authorization wrapper. */ int kauth_authorize_system(kauth_cred_t cred, kauth_action_t action, enum kauth_system_req req, void *arg1, void *arg2, void *arg3) { return (kauth_authorize_action(kauth_builtin_scope_system, cred, action, (void *)req, arg1, arg2, arg3)); } /* * Process scope authorization wrapper. */ int kauth_authorize_process(kauth_cred_t cred, kauth_action_t action, struct proc *p, void *arg1, void *arg2, void *arg3) { return (kauth_authorize_action(kauth_builtin_scope_process, cred, action, p, arg1, arg2, arg3)); } /* * Network scope authorization wrapper. */ int kauth_authorize_network(kauth_cred_t cred, kauth_action_t action, enum kauth_network_req req, void *arg1, void *arg2, void *arg3) { return (kauth_authorize_action(kauth_builtin_scope_network, cred, action, (void *)req, arg1, arg2, arg3)); } int kauth_authorize_machdep(kauth_cred_t cred, kauth_action_t action, void *arg0, void *arg1, void *arg2, void *arg3) { return (kauth_authorize_action(kauth_builtin_scope_machdep, cred, action, arg0, arg1, arg2, arg3)); } int kauth_authorize_device(kauth_cred_t cred, kauth_action_t action, void *arg0, void *arg1, void *arg2, void *arg3) { return (kauth_authorize_action(kauth_builtin_scope_device, cred, action, arg0, arg1, arg2, arg3)); } int kauth_authorize_device_tty(kauth_cred_t cred, kauth_action_t action, struct tty *tty) { return (kauth_authorize_action(kauth_builtin_scope_device, cred, action, tty, NULL, NULL, NULL)); } int kauth_authorize_device_spec(kauth_cred_t cred, enum kauth_device_req req, struct vnode *vp) { return (kauth_authorize_action(kauth_builtin_scope_device, cred, KAUTH_DEVICE_RAWIO_SPEC, (void *)req, vp, NULL, NULL)); } int kauth_authorize_device_passthru(kauth_cred_t cred, dev_t dev, u_long bits, void *data) { return (kauth_authorize_action(kauth_builtin_scope_device, cred, KAUTH_DEVICE_RAWIO_PASSTHRU, (void *)bits, (void *)(u_long)dev, data, NULL)); } kauth_action_t kauth_accmode_to_action(accmode_t accmode) { kauth_action_t action = 0; // XXX: Revisit we need to have a richer set of kauth primitives // We also get only the Unix perms here sometimes if (accmode & (VSTAT_PERMS|VREAD)) action |= KAUTH_VNODE_READ_DATA; if (accmode & (VMODIFY_PERMS|VADMIN_PERMS)) action |= KAUTH_VNODE_WRITE_DATA; if (accmode & VEXEC) action |= KAUTH_VNODE_EXECUTE; return action == 0 ? 
KAUTH_VNODE_ACCESS : action; } kauth_action_t kauth_extattr_action(mode_t access_mode) { kauth_action_t action = 0; if (access_mode & VREAD) action |= KAUTH_VNODE_READ_EXTATTRIBUTES; if (access_mode & VWRITE) action |= KAUTH_VNODE_WRITE_EXTATTRIBUTES; return action; } int kauth_authorize_vnode(kauth_cred_t cred, kauth_action_t action, struct vnode *vp, struct vnode *dvp, int fs_decision) { int error; error = kauth_authorize_action_internal(kauth_builtin_scope_vnode, cred, action, vp, dvp, NULL, NULL); if (error == KAUTH_RESULT_DENY) return (EACCES); if (error == KAUTH_RESULT_ALLOW) return (0); /* * If the file-system does not support decision-before-action, we can * only short-circuit the operation (deny). If we're here, it means no * listener denied it, so our only alternative is to supposedly-allow * it and let the file-system have the last word. */ if (fs_decision == KAUTH_VNODE_REMOTEFS) return (0); return (fs_decision); } static int kauth_cred_hook(kauth_cred_t cred, kauth_action_t action, void *arg0, void *arg1) { int r; r = kauth_authorize_action(kauth_builtin_scope_cred, cred, action, arg0, arg1, NULL, NULL); #ifdef DIAGNOSTIC if (!SIMPLEQ_EMPTY(&kauth_builtin_scope_cred->listenq)) KASSERT(r == 0); #endif /* DIAGNOSTIC */ return (r); }
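/*
 * Illustrative sketch (not part of kern_auth.c): how a kernel component
 * could attach to the process scope using the listener API implemented
 * above.  The kauth_listen_scope()/kauth_unlisten_scope() calls, the
 * callback signature and the KAUTH_RESULT_* aggregation rules come from the
 * code above; the "example_" names and the sample policy (deny
 * KAUTH_PROCESS_SETID for euid 1000) are hypothetical.
 */
#include <sys/kauth.h>

static kauth_listener_t example_listener;

static int
example_process_cb(kauth_cred_t cred, kauth_action_t action, void *cookie,
    void *arg0, void *arg1, void *arg2, void *arg3)
{

	if (action == KAUTH_PROCESS_SETID &&
	    kauth_cred_geteuid(cred) == 1000)
		return KAUTH_RESULT_DENY;	/* any DENY wins */

	return KAUTH_RESULT_DEFER;		/* let other listeners decide */
}

static void
example_listener_attach(void)
{

	example_listener = kauth_listen_scope(KAUTH_SCOPE_PROCESS,
	    example_process_cb, NULL);
}

static void
example_listener_detach(void)
{

	kauth_unlisten_scope(example_listener);
}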
/* $NetBSD: if_llatbl.c,v 1.35 2022/11/19 08:00:51 yamt Exp $ */ /* * Copyright (c) 2004 Luigi Rizzo, Alessandro Cerri. All rights reserved. * Copyright (c) 2004-2008 Qing Li. All rights reserved. * Copyright (c) 2008 Kip Macy. All rights reserved. * Copyright (c) 2015 The NetBSD Foundation, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include <sys/cdefs.h> #ifdef _KERNEL_OPT #include "opt_ddb.h" #include "opt_inet.h" #include "opt_inet6.h" #include "opt_net_mpsafe.h" #endif #include "arp.h" #include <sys/param.h> #include <sys/systm.h> #include <sys/malloc.h> #include <sys/mbuf.h> #include <sys/syslog.h> #include <sys/sysctl.h> #include <sys/socket.h> #include <sys/socketvar.h> #include <sys/kernel.h> #include <sys/lock.h> #include <sys/mutex.h> #include <sys/rwlock.h> #ifdef DDB #include <ddb/ddb.h> #endif #include <netinet/in.h> #include <net/if_llatbl.h> #include <net/if.h> #include <net/if_dl.h> #include <net/nd.h> #include <net/route.h> #include <netinet/if_inarp.h> #include <netinet/in_var.h> #include <netinet6/in6_var.h> static SLIST_HEAD(, lltable) lltables; krwlock_t lltable_rwlock; static struct pool llentry_pool; static void lltable_unlink(struct lltable *llt); static void llentries_unlink(struct lltable *llt, struct llentries *head); static void htable_unlink_entry(struct llentry *lle); static void htable_link_entry(struct lltable *llt, struct llentry *lle); static int htable_foreach_lle(struct lltable *llt, llt_foreach_cb_t *f, void *farg); int lltable_dump_entry(struct lltable *llt, struct llentry *lle, struct rt_walkarg *w, struct sockaddr *sa) { #define RTF_LLINFO 0x400 #define RTF_CLONED 0x2000 struct ifnet *ifp = llt->llt_ifp; int error; void *a; struct sockaddr_dl sdl; int size; struct rt_addrinfo info; memset(&info, 0, sizeof(info)); info.rti_info[RTAX_DST] = sa; a = (lle->la_flags & LLE_VALID) == LLE_VALID ? 
&lle->ll_addr : NULL; if (sockaddr_dl_init(&sdl, sizeof(sdl), ifp->if_index, ifp->if_type, NULL, 0, a, ifp->if_addrlen) == NULL) return EINVAL; info.rti_info[RTAX_GATEWAY] = sstocsa(&sdl); if (sa->sa_family == AF_INET && lle->la_flags & LLE_PUB) { struct sockaddr_inarp *sin; sin = (struct sockaddr_inarp *)sa; sin->sin_other = SIN_PROXY; } if ((error = rt_msg3(RTM_GET, &info, 0, w, &size))) return error; if (w->w_where && w->w_tmem && w->w_needed <= 0) { struct rt_msghdr *rtm = (struct rt_msghdr *)w->w_tmem; /* Need to copy by myself */ rtm->rtm_index = ifp->if_index; rtm->rtm_rmx.rmx_mtu = 0; rtm->rtm_rmx.rmx_expire = (lle->la_flags & LLE_STATIC) ? 0 : time_mono_to_wall(lle->la_expire); rtm->rtm_flags = RTF_UP; rtm->rtm_flags |= RTF_HOST; /* For ndp */ /* For backward compatibility */ rtm->rtm_flags |= RTF_LLINFO | RTF_CLONED; rtm->rtm_flags |= (lle->la_flags & LLE_STATIC) ? RTF_STATIC : 0; if (lle->la_flags & LLE_PUB) rtm->rtm_flags |= RTF_ANNOUNCE; rtm->rtm_addrs = info.rti_addrs; if ((error = copyout(rtm, w->w_where, size)) != 0) w->w_where = NULL; else w->w_where = (char *)w->w_where + size; } return error; #undef RTF_LLINFO #undef RTF_CLONED } /* * Dump lle state for a specific address family. */ static int lltable_dump_af(struct lltable *llt, struct rt_walkarg *w) { int error; LLTABLE_LOCK_ASSERT(); if (llt->llt_ifp->if_flags & IFF_LOOPBACK) return (0); error = 0; IF_AFDATA_RLOCK(llt->llt_ifp); error = lltable_foreach_lle(llt, (llt_foreach_cb_t *)llt->llt_dump_entry, w); IF_AFDATA_RUNLOCK(llt->llt_ifp); return (error); } /* * Dump arp state for a specific address family. */ int lltable_sysctl_dump(int af, struct rt_walkarg *w) { struct lltable *llt; int error = 0; LLTABLE_RLOCK(); SLIST_FOREACH(llt, &lltables, llt_link) { if (llt->llt_af == af) { error = lltable_dump_af(llt, w); if (error != 0) goto done; } } done: LLTABLE_RUNLOCK(); return (error); } /* * Common function helpers for chained hash table. */ /* * Runs specified callback for each entry in @llt. * Caller does the locking. 
* */ static int htable_foreach_lle(struct lltable *llt, llt_foreach_cb_t *f, void *farg) { struct llentry *lle, *next; int i, error; error = 0; for (i = 0; i < llt->llt_hsize; i++) { LIST_FOREACH_SAFE(lle, &llt->lle_head[i], lle_next, next) { error = f(llt, lle, farg); if (error != 0) break; } } return (error); } static void htable_link_entry(struct lltable *llt, struct llentry *lle) { struct llentries *lleh; uint32_t hashidx; if ((lle->la_flags & LLE_LINKED) != 0) return; IF_AFDATA_WLOCK_ASSERT(llt->llt_ifp); hashidx = llt->llt_hash(lle, llt->llt_hsize); lleh = &llt->lle_head[hashidx]; lle->lle_tbl = llt; lle->lle_head = lleh; lle->la_flags |= LLE_LINKED; LIST_INSERT_HEAD(lleh, lle, lle_next); llt->llt_lle_count++; } static void htable_unlink_entry(struct llentry *lle) { if ((lle->la_flags & LLE_LINKED) != 0) { IF_AFDATA_WLOCK_ASSERT(lle->lle_tbl->llt_ifp); LIST_REMOVE(lle, lle_next); lle->la_flags &= ~(LLE_VALID | LLE_LINKED); #if 0 lle->lle_tbl = NULL; lle->lle_head = NULL; #endif KASSERTMSG(lle->lle_tbl->llt_lle_count != 0, "llt_lle_count=%u", lle->lle_tbl->llt_lle_count); lle->lle_tbl->llt_lle_count--; } } struct prefix_match_data { const struct sockaddr *prefix; const struct sockaddr *mask; struct llentries dchain; u_int flags; }; static int htable_prefix_free_cb(struct lltable *llt, struct llentry *lle, void *farg) { struct prefix_match_data *pmd; pmd = (struct prefix_match_data *)farg; if (llt->llt_match_prefix(pmd->prefix, pmd->mask, pmd->flags, lle)) { LLE_WLOCK(lle); LIST_INSERT_HEAD(&pmd->dchain, lle, lle_chain); } return (0); } static void htable_prefix_free(struct lltable *llt, const struct sockaddr *prefix, const struct sockaddr *mask, u_int flags) { struct llentry *lle, *next; struct prefix_match_data pmd; memset(&pmd, 0, sizeof(pmd)); pmd.prefix = prefix; pmd.mask = mask; pmd.flags = flags; LIST_INIT(&pmd.dchain); IF_AFDATA_WLOCK(llt->llt_ifp); /* Push matching lles to chain */ lltable_foreach_lle(llt, htable_prefix_free_cb, &pmd); llentries_unlink(llt, &pmd.dchain); IF_AFDATA_WUNLOCK(llt->llt_ifp); LIST_FOREACH_SAFE(lle, &pmd.dchain, lle_chain, next) llt->llt_free_entry(llt, lle); } static void htable_free_tbl(struct lltable *llt) { free(llt->lle_head, M_LLTABLE); free(llt, M_LLTABLE); } static void llentries_unlink(struct lltable *llt, struct llentries *head) { struct llentry *lle, *next; LIST_FOREACH_SAFE(lle, head, lle_chain, next) llt->llt_unlink_entry(lle); } /* * Helper function used to drop all mbufs in hold queue. * * Returns the number of held packets, if any, that were dropped. */ size_t lltable_drop_entry_queue(struct llentry *lle) { size_t pkts_dropped; struct mbuf *next; LLE_WLOCK_ASSERT(lle); pkts_dropped = 0; while ((lle->la_numheld > 0) && (lle->la_hold != NULL)) { next = lle->la_hold->m_nextpkt; m_freem(lle->la_hold); lle->la_hold = next; lle->la_numheld--; pkts_dropped++; } KASSERTMSG(lle->la_numheld == 0, "la_numheld %d > 0, pkts_dropped %zd", lle->la_numheld, pkts_dropped); return (pkts_dropped); } struct llentry * llentry_pool_get(int flags) { struct llentry *lle; lle = pool_get(&llentry_pool, flags); if (lle != NULL) memset(lle, 0, sizeof(*lle)); return lle; } void llentry_pool_put(struct llentry *lle) { pool_put(&llentry_pool, lle); } /* * Deletes an address from the address table. * This function is called by the timer functions * such as arptimer() and nd6_llinfo_timer(), and * the caller does the locking. * * Returns the number of held packets, if any, that were dropped. 
*/ size_t llentry_free(struct llentry *lle) { struct lltable *llt; size_t pkts_dropped; LLE_WLOCK_ASSERT(lle); lle->la_flags |= LLE_DELETED; if ((lle->la_flags & LLE_LINKED) != 0) { llt = lle->lle_tbl; IF_AFDATA_WLOCK_ASSERT(llt->llt_ifp); llt->llt_unlink_entry(lle); } /* * Stop a pending callout if one exists. If we cancel one, we have to * remove a reference to avoid a leak. callout_pending is required to * exclude the case that the callout has never been scheduled. */ /* XXX once softnet_lock goes away, we should use callout_halt */ if (callout_pending(&lle->la_timer)) { bool expired = callout_stop(&lle->la_timer); if (!expired) LLE_REMREF(lle); } pkts_dropped = lltable_drop_entry_queue(lle); LLE_FREE_LOCKED(lle); return (pkts_dropped); } /* * (al)locate an llentry for address dst (equivalent to rtalloc for new-arp). * * If found the llentry * is returned referenced and unlocked. */ struct llentry * llentry_alloc(struct ifnet *ifp, struct lltable *lt, struct sockaddr_storage *dst) { struct llentry *la; IF_AFDATA_RLOCK(ifp); la = lla_lookup(lt, LLE_EXCLUSIVE, (struct sockaddr *)dst); IF_AFDATA_RUNLOCK(ifp); if ((la == NULL) && (ifp->if_flags & IFF_NOARP) == 0) { IF_AFDATA_WLOCK(ifp); la = lla_create(lt, 0, (struct sockaddr *)dst, NULL /* XXX */); IF_AFDATA_WUNLOCK(ifp); } if (la != NULL) { LLE_ADDREF(la); LLE_WUNLOCK(la); } return (la); } /* * Free all entries from given table and free itself. */ static int lltable_free_cb(struct lltable *llt, struct llentry *lle, void *farg) { struct llentries *dchain; dchain = (struct llentries *)farg; LLE_WLOCK(lle); LIST_INSERT_HEAD(dchain, lle, lle_chain); return (0); } /* * Free all entries from given table. */ void lltable_purge_entries(struct lltable *llt) { struct llentry *lle, *next; struct llentries dchain; KASSERTMSG(llt != NULL, "llt is NULL"); LIST_INIT(&dchain); IF_AFDATA_WLOCK(llt->llt_ifp); /* Push all lles to @dchain */ lltable_foreach_lle(llt, lltable_free_cb, &dchain); llentries_unlink(llt, &dchain); IF_AFDATA_WUNLOCK(llt->llt_ifp); LIST_FOREACH_SAFE(lle, &dchain, lle_chain, next) (void)llentry_free(lle); } /* * Free all entries from given table and free itself. 
*/ void lltable_free(struct lltable *llt) { KASSERTMSG(llt != NULL, "llt is NULL"); lltable_unlink(llt); lltable_purge_entries(llt); llt->llt_free_tbl(llt); } void lltable_drain(int af) { struct lltable *llt; struct llentry *lle; register int i; LLTABLE_RLOCK(); SLIST_FOREACH(llt, &lltables, llt_link) { if (llt->llt_af != af) continue; for (i=0; i < llt->llt_hsize; i++) { LIST_FOREACH(lle, &llt->lle_head[i], lle_next) { LLE_WLOCK(lle); lltable_drop_entry_queue(lle); LLE_WUNLOCK(lle); } } } LLTABLE_RUNLOCK(); } void lltable_prefix_free(const int af, const struct sockaddr *prefix, const struct sockaddr *mask, const u_int flags) { struct lltable *llt; LLTABLE_RLOCK(); SLIST_FOREACH(llt, &lltables, llt_link) { if (llt->llt_af != af) continue; llt->llt_prefix_free(llt, prefix, mask, flags); } LLTABLE_RUNLOCK(); } struct lltable * lltable_allocate_htbl(uint32_t hsize) { struct lltable *llt; int i; llt = malloc(sizeof(struct lltable), M_LLTABLE, M_WAITOK | M_ZERO); llt->llt_hsize = hsize; llt->lle_head = malloc(sizeof(struct llentries) * hsize, M_LLTABLE, M_WAITOK | M_ZERO); for (i = 0; i < llt->llt_hsize; i++) LIST_INIT(&llt->lle_head[i]); /* Set some default callbacks */ llt->llt_link_entry = htable_link_entry; llt->llt_unlink_entry = htable_unlink_entry; llt->llt_prefix_free = htable_prefix_free; llt->llt_foreach_entry = htable_foreach_lle; llt->llt_free_tbl = htable_free_tbl; #ifdef MBUFTRACE llt->llt_mowner = NULL; #endif return (llt); } /* * Links lltable to global llt list. */ void lltable_link(struct lltable *llt) { LLTABLE_WLOCK(); SLIST_INSERT_HEAD(&lltables, llt, llt_link); LLTABLE_WUNLOCK(); } static void lltable_unlink(struct lltable *llt) { LLTABLE_WLOCK(); SLIST_REMOVE(&lltables, llt, lltable, llt_link); LLTABLE_WUNLOCK(); } /* * External methods used by lltable consumers */ int lltable_foreach_lle(struct lltable *llt, llt_foreach_cb_t *f, void *farg) { return (llt->llt_foreach_entry(llt, f, farg)); } void lltable_link_entry(struct lltable *llt, struct llentry *lle) { llt->llt_link_entry(llt, lle); } void lltable_unlink_entry(struct lltable *llt, struct llentry *lle) { llt->llt_unlink_entry(lle); } void lltable_free_entry(struct lltable *llt, struct llentry *lle) { llt->llt_free_entry(llt, lle); } void lltable_fill_sa_entry(const struct llentry *lle, struct sockaddr *sa) { struct lltable *llt; llt = lle->lle_tbl; llt->llt_fill_sa_entry(lle, sa); } struct ifnet * lltable_get_ifp(const struct lltable *llt) { return (llt->llt_ifp); } int lltable_get_af(const struct lltable *llt) { return (llt->llt_af); } /* * Called in route_output when rtm_flags contains RTF_LLDATA. */ int lla_rt_output(const u_char rtm_type, const int rtm_flags, const time_t rtm_expire, struct rt_addrinfo *info, int sdl_index) { const struct sockaddr_dl *dl = satocsdl(info->rti_info[RTAX_GATEWAY]); const struct sockaddr *dst = info->rti_info[RTAX_DST]; struct ifnet *ifp; struct lltable *llt; struct llentry *lle; u_int laflags; int error; struct psref psref; int bound; KASSERTMSG(dl != NULL && dl->sdl_family == AF_LINK, "invalid dl"); bound = curlwp_bind(); if (sdl_index != 0) ifp = if_get_byindex(sdl_index, &psref); else ifp = if_get_byindex(dl->sdl_index, &psref); if (ifp == NULL) { curlwp_bindx(bound); log(LOG_INFO, "%s: invalid ifp (sdl_index %d)\n", __func__, sdl_index != 0 ? 
sdl_index : dl->sdl_index); return EINVAL; } /* XXX linked list may be too expensive */ LLTABLE_RLOCK(); SLIST_FOREACH(llt, &lltables, llt_link) { if (llt->llt_af == dst->sa_family && llt->llt_ifp == ifp) break; } LLTABLE_RUNLOCK(); KASSERTMSG(llt != NULL, "Yep, ugly hacks are bad"); error = 0; switch (rtm_type) { case RTM_ADD: { struct rtentry *rt; /* Never call rtalloc1 with IF_AFDATA_WLOCK */ rt = rtalloc1(dst, 0); /* Add static LLE */ IF_AFDATA_WLOCK(ifp); lle = lla_lookup(llt, LLE_EXCLUSIVE, dst); /* Cannot overwrite an existing static entry */ if (lle != NULL && (lle->la_flags & LLE_STATIC || lle->la_expire == 0)) { LLE_RUNLOCK(lle); IF_AFDATA_WUNLOCK(ifp); if (rt != NULL) rt_unref(rt); error = EEXIST; goto out; } /* * We can't overwrite an existing entry to avoid race * conditions so remove it first. */ if (lle != NULL) { #if defined(INET) && NARP > 0 size_t pkts_dropped = llentry_free(lle); if (dst->sa_family == AF_INET) { arp_stat_add(ARP_STAT_DFRDROPPED, (uint64_t)pkts_dropped); } #else (void) llentry_free(lle); #endif } lle = lla_create(llt, 0, dst, rt); if (lle == NULL) { IF_AFDATA_WUNLOCK(ifp); if (rt != NULL) rt_unref(rt); error = ENOMEM; goto out; } KASSERT(ifp->if_addrlen <= sizeof(lle->ll_addr)); memcpy(&lle->ll_addr, CLLADDR(dl), ifp->if_addrlen); if ((rtm_flags & RTF_ANNOUNCE)) lle->la_flags |= LLE_PUB; lle->la_flags |= LLE_VALID; switch (dst->sa_family) { #ifdef INET case AF_INET: lle->ln_state = ND_LLINFO_REACHABLE; break; #endif #ifdef INET6 case AF_INET6: lle->ln_state = ND_LLINFO_REACHABLE; break; #endif } /* * NB: arp and ndp always set (RTF_STATIC | RTF_HOST) */ if (rtm_expire == 0) { lle->la_flags |= LLE_STATIC; lle->la_expire = 0; } else lle->la_expire = rtm_expire; laflags = lle->la_flags; LLE_WUNLOCK(lle); IF_AFDATA_WUNLOCK(ifp); if (rt != NULL) rt_unref(rt); #if defined(INET) && NARP > 0 /* gratuitous ARP */ if ((laflags & LLE_PUB) && dst->sa_family == AF_INET) { const struct sockaddr_in *sin; struct in_ifaddr *ia; struct psref _psref; sin = satocsin(dst); ia = in_get_ia_on_iface_psref(sin->sin_addr, ifp, &_psref); if (ia != NULL) { arpannounce(ifp, &ia->ia_ifa, CLLADDR(dl)); ia4_release(ia, &_psref); } } #else (void)laflags; #endif break; } case RTM_DELETE: IF_AFDATA_WLOCK(ifp); error = lla_delete(llt, 0, dst); IF_AFDATA_WUNLOCK(ifp); error = (error == 0 ? 
0 : ENOENT); break; default: error = EINVAL; } out: if_put(ifp, &psref); curlwp_bindx(bound); return (error); } void lltableinit(void) { SLIST_INIT(&lltables); rw_init(&lltable_rwlock); pool_init(&llentry_pool, sizeof(struct llentry), 0, 0, 0, "llentrypl", NULL, IPL_SOFTNET); } #ifdef __FreeBSD__ #ifdef DDB struct llentry_sa { struct llentry base; struct sockaddr l3_addr; }; static void llatbl_lle_show(struct llentry_sa *la) { struct llentry *lle; uint8_t octet[6]; lle = &la->base; db_printf("lle=%p\n", lle); db_printf(" lle_next=%p\n", lle->lle_next.le_next); db_printf(" lle_lock=%p\n", &lle->lle_lock); db_printf(" lle_tbl=%p\n", lle->lle_tbl); db_printf(" lle_head=%p\n", lle->lle_head); db_printf(" la_hold=%p\n", lle->la_hold); db_printf(" la_numheld=%d\n", lle->la_numheld); db_printf(" la_expire=%ju\n", (uintmax_t)lle->la_expire); db_printf(" la_flags=0x%04x\n", lle->la_flags); db_printf(" la_asked=%u\n", lle->la_asked); db_printf(" la_preempt=%u\n", lle->la_preempt); db_printf(" ln_byhint=%u\n", lle->ln_byhint); db_printf(" ln_state=%d\n", lle->ln_state); db_printf(" ln_router=%u\n", lle->ln_router); db_printf(" ln_ntick=%ju\n", (uintmax_t)lle->ln_ntick); db_printf(" lle_refcnt=%d\n", lle->lle_refcnt); memcpy(octet, &lle->ll_addr.mac16, sizeof(octet)); db_printf(" ll_addr=%02x:%02x:%02x:%02x:%02x:%02x\n", octet[0], octet[1], octet[2], octet[3], octet[4], octet[5]); db_printf(" lle_timer=%p\n", &lle->lle_timer); switch (la->l3_addr.sa_family) { #ifdef INET case AF_INET: { struct sockaddr_in *sin; char l3s[INET_ADDRSTRLEN]; sin = (struct sockaddr_in *)&la->l3_addr; inet_ntoa_r(sin->sin_addr, l3s); db_printf(" l3_addr=%s\n", l3s); break; } #endif #ifdef INET6 case AF_INET6: { struct sockaddr_in6 *sin6; char l3s[INET6_ADDRSTRLEN]; sin6 = (struct sockaddr_in6 *)&la->l3_addr; IN6_PRINT(l3s, &sin6->sin6_addr); db_printf(" l3_addr=%s\n", l3s); break; } #endif default: db_printf(" l3_addr=N/A (af=%d)\n", la->l3_addr.sa_family); break; } } DB_SHOW_COMMAND(llentry, db_show_llentry) { if (!have_addr) { db_printf("usage: show llentry <struct llentry *>\n"); return; } llatbl_lle_show((struct llentry_sa *)addr); } static void llatbl_llt_show(struct lltable *llt) { int i; struct llentry *lle; db_printf("llt=%p llt_af=%d llt_ifp=%p\n", llt, llt->llt_af, llt->llt_ifp); for (i = 0; i < llt->llt_hsize; i++) { LIST_FOREACH(lle, &llt->lle_head[i], lle_next) { llatbl_lle_show((struct llentry_sa *)lle); if (db_pager_quit) return; } } } DB_SHOW_COMMAND(lltable, db_show_lltable) { if (!have_addr) { db_printf("usage: show lltable <struct lltable *>\n"); return; } llatbl_llt_show((struct lltable *)addr); } DB_SHOW_ALL_COMMAND(lltables, db_show_all_lltables) { VNET_ITERATOR_DECL(vnet_iter); struct lltable *llt; VNET_FOREACH(vnet_iter) { CURVNET_SET_QUIET(vnet_iter); #ifdef VIMAGE db_printf("vnet=%p\n", curvnet); #endif SLIST_FOREACH(llt, &lltables, llt_link) { db_printf("llt=%p llt_af=%d llt_ifp=%p(%s)\n", llt, llt->llt_af, llt->llt_ifp, (llt->llt_ifp != NULL) ? llt->llt_ifp->if_xname : "?"); if (have_addr && addr != 0) /* verbose */ llatbl_llt_show(llt); if (db_pager_quit) { CURVNET_RESTORE(); return; } } CURVNET_RESTORE(); } } #endif /* DDB */ #endif /* __FreeBSD__ */
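/*
 * Illustrative sketch (not part of if_llatbl.c): a consumer-side walk over
 * an lltable using the external methods above (lltable_foreach_lle() and the
 * llt_foreach_cb_t callback type), taking the per-interface afdata lock the
 * same way lltable_dump_af() does.  The "example_" functions and the idea of
 * counting static entries are hypothetical.
 */
static int
example_count_static_cb(struct lltable *llt, struct llentry *lle, void *farg)
{
	size_t *nstatic = farg;

	if (lle->la_flags & LLE_STATIC)
		(*nstatic)++;
	return 0;	/* returning non-zero stops the walk */
}

static size_t
example_count_static(struct lltable *llt)
{
	size_t nstatic = 0;

	IF_AFDATA_RLOCK(llt->llt_ifp);
	lltable_foreach_lle(llt, example_count_static_cb, &nstatic);
	IF_AFDATA_RUNLOCK(llt->llt_ifp);

	return nstatic;
}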
/* $NetBSD: kern_clock.c,v 1.151 2023/09/02 17:44:59 riastradh Exp $ */ /*- * Copyright (c) 2000, 2004, 2006, 2007, 2008 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility, * NASA Ames Research Center. * This code is derived from software contributed to The NetBSD Foundation * by Charles M. Hannum. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /*- * Copyright (c) 1982, 1986, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_clock.c 8.5 (Berkeley) 1/21/94 */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: kern_clock.c,v 1.151 2023/09/02 17:44:59 riastradh Exp $"); #ifdef _KERNEL_OPT #include "opt_dtrace.h" #include "opt_gprof.h" #include "opt_multiprocessor.h" #endif #include <sys/param.h> #include <sys/systm.h> #include <sys/callout.h> #include <sys/kernel.h> #include <sys/proc.h> #include <sys/resourcevar.h> #include <sys/signalvar.h> #include <sys/sysctl.h> #include <sys/timex.h> #include <sys/sched.h> #include <sys/time.h> #include <sys/timetc.h> #include <sys/cpu.h> #include <sys/atomic.h> #include <sys/rndsource.h> #include <sys/heartbeat.h> #ifdef GPROF #include <sys/gmon.h> #endif #ifdef KDTRACE_HOOKS #include <sys/dtrace_bsd.h> #include <sys/cpu.h> cyclic_clock_func_t cyclic_clock_func[MAXCPUS]; #endif static int sysctl_kern_clockrate(SYSCTLFN_PROTO); /* * Clock handling routines. * * This code is written to operate with two timers that run independently of * each other. The main clock, running hz times per second, is used to keep * track of real time. The second timer handles kernel and user profiling, * and does resource use estimation. 
If the second timer is programmable, * it is randomized to avoid aliasing between the two clocks. For example, * the randomization prevents an adversary from always giving up the CPU * just before its quantum expires. Otherwise, it would never accumulate * CPU ticks. The mean frequency of the second timer is stathz. * * If no second timer exists, stathz will be zero; in this case we drive * profiling and statistics off the main clock. This WILL NOT be accurate; * do not do it unless absolutely necessary. * * The statistics clock may (or may not) be run at a higher rate while * profiling. This profile clock runs at profhz. We require that profhz * be an integral multiple of stathz. * * If the statistics clock is running fast, it must be divided by the ratio * profhz/stathz for statistics. (For profiling, every tick counts.) */ int stathz; int profhz; int profsrc; int schedhz; int profprocs; static int hardclock_ticks; static int hardscheddiv; /* hard => sched divider (used if schedhz == 0) */ static int psdiv; /* prof => stat divider */ int psratio; /* ratio: prof / stat */ struct clockrnd { struct krndsource source; unsigned needed; }; static struct clockrnd hardclockrnd __aligned(COHERENCY_UNIT); static struct clockrnd statclockrnd __aligned(COHERENCY_UNIT); static void clockrnd_get(size_t needed, void *cookie) { struct clockrnd *C = cookie; /* Start sampling. */ atomic_store_relaxed(&C->needed, 2*NBBY*needed); } static void clockrnd_sample(struct clockrnd *C) { struct cpu_info *ci = curcpu(); /* If there's nothing needed right now, stop here. */ if (__predict_true(atomic_load_relaxed(&C->needed) == 0)) return; /* * If we're not the primary core of a package, we're probably * driven by the same clock as the primary core, so don't * bother. */ if (ci != ci->ci_package1st) return; /* Take a sample and enter it into the pool. */ rnd_add_uint32(&C->source, 0); /* * On the primary CPU, count down. Using an atomic decrement * here isn't really necessary -- on every platform we care * about, stores to unsigned int are atomic, and the only other * memory operation that could happen here is for another CPU * to store a higher value for needed. But using an atomic * decrement avoids giving the impression of data races, and is * unlikely to hurt because only one CPU will ever be writing * to the location. */ if (CPU_IS_PRIMARY(curcpu())) { unsigned needed __diagused; needed = atomic_dec_uint_nv(&C->needed); KASSERT(needed != UINT_MAX); } } static u_int get_intr_timecount(struct timecounter *); static struct timecounter intr_timecounter = { .tc_get_timecount = get_intr_timecount, .tc_poll_pps = NULL, .tc_counter_mask = ~0u, .tc_frequency = 0, .tc_name = "clockinterrupt", /* quality - minimum implementation level for a clock */ .tc_quality = 0, .tc_priv = NULL, }; static u_int get_intr_timecount(struct timecounter *tc) { return (u_int)getticks(); } int getticks(void) { return atomic_load_relaxed(&hardclock_ticks); } /* * Initialize clock frequencies and start both clocks running. */ void initclocks(void) { static struct sysctllog *clog; int i; /* * Set divisors to 1 (normal case) and let the machine-specific * code do its bit. */ psdiv = 1; /* * Call cpu_initclocks() before registering the default * timecounter, in case it needs to adjust hz. */ const int old_hz = hz; cpu_initclocks(); if (old_hz != hz) { tick = 1000000 / hz; tickadj = (240000 / (60 * hz)) ? 
(240000 / (60 * hz)) : 1; } /* * provide minimum default time counter * will only run at interrupt resolution */ intr_timecounter.tc_frequency = hz; tc_init(&intr_timecounter); /* * Compute profhz and stathz, fix profhz if needed. */ i = stathz ? stathz : hz; if (profhz == 0) profhz = i; psratio = profhz / i; if (schedhz == 0) { /* 16Hz is best */ hardscheddiv = hz / 16; if (hardscheddiv <= 0) panic("hardscheddiv"); } sysctl_createv(&clog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_STRUCT, "clockrate", SYSCTL_DESCR("Kernel clock rates"), sysctl_kern_clockrate, 0, NULL, sizeof(struct clockinfo), CTL_KERN, KERN_CLOCKRATE, CTL_EOL); sysctl_createv(&clog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_INT, "hardclock_ticks", SYSCTL_DESCR("Number of hardclock ticks"), NULL, 0, &hardclock_ticks, sizeof(hardclock_ticks), CTL_KERN, KERN_HARDCLOCK_TICKS, CTL_EOL); rndsource_setcb(&hardclockrnd.source, clockrnd_get, &hardclockrnd); rnd_attach_source(&hardclockrnd.source, "hardclock", RND_TYPE_SKEW, RND_FLAG_COLLECT_TIME|RND_FLAG_ESTIMATE_TIME|RND_FLAG_HASCB); if (stathz) { rndsource_setcb(&statclockrnd.source, clockrnd_get, &statclockrnd); rnd_attach_source(&statclockrnd.source, "statclock", RND_TYPE_SKEW, (RND_FLAG_COLLECT_TIME|RND_FLAG_ESTIMATE_TIME| RND_FLAG_HASCB)); } } /* * The real-time timer, interrupting hz times per second. */ void hardclock(struct clockframe *frame) { struct lwp *l; struct cpu_info *ci; clockrnd_sample(&hardclockrnd); ci = curcpu(); l = ci->ci_onproc; ptimer_tick(l, CLKF_USERMODE(frame)); /* * If no separate statistics clock is available, run it from here. */ if (stathz == 0) statclock(frame); /* * If no separate schedclock is provided, call it here * at about 16 Hz. */ if (schedhz == 0) { if ((int)(--ci->ci_schedstate.spc_schedticks) <= 0) { schedclock(l); ci->ci_schedstate.spc_schedticks = hardscheddiv; } } if ((--ci->ci_schedstate.spc_ticks) <= 0) sched_tick(ci); if (CPU_IS_PRIMARY(ci)) { atomic_store_relaxed(&hardclock_ticks, atomic_load_relaxed(&hardclock_ticks) + 1); tc_ticktock(); } /* * Make sure the CPUs and timecounter are making progress. */ heartbeat(); /* * Update real-time timeout queue. */ callout_hardclock(); } /* * Start profiling on a process. * * Kernel profiling passes proc0 which never exits and hence * keeps the profile clock running constantly. */ void startprofclock(struct proc *p) { KASSERT(mutex_owned(&p->p_stmutex)); if ((p->p_stflag & PST_PROFIL) == 0) { p->p_stflag |= PST_PROFIL; /* * This is only necessary if using the clock as the * profiling source. */ if (++profprocs == 1 && stathz != 0) psdiv = psratio; } } /* * Stop profiling on a process. */ void stopprofclock(struct proc *p) { KASSERT(mutex_owned(&p->p_stmutex)); if (p->p_stflag & PST_PROFIL) { p->p_stflag &= ~PST_PROFIL; /* * This is only necessary if using the clock as the * profiling source. */ if (--profprocs == 0 && stathz != 0) psdiv = 1; } } void schedclock(struct lwp *l) { if ((l->l_flag & LW_IDLE) != 0) return; sched_schedclock(l); } /* * Statistics clock. Grab profile sample, and if divider reaches 0, * do process and kernel statistics. */ void statclock(struct clockframe *frame) { #ifdef GPROF struct gmonparam *g; intptr_t i; #endif struct cpu_info *ci = curcpu(); struct schedstate_percpu *spc = &ci->ci_schedstate; struct proc *p; struct lwp *l; if (stathz) clockrnd_sample(&statclockrnd); /* * Notice changes in divisor frequency, and adjust clock * frequency accordingly. 
*/ if (spc->spc_psdiv != psdiv) { spc->spc_psdiv = psdiv; spc->spc_pscnt = psdiv; if (psdiv == 1) { setstatclockrate(stathz); } else { setstatclockrate(profhz); } } l = ci->ci_onproc; if ((l->l_flag & LW_IDLE) != 0) { /* * don't account idle lwps as swapper. */ p = NULL; } else { p = l->l_proc; mutex_spin_enter(&p->p_stmutex); } if (CLKF_USERMODE(frame)) { KASSERT(p != NULL); if ((p->p_stflag & PST_PROFIL) && profsrc == PROFSRC_CLOCK) addupc_intr(l, CLKF_PC(frame)); if (--spc->spc_pscnt > 0) { mutex_spin_exit(&p->p_stmutex); return; } /* * Came from user mode; CPU was in user state. * If this process is being profiled record the tick. */ p->p_uticks++; if (p->p_nice > NZERO) spc->spc_cp_time[CP_NICE]++; else spc->spc_cp_time[CP_USER]++; } else { #ifdef GPROF /* * Kernel statistics are just like addupc_intr, only easier. */ #if defined(MULTIPROCESSOR) && !defined(_RUMPKERNEL) g = curcpu()->ci_gmon; if (g != NULL && profsrc == PROFSRC_CLOCK && g->state == GMON_PROF_ON) { #else g = &_gmonparam; if (profsrc == PROFSRC_CLOCK && g->state == GMON_PROF_ON) { #endif i = CLKF_PC(frame) - g->lowpc; if (i < g->textsize) { i /= HISTFRACTION * sizeof(*g->kcount); g->kcount[i]++; } } #endif #ifdef LWP_PC if (p != NULL && profsrc == PROFSRC_CLOCK && (p->p_stflag & PST_PROFIL)) { addupc_intr(l, LWP_PC(l)); } #endif if (--spc->spc_pscnt > 0) { if (p != NULL) mutex_spin_exit(&p->p_stmutex); return; } /* * Came from kernel mode, so we were: * - handling an interrupt, * - doing syscall or trap work on behalf of the current * user process, or * - spinning in the idle loop. * Whichever it is, charge the time as appropriate. * Note that we charge interrupts to the current process, * regardless of whether they are ``for'' that process, * so that we know how much of its real time was spent * in ``non-process'' (i.e., interrupt) work. */ if (CLKF_INTR(frame) || (curlwp->l_pflag & LP_INTR) != 0) { if (p != NULL) { p->p_iticks++; } spc->spc_cp_time[CP_INTR]++; } else if (p != NULL) { p->p_sticks++; spc->spc_cp_time[CP_SYS]++; } else { spc->spc_cp_time[CP_IDLE]++; } } spc->spc_pscnt = psdiv; if (p != NULL) { atomic_inc_uint(&l->l_cpticks); mutex_spin_exit(&p->p_stmutex); } #ifdef KDTRACE_HOOKS cyclic_clock_func_t func = cyclic_clock_func[cpu_index(ci)]; if (func) { (*func)((struct clockframe *)frame); } #endif } /* * sysctl helper routine for kern.clockrate. Assembles a struct on * the fly to be returned to the caller. */ static int sysctl_kern_clockrate(SYSCTLFN_ARGS) { struct clockinfo clkinfo; struct sysctlnode node; clkinfo.tick = tick; clkinfo.tickadj = tickadj; clkinfo.hz = hz; clkinfo.profhz = profhz; clkinfo.stathz = stathz ? stathz : hz; node = *rnode; node.sysctl_data = &clkinfo; return (sysctl_lookup(SYSCTLFN_CALL(&node))); }
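/*
 * Illustrative sketch (not part of kern_clock.c): the derived clock
 * parameters computed in initclocks() above, worked through for one example
 * configuration.  The values hz = 100, stathz = 128 and profhz = 1024 are
 * assumptions for the sake of the arithmetic, not defaults taken from the
 * code.  Compiles as a stand-alone userland check.
 */
#include <stdio.h>

int
main(void)
{
	int hz = 100, stathz = 128, profhz = 1024;

	int tick = 1000000 / hz;		/* 10000 us per hardclock tick */
	int tickadj = (240000 / (60 * hz)) ?
	    (240000 / (60 * hz)) : 1;		/* 240000/6000 = 40 */
	int i = stathz ? stathz : hz;		/* statistics clock rate */
	int psratio = profhz / i;		/* 1024/128 = 8 */
	int hardscheddiv = hz / 16;		/* schedclock at ~16 Hz: 100/16 = 6 */

	printf("tick=%d tickadj=%d psratio=%d hardscheddiv=%d\n",
	    tick, tickadj, psratio, hardscheddiv);
	return 0;
}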
/*	$NetBSD: vfs_vnode.c,v 1.153 2023/11/27 16:13:59 hannken Exp $	*/

/*-
 * Copyright (c) 1997-2011, 2019, 2020 The NetBSD Foundation, Inc.
 * All rights reserved.
* * This code is derived from software contributed to The NetBSD Foundation * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility, * NASA Ames Research Center, by Charles M. Hannum, and by Andrew Doran. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)vfs_subr.c 8.13 (Berkeley) 4/18/94 */ /* * The vnode cache subsystem. * * Life-cycle * * Normally, there are two points where new vnodes are created: * VOP_CREATE(9) and VOP_LOOKUP(9). 
The life-cycle of a vnode * starts in one of the following ways: * * - Allocation, via vcache_get(9) or vcache_new(9). * - Reclamation of inactive vnode, via vcache_vget(9). * * Recycle from a free list, via getnewvnode(9) -> getcleanvnode(9) * was another, traditional way. Currently, only the draining thread * recycles the vnodes. This behaviour might be revisited. * * The life-cycle ends when the last reference is dropped, usually * in VOP_REMOVE(9). In such case, VOP_INACTIVE(9) is called to inform * the file system that vnode is inactive. Via this call, file system * indicates whether vnode can be recycled (usually, it checks its own * references, e.g. count of links, whether the file was removed). * * Depending on indication, vnode can be put into a free list (cache), * or cleaned via vcache_reclaim, which calls VOP_RECLAIM(9) to * disassociate underlying file system from the vnode, and finally * destroyed. * * Vnode state * * Vnode is always in one of six states: * - MARKER This is a marker vnode to help list traversal. It * will never change its state. * - LOADING Vnode is associating underlying file system and not * yet ready to use. * - LOADED Vnode has associated underlying file system and is * ready to use. * - BLOCKED Vnode is active but cannot get new references. * - RECLAIMING Vnode is disassociating from the underlying file * system. * - RECLAIMED Vnode has disassociated from underlying file system * and is dead. * * Valid state changes are: * LOADING -> LOADED * Vnode has been initialised in vcache_get() or * vcache_new() and is ready to use. * BLOCKED -> RECLAIMING * Vnode starts disassociation from underlying file * system in vcache_reclaim(). * RECLAIMING -> RECLAIMED * Vnode finished disassociation from underlying file * system in vcache_reclaim(). * LOADED -> BLOCKED * Either vcache_rekey*() is changing the vnode key or * vrelel() is about to call VOP_INACTIVE(). * BLOCKED -> LOADED * The block condition is over. * LOADING -> RECLAIMED * Either vcache_get() or vcache_new() failed to * associate the underlying file system or vcache_rekey*() * drops a vnode used as placeholder. * * Of these states LOADING, BLOCKED and RECLAIMING are intermediate * and it is possible to wait for state change. * * State is protected with v_interlock with one exception: * to change from LOADING both v_interlock and vcache_lock must be held * so it is possible to check "state == LOADING" without holding * v_interlock. See vcache_get() for details. * * Reference counting * * Vnode is considered active, if reference count (vnode_t::v_usecount) * is non-zero. It is maintained using: vref(9) and vrele(9), as well * as vput(9), routines. Common points holding references are e.g. * file openings, current working directory, mount points, etc. * * v_usecount is adjusted with atomic operations, however to change * from a non-zero value to zero the interlock must also be held. 
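 *
 * As a rough, illustrative sketch (the names are generic and not taken
 * from this file), a file system typically obtains a vnode by key,
 * uses it and then releases it again:
 *
 *	error = vcache_get(mp, &key, sizeof(key), &vp);
 *	if (error == 0) {
 *		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 *		...
 *		vput(vp);	releases both the lock and the reference
 *	}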
*/ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: vfs_vnode.c,v 1.153 2023/11/27 16:13:59 hannken Exp $"); #ifdef _KERNEL_OPT #include "opt_pax.h" #endif #include <sys/param.h> #include <sys/kernel.h> #include <sys/atomic.h> #include <sys/buf.h> #include <sys/conf.h> #include <sys/device.h> #include <sys/hash.h> #include <sys/kauth.h> #include <sys/kmem.h> #include <sys/module.h> #include <sys/mount.h> #include <sys/namei.h> #include <sys/pax.h> #include <sys/syscallargs.h> #include <sys/sysctl.h> #include <sys/systm.h> #include <sys/threadpool.h> #include <sys/vnode_impl.h> #include <sys/wapbl.h> #include <sys/fstrans.h> #include <miscfs/deadfs/deadfs.h> #include <miscfs/specfs/specdev.h> #include <uvm/uvm.h> #include <uvm/uvm_readahead.h> #include <uvm/uvm_stat.h> /* Flags to vrelel. */ #define VRELEL_ASYNC 0x0001 /* Always defer to vrele thread. */ #define LRU_VRELE 0 #define LRU_FREE 1 #define LRU_HOLD 2 #define LRU_COUNT 3 /* * There are three lru lists: one holds vnodes waiting for async release, * one is for vnodes which have no buffer/page references and one for those * which do (i.e. v_holdcnt is non-zero). We put the lists into a single, * private cache line as vnodes migrate between them while under the same * lock (vdrain_lock). */ typedef struct { vnode_impl_t *li_marker; } lru_iter_t; u_int numvnodes __cacheline_aligned; static vnodelst_t lru_list[LRU_COUNT] __cacheline_aligned; static struct threadpool *threadpool; static struct threadpool_job vdrain_job; static struct threadpool_job vrele_job; static kmutex_t vdrain_lock __cacheline_aligned; SLIST_HEAD(hashhead, vnode_impl); static kmutex_t vcache_lock __cacheline_aligned; static kcondvar_t vcache_cv; static u_int vcache_hashsize; static u_long vcache_hashmask; static struct hashhead *vcache_hashtab; static pool_cache_t vcache_pool; static void lru_requeue(vnode_t *, vnodelst_t *); static vnodelst_t * lru_which(vnode_t *); static vnode_impl_t * lru_iter_first(int, lru_iter_t *); static vnode_impl_t * lru_iter_next(lru_iter_t *); static void lru_iter_release(lru_iter_t *); static vnode_impl_t * vcache_alloc(void); static void vcache_dealloc(vnode_impl_t *); static void vcache_free(vnode_impl_t *); static void vcache_init(void); static void vcache_reinit(void); static void vcache_reclaim(vnode_t *); static void vrele_deferred(vnode_impl_t *); static void vrelel(vnode_t *, int, int); static void vnpanic(vnode_t *, const char *, ...) __printflike(2, 3); static bool vdrain_one(u_int); static void vdrain_task(struct threadpool_job *); static void vrele_task(struct threadpool_job *); /* Routines having to do with the management of the vnode table. */ /* * The high bit of v_usecount is a gate for vcache_tryvget(). It's set * only when the vnode state is LOADED. * The next bit of v_usecount is a flag for vrelel(). It's set * from vcache_vget() and vcache_tryvget() whenever the operation succeeds. */ #define VUSECOUNT_MASK 0x3fffffff #define VUSECOUNT_GATE 0x80000000 #define VUSECOUNT_VGET 0x40000000 /* * Return the current usecount of a vnode. */ inline int vrefcnt(struct vnode *vp) { return atomic_load_relaxed(&vp->v_usecount) & VUSECOUNT_MASK; } /* Vnode state operations and diagnostics. 
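 * With DIAGNOSTIC kernels the VSTATE_GET, VSTATE_CHANGE and
 * VSTATE_WAIT_STABLE macros below expand to asserting versions that
 * check every transition against the rules documented above and call
 * vnpanic() on a mismatch; otherwise they compile to the plain state
 * operations with no checking.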
*/ #if defined(DIAGNOSTIC) #define VSTATE_VALID(state) \ ((state) != VS_ACTIVE && (state) != VS_MARKER) #define VSTATE_GET(vp) \ vstate_assert_get((vp), __func__, __LINE__) #define VSTATE_CHANGE(vp, from, to) \ vstate_assert_change((vp), (from), (to), __func__, __LINE__) #define VSTATE_WAIT_STABLE(vp) \ vstate_assert_wait_stable((vp), __func__, __LINE__) void _vstate_assert(vnode_t *vp, enum vnode_state state, const char *func, int line, bool has_lock) { vnode_impl_t *vip = VNODE_TO_VIMPL(vp); int refcnt = vrefcnt(vp); if (!has_lock) { enum vnode_state vstate = atomic_load_relaxed(&vip->vi_state); if (state == VS_ACTIVE && refcnt > 0 && (vstate == VS_LOADED || vstate == VS_BLOCKED)) return; if (vstate == state) return; mutex_enter((vp)->v_interlock); } KASSERTMSG(mutex_owned(vp->v_interlock), "at %s:%d", func, line); if ((state == VS_ACTIVE && refcnt > 0 && (vip->vi_state == VS_LOADED || vip->vi_state == VS_BLOCKED)) || vip->vi_state == state) { if (!has_lock) mutex_exit((vp)->v_interlock); return; } vnpanic(vp, "state is %s, usecount %d, expected %s at %s:%d", vstate_name(vip->vi_state), refcnt, vstate_name(state), func, line); } static enum vnode_state vstate_assert_get(vnode_t *vp, const char *func, int line) { vnode_impl_t *vip = VNODE_TO_VIMPL(vp); KASSERTMSG(mutex_owned(vp->v_interlock), "at %s:%d", func, line); if (! VSTATE_VALID(vip->vi_state)) vnpanic(vp, "state is %s at %s:%d", vstate_name(vip->vi_state), func, line); return vip->vi_state; } static void vstate_assert_wait_stable(vnode_t *vp, const char *func, int line) { vnode_impl_t *vip = VNODE_TO_VIMPL(vp); KASSERTMSG(mutex_owned(vp->v_interlock), "at %s:%d", func, line); if (! VSTATE_VALID(vip->vi_state)) vnpanic(vp, "state is %s at %s:%d", vstate_name(vip->vi_state), func, line); while (vip->vi_state != VS_LOADED && vip->vi_state != VS_RECLAIMED) cv_wait(&vp->v_cv, vp->v_interlock); if (! VSTATE_VALID(vip->vi_state)) vnpanic(vp, "state is %s at %s:%d", vstate_name(vip->vi_state), func, line); } static void vstate_assert_change(vnode_t *vp, enum vnode_state from, enum vnode_state to, const char *func, int line) { bool gated = (atomic_load_relaxed(&vp->v_usecount) & VUSECOUNT_GATE); vnode_impl_t *vip = VNODE_TO_VIMPL(vp); KASSERTMSG(mutex_owned(vp->v_interlock), "at %s:%d", func, line); if (from == VS_LOADING) KASSERTMSG(mutex_owned(&vcache_lock), "at %s:%d", func, line); if (! VSTATE_VALID(from)) vnpanic(vp, "from is %s at %s:%d", vstate_name(from), func, line); if (! VSTATE_VALID(to)) vnpanic(vp, "to is %s at %s:%d", vstate_name(to), func, line); if (vip->vi_state != from) vnpanic(vp, "from is %s, expected %s at %s:%d\n", vstate_name(vip->vi_state), vstate_name(from), func, line); if ((from == VS_LOADED) != gated) vnpanic(vp, "state is %s, gate %d does not match at %s:%d\n", vstate_name(vip->vi_state), gated, func, line); /* Open/close the gate for vcache_tryvget(). 
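 * The gate is the VUSECOUNT_GATE bit in v_usecount.  It is set only
 * while the vnode is in the LOADED state, which is what allows
 * vcache_tryvget() to take a new reference with a single atomic
 * operation and without holding v_interlock.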
*/ if (to == VS_LOADED) { membar_release(); atomic_or_uint(&vp->v_usecount, VUSECOUNT_GATE); } else { atomic_and_uint(&vp->v_usecount, ~VUSECOUNT_GATE); } atomic_store_relaxed(&vip->vi_state, to); if (from == VS_LOADING) cv_broadcast(&vcache_cv); if (to == VS_LOADED || to == VS_RECLAIMED) cv_broadcast(&vp->v_cv); } #else /* defined(DIAGNOSTIC) */ #define VSTATE_GET(vp) \ (VNODE_TO_VIMPL((vp))->vi_state) #define VSTATE_CHANGE(vp, from, to) \ vstate_change((vp), (from), (to)) #define VSTATE_WAIT_STABLE(vp) \ vstate_wait_stable((vp)) void _vstate_assert(vnode_t *vp, enum vnode_state state, const char *func, int line, bool has_lock) { } static void vstate_wait_stable(vnode_t *vp) { vnode_impl_t *vip = VNODE_TO_VIMPL(vp); while (vip->vi_state != VS_LOADED && vip->vi_state != VS_RECLAIMED) cv_wait(&vp->v_cv, vp->v_interlock); } static void vstate_change(vnode_t *vp, enum vnode_state from, enum vnode_state to) { vnode_impl_t *vip = VNODE_TO_VIMPL(vp); /* Open/close the gate for vcache_tryvget(). */ if (to == VS_LOADED) { membar_release(); atomic_or_uint(&vp->v_usecount, VUSECOUNT_GATE); } else { atomic_and_uint(&vp->v_usecount, ~VUSECOUNT_GATE); } atomic_store_relaxed(&vip->vi_state, to); if (from == VS_LOADING) cv_broadcast(&vcache_cv); if (to == VS_LOADED || to == VS_RECLAIMED) cv_broadcast(&vp->v_cv); } #endif /* defined(DIAGNOSTIC) */ void vfs_vnode_sysinit(void) { int error __diagused, i; dead_rootmount = vfs_mountalloc(&dead_vfsops, NULL); KASSERT(dead_rootmount != NULL); dead_rootmount->mnt_iflag |= IMNT_MPSAFE; mutex_init(&vdrain_lock, MUTEX_DEFAULT, IPL_NONE); for (i = 0; i < LRU_COUNT; i++) { TAILQ_INIT(&lru_list[i]); } vcache_init(); error = threadpool_get(&threadpool, PRI_NONE); KASSERTMSG((error == 0), "threadpool_get failed: %d", error); threadpool_job_init(&vdrain_job, vdrain_task, &vdrain_lock, "vdrain"); threadpool_job_init(&vrele_job, vrele_task, &vdrain_lock, "vrele"); } /* * Allocate a new marker vnode. */ vnode_t * vnalloc_marker(struct mount *mp) { vnode_impl_t *vip; vnode_t *vp; vip = pool_cache_get(vcache_pool, PR_WAITOK); memset(vip, 0, sizeof(*vip)); vp = VIMPL_TO_VNODE(vip); uvm_obj_init(&vp->v_uobj, &uvm_vnodeops, true, 1); vp->v_mount = mp; vp->v_type = VBAD; vp->v_interlock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE); klist_init(&vip->vi_klist.vk_klist); vp->v_klist = &vip->vi_klist; vip->vi_state = VS_MARKER; return vp; } /* * Free a marker vnode. */ void vnfree_marker(vnode_t *vp) { vnode_impl_t *vip; vip = VNODE_TO_VIMPL(vp); KASSERT(vip->vi_state == VS_MARKER); mutex_obj_free(vp->v_interlock); uvm_obj_destroy(&vp->v_uobj, true); klist_fini(&vip->vi_klist.vk_klist); pool_cache_put(vcache_pool, vip); } /* * Test a vnode for being a marker vnode. */ bool vnis_marker(vnode_t *vp) { return (VNODE_TO_VIMPL(vp)->vi_state == VS_MARKER); } /* * Return the lru list this node should be on. */ static vnodelst_t * lru_which(vnode_t *vp) { KASSERT(mutex_owned(vp->v_interlock)); if (vp->v_holdcnt > 0) return &lru_list[LRU_HOLD]; else return &lru_list[LRU_FREE]; } /* * Put vnode to end of given list. * Both the current and the new list may be NULL, used on vnode alloc/free. * Adjust numvnodes and signal vdrain thread if there is work. */ static void lru_requeue(vnode_t *vp, vnodelst_t *listhd) { vnode_impl_t *vip; int d; /* * If the vnode is on the correct list, and was put there recently, * then leave it be, thus avoiding huge cache and lock contention. 
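 * "Recently" means within the last hz ticks (about one second), as
 * recorded in vi_lrulisttm by the previous requeue.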
*/ vip = VNODE_TO_VIMPL(vp); if (listhd == vip->vi_lrulisthd && (getticks() - vip->vi_lrulisttm) < hz) { return; } mutex_enter(&vdrain_lock); d = 0; if (vip->vi_lrulisthd != NULL) TAILQ_REMOVE(vip->vi_lrulisthd, vip, vi_lrulist); else d++; vip->vi_lrulisthd = listhd; vip->vi_lrulisttm = getticks(); if (vip->vi_lrulisthd != NULL) TAILQ_INSERT_TAIL(vip->vi_lrulisthd, vip, vi_lrulist); else d--; if (d != 0) { /* * Looks strange? This is not a bug. Don't store * numvnodes unless there is a change - avoid false * sharing on MP. */ numvnodes += d; } if (listhd == &lru_list[LRU_VRELE]) threadpool_schedule_job(threadpool, &vrele_job); if (d > 0 && numvnodes > desiredvnodes) threadpool_schedule_job(threadpool, &vdrain_job); if (d > 0 && numvnodes > desiredvnodes + desiredvnodes / 16) kpause("vnfull", false, MAX(1, mstohz(10)), &vdrain_lock); mutex_exit(&vdrain_lock); } /* * LRU list iterator. * Caller holds vdrain_lock. */ static vnode_impl_t * lru_iter_first(int idx, lru_iter_t *iterp) { vnode_impl_t *marker; KASSERT(mutex_owned(&vdrain_lock)); mutex_exit(&vdrain_lock); marker = VNODE_TO_VIMPL(vnalloc_marker(NULL)); mutex_enter(&vdrain_lock); marker->vi_lrulisthd = &lru_list[idx]; iterp->li_marker = marker; TAILQ_INSERT_HEAD(marker->vi_lrulisthd, marker, vi_lrulist); return lru_iter_next(iterp); } static vnode_impl_t * lru_iter_next(lru_iter_t *iter) { vnode_impl_t *vip, *marker; vnodelst_t *listhd; KASSERT(mutex_owned(&vdrain_lock)); marker = iter->li_marker; listhd = marker->vi_lrulisthd; while ((vip = TAILQ_NEXT(marker, vi_lrulist))) { TAILQ_REMOVE(listhd, marker, vi_lrulist); TAILQ_INSERT_AFTER(listhd, vip, marker, vi_lrulist); if (!vnis_marker(VIMPL_TO_VNODE(vip))) break; } return vip; } static void lru_iter_release(lru_iter_t *iter) { vnode_impl_t *marker; KASSERT(mutex_owned(&vdrain_lock)); marker = iter->li_marker; TAILQ_REMOVE(marker->vi_lrulisthd, marker, vi_lrulist); mutex_exit(&vdrain_lock); vnfree_marker(VIMPL_TO_VNODE(marker)); mutex_enter(&vdrain_lock); } /* * Release deferred vrele vnodes for this mount. * Called with file system suspended. */ void vrele_flush(struct mount *mp) { lru_iter_t iter; vnode_impl_t *vip; KASSERT(fstrans_is_owner(mp)); mutex_enter(&vdrain_lock); for (vip = lru_iter_first(LRU_VRELE, &iter); vip != NULL; vip = lru_iter_next(&iter)) { if (VIMPL_TO_VNODE(vip)->v_mount != mp) continue; vrele_deferred(vip); } lru_iter_release(&iter); mutex_exit(&vdrain_lock); } /* * One pass through the LRU lists to keep the number of allocated * vnodes below target. Returns true if target met. */ static bool vdrain_one(u_int target) { int ix, lists[] = { LRU_FREE, LRU_HOLD }; lru_iter_t iter; vnode_impl_t *vip; vnode_t *vp; struct mount *mp; KASSERT(mutex_owned(&vdrain_lock)); for (ix = 0; ix < __arraycount(lists); ix++) { for (vip = lru_iter_first(lists[ix], &iter); vip != NULL; vip = lru_iter_next(&iter)) { if (numvnodes < target) { lru_iter_release(&iter); return true; } vp = VIMPL_TO_VNODE(vip); /* Probe usecount (unlocked). */ if (vrefcnt(vp) > 0) continue; /* Try v_interlock -- we lock the wrong direction! */ if (!mutex_tryenter(vp->v_interlock)) continue; /* Probe usecount and state. 
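 * This second check is made with v_interlock held and is therefore
 * authoritative; the unlocked probe above only filtered out obviously
 * busy vnodes cheaply.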
*/ if (vrefcnt(vp) > 0 || VSTATE_GET(vp) != VS_LOADED) { mutex_exit(vp->v_interlock); continue; } mutex_exit(&vdrain_lock); mp = vp->v_mount; if (fstrans_start_nowait(mp) != 0) { mutex_exit(vp->v_interlock); mutex_enter(&vdrain_lock); continue; } if (vcache_vget(vp) == 0) { if (!vrecycle(vp)) { vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); mutex_enter(vp->v_interlock); vrelel(vp, 0, LK_EXCLUSIVE); } } fstrans_done(mp); mutex_enter(&vdrain_lock); } lru_iter_release(&iter); } return false; } /* * threadpool task to keep the number of vnodes below desiredvnodes. */ static void vdrain_task(struct threadpool_job *job) { u_int target; target = desiredvnodes - desiredvnodes / 16; mutex_enter(&vdrain_lock); while (!vdrain_one(target)) kpause("vdrain", false, 1, &vdrain_lock); threadpool_job_done(job); mutex_exit(&vdrain_lock); } /* * threadpool task to process asynchronous vrele. */ static void vrele_task(struct threadpool_job *job) { int skipped; lru_iter_t iter; vnode_impl_t *vip; struct mount *mp; mutex_enter(&vdrain_lock); while ((vip = lru_iter_first(LRU_VRELE, &iter)) != NULL) { for (skipped = 0; vip != NULL; vip = lru_iter_next(&iter)) { mp = VIMPL_TO_VNODE(vip)->v_mount; if (fstrans_start_nowait(mp) == 0) { vrele_deferred(vip); fstrans_done(mp); } else { skipped++; } } lru_iter_release(&iter); if (skipped) kpause("vrele", false, MAX(1, mstohz(10)), &vdrain_lock); } threadpool_job_done(job); lru_iter_release(&iter); mutex_exit(&vdrain_lock); } /* * Try to drop reference on a vnode. Abort if we are releasing the * last reference. Note: this _must_ succeed if not the last reference. */ static bool vtryrele(vnode_t *vp) { u_int use, next; membar_release(); for (use = atomic_load_relaxed(&vp->v_usecount);; use = next) { if (__predict_false((use & VUSECOUNT_MASK) == 1)) { return false; } KASSERT((use & VUSECOUNT_MASK) > 1); next = atomic_cas_uint(&vp->v_usecount, use, use - 1); if (__predict_true(next == use)) { return true; } } } /* * vput: unlock and release the reference. */ void vput(vnode_t *vp) { int lktype; /* * Do an unlocked check of the usecount. If it looks like we're not * about to drop the last reference, then unlock the vnode and try * to drop the reference. If it ends up being the last reference * after all, vrelel() can fix it all up. Most of the time this * will all go to plan. */ if (vrefcnt(vp) > 1) { VOP_UNLOCK(vp); if (vtryrele(vp)) { return; } lktype = LK_NONE; } else { lktype = VOP_ISLOCKED(vp); KASSERT(lktype != LK_NONE); } mutex_enter(vp->v_interlock); vrelel(vp, 0, lktype); } /* * Release a vnode from the deferred list. */ static void vrele_deferred(vnode_impl_t *vip) { vnode_t *vp; KASSERT(mutex_owned(&vdrain_lock)); KASSERT(vip->vi_lrulisthd == &lru_list[LRU_VRELE]); vp = VIMPL_TO_VNODE(vip); /* * First remove the vnode from the vrele list. * Put it on the last lru list, the last vrele() * will put it back onto the right list before * its usecount reaches zero. */ TAILQ_REMOVE(vip->vi_lrulisthd, vip, vi_lrulist); vip->vi_lrulisthd = &lru_list[LRU_HOLD]; vip->vi_lrulisttm = getticks(); TAILQ_INSERT_TAIL(vip->vi_lrulisthd, vip, vi_lrulist); mutex_exit(&vdrain_lock); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); mutex_enter(vp->v_interlock); vrelel(vp, 0, LK_EXCLUSIVE); mutex_enter(&vdrain_lock); } /* * Vnode release. If reference count drops to zero, call inactive * routine and either return to freelist or free to the pool. 
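 * Called with v_interlock held.  The lktype argument tells vrelel()
 * how the caller holds the vnode lock (LK_NONE, LK_SHARED or
 * LK_EXCLUSIVE); the lock is released, upgraded or handed over to
 * vcache_reclaim() as the release proceeds.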
*/ static void vrelel(vnode_t *vp, int flags, int lktype) { const bool async = ((flags & VRELEL_ASYNC) != 0); bool recycle, defer, objlock_held; u_int use, next; int error; objlock_held = false; retry: KASSERT(mutex_owned(vp->v_interlock)); if (__predict_false(vp->v_op == dead_vnodeop_p && VSTATE_GET(vp) != VS_RECLAIMED)) { vnpanic(vp, "dead but not clean"); } /* * If not the last reference, just unlock and drop the reference count. * * Otherwise make sure we pass a point in time where we hold the * last reference with VGET flag unset. */ for (use = atomic_load_relaxed(&vp->v_usecount);; use = next) { if (__predict_false((use & VUSECOUNT_MASK) > 1)) { if (objlock_held) { objlock_held = false; rw_exit(vp->v_uobj.vmobjlock); } if (lktype != LK_NONE) { mutex_exit(vp->v_interlock); lktype = LK_NONE; VOP_UNLOCK(vp); mutex_enter(vp->v_interlock); } if (vtryrele(vp)) { mutex_exit(vp->v_interlock); return; } next = atomic_load_relaxed(&vp->v_usecount); continue; } KASSERT((use & VUSECOUNT_MASK) == 1); next = use & ~VUSECOUNT_VGET; if (next != use) { next = atomic_cas_uint(&vp->v_usecount, use, next); } if (__predict_true(next == use)) { break; } } membar_acquire(); if (vrefcnt(vp) <= 0 || vp->v_writecount != 0) { vnpanic(vp, "%s: bad ref count", __func__); } #ifdef DIAGNOSTIC if ((vp->v_type == VBLK || vp->v_type == VCHR) && vp->v_specnode != NULL && vp->v_specnode->sn_opencnt != 0) { vprint("vrelel: missing VOP_CLOSE()", vp); } #endif /* * If already clean there is no need to lock, defer or * deactivate this node. */ if (VSTATE_GET(vp) == VS_RECLAIMED) { if (objlock_held) { objlock_held = false; rw_exit(vp->v_uobj.vmobjlock); } if (lktype != LK_NONE) { mutex_exit(vp->v_interlock); lktype = LK_NONE; VOP_UNLOCK(vp); mutex_enter(vp->v_interlock); } goto out; } /* * First try to get the vnode locked for VOP_INACTIVE(). * Defer vnode release to vrele task if caller requests * it explicitly, is the pagedaemon or the lock failed. */ defer = false; if ((curlwp == uvm.pagedaemon_lwp) || async) { defer = true; } else if (lktype == LK_SHARED) { /* Excellent chance of getting, if the last ref. */ error = vn_lock(vp, LK_UPGRADE | LK_RETRY | LK_NOWAIT); if (error != 0) { defer = true; } else { lktype = LK_EXCLUSIVE; } } else if (lktype == LK_NONE) { /* Excellent chance of getting, if the last ref. */ error = vn_lock(vp, LK_EXCLUSIVE | LK_RETRY | LK_NOWAIT); if (error != 0) { defer = true; } else { lktype = LK_EXCLUSIVE; } } KASSERT(mutex_owned(vp->v_interlock)); if (defer) { /* * Defer reclaim to the vrele task; it's not safe to * clean it here. We donate it our last reference. */ if (lktype != LK_NONE) { mutex_exit(vp->v_interlock); VOP_UNLOCK(vp); mutex_enter(vp->v_interlock); } lru_requeue(vp, &lru_list[LRU_VRELE]); mutex_exit(vp->v_interlock); return; } KASSERT(lktype == LK_EXCLUSIVE); /* If the node gained another reference, retry. */ use = atomic_load_relaxed(&vp->v_usecount); if ((use & VUSECOUNT_VGET) != 0) { goto retry; } KASSERT((use & VUSECOUNT_MASK) == 1); if ((vp->v_iflag & (VI_TEXT|VI_EXECMAP|VI_WRMAP)) != 0 || (vp->v_vflag & VV_MAPPED) != 0) { /* Take care of space accounting. 
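 * Clearing the mapping related flags needs the uvm object lock as
 * well; try to take it without blocking first, otherwise drop
 * v_interlock, take both locks in the blocking order and retry from
 * the top.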
*/ if (!objlock_held) { objlock_held = true; if (!rw_tryenter(vp->v_uobj.vmobjlock, RW_WRITER)) { mutex_exit(vp->v_interlock); rw_enter(vp->v_uobj.vmobjlock, RW_WRITER); mutex_enter(vp->v_interlock); goto retry; } } if ((vp->v_iflag & VI_EXECMAP) != 0) { cpu_count(CPU_COUNT_EXECPAGES, -vp->v_uobj.uo_npages); } vp->v_iflag &= ~(VI_TEXT|VI_EXECMAP|VI_WRMAP); vp->v_vflag &= ~VV_MAPPED; } if (objlock_held) { objlock_held = false; rw_exit(vp->v_uobj.vmobjlock); } /* * Deactivate the vnode, but preserve our reference across * the call to VOP_INACTIVE(). * * If VOP_INACTIVE() indicates that the file has been * deleted, then recycle the vnode. * * Note that VOP_INACTIVE() will not drop the vnode lock. */ mutex_exit(vp->v_interlock); recycle = false; VOP_INACTIVE(vp, &recycle); if (!recycle) { lktype = LK_NONE; VOP_UNLOCK(vp); } mutex_enter(vp->v_interlock); /* * Block new references then check again to see if a * new reference was acquired in the meantime. If * it was, restore the vnode state and try again. */ if (recycle) { VSTATE_CHANGE(vp, VS_LOADED, VS_BLOCKED); use = atomic_load_relaxed(&vp->v_usecount); if ((use & VUSECOUNT_VGET) != 0) { VSTATE_CHANGE(vp, VS_BLOCKED, VS_LOADED); goto retry; } KASSERT((use & VUSECOUNT_MASK) == 1); } /* * Recycle the vnode if the file is now unused (unlinked). */ if (recycle) { VSTATE_ASSERT(vp, VS_BLOCKED); KASSERT(lktype == LK_EXCLUSIVE); /* vcache_reclaim drops the lock. */ lktype = LK_NONE; vcache_reclaim(vp); } KASSERT(vrefcnt(vp) > 0); KASSERT(lktype == LK_NONE); out: for (use = atomic_load_relaxed(&vp->v_usecount);; use = next) { if (__predict_false((use & VUSECOUNT_VGET) != 0 && (use & VUSECOUNT_MASK) == 1)) { /* Gained and released another reference, retry. */ goto retry; } next = atomic_cas_uint(&vp->v_usecount, use, use - 1); if (__predict_true(next == use)) { if (__predict_false((use & VUSECOUNT_MASK) != 1)) { /* Gained another reference. */ mutex_exit(vp->v_interlock); return; } break; } } membar_acquire(); if (VSTATE_GET(vp) == VS_RECLAIMED && vp->v_holdcnt == 0) { /* * It's clean so destroy it. It isn't referenced * anywhere since it has been reclaimed. */ vcache_free(VNODE_TO_VIMPL(vp)); } else { /* * Otherwise, put it back onto the freelist. It * can't be destroyed while still associated with * a file system. */ lru_requeue(vp, lru_which(vp)); mutex_exit(vp->v_interlock); } } void vrele(vnode_t *vp) { if (vtryrele(vp)) { return; } mutex_enter(vp->v_interlock); vrelel(vp, 0, LK_NONE); } /* * Asynchronous vnode release, vnode is released in different context. */ void vrele_async(vnode_t *vp) { if (vtryrele(vp)) { return; } mutex_enter(vp->v_interlock); vrelel(vp, VRELEL_ASYNC, LK_NONE); } /* * Vnode reference, where a reference is already held by some other * object (for example, a file structure). * * NB: lockless code sequences may rely on this not blocking. */ void vref(vnode_t *vp) { KASSERT(vrefcnt(vp) > 0); atomic_inc_uint(&vp->v_usecount); } /* * Page or buffer structure gets a reference. * Called with v_interlock held. */ void vholdl(vnode_t *vp) { KASSERT(mutex_owned(vp->v_interlock)); if (vp->v_holdcnt++ == 0 && vrefcnt(vp) == 0) lru_requeue(vp, lru_which(vp)); } /* * Page or buffer structure gets a reference. */ void vhold(vnode_t *vp) { mutex_enter(vp->v_interlock); vholdl(vp); mutex_exit(vp->v_interlock); } /* * Page or buffer structure frees a reference. * Called with v_interlock held. 
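 * Dropping the last hold on an otherwise unreferenced vnode moves it
 * from the hold LRU list back onto the free list.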
*/ void holdrelel(vnode_t *vp) { KASSERT(mutex_owned(vp->v_interlock)); if (vp->v_holdcnt <= 0) { vnpanic(vp, "%s: holdcnt vp %p", __func__, vp); } vp->v_holdcnt--; if (vp->v_holdcnt == 0 && vrefcnt(vp) == 0) lru_requeue(vp, lru_which(vp)); } /* * Page or buffer structure frees a reference. */ void holdrele(vnode_t *vp) { mutex_enter(vp->v_interlock); holdrelel(vp); mutex_exit(vp->v_interlock); } /* * Recycle an unused vnode if caller holds the last reference. */ bool vrecycle(vnode_t *vp) { int error __diagused; mutex_enter(vp->v_interlock); /* If the vnode is already clean we're done. */ VSTATE_WAIT_STABLE(vp); if (VSTATE_GET(vp) != VS_LOADED) { VSTATE_ASSERT(vp, VS_RECLAIMED); vrelel(vp, 0, LK_NONE); return true; } /* Prevent further references until the vnode is locked. */ VSTATE_CHANGE(vp, VS_LOADED, VS_BLOCKED); /* Make sure we hold the last reference. */ if (vrefcnt(vp) != 1) { VSTATE_CHANGE(vp, VS_BLOCKED, VS_LOADED); mutex_exit(vp->v_interlock); return false; } mutex_exit(vp->v_interlock); /* * On a leaf file system this lock will always succeed as we hold * the last reference and prevent further references. * On layered file systems waiting for the lock would open a can of * deadlocks as the lower vnodes may have other active references. */ error = vn_lock(vp, LK_EXCLUSIVE | LK_RETRY | LK_NOWAIT); mutex_enter(vp->v_interlock); if (error) { VSTATE_CHANGE(vp, VS_BLOCKED, VS_LOADED); mutex_exit(vp->v_interlock); return false; } KASSERT(vrefcnt(vp) == 1); vcache_reclaim(vp); vrelel(vp, 0, LK_NONE); return true; } /* * Helper for vrevoke() to propagate suspension from lastmp * to thismp. Both args may be NULL. * Returns the currently suspended file system or NULL. */ static struct mount * vrevoke_suspend_next(struct mount *lastmp, struct mount *thismp) { int error; if (lastmp == thismp) return thismp; if (lastmp != NULL) vfs_resume(lastmp); if (thismp == NULL) return NULL; do { error = vfs_suspend(thismp, 0); } while (error == EINTR || error == ERESTART); if (error == 0) return thismp; KASSERT(error == EOPNOTSUPP || error == ENOENT); return NULL; } /* * Eliminate all activity associated with the requested vnode * and with all vnodes aliased to the requested vnode. */ void vrevoke(vnode_t *vp) { struct mount *mp; vnode_t *vq; enum vtype type; dev_t dev; KASSERT(vrefcnt(vp) > 0); mp = vrevoke_suspend_next(NULL, vp->v_mount); mutex_enter(vp->v_interlock); VSTATE_WAIT_STABLE(vp); if (VSTATE_GET(vp) == VS_RECLAIMED) { mutex_exit(vp->v_interlock); } else if (vp->v_type != VBLK && vp->v_type != VCHR) { atomic_inc_uint(&vp->v_usecount); mutex_exit(vp->v_interlock); vgone(vp); } else { dev = vp->v_rdev; type = vp->v_type; mutex_exit(vp->v_interlock); while (spec_node_lookup_by_dev(type, dev, VDEAD_NOWAIT, &vq) == 0) { mp = vrevoke_suspend_next(mp, vq->v_mount); vgone(vq); } } vrevoke_suspend_next(mp, NULL); } /* * Eliminate all activity associated with a vnode in preparation for * reuse. Drops a reference from the vnode. 
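 * The caller must hold an fstrans transaction on the vnode's mount
 * unless the vnode already lives on the dead mount; this is asserted
 * below.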
*/ void vgone(vnode_t *vp) { int lktype; KASSERT(vp->v_mount == dead_rootmount || fstrans_is_owner(vp->v_mount)); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); lktype = LK_EXCLUSIVE; mutex_enter(vp->v_interlock); VSTATE_WAIT_STABLE(vp); if (VSTATE_GET(vp) == VS_LOADED) { VSTATE_CHANGE(vp, VS_LOADED, VS_BLOCKED); vcache_reclaim(vp); lktype = LK_NONE; } VSTATE_ASSERT(vp, VS_RECLAIMED); vrelel(vp, 0, lktype); } static inline uint32_t vcache_hash(const struct vcache_key *key) { uint32_t hash = HASH32_BUF_INIT; KASSERT(key->vk_key_len > 0); hash = hash32_buf(&key->vk_mount, sizeof(struct mount *), hash); hash = hash32_buf(key->vk_key, key->vk_key_len, hash); return hash; } static int vcache_stats(struct hashstat_sysctl *hs, bool fill) { vnode_impl_t *vip; uint64_t chain; strlcpy(hs->hash_name, "vcache", sizeof(hs->hash_name)); strlcpy(hs->hash_desc, "vnode cache hash", sizeof(hs->hash_desc)); if (!fill) return 0; hs->hash_size = vcache_hashmask + 1; for (size_t i = 0; i < hs->hash_size; i++) { chain = 0; mutex_enter(&vcache_lock); SLIST_FOREACH(vip, &vcache_hashtab[i], vi_hash) { chain++; } mutex_exit(&vcache_lock); if (chain > 0) { hs->hash_used++; hs->hash_items += chain; if (chain > hs->hash_maxchain) hs->hash_maxchain = chain; } preempt_point(); } return 0; } static void vcache_init(void) { vcache_pool = pool_cache_init(sizeof(vnode_impl_t), coherency_unit, 0, 0, "vcachepl", NULL, IPL_NONE, NULL, NULL, NULL); KASSERT(vcache_pool != NULL); mutex_init(&vcache_lock, MUTEX_DEFAULT, IPL_NONE); cv_init(&vcache_cv, "vcache"); vcache_hashsize = desiredvnodes; vcache_hashtab = hashinit(desiredvnodes, HASH_SLIST, true, &vcache_hashmask); hashstat_register("vcache", vcache_stats); } static void vcache_reinit(void) { int i; uint32_t hash; u_long oldmask, newmask; struct hashhead *oldtab, *newtab; vnode_impl_t *vip; newtab = hashinit(desiredvnodes, HASH_SLIST, true, &newmask); mutex_enter(&vcache_lock); oldtab = vcache_hashtab; oldmask = vcache_hashmask; vcache_hashsize = desiredvnodes; vcache_hashtab = newtab; vcache_hashmask = newmask; for (i = 0; i <= oldmask; i++) { while ((vip = SLIST_FIRST(&oldtab[i])) != NULL) { SLIST_REMOVE(&oldtab[i], vip, vnode_impl, vi_hash); hash = vcache_hash(&vip->vi_key); SLIST_INSERT_HEAD(&newtab[hash & vcache_hashmask], vip, vi_hash); } } mutex_exit(&vcache_lock); hashdone(oldtab, HASH_SLIST, oldmask); } static inline vnode_impl_t * vcache_hash_lookup(const struct vcache_key *key, uint32_t hash) { struct hashhead *hashp; vnode_impl_t *vip; KASSERT(mutex_owned(&vcache_lock)); hashp = &vcache_hashtab[hash & vcache_hashmask]; SLIST_FOREACH(vip, hashp, vi_hash) { if (key->vk_mount != vip->vi_key.vk_mount) continue; if (key->vk_key_len != vip->vi_key.vk_key_len) continue; if (memcmp(key->vk_key, vip->vi_key.vk_key, key->vk_key_len)) continue; return vip; } return NULL; } /* * Allocate a new, uninitialized vcache node. */ static vnode_impl_t * vcache_alloc(void) { vnode_impl_t *vip; vnode_t *vp; vip = pool_cache_get(vcache_pool, PR_WAITOK); vp = VIMPL_TO_VNODE(vip); memset(vip, 0, sizeof(*vip)); rw_init(&vip->vi_lock); vp->v_interlock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE); uvm_obj_init(&vp->v_uobj, &uvm_vnodeops, true, 1); klist_init(&vip->vi_klist.vk_klist); vp->v_klist = &vip->vi_klist; cv_init(&vp->v_cv, "vnode"); cache_vnode_init(vp); vp->v_usecount = 1; vp->v_type = VNON; vp->v_size = vp->v_writesize = VSIZENOTSET; vip->vi_state = VS_LOADING; lru_requeue(vp, &lru_list[LRU_FREE]); return vip; } /* * Deallocate a vcache node in state VS_LOADING. 
* * vcache_lock held on entry and released on return. */ static void vcache_dealloc(vnode_impl_t *vip) { vnode_t *vp; KASSERT(mutex_owned(&vcache_lock)); vp = VIMPL_TO_VNODE(vip); vfs_ref(dead_rootmount); vfs_insmntque(vp, dead_rootmount); mutex_enter(vp->v_interlock); vp->v_op = dead_vnodeop_p; VSTATE_CHANGE(vp, VS_LOADING, VS_RECLAIMED); mutex_exit(&vcache_lock); vrelel(vp, 0, LK_NONE); } /* * Free an unused, unreferenced vcache node. * v_interlock locked on entry. */ static void vcache_free(vnode_impl_t *vip) { vnode_t *vp; vp = VIMPL_TO_VNODE(vip); KASSERT(mutex_owned(vp->v_interlock)); KASSERT(vrefcnt(vp) == 0); KASSERT(vp->v_holdcnt == 0); KASSERT(vp->v_writecount == 0); lru_requeue(vp, NULL); mutex_exit(vp->v_interlock); vfs_insmntque(vp, NULL); if (vp->v_type == VBLK || vp->v_type == VCHR) spec_node_destroy(vp); mutex_obj_free(vp->v_interlock); rw_destroy(&vip->vi_lock); uvm_obj_destroy(&vp->v_uobj, true); KASSERT(vp->v_klist == &vip->vi_klist); klist_fini(&vip->vi_klist.vk_klist); cv_destroy(&vp->v_cv); cache_vnode_fini(vp); pool_cache_put(vcache_pool, vip); } /* * Try to get an initial reference on this cached vnode. * Returns zero on success or EBUSY if the vnode state is not LOADED. * * NB: lockless code sequences may rely on this not blocking. */ int vcache_tryvget(vnode_t *vp) { u_int use, next; for (use = atomic_load_relaxed(&vp->v_usecount);; use = next) { if (__predict_false((use & VUSECOUNT_GATE) == 0)) { return EBUSY; } next = atomic_cas_uint(&vp->v_usecount, use, (use + 1) | VUSECOUNT_VGET); if (__predict_true(next == use)) { membar_acquire(); return 0; } } } /* * Try to get an initial reference on this cached vnode. * Returns zero on success and ENOENT if the vnode has been reclaimed. * Will wait for the vnode state to be stable. * * v_interlock locked on entry and unlocked on exit. */ int vcache_vget(vnode_t *vp) { int error; KASSERT(mutex_owned(vp->v_interlock)); /* Increment hold count to prevent vnode from disappearing. */ vp->v_holdcnt++; VSTATE_WAIT_STABLE(vp); vp->v_holdcnt--; /* If this was the last reference to a reclaimed vnode free it now. */ if (__predict_false(VSTATE_GET(vp) == VS_RECLAIMED)) { if (vp->v_holdcnt == 0 && vrefcnt(vp) == 0) vcache_free(VNODE_TO_VIMPL(vp)); else mutex_exit(vp->v_interlock); return ENOENT; } VSTATE_ASSERT(vp, VS_LOADED); error = vcache_tryvget(vp); KASSERT(error == 0); mutex_exit(vp->v_interlock); return 0; } /* * Get a vnode / fs node pair by key and return it referenced through vpp. */ int vcache_get(struct mount *mp, const void *key, size_t key_len, struct vnode **vpp) { int error; uint32_t hash; const void *new_key; struct vnode *vp; struct vcache_key vcache_key; vnode_impl_t *vip, *new_vip; new_key = NULL; *vpp = NULL; vcache_key.vk_mount = mp; vcache_key.vk_key = key; vcache_key.vk_key_len = key_len; hash = vcache_hash(&vcache_key); again: mutex_enter(&vcache_lock); vip = vcache_hash_lookup(&vcache_key, hash); /* If found, take a reference or retry. */ if (__predict_true(vip != NULL)) { /* * If the vnode is loading we cannot take the v_interlock * here as it might change during load (see uvm_obj_setlock()). * As changing state from VS_LOADING requires both vcache_lock * and v_interlock it is safe to test with vcache_lock held. * * Wait for vnodes changing state from VS_LOADING and retry. 
*/ if (__predict_false(vip->vi_state == VS_LOADING)) { cv_wait(&vcache_cv, &vcache_lock); mutex_exit(&vcache_lock); goto again; } vp = VIMPL_TO_VNODE(vip); mutex_enter(vp->v_interlock); mutex_exit(&vcache_lock); error = vcache_vget(vp); if (error == ENOENT) goto again; if (error == 0) *vpp = vp; KASSERT((error != 0) == (*vpp == NULL)); return error; } mutex_exit(&vcache_lock); /* Allocate and initialize a new vcache / vnode pair. */ error = vfs_busy(mp); if (error) return error; new_vip = vcache_alloc(); new_vip->vi_key = vcache_key; vp = VIMPL_TO_VNODE(new_vip); mutex_enter(&vcache_lock); vip = vcache_hash_lookup(&vcache_key, hash); if (vip == NULL) { SLIST_INSERT_HEAD(&vcache_hashtab[hash & vcache_hashmask], new_vip, vi_hash); vip = new_vip; } /* If another thread beat us inserting this node, retry. */ if (vip != new_vip) { vcache_dealloc(new_vip); vfs_unbusy(mp); goto again; } mutex_exit(&vcache_lock); /* Load the fs node. Exclusive as new_node is VS_LOADING. */ error = VFS_LOADVNODE(mp, vp, key, key_len, &new_key); if (error) { mutex_enter(&vcache_lock); SLIST_REMOVE(&vcache_hashtab[hash & vcache_hashmask], new_vip, vnode_impl, vi_hash); vcache_dealloc(new_vip); vfs_unbusy(mp); KASSERT(*vpp == NULL); return error; } KASSERT(new_key != NULL); KASSERT(memcmp(key, new_key, key_len) == 0); KASSERT(vp->v_op != NULL); vfs_insmntque(vp, mp); if ((mp->mnt_iflag & IMNT_MPSAFE) != 0) vp->v_vflag |= VV_MPSAFE; vfs_ref(mp); vfs_unbusy(mp); /* Finished loading, finalize node. */ mutex_enter(&vcache_lock); new_vip->vi_key.vk_key = new_key; mutex_enter(vp->v_interlock); VSTATE_CHANGE(vp, VS_LOADING, VS_LOADED); mutex_exit(vp->v_interlock); mutex_exit(&vcache_lock); *vpp = vp; return 0; } /* * Create a new vnode / fs node pair and return it referenced through vpp. */ int vcache_new(struct mount *mp, struct vnode *dvp, struct vattr *vap, kauth_cred_t cred, void *extra, struct vnode **vpp) { int error; uint32_t hash; struct vnode *vp, *ovp; vnode_impl_t *vip, *ovip; *vpp = NULL; /* Allocate and initialize a new vcache / vnode pair. */ error = vfs_busy(mp); if (error) return error; vip = vcache_alloc(); vip->vi_key.vk_mount = mp; vp = VIMPL_TO_VNODE(vip); /* Create and load the fs node. */ error = VFS_NEWVNODE(mp, dvp, vp, vap, cred, extra, &vip->vi_key.vk_key_len, &vip->vi_key.vk_key); if (error) { mutex_enter(&vcache_lock); vcache_dealloc(vip); vfs_unbusy(mp); KASSERT(*vpp == NULL); return error; } KASSERT(vp->v_op != NULL); KASSERT((vip->vi_key.vk_key_len == 0) == (mp == dead_rootmount)); if (vip->vi_key.vk_key_len > 0) { KASSERT(vip->vi_key.vk_key != NULL); hash = vcache_hash(&vip->vi_key); /* * Wait for previous instance to be reclaimed, * then insert new node. */ mutex_enter(&vcache_lock); while ((ovip = vcache_hash_lookup(&vip->vi_key, hash))) { ovp = VIMPL_TO_VNODE(ovip); mutex_enter(ovp->v_interlock); mutex_exit(&vcache_lock); error = vcache_vget(ovp); KASSERT(error == ENOENT); mutex_enter(&vcache_lock); } SLIST_INSERT_HEAD(&vcache_hashtab[hash & vcache_hashmask], vip, vi_hash); mutex_exit(&vcache_lock); } vfs_insmntque(vp, mp); if ((mp->mnt_iflag & IMNT_MPSAFE) != 0) vp->v_vflag |= VV_MPSAFE; vfs_ref(mp); vfs_unbusy(mp); /* Finished loading, finalize node. */ mutex_enter(&vcache_lock); mutex_enter(vp->v_interlock); VSTATE_CHANGE(vp, VS_LOADING, VS_LOADED); mutex_exit(&vcache_lock); mutex_exit(vp->v_interlock); *vpp = vp; return 0; } /* * Prepare key change: update old cache nodes key and lock new cache node. * Return an error if the new node already exists. 
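 * A freshly allocated node in VS_LOADING state is hashed under the new
 * key as a placeholder, so concurrent lookups of the new key will
 * wait; vcache_rekey_exit() later moves the real node to the new key
 * and discards the placeholder.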
*/ int vcache_rekey_enter(struct mount *mp, struct vnode *vp, const void *old_key, size_t old_key_len, const void *new_key, size_t new_key_len) { uint32_t old_hash, new_hash; struct vcache_key old_vcache_key, new_vcache_key; vnode_impl_t *vip, *new_vip; old_vcache_key.vk_mount = mp; old_vcache_key.vk_key = old_key; old_vcache_key.vk_key_len = old_key_len; old_hash = vcache_hash(&old_vcache_key); new_vcache_key.vk_mount = mp; new_vcache_key.vk_key = new_key; new_vcache_key.vk_key_len = new_key_len; new_hash = vcache_hash(&new_vcache_key); new_vip = vcache_alloc(); new_vip->vi_key = new_vcache_key; /* Insert locked new node used as placeholder. */ mutex_enter(&vcache_lock); vip = vcache_hash_lookup(&new_vcache_key, new_hash); if (vip != NULL) { vcache_dealloc(new_vip); return EEXIST; } SLIST_INSERT_HEAD(&vcache_hashtab[new_hash & vcache_hashmask], new_vip, vi_hash); /* Replace old nodes key with the temporary copy. */ vip = vcache_hash_lookup(&old_vcache_key, old_hash); KASSERT(vip != NULL); KASSERT(VIMPL_TO_VNODE(vip) == vp); KASSERT(vip->vi_key.vk_key != old_vcache_key.vk_key); vip->vi_key = old_vcache_key; mutex_exit(&vcache_lock); return 0; } /* * Key change complete: update old node and remove placeholder. */ void vcache_rekey_exit(struct mount *mp, struct vnode *vp, const void *old_key, size_t old_key_len, const void *new_key, size_t new_key_len) { uint32_t old_hash, new_hash; struct vcache_key old_vcache_key, new_vcache_key; vnode_impl_t *vip, *new_vip; struct vnode *new_vp; old_vcache_key.vk_mount = mp; old_vcache_key.vk_key = old_key; old_vcache_key.vk_key_len = old_key_len; old_hash = vcache_hash(&old_vcache_key); new_vcache_key.vk_mount = mp; new_vcache_key.vk_key = new_key; new_vcache_key.vk_key_len = new_key_len; new_hash = vcache_hash(&new_vcache_key); mutex_enter(&vcache_lock); /* Lookup old and new node. */ vip = vcache_hash_lookup(&old_vcache_key, old_hash); KASSERT(vip != NULL); KASSERT(VIMPL_TO_VNODE(vip) == vp); new_vip = vcache_hash_lookup(&new_vcache_key, new_hash); KASSERT(new_vip != NULL); KASSERT(new_vip->vi_key.vk_key_len == new_key_len); new_vp = VIMPL_TO_VNODE(new_vip); mutex_enter(new_vp->v_interlock); VSTATE_ASSERT(VIMPL_TO_VNODE(new_vip), VS_LOADING); mutex_exit(new_vp->v_interlock); /* Rekey old node and put it onto its new hashlist. */ vip->vi_key = new_vcache_key; if (old_hash != new_hash) { SLIST_REMOVE(&vcache_hashtab[old_hash & vcache_hashmask], vip, vnode_impl, vi_hash); SLIST_INSERT_HEAD(&vcache_hashtab[new_hash & vcache_hashmask], vip, vi_hash); } /* Remove new node used as placeholder. */ SLIST_REMOVE(&vcache_hashtab[new_hash & vcache_hashmask], new_vip, vnode_impl, vi_hash); vcache_dealloc(new_vip); } /* * Disassociate the underlying file system from a vnode. * * Must be called with vnode locked and will return unlocked. * Must be called with the interlock held, and will return with it held. */ static void vcache_reclaim(vnode_t *vp) { lwp_t *l = curlwp; vnode_impl_t *vip = VNODE_TO_VIMPL(vp); struct mount *mp = vp->v_mount; uint32_t hash; uint8_t temp_buf[64], *temp_key; size_t temp_key_len; bool recycle; int error; KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE); KASSERT(mutex_owned(vp->v_interlock)); KASSERT(vrefcnt(vp) != 0); temp_key_len = vip->vi_key.vk_key_len; /* * Prevent the vnode from being recycled or brought into use * while we clean it out. */ VSTATE_CHANGE(vp, VS_BLOCKED, VS_RECLAIMING); /* * Send NOTE_REVOKE now, before we call VOP_RECLAIM(), * because VOP_RECLAIM() could cause vp->v_klist to * become invalid. 
Don't check for interest in NOTE_REVOKE * here; it's always posted because it sets EV_EOF. * * Once it's been posted, reset vp->v_klist to point to * our own local storage, in case we were sharing with * someone else. */ KNOTE(&vp->v_klist->vk_klist, NOTE_REVOKE); vp->v_klist = &vip->vi_klist; mutex_exit(vp->v_interlock); rw_enter(vp->v_uobj.vmobjlock, RW_WRITER); mutex_enter(vp->v_interlock); if ((vp->v_iflag & VI_EXECMAP) != 0) { cpu_count(CPU_COUNT_EXECPAGES, -vp->v_uobj.uo_npages); } vp->v_iflag &= ~(VI_TEXT|VI_EXECMAP); vp->v_iflag |= VI_DEADCHECK; /* for genfs_getpages() */ mutex_exit(vp->v_interlock); rw_exit(vp->v_uobj.vmobjlock); /* * With vnode state set to reclaiming, purge name cache immediately * to prevent new handles on vnode, and wait for existing threads * trying to get a handle to notice VS_RECLAIMED status and abort. */ cache_purge(vp); /* Replace the vnode key with a temporary copy. */ if (vip->vi_key.vk_key_len > sizeof(temp_buf)) { temp_key = kmem_alloc(temp_key_len, KM_SLEEP); } else { temp_key = temp_buf; } if (vip->vi_key.vk_key_len > 0) { mutex_enter(&vcache_lock); memcpy(temp_key, vip->vi_key.vk_key, temp_key_len); vip->vi_key.vk_key = temp_key; mutex_exit(&vcache_lock); } fstrans_start(mp); /* * Clean out any cached data associated with the vnode. */ error = vinvalbuf(vp, V_SAVE, NOCRED, l, 0, 0); if (error != 0) { if (wapbl_vphaswapbl(vp)) WAPBL_DISCARD(wapbl_vptomp(vp)); error = vinvalbuf(vp, 0, NOCRED, l, 0, 0); } KASSERTMSG((error == 0), "vinvalbuf failed: %d", error); KASSERT((vp->v_iflag & VI_ONWORKLST) == 0); if (vp->v_type == VBLK || vp->v_type == VCHR) { spec_node_revoke(vp); } /* * Disassociate the underlying file system from the vnode. * VOP_INACTIVE leaves the vnode locked; VOP_RECLAIM unlocks * the vnode, and may destroy the vnode so that VOP_UNLOCK * would no longer function. */ VOP_INACTIVE(vp, &recycle); KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE); if (VOP_RECLAIM(vp)) { vnpanic(vp, "%s: cannot reclaim", __func__); } KASSERT(vp->v_data == NULL); KASSERT((vp->v_iflag & VI_PAGES) == 0); if (vp->v_type == VREG && vp->v_ractx != NULL) { uvm_ra_freectx(vp->v_ractx); vp->v_ractx = NULL; } if (vip->vi_key.vk_key_len > 0) { /* Remove from vnode cache. */ hash = vcache_hash(&vip->vi_key); mutex_enter(&vcache_lock); KASSERT(vip == vcache_hash_lookup(&vip->vi_key, hash)); SLIST_REMOVE(&vcache_hashtab[hash & vcache_hashmask], vip, vnode_impl, vi_hash); mutex_exit(&vcache_lock); } if (temp_key != temp_buf) kmem_free(temp_key, temp_key_len); /* Done with purge, notify sleepers of the grim news. */ mutex_enter(vp->v_interlock); vp->v_op = dead_vnodeop_p; VSTATE_CHANGE(vp, VS_RECLAIMING, VS_RECLAIMED); vp->v_tag = VT_NON; mutex_exit(vp->v_interlock); /* * Move to dead mount. Must be after changing the operations * vector as vnode operations enter the mount before using the * operations vector. See sys/kern/vnode_if.c. */ vp->v_vflag &= ~VV_ROOT; vfs_ref(dead_rootmount); vfs_insmntque(vp, dead_rootmount); #ifdef PAX_SEGVGUARD pax_segvguard_cleanup(vp); #endif /* PAX_SEGVGUARD */ mutex_enter(vp->v_interlock); fstrans_done(mp); KASSERT((vp->v_iflag & VI_ONWORKLST) == 0); } /* * Disassociate the underlying file system from an open device vnode * and make it anonymous. * * Vnode unlocked on entry, drops a reference to the vnode. 
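 * Only block and character device vnodes can be made anonymous; the
 * vnode is switched to spec_vnodeop_p and moved to the dead mount
 * before the reference is dropped with vrele().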
*/ void vcache_make_anon(vnode_t *vp) { vnode_impl_t *vip = VNODE_TO_VIMPL(vp); uint32_t hash; bool recycle; KASSERT(vp->v_type == VBLK || vp->v_type == VCHR); KASSERT(vp->v_mount == dead_rootmount || fstrans_is_owner(vp->v_mount)); VSTATE_ASSERT_UNLOCKED(vp, VS_ACTIVE); /* Remove from vnode cache. */ hash = vcache_hash(&vip->vi_key); mutex_enter(&vcache_lock); KASSERT(vip == vcache_hash_lookup(&vip->vi_key, hash)); SLIST_REMOVE(&vcache_hashtab[hash & vcache_hashmask], vip, vnode_impl, vi_hash); vip->vi_key.vk_mount = dead_rootmount; vip->vi_key.vk_key_len = 0; vip->vi_key.vk_key = NULL; mutex_exit(&vcache_lock); /* * Disassociate the underlying file system from the vnode. * VOP_INACTIVE leaves the vnode locked; VOP_RECLAIM unlocks * the vnode, and may destroy the vnode so that VOP_UNLOCK * would no longer function. */ if (vn_lock(vp, LK_EXCLUSIVE)) { vnpanic(vp, "%s: cannot lock", __func__); } VOP_INACTIVE(vp, &recycle); KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE); if (VOP_RECLAIM(vp)) { vnpanic(vp, "%s: cannot reclaim", __func__); } /* Purge name cache. */ cache_purge(vp); /* Done with purge, change operations vector. */ mutex_enter(vp->v_interlock); vp->v_op = spec_vnodeop_p; vp->v_vflag |= VV_MPSAFE; mutex_exit(vp->v_interlock); /* * Move to dead mount. Must be after changing the operations * vector as vnode operations enter the mount before using the * operations vector. See sys/kern/vnode_if.c. */ vfs_ref(dead_rootmount); vfs_insmntque(vp, dead_rootmount); vrele(vp); } /* * Update outstanding I/O count and do wakeup if requested. */ void vwakeup(struct buf *bp) { vnode_t *vp; if ((vp = bp->b_vp) == NULL) return; KASSERT(bp->b_objlock == vp->v_interlock); KASSERT(mutex_owned(bp->b_objlock)); if (--vp->v_numoutput < 0) vnpanic(vp, "%s: neg numoutput, vp %p", __func__, vp); if (vp->v_numoutput == 0) cv_broadcast(&vp->v_cv); } /* * Test a vnode for being or becoming dead. Returns one of: * EBUSY: vnode is becoming dead, with "flags == VDEAD_NOWAIT" only. * ENOENT: vnode is dead. * 0: otherwise. * * Whenever this function returns a non-zero value all future * calls will also return a non-zero value. */ int vdead_check(struct vnode *vp, int flags) { KASSERT(mutex_owned(vp->v_interlock)); if (! ISSET(flags, VDEAD_NOWAIT)) VSTATE_WAIT_STABLE(vp); if (VSTATE_GET(vp) == VS_RECLAIMING) { KASSERT(ISSET(flags, VDEAD_NOWAIT)); return EBUSY; } else if (VSTATE_GET(vp) == VS_RECLAIMED) { return ENOENT; } return 0; } int vfs_drainvnodes(void) { mutex_enter(&vdrain_lock); if (!vdrain_one(desiredvnodes)) { mutex_exit(&vdrain_lock); return EBUSY; } mutex_exit(&vdrain_lock); if (vcache_hashsize != desiredvnodes) vcache_reinit(); return 0; } void vnpanic(vnode_t *vp, const char *fmt, ...) { va_list ap; #ifdef DIAGNOSTIC vprint(NULL, vp); #endif va_start(ap, fmt); vpanic(fmt, ap); va_end(ap); } void vshareilock(vnode_t *tvp, vnode_t *fvp) { kmutex_t *oldlock; oldlock = tvp->v_interlock; mutex_obj_hold(fvp->v_interlock); tvp->v_interlock = fvp->v_interlock; mutex_obj_free(oldlock); } void vshareklist(vnode_t *tvp, vnode_t *fvp) { /* * If two vnodes share klist state, they must also share * an interlock. */ KASSERT(tvp->v_interlock == fvp->v_interlock); /* * We make the following assumptions: * * ==> Some other synchronization is happening outside of * our view to make this safe. * * ==> That the "to" vnode will have the necessary references * on the "from" vnode so that the storage for the klist * won't be yanked out from beneath us (the vnode_impl). 
* * ==> If "from" is also sharing, we then assume that "from" * has the necessary references, and so on. */ tvp->v_klist = fvp->v_klist; }
/* $NetBSD: procfs_vnops.c,v 1.230 2024/01/17 10:19:21 hannken Exp $ */ /*- * Copyright (c) 2006, 2007, 2008, 2020 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Andrew Doran. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Copyright (c) 1993, 1995 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Jan-Simon Pendry. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)procfs_vnops.c 8.18 (Berkeley) 5/21/95 */ /* * Copyright (c) 1993 Jan-Simon Pendry * * This code is derived from software contributed to Berkeley by * Jan-Simon Pendry. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)procfs_vnops.c 8.18 (Berkeley) 5/21/95 */ /* * procfs vnode interface */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: procfs_vnops.c,v 1.230 2024/01/17 10:19:21 hannken Exp $"); #include <sys/param.h> #include <sys/atomic.h> #include <sys/systm.h> #include <sys/time.h> #include <sys/kernel.h> #include <sys/file.h> #include <sys/filedesc.h> #include <sys/proc.h> #include <sys/vnode.h> #include <sys/namei.h> #include <sys/malloc.h> #include <sys/mount.h> #include <sys/dirent.h> #include <sys/resourcevar.h> #include <sys/stat.h> #include <sys/ptrace.h> #include <sys/kauth.h> #include <sys/exec.h> #include <uvm/uvm_extern.h> /* for PAGE_SIZE */ #include <machine/reg.h> #include <miscfs/genfs/genfs.h> #include <miscfs/procfs/procfs.h> /* * Vnode Operations. * */ static int procfs_validfile_linux(struct lwp *, struct mount *); static int procfs_root_readdir_callback(struct proc *, void *); static void procfs_dir(pfstype, struct lwp *, struct proc *, char **, char *, size_t); /* * This is a list of the valid names in the * process-specific sub-directories. 
It is * used in procfs_lookup and procfs_readdir */ static const struct proc_target { u_char pt_type; u_char pt_namlen; const char *pt_name; pfstype pt_pfstype; int (*pt_valid)(struct lwp *, struct mount *); } proc_targets[] = { #define N(s) sizeof(s)-1, s /* name type validp */ { DT_DIR, N("."), PFSproc, NULL }, { DT_DIR, N(".."), PFSroot, NULL }, { DT_DIR, N("fd"), PFSfd, NULL }, { DT_DIR, N("task"), PFStask, procfs_validfile_linux }, { DT_LNK, N("cwd"), PFScwd, NULL }, { DT_REG, N("emul"), PFSemul, NULL }, { DT_LNK, N("root"), PFSchroot, NULL }, { DT_REG, N("auxv"), PFSauxv, procfs_validauxv }, { DT_REG, N("cmdline"), PFScmdline, NULL }, { DT_REG, N("environ"), PFSenviron, NULL }, { DT_LNK, N("exe"), PFSexe, procfs_validfile }, { DT_REG, N("file"), PFSfile, procfs_validfile }, { DT_REG, N("fpregs"), PFSfpregs, procfs_validfpregs }, { DT_REG, N("limit"), PFSlimit, NULL }, { DT_REG, N("map"), PFSmap, procfs_validmap }, { DT_REG, N("maps"), PFSmaps, procfs_validmap }, { DT_REG, N("mem"), PFSmem, NULL }, { DT_REG, N("note"), PFSnote, NULL }, { DT_REG, N("notepg"), PFSnotepg, NULL }, { DT_REG, N("regs"), PFSregs, procfs_validregs }, { DT_REG, N("stat"), PFSstat, procfs_validfile_linux }, { DT_REG, N("statm"), PFSstatm, procfs_validfile_linux }, { DT_REG, N("status"), PFSstatus, NULL }, #ifdef __HAVE_PROCFS_MACHDEP PROCFS_MACHDEP_NODETYPE_DEFNS #endif #undef N }; static const int nproc_targets = sizeof(proc_targets) / sizeof(proc_targets[0]); /* * List of files in the root directory. Note: the validate function will * be called with p == NULL for these ones. */ static const struct proc_target proc_root_targets[] = { #define N(s) sizeof(s)-1, s /* name type validp */ { DT_REG, N("meminfo"), PFSmeminfo, procfs_validfile_linux }, { DT_REG, N("cpuinfo"), PFScpuinfo, procfs_validfile_linux }, { DT_REG, N("uptime"), PFSuptime, procfs_validfile_linux }, { DT_REG, N("mounts"), PFSmounts, procfs_validfile_linux }, { DT_REG, N("devices"), PFSdevices, procfs_validfile_linux }, { DT_REG, N("stat"), PFScpustat, procfs_validfile_linux }, { DT_REG, N("loadavg"), PFSloadavg, procfs_validfile_linux }, { DT_REG, N("version"), PFSversion, procfs_validfile_linux }, #undef N }; static const int nproc_root_targets = sizeof(proc_root_targets) / sizeof(proc_root_targets[0]); int procfs_lookup(void *); int procfs_open(void *); int procfs_close(void *); int procfs_access(void *); int procfs_getattr(void *); int procfs_setattr(void *); int procfs_readdir(void *); int procfs_readlink(void *); int procfs_inactive(void *); int procfs_reclaim(void *); int procfs_print(void *); int procfs_pathconf(void *); int procfs_getpages(void *); static uint8_t fttodt(file_t *); static int atoi(const char *, size_t); /* * procfs vnode operations. 
*/ int (**procfs_vnodeop_p)(void *); const struct vnodeopv_entry_desc procfs_vnodeop_entries[] = { { &vop_default_desc, vn_default_error }, { &vop_parsepath_desc, genfs_parsepath }, /* parsepath */ { &vop_lookup_desc, procfs_lookup }, /* lookup */ { &vop_create_desc, genfs_eopnotsupp }, /* create */ { &vop_mknod_desc, genfs_eopnotsupp }, /* mknod */ { &vop_open_desc, procfs_open }, /* open */ { &vop_close_desc, procfs_close }, /* close */ { &vop_access_desc, procfs_access }, /* access */ { &vop_accessx_desc, genfs_accessx }, /* accessx */ { &vop_getattr_desc, procfs_getattr }, /* getattr */ { &vop_setattr_desc, procfs_setattr }, /* setattr */ { &vop_read_desc, procfs_rw }, /* read */ { &vop_write_desc, procfs_rw }, /* write */ { &vop_fallocate_desc, genfs_eopnotsupp }, /* fallocate */ { &vop_fdiscard_desc, genfs_eopnotsupp }, /* fdiscard */ { &vop_fcntl_desc, genfs_fcntl }, /* fcntl */ { &vop_ioctl_desc, genfs_enoioctl }, /* ioctl */ { &vop_poll_desc, genfs_poll }, /* poll */ { &vop_kqfilter_desc, genfs_kqfilter }, /* kqfilter */ { &vop_revoke_desc, genfs_revoke }, /* revoke */ { &vop_fsync_desc, genfs_nullop }, /* fsync */ { &vop_seek_desc, genfs_nullop }, /* seek */ { &vop_remove_desc, genfs_eopnotsupp }, /* remove */ { &vop_link_desc, genfs_erofs_link }, /* link */ { &vop_rename_desc, genfs_eopnotsupp }, /* rename */ { &vop_mkdir_desc, genfs_eopnotsupp }, /* mkdir */ { &vop_rmdir_desc, genfs_eopnotsupp }, /* rmdir */ { &vop_symlink_desc, genfs_erofs_symlink }, /* symlink */ { &vop_readdir_desc, procfs_readdir }, /* readdir */ { &vop_readlink_desc, procfs_readlink }, /* readlink */ { &vop_abortop_desc, genfs_abortop }, /* abortop */ { &vop_inactive_desc, procfs_inactive }, /* inactive */ { &vop_reclaim_desc, procfs_reclaim }, /* reclaim */ { &vop_lock_desc, genfs_lock }, /* lock */ { &vop_unlock_desc, genfs_unlock }, /* unlock */ { &vop_bmap_desc, genfs_eopnotsupp }, /* bmap */ { &vop_strategy_desc, genfs_badop }, /* strategy */ { &vop_print_desc, procfs_print }, /* print */ { &vop_islocked_desc, genfs_islocked }, /* islocked */ { &vop_pathconf_desc, procfs_pathconf }, /* pathconf */ { &vop_advlock_desc, genfs_einval }, /* advlock */ { &vop_getpages_desc, procfs_getpages }, /* getpages */ { &vop_putpages_desc, genfs_null_putpages }, /* putpages */ { NULL, NULL } }; const struct vnodeopv_desc procfs_vnodeop_opv_desc = { &procfs_vnodeop_p, procfs_vnodeop_entries }; /* * set things up for doing i/o on * the pfsnode (vp). (vp) is locked * on entry, and should be left locked * on exit. * * for procfs we don't need to do anything * in particular for i/o. all that is done * is to support exclusive open on process * memory images. */ int procfs_open(void *v) { struct vop_open_args /* { struct vnode *a_vp; int a_mode; kauth_cred_t a_cred; } */ *ap = v; struct vnode *vp = ap->a_vp; struct pfsnode *pfs = VTOPFS(vp); struct lwp *l1; struct proc *p2; int error; if ((error = procfs_proc_lock(vp->v_mount, pfs->pfs_pid, &p2, ENOENT)) != 0) return error; l1 = curlwp; /* tracer */ #define M2K(m) (((m) & FREAD) && ((m) & FWRITE) ? \ KAUTH_REQ_PROCESS_PROCFS_RW : \ (m) & FWRITE ? 
KAUTH_REQ_PROCESS_PROCFS_WRITE : \ KAUTH_REQ_PROCESS_PROCFS_READ) mutex_enter(p2->p_lock); error = kauth_authorize_process(l1->l_cred, KAUTH_PROCESS_PROCFS, p2, pfs, KAUTH_ARG(M2K(ap->a_mode)), NULL); mutex_exit(p2->p_lock); if (error) { procfs_proc_unlock(p2); return (error); } #undef M2K switch (pfs->pfs_type) { case PFSmem: if (((pfs->pfs_flags & FWRITE) && (ap->a_mode & O_EXCL)) || ((pfs->pfs_flags & O_EXCL) && (ap->a_mode & FWRITE))) { error = EBUSY; break; } if (!proc_isunder(p2, l1)) { error = EPERM; break; } if (ap->a_mode & FWRITE) pfs->pfs_flags = ap->a_mode & (FWRITE|O_EXCL); break; case PFSregs: case PFSfpregs: if (!proc_isunder(p2, l1)) { error = EPERM; break; } break; default: break; } procfs_proc_unlock(p2); return (error); } /* * close the pfsnode (vp) after doing i/o. * (vp) is not locked on entry or exit. * * nothing to do for procfs other than undo * any exclusive open flag (see _open above). */ int procfs_close(void *v) { struct vop_close_args /* { struct vnode *a_vp; int a_fflag; kauth_cred_t a_cred; } */ *ap = v; struct pfsnode *pfs = VTOPFS(ap->a_vp); switch (pfs->pfs_type) { case PFSmem: if ((ap->a_fflag & FWRITE) && (pfs->pfs_flags & O_EXCL)) pfs->pfs_flags &= ~(FWRITE|O_EXCL); break; default: break; } return (0); } /* * _inactive is called when the pfsnode * is vrele'd and the reference count goes * to zero. (vp) will be on the vnode free * list, so to get it back vget() must be * used. * * (vp) is locked on entry, but must be unlocked on exit. */ int procfs_inactive(void *v) { struct vop_inactive_v2_args /* { struct vnode *a_vp; bool *a_recycle; } */ *ap = v; struct vnode *vp = ap->a_vp; struct pfsnode *pfs = VTOPFS(vp); mutex_enter(&proc_lock); *ap->a_recycle = (procfs_proc_find(vp->v_mount, pfs->pfs_pid) == NULL); mutex_exit(&proc_lock); return (0); } /* * _reclaim is called when getnewvnode() * wants to make use of an entry on the vnode * free list. at this time the filesystem needs * to free any private data and remove the node * from any private lists. */ int procfs_reclaim(void *v) { struct vop_reclaim_v2_args /* { struct vnode *a_vp; } */ *ap = v; struct vnode *vp = ap->a_vp; struct pfsnode *pfs = VTOPFS(vp); VOP_UNLOCK(vp); /* * To interlock with procfs_revoke_vnodes(). */ mutex_enter(vp->v_interlock); vp->v_data = NULL; mutex_exit(vp->v_interlock); procfs_hashrem(pfs); kmem_free(pfs, sizeof(*pfs)); return 0; } /* * Return POSIX pathconf information applicable to special devices. */ int procfs_pathconf(void *v) { struct vop_pathconf_args /* { struct vnode *a_vp; int a_name; register_t *a_retval; } */ *ap = v; switch (ap->a_name) { case _PC_LINK_MAX: *ap->a_retval = LINK_MAX; return (0); case _PC_MAX_CANON: *ap->a_retval = MAX_CANON; return (0); case _PC_MAX_INPUT: *ap->a_retval = MAX_INPUT; return (0); case _PC_PIPE_BUF: *ap->a_retval = PIPE_BUF; return (0); case _PC_CHOWN_RESTRICTED: *ap->a_retval = 1; return (0); case _PC_VDISABLE: *ap->a_retval = _POSIX_VDISABLE; return (0); case _PC_SYNC_IO: *ap->a_retval = 1; return (0); default: return genfs_pathconf(ap); } /* NOTREACHED */ } /* * _print is used for debugging. * just print a readable description * of (vp). */ int procfs_print(void *v) { struct vop_print_args /* { struct vnode *a_vp; } */ *ap = v; struct pfsnode *pfs = VTOPFS(ap->a_vp); printf("tag VT_PROCFS, type %d, pid %d, mode %x, flags %lx\n", pfs->pfs_type, pfs->pfs_pid, pfs->pfs_mode, pfs->pfs_flags); return 0; } /* * Works out the path to the target process's current * working directory or chroot. 
If the caller is in a chroot and * can't "reach" the target's cwd or root (or some other error * occurs), a "/" is returned for the path. */ static void procfs_dir(pfstype t, struct lwp *caller, struct proc *target, char **bpp, char *path, size_t len) { struct cwdinfo *cwdi; struct vnode *vp, *rvp; char *bp; /* * Lock target cwdi and take a reference to the vnode * we are interested in to prevent it from disappearing * before getcwd_common() below. */ rw_enter(&target->p_cwdi->cwdi_lock, RW_READER); switch (t) { case PFScwd: vp = target->p_cwdi->cwdi_cdir; break; case PFSchroot: vp = target->p_cwdi->cwdi_rdir; break; default: rw_exit(&target->p_cwdi->cwdi_lock); return; } if (vp != NULL) vref(vp); rw_exit(&target->p_cwdi->cwdi_lock); cwdi = caller->l_proc->p_cwdi; rw_enter(&cwdi->cwdi_lock, RW_READER); rvp = cwdi->cwdi_rdir; bp = bpp ? *bpp : NULL; /* * XXX: this horrible kludge avoids locking panics when * attempting to lookup links that point to within procfs */ if (vp != NULL && vp->v_tag == VT_PROCFS) { if (bpp) { *--bp = '/'; *bpp = bp; } vrele(vp); rw_exit(&cwdi->cwdi_lock); return; } if (rvp == NULL) rvp = rootvnode; if (vp == NULL || getcwd_common(vp, rvp, bp ? &bp : NULL, path, len / 2, 0, caller) != 0) { if (bpp) { bp = *bpp; *--bp = '/'; } } if (bpp) *bpp = bp; if (vp != NULL) vrele(vp); rw_exit(&cwdi->cwdi_lock); } /* * Invent attributes for pfsnode (vp) and store * them in (vap). * Directories lengths are returned as zero since * any real length would require the genuine size * to be computed, and nothing cares anyway. * * this is relatively minimal for procfs. */ int procfs_getattr(void *v) { struct vop_getattr_args /* { struct vnode *a_vp; struct vattr *a_vap; kauth_cred_t a_cred; } */ *ap = v; struct vnode *vp = ap->a_vp; struct pfsnode *pfs = VTOPFS(vp); struct vattr *vap = ap->a_vap; struct proc *procp; char *path, *bp, bf[16]; int error; /* first check the process still exists */ switch (pfs->pfs_type) { case PFSroot: case PFScurproc: case PFSself: procp = NULL; break; default: error = procfs_proc_lock(vp->v_mount, pfs->pfs_pid, &procp, ENOENT); if (error != 0) return (error); break; } switch (pfs->pfs_type) { case PFStask: if (pfs->pfs_fd == -1) { path = NULL; break; } /*FALLTHROUGH*/ case PFScwd: case PFSchroot: path = malloc(MAXPATHLEN + 4, M_TEMP, M_WAITOK); if (path == NULL && procp != NULL) { procfs_proc_unlock(procp); return (ENOMEM); } break; default: path = NULL; break; } if (procp != NULL) { mutex_enter(procp->p_lock); error = kauth_authorize_process(kauth_cred_get(), KAUTH_PROCESS_CANSEE, procp, KAUTH_ARG(KAUTH_REQ_PROCESS_CANSEE_ENTRY), NULL, NULL); mutex_exit(procp->p_lock); if (error != 0) { procfs_proc_unlock(procp); if (path != NULL) free(path, M_TEMP); return (ENOENT); } } error = 0; /* start by zeroing out the attributes */ vattr_null(vap); /* next do all the common fields */ vap->va_type = ap->a_vp->v_type; vap->va_mode = pfs->pfs_mode; vap->va_fileid = pfs->pfs_fileno; vap->va_flags = 0; vap->va_blocksize = PAGE_SIZE; /* * Make all times be current TOD. * * It would be possible to get the process start * time from the p_stats structure, but there's * no "file creation" time stamp anyway, and the * p_stats structure is not addressable if u. gets * swapped out for that process. 
*/ getnanotime(&vap->va_ctime); vap->va_atime = vap->va_mtime = vap->va_ctime; if (procp) TIMEVAL_TO_TIMESPEC(&procp->p_stats->p_start, &vap->va_birthtime); else getnanotime(&vap->va_birthtime); switch (pfs->pfs_type) { case PFSmem: case PFSregs: case PFSfpregs: #if defined(__HAVE_PROCFS_MACHDEP) && defined(PROCFS_MACHDEP_PROTECT_CASES) PROCFS_MACHDEP_PROTECT_CASES #endif /* * If the process has exercised some setuid or setgid * privilege, then rip away read/write permission so * that only root can gain access. */ if (procp->p_flag & PK_SUGID) vap->va_mode &= ~(S_IRUSR|S_IWUSR); /* FALLTHROUGH */ case PFSstatus: case PFSstat: case PFSnote: case PFSnotepg: case PFScmdline: case PFSenviron: case PFSemul: case PFSstatm: case PFSmap: case PFSmaps: case PFSlimit: case PFSauxv: vap->va_nlink = 1; vap->va_uid = kauth_cred_geteuid(procp->p_cred); vap->va_gid = kauth_cred_getegid(procp->p_cred); break; case PFScwd: case PFSchroot: case PFSmeminfo: case PFSdevices: case PFScpuinfo: case PFSuptime: case PFSmounts: case PFScpustat: case PFSloadavg: case PFSversion: case PFSexe: case PFSself: case PFScurproc: case PFSroot: vap->va_nlink = 1; vap->va_uid = vap->va_gid = 0; break; case PFSproc: case PFStask: case PFSfile: case PFSfd: break; default: panic("%s: %d/1", __func__, pfs->pfs_type); } /* * now do the object specific fields * * The size could be set from struct reg, but it's hardly * worth the trouble, and it puts some (potentially) machine * dependent data into this machine-independent code. If it * becomes important then this function should break out into * a per-file stat function in the corresponding .c file. */ switch (pfs->pfs_type) { case PFSroot: vap->va_bytes = vap->va_size = DEV_BSIZE; break; case PFSself: case PFScurproc: vap->va_bytes = vap->va_size = snprintf(bf, sizeof(bf), "%ld", (long)curproc->p_pid); break; case PFStask: if (pfs->pfs_fd != -1) { vap->va_nlink = 1; vap->va_uid = 0; vap->va_gid = 0; vap->va_bytes = vap->va_size = snprintf(bf, sizeof(bf), ".."); break; } /*FALLTHROUGH*/ case PFSfd: if (pfs->pfs_fd != -1) { file_t *fp; fp = fd_getfile2(procp, pfs->pfs_fd); if (fp == NULL) { error = EBADF; break; } vap->va_nlink = 1; vap->va_uid = kauth_cred_geteuid(fp->f_cred); vap->va_gid = kauth_cred_getegid(fp->f_cred); switch (fp->f_type) { case DTYPE_VNODE: vap->va_bytes = vap->va_size = fp->f_vnode->v_size; break; default: vap->va_bytes = vap->va_size = 0; break; } closef(fp); break; } /*FALLTHROUGH*/ case PFSproc: vap->va_nlink = 2; vap->va_uid = kauth_cred_geteuid(procp->p_cred); vap->va_gid = kauth_cred_getegid(procp->p_cred); vap->va_bytes = vap->va_size = DEV_BSIZE; break; case PFSfile: error = EOPNOTSUPP; break; case PFSmem: vap->va_bytes = vap->va_size = ctob(procp->p_vmspace->vm_tsize + procp->p_vmspace->vm_dsize + procp->p_vmspace->vm_ssize); break; case PFSauxv: vap->va_bytes = vap->va_size = procp->p_execsw->es_arglen; break; #if defined(PT_GETREGS) || defined(PT_SETREGS) case PFSregs: vap->va_bytes = vap->va_size = sizeof(struct reg); break; #endif #if defined(PT_GETFPREGS) || defined(PT_SETFPREGS) case PFSfpregs: vap->va_bytes = vap->va_size = sizeof(struct fpreg); break; #endif case PFSstatus: case PFSstat: case PFSnote: case PFSnotepg: case PFScmdline: case PFSenviron: case PFSmeminfo: case PFSdevices: case PFScpuinfo: case PFSuptime: case PFSmounts: case PFScpustat: case PFSloadavg: case PFSstatm: case PFSversion: vap->va_bytes = vap->va_size = 0; break; case PFSlimit: case PFSmap: case PFSmaps: /* * Advise a larger blocksize for the map files, so that * they 
may be read in one pass. */ vap->va_blocksize = 4 * PAGE_SIZE; vap->va_bytes = vap->va_size = 0; break; case PFScwd: case PFSchroot: bp = path + MAXPATHLEN; *--bp = '\0'; procfs_dir(pfs->pfs_type, curlwp, procp, &bp, path, MAXPATHLEN); vap->va_bytes = vap->va_size = strlen(bp); break; case PFSexe: vap->va_bytes = vap->va_size = strlen(procp->p_path); break; case PFSemul: vap->va_bytes = vap->va_size = strlen(procp->p_emul->e_name); break; #ifdef __HAVE_PROCFS_MACHDEP PROCFS_MACHDEP_NODETYPE_CASES error = procfs_machdep_getattr(ap->a_vp, vap, procp); break; #endif default: panic("%s: %d/2", __func__, pfs->pfs_type); } if (procp != NULL) procfs_proc_unlock(procp); if (path != NULL) free(path, M_TEMP); return (error); } /*ARGSUSED*/ int procfs_setattr(void *v) { /* * just fake out attribute setting * it's not good to generate an error * return, otherwise things like creat() * will fail when they try to set the * file length to 0. worse, this means * that echo $note > /proc/$pid/note will fail. */ return (0); } /* * implement access checking. * * actually, the check for super-user is slightly * broken since it will allow read access to write-only * objects. this doesn't cause any particular trouble * but does mean that the i/o entry points need to check * that the operation really does make sense. */ int procfs_access(void *v) { struct vop_access_args /* { struct vnode *a_vp; accmode_t a_accmode; kauth_cred_t a_cred; } */ *ap = v; struct vattr va; int error; if ((error = VOP_GETATTR(ap->a_vp, &va, ap->a_cred)) != 0) return (error); return kauth_authorize_vnode(ap->a_cred, KAUTH_ACCESS_ACTION(ap->a_accmode, ap->a_vp->v_type, va.va_mode), ap->a_vp, NULL, genfs_can_access(ap->a_vp, ap->a_cred, va.va_uid, va.va_gid, va.va_mode, NULL, ap->a_accmode)); } /* * lookup. this is incredibly complicated in the * general case, however for most pseudo-filesystems * very little needs to be done. * * Locking isn't hard here, just poorly documented. * * If we're looking up ".", just vref the parent & return it. * * If we're looking up "..", unlock the parent, and lock "..". If everything * went ok, and we're on the last component and the caller requested the * parent locked, try to re-lock the parent. We do this to prevent lock * races. * * For anything else, get the needed node. Then unlock the parent if not * the last component or not LOCKPARENT (i.e. if we wouldn't re-lock the * parent in the .. case). * * We try to exit with the parent locked in error cases. */ int procfs_lookup(void *v) { struct vop_lookup_v2_args /* { struct vnode * a_dvp; struct vnode ** a_vpp; struct componentname * a_cnp; } */ *ap = v; struct componentname *cnp = ap->a_cnp; struct vnode **vpp = ap->a_vpp; struct vnode *dvp = ap->a_dvp; const char *pname = cnp->cn_nameptr; const struct proc_target *pt = NULL; struct vnode *fvp; pid_t pid, vnpid; struct pfsnode *pfs; struct proc *p = NULL; struct lwp *plwp; int i, error; pfstype type; *vpp = NULL; if ((error = VOP_ACCESS(dvp, VEXEC, cnp->cn_cred)) != 0) return (error); if (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME) return (EROFS); if (cnp->cn_namelen == 1 && *pname == '.') { *vpp = dvp; vref(dvp); return (0); } pfs = VTOPFS(dvp); switch (pfs->pfs_type) { case PFSroot: /* * Shouldn't get here with .. in the root node. */ if (cnp->cn_flags & ISDOTDOT) return (EIO); for (i = 0; i < nproc_root_targets; i++) { pt = &proc_root_targets[i]; /* * check for node match. proc is always NULL here, * so call pt_valid with constant NULL lwp. 
*/ if (cnp->cn_namelen == pt->pt_namlen && memcmp(pt->pt_name, pname, cnp->cn_namelen) == 0 && (pt->pt_valid == NULL || (*pt->pt_valid)(NULL, dvp->v_mount))) break; } if (i != nproc_root_targets) { error = procfs_allocvp(dvp->v_mount, vpp, 0, pt->pt_pfstype, -1); return (error); } if (CNEQ(cnp, "curproc", 7)) { pid = curproc->p_pid; vnpid = 0; type = PFScurproc; } else if (CNEQ(cnp, "self", 4)) { pid = curproc->p_pid; vnpid = 0; type = PFSself; } else { pid = (pid_t)atoi(pname, cnp->cn_namelen); vnpid = pid; type = PFSproc; } if (procfs_proc_lock(dvp->v_mount, pid, &p, ESRCH) != 0) break; error = procfs_allocvp(dvp->v_mount, vpp, vnpid, type, -1); procfs_proc_unlock(p); return (error); case PFSproc: if (cnp->cn_flags & ISDOTDOT) { error = procfs_allocvp(dvp->v_mount, vpp, 0, PFSroot, -1); return (error); } if (procfs_proc_lock(dvp->v_mount, pfs->pfs_pid, &p, ESRCH) != 0) break; mutex_enter(p->p_lock); LIST_FOREACH(plwp, &p->p_lwps, l_sibling) { if (plwp->l_stat != LSZOMB) break; } /* Process is exiting if no-LWPS or all LWPs are LSZOMB */ if (plwp == NULL) { mutex_exit(p->p_lock); procfs_proc_unlock(p); return ESRCH; } lwp_addref(plwp); mutex_exit(p->p_lock); for (pt = proc_targets, i = 0; i < nproc_targets; pt++, i++) { int found; found = cnp->cn_namelen == pt->pt_namlen && memcmp(pt->pt_name, pname, cnp->cn_namelen) == 0 && (pt->pt_valid == NULL || (*pt->pt_valid)(plwp, dvp->v_mount)); if (found) break; } lwp_delref(plwp); if (i == nproc_targets) { procfs_proc_unlock(p); break; } if (pt->pt_pfstype == PFSfile) { fvp = p->p_textvp; /* We already checked that it exists. */ vref(fvp); procfs_proc_unlock(p); *vpp = fvp; return (0); } error = procfs_allocvp(dvp->v_mount, vpp, pfs->pfs_pid, pt->pt_pfstype, -1); procfs_proc_unlock(p); return (error); case PFSfd: { int fd; file_t *fp; if ((error = procfs_proc_lock(dvp->v_mount, pfs->pfs_pid, &p, ENOENT)) != 0) return error; if (cnp->cn_flags & ISDOTDOT) { error = procfs_allocvp(dvp->v_mount, vpp, pfs->pfs_pid, PFSproc, -1); procfs_proc_unlock(p); return (error); } fd = atoi(pname, cnp->cn_namelen); fp = fd_getfile2(p, fd); if (fp == NULL) { procfs_proc_unlock(p); return ENOENT; } fvp = fp->f_vnode; /* Don't show directories */ if (fp->f_type == DTYPE_VNODE && fvp->v_type != VDIR && !procfs_proc_is_linux_compat()) { vref(fvp); closef(fp); procfs_proc_unlock(p); *vpp = fvp; return 0; } closef(fp); error = procfs_allocvp(dvp->v_mount, vpp, pfs->pfs_pid, PFSfd, fd); procfs_proc_unlock(p); return error; } case PFStask: { int xpid; if ((error = procfs_proc_lock(dvp->v_mount, pfs->pfs_pid, &p, ENOENT)) != 0) return error; if (cnp->cn_flags & ISDOTDOT) { error = procfs_allocvp(dvp->v_mount, vpp, pfs->pfs_pid, PFSproc, -1); procfs_proc_unlock(p); return (error); } xpid = atoi(pname, cnp->cn_namelen); if (xpid != pfs->pfs_pid) { procfs_proc_unlock(p); return ENOENT; } error = procfs_allocvp(dvp->v_mount, vpp, pfs->pfs_pid, PFStask, 0); procfs_proc_unlock(p); return error; } default: return (ENOTDIR); } return (cnp->cn_nameiop == LOOKUP ? 
ENOENT : EROFS); } int procfs_validfile(struct lwp *l, struct mount *mp) { return l != NULL && l->l_proc != NULL && l->l_proc->p_textvp != NULL; } static int procfs_validfile_linux(struct lwp *l, struct mount *mp) { return procfs_use_linux_compat(mp) && (l == NULL || l->l_proc == NULL || procfs_validfile(l, mp)); } struct procfs_root_readdir_ctx { struct uio *uiop; off_t *cookies; int ncookies; off_t off; off_t startoff; int error; }; static int procfs_root_readdir_callback(struct proc *p, void *arg) { struct procfs_root_readdir_ctx *ctxp = arg; struct dirent d; struct uio *uiop; int error; uiop = ctxp->uiop; if (uiop->uio_resid < UIO_MX) return -1; /* no space */ if (kauth_authorize_process(kauth_cred_get(), KAUTH_PROCESS_CANSEE, p, KAUTH_ARG(KAUTH_REQ_PROCESS_CANSEE_ENTRY), NULL, NULL) != 0) return 0; if (ctxp->off < ctxp->startoff) { ctxp->off++; return 0; } memset(&d, 0, UIO_MX); d.d_reclen = UIO_MX; d.d_fileno = PROCFS_FILENO(p->p_pid, PFSproc, -1); d.d_namlen = snprintf(d.d_name, UIO_MX - offsetof(struct dirent, d_name), "%ld", (long)p->p_pid); d.d_type = DT_DIR; mutex_exit(&proc_lock); error = uiomove(&d, UIO_MX, uiop); mutex_enter(&proc_lock); if (error) { ctxp->error = error; return -1; } ctxp->ncookies++; if (ctxp->cookies) *(ctxp->cookies)++ = ctxp->off + 1; ctxp->off++; return 0; } /* * readdir returns directory entries from pfsnode (vp). * * the strategy here with procfs is to generate a single * directory entry at a time (struct dirent) and then * copy that out to userland using uiomove. a more efficient * though more complex implementation, would try to minimize * the number of calls to uiomove(). for procfs, this is * hardly worth the added code complexity. * * this should just be done through read() */ int procfs_readdir(void *v) { struct vop_readdir_args /* { struct vnode *a_vp; struct uio *a_uio; kauth_cred_t a_cred; int *a_eofflag; off_t **a_cookies; int *a_ncookies; } */ *ap = v; struct uio *uio = ap->a_uio; struct dirent d; struct pfsnode *pfs; off_t i; int error; off_t *cookies = NULL; int ncookies; struct vnode *vp; const struct proc_target *pt; struct procfs_root_readdir_ctx ctx; struct proc *p = NULL; struct lwp *l; int nfd; int nc = 0; vp = ap->a_vp; pfs = VTOPFS(vp); if (uio->uio_resid < UIO_MX) return (EINVAL); if (uio->uio_offset < 0) return (EINVAL); error = 0; i = uio->uio_offset; memset(&d, 0, UIO_MX); d.d_reclen = UIO_MX; ncookies = uio->uio_resid / UIO_MX; switch (pfs->pfs_type) { /* * this is for the process-specific sub-directories. * all that is needed to is copy out all the entries * from the procent[] table (top of this file). 
*/ case PFSproc: { if (i >= nproc_targets) return 0; if (procfs_proc_lock(vp->v_mount, pfs->pfs_pid, &p, ESRCH) != 0) break; if (ap->a_ncookies) { ncookies = uimin(ncookies, (nproc_targets - i)); cookies = malloc(ncookies * sizeof (off_t), M_TEMP, M_WAITOK); *ap->a_cookies = cookies; } for (pt = &proc_targets[i]; uio->uio_resid >= UIO_MX && i < nproc_targets; pt++, i++) { if (pt->pt_valid) { /* XXXSMP LWP can disappear */ mutex_enter(p->p_lock); l = LIST_FIRST(&p->p_lwps); KASSERT(l != NULL); mutex_exit(p->p_lock); if ((*pt->pt_valid)(l, vp->v_mount) == 0) continue; } d.d_fileno = PROCFS_FILENO(pfs->pfs_pid, pt->pt_pfstype, -1); d.d_namlen = pt->pt_namlen; memcpy(d.d_name, pt->pt_name, pt->pt_namlen + 1); d.d_type = pt->pt_type; if ((error = uiomove(&d, UIO_MX, uio)) != 0) break; if (cookies) *cookies++ = i + 1; } procfs_proc_unlock(p); break; } case PFSfd: { file_t *fp; int lim; if ((error = procfs_proc_lock(vp->v_mount, pfs->pfs_pid, &p, ESRCH)) != 0) return error; /* XXX Should this be by file as well? */ if (kauth_authorize_process(kauth_cred_get(), KAUTH_PROCESS_CANSEE, p, KAUTH_ARG(KAUTH_REQ_PROCESS_CANSEE_OPENFILES), NULL, NULL) != 0) { procfs_proc_unlock(p); return ESRCH; } nfd = atomic_load_consume(&p->p_fd->fd_dt)->dt_nfiles; lim = uimin((int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur, maxfiles); if (i >= lim) { procfs_proc_unlock(p); return 0; } if (ap->a_ncookies) { ncookies = uimin(ncookies, (nfd + 2 - i)); cookies = malloc(ncookies * sizeof (off_t), M_TEMP, M_WAITOK); *ap->a_cookies = cookies; } for (; i < 2 && uio->uio_resid >= UIO_MX; i++) { pt = &proc_targets[i]; d.d_namlen = pt->pt_namlen; d.d_fileno = PROCFS_FILENO(pfs->pfs_pid, pt->pt_pfstype, -1); (void)memcpy(d.d_name, pt->pt_name, pt->pt_namlen + 1); d.d_type = pt->pt_type; if ((error = uiomove(&d, UIO_MX, uio)) != 0) break; if (cookies) *cookies++ = i + 1; nc++; } if (error) goto out; for (; uio->uio_resid >= UIO_MX && i < nfd; i++) { /* check the descriptor exists */ if ((fp = fd_getfile2(p, i - 2)) == NULL) continue; closef(fp); d.d_fileno = PROCFS_FILENO(pfs->pfs_pid, PFSfd, i - 2); d.d_namlen = snprintf(d.d_name, sizeof(d.d_name), "%lld", (long long)(i - 2)); d.d_type = fttodt(fp); if ((error = uiomove(&d, UIO_MX, uio)) != 0) break; if (cookies) *cookies++ = i + 1; nc++; } goto out; } case PFStask: { if ((error = procfs_proc_lock(vp->v_mount, pfs->pfs_pid, &p, ESRCH)) != 0) return error; nfd = 3; /* ., .., pid */ if (ap->a_ncookies) { ncookies = uimin(ncookies, (nfd + 2 - i)); cookies = malloc(ncookies * sizeof (off_t), M_TEMP, M_WAITOK); *ap->a_cookies = cookies; } for (; i < 2 && uio->uio_resid >= UIO_MX; i++) { pt = &proc_targets[i]; d.d_namlen = pt->pt_namlen; d.d_fileno = PROCFS_FILENO(pfs->pfs_pid, pt->pt_pfstype, -1); (void)memcpy(d.d_name, pt->pt_name, pt->pt_namlen + 1); d.d_type = pt->pt_type; if ((error = uiomove(&d, UIO_MX, uio)) != 0) break; if (cookies) *cookies++ = i + 1; nc++; } if (error) goto out; for (; uio->uio_resid >= UIO_MX && i < nfd; i++) { /* check the descriptor exists */ d.d_fileno = PROCFS_FILENO(pfs->pfs_pid, PFStask, i - 2); d.d_namlen = snprintf(d.d_name, sizeof(d.d_name), "%ld", (long)pfs->pfs_pid); d.d_type = DT_LNK; if ((error = uiomove(&d, UIO_MX, uio)) != 0) break; if (cookies) *cookies++ = i + 1; nc++; } goto out; } /* * this is for the root of the procfs filesystem * what is needed are special entries for "curproc" * and "self" followed by an entry for each process * on allproc. 
*/ case PFSroot: { if (ap->a_ncookies) { /* * XXX Potentially allocating too much space here, * but I'm lazy. This loop needs some work. */ cookies = malloc(ncookies * sizeof (off_t), M_TEMP, M_WAITOK); *ap->a_cookies = cookies; } /* 0 ... 3 are static entries. */ for (; i <= 3 && uio->uio_resid >= UIO_MX; i++) { switch (i) { case 0: /* `.' */ case 1: /* `..' */ d.d_fileno = PROCFS_FILENO(0, PFSroot, -1); d.d_namlen = i + 1; memcpy(d.d_name, "..", d.d_namlen); d.d_name[i + 1] = '\0'; d.d_type = DT_DIR; break; case 2: d.d_fileno = PROCFS_FILENO(0, PFScurproc, -1); d.d_namlen = sizeof("curproc") - 1; memcpy(d.d_name, "curproc", sizeof("curproc")); d.d_type = DT_LNK; break; case 3: d.d_fileno = PROCFS_FILENO(0, PFSself, -1); d.d_namlen = sizeof("self") - 1; memcpy(d.d_name, "self", sizeof("self")); d.d_type = DT_LNK; break; } if ((error = uiomove(&d, UIO_MX, uio)) != 0) break; nc++; if (cookies) *cookies++ = i + 1; } if (error) break; /* 4 ... are process entries. */ ctx.uiop = uio; ctx.error = 0; ctx.off = 4; ctx.startoff = i; ctx.cookies = cookies; ctx.ncookies = nc; proclist_foreach_call(&allproc, procfs_root_readdir_callback, &ctx); cookies = ctx.cookies; nc = ctx.ncookies; error = ctx.error; if (error) break; /* misc entries. */ if (i < ctx.off) i = ctx.off; if (i >= ctx.off + nproc_root_targets) break; error = procfs_proc_lock(vp->v_mount, pfs->pfs_pid, &p, ESRCH); if (error) break; for (pt = &proc_root_targets[i - ctx.off]; uio->uio_resid >= UIO_MX && pt < &proc_root_targets[nproc_root_targets]; pt++, i++) { if (pt->pt_valid && (*pt->pt_valid)(NULL, vp->v_mount) == 0) continue; if (kauth_authorize_process(kauth_cred_get(), KAUTH_PROCESS_CANSEE, p, KAUTH_ARG(KAUTH_REQ_PROCESS_CANSEE_ENTRY), NULL, NULL) != 0) continue; d.d_fileno = PROCFS_FILENO(0, pt->pt_pfstype, -1); d.d_namlen = pt->pt_namlen; memcpy(d.d_name, pt->pt_name, pt->pt_namlen + 1); d.d_type = pt->pt_type; if ((error = uiomove(&d, UIO_MX, uio)) != 0) break; nc++; if (cookies) *cookies++ = i + 1; } out: KASSERT(p != NULL); ncookies = nc; procfs_proc_unlock(p); break; } default: error = ENOTDIR; break; } if (ap->a_ncookies) { if (error) { if (cookies) free(*ap->a_cookies, M_TEMP); *ap->a_ncookies = 0; *ap->a_cookies = NULL; } else *ap->a_ncookies = ncookies; } uio->uio_offset = i; return (error); } /* * readlink reads the link of `curproc' and others */ int procfs_readlink(void *v) { struct vop_readlink_args *ap = v; char bf[16]; /* should be enough */ char *bp = bf; char *path = NULL; int len = 0; int error = 0; struct vnode *vp = ap->a_vp; struct pfsnode *pfs = VTOPFS(vp); struct proc *pown = NULL; if (pfs->pfs_fileno == PROCFS_FILENO(0, PFScurproc, -1)) len = snprintf(bf, sizeof(bf), "%ld", (long)curproc->p_pid); else if (pfs->pfs_fileno == PROCFS_FILENO(0, PFSself, -1)) len = snprintf(bf, sizeof(bf), "%s", "curproc"); else if (pfs->pfs_fileno == PROCFS_FILENO(pfs->pfs_pid, PFStask, 0)) len = snprintf(bf, sizeof(bf), ".."); else if (pfs->pfs_fileno == PROCFS_FILENO(pfs->pfs_pid, PFSexe, -1)) { if ((error = procfs_proc_lock(vp->v_mount, pfs->pfs_pid, &pown, ESRCH)) != 0) return error; bp = pown->p_path; len = strlen(bp); } else if (pfs->pfs_fileno == PROCFS_FILENO(pfs->pfs_pid, PFScwd, -1) || pfs->pfs_fileno == PROCFS_FILENO(pfs->pfs_pid, PFSchroot, -1)) { if ((error = procfs_proc_lock(vp->v_mount, pfs->pfs_pid, &pown, ESRCH)) != 0) return error; path = malloc(MAXPATHLEN + 4, M_TEMP, M_WAITOK); if (path == NULL) { procfs_proc_unlock(pown); return (ENOMEM); } bp = path + MAXPATHLEN; *--bp = '\0'; 
procfs_dir(PROCFS_TYPE(pfs->pfs_fileno), curlwp, pown, &bp, path, MAXPATHLEN); len = strlen(bp); } else { file_t *fp; struct vnode *vxp; if ((error = procfs_proc_lock(vp->v_mount, pfs->pfs_pid, &pown, ESRCH)) != 0) return error; fp = fd_getfile2(pown, pfs->pfs_fd); if (fp == NULL) { procfs_proc_unlock(pown); return EBADF; } switch (fp->f_type) { case DTYPE_VNODE: vxp = fp->f_vnode; if (vxp->v_type != VDIR && !procfs_proc_is_linux_compat()) { error = EINVAL; break; } if ((path = malloc(MAXPATHLEN, M_TEMP, M_WAITOK)) == NULL) { error = ENOMEM; break; } bp = path + MAXPATHLEN; *--bp = '\0'; /* * XXX: kludge to avoid locking against ourselves * in getcwd() */ if (vxp->v_tag == VT_PROCFS) { *--bp = '/'; } else { rw_enter(&curproc->p_cwdi->cwdi_lock, RW_READER); vp = curproc->p_cwdi->cwdi_rdir; if (vp == NULL) vp = rootvnode; error = getcwd_common(vxp, vp, &bp, path, MAXPATHLEN / 2, 0, curlwp); rw_exit(&curproc->p_cwdi->cwdi_lock); } if (error) break; len = strlen(bp); break; case DTYPE_MISC: len = snprintf(bf, sizeof(bf), "%s", "[misc]"); break; case DTYPE_KQUEUE: len = snprintf(bf, sizeof(bf), "%s", "[kqueue]"); break; case DTYPE_SEM: len = snprintf(bf, sizeof(bf), "%s", "[ksem]"); break; default: error = EINVAL; break; } closef(fp); } if (error == 0) error = uiomove(bp, len, ap->a_uio); if (pown) procfs_proc_unlock(pown); if (path) free(path, M_TEMP); return error; } int procfs_getpages(void *v) { struct vop_getpages_args /* { struct vnode *a_vp; voff_t a_offset; struct vm_page **a_m; int *a_count; int a_centeridx; vm_prot_t a_access_type; int a_advice; int a_flags; } */ *ap = v; if ((ap->a_flags & PGO_LOCKED) == 0) rw_exit(ap->a_vp->v_uobj.vmobjlock); return (EFAULT); } /* * convert decimal ascii to int */ static int atoi(const char *b, size_t len) { int p = 0; while (len--) { char c = *b++; if (c < '0' || c > '9') return -1; p = 10 * p + (c - '0'); } return p; } /** * convert DTYPE_XXX to corresponding DT_XXX * matching what procfs_loadvnode() does. */ static uint8_t fttodt(file_t *fp) { switch (fp->f_type) { case DTYPE_VNODE: switch (fp->f_vnode->v_type) { case VREG: return DT_REG; case VDIR: return DT_LNK; /* symlink */ case VBLK: return DT_BLK; case VCHR: return DT_CHR; case VLNK: return DT_LNK; case VSOCK: return DT_SOCK; case VFIFO: return DT_FIFO; default: return DT_UNKNOWN; } case DTYPE_PIPE: return DT_FIFO; case DTYPE_SOCKET: return DT_SOCK; case DTYPE_KQUEUE: /*FALLTHROUGH*/ case DTYPE_MISC: /*FALLTHROUGH*/ case DTYPE_SEM: return DT_LNK; /* symlinks */ default: return DT_UNKNOWN; } }
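The vnode operations above are what a userland process exercises when it walks a mounted procfs: procfs_lookup() resolves names such as "curproc", "self", a numeric pid, or one of the proc_targets entries; procfs_readlink() produces the text of the cwd/root/exe/fd links; procfs_rw() serves reads of the regular files; and procfs_readdir() enumerates the per-process entries. The fragment below is an illustrative userland sketch, not part of procfs_vnops.c; it assumes a procfs instance mounted at /proc and does only minimal error handling.

/*
 * Illustrative only: a minimal userland program exercising the procfs
 * vnode operations defined above.  The /proc mount point is an
 * assumption; the entry names come from the proc_targets tables.
 */
#include <sys/types.h>
#include <dirent.h>
#include <fcntl.h>
#include <limits.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	char buf[PATH_MAX];
	ssize_t n;

	/* "curproc" is resolved by procfs_lookup()/procfs_readlink(). */
	n = readlink("/proc/curproc", buf, sizeof(buf) - 1);
	if (n > 0) {
		buf[n] = '\0';
		printf("curproc -> %s\n", buf);	/* the caller's pid as text */
	}

	/* Per-process regular files are read through procfs_rw(). */
	int fd = open("/proc/curproc/status", O_RDONLY);
	if (fd != -1) {
		char line[256];
		n = read(fd, line, sizeof(line) - 1);
		if (n > 0) {
			line[n] = '\0';
			printf("status: %s", line);
		}
		close(fd);
	}

	/* The fd/ subdirectory is enumerated by procfs_readdir(). */
	DIR *dp = opendir("/proc/curproc/fd");
	if (dp != NULL) {
		struct dirent *de;
		while ((de = readdir(dp)) != NULL)
			printf("fd entry: %s\n", de->d_name);
		closedir(dp);
	}
	return 0;
}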
/* $NetBSD: uvm_pager.c,v 1.131 2024/03/15 07:09:37 andvar Exp $ */ /* * Copyright (c) 1997 Charles D. Cranor and Washington University. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* * from: Id: uvm_pager.c,v 1.1.2.23 1998/02/02 20:38:06 chuck Exp */ /* * uvm_pager.c: generic functions used to assist the pagers. */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: uvm_pager.c,v 1.131 2024/03/15 07:09:37 andvar Exp $"); #include "opt_uvmhist.h" #include "opt_readahead.h" #include "opt_pagermap.h" #include <sys/param.h> #include <sys/systm.h> #include <sys/atomic.h> #include <sys/vnode.h> #include <sys/buf.h> #include <uvm/uvm.h> /* * XXX * this is needed until the device strategy interface * is changed to do physically-addressed i/o. */ #ifndef PAGER_MAP_DEFAULT_SIZE #define PAGER_MAP_DEFAULT_SIZE (16 * 1024 * 1024) #endif #ifndef PAGER_MAP_SIZE #define PAGER_MAP_SIZE PAGER_MAP_DEFAULT_SIZE #endif size_t pager_map_size = PAGER_MAP_SIZE; /* * list of uvm pagers in the system */ const struct uvm_pagerops * const uvmpagerops[] = { &aobj_pager, &uvm_deviceops, &uvm_vnodeops, &ubc_pager, }; /* * the pager map: provides KVA for I/O */ struct vm_map *pager_map; /* XXX */ kmutex_t pager_map_wanted_lock __cacheline_aligned; bool pager_map_wanted; /* locked by pager map */ static vaddr_t emergva; static int emerg_ncolors; static bool emerginuse; void uvm_pager_realloc_emerg(void) { vaddr_t new_emergva, old_emergva; int old_emerg_ncolors; if (__predict_true(emergva != 0 && emerg_ncolors >= uvmexp.ncolors)) return; KASSERT(!emerginuse); new_emergva = uvm_km_alloc(kernel_map, round_page(MAXPHYS) + ptoa(uvmexp.ncolors), ptoa(uvmexp.ncolors), UVM_KMF_VAONLY); KASSERT(new_emergva != 0); old_emergva = emergva; old_emerg_ncolors = emerg_ncolors; /* * don't support re-color in late boot anyway. */ if (0) /* XXX */ mutex_enter(&pager_map_wanted_lock); emergva = new_emergva; emerg_ncolors = uvmexp.ncolors; wakeup(&old_emergva); if (0) /* XXX */ mutex_exit(&pager_map_wanted_lock); if (old_emergva) uvm_km_free(kernel_map, old_emergva, round_page(MAXPHYS) + ptoa(old_emerg_ncolors), UVM_KMF_VAONLY); } /* * uvm_pager_init: init pagers (at boot time) */ void uvm_pager_init(void) { u_int lcv; vaddr_t sva, eva; /* * init pager map */ sva = 0; pager_map = uvm_km_suballoc(kernel_map, &sva, &eva, pager_map_size, 0, false, NULL); mutex_init(&pager_map_wanted_lock, MUTEX_DEFAULT, IPL_NONE); pager_map_wanted = false; uvm_pager_realloc_emerg(); /* * call pager init functions */ for (lcv = 0 ; lcv < __arraycount(uvmpagerops); lcv++) { if (uvmpagerops[lcv]->pgo_init) uvmpagerops[lcv]->pgo_init(); } } #ifdef PMAP_DIRECT /* * uvm_pagermapdirect: map a single page via the pmap's direct segment * * this is an abuse of pmap_direct_process(), since the kva is being grabbed * and no processing is taking place, but for now.. */ static int uvm_pagermapdirect(void *kva, size_t sz, void *cookie) { KASSERT(sz == PAGE_SIZE); *(vaddr_t *)cookie = (vaddr_t)kva; return 0; } #endif /* * uvm_pagermapin: map pages into KVA (pager_map) for I/O that needs mappings * * we basically just map in a blank map entry to reserve the space in the * map and then use pmap_enter() to put the mappings in by hand. */ vaddr_t uvm_pagermapin(struct vm_page **pps, int npages, int flags) { vsize_t size; vaddr_t kva; vaddr_t cva; struct vm_page *pp; vm_prot_t prot; const bool pdaemon = (curlwp == uvm.pagedaemon_lwp); const u_int first_color = VM_PGCOLOR(*pps); UVMHIST_FUNC(__func__); UVMHIST_CALLARGS(maphist,"(pps=%#jx, npages=%jd, first_color=%ju)", (uintptr_t)pps, npages, first_color, 0); #ifdef PMAP_DIRECT /* * for a single page the direct mapped segment can be used. 
*/ if (npages == 1) { int error __diagused; KASSERT((pps[0]->flags & PG_BUSY) != 0); error = pmap_direct_process(VM_PAGE_TO_PHYS(pps[0]), 0, PAGE_SIZE, uvm_pagermapdirect, &kva); KASSERT(error == 0); UVMHIST_LOG(maphist, "<- done, direct (KVA=%#jx)", kva,0,0,0); return kva; } #endif /* * compute protection. outgoing I/O only needs read * access to the page, whereas incoming needs read/write. */ prot = VM_PROT_READ; if (flags & UVMPAGER_MAPIN_READ) prot |= VM_PROT_WRITE; ReStart: size = ptoa(npages); kva = 0; /* let system choose VA */ if (uvm_map(pager_map, &kva, size, NULL, UVM_UNKNOWN_OFFSET, first_color, UVM_FLAG_COLORMATCH | UVM_FLAG_NOMERGE | (pdaemon ? UVM_FLAG_NOWAIT : 0)) != 0) { if (pdaemon) { mutex_enter(&pager_map_wanted_lock); if (emerginuse) { UVM_UNLOCK_AND_WAIT(&emergva, &pager_map_wanted_lock, false, "emergva", 0); goto ReStart; } emerginuse = true; mutex_exit(&pager_map_wanted_lock); kva = emergva + ptoa(first_color); /* The shift implicitly truncates to PAGE_SIZE */ KASSERT(npages <= (MAXPHYS >> PAGE_SHIFT)); goto enter; } if ((flags & UVMPAGER_MAPIN_WAITOK) == 0) { UVMHIST_LOG(maphist,"<- NOWAIT failed", 0,0,0,0); return(0); } mutex_enter(&pager_map_wanted_lock); pager_map_wanted = true; UVMHIST_LOG(maphist, " SLEEPING on pager_map",0,0,0,0); UVM_UNLOCK_AND_WAIT(pager_map, &pager_map_wanted_lock, false, "pager_map", 0); goto ReStart; } enter: /* got it */ for (cva = kva; npages != 0; npages--, cva += PAGE_SIZE) { pp = *pps++; KASSERT(pp); // KASSERT(!((VM_PAGE_TO_PHYS(pp) ^ cva) & uvmexp.colormask)); KASSERT(pp->flags & PG_BUSY); pmap_kenter_pa(cva, VM_PAGE_TO_PHYS(pp), prot, 0); } pmap_update(vm_map_pmap(pager_map)); UVMHIST_LOG(maphist, "<- done (KVA=%#jx)", kva,0,0,0); return(kva); } /* * uvm_pagermapout: remove pager_map mapping * * we remove our mappings by hand and then remove the mapping (waking * up anyone wanting space). */ void uvm_pagermapout(vaddr_t kva, int npages) { vsize_t size = ptoa(npages); struct vm_map_entry *entries; UVMHIST_FUNC(__func__); UVMHIST_CALLARGS(maphist, " (kva=%#jx, npages=%jd)", kva, npages,0,0); #ifdef PMAP_DIRECT /* * solitary pages are mapped directly. */ if (npages == 1) { UVMHIST_LOG(maphist,"<- done, direct", 0,0,0,0); return; } #endif /* * duplicate uvm_unmap, but add in pager_map_wanted handling. 
*/ pmap_kremove(kva, size); pmap_update(pmap_kernel()); if ((kva & ~ptoa(uvmexp.colormask)) == emergva) { mutex_enter(&pager_map_wanted_lock); KASSERT(emerginuse); emerginuse = false; wakeup(&emergva); mutex_exit(&pager_map_wanted_lock); return; } vm_map_lock(pager_map); uvm_unmap_remove(pager_map, kva, kva + size, &entries, 0); mutex_enter(&pager_map_wanted_lock); if (pager_map_wanted) { pager_map_wanted = false; wakeup(pager_map); } mutex_exit(&pager_map_wanted_lock); vm_map_unlock(pager_map); if (entries) uvm_unmap_detach(entries, 0); UVMHIST_LOG(maphist,"<- done",0,0,0,0); } void uvm_aio_aiodone_pages(struct vm_page **pgs, int npages, bool write, int error) { struct uvm_object *uobj; struct vm_page *pg; krwlock_t *slock; int pageout_done; /* number of PG_PAGEOUT pages processed */ int swslot __unused; /* used for VMSWAP */ int i; bool swap; UVMHIST_FUNC(__func__); UVMHIST_CALLED(ubchist); swslot = 0; pageout_done = 0; slock = NULL; uobj = NULL; pg = pgs[0]; swap = (pg->uanon != NULL && pg->uobject == NULL) || (pg->flags & PG_AOBJ) != 0; if (!swap) { uobj = pg->uobject; slock = uobj->vmobjlock; rw_enter(slock, RW_WRITER); } else { #if defined(VMSWAP) if (error) { if (pg->uobject != NULL) { swslot = uao_find_swslot(pg->uobject, pg->offset >> PAGE_SHIFT); } else { KASSERT(pg->uanon != NULL); swslot = pg->uanon->an_swslot; } KASSERT(swslot); } #else /* defined(VMSWAP) */ panic("%s: swap", __func__); #endif /* defined(VMSWAP) */ } for (i = 0; i < npages; i++) { #if defined(VMSWAP) bool anon_disposed = false; /* XXX gcc */ #endif /* defined(VMSWAP) */ pg = pgs[i]; KASSERT(swap || pg->uobject == uobj); UVMHIST_LOG(ubchist, "pg %#jx", (uintptr_t)pg, 0,0,0); #if defined(VMSWAP) /* * for swap i/os, lock each page's object (or anon) * individually since each page may need a different lock. */ if (swap) { if (pg->uobject != NULL) { slock = pg->uobject->vmobjlock; } else { slock = pg->uanon->an_lock; } rw_enter(slock, RW_WRITER); anon_disposed = (pg->flags & PG_RELEASED) != 0; KASSERT(!anon_disposed || pg->uobject != NULL || pg->uanon->an_ref == 0); } #endif /* defined(VMSWAP) */ if (write && uobj != NULL) { KASSERT(uvm_obj_page_writeback_p(pg)); uvm_obj_page_clear_writeback(pg); } /* * process errors. for reads, just mark the page to be freed. * for writes, if the error was ENOMEM, we assume this was * a transient failure so we mark the page dirty so that * we'll try to write it again later. for all other write * errors, we assume the error is permanent, thus the data * in the page is lost. bummer. */ if (error) { int slot __unused; /* used for VMSWAP */ if (!write) { pg->flags |= PG_RELEASED; continue; } else if (error == ENOMEM) { if (pg->flags & PG_PAGEOUT) { pg->flags &= ~PG_PAGEOUT; pageout_done++; } uvm_pagemarkdirty(pg, UVM_PAGE_STATUS_DIRTY); uvm_pagelock(pg); uvm_pageactivate(pg); uvm_pageunlock(pg); slot = 0; } else slot = SWSLOT_BAD; #if defined(VMSWAP) if (swap) { if (pg->uobject != NULL) { int oldslot __diagused; oldslot = uao_set_swslot(pg->uobject, pg->offset >> PAGE_SHIFT, slot); KASSERT(oldslot == swslot + i); } else { KASSERT(pg->uanon->an_swslot == swslot + i); pg->uanon->an_swslot = slot; } } #endif /* defined(VMSWAP) */ } /* * if the page is PG_FAKE, this must have been a read to * initialize the page. clear PG_FAKE and activate the page. 
*/ if (pg->flags & PG_FAKE) { KASSERT(!write); pg->flags &= ~PG_FAKE; #if defined(READAHEAD_STATS) pg->flags |= PG_READAHEAD; uvm_ra_total.ev_count++; #endif /* defined(READAHEAD_STATS) */ KASSERT(uvm_pagegetdirty(pg) == UVM_PAGE_STATUS_CLEAN); uvm_pagelock(pg); uvm_pageenqueue(pg); uvm_pageunlock(pg); } #if defined(VMSWAP) /* * for swap pages, unlock everything for this page now. */ if (swap) { if (pg->uobject == NULL && anon_disposed) { uvm_anon_release(pg->uanon); } else { uvm_page_unbusy(&pg, 1); rw_exit(slock); } } #endif /* defined(VMSWAP) */ } if (pageout_done != 0) { uvm_pageout_done(pageout_done); } if (!swap) { uvm_page_unbusy(pgs, npages); rw_exit(slock); } else { #if defined(VMSWAP) KASSERT(write); /* these pages are now only in swap. */ if (error != ENOMEM) { atomic_add_int(&uvmexp.swpgonly, npages); } if (error) { if (error != ENOMEM) uvm_swap_markbad(swslot, npages); else uvm_swap_free(swslot, npages); } atomic_dec_uint(&uvmexp.pdpending); #endif /* defined(VMSWAP) */ } } /* * uvm_aio_aiodone: do iodone processing for async i/os. * this should be called in thread context, not interrupt context. */ void uvm_aio_aiodone(struct buf *bp) { const int npages = bp->b_bufsize >> PAGE_SHIFT; struct vm_page *pgs[howmany(MAXPHYS, MIN_PAGE_SIZE)]; int i, error; bool write; UVMHIST_FUNC(__func__); UVMHIST_CALLARGS(ubchist, "bp %#jx", (uintptr_t)bp, 0,0,0); KASSERT(bp->b_bufsize <= MAXPHYS); KASSERT(npages <= __arraycount(pgs)); error = bp->b_error; write = (bp->b_flags & B_READ) == 0; for (i = 0; i < npages; i++) { pgs[i] = uvm_pageratop((vaddr_t)bp->b_data + (i << PAGE_SHIFT)); UVMHIST_LOG(ubchist, "pgs[%jd] = %#jx", i, (uintptr_t)pgs[i], 0, 0); } uvm_pagermapout((vaddr_t)bp->b_data, npages); uvm_aio_aiodone_pages(pgs, npages, write, error); if (write && (bp->b_cflags & BC_AGE) != 0) { mutex_enter(bp->b_objlock); vwakeup(bp); mutex_exit(bp->b_objlock); } putiobuf(bp); } /* * uvm_pageratop: convert KVAs in the pager map back to their page * structures. */ struct vm_page * uvm_pageratop(vaddr_t kva) { struct vm_page *pg; paddr_t pa; bool rv __diagused; rv = pmap_extract(pmap_kernel(), kva, &pa); KASSERT(rv); pg = PHYS_TO_VM_PAGE(pa); KASSERT(pg != NULL); return (pg); }
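The pager_map logic above combines three pieces: a normal allocation path, a sleep on pager_map when the map is full (pager_map_wanted), and an emergency reserve (emergva/emerginuse) that only the pagedaemon may take so pageout can always make progress. The following is a minimal, self-contained userland sketch of that reservation protocol using pthreads; it is an illustration only, and every name in it (demo_alloc, demo_free, DEMO_SLOTS, ...) is invented rather than taken from the kernel.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

#define DEMO_SLOTS 4	/* stand-in for the limited pager_map KVA space */

static pthread_mutex_t demo_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  demo_cv   = PTHREAD_COND_INITIALIZER;
static int  demo_used;		/* slots handed out from the normal pool */
static bool demo_emerg_inuse;	/* the single reserved slot (like emergva) */

/* Allocate one slot; pagedaemon-style callers may fall back to the reserve. */
static int
demo_alloc(bool pdaemon)
{
	pthread_mutex_lock(&demo_lock);
	while (demo_used == DEMO_SLOTS) {
		if (pdaemon && !demo_emerg_inuse) {
			demo_emerg_inuse = true;
			pthread_mutex_unlock(&demo_lock);
			return DEMO_SLOTS;	/* the reserved slot */
		}
		/* like UVM_UNLOCK_AND_WAIT(pager_map, ...): sleep until freed */
		pthread_cond_wait(&demo_cv, &demo_lock);
	}
	int slot = demo_used++;
	pthread_mutex_unlock(&demo_lock);
	return slot;
}

/* Release a slot and wake anyone sleeping in demo_alloc() (like wakeup()). */
static void
demo_free(int slot)
{
	pthread_mutex_lock(&demo_lock);
	if (slot == DEMO_SLOTS)
		demo_emerg_inuse = false;
	else
		demo_used--;
	pthread_cond_broadcast(&demo_cv);
	pthread_mutex_unlock(&demo_lock);
}

int
main(void)
{
	int a = demo_alloc(false);
	int b = demo_alloc(true);
	printf("got slots %d and %d\n", a, b);
	demo_free(a);
	demo_free(b);
	return 0;
}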
/* $NetBSD: usbdivar.h,v 1.138 2024/02/04 05:43:06 mrg Exp $ */ /* * Copyright (c) 1998, 2012 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Lennart Augustsson (lennart@augustsson.net) at * Carlstedt Research & Technology and Matthew R. Green (mrg@eterna23.net). * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #ifndef _DEV_USB_USBDIVAR_H_ #define _DEV_USB_USBDIVAR_H_ /* * Discussion about locking in the USB code: * * The host controller presents one lock at IPL_SOFTUSB (aka IPL_SOFTNET).
* * List of hardware interface methods, and whether the lock is held * when each is called by this module: * * BUS METHOD LOCK NOTES * ----------------------- ------- ------------------------- * ubm_open - might want to take lock? * ubm_softint x may release/reacquire lock * ubm_dopoll - might want to take lock? * ubm_allocx - * ubm_freex - * ubm_abortx x must not release/reacquire lock * ubm_getlock - Called at attach time * ubm_newdev - Will take lock * ubm_rhctrl - * * PIPE METHOD LOCK NOTES * ----------------------- ------- ------------------------- * upm_init - * upm_fini - * upm_transfer x * upm_start x * upm_abort x * upm_close x * upm_cleartoggle - * upm_done x * * The above semantics are likely to change. Little performance * evaluation has been done on this code and the locking strategy. * * USB functions known to expect the lock taken include (this list is * probably not exhaustive): * usb_transfer_complete() * usb_start_next() * */ #include <sys/callout.h> #include <sys/mutex.h> #include <sys/bus.h> /* From usb_mem.h */ struct usb_dma_block; typedef struct { struct usb_dma_block *udma_block; u_int udma_offs; } usb_dma_t; struct usbd_xfer; struct usbd_pipe; struct usbd_port; struct usbd_endpoint { usb_endpoint_descriptor_t *ue_edesc; int ue_refcnt; int ue_toggle; }; struct usbd_bus_methods { usbd_status (*ubm_open)(struct usbd_pipe *); void (*ubm_softint)(void *); void (*ubm_dopoll)(struct usbd_bus *); struct usbd_xfer *(*ubm_allocx)(struct usbd_bus *, unsigned int); void (*ubm_freex)(struct usbd_bus *, struct usbd_xfer *); void (*ubm_abortx)(struct usbd_xfer *); bool (*ubm_dying)(struct usbd_bus *); void (*ubm_getlock)(struct usbd_bus *, kmutex_t **); usbd_status (*ubm_newdev)(device_t, struct usbd_bus *, int, int, int, struct usbd_port *); int (*ubm_rhctrl)(struct usbd_bus *, usb_device_request_t *, void *, int); }; struct usbd_pipe_methods { int (*upm_init)(struct usbd_xfer *); void (*upm_fini)(struct usbd_xfer *); usbd_status (*upm_transfer)(struct usbd_xfer *); usbd_status (*upm_start)(struct usbd_xfer *); void (*upm_abort)(struct usbd_xfer *); void (*upm_close)(struct usbd_pipe *); void (*upm_cleartoggle)(struct usbd_pipe *); void (*upm_done)(struct usbd_xfer *); }; struct usbd_tt { struct usbd_hub *utt_hub; }; struct usbd_port { usb_port_status_t up_status; uint16_t up_power; /* mA of current on port */ uint8_t up_portno; uint8_t up_restartcnt; #define USBD_RESTART_MAX 5 uint8_t up_reattach; struct usbd_device *up_dev; /* Connected device */ struct usbd_device *up_parent; /* The ports hub */ struct usbd_tt *up_tt; /* Transaction translator (if any) */ }; struct usbd_hub { usbd_status (*uh_explore)(struct usbd_device *hub); void *uh_hubsoftc; usb_hub_descriptor_t uh_hubdesc; struct usbd_port uh_ports[1]; }; /*****/ /* 0, root, and 1->127 */ #define USB_ROOTHUB_INDEX 1 #define USB_TOTAL_DEVICES (USB_MAX_DEVICES + 1) struct usbd_bus { /* Filled by HC driver */ void *ub_hcpriv; int ub_revision; /* USB revision */ #define USBREV_UNKNOWN 0 #define USBREV_PRE_1_0 1 #define USBREV_1_0 2 #define USBREV_1_1 3 #define USBREV_2_0 4 #define USBREV_3_0 5 #define USBREV_3_1 6 #define USBREV_STR { "unknown", "pre 1.0", "1.0", "1.1", "2.0", "3.0", "3.1" } int ub_hctype; #define USBHCTYPE_UNKNOWN 0 #define USBHCTYPE_MOTG 1 #define USBHCTYPE_OHCI 2 #define USBHCTYPE_UHCI 3 #define USBHCTYPE_EHCI 4 #define USBHCTYPE_XHCI 5 #define USBHCTYPE_VHCI 6 int ub_busnum; const struct usbd_bus_methods *ub_methods; uint32_t ub_pipesize; /* size of a pipe struct */ bool ub_usedma; /* Does this HC 
support DMA */ int ub_dmaflags; bus_dma_tag_t ub_dmatag; /* DMA tag */ /* Filled by usb driver */ kmutex_t *ub_lock; struct usbd_device *ub_roothub; struct usbd_xfer *ub_rhxfer; /* roothub xfer in progress */ kcondvar_t ub_rhxfercv; uint8_t ub_rhaddr; /* roothub address */ uint8_t ub_rhconf; /* roothub configuration */ struct usbd_device *ub_devices[USB_TOTAL_DEVICES]; kcondvar_t ub_needsexplore_cv; char ub_needsexplore;/* a hub a signalled a change */ char ub_usepolling; device_t ub_usbctl; struct usb_device_stats ub_stats; void *ub_soft; /* soft interrupt cookie */ }; struct usbd_device { struct usbd_bus *ud_bus; /* our controller */ struct usbd_pipe *ud_pipe0; /* pipe 0 */ uint8_t ud_addr; /* device address */ uint8_t ud_config; /* current configuration # */ uint8_t ud_depth; /* distance from root hub */ uint8_t ud_speed; /* low/full/high speed */ uint8_t ud_selfpowered; /* flag for self powered */ uint16_t ud_power; /* mA the device uses */ int16_t ud_langid; /* language for strings */ #define USBD_NOLANG (-1) usb_event_cookie_t ud_cookie; /* unique connection id */ struct usbd_port *ud_powersrc; /* upstream hub port, or 0 */ struct usbd_device *ud_myhub; /* upstream hub */ struct usbd_port *ud_myhsport; /* closest high speed port */ struct usbd_endpoint ud_ep0; /* for pipe 0 */ usb_endpoint_descriptor_t ud_ep0desc; /* for pipe 0 */ struct usbd_interface *ud_ifaces; /* array of all interfaces */ usb_device_descriptor_t ud_ddesc; /* device descriptor */ usb_config_descriptor_t *ud_cdesc; /* full config descr */ usb_bos_descriptor_t *ud_bdesc; /* full BOS descr */ const struct usbd_quirks *ud_quirks; /* device quirks, always set */ struct usbd_hub *ud_hub; /* only if this is a hub */ u_int ud_subdevlen; /* array length of following */ device_t *ud_subdevs; /* sub-devices */ int ud_nifaces_claimed; /* number of ifaces in use */ void *ud_hcpriv; char *ud_serial; /* serial number, can be NULL */ char *ud_vendor; /* vendor string, can be NULL */ char *ud_product; /* product string can be NULL */ }; struct usbd_interface { struct usbd_device *ui_dev; usb_interface_descriptor_t *ui_idesc; int ui_index; int ui_altindex; struct usbd_endpoint *ui_endpoints; int64_t ui_busy; /* #pipes, or -1 if setting */ }; struct usbd_pipe { struct usbd_interface *up_iface; struct usbd_device *up_dev; struct usbd_endpoint *up_endpoint; char up_running; char up_aborting; bool up_serialise; SIMPLEQ_HEAD(, usbd_xfer) up_queue; struct usb_task up_async_task; struct usbd_xfer *up_intrxfer; /* used for repeating requests */ char up_repeat; int up_interval; uint8_t up_flags; struct usbd_xfer *up_callingxfer; /* currently in callback */ kcondvar_t up_callingcv; struct lwp *up_abortlwp; /* lwp currently aborting */ /* Filled by HC driver. 
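 * (The controller driver typically points up_methods at its own
 * usbd_pipe_methods when its ubm_open method creates the pipe.)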
*/ const struct usbd_pipe_methods *up_methods; }; struct usbd_xfer { struct usbd_pipe *ux_pipe; void *ux_priv; void *ux_buffer; kcondvar_t ux_cv; uint32_t ux_length; uint32_t ux_actlen; uint16_t ux_flags; uint32_t ux_timeout; usbd_status ux_status; usbd_callback ux_callback; volatile uint8_t ux_done; uint8_t ux_state; /* used for DIAGNOSTIC */ #define XFER_FREE 0x46 #define XFER_BUSY 0x55 #define XFER_ONQU 0x9e /* For control pipe */ usb_device_request_t ux_request; /* For isoc */ uint16_t *ux_frlengths; int ux_nframes; const struct usbd_pipe_methods *ux_methods; /* For memory allocation and softc */ struct usbd_bus *ux_bus; usb_dma_t ux_dmabuf; void *ux_buf; uint32_t ux_bufsize; uint8_t ux_rqflags; #define URQ_REQUEST 0x01 SIMPLEQ_ENTRY(usbd_xfer) ux_next; void *ux_hcpriv; /* private use by the HC driver */ struct usb_task ux_aborttask; struct callout ux_callout; /* * Protected by bus lock. * * - ux_timeout_set: The timeout is scheduled as a callout or * usb task, and has not yet acquired the bus lock. * * - ux_timeout_reset: The xfer completed, and was resubmitted * before the callout or task was able to acquire the bus * lock, so one or the other needs to schedule a new callout. */ bool ux_timeout_set; bool ux_timeout_reset; }; void usbd_init(void); void usbd_finish(void); #if defined(USB_DEBUG) void usbd_dump_iface(struct usbd_interface *); void usbd_dump_device(struct usbd_device *); void usbd_dump_endpoint(struct usbd_endpoint *); void usbd_dump_queue(struct usbd_pipe *); void usbd_dump_pipe(struct usbd_pipe *); #endif /* Routines from usb_subr.c */ int usbctlprint(void *, const char *); void usbd_get_device_strings(struct usbd_device *); void usb_delay_ms_locked(struct usbd_bus *, u_int, kmutex_t *); void usb_delay_ms(struct usbd_bus *, u_int); void usbd_delay_ms_locked(struct usbd_device *, u_int, kmutex_t *); void usbd_delay_ms(struct usbd_device *, u_int); usbd_status usbd_reset_port(struct usbd_device *, int, usb_port_status_t *); usbd_status usbd_setup_pipe(struct usbd_device *, struct usbd_interface *, struct usbd_endpoint *, int, struct usbd_pipe **); usbd_status usbd_setup_pipe_flags(struct usbd_device *, struct usbd_interface *, struct usbd_endpoint *, int, struct usbd_pipe **, uint8_t); usbd_status usbd_new_device(device_t, struct usbd_bus *, int, int, int, struct usbd_port *); usbd_status usbd_reattach_device(device_t, struct usbd_device *, int, const int *); void usbd_remove_device(struct usbd_device *, struct usbd_port *); bool usbd_iface_locked(struct usbd_interface *); usbd_status usbd_iface_lock(struct usbd_interface *); void usbd_iface_unlock(struct usbd_interface *); usbd_status usbd_iface_piperef(struct usbd_interface *); void usbd_iface_pipeunref(struct usbd_interface *); usbd_status usbd_fill_iface_data(struct usbd_device *, int, int); void usb_free_device(struct usbd_device *); void usb_transfer_complete(struct usbd_xfer *); int usb_disconnect_port(struct usbd_port *, device_t, int); usbd_status usbd_endpoint_acquire(struct usbd_device *, struct usbd_endpoint *, int); void usbd_endpoint_release(struct usbd_device *, struct usbd_endpoint *); void usbd_kill_pipe(struct usbd_pipe *); usbd_status usbd_attach_roothub(device_t, struct usbd_device *); usbd_status usbd_probe_and_attach(device_t, struct usbd_device *, int, int); /* Routines from usb.c */ void usb_needs_explore(struct usbd_device *); void usb_needs_reattach(struct usbd_device *); void usb_schedsoftintr(struct usbd_bus *); static __inline int usbd_xfer_isread(struct usbd_xfer *xfer) { if (xfer->ux_rqflags 
& URQ_REQUEST) return xfer->ux_request.bmRequestType & UT_READ; return xfer->ux_pipe->up_endpoint->ue_edesc->bEndpointAddress & UE_DIR_IN; } static __inline size_t usb_addr2dindex(int addr) { return USB_ROOTHUB_INDEX + addr; } /* * These macros reflect the current locking scheme. They might change. */ #define usbd_lock_pipe(p) mutex_enter((p)->up_dev->ud_bus->ub_lock) #define usbd_unlock_pipe(p) mutex_exit((p)->up_dev->ud_bus->ub_lock) #endif /* _DEV_USB_USBDIVAR_H_ */
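usbdivar.h above is mostly a set of method tables: each host controller driver fills in a struct usbd_bus_methods / usbd_pipe_methods, and the machine-independent USB code dispatches through those pointers under the lock rules listed in the header. The sketch below shows the bare function-pointer dispatch pattern in isolation; it is a simplified stand-alone analogue, not an actual HC driver, and all names (demo_bus, demo_methods, ...) are invented.

#include <stdio.h>

struct demo_bus;

/* Simplified analogue of usbd_bus_methods: an ops vector per bus. */
struct demo_bus_methods {
	int  (*dbm_open)(struct demo_bus *);
	void (*dbm_softint)(struct demo_bus *);
};

struct demo_bus {
	const struct demo_bus_methods *db_methods;
	const char *db_name;
};

/* A "driver" provides the concrete implementations... */
static int
demo_open(struct demo_bus *bus)
{
	printf("%s: open\n", bus->db_name);
	return 0;
}

static void
demo_softint(struct demo_bus *bus)
{
	printf("%s: soft interrupt\n", bus->db_name);
}

static const struct demo_bus_methods demo_methods = {
	.dbm_open = demo_open,
	.dbm_softint = demo_softint,
};

/* ...and the framework dispatches through the per-bus method table,
 * much as the usbd code calls ub_methods->ubm_open() and friends. */
int
main(void)
{
	struct demo_bus bus = { .db_methods = &demo_methods, .db_name = "demo0" };

	(*bus.db_methods->dbm_open)(&bus);
	(*bus.db_methods->dbm_softint)(&bus);
	return 0;
}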
/* $NetBSD: hci_unit.c,v 1.16 2021/08/07 16:19:18 thorpej Exp $ */ /*- * Copyright (c) 2005 Iain Hibbert. * Copyright (c) 2006 Itronix Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of Itronix Inc. may not be used to endorse * or promote products derived from this software without specific * prior written permission. * * THIS SOFTWARE IS PROVIDED BY ITRONIX INC. ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL ITRONIX INC.
BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: hci_unit.c,v 1.16 2021/08/07 16:19:18 thorpej Exp $"); #include <sys/param.h> #include <sys/conf.h> #include <sys/device.h> #include <sys/kernel.h> #include <sys/malloc.h> #include <sys/mbuf.h> #include <sys/proc.h> #include <sys/queue.h> #include <sys/systm.h> #include <sys/intr.h> #include <sys/socketvar.h> #include <netbt/bluetooth.h> #include <netbt/hci.h> struct hci_unit_list hci_unit_list = SIMPLEQ_HEAD_INITIALIZER(hci_unit_list); MALLOC_DEFINE(M_BLUETOOTH, "Bluetooth", "Bluetooth System Memory"); /* * HCI Input Queue max lengths. */ int hci_eventq_max = 20; int hci_aclrxq_max = 50; int hci_scorxq_max = 50; /* * This is the default minimum command set supported by older * devices. Anything conforming to 1.2 spec or later will get * updated during init. */ static const uint8_t hci_cmds_v10[HCI_COMMANDS_SIZE] = { 0xff, 0xff, 0xff, 0x01, 0xfe, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x7f, 0x32, 0x03, 0xb8, 0xfe, 0x07, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }; /* * bluetooth unit functions */ static void hci_intr (void *); struct hci_unit * hci_attach_pcb(const struct hci_if *hci_if, device_t dev, uint16_t flags) { struct hci_unit *unit; KASSERT(dev != NULL); KASSERT(hci_if->enable != NULL); KASSERT(hci_if->disable != NULL); KASSERT(hci_if->output_cmd != NULL); KASSERT(hci_if->output_acl != NULL); KASSERT(hci_if->output_sco != NULL); KASSERT(hci_if->get_stats != NULL); unit = malloc(sizeof(struct hci_unit), M_BLUETOOTH, M_ZERO | M_WAITOK); KASSERT(unit != NULL); unit->hci_dev = dev; unit->hci_if = hci_if; unit->hci_flags = flags; mutex_init(&unit->hci_devlock, MUTEX_DRIVER, hci_if->ipl); cv_init(&unit->hci_init, "hci_init"); MBUFQ_INIT(&unit->hci_eventq); MBUFQ_INIT(&unit->hci_aclrxq); MBUFQ_INIT(&unit->hci_scorxq); MBUFQ_INIT(&unit->hci_cmdwait); MBUFQ_INIT(&unit->hci_scodone); TAILQ_INIT(&unit->hci_links); LIST_INIT(&unit->hci_memos); mutex_enter(bt_lock); SIMPLEQ_INSERT_TAIL(&hci_unit_list, unit, hci_next); mutex_exit(bt_lock); return unit; } void hci_detach_pcb(struct hci_unit *unit) { mutex_enter(bt_lock); hci_disable(unit); SIMPLEQ_REMOVE(&hci_unit_list, unit, hci_unit, hci_next); mutex_exit(bt_lock); cv_destroy(&unit->hci_init); mutex_destroy(&unit->hci_devlock); free(unit, M_BLUETOOTH); } int hci_enable(struct hci_unit *unit) { int err; /* * Bluetooth spec says that a device can accept one * command on power up until they send a Command Status * or Command Complete event with more information, but * it seems that some devices cant and prefer to send a * No-op Command Status packet when they are ready. */ unit->hci_num_cmd_pkts = (unit->hci_flags & BTF_POWER_UP_NOOP) ? 
0 : 1; unit->hci_num_acl_pkts = 0; unit->hci_num_sco_pkts = 0; /* * only allow the basic packet types until * the features report is in */ unit->hci_acl_mask = HCI_PKT_DM1 | HCI_PKT_DH1; unit->hci_packet_type = unit->hci_acl_mask; memcpy(unit->hci_cmds, hci_cmds_v10, HCI_COMMANDS_SIZE); unit->hci_rxint = softint_establish(SOFTINT_NET, &hci_intr, unit); if (unit->hci_rxint == NULL) return EIO; err = (*unit->hci_if->enable)(unit->hci_dev); if (err) goto bad1; unit->hci_flags |= BTF_RUNNING; /* * Reset the device, this will trigger initialisation * and wake us up. */ unit->hci_flags |= BTF_INIT; err = hci_send_cmd(unit, HCI_CMD_RESET, NULL, 0); if (err) goto bad2; while (unit->hci_flags & BTF_INIT) { err = cv_timedwait_sig(&unit->hci_init, bt_lock, 5 * hz); if (err) goto bad2; /* XXX * "What If", while we were sleeping, the device * was removed and detached? Ho Hum. */ } /* * Attach Bluetooth Device Hub */ unit->hci_bthub = config_found(unit->hci_dev, &unit->hci_bdaddr, NULL, CFARGS(.iattr = "btbus")); return 0; bad2: (*unit->hci_if->disable)(unit->hci_dev); unit->hci_flags &= ~BTF_RUNNING; bad1: softint_disestablish(unit->hci_rxint); unit->hci_rxint = NULL; return err; } void hci_disable(struct hci_unit *unit) { struct hci_link *link, *next; struct hci_memo *memo; int acl; if (unit->hci_bthub) { device_t hub; hub = unit->hci_bthub; unit->hci_bthub = NULL; mutex_exit(bt_lock); config_detach(hub, DETACH_FORCE); mutex_enter(bt_lock); } if (unit->hci_rxint) { softint_disestablish(unit->hci_rxint); unit->hci_rxint = NULL; } (*unit->hci_if->disable)(unit->hci_dev); unit->hci_flags &= ~BTF_RUNNING; /* * close down any links, take care to close SCO first since * they may depend on ACL links. */ for (acl = 0 ; acl < 2 ; acl++) { next = TAILQ_FIRST(&unit->hci_links); while ((link = next) != NULL) { next = TAILQ_NEXT(link, hl_next); if (acl || link->hl_type != HCI_LINK_ACL) hci_link_free(link, ECONNABORTED); } } while ((memo = LIST_FIRST(&unit->hci_memos)) != NULL) hci_memo_free(memo); /* (no need to hold hci_devlock, the driver is disabled) */ MBUFQ_DRAIN(&unit->hci_eventq); unit->hci_eventqlen = 0; MBUFQ_DRAIN(&unit->hci_aclrxq); unit->hci_aclrxqlen = 0; MBUFQ_DRAIN(&unit->hci_scorxq); unit->hci_scorxqlen = 0; MBUFQ_DRAIN(&unit->hci_cmdwait); MBUFQ_DRAIN(&unit->hci_scodone); } struct hci_unit * hci_unit_lookup(const bdaddr_t *addr) { struct hci_unit *unit; SIMPLEQ_FOREACH(unit, &hci_unit_list, hci_next) { if ((unit->hci_flags & BTF_UP) == 0) continue; if (bdaddr_same(&unit->hci_bdaddr, addr)) break; } return unit; } /* * update num_cmd_pkts and push on pending commands queue */ void hci_num_cmds(struct hci_unit *unit, uint8_t num) { struct mbuf *m; unit->hci_num_cmd_pkts = num; while (unit->hci_num_cmd_pkts > 0 && MBUFQ_FIRST(&unit->hci_cmdwait)) { MBUFQ_DEQUEUE(&unit->hci_cmdwait, m); hci_output_cmd(unit, m); } } /* * construct and queue a HCI command packet */ int hci_send_cmd(struct hci_unit *unit, uint16_t opcode, void *buf, uint8_t len) { struct mbuf *m; hci_cmd_hdr_t *p; KASSERT(unit != NULL); m = m_gethdr(M_DONTWAIT, MT_DATA); if (m == NULL) return ENOMEM; p = mtod(m, hci_cmd_hdr_t *); p->type = HCI_CMD_PKT; p->opcode = htole16(opcode); p->length = len; m->m_pkthdr.len = m->m_len = sizeof(hci_cmd_hdr_t); if (len) { KASSERT(buf != NULL); m_copyback(m, sizeof(hci_cmd_hdr_t), len, buf); if (m->m_pkthdr.len != (sizeof(hci_cmd_hdr_t) + len)) { m_freem(m); return ENOMEM; } } DPRINTFN(2, "(%s) opcode (%3.3x|%4.4x)\n", device_xname(unit->hci_dev), HCI_OGF(opcode), HCI_OCF(opcode)); /* and send it on 
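 * (or park it: when the device has no command credits left the packet is
 * queued on hci_cmdwait, and hci_num_cmds() pushes it out once the device
 * returns credits)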
*/ if (unit->hci_num_cmd_pkts == 0) MBUFQ_ENQUEUE(&unit->hci_cmdwait, m); else hci_output_cmd(unit, m); return 0; } /* * Incoming packet processing. Since the code is single threaded * in any case (IPL_SOFTNET), we handle it all in one interrupt function * picking our way through more important packets first so that hopefully * we will never get clogged up with bulk data. */ static void hci_intr(void *arg) { struct hci_unit *unit = arg; struct mbuf *m; mutex_enter(bt_lock); another: mutex_enter(&unit->hci_devlock); if (unit->hci_eventqlen > 0) { MBUFQ_DEQUEUE(&unit->hci_eventq, m); unit->hci_eventqlen--; mutex_exit(&unit->hci_devlock); KASSERT(m != NULL); DPRINTFN(10, "(%s) recv event, len = %d\n", device_xname(unit->hci_dev), m->m_pkthdr.len); m->m_flags |= M_LINK0; /* mark incoming packet */ hci_mtap(m, unit); hci_event(m, unit); goto another; } if (unit->hci_scorxqlen > 0) { MBUFQ_DEQUEUE(&unit->hci_scorxq, m); unit->hci_scorxqlen--; mutex_exit(&unit->hci_devlock); KASSERT(m != NULL); DPRINTFN(10, "(%s) recv SCO, len = %d\n", device_xname(unit->hci_dev), m->m_pkthdr.len); m->m_flags |= M_LINK0; /* mark incoming packet */ hci_mtap(m, unit); hci_sco_recv(m, unit); goto another; } if (unit->hci_aclrxqlen > 0) { MBUFQ_DEQUEUE(&unit->hci_aclrxq, m); unit->hci_aclrxqlen--; mutex_exit(&unit->hci_devlock); KASSERT(m != NULL); DPRINTFN(10, "(%s) recv ACL, len = %d\n", device_xname(unit->hci_dev), m->m_pkthdr.len); m->m_flags |= M_LINK0; /* mark incoming packet */ hci_mtap(m, unit); hci_acl_recv(m, unit); goto another; } MBUFQ_DEQUEUE(&unit->hci_scodone, m); if (m != NULL) { struct hci_link *link; mutex_exit(&unit->hci_devlock); DPRINTFN(11, "(%s) complete SCO\n", device_xname(unit->hci_dev)); TAILQ_FOREACH(link, &unit->hci_links, hl_next) { if (link == M_GETCTX(m, struct hci_link *)) { hci_sco_complete(link, 1); break; } } unit->hci_num_sco_pkts++; m_freem(m); goto another; } mutex_exit(&unit->hci_devlock); mutex_exit(bt_lock); DPRINTFN(10, "done\n"); } /********************************************************************** * * IO routines * * input & complete routines will be called from device drivers, * possibly in interrupt context. We return success or failure to * enable proper accounting but we own the mbuf. 
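 * (in particular, on the drop path the mbuf is freed here, so the caller
 * must not touch it again regardless of the return value)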
*/ bool hci_input_event(struct hci_unit *unit, struct mbuf *m) { bool rv; mutex_enter(&unit->hci_devlock); if (unit->hci_eventqlen > hci_eventq_max || unit->hci_rxint == NULL) { DPRINTF("(%s) dropped event packet.\n", device_xname(unit->hci_dev)); m_freem(m); rv = false; } else { unit->hci_eventqlen++; MBUFQ_ENQUEUE(&unit->hci_eventq, m); softint_schedule(unit->hci_rxint); rv = true; } mutex_exit(&unit->hci_devlock); return rv; } bool hci_input_acl(struct hci_unit *unit, struct mbuf *m) { bool rv; mutex_enter(&unit->hci_devlock); if (unit->hci_aclrxqlen > hci_aclrxq_max || unit->hci_rxint == NULL) { DPRINTF("(%s) dropped ACL packet.\n", device_xname(unit->hci_dev)); m_freem(m); rv = false; } else { unit->hci_aclrxqlen++; MBUFQ_ENQUEUE(&unit->hci_aclrxq, m); softint_schedule(unit->hci_rxint); rv = true; } mutex_exit(&unit->hci_devlock); return rv; } bool hci_input_sco(struct hci_unit *unit, struct mbuf *m) { bool rv; mutex_enter(&unit->hci_devlock); if (unit->hci_scorxqlen > hci_scorxq_max || unit->hci_rxint == NULL) { DPRINTF("(%s) dropped SCO packet.\n", device_xname(unit->hci_dev)); m_freem(m); rv = false; } else { unit->hci_scorxqlen++; MBUFQ_ENQUEUE(&unit->hci_scorxq, m); softint_schedule(unit->hci_rxint); rv = true; } mutex_exit(&unit->hci_devlock); return rv; } void hci_output_cmd(struct hci_unit *unit, struct mbuf *m) { void *arg; hci_mtap(m, unit); DPRINTFN(10, "(%s) num_cmd_pkts=%d\n", device_xname(unit->hci_dev), unit->hci_num_cmd_pkts); unit->hci_num_cmd_pkts--; /* * If context is set, this was from a HCI raw socket * and a record needs to be dropped from the sockbuf. */ arg = M_GETCTX(m, void *); if (arg != NULL) hci_drop(arg); (*unit->hci_if->output_cmd)(unit->hci_dev, m); } void hci_output_acl(struct hci_unit *unit, struct mbuf *m) { hci_mtap(m, unit); DPRINTFN(10, "(%s) num_acl_pkts=%d\n", device_xname(unit->hci_dev), unit->hci_num_acl_pkts); unit->hci_num_acl_pkts--; (*unit->hci_if->output_acl)(unit->hci_dev, m); } void hci_output_sco(struct hci_unit *unit, struct mbuf *m) { hci_mtap(m, unit); DPRINTFN(10, "(%s) num_sco_pkts=%d\n", device_xname(unit->hci_dev), unit->hci_num_sco_pkts); unit->hci_num_sco_pkts--; (*unit->hci_if->output_sco)(unit->hci_dev, m); } bool hci_complete_sco(struct hci_unit *unit, struct mbuf *m) { if (unit->hci_rxint == NULL) { DPRINTFN(10, "(%s) complete SCO!\n", device_xname(unit->hci_dev)); m_freem(m); return false; } mutex_enter(&unit->hci_devlock); MBUFQ_ENQUEUE(&unit->hci_scodone, m); softint_schedule(unit->hci_rxint); mutex_exit(&unit->hci_devlock); return true; }
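hci_unit.c above funnels all receive processing through per-unit bounded queues: hci_input_event/acl/sco() enqueue under hci_devlock and drop packets once a queue passes its *_max limit, and hci_intr() then drains events before SCO before ACL so bulk data cannot starve control traffic. Below is a minimal userland sketch of that bounded-queue, priority-drain pattern; the names (demo_q, demo_input, demo_drain) are invented, and no mbufs, locks or soft interrupts are involved.

#include <stdbool.h>
#include <stdio.h>

#define DEMO_QMAX 4

struct demo_q {
	const char *name;
	int len;	/* current depth */
	int max;	/* drop threshold, like hci_eventq_max */
	int dropped;
};

/* Enqueue one packet, or drop it if the queue is over its limit. */
static bool
demo_input(struct demo_q *q)
{
	if (q->len >= q->max) {
		q->dropped++;
		return false;
	}
	q->len++;
	return true;
}

/* Drain all queues, highest priority first (event, then SCO, then ACL). */
static void
demo_drain(struct demo_q *qs, int nqs)
{
	for (int i = 0; i < nqs; i++) {
		while (qs[i].len > 0) {
			qs[i].len--;
			printf("processed one %s packet\n", qs[i].name);
		}
	}
}

int
main(void)
{
	struct demo_q qs[3] = {
		{ "event", 0, DEMO_QMAX, 0 },
		{ "SCO",   0, DEMO_QMAX, 0 },
		{ "ACL",   0, DEMO_QMAX, 0 },
	};

	for (int i = 0; i < 6; i++)	/* overfill the event queue */
		demo_input(&qs[0]);
	demo_input(&qs[2]);

	demo_drain(qs, 3);
	printf("event packets dropped: %d\n", qs[0].dropped);
	return 0;
}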
/* $NetBSD: in6_src.c,v 1.92 2023/08/03 04:24:55 ozaki-r Exp $ */ /* $KAME: in6_src.c,v 1.159 2005/10/19 01:40:32 t-momose Exp $ */ /* * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Copyright (c) 1982, 1986, 1991, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3.
All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)in_pcb.c 8.2 (Berkeley) 1/4/94 */ #include <sys/cdefs.h> __KERNEL_RCSID(0, "$NetBSD: in6_src.c,v 1.92 2023/08/03 04:24:55 ozaki-r Exp $"); #ifdef _KERNEL_OPT #include "opt_inet.h" #endif #include <sys/param.h> #include <sys/systm.h> #include <sys/malloc.h> #include <sys/mbuf.h> #include <sys/protosw.h> #include <sys/socket.h> #include <sys/socketvar.h> #include <sys/ioctl.h> #include <sys/errno.h> #include <sys/time.h> #include <sys/kernel.h> #include <sys/proc.h> #include <sys/kauth.h> #include <net/if.h> #include <net/if_types.h> #include <net/route.h> #include <netinet/in.h> #include <netinet/in_var.h> #include <netinet/in_systm.h> #include <netinet/ip.h> #include <netinet/in_pcb.h> #include <netinet/portalgo.h> #include <netinet6/in6_var.h> #include <netinet/ip6.h> #include <netinet6/in6_pcb.h> #include <netinet6/ip6_var.h> #include <netinet6/ip6_private.h> #include <netinet6/nd6.h> #include <netinet6/scope6_var.h> #ifdef MIP6 #include <netinet6/mip6.h> #include <netinet6/mip6_var.h> #include "mip.h" #if NMIP > 0 #include <net/if_mip.h> #endif /* NMIP > 0 */ #endif /* MIP6 */ #include <netinet/tcp_vtw.h> #define ADDR_LABEL_NOTAPP (-1) struct in6_addrpolicy defaultaddrpolicy; int ip6_prefer_tempaddr = 0; static int in6_selectif(struct sockaddr_in6 *, struct ip6_pktopts *, struct ip6_moptions *, struct route *, struct ifnet **, struct psref *); static struct in6_addrpolicy *lookup_addrsel_policy(struct sockaddr_in6 *); static void init_policy_queue(void); static int add_addrsel_policyent(struct in6_addrpolicy *); static int delete_addrsel_policyent(struct in6_addrpolicy *); static int walk_addrsel_policy(int (*)(struct in6_addrpolicy *, void *), void *); static int dump_addrsel_policyent(struct in6_addrpolicy *, void *); static struct in6_addrpolicy *match_addrsel_policy(struct sockaddr_in6 *); #define IFA6_IS_VALIDATED(ia) \ (((ia)->ia6_flags & (IN6_IFF_TENTATIVE | IN6_IFF_DETACHED)) == 0) /* * Return an IPv6 address, which is the most appropriate for a given * destination and user specified options. * If necessary, this function lookups the routing table and returns * an entry to the caller for later use. 
*/ #if 0 /* disabled ad-hoc */ #define REPLACE(r) do {\ char _buf1[INET6_ADDRSTRLEN], _buf2[INET6_ADDRSTRLEN]; \ if ((r) < sizeof(ip6stat.ip6s_sources_rule) / \ sizeof(ip6stat.ip6s_sources_rule[0])) /* check for safety */ \ ip6stat.ip6s_sources_rule[(r)]++; \ printf("%s: replace %s with %s by %d\n", __func__, ia_best ? \ IN6_PRINT(_buf1, &ia_best->ia_addr.sin6_addr) : "none", \ IN6_PRINT(_buf2, &ia->ia_addr.sin6_addr), (r)); \ goto replace; \ } while(/*CONSTCOND*/0) #define NEXT(r) do {\ if ((r) < sizeof(ip6stat.ip6s_sources_rule) / \ sizeof(ip6stat.ip6s_sources_rule[0])) /* check for safety */ \ ip6stat.ip6s_sources_rule[(r)]++; \ printf("%s: keep %s against %s by %d\n", ia_best ? \ IN6_PRINT(_buf1, &ia_best->ia_addr.sin6_addr) : "none", \ IN6_PRINT(_buf2, &ia->ia_addr.sin6_addr), (r)); \ goto next; /* XXX: we can't use 'continue' here */ \ } while(/*CONSTCOND*/0) #define BREAK(r) do { \ if ((r) < sizeof(ip6stat.ip6s_sources_rule) / \ sizeof(ip6stat.ip6s_sources_rule[0])) /* check for safety */ \ ip6stat.ip6s_sources_rule[(r)]++; \ goto out; /* XXX: we can't use 'break' here */ \ } while(/*CONSTCOND*/0) #else #define REPLACE(r) goto replace #define NEXT(r) goto next #define BREAK(r) goto out #endif /* * Called inside pserialize critical section. Don't sleep/block. */ static struct in6_ifaddr * in6_select_best_ia(struct sockaddr_in6 *dstsock, struct in6_addr *dst, const struct ifnet *ifp, const struct ip6_pktopts *opts, const u_int32_t odstzone) { struct in6_ifaddr *ia, *ia_best = NULL; int dst_scope = -1, best_scope = -1, best_matchlen = -1; struct in6_addrpolicy *dst_policy = NULL, *best_policy = NULL; IN6_ADDRLIST_READER_FOREACH(ia) { int new_scope = -1, new_matchlen = -1; struct in6_addrpolicy *new_policy = NULL; u_int32_t srczone, osrczone, dstzone; struct in6_addr src; struct ifnet *ifp1 = ia->ia_ifp; int prefer_tempaddr; /* * We'll never take an address that breaks the scope zone * of the destination. We also skip an address if its zone * does not contain the outgoing interface. * XXX: we should probably use sin6_scope_id here. */ if (in6_setscope(dst, ifp1, &dstzone) || odstzone != dstzone) { continue; } src = ia->ia_addr.sin6_addr; /* Skip the scope test in impossible cases */ if (!(ifp->if_flags & IFF_LOOPBACK) && IN6_IS_ADDR_LOOPBACK(&src)) continue; if (in6_setscope(&src, ifp, &osrczone) || in6_setscope(&src, ifp1, &srczone) || osrczone != srczone) { continue; } /* avoid unusable addresses */ if ((ia->ia6_flags & (IN6_IFF_DUPLICATED | IN6_IFF_ANYCAST))) continue; if (!ip6_use_deprecated && IFA6_IS_DEPRECATED(ia)) continue; #if defined(MIP6) && NMIP > 0 /* avoid unusable home addresses. */ if ((ia->ia6_flags & IN6_IFF_HOME) && !mip6_ifa6_is_addr_valid_hoa(ia)) continue; #endif /* MIP6 && NMIP > 0 */ /* Rule 1: Prefer same address */ if (IN6_ARE_ADDR_EQUAL(dst, &ia->ia_addr.sin6_addr)) { ia_best = ia; BREAK(1); /* there should be no better candidate */ } if (ia_best == NULL) REPLACE(1); /* Rule 2: Prefer appropriate scope */ if (dst_scope < 0) dst_scope = in6_addrscope(dst); new_scope = in6_addrscope(&ia->ia_addr.sin6_addr); if (IN6_ARE_SCOPE_CMP(best_scope, new_scope) < 0) { if (IN6_ARE_SCOPE_CMP(best_scope, dst_scope) < 0) REPLACE(2); NEXT(2); } else if (IN6_ARE_SCOPE_CMP(new_scope, best_scope) < 0) { if (IN6_ARE_SCOPE_CMP(new_scope, dst_scope) < 0) NEXT(2); REPLACE(2); } /* * Rule 3: Avoid deprecated addresses. Note that the case of * !ip6_use_deprecated is already rejected above. * Treat unvalidated addresses as deprecated here. 
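 * (IFA6_IS_VALIDATED() above regards an address still flagged tentative or
 * detached as not yet usable, so it loses to any validated candidate.)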
*/ if (IFA6_IS_VALIDATED(ia_best) && !IFA6_IS_VALIDATED(ia)) NEXT(3); if (!IFA6_IS_VALIDATED(ia_best) && IFA6_IS_VALIDATED(ia)) REPLACE(3); if (!IFA6_IS_DEPRECATED(ia_best) && IFA6_IS_DEPRECATED(ia)) NEXT(3); if (IFA6_IS_DEPRECATED(ia_best) && !IFA6_IS_DEPRECATED(ia)) REPLACE(3); /* Rule 4: Prefer home addresses */ #if defined(MIP6) && NMIP > 0 if (!MIP6_IS_MN) goto skip_rule4; if ((ia_best->ia6_flags & IN6_IFF_HOME) == 0 && (ia->ia6_flags & IN6_IFF_HOME) == 0) { /* both address are not home addresses. */ goto skip_rule4; } /* * If SA is simultaneously a home address and care-of * address and SB is not, then prefer SA. Similarly, * if SB is simultaneously a home address and care-of * address and SA is not, then prefer SB. */ if (((ia_best->ia6_flags & IN6_IFF_HOME) != 0 && ia_best->ia_ifp->if_type != IFT_MIP) && ((ia->ia6_flags & IN6_IFF_HOME) != 0 && ia->ia_ifp->if_type == IFT_MIP)) NEXT(4); if (((ia_best->ia6_flags & IN6_IFF_HOME) != 0 && ia_best->ia_ifp->if_type == IFT_MIP) && ((ia->ia6_flags & IN6_IFF_HOME) != 0 && ia->ia_ifp->if_type != IFT_MIP)) REPLACE(4); if (ip6po_usecoa == 0) { /* * If SA is just a home address and SB is just * a care-of address, then prefer * SA. Similarly, if SB is just a home address * and SA is just a care-of address, then * prefer SB. */ if ((ia_best->ia6_flags & IN6_IFF_HOME) != 0 && (ia->ia6_flags & IN6_IFF_HOME) == 0) { NEXT(4); } if ((ia_best->ia6_flags & IN6_IFF_HOME) == 0 && (ia->ia6_flags & IN6_IFF_HOME) != 0) { REPLACE(4); } } else { /* * a sender don't want to use a home address * because: * * 1) we cannot use. (ex. NS or NA to global * addresses.) * * 2) a user specified not to use. * (ex. mip6control -u) */ if ((ia_best->ia6_flags & IN6_IFF_HOME) == 0 && (ia->ia6_flags & IN6_IFF_HOME) != 0) { /* XXX breaks stat */ NEXT(0); } if ((ia_best->ia6_flags & IN6_IFF_HOME) != 0 && (ia->ia6_flags & IN6_IFF_HOME) == 0) { /* XXX breaks stat */ REPLACE(0); } } skip_rule4: #endif /* MIP6 && NMIP > 0 */