| 98 98 98 98 98 98 98 75 75 75 56 56 56 56 56 56 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 | // SPDX-License-Identifier: GPL-2.0 /* Multipath TCP * * Copyright (c) 2022, SUSE. */ #define pr_fmt(fmt) "MPTCP: " fmt #include <linux/kernel.h> #include <linux/module.h> #include <linux/list.h> #include <linux/rculist.h> #include <linux/spinlock.h> #include "protocol.h" static DEFINE_SPINLOCK(mptcp_sched_list_lock); static LIST_HEAD(mptcp_sched_list); static int mptcp_sched_default_get_subflow(struct mptcp_sock *msk, struct mptcp_sched_data *data) { struct sock *ssk; ssk = data->reinject ? mptcp_subflow_get_retrans(msk) : mptcp_subflow_get_send(msk); if (!ssk) return -EINVAL; mptcp_subflow_set_scheduled(mptcp_subflow_ctx(ssk), true); return 0; } static struct mptcp_sched_ops mptcp_sched_default = { .get_subflow = mptcp_sched_default_get_subflow, .name = "default", .owner = THIS_MODULE, }; /* Must be called with rcu read lock held */ struct mptcp_sched_ops *mptcp_sched_find(const char *name) { struct mptcp_sched_ops *sched, *ret = NULL; list_for_each_entry_rcu(sched, &mptcp_sched_list, list) { if (!strcmp(sched->name, name)) { ret = sched; break; } } return ret; } int mptcp_register_scheduler(struct mptcp_sched_ops *sched) { if (!sched->get_subflow) return -EINVAL; spin_lock(&mptcp_sched_list_lock); if (mptcp_sched_find(sched->name)) { spin_unlock(&mptcp_sched_list_lock); return -EEXIST; } list_add_tail_rcu(&sched->list, &mptcp_sched_list); spin_unlock(&mptcp_sched_list_lock); pr_debug("%s registered", sched->name); return 0; } void mptcp_unregister_scheduler(struct mptcp_sched_ops *sched) { if (sched == &mptcp_sched_default) return; spin_lock(&mptcp_sched_list_lock); list_del_rcu(&sched->list); spin_unlock(&mptcp_sched_list_lock); } void mptcp_sched_init(void) { mptcp_register_scheduler(&mptcp_sched_default); } int mptcp_init_sched(struct mptcp_sock *msk, struct mptcp_sched_ops *sched) { if (!sched) sched = &mptcp_sched_default; if (!bpf_try_module_get(sched, sched->owner)) return -EBUSY; msk->sched = sched; if (msk->sched->init) msk->sched->init(msk); pr_debug("sched=%s", msk->sched->name); return 0; } void mptcp_release_sched(struct mptcp_sock *msk) { struct mptcp_sched_ops *sched = msk->sched; if (!sched) return; msk->sched = NULL; if (sched->release) sched->release(msk); bpf_module_put(sched, sched->owner); } void mptcp_subflow_set_scheduled(struct mptcp_subflow_context *subflow, bool scheduled) { WRITE_ONCE(subflow->scheduled, scheduled); } int mptcp_sched_get_send(struct mptcp_sock *msk) { struct mptcp_subflow_context *subflow; struct mptcp_sched_data data; msk_owned_by_me(msk); /* the following check is moved out of mptcp_subflow_get_send */ if (__mptcp_check_fallback(msk)) { if (msk->first && __tcp_can_send(msk->first) && sk_stream_memory_free(msk->first)) { mptcp_subflow_set_scheduled(mptcp_subflow_ctx(msk->first), true); return 0; } return -EINVAL; } mptcp_for_each_subflow(msk, subflow) { if (READ_ONCE(subflow->scheduled)) return 0; } data.reinject = false; if (msk->sched == &mptcp_sched_default || !msk->sched) return mptcp_sched_default_get_subflow(msk, &data); return msk->sched->get_subflow(msk, &data); } int mptcp_sched_get_retrans(struct mptcp_sock *msk) { struct mptcp_subflow_context *subflow; struct mptcp_sched_data data; msk_owned_by_me(msk); /* the following check is moved out of mptcp_subflow_get_retrans */ if (__mptcp_check_fallback(msk)) return -EINVAL; mptcp_for_each_subflow(msk, subflow) { if (READ_ONCE(subflow->scheduled)) return 0; } data.reinject = true; if (msk->sched == &mptcp_sched_default || !msk->sched) return mptcp_sched_default_get_subflow(msk, &data); return msk->sched->get_subflow(msk, &data); } |
| 4854 1767 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_SCHED_H #define _LINUX_SCHED_H /* * Define 'struct task_struct' and provide the main scheduler * APIs (schedule(), wakeup variants, etc.) */ #include <uapi/linux/sched.h> #include <asm/current.h> #include <asm/processor.h> #include <linux/thread_info.h> #include <linux/preempt.h> #include <linux/cpumask.h> #include <linux/cache.h> #include <linux/irqflags_types.h> #include <linux/smp_types.h> #include <linux/pid_types.h> #include <linux/sem_types.h> #include <linux/shm.h> #include <linux/kmsan_types.h> #include <linux/mutex_types.h> #include <linux/plist_types.h> #include <linux/hrtimer_types.h> #include <linux/timer_types.h> #include <linux/seccomp_types.h> #include <linux/nodemask_types.h> #include <linux/refcount_types.h> #include <linux/resource.h> #include <linux/latencytop.h> #include <linux/sched/prio.h> #include <linux/sched/types.h> #include <linux/signal_types.h> #include <linux/syscall_user_dispatch_types.h> #include <linux/mm_types_task.h> #include <linux/task_io_accounting.h> #include <linux/posix-timers_types.h> #include <linux/restart_block.h> #include <uapi/linux/rseq.h> #include <linux/seqlock_types.h> #include <linux/kcsan.h> #include <linux/rv.h> #include <linux/livepatch_sched.h> #include <linux/uidgid_types.h> #include <asm/kmap_size.h> /* task_struct member predeclarations (sorted alphabetically): */ struct audit_context; struct bio_list; struct blk_plug; struct bpf_local_storage; struct bpf_run_ctx; struct capture_control; struct cfs_rq; struct fs_struct; struct futex_pi_state; struct io_context; struct io_uring_task; struct mempolicy; struct nameidata; struct nsproxy; struct perf_event_context; struct pid_namespace; struct pipe_inode_info; struct rcu_node; struct reclaim_state; struct robust_list_head; struct root_domain; struct rq; struct sched_attr; struct sched_dl_entity; struct seq_file; struct sighand_struct; struct signal_struct; struct task_delay_info; struct task_group; struct task_struct; struct user_event_mm; /* * Task state bitmask. NOTE! These bits are also * encoded in fs/proc/array.c: get_task_state(). * * We have two separate sets of flags: task->__state * is about runnability, while task->exit_state are * about the task exiting. Confusing, but this way * modifying one set can't modify the other one by * mistake. */ /* Used in tsk->__state: */ #define TASK_RUNNING 0x00000000 #define TASK_INTERRUPTIBLE 0x00000001 #define TASK_UNINTERRUPTIBLE 0x00000002 #define __TASK_STOPPED 0x00000004 #define __TASK_TRACED 0x00000008 /* Used in tsk->exit_state: */ #define EXIT_DEAD 0x00000010 #define EXIT_ZOMBIE 0x00000020 #define EXIT_TRACE (EXIT_ZOMBIE | EXIT_DEAD) /* Used in tsk->__state again: */ #define TASK_PARKED 0x00000040 #define TASK_DEAD 0x00000080 #define TASK_WAKEKILL 0x00000100 #define TASK_WAKING 0x00000200 #define TASK_NOLOAD 0x00000400 #define TASK_NEW 0x00000800 #define TASK_RTLOCK_WAIT 0x00001000 #define TASK_FREEZABLE 0x00002000 #define __TASK_FREEZABLE_UNSAFE (0x00004000 * IS_ENABLED(CONFIG_LOCKDEP)) #define TASK_FROZEN 0x00008000 #define TASK_STATE_MAX 0x00010000 #define TASK_ANY (TASK_STATE_MAX-1) /* * DO NOT ADD ANY NEW USERS ! */ #define TASK_FREEZABLE_UNSAFE (TASK_FREEZABLE | __TASK_FREEZABLE_UNSAFE) /* Convenience macros for the sake of set_current_state: */ #define TASK_KILLABLE (TASK_WAKEKILL | TASK_UNINTERRUPTIBLE) #define TASK_STOPPED (TASK_WAKEKILL | __TASK_STOPPED) #define TASK_TRACED __TASK_TRACED #define TASK_IDLE (TASK_UNINTERRUPTIBLE | TASK_NOLOAD) /* Convenience macros for the sake of wake_up(): */ #define TASK_NORMAL (TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE) /* get_task_state(): */ #define TASK_REPORT (TASK_RUNNING | TASK_INTERRUPTIBLE | \ TASK_UNINTERRUPTIBLE | __TASK_STOPPED | \ __TASK_TRACED | EXIT_DEAD | EXIT_ZOMBIE | \ TASK_PARKED) #define task_is_running(task) (READ_ONCE((task)->__state) == TASK_RUNNING) #define task_is_traced(task) ((READ_ONCE(task->jobctl) & JOBCTL_TRACED) != 0) #define task_is_stopped(task) ((READ_ONCE(task->jobctl) & JOBCTL_STOPPED) != 0) #define task_is_stopped_or_traced(task) ((READ_ONCE(task->jobctl) & (JOBCTL_STOPPED | JOBCTL_TRACED)) != 0) /* * Special states are those that do not use the normal wait-loop pattern. See * the comment with set_special_state(). */ #define is_special_task_state(state) \ ((state) & (__TASK_STOPPED | __TASK_TRACED | TASK_PARKED | TASK_DEAD)) #ifdef CONFIG_DEBUG_ATOMIC_SLEEP # define debug_normal_state_change(state_value) \ do { \ WARN_ON_ONCE(is_special_task_state(state_value)); \ current->task_state_change = _THIS_IP_; \ } while (0) # define debug_special_state_change(state_value) \ do { \ WARN_ON_ONCE(!is_special_task_state(state_value)); \ current->task_state_change = _THIS_IP_; \ } while (0) # define debug_rtlock_wait_set_state() \ do { \ current->saved_state_change = current->task_state_change;\ current->task_state_change = _THIS_IP_; \ } while (0) # define debug_rtlock_wait_restore_state() \ do { \ current->task_state_change = current->saved_state_change;\ } while (0) #else # define debug_normal_state_change(cond) do { } while (0) # define debug_special_state_change(cond) do { } while (0) # define debug_rtlock_wait_set_state() do { } while (0) # define debug_rtlock_wait_restore_state() do { } while (0) #endif /* * set_current_state() includes a barrier so that the write of current->__state * is correctly serialised wrt the caller's subsequent test of whether to * actually sleep: * * for (;;) { * set_current_state(TASK_UNINTERRUPTIBLE); * if (CONDITION) * break; * * schedule(); * } * __set_current_state(TASK_RUNNING); * * If the caller does not need such serialisation (because, for instance, the * CONDITION test and condition change and wakeup are under the same lock) then * use __set_current_state(). * * The above is typically ordered against the wakeup, which does: * * CONDITION = 1; * wake_up_state(p, TASK_UNINTERRUPTIBLE); * * where wake_up_state()/try_to_wake_up() executes a full memory barrier before * accessing p->__state. * * Wakeup will do: if (@state & p->__state) p->__state = TASK_RUNNING, that is, * once it observes the TASK_UNINTERRUPTIBLE store the waking CPU can issue a * TASK_RUNNING store which can collide with __set_current_state(TASK_RUNNING). * * However, with slightly different timing the wakeup TASK_RUNNING store can * also collide with the TASK_UNINTERRUPTIBLE store. Losing that store is not * a problem either because that will result in one extra go around the loop * and our @cond test will save the day. * * Also see the comments of try_to_wake_up(). */ #define __set_current_state(state_value) \ do { \ debug_normal_state_change((state_value)); \ WRITE_ONCE(current->__state, (state_value)); \ } while (0) #define set_current_state(state_value) \ do { \ debug_normal_state_change((state_value)); \ smp_store_mb(current->__state, (state_value)); \ } while (0) /* * set_special_state() should be used for those states when the blocking task * can not use the regular condition based wait-loop. In that case we must * serialize against wakeups such that any possible in-flight TASK_RUNNING * stores will not collide with our state change. */ #define set_special_state(state_value) \ do { \ unsigned long flags; /* may shadow */ \ \ raw_spin_lock_irqsave(¤t->pi_lock, flags); \ debug_special_state_change((state_value)); \ WRITE_ONCE(current->__state, (state_value)); \ raw_spin_unlock_irqrestore(¤t->pi_lock, flags); \ } while (0) /* * PREEMPT_RT specific variants for "sleeping" spin/rwlocks * * RT's spin/rwlock substitutions are state preserving. The state of the * task when blocking on the lock is saved in task_struct::saved_state and * restored after the lock has been acquired. These operations are * serialized by task_struct::pi_lock against try_to_wake_up(). Any non RT * lock related wakeups while the task is blocked on the lock are * redirected to operate on task_struct::saved_state to ensure that these * are not dropped. On restore task_struct::saved_state is set to * TASK_RUNNING so any wakeup attempt redirected to saved_state will fail. * * The lock operation looks like this: * * current_save_and_set_rtlock_wait_state(); * for (;;) { * if (try_lock()) * break; * raw_spin_unlock_irq(&lock->wait_lock); * schedule_rtlock(); * raw_spin_lock_irq(&lock->wait_lock); * set_current_state(TASK_RTLOCK_WAIT); * } * current_restore_rtlock_saved_state(); */ #define current_save_and_set_rtlock_wait_state() \ do { \ lockdep_assert_irqs_disabled(); \ raw_spin_lock(¤t->pi_lock); \ current->saved_state = current->__state; \ debug_rtlock_wait_set_state(); \ WRITE_ONCE(current->__state, TASK_RTLOCK_WAIT); \ raw_spin_unlock(¤t->pi_lock); \ } while (0); #define current_restore_rtlock_saved_state() \ do { \ lockdep_assert_irqs_disabled(); \ raw_spin_lock(¤t->pi_lock); \ debug_rtlock_wait_restore_state(); \ WRITE_ONCE(current->__state, current->saved_state); \ current->saved_state = TASK_RUNNING; \ raw_spin_unlock(¤t->pi_lock); \ } while (0); #define get_current_state() READ_ONCE(current->__state) /* * Define the task command name length as enum, then it can be visible to * BPF programs. */ enum { TASK_COMM_LEN = 16, }; extern void scheduler_tick(void); #define MAX_SCHEDULE_TIMEOUT LONG_MAX extern long schedule_timeout(long timeout); extern long schedule_timeout_interruptible(long timeout); extern long schedule_timeout_killable(long timeout); extern long schedule_timeout_uninterruptible(long timeout); extern long schedule_timeout_idle(long timeout); asmlinkage void schedule(void); extern void schedule_preempt_disabled(void); asmlinkage void preempt_schedule_irq(void); #ifdef CONFIG_PREEMPT_RT extern void schedule_rtlock(void); #endif extern int __must_check io_schedule_prepare(void); extern void io_schedule_finish(int token); extern long io_schedule_timeout(long timeout); extern void io_schedule(void); /** * struct prev_cputime - snapshot of system and user cputime * @utime: time spent in user mode * @stime: time spent in system mode * @lock: protects the above two fields * * Stores previous user/system time values such that we can guarantee * monotonicity. */ struct prev_cputime { #ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE u64 utime; u64 stime; raw_spinlock_t lock; #endif }; enum vtime_state { /* Task is sleeping or running in a CPU with VTIME inactive: */ VTIME_INACTIVE = 0, /* Task is idle */ VTIME_IDLE, /* Task runs in kernelspace in a CPU with VTIME active: */ VTIME_SYS, /* Task runs in userspace in a CPU with VTIME active: */ VTIME_USER, /* Task runs as guests in a CPU with VTIME active: */ VTIME_GUEST, }; struct vtime { seqcount_t seqcount; unsigned long long starttime; enum vtime_state state; unsigned int cpu; u64 utime; u64 stime; u64 gtime; }; /* * Utilization clamp constraints. * @UCLAMP_MIN: Minimum utilization * @UCLAMP_MAX: Maximum utilization * @UCLAMP_CNT: Utilization clamp constraints count */ enum uclamp_id { UCLAMP_MIN = 0, UCLAMP_MAX, UCLAMP_CNT }; #ifdef CONFIG_SMP extern struct root_domain def_root_domain; extern struct mutex sched_domains_mutex; #endif struct sched_param { int sched_priority; }; struct sched_info { #ifdef CONFIG_SCHED_INFO /* Cumulative counters: */ /* # of times we have run on this CPU: */ unsigned long pcount; /* Time spent waiting on a runqueue: */ unsigned long long run_delay; /* Timestamps: */ /* When did we last run on a CPU? */ unsigned long long last_arrival; /* When were we last queued to run? */ unsigned long long last_queued; #endif /* CONFIG_SCHED_INFO */ }; /* * Integer metrics need fixed point arithmetic, e.g., sched/fair * has a few: load, load_avg, util_avg, freq, and capacity. * * We define a basic fixed point arithmetic range, and then formalize * all these metrics based on that basic range. */ # define SCHED_FIXEDPOINT_SHIFT 10 # define SCHED_FIXEDPOINT_SCALE (1L << SCHED_FIXEDPOINT_SHIFT) /* Increase resolution of cpu_capacity calculations */ # define SCHED_CAPACITY_SHIFT SCHED_FIXEDPOINT_SHIFT # define SCHED_CAPACITY_SCALE (1L << SCHED_CAPACITY_SHIFT) struct load_weight { unsigned long weight; u32 inv_weight; }; /* * The load/runnable/util_avg accumulates an infinite geometric series * (see __update_load_avg_cfs_rq() in kernel/sched/pelt.c). * * [load_avg definition] * * load_avg = runnable% * scale_load_down(load) * * [runnable_avg definition] * * runnable_avg = runnable% * SCHED_CAPACITY_SCALE * * [util_avg definition] * * util_avg = running% * SCHED_CAPACITY_SCALE * * where runnable% is the time ratio that a sched_entity is runnable and * running% the time ratio that a sched_entity is running. * * For cfs_rq, they are the aggregated values of all runnable and blocked * sched_entities. * * The load/runnable/util_avg doesn't directly factor frequency scaling and CPU * capacity scaling. The scaling is done through the rq_clock_pelt that is used * for computing those signals (see update_rq_clock_pelt()) * * N.B., the above ratios (runnable% and running%) themselves are in the * range of [0, 1]. To do fixed point arithmetics, we therefore scale them * to as large a range as necessary. This is for example reflected by * util_avg's SCHED_CAPACITY_SCALE. * * [Overflow issue] * * The 64-bit load_sum can have 4353082796 (=2^64/47742/88761) entities * with the highest load (=88761), always runnable on a single cfs_rq, * and should not overflow as the number already hits PID_MAX_LIMIT. * * For all other cases (including 32-bit kernels), struct load_weight's * weight will overflow first before we do, because: * * Max(load_avg) <= Max(load.weight) * * Then it is the load_weight's responsibility to consider overflow * issues. */ struct sched_avg { u64 last_update_time; u64 load_sum; u64 runnable_sum; u32 util_sum; u32 period_contrib; unsigned long load_avg; unsigned long runnable_avg; unsigned long util_avg; unsigned int util_est; } ____cacheline_aligned; /* * The UTIL_AVG_UNCHANGED flag is used to synchronize util_est with util_avg * updates. When a task is dequeued, its util_est should not be updated if its * util_avg has not been updated in the meantime. * This information is mapped into the MSB bit of util_est at dequeue time. * Since max value of util_est for a task is 1024 (PELT util_avg for a task) * it is safe to use MSB. */ #define UTIL_EST_WEIGHT_SHIFT 2 #define UTIL_AVG_UNCHANGED 0x80000000 struct sched_statistics { #ifdef CONFIG_SCHEDSTATS u64 wait_start; u64 wait_max; u64 wait_count; u64 wait_sum; u64 iowait_count; u64 iowait_sum; u64 sleep_start; u64 sleep_max; s64 sum_sleep_runtime; u64 block_start; u64 block_max; s64 sum_block_runtime; s64 exec_max; u64 slice_max; u64 nr_migrations_cold; u64 nr_failed_migrations_affine; u64 nr_failed_migrations_running; u64 nr_failed_migrations_hot; u64 nr_forced_migrations; u64 nr_wakeups; u64 nr_wakeups_sync; u64 nr_wakeups_migrate; u64 nr_wakeups_local; u64 nr_wakeups_remote; u64 nr_wakeups_affine; u64 nr_wakeups_affine_attempts; u64 nr_wakeups_passive; u64 nr_wakeups_idle; #ifdef CONFIG_SCHED_CORE u64 core_forceidle_sum; #endif #endif /* CONFIG_SCHEDSTATS */ } ____cacheline_aligned; struct sched_entity { /* For load-balancing: */ struct load_weight load; struct rb_node run_node; u64 deadline; u64 min_vruntime; struct list_head group_node; unsigned int on_rq; u64 exec_start; u64 sum_exec_runtime; u64 prev_sum_exec_runtime; u64 vruntime; s64 vlag; u64 slice; u64 nr_migrations; #ifdef CONFIG_FAIR_GROUP_SCHED int depth; struct sched_entity *parent; /* rq on which this entity is (to be) queued: */ struct cfs_rq *cfs_rq; /* rq "owned" by this entity/group: */ struct cfs_rq *my_q; /* cached value of my_q->h_nr_running */ unsigned long runnable_weight; #endif #ifdef CONFIG_SMP /* * Per entity load average tracking. * * Put into separate cache line so it does not * collide with read-mostly values above. */ struct sched_avg avg; #endif }; struct sched_rt_entity { struct list_head run_list; unsigned long timeout; unsigned long watchdog_stamp; unsigned int time_slice; unsigned short on_rq; unsigned short on_list; struct sched_rt_entity *back; #ifdef CONFIG_RT_GROUP_SCHED struct sched_rt_entity *parent; /* rq on which this entity is (to be) queued: */ struct rt_rq *rt_rq; /* rq "owned" by this entity/group: */ struct rt_rq *my_q; #endif } __randomize_layout; typedef bool (*dl_server_has_tasks_f)(struct sched_dl_entity *); typedef struct task_struct *(*dl_server_pick_f)(struct sched_dl_entity *); struct sched_dl_entity { struct rb_node rb_node; /* * Original scheduling parameters. Copied here from sched_attr * during sched_setattr(), they will remain the same until * the next sched_setattr(). */ u64 dl_runtime; /* Maximum runtime for each instance */ u64 dl_deadline; /* Relative deadline of each instance */ u64 dl_period; /* Separation of two instances (period) */ u64 dl_bw; /* dl_runtime / dl_period */ u64 dl_density; /* dl_runtime / dl_deadline */ /* * Actual scheduling parameters. Initialized with the values above, * they are continuously updated during task execution. Note that * the remaining runtime could be < 0 in case we are in overrun. */ s64 runtime; /* Remaining runtime for this instance */ u64 deadline; /* Absolute deadline for this instance */ unsigned int flags; /* Specifying the scheduler behaviour */ /* * Some bool flags: * * @dl_throttled tells if we exhausted the runtime. If so, the * task has to wait for a replenishment to be performed at the * next firing of dl_timer. * * @dl_yielded tells if task gave up the CPU before consuming * all its available runtime during the last job. * * @dl_non_contending tells if the task is inactive while still * contributing to the active utilization. In other words, it * indicates if the inactive timer has been armed and its handler * has not been executed yet. This flag is useful to avoid race * conditions between the inactive timer handler and the wakeup * code. * * @dl_overrun tells if the task asked to be informed about runtime * overruns. */ unsigned int dl_throttled : 1; unsigned int dl_yielded : 1; unsigned int dl_non_contending : 1; unsigned int dl_overrun : 1; unsigned int dl_server : 1; /* * Bandwidth enforcement timer. Each -deadline task has its * own bandwidth to be enforced, thus we need one timer per task. */ struct hrtimer dl_timer; /* * Inactive timer, responsible for decreasing the active utilization * at the "0-lag time". When a -deadline task blocks, it contributes * to GRUB's active utilization until the "0-lag time", hence a * timer is needed to decrease the active utilization at the correct * time. */ struct hrtimer inactive_timer; /* * Bits for DL-server functionality. Also see the comment near * dl_server_update(). * * @rq the runqueue this server is for * * @server_has_tasks() returns true if @server_pick return a * runnable task. */ struct rq *rq; dl_server_has_tasks_f server_has_tasks; dl_server_pick_f server_pick; #ifdef CONFIG_RT_MUTEXES /* * Priority Inheritance. When a DEADLINE scheduling entity is boosted * pi_se points to the donor, otherwise points to the dl_se it belongs * to (the original one/itself). */ struct sched_dl_entity *pi_se; #endif }; #ifdef CONFIG_UCLAMP_TASK /* Number of utilization clamp buckets (shorter alias) */ #define UCLAMP_BUCKETS CONFIG_UCLAMP_BUCKETS_COUNT /* * Utilization clamp for a scheduling entity * @value: clamp value "assigned" to a se * @bucket_id: bucket index corresponding to the "assigned" value * @active: the se is currently refcounted in a rq's bucket * @user_defined: the requested clamp value comes from user-space * * The bucket_id is the index of the clamp bucket matching the clamp value * which is pre-computed and stored to avoid expensive integer divisions from * the fast path. * * The active bit is set whenever a task has got an "effective" value assigned, * which can be different from the clamp value "requested" from user-space. * This allows to know a task is refcounted in the rq's bucket corresponding * to the "effective" bucket_id. * * The user_defined bit is set whenever a task has got a task-specific clamp * value requested from userspace, i.e. the system defaults apply to this task * just as a restriction. This allows to relax default clamps when a less * restrictive task-specific value has been requested, thus allowing to * implement a "nice" semantic. For example, a task running with a 20% * default boost can still drop its own boosting to 0%. */ struct uclamp_se { unsigned int value : bits_per(SCHED_CAPACITY_SCALE); unsigned int bucket_id : bits_per(UCLAMP_BUCKETS); unsigned int active : 1; unsigned int user_defined : 1; }; #endif /* CONFIG_UCLAMP_TASK */ union rcu_special { struct { u8 blocked; u8 need_qs; u8 exp_hint; /* Hint for performance. */ u8 need_mb; /* Readers need smp_mb(). */ } b; /* Bits. */ u32 s; /* Set of bits. */ }; enum perf_event_task_context { perf_invalid_context = -1, perf_hw_context = 0, perf_sw_context, perf_nr_task_contexts, }; struct wake_q_node { struct wake_q_node *next; }; struct kmap_ctrl { #ifdef CONFIG_KMAP_LOCAL int idx; pte_t pteval[KM_MAX_IDX]; #endif }; struct task_struct { #ifdef CONFIG_THREAD_INFO_IN_TASK /* * For reasons of header soup (see current_thread_info()), this * must be the first element of task_struct. */ struct thread_info thread_info; #endif unsigned int __state; /* saved state for "spinlock sleepers" */ unsigned int saved_state; /* * This begins the randomizable portion of task_struct. Only * scheduling-critical items should be added above here. */ randomized_struct_fields_start void *stack; refcount_t usage; /* Per task flags (PF_*), defined further below: */ unsigned int flags; unsigned int ptrace; #ifdef CONFIG_SMP int on_cpu; struct __call_single_node wake_entry; unsigned int wakee_flips; unsigned long wakee_flip_decay_ts; struct task_struct *last_wakee; /* * recent_used_cpu is initially set as the last CPU used by a task * that wakes affine another task. Waker/wakee relationships can * push tasks around a CPU where each wakeup moves to the next one. * Tracking a recently used CPU allows a quick search for a recently * used CPU that may be idle. */ int recent_used_cpu; int wake_cpu; #endif int on_rq; int prio; int static_prio; int normal_prio; unsigned int rt_priority; struct sched_entity se; struct sched_rt_entity rt; struct sched_dl_entity dl; struct sched_dl_entity *dl_server; const struct sched_class *sched_class; #ifdef CONFIG_SCHED_CORE struct rb_node core_node; unsigned long core_cookie; unsigned int core_occupation; #endif #ifdef CONFIG_CGROUP_SCHED struct task_group *sched_task_group; #endif #ifdef CONFIG_UCLAMP_TASK /* * Clamp values requested for a scheduling entity. * Must be updated with task_rq_lock() held. */ struct uclamp_se uclamp_req[UCLAMP_CNT]; /* * Effective clamp values used for a scheduling entity. * Must be updated with task_rq_lock() held. */ struct uclamp_se uclamp[UCLAMP_CNT]; #endif struct sched_statistics stats; #ifdef CONFIG_PREEMPT_NOTIFIERS /* List of struct preempt_notifier: */ struct hlist_head preempt_notifiers; #endif #ifdef CONFIG_BLK_DEV_IO_TRACE unsigned int btrace_seq; #endif unsigned int policy; int nr_cpus_allowed; const cpumask_t *cpus_ptr; cpumask_t *user_cpus_ptr; cpumask_t cpus_mask; void *migration_pending; #ifdef CONFIG_SMP unsigned short migration_disabled; #endif unsigned short migration_flags; #ifdef CONFIG_PREEMPT_RCU int rcu_read_lock_nesting; union rcu_special rcu_read_unlock_special; struct list_head rcu_node_entry; struct rcu_node *rcu_blocked_node; #endif /* #ifdef CONFIG_PREEMPT_RCU */ #ifdef CONFIG_TASKS_RCU unsigned long rcu_tasks_nvcsw; u8 rcu_tasks_holdout; u8 rcu_tasks_idx; int rcu_tasks_idle_cpu; struct list_head rcu_tasks_holdout_list; #endif /* #ifdef CONFIG_TASKS_RCU */ #ifdef CONFIG_TASKS_TRACE_RCU int trc_reader_nesting; int trc_ipi_to_cpu; union rcu_special trc_reader_special; struct list_head trc_holdout_list; struct list_head trc_blkd_node; int trc_blkd_cpu; #endif /* #ifdef CONFIG_TASKS_TRACE_RCU */ struct sched_info sched_info; struct list_head tasks; #ifdef CONFIG_SMP struct plist_node pushable_tasks; struct rb_node pushable_dl_tasks; #endif struct mm_struct *mm; struct mm_struct *active_mm; struct address_space *faults_disabled_mapping; int exit_state; int exit_code; int exit_signal; /* The signal sent when the parent dies: */ int pdeath_signal; /* JOBCTL_*, siglock protected: */ unsigned long jobctl; /* Used for emulating ABI behavior of previous Linux versions: */ unsigned int personality; /* Scheduler bits, serialized by scheduler locks: */ unsigned sched_reset_on_fork:1; unsigned sched_contributes_to_load:1; unsigned sched_migrated:1; /* Force alignment to the next boundary: */ unsigned :0; /* Unserialized, strictly 'current' */ /* * This field must not be in the scheduler word above due to wakelist * queueing no longer being serialized by p->on_cpu. However: * * p->XXX = X; ttwu() * schedule() if (p->on_rq && ..) // false * smp_mb__after_spinlock(); if (smp_load_acquire(&p->on_cpu) && //true * deactivate_task() ttwu_queue_wakelist()) * p->on_rq = 0; p->sched_remote_wakeup = Y; * * guarantees all stores of 'current' are visible before * ->sched_remote_wakeup gets used, so it can be in this word. */ unsigned sched_remote_wakeup:1; #ifdef CONFIG_RT_MUTEXES unsigned sched_rt_mutex:1; #endif /* Bit to tell LSMs we're in execve(): */ unsigned in_execve:1; unsigned in_iowait:1; #ifndef TIF_RESTORE_SIGMASK unsigned restore_sigmask:1; #endif #ifdef CONFIG_MEMCG unsigned in_user_fault:1; #endif #ifdef CONFIG_LRU_GEN /* whether the LRU algorithm may apply to this access */ unsigned in_lru_fault:1; #endif #ifdef CONFIG_COMPAT_BRK unsigned brk_randomized:1; #endif #ifdef CONFIG_CGROUPS /* disallow userland-initiated cgroup migration */ unsigned no_cgroup_migration:1; /* task is frozen/stopped (used by the cgroup freezer) */ unsigned frozen:1; #endif #ifdef CONFIG_BLK_CGROUP unsigned use_memdelay:1; #endif #ifdef CONFIG_PSI /* Stalled due to lack of memory */ unsigned in_memstall:1; #endif #ifdef CONFIG_PAGE_OWNER /* Used by page_owner=on to detect recursion in page tracking. */ unsigned in_page_owner:1; #endif #ifdef CONFIG_EVENTFD /* Recursion prevention for eventfd_signal() */ unsigned in_eventfd:1; #endif #ifdef CONFIG_IOMMU_SVA unsigned pasid_activated:1; #endif #ifdef CONFIG_CPU_SUP_INTEL unsigned reported_split_lock:1; #endif #ifdef CONFIG_TASK_DELAY_ACCT /* delay due to memory thrashing */ unsigned in_thrashing:1; #endif unsigned long atomic_flags; /* Flags requiring atomic access. */ struct restart_block restart_block; pid_t pid; pid_t tgid; #ifdef CONFIG_STACKPROTECTOR /* Canary value for the -fstack-protector GCC feature: */ unsigned long stack_canary; #endif /* * Pointers to the (original) parent process, youngest child, younger sibling, * older sibling, respectively. (p->father can be replaced with * p->real_parent->pid) */ /* Real parent process: */ struct task_struct __rcu *real_parent; /* Recipient of SIGCHLD, wait4() reports: */ struct task_struct __rcu *parent; /* * Children/sibling form the list of natural children: */ struct list_head children; struct list_head sibling; struct task_struct *group_leader; /* * 'ptraced' is the list of tasks this task is using ptrace() on. * * This includes both natural children and PTRACE_ATTACH targets. * 'ptrace_entry' is this task's link on the p->parent->ptraced list. */ struct list_head ptraced; struct list_head ptrace_entry; /* PID/PID hash table linkage. */ struct pid *thread_pid; struct hlist_node pid_links[PIDTYPE_MAX]; struct list_head thread_node; struct completion *vfork_done; /* CLONE_CHILD_SETTID: */ int __user *set_child_tid; /* CLONE_CHILD_CLEARTID: */ int __user *clear_child_tid; /* PF_KTHREAD | PF_IO_WORKER */ void *worker_private; u64 utime; u64 stime; #ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME u64 utimescaled; u64 stimescaled; #endif u64 gtime; struct prev_cputime prev_cputime; #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN struct vtime vtime; #endif #ifdef CONFIG_NO_HZ_FULL atomic_t tick_dep_mask; #endif /* Context switch counts: */ unsigned long nvcsw; unsigned long nivcsw; /* Monotonic time in nsecs: */ u64 start_time; /* Boot based time in nsecs: */ u64 start_boottime; /* MM fault and swap info: this can arguably be seen as either mm-specific or thread-specific: */ unsigned long min_flt; unsigned long maj_flt; /* Empty if CONFIG_POSIX_CPUTIMERS=n */ struct posix_cputimers posix_cputimers; #ifdef CONFIG_POSIX_CPU_TIMERS_TASK_WORK struct posix_cputimers_work posix_cputimers_work; #endif /* Process credentials: */ /* Tracer's credentials at attach: */ const struct cred __rcu *ptracer_cred; /* Objective and real subjective task credentials (COW): */ const struct cred __rcu *real_cred; /* Effective (overridable) subjective task credentials (COW): */ const struct cred __rcu *cred; #ifdef CONFIG_KEYS /* Cached requested key. */ struct key *cached_requested_key; #endif /* * executable name, excluding path. * * - normally initialized setup_new_exec() * - access it with [gs]et_task_comm() * - lock it with task_lock() */ char comm[TASK_COMM_LEN]; struct nameidata *nameidata; #ifdef CONFIG_SYSVIPC struct sysv_sem sysvsem; struct sysv_shm sysvshm; #endif #ifdef CONFIG_DETECT_HUNG_TASK unsigned long last_switch_count; unsigned long last_switch_time; #endif /* Filesystem information: */ struct fs_struct *fs; /* Open file information: */ struct files_struct *files; #ifdef CONFIG_IO_URING struct io_uring_task *io_uring; #endif /* Namespaces: */ struct nsproxy *nsproxy; /* Signal handlers: */ struct signal_struct *signal; struct sighand_struct __rcu *sighand; sigset_t blocked; sigset_t real_blocked; /* Restored if set_restore_sigmask() was used: */ sigset_t saved_sigmask; struct sigpending pending; unsigned long sas_ss_sp; size_t sas_ss_size; unsigned int sas_ss_flags; struct callback_head *task_works; #ifdef CONFIG_AUDIT #ifdef CONFIG_AUDITSYSCALL struct audit_context *audit_context; #endif kuid_t loginuid; unsigned int sessionid; #endif struct seccomp seccomp; struct syscall_user_dispatch syscall_dispatch; /* Thread group tracking: */ u64 parent_exec_id; u64 self_exec_id; /* Protection against (de-)allocation: mm, files, fs, tty, keyrings, mems_allowed, mempolicy: */ spinlock_t alloc_lock; /* Protection of the PI data structures: */ raw_spinlock_t pi_lock; struct wake_q_node wake_q; #ifdef CONFIG_RT_MUTEXES /* PI waiters blocked on a rt_mutex held by this task: */ struct rb_root_cached pi_waiters; /* Updated under owner's pi_lock and rq lock */ struct task_struct *pi_top_task; /* Deadlock detection and priority inheritance handling: */ struct rt_mutex_waiter *pi_blocked_on; #endif #ifdef CONFIG_DEBUG_MUTEXES /* Mutex deadlock detection: */ struct mutex_waiter *blocked_on; #endif #ifdef CONFIG_DEBUG_ATOMIC_SLEEP int non_block_count; #endif #ifdef CONFIG_TRACE_IRQFLAGS struct irqtrace_events irqtrace; unsigned int hardirq_threaded; u64 hardirq_chain_key; int softirqs_enabled; int softirq_context; int irq_config; #endif #ifdef CONFIG_PREEMPT_RT int softirq_disable_cnt; #endif #ifdef CONFIG_LOCKDEP # define MAX_LOCK_DEPTH 48UL u64 curr_chain_key; int lockdep_depth; unsigned int lockdep_recursion; struct held_lock held_locks[MAX_LOCK_DEPTH]; #endif #if defined(CONFIG_UBSAN) && !defined(CONFIG_UBSAN_TRAP) unsigned int in_ubsan; #endif /* Journalling filesystem info: */ void *journal_info; /* Stacked block device info: */ struct bio_list *bio_list; /* Stack plugging: */ struct blk_plug *plug; /* VM state: */ struct reclaim_state *reclaim_state; struct io_context *io_context; #ifdef CONFIG_COMPACTION struct capture_control *capture_control; #endif /* Ptrace state: */ unsigned long ptrace_message; kernel_siginfo_t *last_siginfo; struct task_io_accounting ioac; #ifdef CONFIG_PSI /* Pressure stall state */ unsigned int psi_flags; #endif #ifdef CONFIG_TASK_XACCT /* Accumulated RSS usage: */ u64 acct_rss_mem1; /* Accumulated virtual memory usage: */ u64 acct_vm_mem1; /* stime + utime since last update: */ u64 acct_timexpd; #endif #ifdef CONFIG_CPUSETS /* Protected by ->alloc_lock: */ nodemask_t mems_allowed; /* Sequence number to catch updates: */ seqcount_spinlock_t mems_allowed_seq; int cpuset_mem_spread_rotor; int cpuset_slab_spread_rotor; #endif #ifdef CONFIG_CGROUPS /* Control Group info protected by css_set_lock: */ struct css_set __rcu *cgroups; /* cg_list protected by css_set_lock and tsk->alloc_lock: */ struct list_head cg_list; #endif #ifdef CONFIG_X86_CPU_RESCTRL u32 closid; u32 rmid; #endif #ifdef CONFIG_FUTEX struct robust_list_head __user *robust_list; #ifdef CONFIG_COMPAT struct compat_robust_list_head __user *compat_robust_list; #endif struct list_head pi_state_list; struct futex_pi_state *pi_state_cache; struct mutex futex_exit_mutex; unsigned int futex_state; #endif #ifdef CONFIG_PERF_EVENTS struct perf_event_context *perf_event_ctxp; struct mutex perf_event_mutex; struct list_head perf_event_list; #endif #ifdef CONFIG_DEBUG_PREEMPT unsigned long preempt_disable_ip; #endif #ifdef CONFIG_NUMA /* Protected by alloc_lock: */ struct mempolicy *mempolicy; short il_prev; short pref_node_fork; #endif #ifdef CONFIG_NUMA_BALANCING int numa_scan_seq; unsigned int numa_scan_period; unsigned int numa_scan_period_max; int numa_preferred_nid; unsigned long numa_migrate_retry; /* Migration stamp: */ u64 node_stamp; u64 last_task_numa_placement; u64 last_sum_exec_runtime; struct callback_head numa_work; /* * This pointer is only modified for current in syscall and * pagefault context (and for tasks being destroyed), so it can be read * from any of the following contexts: * - RCU read-side critical section * - current->numa_group from everywhere * - task's runqueue locked, task not running */ struct numa_group __rcu *numa_group; /* * numa_faults is an array split into four regions: * faults_memory, faults_cpu, faults_memory_buffer, faults_cpu_buffer * in this precise order. * * faults_memory: Exponential decaying average of faults on a per-node * basis. Scheduling placement decisions are made based on these * counts. The values remain static for the duration of a PTE scan. * faults_cpu: Track the nodes the process was running on when a NUMA * hinting fault was incurred. * faults_memory_buffer and faults_cpu_buffer: Record faults per node * during the current scan window. When the scan completes, the counts * in faults_memory and faults_cpu decay and these values are copied. */ unsigned long *numa_faults; unsigned long total_numa_faults; /* * numa_faults_locality tracks if faults recorded during the last * scan window were remote/local or failed to migrate. The task scan * period is adapted based on the locality of the faults with different * weights depending on whether they were shared or private faults */ unsigned long numa_faults_locality[3]; unsigned long numa_pages_migrated; #endif /* CONFIG_NUMA_BALANCING */ #ifdef CONFIG_RSEQ struct rseq __user *rseq; u32 rseq_len; u32 rseq_sig; /* * RmW on rseq_event_mask must be performed atomically * with respect to preemption. */ unsigned long rseq_event_mask; #endif #ifdef CONFIG_SCHED_MM_CID int mm_cid; /* Current cid in mm */ int last_mm_cid; /* Most recent cid in mm */ int migrate_from_cpu; int mm_cid_active; /* Whether cid bitmap is active */ struct callback_head cid_work; #endif struct tlbflush_unmap_batch tlb_ubc; /* Cache last used pipe for splice(): */ struct pipe_inode_info *splice_pipe; struct page_frag task_frag; #ifdef CONFIG_TASK_DELAY_ACCT struct task_delay_info *delays; #endif #ifdef CONFIG_FAULT_INJECTION int make_it_fail; unsigned int fail_nth; #endif /* * When (nr_dirtied >= nr_dirtied_pause), it's time to call * balance_dirty_pages() for a dirty throttling pause: */ int nr_dirtied; int nr_dirtied_pause; /* Start of a write-and-pause period: */ unsigned long dirty_paused_when; #ifdef CONFIG_LATENCYTOP int latency_record_count; struct latency_record latency_record[LT_SAVECOUNT]; #endif /* * Time slack values; these are used to round up poll() and * select() etc timeout values. These are in nanoseconds. */ u64 timer_slack_ns; u64 default_timer_slack_ns; #if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) unsigned int kasan_depth; #endif #ifdef CONFIG_KCSAN struct kcsan_ctx kcsan_ctx; #ifdef CONFIG_TRACE_IRQFLAGS struct irqtrace_events kcsan_save_irqtrace; #endif #ifdef CONFIG_KCSAN_WEAK_MEMORY int kcsan_stack_depth; #endif #endif #ifdef CONFIG_KMSAN struct kmsan_ctx kmsan_ctx; #endif #if IS_ENABLED(CONFIG_KUNIT) struct kunit *kunit_test; #endif #ifdef CONFIG_FUNCTION_GRAPH_TRACER /* Index of current stored address in ret_stack: */ int curr_ret_stack; int curr_ret_depth; /* Stack of return addresses for return function tracing: */ struct ftrace_ret_stack *ret_stack; /* Timestamp for last schedule: */ unsigned long long ftrace_timestamp; /* * Number of functions that haven't been traced * because of depth overrun: */ atomic_t trace_overrun; /* Pause tracing: */ atomic_t tracing_graph_pause; #endif #ifdef CONFIG_TRACING /* Bitmask and counter of trace recursion: */ unsigned long trace_recursion; #endif /* CONFIG_TRACING */ #ifdef CONFIG_KCOV /* See kernel/kcov.c for more details. */ /* Coverage collection mode enabled for this task (0 if disabled): */ unsigned int kcov_mode; /* Size of the kcov_area: */ unsigned int kcov_size; /* Buffer for coverage collection: */ void *kcov_area; /* KCOV descriptor wired with this task or NULL: */ struct kcov *kcov; /* KCOV common handle for remote coverage collection: */ u64 kcov_handle; /* KCOV sequence number: */ int kcov_sequence; /* Collect coverage from softirq context: */ unsigned int kcov_softirq; #endif #ifdef CONFIG_MEMCG struct mem_cgroup *memcg_in_oom; gfp_t memcg_oom_gfp_mask; int memcg_oom_order; /* Number of pages to reclaim on returning to userland: */ unsigned int memcg_nr_pages_over_high; /* Used by memcontrol for targeted memcg charge: */ struct mem_cgroup *active_memcg; #endif #ifdef CONFIG_MEMCG_KMEM struct obj_cgroup *objcg; #endif #ifdef CONFIG_BLK_CGROUP struct gendisk *throttle_disk; #endif #ifdef CONFIG_UPROBES struct uprobe_task *utask; #endif #if defined(CONFIG_BCACHE) || defined(CONFIG_BCACHE_MODULE) unsigned int sequential_io; unsigned int sequential_io_avg; #endif struct kmap_ctrl kmap_ctrl; #ifdef CONFIG_DEBUG_ATOMIC_SLEEP unsigned long task_state_change; # ifdef CONFIG_PREEMPT_RT unsigned long saved_state_change; # endif #endif struct rcu_head rcu; refcount_t rcu_users; int pagefault_disabled; #ifdef CONFIG_MMU struct task_struct *oom_reaper_list; struct timer_list oom_reaper_timer; #endif #ifdef CONFIG_VMAP_STACK struct vm_struct *stack_vm_area; #endif #ifdef CONFIG_THREAD_INFO_IN_TASK /* A live task holds one reference: */ refcount_t stack_refcount; #endif #ifdef CONFIG_LIVEPATCH int patch_state; #endif #ifdef CONFIG_SECURITY /* Used by LSM modules for access restriction: */ void *security; #endif #ifdef CONFIG_BPF_SYSCALL /* Used by BPF task local storage */ struct bpf_local_storage __rcu *bpf_storage; /* Used for BPF run context */ struct bpf_run_ctx *bpf_ctx; #endif #ifdef CONFIG_GCC_PLUGIN_STACKLEAK unsigned long lowest_stack; unsigned long prev_lowest_stack; #endif #ifdef CONFIG_X86_MCE void __user *mce_vaddr; __u64 mce_kflags; u64 mce_addr; __u64 mce_ripv : 1, mce_whole_page : 1, __mce_reserved : 62; struct callback_head mce_kill_me; int mce_count; #endif #ifdef CONFIG_KRETPROBES struct llist_head kretprobe_instances; #endif #ifdef CONFIG_RETHOOK struct llist_head rethooks; #endif #ifdef CONFIG_ARCH_HAS_PARANOID_L1D_FLUSH /* * If L1D flush is supported on mm context switch * then we use this callback head to queue kill work * to kill tasks that are not running on SMT disabled * cores */ struct callback_head l1d_flush_kill; #endif #ifdef CONFIG_RV /* * Per-task RV monitor. Nowadays fixed in RV_PER_TASK_MONITORS. * If we find justification for more monitors, we can think * about adding more or developing a dynamic method. So far, * none of these are justified. */ union rv_task_monitor rv[RV_PER_TASK_MONITORS]; #endif #ifdef CONFIG_USER_EVENTS struct user_event_mm *user_event_mm; #endif /* * New fields for task_struct should be added above here, so that * they are included in the randomized portion of task_struct. */ randomized_struct_fields_end /* CPU-specific state of this task: */ struct thread_struct thread; /* * WARNING: on x86, 'thread_struct' contains a variable-sized * structure. It *MUST* be at the end of 'task_struct'. * * Do not put anything below here! */ }; #define TASK_REPORT_IDLE (TASK_REPORT + 1) #define TASK_REPORT_MAX (TASK_REPORT_IDLE << 1) static inline unsigned int __task_state_index(unsigned int tsk_state, unsigned int tsk_exit_state) { unsigned int state = (tsk_state | tsk_exit_state) & TASK_REPORT; BUILD_BUG_ON_NOT_POWER_OF_2(TASK_REPORT_MAX); if ((tsk_state & TASK_IDLE) == TASK_IDLE) state = TASK_REPORT_IDLE; /* * We're lying here, but rather than expose a completely new task state * to userspace, we can make this appear as if the task has gone through * a regular rt_mutex_lock() call. */ if (tsk_state & TASK_RTLOCK_WAIT) state = TASK_UNINTERRUPTIBLE; return fls(state); } static inline unsigned int task_state_index(struct task_struct *tsk) { return __task_state_index(READ_ONCE(tsk->__state), tsk->exit_state); } static inline char task_index_to_char(unsigned int state) { static const char state_char[] = "RSDTtXZPI"; BUILD_BUG_ON(1 + ilog2(TASK_REPORT_MAX) != sizeof(state_char) - 1); return state_char[state]; } static inline char task_state_to_char(struct task_struct *tsk) { return task_index_to_char(task_state_index(tsk)); } extern struct pid *cad_pid; /* * Per process flags */ #define PF_VCPU 0x00000001 /* I'm a virtual CPU */ #define PF_IDLE 0x00000002 /* I am an IDLE thread */ #define PF_EXITING 0x00000004 /* Getting shut down */ #define PF_POSTCOREDUMP 0x00000008 /* Coredumps should ignore this task */ #define PF_IO_WORKER 0x00000010 /* Task is an IO worker */ #define PF_WQ_WORKER 0x00000020 /* I'm a workqueue worker */ #define PF_FORKNOEXEC 0x00000040 /* Forked but didn't exec */ #define PF_MCE_PROCESS 0x00000080 /* Process policy on mce errors */ #define PF_SUPERPRIV 0x00000100 /* Used super-user privileges */ #define PF_DUMPCORE 0x00000200 /* Dumped core */ #define PF_SIGNALED 0x00000400 /* Killed by a signal */ #define PF_MEMALLOC 0x00000800 /* Allocating memory */ #define PF_NPROC_EXCEEDED 0x00001000 /* set_user() noticed that RLIMIT_NPROC was exceeded */ #define PF_USED_MATH 0x00002000 /* If unset the fpu must be initialized before use */ #define PF_USER_WORKER 0x00004000 /* Kernel thread cloned from userspace thread */ #define PF_NOFREEZE 0x00008000 /* This thread should not be frozen */ #define PF__HOLE__00010000 0x00010000 #define PF_KSWAPD 0x00020000 /* I am kswapd */ #define PF_MEMALLOC_NOFS 0x00040000 /* All allocation requests will inherit GFP_NOFS */ #define PF_MEMALLOC_NOIO 0x00080000 /* All allocation requests will inherit GFP_NOIO */ #define PF_LOCAL_THROTTLE 0x00100000 /* Throttle writes only against the bdi I write to, * I am cleaning dirty pages from some other bdi. */ #define PF_KTHREAD 0x00200000 /* I am a kernel thread */ #define PF_RANDOMIZE 0x00400000 /* Randomize virtual address space */ #define PF__HOLE__00800000 0x00800000 #define PF__HOLE__01000000 0x01000000 #define PF__HOLE__02000000 0x02000000 #define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_mask */ #define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */ #define PF_MEMALLOC_PIN 0x10000000 /* Allocation context constrained to zones which allow long term pinning. */ #define PF__HOLE__20000000 0x20000000 #define PF__HOLE__40000000 0x40000000 #define PF_SUSPEND_TASK 0x80000000 /* This thread called freeze_processes() and should not be frozen */ /* * Only the _current_ task can read/write to tsk->flags, but other * tasks can access tsk->flags in readonly mode for example * with tsk_used_math (like during threaded core dumping). * There is however an exception to this rule during ptrace * or during fork: the ptracer task is allowed to write to the * child->flags of its traced child (same goes for fork, the parent * can write to the child->flags), because we're guaranteed the * child is not running and in turn not changing child->flags * at the same time the parent does it. */ #define clear_stopped_child_used_math(child) do { (child)->flags &= ~PF_USED_MATH; } while (0) #define set_stopped_child_used_math(child) do { (child)->flags |= PF_USED_MATH; } while (0) #define clear_used_math() clear_stopped_child_used_math(current) #define set_used_math() set_stopped_child_used_math(current) #define conditional_stopped_child_used_math(condition, child) \ do { (child)->flags &= ~PF_USED_MATH, (child)->flags |= (condition) ? PF_USED_MATH : 0; } while (0) #define conditional_used_math(condition) conditional_stopped_child_used_math(condition, current) #define copy_to_stopped_child_used_math(child) \ do { (child)->flags &= ~PF_USED_MATH, (child)->flags |= current->flags & PF_USED_MATH; } while (0) /* NOTE: this will return 0 or PF_USED_MATH, it will never return 1 */ #define tsk_used_math(p) ((p)->flags & PF_USED_MATH) #define used_math() tsk_used_math(current) static __always_inline bool is_percpu_thread(void) { #ifdef CONFIG_SMP return (current->flags & PF_NO_SETAFFINITY) && (current->nr_cpus_allowed == 1); #else return true; #endif } /* Per-process atomic flags. */ #define PFA_NO_NEW_PRIVS 0 /* May not gain new privileges. */ #define PFA_SPREAD_PAGE 1 /* Spread page cache over cpuset */ #define PFA_SPREAD_SLAB 2 /* Spread some slab caches over cpuset */ #define PFA_SPEC_SSB_DISABLE 3 /* Speculative Store Bypass disabled */ #define PFA_SPEC_SSB_FORCE_DISABLE 4 /* Speculative Store Bypass force disabled*/ #define PFA_SPEC_IB_DISABLE 5 /* Indirect branch speculation restricted */ #define PFA_SPEC_IB_FORCE_DISABLE 6 /* Indirect branch speculation permanently restricted */ #define PFA_SPEC_SSB_NOEXEC 7 /* Speculative Store Bypass clear on execve() */ #define TASK_PFA_TEST(name, func) \ static inline bool task_##func(struct task_struct *p) \ { return test_bit(PFA_##name, &p->atomic_flags); } #define TASK_PFA_SET(name, func) \ static inline void task_set_##func(struct task_struct *p) \ { set_bit(PFA_##name, &p->atomic_flags); } #define TASK_PFA_CLEAR(name, func) \ static inline void task_clear_##func(struct task_struct *p) \ { clear_bit(PFA_##name, &p->atomic_flags); } TASK_PFA_TEST(NO_NEW_PRIVS, no_new_privs) TASK_PFA_SET(NO_NEW_PRIVS, no_new_privs) TASK_PFA_TEST(SPREAD_PAGE, spread_page) TASK_PFA_SET(SPREAD_PAGE, spread_page) TASK_PFA_CLEAR(SPREAD_PAGE, spread_page) TASK_PFA_TEST(SPREAD_SLAB, spread_slab) TASK_PFA_SET(SPREAD_SLAB, spread_slab) TASK_PFA_CLEAR(SPREAD_SLAB, spread_slab) TASK_PFA_TEST(SPEC_SSB_DISABLE, spec_ssb_disable) TASK_PFA_SET(SPEC_SSB_DISABLE, spec_ssb_disable) TASK_PFA_CLEAR(SPEC_SSB_DISABLE, spec_ssb_disable) TASK_PFA_TEST(SPEC_SSB_NOEXEC, spec_ssb_noexec) TASK_PFA_SET(SPEC_SSB_NOEXEC, spec_ssb_noexec) TASK_PFA_CLEAR(SPEC_SSB_NOEXEC, spec_ssb_noexec) TASK_PFA_TEST(SPEC_SSB_FORCE_DISABLE, spec_ssb_force_disable) TASK_PFA_SET(SPEC_SSB_FORCE_DISABLE, spec_ssb_force_disable) TASK_PFA_TEST(SPEC_IB_DISABLE, spec_ib_disable) TASK_PFA_SET(SPEC_IB_DISABLE, spec_ib_disable) TASK_PFA_CLEAR(SPEC_IB_DISABLE, spec_ib_disable) TASK_PFA_TEST(SPEC_IB_FORCE_DISABLE, spec_ib_force_disable) TASK_PFA_SET(SPEC_IB_FORCE_DISABLE, spec_ib_force_disable) static inline void current_restore_flags(unsigned long orig_flags, unsigned long flags) { current->flags &= ~flags; current->flags |= orig_flags & flags; } extern int cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial); extern int task_can_attach(struct task_struct *p); extern int dl_bw_alloc(int cpu, u64 dl_bw); extern void dl_bw_free(int cpu, u64 dl_bw); #ifdef CONFIG_SMP /* do_set_cpus_allowed() - consider using set_cpus_allowed_ptr() instead */ extern void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask); /** * set_cpus_allowed_ptr - set CPU affinity mask of a task * @p: the task * @new_mask: CPU affinity mask * * Return: zero if successful, or a negative error code */ extern int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask); extern int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src, int node); extern void release_user_cpus_ptr(struct task_struct *p); extern int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask); extern void force_compatible_cpus_allowed_ptr(struct task_struct *p); extern void relax_compatible_cpus_allowed_ptr(struct task_struct *p); #else static inline void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) { } static inline int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) { if (!cpumask_test_cpu(0, new_mask)) return -EINVAL; return 0; } static inline int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src, int node) { if (src->user_cpus_ptr) return -EINVAL; return 0; } static inline void release_user_cpus_ptr(struct task_struct *p) { WARN_ON(p->user_cpus_ptr); } static inline int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask) { return 0; } #endif extern int yield_to(struct task_struct *p, bool preempt); extern void set_user_nice(struct task_struct *p, long nice); extern int task_prio(const struct task_struct *p); /** * task_nice - return the nice value of a given task. * @p: the task in question. * * Return: The nice value [ -20 ... 0 ... 19 ]. */ static inline int task_nice(const struct task_struct *p) { return PRIO_TO_NICE((p)->static_prio); } extern int can_nice(const struct task_struct *p, const int nice); extern int task_curr(const struct task_struct *p); extern int idle_cpu(int cpu); extern int available_idle_cpu(int cpu); extern int sched_setscheduler(struct task_struct *, int, const struct sched_param *); extern int sched_setscheduler_nocheck(struct task_struct *, int, const struct sched_param *); extern void sched_set_fifo(struct task_struct *p); extern void sched_set_fifo_low(struct task_struct *p); extern void sched_set_normal(struct task_struct *p, int nice); extern int sched_setattr(struct task_struct *, const struct sched_attr *); extern int sched_setattr_nocheck(struct task_struct *, const struct sched_attr *); extern struct task_struct *idle_task(int cpu); /** * is_idle_task - is the specified task an idle task? * @p: the task in question. * * Return: 1 if @p is an idle task. 0 otherwise. */ static __always_inline bool is_idle_task(const struct task_struct *p) { return !!(p->flags & PF_IDLE); } extern struct task_struct *curr_task(int cpu); extern void ia64_set_curr_task(int cpu, struct task_struct *p); void yield(void); union thread_union { struct task_struct task; #ifndef CONFIG_THREAD_INFO_IN_TASK struct thread_info thread_info; #endif unsigned long stack[THREAD_SIZE/sizeof(long)]; }; #ifndef CONFIG_THREAD_INFO_IN_TASK extern struct thread_info init_thread_info; #endif extern unsigned long init_stack[THREAD_SIZE / sizeof(unsigned long)]; #ifdef CONFIG_THREAD_INFO_IN_TASK # define task_thread_info(task) (&(task)->thread_info) #elif !defined(__HAVE_THREAD_FUNCTIONS) # define task_thread_info(task) ((struct thread_info *)(task)->stack) #endif /* * find a task by one of its numerical ids * * find_task_by_pid_ns(): * finds a task by its pid in the specified namespace * find_task_by_vpid(): * finds a task by its virtual pid * * see also find_vpid() etc in include/linux/pid.h */ extern struct task_struct *find_task_by_vpid(pid_t nr); extern struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns); /* * find a task by its virtual pid and get the task struct */ extern struct task_struct *find_get_task_by_vpid(pid_t nr); extern int wake_up_state(struct task_struct *tsk, unsigned int state); extern int wake_up_process(struct task_struct *tsk); extern void wake_up_new_task(struct task_struct *tsk); #ifdef CONFIG_SMP extern void kick_process(struct task_struct *tsk); #else static inline void kick_process(struct task_struct *tsk) { } #endif extern void __set_task_comm(struct task_struct *tsk, const char *from, bool exec); static inline void set_task_comm(struct task_struct *tsk, const char *from) { __set_task_comm(tsk, from, false); } extern char *__get_task_comm(char *to, size_t len, struct task_struct *tsk); #define get_task_comm(buf, tsk) ({ \ BUILD_BUG_ON(sizeof(buf) != TASK_COMM_LEN); \ __get_task_comm(buf, sizeof(buf), tsk); \ }) #ifdef CONFIG_SMP static __always_inline void scheduler_ipi(void) { /* * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting * TIF_NEED_RESCHED remotely (for the first time) will also send * this IPI. */ preempt_fold_need_resched(); } #else static inline void scheduler_ipi(void) { } #endif extern unsigned long wait_task_inactive(struct task_struct *, unsigned int match_state); /* * Set thread flags in other task's structures. * See asm/thread_info.h for TIF_xxxx flags available: */ static inline void set_tsk_thread_flag(struct task_struct *tsk, int flag) { set_ti_thread_flag(task_thread_info(tsk), flag); } static inline void clear_tsk_thread_flag(struct task_struct *tsk, int flag) { clear_ti_thread_flag(task_thread_info(tsk), flag); } static inline void update_tsk_thread_flag(struct task_struct *tsk, int flag, bool value) { update_ti_thread_flag(task_thread_info(tsk), flag, value); } static inline int test_and_set_tsk_thread_flag(struct task_struct *tsk, int flag) { return test_and_set_ti_thread_flag(task_thread_info(tsk), flag); } static inline int test_and_clear_tsk_thread_flag(struct task_struct *tsk, int flag) { return test_and_clear_ti_thread_flag(task_thread_info(tsk), flag); } static inline int test_tsk_thread_flag(struct task_struct *tsk, int flag) { return test_ti_thread_flag(task_thread_info(tsk), flag); } static inline void set_tsk_need_resched(struct task_struct *tsk) { set_tsk_thread_flag(tsk,TIF_NEED_RESCHED); } static inline void clear_tsk_need_resched(struct task_struct *tsk) { clear_tsk_thread_flag(tsk,TIF_NEED_RESCHED); } static inline int test_tsk_need_resched(struct task_struct *tsk) { return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED)); } /* * cond_resched() and cond_resched_lock(): latency reduction via * explicit rescheduling in places that are safe. The return * value indicates whether a reschedule was done in fact. * cond_resched_lock() will drop the spinlock before scheduling, */ #if !defined(CONFIG_PREEMPTION) || defined(CONFIG_PREEMPT_DYNAMIC) extern int __cond_resched(void); #if defined(CONFIG_PREEMPT_DYNAMIC) && defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) void sched_dynamic_klp_enable(void); void sched_dynamic_klp_disable(void); DECLARE_STATIC_CALL(cond_resched, __cond_resched); static __always_inline int _cond_resched(void) { return static_call_mod(cond_resched)(); } #elif defined(CONFIG_PREEMPT_DYNAMIC) && defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) extern int dynamic_cond_resched(void); static __always_inline int _cond_resched(void) { return dynamic_cond_resched(); } #else /* !CONFIG_PREEMPTION */ static inline int _cond_resched(void) { klp_sched_try_switch(); return __cond_resched(); } #endif /* PREEMPT_DYNAMIC && CONFIG_HAVE_PREEMPT_DYNAMIC_CALL */ #else /* CONFIG_PREEMPTION && !CONFIG_PREEMPT_DYNAMIC */ static inline int _cond_resched(void) { klp_sched_try_switch(); return 0; } #endif /* !CONFIG_PREEMPTION || CONFIG_PREEMPT_DYNAMIC */ #define cond_resched() ({ \ __might_resched(__FILE__, __LINE__, 0); \ _cond_resched(); \ }) extern int __cond_resched_lock(spinlock_t *lock); extern int __cond_resched_rwlock_read(rwlock_t *lock); extern int __cond_resched_rwlock_write(rwlock_t *lock); #define MIGHT_RESCHED_RCU_SHIFT 8 #define MIGHT_RESCHED_PREEMPT_MASK ((1U << MIGHT_RESCHED_RCU_SHIFT) - 1) #ifndef CONFIG_PREEMPT_RT /* * Non RT kernels have an elevated preempt count due to the held lock, * but are not allowed to be inside a RCU read side critical section */ # define PREEMPT_LOCK_RESCHED_OFFSETS PREEMPT_LOCK_OFFSET #else /* * spin/rw_lock() on RT implies rcu_read_lock(). The might_sleep() check in * cond_resched*lock() has to take that into account because it checks for * preempt_count() and rcu_preempt_depth(). */ # define PREEMPT_LOCK_RESCHED_OFFSETS \ (PREEMPT_LOCK_OFFSET + (1U << MIGHT_RESCHED_RCU_SHIFT)) #endif #define cond_resched_lock(lock) ({ \ __might_resched(__FILE__, __LINE__, PREEMPT_LOCK_RESCHED_OFFSETS); \ __cond_resched_lock(lock); \ }) #define cond_resched_rwlock_read(lock) ({ \ __might_resched(__FILE__, __LINE__, PREEMPT_LOCK_RESCHED_OFFSETS); \ __cond_resched_rwlock_read(lock); \ }) #define cond_resched_rwlock_write(lock) ({ \ __might_resched(__FILE__, __LINE__, PREEMPT_LOCK_RESCHED_OFFSETS); \ __cond_resched_rwlock_write(lock); \ }) #ifdef CONFIG_PREEMPT_DYNAMIC extern bool preempt_model_none(void); extern bool preempt_model_voluntary(void); extern bool preempt_model_full(void); #else static inline bool preempt_model_none(void) { return IS_ENABLED(CONFIG_PREEMPT_NONE); } static inline bool preempt_model_voluntary(void) { return IS_ENABLED(CONFIG_PREEMPT_VOLUNTARY); } static inline bool preempt_model_full(void) { return IS_ENABLED(CONFIG_PREEMPT); } #endif static inline bool preempt_model_rt(void) { return IS_ENABLED(CONFIG_PREEMPT_RT); } /* * Does the preemption model allow non-cooperative preemption? * * For !CONFIG_PREEMPT_DYNAMIC kernels this is an exact match with * CONFIG_PREEMPTION; for CONFIG_PREEMPT_DYNAMIC this doesn't work as the * kernel is *built* with CONFIG_PREEMPTION=y but may run with e.g. the * PREEMPT_NONE model. */ static inline bool preempt_model_preemptible(void) { return preempt_model_full() || preempt_model_rt(); } static __always_inline bool need_resched(void) { return unlikely(tif_need_resched()); } /* * Wrappers for p->thread_info->cpu access. No-op on UP. */ #ifdef CONFIG_SMP static inline unsigned int task_cpu(const struct task_struct *p) { return READ_ONCE(task_thread_info(p)->cpu); } extern void set_task_cpu(struct task_struct *p, unsigned int cpu); #else static inline unsigned int task_cpu(const struct task_struct *p) { return 0; } static inline void set_task_cpu(struct task_struct *p, unsigned int cpu) { } #endif /* CONFIG_SMP */ extern bool sched_task_on_rq(struct task_struct *p); extern unsigned long get_wchan(struct task_struct *p); extern struct task_struct *cpu_curr_snapshot(int cpu); #include <linux/spinlock.h> /* * In order to reduce various lock holder preemption latencies provide an * interface to see if a vCPU is currently running or not. * * This allows us to terminate optimistic spin loops and block, analogous to * the native optimistic spin heuristic of testing if the lock owner task is * running or not. */ #ifndef vcpu_is_preempted static inline bool vcpu_is_preempted(int cpu) { return false; } #endif extern long sched_setaffinity(pid_t pid, const struct cpumask *new_mask); extern long sched_getaffinity(pid_t pid, struct cpumask *mask); #ifndef TASK_SIZE_OF #define TASK_SIZE_OF(tsk) TASK_SIZE #endif #ifdef CONFIG_SMP static inline bool owner_on_cpu(struct task_struct *owner) { /* * As lock holder preemption issue, we both skip spinning if * task is not on cpu or its cpu is preempted */ return READ_ONCE(owner->on_cpu) && !vcpu_is_preempted(task_cpu(owner)); } /* Returns effective CPU energy utilization, as seen by the scheduler */ unsigned long sched_cpu_util(int cpu); #endif /* CONFIG_SMP */ #ifdef CONFIG_SCHED_CORE extern void sched_core_free(struct task_struct *tsk); extern void sched_core_fork(struct task_struct *p); extern int sched_core_share_pid(unsigned int cmd, pid_t pid, enum pid_type type, unsigned long uaddr); extern int sched_core_idle_cpu(int cpu); #else static inline void sched_core_free(struct task_struct *tsk) { } static inline void sched_core_fork(struct task_struct *p) { } static inline int sched_core_idle_cpu(int cpu) { return idle_cpu(cpu); } #endif extern void sched_set_stop_task(int cpu, struct task_struct *stop); #endif |
| 8 45 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 | /* SPDX-License-Identifier: GPL-2.0 */ /* interrupt.h */ #ifndef _LINUX_INTERRUPT_H #define _LINUX_INTERRUPT_H #include <linux/kernel.h> #include <linux/bitops.h> #include <linux/cpumask.h> #include <linux/irqreturn.h> #include <linux/irqnr.h> #include <linux/hardirq.h> #include <linux/irqflags.h> #include <linux/hrtimer.h> #include <linux/kref.h> #include <linux/workqueue.h> #include <linux/jump_label.h> #include <linux/atomic.h> #include <asm/ptrace.h> #include <asm/irq.h> #include <asm/sections.h> /* * These correspond to the IORESOURCE_IRQ_* defines in * linux/ioport.h to select the interrupt line behaviour. When * requesting an interrupt without specifying a IRQF_TRIGGER, the * setting should be assumed to be "as already configured", which * may be as per machine or firmware initialisation. */ #define IRQF_TRIGGER_NONE 0x00000000 #define IRQF_TRIGGER_RISING 0x00000001 #define IRQF_TRIGGER_FALLING 0x00000002 #define IRQF_TRIGGER_HIGH 0x00000004 #define IRQF_TRIGGER_LOW 0x00000008 #define IRQF_TRIGGER_MASK (IRQF_TRIGGER_HIGH | IRQF_TRIGGER_LOW | \ IRQF_TRIGGER_RISING | IRQF_TRIGGER_FALLING) #define IRQF_TRIGGER_PROBE 0x00000010 /* * These flags used only by the kernel as part of the * irq handling routines. * * IRQF_SHARED - allow sharing the irq among several devices * IRQF_PROBE_SHARED - set by callers when they expect sharing mismatches to occur * IRQF_TIMER - Flag to mark this interrupt as timer interrupt * IRQF_PERCPU - Interrupt is per cpu * IRQF_NOBALANCING - Flag to exclude this interrupt from irq balancing * IRQF_IRQPOLL - Interrupt is used for polling (only the interrupt that is * registered first in a shared interrupt is considered for * performance reasons) * IRQF_ONESHOT - Interrupt is not reenabled after the hardirq handler finished. * Used by threaded interrupts which need to keep the * irq line disabled until the threaded handler has been run. * IRQF_NO_SUSPEND - Do not disable this IRQ during suspend. Does not guarantee * that this interrupt will wake the system from a suspended * state. See Documentation/power/suspend-and-interrupts.rst * IRQF_FORCE_RESUME - Force enable it on resume even if IRQF_NO_SUSPEND is set * IRQF_NO_THREAD - Interrupt cannot be threaded * IRQF_EARLY_RESUME - Resume IRQ early during syscore instead of at device * resume time. * IRQF_COND_SUSPEND - If the IRQ is shared with a NO_SUSPEND user, execute this * interrupt handler after suspending interrupts. For system * wakeup devices users need to implement wakeup detection in * their interrupt handlers. * IRQF_NO_AUTOEN - Don't enable IRQ or NMI automatically when users request it. * Users will enable it explicitly by enable_irq() or enable_nmi() * later. * IRQF_NO_DEBUG - Exclude from runnaway detection for IPI and similar handlers, * depends on IRQF_PERCPU. */ #define IRQF_SHARED 0x00000080 #define IRQF_PROBE_SHARED 0x00000100 #define __IRQF_TIMER 0x00000200 #define IRQF_PERCPU 0x00000400 #define IRQF_NOBALANCING 0x00000800 #define IRQF_IRQPOLL 0x00001000 #define IRQF_ONESHOT 0x00002000 #define IRQF_NO_SUSPEND 0x00004000 #define IRQF_FORCE_RESUME 0x00008000 #define IRQF_NO_THREAD 0x00010000 #define IRQF_EARLY_RESUME 0x00020000 #define IRQF_COND_SUSPEND 0x00040000 #define IRQF_NO_AUTOEN 0x00080000 #define IRQF_NO_DEBUG 0x00100000 #define IRQF_TIMER (__IRQF_TIMER | IRQF_NO_SUSPEND | IRQF_NO_THREAD) /* * These values can be returned by request_any_context_irq() and * describe the context the interrupt will be run in. * * IRQC_IS_HARDIRQ - interrupt runs in hardirq context * IRQC_IS_NESTED - interrupt runs in a nested threaded context */ enum { IRQC_IS_HARDIRQ = 0, IRQC_IS_NESTED, }; typedef irqreturn_t (*irq_handler_t)(int, void *); /** * struct irqaction - per interrupt action descriptor * @handler: interrupt handler function * @name: name of the device * @dev_id: cookie to identify the device * @percpu_dev_id: cookie to identify the device * @next: pointer to the next irqaction for shared interrupts * @irq: interrupt number * @flags: flags (see IRQF_* above) * @thread_fn: interrupt handler function for threaded interrupts * @thread: thread pointer for threaded interrupts * @secondary: pointer to secondary irqaction (force threading) * @thread_flags: flags related to @thread * @thread_mask: bitmask for keeping track of @thread activity * @dir: pointer to the proc/irq/NN/name entry */ struct irqaction { irq_handler_t handler; void *dev_id; void __percpu *percpu_dev_id; struct irqaction *next; irq_handler_t thread_fn; struct task_struct *thread; struct irqaction *secondary; unsigned int irq; unsigned int flags; unsigned long thread_flags; unsigned long thread_mask; const char *name; struct proc_dir_entry *dir; } ____cacheline_internodealigned_in_smp; extern irqreturn_t no_action(int cpl, void *dev_id); /* * If a (PCI) device interrupt is not connected we set dev->irq to * IRQ_NOTCONNECTED. This causes request_irq() to fail with -ENOTCONN, so we * can distingiush that case from other error returns. * * 0x80000000 is guaranteed to be outside the available range of interrupts * and easy to distinguish from other possible incorrect values. */ #define IRQ_NOTCONNECTED (1U << 31) extern int __must_check request_threaded_irq(unsigned int irq, irq_handler_t handler, irq_handler_t thread_fn, unsigned long flags, const char *name, void *dev); /** * request_irq - Add a handler for an interrupt line * @irq: The interrupt line to allocate * @handler: Function to be called when the IRQ occurs. * Primary handler for threaded interrupts * If NULL, the default primary handler is installed * @flags: Handling flags * @name: Name of the device generating this interrupt * @dev: A cookie passed to the handler function * * This call allocates an interrupt and establishes a handler; see * the documentation for request_threaded_irq() for details. */ static inline int __must_check request_irq(unsigned int irq, irq_handler_t handler, unsigned long flags, const char *name, void *dev) { return request_threaded_irq(irq, handler, NULL, flags, name, dev); } extern int __must_check request_any_context_irq(unsigned int irq, irq_handler_t handler, unsigned long flags, const char *name, void *dev_id); extern int __must_check __request_percpu_irq(unsigned int irq, irq_handler_t handler, unsigned long flags, const char *devname, void __percpu *percpu_dev_id); extern int __must_check request_nmi(unsigned int irq, irq_handler_t handler, unsigned long flags, const char *name, void *dev); static inline int __must_check request_percpu_irq(unsigned int irq, irq_handler_t handler, const char *devname, void __percpu *percpu_dev_id) { return __request_percpu_irq(irq, handler, 0, devname, percpu_dev_id); } extern int __must_check request_percpu_nmi(unsigned int irq, irq_handler_t handler, const char *devname, void __percpu *dev); extern const void *free_irq(unsigned int, void *); extern void free_percpu_irq(unsigned int, void __percpu *); extern const void *free_nmi(unsigned int irq, void *dev_id); extern void free_percpu_nmi(unsigned int irq, void __percpu *percpu_dev_id); struct device; extern int __must_check devm_request_threaded_irq(struct device *dev, unsigned int irq, irq_handler_t handler, irq_handler_t thread_fn, unsigned long irqflags, const char *devname, void *dev_id); static inline int __must_check devm_request_irq(struct device *dev, unsigned int irq, irq_handler_t handler, unsigned long irqflags, const char *devname, void *dev_id) { return devm_request_threaded_irq(dev, irq, handler, NULL, irqflags, devname, dev_id); } extern int __must_check devm_request_any_context_irq(struct device *dev, unsigned int irq, irq_handler_t handler, unsigned long irqflags, const char *devname, void *dev_id); extern void devm_free_irq(struct device *dev, unsigned int irq, void *dev_id); bool irq_has_action(unsigned int irq); extern void disable_irq_nosync(unsigned int irq); extern bool disable_hardirq(unsigned int irq); extern void disable_irq(unsigned int irq); extern void disable_percpu_irq(unsigned int irq); extern void enable_irq(unsigned int irq); extern void enable_percpu_irq(unsigned int irq, unsigned int type); extern bool irq_percpu_is_enabled(unsigned int irq); extern void irq_wake_thread(unsigned int irq, void *dev_id); extern void disable_nmi_nosync(unsigned int irq); extern void disable_percpu_nmi(unsigned int irq); extern void enable_nmi(unsigned int irq); extern void enable_percpu_nmi(unsigned int irq, unsigned int type); extern int prepare_percpu_nmi(unsigned int irq); extern void teardown_percpu_nmi(unsigned int irq); extern int irq_inject_interrupt(unsigned int irq); /* The following three functions are for the core kernel use only. */ extern void suspend_device_irqs(void); extern void resume_device_irqs(void); extern void rearm_wake_irq(unsigned int irq); /** * struct irq_affinity_notify - context for notification of IRQ affinity changes * @irq: Interrupt to which notification applies * @kref: Reference count, for internal use * @work: Work item, for internal use * @notify: Function to be called on change. This will be * called in process context. * @release: Function to be called on release. This will be * called in process context. Once registered, the * structure must only be freed when this function is * called or later. */ struct irq_affinity_notify { unsigned int irq; struct kref kref; struct work_struct work; void (*notify)(struct irq_affinity_notify *, const cpumask_t *mask); void (*release)(struct kref *ref); }; #define IRQ_AFFINITY_MAX_SETS 4 /** * struct irq_affinity - Description for automatic irq affinity assignements * @pre_vectors: Don't apply affinity to @pre_vectors at beginning of * the MSI(-X) vector space * @post_vectors: Don't apply affinity to @post_vectors at end of * the MSI(-X) vector space * @nr_sets: The number of interrupt sets for which affinity * spreading is required * @set_size: Array holding the size of each interrupt set * @calc_sets: Callback for calculating the number and size * of interrupt sets * @priv: Private data for usage by @calc_sets, usually a * pointer to driver/device specific data. */ struct irq_affinity { unsigned int pre_vectors; unsigned int post_vectors; unsigned int nr_sets; unsigned int set_size[IRQ_AFFINITY_MAX_SETS]; void (*calc_sets)(struct irq_affinity *, unsigned int nvecs); void *priv; }; /** * struct irq_affinity_desc - Interrupt affinity descriptor * @mask: cpumask to hold the affinity assignment * @is_managed: 1 if the interrupt is managed internally */ struct irq_affinity_desc { struct cpumask mask; unsigned int is_managed : 1; }; #if defined(CONFIG_SMP) extern cpumask_var_t irq_default_affinity; extern int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask); extern int irq_force_affinity(unsigned int irq, const struct cpumask *cpumask); extern int irq_can_set_affinity(unsigned int irq); extern int irq_select_affinity(unsigned int irq); extern int __irq_apply_affinity_hint(unsigned int irq, const struct cpumask *m, bool setaffinity); /** * irq_update_affinity_hint - Update the affinity hint * @irq: Interrupt to update * @m: cpumask pointer (NULL to clear the hint) * * Updates the affinity hint, but does not change the affinity of the interrupt. */ static inline int irq_update_affinity_hint(unsigned int irq, const struct cpumask *m) { return __irq_apply_affinity_hint(irq, m, false); } /** * irq_set_affinity_and_hint - Update the affinity hint and apply the provided * cpumask to the interrupt * @irq: Interrupt to update * @m: cpumask pointer (NULL to clear the hint) * * Updates the affinity hint and if @m is not NULL it applies it as the * affinity of that interrupt. */ static inline int irq_set_affinity_and_hint(unsigned int irq, const struct cpumask *m) { return __irq_apply_affinity_hint(irq, m, true); } /* * Deprecated. Use irq_update_affinity_hint() or irq_set_affinity_and_hint() * instead. */ static inline int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m) { return irq_set_affinity_and_hint(irq, m); } extern int irq_update_affinity_desc(unsigned int irq, struct irq_affinity_desc *affinity); extern int irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify); struct irq_affinity_desc * irq_create_affinity_masks(unsigned int nvec, struct irq_affinity *affd); unsigned int irq_calc_affinity_vectors(unsigned int minvec, unsigned int maxvec, const struct irq_affinity *affd); #else /* CONFIG_SMP */ static inline int irq_set_affinity(unsigned int irq, const struct cpumask *m) { return -EINVAL; } static inline int irq_force_affinity(unsigned int irq, const struct cpumask *cpumask) { return 0; } static inline int irq_can_set_affinity(unsigned int irq) { return 0; } static inline int irq_select_affinity(unsigned int irq) { return 0; } static inline int irq_update_affinity_hint(unsigned int irq, const struct cpumask *m) { return -EINVAL; } static inline int irq_set_affinity_and_hint(unsigned int irq, const struct cpumask *m) { return -EINVAL; } static inline int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m) { return -EINVAL; } static inline int irq_update_affinity_desc(unsigned int irq, struct irq_affinity_desc *affinity) { return -EINVAL; } static inline int irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify) { return 0; } static inline struct irq_affinity_desc * irq_create_affinity_masks(unsigned int nvec, struct irq_affinity *affd) { return NULL; } static inline unsigned int irq_calc_affinity_vectors(unsigned int minvec, unsigned int maxvec, const struct irq_affinity *affd) { return maxvec; } #endif /* CONFIG_SMP */ /* * Special lockdep variants of irq disabling/enabling. * These should be used for locking constructs that * know that a particular irq context which is disabled, * and which is the only irq-context user of a lock, * that it's safe to take the lock in the irq-disabled * section without disabling hardirqs. * * On !CONFIG_LOCKDEP they are equivalent to the normal * irq disable/enable methods. */ static inline void disable_irq_nosync_lockdep(unsigned int irq) { disable_irq_nosync(irq); #ifdef CONFIG_LOCKDEP local_irq_disable(); #endif } static inline void disable_irq_nosync_lockdep_irqsave(unsigned int irq, unsigned long *flags) { disable_irq_nosync(irq); #ifdef CONFIG_LOCKDEP local_irq_save(*flags); #endif } static inline void disable_irq_lockdep(unsigned int irq) { disable_irq(irq); #ifdef CONFIG_LOCKDEP local_irq_disable(); #endif } static inline void enable_irq_lockdep(unsigned int irq) { #ifdef CONFIG_LOCKDEP local_irq_enable(); #endif enable_irq(irq); } static inline void enable_irq_lockdep_irqrestore(unsigned int irq, unsigned long *flags) { #ifdef CONFIG_LOCKDEP local_irq_restore(*flags); #endif enable_irq(irq); } /* IRQ wakeup (PM) control: */ extern int irq_set_irq_wake(unsigned int irq, unsigned int on); static inline int enable_irq_wake(unsigned int irq) { return irq_set_irq_wake(irq, 1); } static inline int disable_irq_wake(unsigned int irq) { return irq_set_irq_wake(irq, 0); } /* * irq_get_irqchip_state/irq_set_irqchip_state specific flags */ enum irqchip_irq_state { IRQCHIP_STATE_PENDING, /* Is interrupt pending? */ IRQCHIP_STATE_ACTIVE, /* Is interrupt in progress? */ IRQCHIP_STATE_MASKED, /* Is interrupt masked? */ IRQCHIP_STATE_LINE_LEVEL, /* Is IRQ line high? */ }; extern int irq_get_irqchip_state(unsigned int irq, enum irqchip_irq_state which, bool *state); extern int irq_set_irqchip_state(unsigned int irq, enum irqchip_irq_state which, bool state); #ifdef CONFIG_IRQ_FORCED_THREADING # ifdef CONFIG_PREEMPT_RT # define force_irqthreads() (true) # else DECLARE_STATIC_KEY_FALSE(force_irqthreads_key); # define force_irqthreads() (static_branch_unlikely(&force_irqthreads_key)) # endif #else #define force_irqthreads() (false) #endif #ifndef local_softirq_pending #ifndef local_softirq_pending_ref #define local_softirq_pending_ref irq_stat.__softirq_pending #endif #define local_softirq_pending() (__this_cpu_read(local_softirq_pending_ref)) #define set_softirq_pending(x) (__this_cpu_write(local_softirq_pending_ref, (x))) #define or_softirq_pending(x) (__this_cpu_or(local_softirq_pending_ref, (x))) #endif /* local_softirq_pending */ /* Some architectures might implement lazy enabling/disabling of * interrupts. In some cases, such as stop_machine, we might want * to ensure that after a local_irq_disable(), interrupts have * really been disabled in hardware. Such architectures need to * implement the following hook. */ #ifndef hard_irq_disable #define hard_irq_disable() do { } while(0) #endif /* PLEASE, avoid to allocate new softirqs, if you need not _really_ high frequency threaded job scheduling. For almost all the purposes tasklets are more than enough. F.e. all serial device BHs et al. should be converted to tasklets, not to softirqs. */ enum { HI_SOFTIRQ=0, TIMER_SOFTIRQ, NET_TX_SOFTIRQ, NET_RX_SOFTIRQ, BLOCK_SOFTIRQ, IRQ_POLL_SOFTIRQ, TASKLET_SOFTIRQ, SCHED_SOFTIRQ, HRTIMER_SOFTIRQ, RCU_SOFTIRQ, /* Preferable RCU should always be the last softirq */ NR_SOFTIRQS }; /* * The following vectors can be safely ignored after ksoftirqd is parked: * * _ RCU: * 1) rcutree_migrate_callbacks() migrates the queue. * 2) rcutree_report_cpu_dead() reports the final quiescent states. * * _ IRQ_POLL: irq_poll_cpu_dead() migrates the queue * * _ (HR)TIMER_SOFTIRQ: (hr)timers_dead_cpu() migrates the queue */ #define SOFTIRQ_HOTPLUG_SAFE_MASK (BIT(TIMER_SOFTIRQ) | BIT(IRQ_POLL_SOFTIRQ) |\ BIT(HRTIMER_SOFTIRQ) | BIT(RCU_SOFTIRQ)) /* map softirq index to softirq name. update 'softirq_to_name' in * kernel/softirq.c when adding a new softirq. */ extern const char * const softirq_to_name[NR_SOFTIRQS]; /* softirq mask and active fields moved to irq_cpustat_t in * asm/hardirq.h to get better cache usage. KAO */ struct softirq_action { void (*action)(struct softirq_action *); }; asmlinkage void do_softirq(void); asmlinkage void __do_softirq(void); #ifdef CONFIG_PREEMPT_RT extern void do_softirq_post_smp_call_flush(unsigned int was_pending); #else static inline void do_softirq_post_smp_call_flush(unsigned int unused) { do_softirq(); } #endif extern void open_softirq(int nr, void (*action)(struct softirq_action *)); extern void softirq_init(void); extern void __raise_softirq_irqoff(unsigned int nr); extern void raise_softirq_irqoff(unsigned int nr); extern void raise_softirq(unsigned int nr); DECLARE_PER_CPU(struct task_struct *, ksoftirqd); static inline struct task_struct *this_cpu_ksoftirqd(void) { return this_cpu_read(ksoftirqd); } /* Tasklets --- multithreaded analogue of BHs. This API is deprecated. Please consider using threaded IRQs instead: https://lore.kernel.org/lkml/20200716081538.2sivhkj4hcyrusem@linutronix.de Main feature differing them of generic softirqs: tasklet is running only on one CPU simultaneously. Main feature differing them of BHs: different tasklets may be run simultaneously on different CPUs. Properties: * If tasklet_schedule() is called, then tasklet is guaranteed to be executed on some cpu at least once after this. * If the tasklet is already scheduled, but its execution is still not started, it will be executed only once. * If this tasklet is already running on another CPU (or schedule is called from tasklet itself), it is rescheduled for later. * Tasklet is strictly serialized wrt itself, but not wrt another tasklets. If client needs some intertask synchronization, he makes it with spinlocks. */ struct tasklet_struct { struct tasklet_struct *next; unsigned long state; atomic_t count; bool use_callback; union { void (*func)(unsigned long data); void (*callback)(struct tasklet_struct *t); }; unsigned long data; }; #define DECLARE_TASKLET(name, _callback) \ struct tasklet_struct name = { \ .count = ATOMIC_INIT(0), \ .callback = _callback, \ .use_callback = true, \ } #define DECLARE_TASKLET_DISABLED(name, _callback) \ struct tasklet_struct name = { \ .count = ATOMIC_INIT(1), \ .callback = _callback, \ .use_callback = true, \ } #define from_tasklet(var, callback_tasklet, tasklet_fieldname) \ container_of(callback_tasklet, typeof(*var), tasklet_fieldname) #define DECLARE_TASKLET_OLD(name, _func) \ struct tasklet_struct name = { \ .count = ATOMIC_INIT(0), \ .func = _func, \ } #define DECLARE_TASKLET_DISABLED_OLD(name, _func) \ struct tasklet_struct name = { \ .count = ATOMIC_INIT(1), \ .func = _func, \ } enum { TASKLET_STATE_SCHED, /* Tasklet is scheduled for execution */ TASKLET_STATE_RUN /* Tasklet is running (SMP only) */ }; #if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT) static inline int tasklet_trylock(struct tasklet_struct *t) { return !test_and_set_bit(TASKLET_STATE_RUN, &(t)->state); } void tasklet_unlock(struct tasklet_struct *t); void tasklet_unlock_wait(struct tasklet_struct *t); void tasklet_unlock_spin_wait(struct tasklet_struct *t); #else static inline int tasklet_trylock(struct tasklet_struct *t) { return 1; } static inline void tasklet_unlock(struct tasklet_struct *t) { } static inline void tasklet_unlock_wait(struct tasklet_struct *t) { } static inline void tasklet_unlock_spin_wait(struct tasklet_struct *t) { } #endif extern void __tasklet_schedule(struct tasklet_struct *t); static inline void tasklet_schedule(struct tasklet_struct *t) { if (!test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) __tasklet_schedule(t); } extern void __tasklet_hi_schedule(struct tasklet_struct *t); static inline void tasklet_hi_schedule(struct tasklet_struct *t) { if (!test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) __tasklet_hi_schedule(t); } static inline void tasklet_disable_nosync(struct tasklet_struct *t) { atomic_inc(&t->count); smp_mb__after_atomic(); } /* * Do not use in new code. Disabling tasklets from atomic contexts is * error prone and should be avoided. */ static inline void tasklet_disable_in_atomic(struct tasklet_struct *t) { tasklet_disable_nosync(t); tasklet_unlock_spin_wait(t); smp_mb(); } static inline void tasklet_disable(struct tasklet_struct *t) { tasklet_disable_nosync(t); tasklet_unlock_wait(t); smp_mb(); } static inline void tasklet_enable(struct tasklet_struct *t) { smp_mb__before_atomic(); atomic_dec(&t->count); } extern void tasklet_kill(struct tasklet_struct *t); extern void tasklet_init(struct tasklet_struct *t, void (*func)(unsigned long), unsigned long data); extern void tasklet_setup(struct tasklet_struct *t, void (*callback)(struct tasklet_struct *)); /* * Autoprobing for irqs: * * probe_irq_on() and probe_irq_off() provide robust primitives * for accurate IRQ probing during kernel initialization. They are * reasonably simple to use, are not "fooled" by spurious interrupts, * and, unlike other attempts at IRQ probing, they do not get hung on * stuck interrupts (such as unused PS2 mouse interfaces on ASUS boards). * * For reasonably foolproof probing, use them as follows: * * 1. clear and/or mask the device's internal interrupt. * 2. sti(); * 3. irqs = probe_irq_on(); // "take over" all unassigned idle IRQs * 4. enable the device and cause it to trigger an interrupt. * 5. wait for the device to interrupt, using non-intrusive polling or a delay. * 6. irq = probe_irq_off(irqs); // get IRQ number, 0=none, negative=multiple * 7. service the device to clear its pending interrupt. * 8. loop again if paranoia is required. * * probe_irq_on() returns a mask of allocated irq's. * * probe_irq_off() takes the mask as a parameter, * and returns the irq number which occurred, * or zero if none occurred, or a negative irq number * if more than one irq occurred. */ #if !defined(CONFIG_GENERIC_IRQ_PROBE) static inline unsigned long probe_irq_on(void) { return 0; } static inline int probe_irq_off(unsigned long val) { return 0; } static inline unsigned int probe_irq_mask(unsigned long val) { return 0; } #else extern unsigned long probe_irq_on(void); /* returns 0 on failure */ extern int probe_irq_off(unsigned long); /* returns 0 or negative on failure */ extern unsigned int probe_irq_mask(unsigned long); /* returns mask of ISA interrupts */ #endif #ifdef CONFIG_PROC_FS /* Initialize /proc/irq/ */ extern void init_irq_proc(void); #else static inline void init_irq_proc(void) { } #endif #ifdef CONFIG_IRQ_TIMINGS void irq_timings_enable(void); void irq_timings_disable(void); u64 irq_timings_next_event(u64 now); #endif struct seq_file; int show_interrupts(struct seq_file *p, void *v); int arch_show_interrupts(struct seq_file *p, int prec); extern int early_irq_init(void); extern int arch_probe_nr_irqs(void); extern int arch_early_irq_init(void); /* * We want to know which function is an entrypoint of a hardirq or a softirq. */ #ifndef __irq_entry # define __irq_entry __section(".irqentry.text") #endif #define __softirq_entry __section(".softirqentry.text") #endif |
| 2048 314 75 3289 4342 8656 789 185 185 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 | /* SPDX-License-Identifier: GPL-2.0 */ /* * Operations on the network namespace */ #ifndef __NET_NET_NAMESPACE_H #define __NET_NET_NAMESPACE_H #include <linux/atomic.h> #include <linux/refcount.h> #include <linux/workqueue.h> #include <linux/list.h> #include <linux/sysctl.h> #include <linux/uidgid.h> #include <net/flow.h> #include <net/netns/core.h> #include <net/netns/mib.h> #include <net/netns/unix.h> #include <net/netns/packet.h> #include <net/netns/ipv4.h> #include <net/netns/ipv6.h> #include <net/netns/nexthop.h> #include <net/netns/ieee802154_6lowpan.h> #include <net/netns/sctp.h> #include <net/netns/netfilter.h> #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) #include <net/netns/conntrack.h> #endif #if IS_ENABLED(CONFIG_NF_FLOW_TABLE) #include <net/netns/flow_table.h> #endif #include <net/netns/nftables.h> #include <net/netns/xfrm.h> #include <net/netns/mpls.h> #include <net/netns/can.h> #include <net/netns/xdp.h> #include <net/netns/smc.h> #include <net/netns/bpf.h> #include <net/netns/mctp.h> #include <net/net_trackers.h> #include <linux/ns_common.h> #include <linux/idr.h> #include <linux/skbuff.h> #include <linux/notifier.h> #include <linux/xarray.h> struct user_namespace; struct proc_dir_entry; struct net_device; struct sock; struct ctl_table_header; struct net_generic; struct uevent_sock; struct netns_ipvs; struct bpf_prog; #define NETDEV_HASHBITS 8 #define NETDEV_HASHENTRIES (1 << NETDEV_HASHBITS) struct net { /* First cache line can be often dirtied. * Do not place here read-mostly fields. */ refcount_t passive; /* To decide when the network * namespace should be freed. */ spinlock_t rules_mod_lock; atomic_t dev_unreg_count; unsigned int dev_base_seq; /* protected by rtnl_mutex */ u32 ifindex; spinlock_t nsid_lock; atomic_t fnhe_genid; struct list_head list; /* list of network namespaces */ struct list_head exit_list; /* To linked to call pernet exit * methods on dead net ( * pernet_ops_rwsem read locked), * or to unregister pernet ops * (pernet_ops_rwsem write locked). */ struct llist_node cleanup_list; /* namespaces on death row */ #ifdef CONFIG_KEYS struct key_tag *key_domain; /* Key domain of operation tag */ #endif struct user_namespace *user_ns; /* Owning user namespace */ struct ucounts *ucounts; struct idr netns_ids; struct ns_common ns; struct ref_tracker_dir refcnt_tracker; struct ref_tracker_dir notrefcnt_tracker; /* tracker for objects not * refcounted against netns */ struct list_head dev_base_head; struct proc_dir_entry *proc_net; struct proc_dir_entry *proc_net_stat; #ifdef CONFIG_SYSCTL struct ctl_table_set sysctls; #endif struct sock *rtnl; /* rtnetlink socket */ struct sock *genl_sock; struct uevent_sock *uevent_sock; /* uevent socket */ struct hlist_head *dev_name_head; struct hlist_head *dev_index_head; struct xarray dev_by_index; struct raw_notifier_head netdev_chain; /* Note that @hash_mix can be read millions times per second, * it is critical that it is on a read_mostly cache line. */ u32 hash_mix; struct net_device *loopback_dev; /* The loopback */ /* core fib_rules */ struct list_head rules_ops; struct netns_core core; struct netns_mib mib; struct netns_packet packet; #if IS_ENABLED(CONFIG_UNIX) struct netns_unix unx; #endif struct netns_nexthop nexthop; struct netns_ipv4 ipv4; #if IS_ENABLED(CONFIG_IPV6) struct netns_ipv6 ipv6; #endif #if IS_ENABLED(CONFIG_IEEE802154_6LOWPAN) struct netns_ieee802154_lowpan ieee802154_lowpan; #endif #if defined(CONFIG_IP_SCTP) || defined(CONFIG_IP_SCTP_MODULE) struct netns_sctp sctp; #endif #ifdef CONFIG_NETFILTER struct netns_nf nf; #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) struct netns_ct ct; #endif #if defined(CONFIG_NF_TABLES) || defined(CONFIG_NF_TABLES_MODULE) struct netns_nftables nft; #endif #if IS_ENABLED(CONFIG_NF_FLOW_TABLE) struct netns_ft ft; #endif #endif #ifdef CONFIG_WEXT_CORE struct sk_buff_head wext_nlevents; #endif struct net_generic __rcu *gen; /* Used to store attached BPF programs */ struct netns_bpf bpf; /* Note : following structs are cache line aligned */ #ifdef CONFIG_XFRM struct netns_xfrm xfrm; #endif u64 net_cookie; /* written once */ #if IS_ENABLED(CONFIG_IP_VS) struct netns_ipvs *ipvs; #endif #if IS_ENABLED(CONFIG_MPLS) struct netns_mpls mpls; #endif #if IS_ENABLED(CONFIG_CAN) struct netns_can can; #endif #ifdef CONFIG_XDP_SOCKETS struct netns_xdp xdp; #endif #if IS_ENABLED(CONFIG_MCTP) struct netns_mctp mctp; #endif #if IS_ENABLED(CONFIG_CRYPTO_USER) struct sock *crypto_nlsk; #endif struct sock *diag_nlsk; #if IS_ENABLED(CONFIG_SMC) struct netns_smc smc; #endif } __randomize_layout; #include <linux/seq_file_net.h> /* Init's network namespace */ extern struct net init_net; #ifdef CONFIG_NET_NS struct net *copy_net_ns(unsigned long flags, struct user_namespace *user_ns, struct net *old_net); void net_ns_get_ownership(const struct net *net, kuid_t *uid, kgid_t *gid); void net_ns_barrier(void); struct ns_common *get_net_ns(struct ns_common *ns); struct net *get_net_ns_by_fd(int fd); #else /* CONFIG_NET_NS */ #include <linux/sched.h> #include <linux/nsproxy.h> static inline struct net *copy_net_ns(unsigned long flags, struct user_namespace *user_ns, struct net *old_net) { if (flags & CLONE_NEWNET) return ERR_PTR(-EINVAL); return old_net; } static inline void net_ns_get_ownership(const struct net *net, kuid_t *uid, kgid_t *gid) { *uid = GLOBAL_ROOT_UID; *gid = GLOBAL_ROOT_GID; } static inline void net_ns_barrier(void) {} static inline struct ns_common *get_net_ns(struct ns_common *ns) { return ERR_PTR(-EINVAL); } static inline struct net *get_net_ns_by_fd(int fd) { return ERR_PTR(-EINVAL); } #endif /* CONFIG_NET_NS */ extern struct list_head net_namespace_list; struct net *get_net_ns_by_pid(pid_t pid); #ifdef CONFIG_SYSCTL void ipx_register_sysctl(void); void ipx_unregister_sysctl(void); #else #define ipx_register_sysctl() #define ipx_unregister_sysctl() #endif #ifdef CONFIG_NET_NS void __put_net(struct net *net); /* Try using get_net_track() instead */ static inline struct net *get_net(struct net *net) { refcount_inc(&net->ns.count); return net; } static inline struct net *maybe_get_net(struct net *net) { /* Used when we know struct net exists but we * aren't guaranteed a previous reference count * exists. If the reference count is zero this * function fails and returns NULL. */ if (!refcount_inc_not_zero(&net->ns.count)) net = NULL; return net; } /* Try using put_net_track() instead */ static inline void put_net(struct net *net) { if (refcount_dec_and_test(&net->ns.count)) __put_net(net); } static inline int net_eq(const struct net *net1, const struct net *net2) { return net1 == net2; } static inline int check_net(const struct net *net) { return refcount_read(&net->ns.count) != 0; } void net_drop_ns(void *); #else static inline struct net *get_net(struct net *net) { return net; } static inline void put_net(struct net *net) { } static inline struct net *maybe_get_net(struct net *net) { return net; } static inline int net_eq(const struct net *net1, const struct net *net2) { return 1; } static inline int check_net(const struct net *net) { return 1; } #define net_drop_ns NULL #endif static inline void __netns_tracker_alloc(struct net *net, netns_tracker *tracker, bool refcounted, gfp_t gfp) { #ifdef CONFIG_NET_NS_REFCNT_TRACKER ref_tracker_alloc(refcounted ? &net->refcnt_tracker : &net->notrefcnt_tracker, tracker, gfp); #endif } static inline void netns_tracker_alloc(struct net *net, netns_tracker *tracker, gfp_t gfp) { __netns_tracker_alloc(net, tracker, true, gfp); } static inline void __netns_tracker_free(struct net *net, netns_tracker *tracker, bool refcounted) { #ifdef CONFIG_NET_NS_REFCNT_TRACKER ref_tracker_free(refcounted ? &net->refcnt_tracker : &net->notrefcnt_tracker, tracker); #endif } static inline struct net *get_net_track(struct net *net, netns_tracker *tracker, gfp_t gfp) { get_net(net); netns_tracker_alloc(net, tracker, gfp); return net; } static inline void put_net_track(struct net *net, netns_tracker *tracker) { __netns_tracker_free(net, tracker, true); put_net(net); } typedef struct { #ifdef CONFIG_NET_NS struct net __rcu *net; #endif } possible_net_t; static inline void write_pnet(possible_net_t *pnet, struct net *net) { #ifdef CONFIG_NET_NS rcu_assign_pointer(pnet->net, net); #endif } static inline struct net *read_pnet(const possible_net_t *pnet) { #ifdef CONFIG_NET_NS return rcu_dereference_protected(pnet->net, true); #else return &init_net; #endif } static inline struct net *read_pnet_rcu(possible_net_t *pnet) { #ifdef CONFIG_NET_NS return rcu_dereference(pnet->net); #else return &init_net; #endif } /* Protected by net_rwsem */ #define for_each_net(VAR) \ list_for_each_entry(VAR, &net_namespace_list, list) #define for_each_net_continue_reverse(VAR) \ list_for_each_entry_continue_reverse(VAR, &net_namespace_list, list) #define for_each_net_rcu(VAR) \ list_for_each_entry_rcu(VAR, &net_namespace_list, list) #ifdef CONFIG_NET_NS #define __net_init #define __net_exit #define __net_initdata #define __net_initconst #else #define __net_init __init #define __net_exit __ref #define __net_initdata __initdata #define __net_initconst __initconst #endif int peernet2id_alloc(struct net *net, struct net *peer, gfp_t gfp); int peernet2id(const struct net *net, struct net *peer); bool peernet_has_id(const struct net *net, struct net *peer); struct net *get_net_ns_by_id(const struct net *net, int id); struct pernet_operations { struct list_head list; /* * Below methods are called without any exclusive locks. * More than one net may be constructed and destructed * in parallel on several cpus. Every pernet_operations * have to keep in mind all other pernet_operations and * to introduce a locking, if they share common resources. * * The only time they are called with exclusive lock is * from register_pernet_subsys(), unregister_pernet_subsys() * register_pernet_device() and unregister_pernet_device(). * * Exit methods using blocking RCU primitives, such as * synchronize_rcu(), should be implemented via exit_batch. * Then, destruction of a group of net requires single * synchronize_rcu() related to these pernet_operations, * instead of separate synchronize_rcu() for every net. * Please, avoid synchronize_rcu() at all, where it's possible. * * Note that a combination of pre_exit() and exit() can * be used, since a synchronize_rcu() is guaranteed between * the calls. */ int (*init)(struct net *net); void (*pre_exit)(struct net *net); void (*exit)(struct net *net); void (*exit_batch)(struct list_head *net_exit_list); unsigned int *id; size_t size; }; /* * Use these carefully. If you implement a network device and it * needs per network namespace operations use device pernet operations, * otherwise use pernet subsys operations. * * Network interfaces need to be removed from a dying netns _before_ * subsys notifiers can be called, as most of the network code cleanup * (which is done from subsys notifiers) runs with the assumption that * dev_remove_pack has been called so no new packets will arrive during * and after the cleanup functions have been called. dev_remove_pack * is not per namespace so instead the guarantee of no more packets * arriving in a network namespace is provided by ensuring that all * network devices and all sockets have left the network namespace * before the cleanup methods are called. * * For the longest time the ipv4 icmp code was registered as a pernet * device which caused kernel oops, and panics during network * namespace cleanup. So please don't get this wrong. */ int register_pernet_subsys(struct pernet_operations *); void unregister_pernet_subsys(struct pernet_operations *); int register_pernet_device(struct pernet_operations *); void unregister_pernet_device(struct pernet_operations *); struct ctl_table; #define register_net_sysctl(net, path, table) \ register_net_sysctl_sz(net, path, table, ARRAY_SIZE(table)) #ifdef CONFIG_SYSCTL int net_sysctl_init(void); struct ctl_table_header *register_net_sysctl_sz(struct net *net, const char *path, struct ctl_table *table, size_t table_size); void unregister_net_sysctl_table(struct ctl_table_header *header); #else static inline int net_sysctl_init(void) { return 0; } static inline struct ctl_table_header *register_net_sysctl_sz(struct net *net, const char *path, struct ctl_table *table, size_t table_size) { return NULL; } static inline void unregister_net_sysctl_table(struct ctl_table_header *header) { } #endif static inline int rt_genid_ipv4(const struct net *net) { return atomic_read(&net->ipv4.rt_genid); } #if IS_ENABLED(CONFIG_IPV6) static inline int rt_genid_ipv6(const struct net *net) { return atomic_read(&net->ipv6.fib6_sernum); } #endif static inline void rt_genid_bump_ipv4(struct net *net) { atomic_inc(&net->ipv4.rt_genid); } extern void (*__fib6_flush_trees)(struct net *net); static inline void rt_genid_bump_ipv6(struct net *net) { if (__fib6_flush_trees) __fib6_flush_trees(net); } #if IS_ENABLED(CONFIG_IEEE802154_6LOWPAN) static inline struct netns_ieee802154_lowpan * net_ieee802154_lowpan(struct net *net) { return &net->ieee802154_lowpan; } #endif /* For callers who don't really care about whether it's IPv4 or IPv6 */ static inline void rt_genid_bump_all(struct net *net) { rt_genid_bump_ipv4(net); rt_genid_bump_ipv6(net); } static inline int fnhe_genid(const struct net *net) { return atomic_read(&net->fnhe_genid); } static inline void fnhe_genid_bump(struct net *net) { atomic_inc(&net->fnhe_genid); } #ifdef CONFIG_NET void net_ns_init(void); #else static inline void net_ns_init(void) {} #endif #endif /* __NET_NET_NAMESPACE_H */ |
| 317 5 3 3 3 3 9 3 2 3 3 3 3 3 3 3 3 9 78 82 78 77 9 74 66 13 12 7 1 6 11 11 6 82 5 7 3 2 1 11 10 2 10 9 8 7 7 9 8 7 7 7 20 19 18 18 16 2 4 3 2 1 4 3 5 4 3 4 3 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 | // SPDX-License-Identifier: GPL-2.0-or-later /* * common UDP/RAW code * Linux INET6 implementation * * Authors: * Pedro Roque <roque@di.fc.ul.pt> */ #include <linux/capability.h> #include <linux/errno.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/interrupt.h> #include <linux/socket.h> #include <linux/sockios.h> #include <linux/in6.h> #include <linux/ipv6.h> #include <linux/route.h> #include <linux/slab.h> #include <linux/export.h> #include <linux/icmp.h> #include <net/ipv6.h> #include <net/ndisc.h> #include <net/addrconf.h> #include <net/transp_v6.h> #include <net/ip6_route.h> #include <net/tcp_states.h> #include <net/dsfield.h> #include <net/sock_reuseport.h> #include <linux/errqueue.h> #include <linux/uaccess.h> static bool ipv6_mapped_addr_any(const struct in6_addr *a) { return ipv6_addr_v4mapped(a) && (a->s6_addr32[3] == 0); } static void ip6_datagram_flow_key_init(struct flowi6 *fl6, const struct sock *sk) { const struct inet_sock *inet = inet_sk(sk); const struct ipv6_pinfo *np = inet6_sk(sk); int oif = sk->sk_bound_dev_if; memset(fl6, 0, sizeof(*fl6)); fl6->flowi6_proto = sk->sk_protocol; fl6->daddr = sk->sk_v6_daddr; fl6->saddr = np->saddr; fl6->flowi6_mark = sk->sk_mark; fl6->fl6_dport = inet->inet_dport; fl6->fl6_sport = inet->inet_sport; fl6->flowlabel = ip6_make_flowinfo(np->tclass, np->flow_label); fl6->flowi6_uid = sk->sk_uid; if (!oif) oif = np->sticky_pktinfo.ipi6_ifindex; if (!oif) { if (ipv6_addr_is_multicast(&fl6->daddr)) oif = READ_ONCE(np->mcast_oif); else oif = READ_ONCE(np->ucast_oif); } fl6->flowi6_oif = oif; security_sk_classify_flow(sk, flowi6_to_flowi_common(fl6)); } int ip6_datagram_dst_update(struct sock *sk, bool fix_sk_saddr) { struct ip6_flowlabel *flowlabel = NULL; struct in6_addr *final_p, final; struct ipv6_txoptions *opt; struct dst_entry *dst; struct inet_sock *inet = inet_sk(sk); struct ipv6_pinfo *np = inet6_sk(sk); struct flowi6 fl6; int err = 0; if (inet6_test_bit(SNDFLOW, sk) && (np->flow_label & IPV6_FLOWLABEL_MASK)) { flowlabel = fl6_sock_lookup(sk, np->flow_label); if (IS_ERR(flowlabel)) return -EINVAL; } ip6_datagram_flow_key_init(&fl6, sk); rcu_read_lock(); opt = flowlabel ? flowlabel->opt : rcu_dereference(np->opt); final_p = fl6_update_dst(&fl6, opt, &final); rcu_read_unlock(); dst = ip6_dst_lookup_flow(sock_net(sk), sk, &fl6, final_p); if (IS_ERR(dst)) { err = PTR_ERR(dst); goto out; } if (fix_sk_saddr) { if (ipv6_addr_any(&np->saddr)) np->saddr = fl6.saddr; if (ipv6_addr_any(&sk->sk_v6_rcv_saddr)) { sk->sk_v6_rcv_saddr = fl6.saddr; inet->inet_rcv_saddr = LOOPBACK4_IPV6; if (sk->sk_prot->rehash) sk->sk_prot->rehash(sk); } } ip6_sk_dst_store_flow(sk, dst, &fl6); out: fl6_sock_release(flowlabel); return err; } void ip6_datagram_release_cb(struct sock *sk) { struct dst_entry *dst; if (ipv6_addr_v4mapped(&sk->sk_v6_daddr)) return; rcu_read_lock(); dst = __sk_dst_get(sk); if (!dst || !dst->obsolete || dst->ops->check(dst, inet6_sk(sk)->dst_cookie)) { rcu_read_unlock(); return; } rcu_read_unlock(); ip6_datagram_dst_update(sk, false); } EXPORT_SYMBOL_GPL(ip6_datagram_release_cb); int __ip6_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { struct sockaddr_in6 *usin = (struct sockaddr_in6 *) uaddr; struct inet_sock *inet = inet_sk(sk); struct ipv6_pinfo *np = inet6_sk(sk); struct in6_addr *daddr, old_daddr; __be32 fl6_flowlabel = 0; __be32 old_fl6_flowlabel; __be16 old_dport; int addr_type; int err; if (usin->sin6_family == AF_INET) { if (ipv6_only_sock(sk)) return -EAFNOSUPPORT; err = __ip4_datagram_connect(sk, uaddr, addr_len); goto ipv4_connected; } if (addr_len < SIN6_LEN_RFC2133) return -EINVAL; if (usin->sin6_family != AF_INET6) return -EAFNOSUPPORT; if (inet6_test_bit(SNDFLOW, sk)) fl6_flowlabel = usin->sin6_flowinfo & IPV6_FLOWINFO_MASK; if (ipv6_addr_any(&usin->sin6_addr)) { /* * connect to self */ if (ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr)) ipv6_addr_set_v4mapped(htonl(INADDR_LOOPBACK), &usin->sin6_addr); else usin->sin6_addr = in6addr_loopback; } addr_type = ipv6_addr_type(&usin->sin6_addr); daddr = &usin->sin6_addr; if (addr_type & IPV6_ADDR_MAPPED) { struct sockaddr_in sin; if (ipv6_only_sock(sk)) { err = -ENETUNREACH; goto out; } sin.sin_family = AF_INET; sin.sin_addr.s_addr = daddr->s6_addr32[3]; sin.sin_port = usin->sin6_port; err = __ip4_datagram_connect(sk, (struct sockaddr *) &sin, sizeof(sin)); ipv4_connected: if (err) goto out; ipv6_addr_set_v4mapped(inet->inet_daddr, &sk->sk_v6_daddr); if (ipv6_addr_any(&np->saddr) || ipv6_mapped_addr_any(&np->saddr)) ipv6_addr_set_v4mapped(inet->inet_saddr, &np->saddr); if (ipv6_addr_any(&sk->sk_v6_rcv_saddr) || ipv6_mapped_addr_any(&sk->sk_v6_rcv_saddr)) { ipv6_addr_set_v4mapped(inet->inet_rcv_saddr, &sk->sk_v6_rcv_saddr); if (sk->sk_prot->rehash) sk->sk_prot->rehash(sk); } goto out; } if (__ipv6_addr_needs_scope_id(addr_type)) { if (addr_len >= sizeof(struct sockaddr_in6) && usin->sin6_scope_id) { if (!sk_dev_equal_l3scope(sk, usin->sin6_scope_id)) { err = -EINVAL; goto out; } WRITE_ONCE(sk->sk_bound_dev_if, usin->sin6_scope_id); } if (!sk->sk_bound_dev_if && (addr_type & IPV6_ADDR_MULTICAST)) WRITE_ONCE(sk->sk_bound_dev_if, READ_ONCE(np->mcast_oif)); /* Connect to link-local address requires an interface */ if (!sk->sk_bound_dev_if) { err = -EINVAL; goto out; } } /* save the current peer information before updating it */ old_daddr = sk->sk_v6_daddr; old_fl6_flowlabel = np->flow_label; old_dport = inet->inet_dport; sk->sk_v6_daddr = *daddr; np->flow_label = fl6_flowlabel; inet->inet_dport = usin->sin6_port; /* * Check for a route to destination an obtain the * destination cache for it. */ err = ip6_datagram_dst_update(sk, true); if (err) { /* Restore the socket peer info, to keep it consistent with * the old socket state */ sk->sk_v6_daddr = old_daddr; np->flow_label = old_fl6_flowlabel; inet->inet_dport = old_dport; goto out; } reuseport_has_conns_set(sk); sk->sk_state = TCP_ESTABLISHED; sk_set_txhash(sk); out: return err; } EXPORT_SYMBOL_GPL(__ip6_datagram_connect); int ip6_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { int res; lock_sock(sk); res = __ip6_datagram_connect(sk, uaddr, addr_len); release_sock(sk); return res; } EXPORT_SYMBOL_GPL(ip6_datagram_connect); int ip6_datagram_connect_v6_only(struct sock *sk, struct sockaddr *uaddr, int addr_len) { DECLARE_SOCKADDR(struct sockaddr_in6 *, sin6, uaddr); if (sin6->sin6_family != AF_INET6) return -EAFNOSUPPORT; return ip6_datagram_connect(sk, uaddr, addr_len); } EXPORT_SYMBOL_GPL(ip6_datagram_connect_v6_only); static void ipv6_icmp_error_rfc4884(const struct sk_buff *skb, struct sock_ee_data_rfc4884 *out) { switch (icmp6_hdr(skb)->icmp6_type) { case ICMPV6_TIME_EXCEED: case ICMPV6_DEST_UNREACH: ip_icmp_error_rfc4884(skb, out, sizeof(struct icmp6hdr), icmp6_hdr(skb)->icmp6_datagram_len * 8); } } void ipv6_icmp_error(struct sock *sk, struct sk_buff *skb, int err, __be16 port, u32 info, u8 *payload) { struct icmp6hdr *icmph = icmp6_hdr(skb); struct sock_exterr_skb *serr; if (!inet6_test_bit(RECVERR6, sk)) return; skb = skb_clone(skb, GFP_ATOMIC); if (!skb) return; skb->protocol = htons(ETH_P_IPV6); serr = SKB_EXT_ERR(skb); serr->ee.ee_errno = err; serr->ee.ee_origin = SO_EE_ORIGIN_ICMP6; serr->ee.ee_type = icmph->icmp6_type; serr->ee.ee_code = icmph->icmp6_code; serr->ee.ee_pad = 0; serr->ee.ee_info = info; serr->ee.ee_data = 0; serr->addr_offset = (u8 *)&(((struct ipv6hdr *)(icmph + 1))->daddr) - skb_network_header(skb); serr->port = port; __skb_pull(skb, payload - skb->data); if (inet6_test_bit(RECVERR6_RFC4884, sk)) ipv6_icmp_error_rfc4884(skb, &serr->ee.ee_rfc4884); skb_reset_transport_header(skb); if (sock_queue_err_skb(sk, skb)) kfree_skb(skb); } EXPORT_SYMBOL_GPL(ipv6_icmp_error); void ipv6_local_error(struct sock *sk, int err, struct flowi6 *fl6, u32 info) { struct sock_exterr_skb *serr; struct ipv6hdr *iph; struct sk_buff *skb; if (!inet6_test_bit(RECVERR6, sk)) return; skb = alloc_skb(sizeof(struct ipv6hdr), GFP_ATOMIC); if (!skb) return; skb->protocol = htons(ETH_P_IPV6); skb_put(skb, sizeof(struct ipv6hdr)); skb_reset_network_header(skb); iph = ipv6_hdr(skb); iph->daddr = fl6->daddr; ip6_flow_hdr(iph, 0, 0); serr = SKB_EXT_ERR(skb); serr->ee.ee_errno = err; serr->ee.ee_origin = SO_EE_ORIGIN_LOCAL; serr->ee.ee_type = 0; serr->ee.ee_code = 0; serr->ee.ee_pad = 0; serr->ee.ee_info = info; serr->ee.ee_data = 0; serr->addr_offset = (u8 *)&iph->daddr - skb_network_header(skb); serr->port = fl6->fl6_dport; __skb_pull(skb, skb_tail_pointer(skb) - skb->data); skb_reset_transport_header(skb); if (sock_queue_err_skb(sk, skb)) kfree_skb(skb); } void ipv6_local_rxpmtu(struct sock *sk, struct flowi6 *fl6, u32 mtu) { struct ipv6_pinfo *np = inet6_sk(sk); struct ipv6hdr *iph; struct sk_buff *skb; struct ip6_mtuinfo *mtu_info; if (!np->rxopt.bits.rxpmtu) return; skb = alloc_skb(sizeof(struct ipv6hdr), GFP_ATOMIC); if (!skb) return; skb_put(skb, sizeof(struct ipv6hdr)); skb_reset_network_header(skb); iph = ipv6_hdr(skb); iph->daddr = fl6->daddr; mtu_info = IP6CBMTU(skb); mtu_info->ip6m_mtu = mtu; mtu_info->ip6m_addr.sin6_family = AF_INET6; mtu_info->ip6m_addr.sin6_port = 0; mtu_info->ip6m_addr.sin6_flowinfo = 0; mtu_info->ip6m_addr.sin6_scope_id = fl6->flowi6_oif; mtu_info->ip6m_addr.sin6_addr = ipv6_hdr(skb)->daddr; __skb_pull(skb, skb_tail_pointer(skb) - skb->data); skb_reset_transport_header(skb); skb = xchg(&np->rxpmtu, skb); kfree_skb(skb); } /* For some errors we have valid addr_offset even with zero payload and * zero port. Also, addr_offset should be supported if port is set. */ static inline bool ipv6_datagram_support_addr(struct sock_exterr_skb *serr) { return serr->ee.ee_origin == SO_EE_ORIGIN_ICMP6 || serr->ee.ee_origin == SO_EE_ORIGIN_ICMP || serr->ee.ee_origin == SO_EE_ORIGIN_LOCAL || serr->port; } /* IPv6 supports cmsg on all origins aside from SO_EE_ORIGIN_LOCAL. * * At one point, excluding local errors was a quick test to identify icmp/icmp6 * errors. This is no longer true, but the test remained, so the v6 stack, * unlike v4, also honors cmsg requests on all wifi and timestamp errors. */ static bool ip6_datagram_support_cmsg(struct sk_buff *skb, struct sock_exterr_skb *serr) { if (serr->ee.ee_origin == SO_EE_ORIGIN_ICMP || serr->ee.ee_origin == SO_EE_ORIGIN_ICMP6) return true; if (serr->ee.ee_origin == SO_EE_ORIGIN_LOCAL) return false; if (!IP6CB(skb)->iif) return false; return true; } /* * Handle MSG_ERRQUEUE */ int ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) { struct ipv6_pinfo *np = inet6_sk(sk); struct sock_exterr_skb *serr; struct sk_buff *skb; DECLARE_SOCKADDR(struct sockaddr_in6 *, sin, msg->msg_name); struct { struct sock_extended_err ee; struct sockaddr_in6 offender; } errhdr; int err; int copied; err = -EAGAIN; skb = sock_dequeue_err_skb(sk); if (!skb) goto out; copied = skb->len; if (copied > len) { msg->msg_flags |= MSG_TRUNC; copied = len; } err = skb_copy_datagram_msg(skb, 0, msg, copied); if (unlikely(err)) { kfree_skb(skb); return err; } sock_recv_timestamp(msg, sk, skb); serr = SKB_EXT_ERR(skb); if (sin && ipv6_datagram_support_addr(serr)) { const unsigned char *nh = skb_network_header(skb); sin->sin6_family = AF_INET6; sin->sin6_flowinfo = 0; sin->sin6_port = serr->port; if (skb->protocol == htons(ETH_P_IPV6)) { const struct ipv6hdr *ip6h = container_of((struct in6_addr *)(nh + serr->addr_offset), struct ipv6hdr, daddr); sin->sin6_addr = ip6h->daddr; if (inet6_test_bit(SNDFLOW, sk)) sin->sin6_flowinfo = ip6_flowinfo(ip6h); sin->sin6_scope_id = ipv6_iface_scope_id(&sin->sin6_addr, IP6CB(skb)->iif); } else { ipv6_addr_set_v4mapped(*(__be32 *)(nh + serr->addr_offset), &sin->sin6_addr); sin->sin6_scope_id = 0; } *addr_len = sizeof(*sin); } memcpy(&errhdr.ee, &serr->ee, sizeof(struct sock_extended_err)); sin = &errhdr.offender; memset(sin, 0, sizeof(*sin)); if (ip6_datagram_support_cmsg(skb, serr)) { sin->sin6_family = AF_INET6; if (np->rxopt.all) ip6_datagram_recv_common_ctl(sk, msg, skb); if (skb->protocol == htons(ETH_P_IPV6)) { sin->sin6_addr = ipv6_hdr(skb)->saddr; if (np->rxopt.all) ip6_datagram_recv_specific_ctl(sk, msg, skb); sin->sin6_scope_id = ipv6_iface_scope_id(&sin->sin6_addr, IP6CB(skb)->iif); } else { ipv6_addr_set_v4mapped(ip_hdr(skb)->saddr, &sin->sin6_addr); if (inet_cmsg_flags(inet_sk(sk))) ip_cmsg_recv(msg, skb); } } put_cmsg(msg, SOL_IPV6, IPV6_RECVERR, sizeof(errhdr), &errhdr); /* Now we could try to dump offended packet options */ msg->msg_flags |= MSG_ERRQUEUE; err = copied; consume_skb(skb); out: return err; } EXPORT_SYMBOL_GPL(ipv6_recv_error); /* * Handle IPV6_RECVPATHMTU */ int ipv6_recv_rxpmtu(struct sock *sk, struct msghdr *msg, int len, int *addr_len) { struct ipv6_pinfo *np = inet6_sk(sk); struct sk_buff *skb; struct ip6_mtuinfo mtu_info; DECLARE_SOCKADDR(struct sockaddr_in6 *, sin, msg->msg_name); int err; int copied; err = -EAGAIN; skb = xchg(&np->rxpmtu, NULL); if (!skb) goto out; copied = skb->len; if (copied > len) { msg->msg_flags |= MSG_TRUNC; copied = len; } err = skb_copy_datagram_msg(skb, 0, msg, copied); if (err) goto out_free_skb; sock_recv_timestamp(msg, sk, skb); memcpy(&mtu_info, IP6CBMTU(skb), sizeof(mtu_info)); if (sin) { sin->sin6_family = AF_INET6; sin->sin6_flowinfo = 0; sin->sin6_port = 0; sin->sin6_scope_id = mtu_info.ip6m_addr.sin6_scope_id; sin->sin6_addr = mtu_info.ip6m_addr.sin6_addr; *addr_len = sizeof(*sin); } put_cmsg(msg, SOL_IPV6, IPV6_PATHMTU, sizeof(mtu_info), &mtu_info); err = copied; out_free_skb: kfree_skb(skb); out: return err; } void ip6_datagram_recv_common_ctl(struct sock *sk, struct msghdr *msg, struct sk_buff *skb) { struct ipv6_pinfo *np = inet6_sk(sk); bool is_ipv6 = skb->protocol == htons(ETH_P_IPV6); if (np->rxopt.bits.rxinfo) { struct in6_pktinfo src_info; if (is_ipv6) { src_info.ipi6_ifindex = IP6CB(skb)->iif; src_info.ipi6_addr = ipv6_hdr(skb)->daddr; } else { src_info.ipi6_ifindex = PKTINFO_SKB_CB(skb)->ipi_ifindex; ipv6_addr_set_v4mapped(ip_hdr(skb)->daddr, &src_info.ipi6_addr); } if (src_info.ipi6_ifindex >= 0) put_cmsg(msg, SOL_IPV6, IPV6_PKTINFO, sizeof(src_info), &src_info); } } void ip6_datagram_recv_specific_ctl(struct sock *sk, struct msghdr *msg, struct sk_buff *skb) { struct ipv6_pinfo *np = inet6_sk(sk); struct inet6_skb_parm *opt = IP6CB(skb); unsigned char *nh = skb_network_header(skb); if (np->rxopt.bits.rxhlim) { int hlim = ipv6_hdr(skb)->hop_limit; put_cmsg(msg, SOL_IPV6, IPV6_HOPLIMIT, sizeof(hlim), &hlim); } if (np->rxopt.bits.rxtclass) { int tclass = ipv6_get_dsfield(ipv6_hdr(skb)); put_cmsg(msg, SOL_IPV6, IPV6_TCLASS, sizeof(tclass), &tclass); } if (np->rxopt.bits.rxflow) { __be32 flowinfo = ip6_flowinfo((struct ipv6hdr *)nh); if (flowinfo) put_cmsg(msg, SOL_IPV6, IPV6_FLOWINFO, sizeof(flowinfo), &flowinfo); } /* HbH is allowed only once */ if (np->rxopt.bits.hopopts && (opt->flags & IP6SKB_HOPBYHOP)) { u8 *ptr = nh + sizeof(struct ipv6hdr); put_cmsg(msg, SOL_IPV6, IPV6_HOPOPTS, (ptr[1]+1)<<3, ptr); } if (opt->lastopt && (np->rxopt.bits.dstopts || np->rxopt.bits.srcrt)) { /* * Silly enough, but we need to reparse in order to * report extension headers (except for HbH) * in order. * * Also note that IPV6_RECVRTHDRDSTOPTS is NOT * (and WILL NOT be) defined because * IPV6_RECVDSTOPTS is more generic. --yoshfuji */ unsigned int off = sizeof(struct ipv6hdr); u8 nexthdr = ipv6_hdr(skb)->nexthdr; while (off <= opt->lastopt) { unsigned int len; u8 *ptr = nh + off; switch (nexthdr) { case IPPROTO_DSTOPTS: nexthdr = ptr[0]; len = (ptr[1] + 1) << 3; if (np->rxopt.bits.dstopts) put_cmsg(msg, SOL_IPV6, IPV6_DSTOPTS, len, ptr); break; case IPPROTO_ROUTING: nexthdr = ptr[0]; len = (ptr[1] + 1) << 3; if (np->rxopt.bits.srcrt) put_cmsg(msg, SOL_IPV6, IPV6_RTHDR, len, ptr); break; case IPPROTO_AH: nexthdr = ptr[0]; len = (ptr[1] + 2) << 2; break; default: nexthdr = ptr[0]; len = (ptr[1] + 1) << 3; break; } off += len; } } /* socket options in old style */ if (np->rxopt.bits.rxoinfo) { struct in6_pktinfo src_info; src_info.ipi6_ifindex = opt->iif; src_info.ipi6_addr = ipv6_hdr(skb)->daddr; put_cmsg(msg, SOL_IPV6, IPV6_2292PKTINFO, sizeof(src_info), &src_info); } if (np->rxopt.bits.rxohlim) { int hlim = ipv6_hdr(skb)->hop_limit; put_cmsg(msg, SOL_IPV6, IPV6_2292HOPLIMIT, sizeof(hlim), &hlim); } if (np->rxopt.bits.ohopopts && (opt->flags & IP6SKB_HOPBYHOP)) { u8 *ptr = nh + sizeof(struct ipv6hdr); put_cmsg(msg, SOL_IPV6, IPV6_2292HOPOPTS, (ptr[1]+1)<<3, ptr); } if (np->rxopt.bits.odstopts && opt->dst0) { u8 *ptr = nh + opt->dst0; put_cmsg(msg, SOL_IPV6, IPV6_2292DSTOPTS, (ptr[1]+1)<<3, ptr); } if (np->rxopt.bits.osrcrt && opt->srcrt) { struct ipv6_rt_hdr *rthdr = (struct ipv6_rt_hdr *)(nh + opt->srcrt); put_cmsg(msg, SOL_IPV6, IPV6_2292RTHDR, (rthdr->hdrlen+1) << 3, rthdr); } if (np->rxopt.bits.odstopts && opt->dst1) { u8 *ptr = nh + opt->dst1; put_cmsg(msg, SOL_IPV6, IPV6_2292DSTOPTS, (ptr[1]+1)<<3, ptr); } if (np->rxopt.bits.rxorigdstaddr) { struct sockaddr_in6 sin6; __be16 _ports[2], *ports; ports = skb_header_pointer(skb, skb_transport_offset(skb), sizeof(_ports), &_ports); if (ports) { /* All current transport protocols have the port numbers in the * first four bytes of the transport header and this function is * written with this assumption in mind. */ sin6.sin6_family = AF_INET6; sin6.sin6_addr = ipv6_hdr(skb)->daddr; sin6.sin6_port = ports[1]; sin6.sin6_flowinfo = 0; sin6.sin6_scope_id = ipv6_iface_scope_id(&ipv6_hdr(skb)->daddr, opt->iif); put_cmsg(msg, SOL_IPV6, IPV6_ORIGDSTADDR, sizeof(sin6), &sin6); } } if (np->rxopt.bits.recvfragsize && opt->frag_max_size) { int val = opt->frag_max_size; put_cmsg(msg, SOL_IPV6, IPV6_RECVFRAGSIZE, sizeof(val), &val); } } void ip6_datagram_recv_ctl(struct sock *sk, struct msghdr *msg, struct sk_buff *skb) { ip6_datagram_recv_common_ctl(sk, msg, skb); ip6_datagram_recv_specific_ctl(sk, msg, skb); } EXPORT_SYMBOL_GPL(ip6_datagram_recv_ctl); int ip6_datagram_send_ctl(struct net *net, struct sock *sk, struct msghdr *msg, struct flowi6 *fl6, struct ipcm6_cookie *ipc6) { struct in6_pktinfo *src_info; struct cmsghdr *cmsg; struct ipv6_rt_hdr *rthdr; struct ipv6_opt_hdr *hdr; struct ipv6_txoptions *opt = ipc6->opt; int len; int err = 0; for_each_cmsghdr(cmsg, msg) { int addr_type; if (!CMSG_OK(msg, cmsg)) { err = -EINVAL; goto exit_f; } if (cmsg->cmsg_level == SOL_SOCKET) { err = __sock_cmsg_send(sk, cmsg, &ipc6->sockc); if (err) return err; continue; } if (cmsg->cmsg_level != SOL_IPV6) continue; switch (cmsg->cmsg_type) { case IPV6_PKTINFO: case IPV6_2292PKTINFO: { struct net_device *dev = NULL; int src_idx; if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct in6_pktinfo))) { err = -EINVAL; goto exit_f; } src_info = (struct in6_pktinfo *)CMSG_DATA(cmsg); src_idx = src_info->ipi6_ifindex; if (src_idx) { if (fl6->flowi6_oif && src_idx != fl6->flowi6_oif && (READ_ONCE(sk->sk_bound_dev_if) != fl6->flowi6_oif || !sk_dev_equal_l3scope(sk, src_idx))) return -EINVAL; fl6->flowi6_oif = src_idx; } addr_type = __ipv6_addr_type(&src_info->ipi6_addr); rcu_read_lock(); if (fl6->flowi6_oif) { dev = dev_get_by_index_rcu(net, fl6->flowi6_oif); if (!dev) { rcu_read_unlock(); return -ENODEV; } } else if (addr_type & IPV6_ADDR_LINKLOCAL) { rcu_read_unlock(); return -EINVAL; } if (addr_type != IPV6_ADDR_ANY) { int strict = __ipv6_addr_src_scope(addr_type) <= IPV6_ADDR_SCOPE_LINKLOCAL; if (!ipv6_can_nonlocal_bind(net, inet_sk(sk)) && !ipv6_chk_addr_and_flags(net, &src_info->ipi6_addr, dev, !strict, 0, IFA_F_TENTATIVE) && !ipv6_chk_acast_addr_src(net, dev, &src_info->ipi6_addr)) err = -EINVAL; else fl6->saddr = src_info->ipi6_addr; } rcu_read_unlock(); if (err) goto exit_f; break; } case IPV6_FLOWINFO: if (cmsg->cmsg_len < CMSG_LEN(4)) { err = -EINVAL; goto exit_f; } if (fl6->flowlabel&IPV6_FLOWINFO_MASK) { if ((fl6->flowlabel^*(__be32 *)CMSG_DATA(cmsg))&~IPV6_FLOWINFO_MASK) { err = -EINVAL; goto exit_f; } } fl6->flowlabel = IPV6_FLOWINFO_MASK & *(__be32 *)CMSG_DATA(cmsg); break; case IPV6_2292HOPOPTS: case IPV6_HOPOPTS: if (opt->hopopt || cmsg->cmsg_len < CMSG_LEN(sizeof(struct ipv6_opt_hdr))) { err = -EINVAL; goto exit_f; } hdr = (struct ipv6_opt_hdr *)CMSG_DATA(cmsg); len = ((hdr->hdrlen + 1) << 3); if (cmsg->cmsg_len < CMSG_LEN(len)) { err = -EINVAL; goto exit_f; } if (!ns_capable(net->user_ns, CAP_NET_RAW)) { err = -EPERM; goto exit_f; } opt->opt_nflen += len; opt->hopopt = hdr; break; case IPV6_2292DSTOPTS: if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct ipv6_opt_hdr))) { err = -EINVAL; goto exit_f; } hdr = (struct ipv6_opt_hdr *)CMSG_DATA(cmsg); len = ((hdr->hdrlen + 1) << 3); if (cmsg->cmsg_len < CMSG_LEN(len)) { err = -EINVAL; goto exit_f; } if (!ns_capable(net->user_ns, CAP_NET_RAW)) { err = -EPERM; goto exit_f; } if (opt->dst1opt) { err = -EINVAL; goto exit_f; } opt->opt_flen += len; opt->dst1opt = hdr; break; case IPV6_DSTOPTS: case IPV6_RTHDRDSTOPTS: if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct ipv6_opt_hdr))) { err = -EINVAL; goto exit_f; } hdr = (struct ipv6_opt_hdr *)CMSG_DATA(cmsg); len = ((hdr->hdrlen + 1) << 3); if (cmsg->cmsg_len < CMSG_LEN(len)) { err = -EINVAL; goto exit_f; } if (!ns_capable(net->user_ns, CAP_NET_RAW)) { err = -EPERM; goto exit_f; } if (cmsg->cmsg_type == IPV6_DSTOPTS) { opt->opt_flen += len; opt->dst1opt = hdr; } else { opt->opt_nflen += len; opt->dst0opt = hdr; } break; case IPV6_2292RTHDR: case IPV6_RTHDR: if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct ipv6_rt_hdr))) { err = -EINVAL; goto exit_f; } rthdr = (struct ipv6_rt_hdr *)CMSG_DATA(cmsg); switch (rthdr->type) { #if IS_ENABLED(CONFIG_IPV6_MIP6) case IPV6_SRCRT_TYPE_2: if (rthdr->hdrlen != 2 || rthdr->segments_left != 1) { err = -EINVAL; goto exit_f; } break; #endif default: err = -EINVAL; goto exit_f; } len = ((rthdr->hdrlen + 1) << 3); if (cmsg->cmsg_len < CMSG_LEN(len)) { err = -EINVAL; goto exit_f; } /* segments left must also match */ if ((rthdr->hdrlen >> 1) != rthdr->segments_left) { err = -EINVAL; goto exit_f; } opt->opt_nflen += len; opt->srcrt = rthdr; if (cmsg->cmsg_type == IPV6_2292RTHDR && opt->dst1opt) { int dsthdrlen = ((opt->dst1opt->hdrlen+1)<<3); opt->opt_nflen += dsthdrlen; opt->dst0opt = opt->dst1opt; opt->dst1opt = NULL; opt->opt_flen -= dsthdrlen; } break; case IPV6_2292HOPLIMIT: case IPV6_HOPLIMIT: if (cmsg->cmsg_len != CMSG_LEN(sizeof(int))) { err = -EINVAL; goto exit_f; } ipc6->hlimit = *(int *)CMSG_DATA(cmsg); if (ipc6->hlimit < -1 || ipc6->hlimit > 0xff) { err = -EINVAL; goto exit_f; } break; case IPV6_TCLASS: { int tc; err = -EINVAL; if (cmsg->cmsg_len != CMSG_LEN(sizeof(int))) goto exit_f; tc = *(int *)CMSG_DATA(cmsg); if (tc < -1 || tc > 0xff) goto exit_f; err = 0; ipc6->tclass = tc; break; } case IPV6_DONTFRAG: { int df; err = -EINVAL; if (cmsg->cmsg_len != CMSG_LEN(sizeof(int))) goto exit_f; df = *(int *)CMSG_DATA(cmsg); if (df < 0 || df > 1) goto exit_f; err = 0; ipc6->dontfrag = df; break; } default: net_dbg_ratelimited("invalid cmsg type: %d\n", cmsg->cmsg_type); err = -EINVAL; goto exit_f; } } exit_f: return err; } EXPORT_SYMBOL_GPL(ip6_datagram_send_ctl); void __ip6_dgram_sock_seq_show(struct seq_file *seq, struct sock *sp, __u16 srcp, __u16 destp, int rqueue, int bucket) { const struct in6_addr *dest, *src; dest = &sp->sk_v6_daddr; src = &sp->sk_v6_rcv_saddr; seq_printf(seq, "%5d: %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X " "%02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %u\n", bucket, src->s6_addr32[0], src->s6_addr32[1], src->s6_addr32[2], src->s6_addr32[3], srcp, dest->s6_addr32[0], dest->s6_addr32[1], dest->s6_addr32[2], dest->s6_addr32[3], destp, sp->sk_state, sk_wmem_alloc_get(sp), rqueue, 0, 0L, 0, from_kuid_munged(seq_user_ns(seq), sock_i_uid(sp)), 0, sock_i_ino(sp), refcount_read(&sp->sk_refcnt), sp, atomic_read(&sp->sk_drops)); } |
| 3437 1 686 2369 2272 2371 2368 2375 2372 2372 2373 2371 2 2 1 2 240 239 1777 502 501 492 1768 1769 2371 2368 103 5 5 5 5 103 103 2289 2236 2279 2237 2236 2231 2227 2228 2225 2229 101 101 101 189 189 96 98 189 189 204 195 193 4 189 189 193 193 189 204 101 21 101 1 101 20 101 51 101 101 101 2227 2225 12 12 2220 2225 2228 385 1310 1312 246 142 951 2 2 2 2 2 2237 2235 2237 688 687 688 979 978 2 2 2 2 976 977 977 3922 3928 3922 903 2234 2233 2235 2236 2231 2236 2 978 976 1348 1348 1348 53 136 136 135 2088 2088 2087 2002 136 136 136 136 136 2088 2140 2135 2135 2135 1253 7 5 6 2 2 2 1 1 2 7 2250 2250 9 1976 2245 6 5 3 2231 2240 2227 67 2238 2235 2236 2 2233 2235 1 2237 243 240 239 154 239 239 239 238 239 4 239 237 237 3 243 1286 338 338 45 889 889 890 100 100 819 889 889 575 339 339 217 339 3 865 773 772 772 772 772 434 751 724 21 21 743 952 953 927 949 950 949 2230 2224 2218 2208 2207 2208 1232 1970 1944 1870 1870 1865 1864 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 | // SPDX-License-Identifier: GPL-2.0-or-later /* * NETLINK Kernel-user communication protocol. * * Authors: Alan Cox <alan@lxorguk.ukuu.org.uk> * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru> * Patrick McHardy <kaber@trash.net> * * Tue Jun 26 14:36:48 MEST 2001 Herbert "herp" Rosmanith * added netlink_proto_exit * Tue Jan 22 18:32:44 BRST 2002 Arnaldo C. de Melo <acme@conectiva.com.br> * use nlk_sk, as sk->protinfo is on a diet 8) * Fri Jul 22 19:51:12 MEST 2005 Harald Welte <laforge@gnumonks.org> * - inc module use count of module that owns * the kernel socket in case userspace opens * socket of same protocol * - remove all module support, since netlink is * mandatory if CONFIG_NET=y these days */ #include <linux/module.h> #include <linux/bpf.h> #include <linux/capability.h> #include <linux/kernel.h> #include <linux/filter.h> #include <linux/init.h> #include <linux/signal.h> #include <linux/sched.h> #include <linux/errno.h> #include <linux/string.h> #include <linux/stat.h> #include <linux/socket.h> #include <linux/un.h> #include <linux/fcntl.h> #include <linux/termios.h> #include <linux/sockios.h> #include <linux/net.h> #include <linux/fs.h> #include <linux/slab.h> #include <linux/uaccess.h> #include <linux/skbuff.h> #include <linux/netdevice.h> #include <linux/rtnetlink.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/notifier.h> #include <linux/security.h> #include <linux/jhash.h> #include <linux/jiffies.h> #include <linux/random.h> #include <linux/bitops.h> #include <linux/mm.h> #include <linux/types.h> #include <linux/audit.h> #include <linux/mutex.h> #include <linux/vmalloc.h> #include <linux/if_arp.h> #include <linux/rhashtable.h> #include <asm/cacheflush.h> #include <linux/hash.h> #include <linux/genetlink.h> #include <linux/net_namespace.h> #include <linux/nospec.h> #include <linux/btf_ids.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <net/sock.h> #include <net/scm.h> #include <net/netlink.h> #define CREATE_TRACE_POINTS #include <trace/events/netlink.h> #include "af_netlink.h" struct listeners { struct rcu_head rcu; unsigned long masks[]; }; /* state bits */ #define NETLINK_S_CONGESTED 0x0 static inline int netlink_is_kernel(struct sock *sk) { return nlk_test_bit(KERNEL_SOCKET, sk); } struct netlink_table *nl_table __read_mostly; EXPORT_SYMBOL_GPL(nl_table); static DECLARE_WAIT_QUEUE_HEAD(nl_table_wait); static struct lock_class_key nlk_cb_mutex_keys[MAX_LINKS]; static const char *const nlk_cb_mutex_key_strings[MAX_LINKS + 1] = { "nlk_cb_mutex-ROUTE", "nlk_cb_mutex-1", "nlk_cb_mutex-USERSOCK", "nlk_cb_mutex-FIREWALL", "nlk_cb_mutex-SOCK_DIAG", "nlk_cb_mutex-NFLOG", "nlk_cb_mutex-XFRM", "nlk_cb_mutex-SELINUX", "nlk_cb_mutex-ISCSI", "nlk_cb_mutex-AUDIT", "nlk_cb_mutex-FIB_LOOKUP", "nlk_cb_mutex-CONNECTOR", "nlk_cb_mutex-NETFILTER", "nlk_cb_mutex-IP6_FW", "nlk_cb_mutex-DNRTMSG", "nlk_cb_mutex-KOBJECT_UEVENT", "nlk_cb_mutex-GENERIC", "nlk_cb_mutex-17", "nlk_cb_mutex-SCSITRANSPORT", "nlk_cb_mutex-ECRYPTFS", "nlk_cb_mutex-RDMA", "nlk_cb_mutex-CRYPTO", "nlk_cb_mutex-SMC", "nlk_cb_mutex-23", "nlk_cb_mutex-24", "nlk_cb_mutex-25", "nlk_cb_mutex-26", "nlk_cb_mutex-27", "nlk_cb_mutex-28", "nlk_cb_mutex-29", "nlk_cb_mutex-30", "nlk_cb_mutex-31", "nlk_cb_mutex-MAX_LINKS" }; static int netlink_dump(struct sock *sk); /* nl_table locking explained: * Lookup and traversal are protected with an RCU read-side lock. Insertion * and removal are protected with per bucket lock while using RCU list * modification primitives and may run in parallel to RCU protected lookups. * Destruction of the Netlink socket may only occur *after* nl_table_lock has * been acquired * either during or after the socket has been removed from * the list and after an RCU grace period. */ DEFINE_RWLOCK(nl_table_lock); EXPORT_SYMBOL_GPL(nl_table_lock); static atomic_t nl_table_users = ATOMIC_INIT(0); #define nl_deref_protected(X) rcu_dereference_protected(X, lockdep_is_held(&nl_table_lock)); static BLOCKING_NOTIFIER_HEAD(netlink_chain); static const struct rhashtable_params netlink_rhashtable_params; void do_trace_netlink_extack(const char *msg) { trace_netlink_extack(msg); } EXPORT_SYMBOL(do_trace_netlink_extack); static inline u32 netlink_group_mask(u32 group) { if (group > 32) return 0; return group ? 1 << (group - 1) : 0; } static struct sk_buff *netlink_to_full_skb(const struct sk_buff *skb, gfp_t gfp_mask) { unsigned int len = skb_end_offset(skb); struct sk_buff *new; new = alloc_skb(len, gfp_mask); if (new == NULL) return NULL; NETLINK_CB(new).portid = NETLINK_CB(skb).portid; NETLINK_CB(new).dst_group = NETLINK_CB(skb).dst_group; NETLINK_CB(new).creds = NETLINK_CB(skb).creds; skb_put_data(new, skb->data, len); return new; } static unsigned int netlink_tap_net_id; struct netlink_tap_net { struct list_head netlink_tap_all; struct mutex netlink_tap_lock; }; int netlink_add_tap(struct netlink_tap *nt) { struct net *net = dev_net(nt->dev); struct netlink_tap_net *nn = net_generic(net, netlink_tap_net_id); if (unlikely(nt->dev->type != ARPHRD_NETLINK)) return -EINVAL; mutex_lock(&nn->netlink_tap_lock); list_add_rcu(&nt->list, &nn->netlink_tap_all); mutex_unlock(&nn->netlink_tap_lock); __module_get(nt->module); return 0; } EXPORT_SYMBOL_GPL(netlink_add_tap); static int __netlink_remove_tap(struct netlink_tap *nt) { struct net *net = dev_net(nt->dev); struct netlink_tap_net *nn = net_generic(net, netlink_tap_net_id); bool found = false; struct netlink_tap *tmp; mutex_lock(&nn->netlink_tap_lock); list_for_each_entry(tmp, &nn->netlink_tap_all, list) { if (nt == tmp) { list_del_rcu(&nt->list); found = true; goto out; } } pr_warn("__netlink_remove_tap: %p not found\n", nt); out: mutex_unlock(&nn->netlink_tap_lock); if (found) module_put(nt->module); return found ? 0 : -ENODEV; } int netlink_remove_tap(struct netlink_tap *nt) { int ret; ret = __netlink_remove_tap(nt); synchronize_net(); return ret; } EXPORT_SYMBOL_GPL(netlink_remove_tap); static __net_init int netlink_tap_init_net(struct net *net) { struct netlink_tap_net *nn = net_generic(net, netlink_tap_net_id); INIT_LIST_HEAD(&nn->netlink_tap_all); mutex_init(&nn->netlink_tap_lock); return 0; } static struct pernet_operations netlink_tap_net_ops = { .init = netlink_tap_init_net, .id = &netlink_tap_net_id, .size = sizeof(struct netlink_tap_net), }; static bool netlink_filter_tap(const struct sk_buff *skb) { struct sock *sk = skb->sk; /* We take the more conservative approach and * whitelist socket protocols that may pass. */ switch (sk->sk_protocol) { case NETLINK_ROUTE: case NETLINK_USERSOCK: case NETLINK_SOCK_DIAG: case NETLINK_NFLOG: case NETLINK_XFRM: case NETLINK_FIB_LOOKUP: case NETLINK_NETFILTER: case NETLINK_GENERIC: return true; } return false; } static int __netlink_deliver_tap_skb(struct sk_buff *skb, struct net_device *dev) { struct sk_buff *nskb; struct sock *sk = skb->sk; int ret = -ENOMEM; if (!net_eq(dev_net(dev), sock_net(sk))) return 0; dev_hold(dev); if (is_vmalloc_addr(skb->head)) nskb = netlink_to_full_skb(skb, GFP_ATOMIC); else nskb = skb_clone(skb, GFP_ATOMIC); if (nskb) { nskb->dev = dev; nskb->protocol = htons((u16) sk->sk_protocol); nskb->pkt_type = netlink_is_kernel(sk) ? PACKET_KERNEL : PACKET_USER; skb_reset_network_header(nskb); ret = dev_queue_xmit(nskb); if (unlikely(ret > 0)) ret = net_xmit_errno(ret); } dev_put(dev); return ret; } static void __netlink_deliver_tap(struct sk_buff *skb, struct netlink_tap_net *nn) { int ret; struct netlink_tap *tmp; if (!netlink_filter_tap(skb)) return; list_for_each_entry_rcu(tmp, &nn->netlink_tap_all, list) { ret = __netlink_deliver_tap_skb(skb, tmp->dev); if (unlikely(ret)) break; } } static void netlink_deliver_tap(struct net *net, struct sk_buff *skb) { struct netlink_tap_net *nn = net_generic(net, netlink_tap_net_id); rcu_read_lock(); if (unlikely(!list_empty(&nn->netlink_tap_all))) __netlink_deliver_tap(skb, nn); rcu_read_unlock(); } static void netlink_deliver_tap_kernel(struct sock *dst, struct sock *src, struct sk_buff *skb) { if (!(netlink_is_kernel(dst) && netlink_is_kernel(src))) netlink_deliver_tap(sock_net(dst), skb); } static void netlink_overrun(struct sock *sk) { if (!nlk_test_bit(RECV_NO_ENOBUFS, sk)) { if (!test_and_set_bit(NETLINK_S_CONGESTED, &nlk_sk(sk)->state)) { WRITE_ONCE(sk->sk_err, ENOBUFS); sk_error_report(sk); } } atomic_inc(&sk->sk_drops); } static void netlink_rcv_wake(struct sock *sk) { struct netlink_sock *nlk = nlk_sk(sk); if (skb_queue_empty_lockless(&sk->sk_receive_queue)) clear_bit(NETLINK_S_CONGESTED, &nlk->state); if (!test_bit(NETLINK_S_CONGESTED, &nlk->state)) wake_up_interruptible(&nlk->wait); } static void netlink_skb_destructor(struct sk_buff *skb) { if (is_vmalloc_addr(skb->head)) { if (!skb->cloned || !atomic_dec_return(&(skb_shinfo(skb)->dataref))) vfree(skb->head); skb->head = NULL; } if (skb->sk != NULL) sock_rfree(skb); } static void netlink_skb_set_owner_r(struct sk_buff *skb, struct sock *sk) { WARN_ON(skb->sk != NULL); skb->sk = sk; skb->destructor = netlink_skb_destructor; atomic_add(skb->truesize, &sk->sk_rmem_alloc); sk_mem_charge(sk, skb->truesize); } static void netlink_sock_destruct(struct sock *sk) { struct netlink_sock *nlk = nlk_sk(sk); if (nlk->cb_running) { if (nlk->cb.done) nlk->cb.done(&nlk->cb); module_put(nlk->cb.module); kfree_skb(nlk->cb.skb); } skb_queue_purge(&sk->sk_receive_queue); if (!sock_flag(sk, SOCK_DEAD)) { printk(KERN_ERR "Freeing alive netlink socket %p\n", sk); return; } WARN_ON(atomic_read(&sk->sk_rmem_alloc)); WARN_ON(refcount_read(&sk->sk_wmem_alloc)); WARN_ON(nlk_sk(sk)->groups); } static void netlink_sock_destruct_work(struct work_struct *work) { struct netlink_sock *nlk = container_of(work, struct netlink_sock, work); sk_free(&nlk->sk); } /* This lock without WQ_FLAG_EXCLUSIVE is good on UP and it is _very_ bad on * SMP. Look, when several writers sleep and reader wakes them up, all but one * immediately hit write lock and grab all the cpus. Exclusive sleep solves * this, _but_ remember, it adds useless work on UP machines. */ void netlink_table_grab(void) __acquires(nl_table_lock) { might_sleep(); write_lock_irq(&nl_table_lock); if (atomic_read(&nl_table_users)) { DECLARE_WAITQUEUE(wait, current); add_wait_queue_exclusive(&nl_table_wait, &wait); for (;;) { set_current_state(TASK_UNINTERRUPTIBLE); if (atomic_read(&nl_table_users) == 0) break; write_unlock_irq(&nl_table_lock); schedule(); write_lock_irq(&nl_table_lock); } __set_current_state(TASK_RUNNING); remove_wait_queue(&nl_table_wait, &wait); } } void netlink_table_ungrab(void) __releases(nl_table_lock) { write_unlock_irq(&nl_table_lock); wake_up(&nl_table_wait); } static inline void netlink_lock_table(void) { unsigned long flags; /* read_lock() synchronizes us to netlink_table_grab */ read_lock_irqsave(&nl_table_lock, flags); atomic_inc(&nl_table_users); read_unlock_irqrestore(&nl_table_lock, flags); } static inline void netlink_unlock_table(void) { if (atomic_dec_and_test(&nl_table_users)) wake_up(&nl_table_wait); } struct netlink_compare_arg { possible_net_t pnet; u32 portid; }; /* Doing sizeof directly may yield 4 extra bytes on 64-bit. */ #define netlink_compare_arg_len \ (offsetof(struct netlink_compare_arg, portid) + sizeof(u32)) static inline int netlink_compare(struct rhashtable_compare_arg *arg, const void *ptr) { const struct netlink_compare_arg *x = arg->key; const struct netlink_sock *nlk = ptr; return nlk->portid != x->portid || !net_eq(sock_net(&nlk->sk), read_pnet(&x->pnet)); } static void netlink_compare_arg_init(struct netlink_compare_arg *arg, struct net *net, u32 portid) { memset(arg, 0, sizeof(*arg)); write_pnet(&arg->pnet, net); arg->portid = portid; } static struct sock *__netlink_lookup(struct netlink_table *table, u32 portid, struct net *net) { struct netlink_compare_arg arg; netlink_compare_arg_init(&arg, net, portid); return rhashtable_lookup_fast(&table->hash, &arg, netlink_rhashtable_params); } static int __netlink_insert(struct netlink_table *table, struct sock *sk) { struct netlink_compare_arg arg; netlink_compare_arg_init(&arg, sock_net(sk), nlk_sk(sk)->portid); return rhashtable_lookup_insert_key(&table->hash, &arg, &nlk_sk(sk)->node, netlink_rhashtable_params); } static struct sock *netlink_lookup(struct net *net, int protocol, u32 portid) { struct netlink_table *table = &nl_table[protocol]; struct sock *sk; rcu_read_lock(); sk = __netlink_lookup(table, portid, net); if (sk) sock_hold(sk); rcu_read_unlock(); return sk; } static const struct proto_ops netlink_ops; static void netlink_update_listeners(struct sock *sk) { struct netlink_table *tbl = &nl_table[sk->sk_protocol]; unsigned long mask; unsigned int i; struct listeners *listeners; listeners = nl_deref_protected(tbl->listeners); if (!listeners) return; for (i = 0; i < NLGRPLONGS(tbl->groups); i++) { mask = 0; sk_for_each_bound(sk, &tbl->mc_list) { if (i < NLGRPLONGS(nlk_sk(sk)->ngroups)) mask |= nlk_sk(sk)->groups[i]; } listeners->masks[i] = mask; } /* this function is only called with the netlink table "grabbed", which * makes sure updates are visible before bind or setsockopt return. */ } static int netlink_insert(struct sock *sk, u32 portid) { struct netlink_table *table = &nl_table[sk->sk_protocol]; int err; lock_sock(sk); err = nlk_sk(sk)->portid == portid ? 0 : -EBUSY; if (nlk_sk(sk)->bound) goto err; /* portid can be read locklessly from netlink_getname(). */ WRITE_ONCE(nlk_sk(sk)->portid, portid); sock_hold(sk); err = __netlink_insert(table, sk); if (err) { /* In case the hashtable backend returns with -EBUSY * from here, it must not escape to the caller. */ if (unlikely(err == -EBUSY)) err = -EOVERFLOW; if (err == -EEXIST) err = -EADDRINUSE; sock_put(sk); goto err; } /* We need to ensure that the socket is hashed and visible. */ smp_wmb(); /* Paired with lockless reads from netlink_bind(), * netlink_connect() and netlink_sendmsg(). */ WRITE_ONCE(nlk_sk(sk)->bound, portid); err: release_sock(sk); return err; } static void netlink_remove(struct sock *sk) { struct netlink_table *table; table = &nl_table[sk->sk_protocol]; if (!rhashtable_remove_fast(&table->hash, &nlk_sk(sk)->node, netlink_rhashtable_params)) { WARN_ON(refcount_read(&sk->sk_refcnt) == 1); __sock_put(sk); } netlink_table_grab(); if (nlk_sk(sk)->subscriptions) { __sk_del_bind_node(sk); netlink_update_listeners(sk); } if (sk->sk_protocol == NETLINK_GENERIC) atomic_inc(&genl_sk_destructing_cnt); netlink_table_ungrab(); } static struct proto netlink_proto = { .name = "NETLINK", .owner = THIS_MODULE, .obj_size = sizeof(struct netlink_sock), }; static int __netlink_create(struct net *net, struct socket *sock, struct mutex *cb_mutex, int protocol, int kern) { struct sock *sk; struct netlink_sock *nlk; sock->ops = &netlink_ops; sk = sk_alloc(net, PF_NETLINK, GFP_KERNEL, &netlink_proto, kern); if (!sk) return -ENOMEM; sock_init_data(sock, sk); nlk = nlk_sk(sk); if (cb_mutex) { nlk->cb_mutex = cb_mutex; } else { nlk->cb_mutex = &nlk->cb_def_mutex; mutex_init(nlk->cb_mutex); lockdep_set_class_and_name(nlk->cb_mutex, nlk_cb_mutex_keys + protocol, nlk_cb_mutex_key_strings[protocol]); } init_waitqueue_head(&nlk->wait); sk->sk_destruct = netlink_sock_destruct; sk->sk_protocol = protocol; return 0; } static int netlink_create(struct net *net, struct socket *sock, int protocol, int kern) { struct module *module = NULL; struct mutex *cb_mutex; struct netlink_sock *nlk; int (*bind)(struct net *net, int group); void (*unbind)(struct net *net, int group); void (*release)(struct sock *sock, unsigned long *groups); int err = 0; sock->state = SS_UNCONNECTED; if (sock->type != SOCK_RAW && sock->type != SOCK_DGRAM) return -ESOCKTNOSUPPORT; if (protocol < 0 || protocol >= MAX_LINKS) return -EPROTONOSUPPORT; protocol = array_index_nospec(protocol, MAX_LINKS); netlink_lock_table(); #ifdef CONFIG_MODULES if (!nl_table[protocol].registered) { netlink_unlock_table(); request_module("net-pf-%d-proto-%d", PF_NETLINK, protocol); netlink_lock_table(); } #endif if (nl_table[protocol].registered && try_module_get(nl_table[protocol].module)) module = nl_table[protocol].module; else err = -EPROTONOSUPPORT; cb_mutex = nl_table[protocol].cb_mutex; bind = nl_table[protocol].bind; unbind = nl_table[protocol].unbind; release = nl_table[protocol].release; netlink_unlock_table(); if (err < 0) goto out; err = __netlink_create(net, sock, cb_mutex, protocol, kern); if (err < 0) goto out_module; sock_prot_inuse_add(net, &netlink_proto, 1); nlk = nlk_sk(sock->sk); nlk->module = module; nlk->netlink_bind = bind; nlk->netlink_unbind = unbind; nlk->netlink_release = release; out: return err; out_module: module_put(module); goto out; } static void deferred_put_nlk_sk(struct rcu_head *head) { struct netlink_sock *nlk = container_of(head, struct netlink_sock, rcu); struct sock *sk = &nlk->sk; kfree(nlk->groups); nlk->groups = NULL; if (!refcount_dec_and_test(&sk->sk_refcnt)) return; if (nlk->cb_running && nlk->cb.done) { INIT_WORK(&nlk->work, netlink_sock_destruct_work); schedule_work(&nlk->work); return; } sk_free(sk); } static int netlink_release(struct socket *sock) { struct sock *sk = sock->sk; struct netlink_sock *nlk; if (!sk) return 0; netlink_remove(sk); sock_orphan(sk); nlk = nlk_sk(sk); /* * OK. Socket is unlinked, any packets that arrive now * will be purged. */ if (nlk->netlink_release) nlk->netlink_release(sk, nlk->groups); /* must not acquire netlink_table_lock in any way again before unbind * and notifying genetlink is done as otherwise it might deadlock */ if (nlk->netlink_unbind) { int i; for (i = 0; i < nlk->ngroups; i++) if (test_bit(i, nlk->groups)) nlk->netlink_unbind(sock_net(sk), i + 1); } if (sk->sk_protocol == NETLINK_GENERIC && atomic_dec_return(&genl_sk_destructing_cnt) == 0) wake_up(&genl_sk_destructing_waitq); sock->sk = NULL; wake_up_interruptible_all(&nlk->wait); skb_queue_purge(&sk->sk_write_queue); if (nlk->portid && nlk->bound) { struct netlink_notify n = { .net = sock_net(sk), .protocol = sk->sk_protocol, .portid = nlk->portid, }; blocking_notifier_call_chain(&netlink_chain, NETLINK_URELEASE, &n); } module_put(nlk->module); if (netlink_is_kernel(sk)) { netlink_table_grab(); BUG_ON(nl_table[sk->sk_protocol].registered == 0); if (--nl_table[sk->sk_protocol].registered == 0) { struct listeners *old; old = nl_deref_protected(nl_table[sk->sk_protocol].listeners); RCU_INIT_POINTER(nl_table[sk->sk_protocol].listeners, NULL); kfree_rcu(old, rcu); nl_table[sk->sk_protocol].module = NULL; nl_table[sk->sk_protocol].bind = NULL; nl_table[sk->sk_protocol].unbind = NULL; nl_table[sk->sk_protocol].flags = 0; nl_table[sk->sk_protocol].registered = 0; } netlink_table_ungrab(); } sock_prot_inuse_add(sock_net(sk), &netlink_proto, -1); /* Because struct net might disappear soon, do not keep a pointer. */ if (!sk->sk_net_refcnt && sock_net(sk) != &init_net) { __netns_tracker_free(sock_net(sk), &sk->ns_tracker, false); /* Because of deferred_put_nlk_sk and use of work queue, * it is possible netns will be freed before this socket. */ sock_net_set(sk, &init_net); __netns_tracker_alloc(&init_net, &sk->ns_tracker, false, GFP_KERNEL); } call_rcu(&nlk->rcu, deferred_put_nlk_sk); return 0; } static int netlink_autobind(struct socket *sock) { struct sock *sk = sock->sk; struct net *net = sock_net(sk); struct netlink_table *table = &nl_table[sk->sk_protocol]; s32 portid = task_tgid_vnr(current); int err; s32 rover = -4096; bool ok; retry: cond_resched(); rcu_read_lock(); ok = !__netlink_lookup(table, portid, net); rcu_read_unlock(); if (!ok) { /* Bind collision, search negative portid values. */ if (rover == -4096) /* rover will be in range [S32_MIN, -4097] */ rover = S32_MIN + get_random_u32_below(-4096 - S32_MIN); else if (rover >= -4096) rover = -4097; portid = rover--; goto retry; } err = netlink_insert(sk, portid); if (err == -EADDRINUSE) goto retry; /* If 2 threads race to autobind, that is fine. */ if (err == -EBUSY) err = 0; return err; } /** * __netlink_ns_capable - General netlink message capability test * @nsp: NETLINK_CB of the socket buffer holding a netlink command from userspace. * @user_ns: The user namespace of the capability to use * @cap: The capability to use * * Test to see if the opener of the socket we received the message * from had when the netlink socket was created and the sender of the * message has the capability @cap in the user namespace @user_ns. */ bool __netlink_ns_capable(const struct netlink_skb_parms *nsp, struct user_namespace *user_ns, int cap) { return ((nsp->flags & NETLINK_SKB_DST) || file_ns_capable(nsp->sk->sk_socket->file, user_ns, cap)) && ns_capable(user_ns, cap); } EXPORT_SYMBOL(__netlink_ns_capable); /** * netlink_ns_capable - General netlink message capability test * @skb: socket buffer holding a netlink command from userspace * @user_ns: The user namespace of the capability to use * @cap: The capability to use * * Test to see if the opener of the socket we received the message * from had when the netlink socket was created and the sender of the * message has the capability @cap in the user namespace @user_ns. */ bool netlink_ns_capable(const struct sk_buff *skb, struct user_namespace *user_ns, int cap) { return __netlink_ns_capable(&NETLINK_CB(skb), user_ns, cap); } EXPORT_SYMBOL(netlink_ns_capable); /** * netlink_capable - Netlink global message capability test * @skb: socket buffer holding a netlink command from userspace * @cap: The capability to use * * Test to see if the opener of the socket we received the message * from had when the netlink socket was created and the sender of the * message has the capability @cap in all user namespaces. */ bool netlink_capable(const struct sk_buff *skb, int cap) { return netlink_ns_capable(skb, &init_user_ns, cap); } EXPORT_SYMBOL(netlink_capable); /** * netlink_net_capable - Netlink network namespace message capability test * @skb: socket buffer holding a netlink command from userspace * @cap: The capability to use * * Test to see if the opener of the socket we received the message * from had when the netlink socket was created and the sender of the * message has the capability @cap over the network namespace of * the socket we received the message from. */ bool netlink_net_capable(const struct sk_buff *skb, int cap) { return netlink_ns_capable(skb, sock_net(skb->sk)->user_ns, cap); } EXPORT_SYMBOL(netlink_net_capable); static inline int netlink_allowed(const struct socket *sock, unsigned int flag) { return (nl_table[sock->sk->sk_protocol].flags & flag) || ns_capable(sock_net(sock->sk)->user_ns, CAP_NET_ADMIN); } static void netlink_update_subscriptions(struct sock *sk, unsigned int subscriptions) { struct netlink_sock *nlk = nlk_sk(sk); if (nlk->subscriptions && !subscriptions) __sk_del_bind_node(sk); else if (!nlk->subscriptions && subscriptions) sk_add_bind_node(sk, &nl_table[sk->sk_protocol].mc_list); nlk->subscriptions = subscriptions; } static int netlink_realloc_groups(struct sock *sk) { struct netlink_sock *nlk = nlk_sk(sk); unsigned int groups; unsigned long *new_groups; int err = 0; netlink_table_grab(); groups = nl_table[sk->sk_protocol].groups; if (!nl_table[sk->sk_protocol].registered) { err = -ENOENT; goto out_unlock; } if (nlk->ngroups >= groups) goto out_unlock; new_groups = krealloc(nlk->groups, NLGRPSZ(groups), GFP_ATOMIC); if (new_groups == NULL) { err = -ENOMEM; goto out_unlock; } memset((char *)new_groups + NLGRPSZ(nlk->ngroups), 0, NLGRPSZ(groups) - NLGRPSZ(nlk->ngroups)); nlk->groups = new_groups; nlk->ngroups = groups; out_unlock: netlink_table_ungrab(); return err; } static void netlink_undo_bind(int group, long unsigned int groups, struct sock *sk) { struct netlink_sock *nlk = nlk_sk(sk); int undo; if (!nlk->netlink_unbind) return; for (undo = 0; undo < group; undo++) if (test_bit(undo, &groups)) nlk->netlink_unbind(sock_net(sk), undo + 1); } static int netlink_bind(struct socket *sock, struct sockaddr *addr, int addr_len) { struct sock *sk = sock->sk; struct net *net = sock_net(sk); struct netlink_sock *nlk = nlk_sk(sk); struct sockaddr_nl *nladdr = (struct sockaddr_nl *)addr; int err = 0; unsigned long groups; bool bound; if (addr_len < sizeof(struct sockaddr_nl)) return -EINVAL; if (nladdr->nl_family != AF_NETLINK) return -EINVAL; groups = nladdr->nl_groups; /* Only superuser is allowed to listen multicasts */ if (groups) { if (!netlink_allowed(sock, NL_CFG_F_NONROOT_RECV)) return -EPERM; err = netlink_realloc_groups(sk); if (err) return err; } if (nlk->ngroups < BITS_PER_LONG) groups &= (1UL << nlk->ngroups) - 1; /* Paired with WRITE_ONCE() in netlink_insert() */ bound = READ_ONCE(nlk->bound); if (bound) { /* Ensure nlk->portid is up-to-date. */ smp_rmb(); if (nladdr->nl_pid != nlk->portid) return -EINVAL; } if (nlk->netlink_bind && groups) { int group; /* nl_groups is a u32, so cap the maximum groups we can bind */ for (group = 0; group < BITS_PER_TYPE(u32); group++) { if (!test_bit(group, &groups)) continue; err = nlk->netlink_bind(net, group + 1); if (!err) continue; netlink_undo_bind(group, groups, sk); return err; } } /* No need for barriers here as we return to user-space without * using any of the bound attributes. */ netlink_lock_table(); if (!bound) { err = nladdr->nl_pid ? netlink_insert(sk, nladdr->nl_pid) : netlink_autobind(sock); if (err) { netlink_undo_bind(BITS_PER_TYPE(u32), groups, sk); goto unlock; } } if (!groups && (nlk->groups == NULL || !(u32)nlk->groups[0])) goto unlock; netlink_unlock_table(); netlink_table_grab(); netlink_update_subscriptions(sk, nlk->subscriptions + hweight32(groups) - hweight32(nlk->groups[0])); nlk->groups[0] = (nlk->groups[0] & ~0xffffffffUL) | groups; netlink_update_listeners(sk); netlink_table_ungrab(); return 0; unlock: netlink_unlock_table(); return err; } static int netlink_connect(struct socket *sock, struct sockaddr *addr, int alen, int flags) { int err = 0; struct sock *sk = sock->sk; struct netlink_sock *nlk = nlk_sk(sk); struct sockaddr_nl *nladdr = (struct sockaddr_nl *)addr; if (alen < sizeof(addr->sa_family)) return -EINVAL; if (addr->sa_family == AF_UNSPEC) { /* paired with READ_ONCE() in netlink_getsockbyportid() */ WRITE_ONCE(sk->sk_state, NETLINK_UNCONNECTED); /* dst_portid and dst_group can be read locklessly */ WRITE_ONCE(nlk->dst_portid, 0); WRITE_ONCE(nlk->dst_group, 0); return 0; } if (addr->sa_family != AF_NETLINK) return -EINVAL; if (alen < sizeof(struct sockaddr_nl)) return -EINVAL; if ((nladdr->nl_groups || nladdr->nl_pid) && !netlink_allowed(sock, NL_CFG_F_NONROOT_SEND)) return -EPERM; /* No need for barriers here as we return to user-space without * using any of the bound attributes. * Paired with WRITE_ONCE() in netlink_insert(). */ if (!READ_ONCE(nlk->bound)) err = netlink_autobind(sock); if (err == 0) { /* paired with READ_ONCE() in netlink_getsockbyportid() */ WRITE_ONCE(sk->sk_state, NETLINK_CONNECTED); /* dst_portid and dst_group can be read locklessly */ WRITE_ONCE(nlk->dst_portid, nladdr->nl_pid); WRITE_ONCE(nlk->dst_group, ffs(nladdr->nl_groups)); } return err; } static int netlink_getname(struct socket *sock, struct sockaddr *addr, int peer) { struct sock *sk = sock->sk; struct netlink_sock *nlk = nlk_sk(sk); DECLARE_SOCKADDR(struct sockaddr_nl *, nladdr, addr); nladdr->nl_family = AF_NETLINK; nladdr->nl_pad = 0; if (peer) { /* Paired with WRITE_ONCE() in netlink_connect() */ nladdr->nl_pid = READ_ONCE(nlk->dst_portid); nladdr->nl_groups = netlink_group_mask(READ_ONCE(nlk->dst_group)); } else { /* Paired with WRITE_ONCE() in netlink_insert() */ nladdr->nl_pid = READ_ONCE(nlk->portid); netlink_lock_table(); nladdr->nl_groups = nlk->groups ? nlk->groups[0] : 0; netlink_unlock_table(); } return sizeof(*nladdr); } static int netlink_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { /* try to hand this ioctl down to the NIC drivers. */ return -ENOIOCTLCMD; } static struct sock *netlink_getsockbyportid(struct sock *ssk, u32 portid) { struct sock *sock; struct netlink_sock *nlk; sock = netlink_lookup(sock_net(ssk), ssk->sk_protocol, portid); if (!sock) return ERR_PTR(-ECONNREFUSED); /* Don't bother queuing skb if kernel socket has no input function */ nlk = nlk_sk(sock); /* dst_portid and sk_state can be changed in netlink_connect() */ if (READ_ONCE(sock->sk_state) == NETLINK_CONNECTED && READ_ONCE(nlk->dst_portid) != nlk_sk(ssk)->portid) { sock_put(sock); return ERR_PTR(-ECONNREFUSED); } return sock; } struct sock *netlink_getsockbyfilp(struct file *filp) { struct inode *inode = file_inode(filp); struct sock *sock; if (!S_ISSOCK(inode->i_mode)) return ERR_PTR(-ENOTSOCK); sock = SOCKET_I(inode)->sk; if (sock->sk_family != AF_NETLINK) return ERR_PTR(-EINVAL); sock_hold(sock); return sock; } struct sk_buff *netlink_alloc_large_skb(unsigned int size, int broadcast) { struct sk_buff *skb; void *data; if (size <= NLMSG_GOODSIZE || broadcast) return alloc_skb(size, GFP_KERNEL); size = SKB_DATA_ALIGN(size) + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); data = vmalloc(size); if (data == NULL) return NULL; skb = __build_skb(data, size); if (skb == NULL) vfree(data); else skb->destructor = netlink_skb_destructor; return skb; } /* * Attach a skb to a netlink socket. * The caller must hold a reference to the destination socket. On error, the * reference is dropped. The skb is not send to the destination, just all * all error checks are performed and memory in the queue is reserved. * Return values: * < 0: error. skb freed, reference to sock dropped. * 0: continue * 1: repeat lookup - reference dropped while waiting for socket memory. */ int netlink_attachskb(struct sock *sk, struct sk_buff *skb, long *timeo, struct sock *ssk) { struct netlink_sock *nlk; nlk = nlk_sk(sk); if ((atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf || test_bit(NETLINK_S_CONGESTED, &nlk->state))) { DECLARE_WAITQUEUE(wait, current); if (!*timeo) { if (!ssk || netlink_is_kernel(ssk)) netlink_overrun(sk); sock_put(sk); kfree_skb(skb); return -EAGAIN; } __set_current_state(TASK_INTERRUPTIBLE); add_wait_queue(&nlk->wait, &wait); if ((atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf || test_bit(NETLINK_S_CONGESTED, &nlk->state)) && !sock_flag(sk, SOCK_DEAD)) *timeo = schedule_timeout(*timeo); __set_current_state(TASK_RUNNING); remove_wait_queue(&nlk->wait, &wait); sock_put(sk); if (signal_pending(current)) { kfree_skb(skb); return sock_intr_errno(*timeo); } return 1; } netlink_skb_set_owner_r(skb, sk); return 0; } static int __netlink_sendskb(struct sock *sk, struct sk_buff *skb) { int len = skb->len; netlink_deliver_tap(sock_net(sk), skb); skb_queue_tail(&sk->sk_receive_queue, skb); sk->sk_data_ready(sk); return len; } int netlink_sendskb(struct sock *sk, struct sk_buff *skb) { int len = __netlink_sendskb(sk, skb); sock_put(sk); return len; } void netlink_detachskb(struct sock *sk, struct sk_buff *skb) { kfree_skb(skb); sock_put(sk); } static struct sk_buff *netlink_trim(struct sk_buff *skb, gfp_t allocation) { int delta; WARN_ON(skb->sk != NULL); delta = skb->end - skb->tail; if (is_vmalloc_addr(skb->head) || delta * 2 < skb->truesize) return skb; if (skb_shared(skb)) { struct sk_buff *nskb = skb_clone(skb, allocation); if (!nskb) return skb; consume_skb(skb); skb = nskb; } pskb_expand_head(skb, 0, -delta, (allocation & ~__GFP_DIRECT_RECLAIM) | __GFP_NOWARN | __GFP_NORETRY); return skb; } static int netlink_unicast_kernel(struct sock *sk, struct sk_buff *skb, struct sock *ssk) { int ret; struct netlink_sock *nlk = nlk_sk(sk); ret = -ECONNREFUSED; if (nlk->netlink_rcv != NULL) { ret = skb->len; netlink_skb_set_owner_r(skb, sk); NETLINK_CB(skb).sk = ssk; netlink_deliver_tap_kernel(sk, ssk, skb); nlk->netlink_rcv(skb); consume_skb(skb); } else { kfree_skb(skb); } sock_put(sk); return ret; } int netlink_unicast(struct sock *ssk, struct sk_buff *skb, u32 portid, int nonblock) { struct sock *sk; int err; long timeo; skb = netlink_trim(skb, gfp_any()); timeo = sock_sndtimeo(ssk, nonblock); retry: sk = netlink_getsockbyportid(ssk, portid); if (IS_ERR(sk)) { kfree_skb(skb); return PTR_ERR(sk); } if (netlink_is_kernel(sk)) return netlink_unicast_kernel(sk, skb, ssk); if (sk_filter(sk, skb)) { err = skb->len; kfree_skb(skb); sock_put(sk); return err; } err = netlink_attachskb(sk, skb, &timeo, ssk); if (err == 1) goto retry; if (err) return err; return netlink_sendskb(sk, skb); } EXPORT_SYMBOL(netlink_unicast); int netlink_has_listeners(struct sock *sk, unsigned int group) { int res = 0; struct listeners *listeners; BUG_ON(!netlink_is_kernel(sk)); rcu_read_lock(); listeners = rcu_dereference(nl_table[sk->sk_protocol].listeners); if (listeners && group - 1 < nl_table[sk->sk_protocol].groups) res = test_bit(group - 1, listeners->masks); rcu_read_unlock(); return res; } EXPORT_SYMBOL_GPL(netlink_has_listeners); bool netlink_strict_get_check(struct sk_buff *skb) { return nlk_test_bit(STRICT_CHK, NETLINK_CB(skb).sk); } EXPORT_SYMBOL_GPL(netlink_strict_get_check); static int netlink_broadcast_deliver(struct sock *sk, struct sk_buff *skb) { struct netlink_sock *nlk = nlk_sk(sk); if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf && !test_bit(NETLINK_S_CONGESTED, &nlk->state)) { netlink_skb_set_owner_r(skb, sk); __netlink_sendskb(sk, skb); return atomic_read(&sk->sk_rmem_alloc) > (sk->sk_rcvbuf >> 1); } return -1; } struct netlink_broadcast_data { struct sock *exclude_sk; struct net *net; u32 portid; u32 group; int failure; int delivery_failure; int congested; int delivered; gfp_t allocation; struct sk_buff *skb, *skb2; int (*tx_filter)(struct sock *dsk, struct sk_buff *skb, void *data); void *tx_data; }; static void do_one_broadcast(struct sock *sk, struct netlink_broadcast_data *p) { struct netlink_sock *nlk = nlk_sk(sk); int val; if (p->exclude_sk == sk) return; if (nlk->portid == p->portid || p->group - 1 >= nlk->ngroups || !test_bit(p->group - 1, nlk->groups)) return; if (!net_eq(sock_net(sk), p->net)) { if (!nlk_test_bit(LISTEN_ALL_NSID, sk)) return; if (!peernet_has_id(sock_net(sk), p->net)) return; if (!file_ns_capable(sk->sk_socket->file, p->net->user_ns, CAP_NET_BROADCAST)) return; } if (p->failure) { netlink_overrun(sk); return; } sock_hold(sk); if (p->skb2 == NULL) { if (skb_shared(p->skb)) { p->skb2 = skb_clone(p->skb, p->allocation); } else { p->skb2 = skb_get(p->skb); /* * skb ownership may have been set when * delivered to a previous socket. */ skb_orphan(p->skb2); } } if (p->skb2 == NULL) { netlink_overrun(sk); /* Clone failed. Notify ALL listeners. */ p->failure = 1; if (nlk_test_bit(BROADCAST_SEND_ERROR, sk)) p->delivery_failure = 1; goto out; } if (p->tx_filter && p->tx_filter(sk, p->skb2, p->tx_data)) { kfree_skb(p->skb2); p->skb2 = NULL; goto out; } if (sk_filter(sk, p->skb2)) { kfree_skb(p->skb2); p->skb2 = NULL; goto out; } NETLINK_CB(p->skb2).nsid = peernet2id(sock_net(sk), p->net); if (NETLINK_CB(p->skb2).nsid != NETNSA_NSID_NOT_ASSIGNED) NETLINK_CB(p->skb2).nsid_is_set = true; val = netlink_broadcast_deliver(sk, p->skb2); if (val < 0) { netlink_overrun(sk); if (nlk_test_bit(BROADCAST_SEND_ERROR, sk)) p->delivery_failure = 1; } else { p->congested |= val; p->delivered = 1; p->skb2 = NULL; } out: sock_put(sk); } int netlink_broadcast_filtered(struct sock *ssk, struct sk_buff *skb, u32 portid, u32 group, gfp_t allocation, netlink_filter_fn filter, void *filter_data) { struct net *net = sock_net(ssk); struct netlink_broadcast_data info; struct sock *sk; skb = netlink_trim(skb, allocation); info.exclude_sk = ssk; info.net = net; info.portid = portid; info.group = group; info.failure = 0; info.delivery_failure = 0; info.congested = 0; info.delivered = 0; info.allocation = allocation; info.skb = skb; info.skb2 = NULL; info.tx_filter = filter; info.tx_data = filter_data; /* While we sleep in clone, do not allow to change socket list */ netlink_lock_table(); sk_for_each_bound(sk, &nl_table[ssk->sk_protocol].mc_list) do_one_broadcast(sk, &info); consume_skb(skb); netlink_unlock_table(); if (info.delivery_failure) { kfree_skb(info.skb2); return -ENOBUFS; } consume_skb(info.skb2); if (info.delivered) { if (info.congested && gfpflags_allow_blocking(allocation)) yield(); return 0; } return -ESRCH; } EXPORT_SYMBOL(netlink_broadcast_filtered); int netlink_broadcast(struct sock *ssk, struct sk_buff *skb, u32 portid, u32 group, gfp_t allocation) { return netlink_broadcast_filtered(ssk, skb, portid, group, allocation, NULL, NULL); } EXPORT_SYMBOL(netlink_broadcast); struct netlink_set_err_data { struct sock *exclude_sk; u32 portid; u32 group; int code; }; static int do_one_set_err(struct sock *sk, struct netlink_set_err_data *p) { struct netlink_sock *nlk = nlk_sk(sk); int ret = 0; if (sk == p->exclude_sk) goto out; if (!net_eq(sock_net(sk), sock_net(p->exclude_sk))) goto out; if (nlk->portid == p->portid || p->group - 1 >= nlk->ngroups || !test_bit(p->group - 1, nlk->groups)) goto out; if (p->code == ENOBUFS && nlk_test_bit(RECV_NO_ENOBUFS, sk)) { ret = 1; goto out; } WRITE_ONCE(sk->sk_err, p->code); sk_error_report(sk); out: return ret; } /** * netlink_set_err - report error to broadcast listeners * @ssk: the kernel netlink socket, as returned by netlink_kernel_create() * @portid: the PORTID of a process that we want to skip (if any) * @group: the broadcast group that will notice the error * @code: error code, must be negative (as usual in kernelspace) * * This function returns the number of broadcast listeners that have set the * NETLINK_NO_ENOBUFS socket option. */ int netlink_set_err(struct sock *ssk, u32 portid, u32 group, int code) { struct netlink_set_err_data info; unsigned long flags; struct sock *sk; int ret = 0; info.exclude_sk = ssk; info.portid = portid; info.group = group; /* sk->sk_err wants a positive error value */ info.code = -code; read_lock_irqsave(&nl_table_lock, flags); sk_for_each_bound(sk, &nl_table[ssk->sk_protocol].mc_list) ret += do_one_set_err(sk, &info); read_unlock_irqrestore(&nl_table_lock, flags); return ret; } EXPORT_SYMBOL(netlink_set_err); /* must be called with netlink table grabbed */ static void netlink_update_socket_mc(struct netlink_sock *nlk, unsigned int group, int is_new) { int old, new = !!is_new, subscriptions; old = test_bit(group - 1, nlk->groups); subscriptions = nlk->subscriptions - old + new; __assign_bit(group - 1, nlk->groups, new); netlink_update_subscriptions(&nlk->sk, subscriptions); netlink_update_listeners(&nlk->sk); } static int netlink_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; struct netlink_sock *nlk = nlk_sk(sk); unsigned int val = 0; int nr = -1; if (level != SOL_NETLINK) return -ENOPROTOOPT; if (optlen >= sizeof(int) && copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; switch (optname) { case NETLINK_PKTINFO: nr = NETLINK_F_RECV_PKTINFO; break; case NETLINK_ADD_MEMBERSHIP: case NETLINK_DROP_MEMBERSHIP: { int err; if (!netlink_allowed(sock, NL_CFG_F_NONROOT_RECV)) return -EPERM; err = netlink_realloc_groups(sk); if (err) return err; if (!val || val - 1 >= nlk->ngroups) return -EINVAL; if (optname == NETLINK_ADD_MEMBERSHIP && nlk->netlink_bind) { err = nlk->netlink_bind(sock_net(sk), val); if (err) return err; } netlink_table_grab(); netlink_update_socket_mc(nlk, val, optname == NETLINK_ADD_MEMBERSHIP); netlink_table_ungrab(); if (optname == NETLINK_DROP_MEMBERSHIP && nlk->netlink_unbind) nlk->netlink_unbind(sock_net(sk), val); break; } case NETLINK_BROADCAST_ERROR: nr = NETLINK_F_BROADCAST_SEND_ERROR; break; case NETLINK_NO_ENOBUFS: assign_bit(NETLINK_F_RECV_NO_ENOBUFS, &nlk->flags, val); if (val) { clear_bit(NETLINK_S_CONGESTED, &nlk->state); wake_up_interruptible(&nlk->wait); } break; case NETLINK_LISTEN_ALL_NSID: if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_BROADCAST)) return -EPERM; nr = NETLINK_F_LISTEN_ALL_NSID; break; case NETLINK_CAP_ACK: nr = NETLINK_F_CAP_ACK; break; case NETLINK_EXT_ACK: nr = NETLINK_F_EXT_ACK; break; case NETLINK_GET_STRICT_CHK: nr = NETLINK_F_STRICT_CHK; break; default: return -ENOPROTOOPT; } if (nr >= 0) assign_bit(nr, &nlk->flags, val); return 0; } static int netlink_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) { struct sock *sk = sock->sk; struct netlink_sock *nlk = nlk_sk(sk); unsigned int flag; int len, val; if (level != SOL_NETLINK) return -ENOPROTOOPT; if (get_user(len, optlen)) return -EFAULT; if (len < 0) return -EINVAL; switch (optname) { case NETLINK_PKTINFO: flag = NETLINK_F_RECV_PKTINFO; break; case NETLINK_BROADCAST_ERROR: flag = NETLINK_F_BROADCAST_SEND_ERROR; break; case NETLINK_NO_ENOBUFS: flag = NETLINK_F_RECV_NO_ENOBUFS; break; case NETLINK_LIST_MEMBERSHIPS: { int pos, idx, shift, err = 0; netlink_lock_table(); for (pos = 0; pos * 8 < nlk->ngroups; pos += sizeof(u32)) { if (len - pos < sizeof(u32)) break; idx = pos / sizeof(unsigned long); shift = (pos % sizeof(unsigned long)) * 8; if (put_user((u32)(nlk->groups[idx] >> shift), (u32 __user *)(optval + pos))) { err = -EFAULT; break; } } if (put_user(ALIGN(BITS_TO_BYTES(nlk->ngroups), sizeof(u32)), optlen)) err = -EFAULT; netlink_unlock_table(); return err; } case NETLINK_CAP_ACK: flag = NETLINK_F_CAP_ACK; break; case NETLINK_EXT_ACK: flag = NETLINK_F_EXT_ACK; break; case NETLINK_GET_STRICT_CHK: flag = NETLINK_F_STRICT_CHK; break; default: return -ENOPROTOOPT; } if (len < sizeof(int)) return -EINVAL; len = sizeof(int); val = test_bit(flag, &nlk->flags); if (put_user(len, optlen) || copy_to_user(optval, &val, len)) return -EFAULT; return 0; } static void netlink_cmsg_recv_pktinfo(struct msghdr *msg, struct sk_buff *skb) { struct nl_pktinfo info; info.group = NETLINK_CB(skb).dst_group; put_cmsg(msg, SOL_NETLINK, NETLINK_PKTINFO, sizeof(info), &info); } static void netlink_cmsg_listen_all_nsid(struct sock *sk, struct msghdr *msg, struct sk_buff *skb) { if (!NETLINK_CB(skb).nsid_is_set) return; put_cmsg(msg, SOL_NETLINK, NETLINK_LISTEN_ALL_NSID, sizeof(int), &NETLINK_CB(skb).nsid); } static int netlink_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { struct sock *sk = sock->sk; struct netlink_sock *nlk = nlk_sk(sk); DECLARE_SOCKADDR(struct sockaddr_nl *, addr, msg->msg_name); u32 dst_portid; u32 dst_group; struct sk_buff *skb; int err; struct scm_cookie scm; u32 netlink_skb_flags = 0; if (msg->msg_flags & MSG_OOB) return -EOPNOTSUPP; if (len == 0) { pr_warn_once("Zero length message leads to an empty skb\n"); return -ENODATA; } err = scm_send(sock, msg, &scm, true); if (err < 0) return err; if (msg->msg_namelen) { err = -EINVAL; if (msg->msg_namelen < sizeof(struct sockaddr_nl)) goto out; if (addr->nl_family != AF_NETLINK) goto out; dst_portid = addr->nl_pid; dst_group = ffs(addr->nl_groups); err = -EPERM; if ((dst_group || dst_portid) && !netlink_allowed(sock, NL_CFG_F_NONROOT_SEND)) goto out; netlink_skb_flags |= NETLINK_SKB_DST; } else { /* Paired with WRITE_ONCE() in netlink_connect() */ dst_portid = READ_ONCE(nlk->dst_portid); dst_group = READ_ONCE(nlk->dst_group); } /* Paired with WRITE_ONCE() in netlink_insert() */ if (!READ_ONCE(nlk->bound)) { err = netlink_autobind(sock); if (err) goto out; } else { /* Ensure nlk is hashed and visible. */ smp_rmb(); } err = -EMSGSIZE; if (len > sk->sk_sndbuf - 32) goto out; err = -ENOBUFS; skb = netlink_alloc_large_skb(len, dst_group); if (skb == NULL) goto out; NETLINK_CB(skb).portid = nlk->portid; NETLINK_CB(skb).dst_group = dst_group; NETLINK_CB(skb).creds = scm.creds; NETLINK_CB(skb).flags = netlink_skb_flags; err = -EFAULT; if (memcpy_from_msg(skb_put(skb, len), msg, len)) { kfree_skb(skb); goto out; } err = security_netlink_send(sk, skb); if (err) { kfree_skb(skb); goto out; } if (dst_group) { refcount_inc(&skb->users); netlink_broadcast(sk, skb, dst_portid, dst_group, GFP_KERNEL); } err = netlink_unicast(sk, skb, dst_portid, msg->msg_flags & MSG_DONTWAIT); out: scm_destroy(&scm); return err; } static int netlink_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int flags) { struct scm_cookie scm; struct sock *sk = sock->sk; struct netlink_sock *nlk = nlk_sk(sk); size_t copied, max_recvmsg_len; struct sk_buff *skb, *data_skb; int err, ret; if (flags & MSG_OOB) return -EOPNOTSUPP; copied = 0; skb = skb_recv_datagram(sk, flags, &err); if (skb == NULL) goto out; data_skb = skb; #ifdef CONFIG_COMPAT_NETLINK_MESSAGES if (unlikely(skb_shinfo(skb)->frag_list)) { /* * If this skb has a frag_list, then here that means that we * will have to use the frag_list skb's data for compat tasks * and the regular skb's data for normal (non-compat) tasks. * * If we need to send the compat skb, assign it to the * 'data_skb' variable so that it will be used below for data * copying. We keep 'skb' for everything else, including * freeing both later. */ if (flags & MSG_CMSG_COMPAT) data_skb = skb_shinfo(skb)->frag_list; } #endif /* Record the max length of recvmsg() calls for future allocations */ max_recvmsg_len = max(READ_ONCE(nlk->max_recvmsg_len), len); max_recvmsg_len = min_t(size_t, max_recvmsg_len, SKB_WITH_OVERHEAD(32768)); WRITE_ONCE(nlk->max_recvmsg_len, max_recvmsg_len); copied = data_skb->len; if (len < copied) { msg->msg_flags |= MSG_TRUNC; copied = len; } err = skb_copy_datagram_msg(data_skb, 0, msg, copied); if (msg->msg_name) { DECLARE_SOCKADDR(struct sockaddr_nl *, addr, msg->msg_name); addr->nl_family = AF_NETLINK; addr->nl_pad = 0; addr->nl_pid = NETLINK_CB(skb).portid; addr->nl_groups = netlink_group_mask(NETLINK_CB(skb).dst_group); msg->msg_namelen = sizeof(*addr); } if (nlk_test_bit(RECV_PKTINFO, sk)) netlink_cmsg_recv_pktinfo(msg, skb); if (nlk_test_bit(LISTEN_ALL_NSID, sk)) netlink_cmsg_listen_all_nsid(sk, msg, skb); memset(&scm, 0, sizeof(scm)); scm.creds = *NETLINK_CREDS(skb); if (flags & MSG_TRUNC) copied = data_skb->len; skb_free_datagram(sk, skb); if (READ_ONCE(nlk->cb_running) && atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf / 2) { ret = netlink_dump(sk); if (ret) { WRITE_ONCE(sk->sk_err, -ret); sk_error_report(sk); } } scm_recv(sock, msg, &scm, flags); out: netlink_rcv_wake(sk); return err ? : copied; } static void netlink_data_ready(struct sock *sk) { BUG(); } /* * We export these functions to other modules. They provide a * complete set of kernel non-blocking support for message * queueing. */ struct sock * __netlink_kernel_create(struct net *net, int unit, struct module *module, struct netlink_kernel_cfg *cfg) { struct socket *sock; struct sock *sk; struct netlink_sock *nlk; struct listeners *listeners = NULL; struct mutex *cb_mutex = cfg ? cfg->cb_mutex : NULL; unsigned int groups; BUG_ON(!nl_table); if (unit < 0 || unit >= MAX_LINKS) return NULL; if (sock_create_lite(PF_NETLINK, SOCK_DGRAM, unit, &sock)) return NULL; if (__netlink_create(net, sock, cb_mutex, unit, 1) < 0) goto out_sock_release_nosk; sk = sock->sk; if (!cfg || cfg->groups < 32) groups = 32; else groups = cfg->groups; listeners = kzalloc(sizeof(*listeners) + NLGRPSZ(groups), GFP_KERNEL); if (!listeners) goto out_sock_release; sk->sk_data_ready = netlink_data_ready; if (cfg && cfg->input) nlk_sk(sk)->netlink_rcv = cfg->input; if (netlink_insert(sk, 0)) goto out_sock_release; nlk = nlk_sk(sk); set_bit(NETLINK_F_KERNEL_SOCKET, &nlk->flags); netlink_table_grab(); if (!nl_table[unit].registered) { nl_table[unit].groups = groups; rcu_assign_pointer(nl_table[unit].listeners, listeners); nl_table[unit].cb_mutex = cb_mutex; nl_table[unit].module = module; if (cfg) { nl_table[unit].bind = cfg->bind; nl_table[unit].unbind = cfg->unbind; nl_table[unit].release = cfg->release; nl_table[unit].flags = cfg->flags; } nl_table[unit].registered = 1; } else { kfree(listeners); nl_table[unit].registered++; } netlink_table_ungrab(); return sk; out_sock_release: kfree(listeners); netlink_kernel_release(sk); return NULL; out_sock_release_nosk: sock_release(sock); return NULL; } EXPORT_SYMBOL(__netlink_kernel_create); void netlink_kernel_release(struct sock *sk) { if (sk == NULL || sk->sk_socket == NULL) return; sock_release(sk->sk_socket); } EXPORT_SYMBOL(netlink_kernel_release); int __netlink_change_ngroups(struct sock *sk, unsigned int groups) { struct listeners *new, *old; struct netlink_table *tbl = &nl_table[sk->sk_protocol]; if (groups < 32) groups = 32; if (NLGRPSZ(tbl->groups) < NLGRPSZ(groups)) { new = kzalloc(sizeof(*new) + NLGRPSZ(groups), GFP_ATOMIC); if (!new) return -ENOMEM; old = nl_deref_protected(tbl->listeners); memcpy(new->masks, old->masks, NLGRPSZ(tbl->groups)); rcu_assign_pointer(tbl->listeners, new); kfree_rcu(old, rcu); } tbl->groups = groups; return 0; } /** * netlink_change_ngroups - change number of multicast groups * * This changes the number of multicast groups that are available * on a certain netlink family. Note that it is not possible to * change the number of groups to below 32. Also note that it does * not implicitly call netlink_clear_multicast_users() when the * number of groups is reduced. * * @sk: The kernel netlink socket, as returned by netlink_kernel_create(). * @groups: The new number of groups. */ int netlink_change_ngroups(struct sock *sk, unsigned int groups) { int err; netlink_table_grab(); err = __netlink_change_ngroups(sk, groups); netlink_table_ungrab(); return err; } void __netlink_clear_multicast_users(struct sock *ksk, unsigned int group) { struct sock *sk; struct netlink_table *tbl = &nl_table[ksk->sk_protocol]; sk_for_each_bound(sk, &tbl->mc_list) netlink_update_socket_mc(nlk_sk(sk), group, 0); } struct nlmsghdr * __nlmsg_put(struct sk_buff *skb, u32 portid, u32 seq, int type, int len, int flags) { struct nlmsghdr *nlh; int size = nlmsg_msg_size(len); nlh = skb_put(skb, NLMSG_ALIGN(size)); nlh->nlmsg_type = type; nlh->nlmsg_len = size; nlh->nlmsg_flags = flags; nlh->nlmsg_pid = portid; nlh->nlmsg_seq = seq; if (!__builtin_constant_p(size) || NLMSG_ALIGN(size) - size != 0) memset(nlmsg_data(nlh) + len, 0, NLMSG_ALIGN(size) - size); return nlh; } EXPORT_SYMBOL(__nlmsg_put); /* * It looks a bit ugly. * It would be better to create kernel thread. */ static int netlink_dump_done(struct netlink_sock *nlk, struct sk_buff *skb, struct netlink_callback *cb, struct netlink_ext_ack *extack) { struct nlmsghdr *nlh; nlh = nlmsg_put_answer(skb, cb, NLMSG_DONE, sizeof(nlk->dump_done_errno), NLM_F_MULTI | cb->answer_flags); if (WARN_ON(!nlh)) return -ENOBUFS; nl_dump_check_consistent(cb, nlh); memcpy(nlmsg_data(nlh), &nlk->dump_done_errno, sizeof(nlk->dump_done_errno)); if (extack->_msg && test_bit(NETLINK_F_EXT_ACK, &nlk->flags)) { nlh->nlmsg_flags |= NLM_F_ACK_TLVS; if (!nla_put_string(skb, NLMSGERR_ATTR_MSG, extack->_msg)) nlmsg_end(skb, nlh); } return 0; } static int netlink_dump(struct sock *sk) { struct netlink_sock *nlk = nlk_sk(sk); struct netlink_ext_ack extack = {}; struct netlink_callback *cb; struct sk_buff *skb = NULL; size_t max_recvmsg_len; struct module *module; int err = -ENOBUFS; int alloc_min_size; int alloc_size; mutex_lock(nlk->cb_mutex); if (!nlk->cb_running) { err = -EINVAL; goto errout_skb; } if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) goto errout_skb; /* NLMSG_GOODSIZE is small to avoid high order allocations being * required, but it makes sense to _attempt_ a 16K bytes allocation * to reduce number of system calls on dump operations, if user * ever provided a big enough buffer. */ cb = &nlk->cb; alloc_min_size = max_t(int, cb->min_dump_alloc, NLMSG_GOODSIZE); max_recvmsg_len = READ_ONCE(nlk->max_recvmsg_len); if (alloc_min_size < max_recvmsg_len) { alloc_size = max_recvmsg_len; skb = alloc_skb(alloc_size, (GFP_KERNEL & ~__GFP_DIRECT_RECLAIM) | __GFP_NOWARN | __GFP_NORETRY); } if (!skb) { alloc_size = alloc_min_size; skb = alloc_skb(alloc_size, GFP_KERNEL); } if (!skb) goto errout_skb; /* Trim skb to allocated size. User is expected to provide buffer as * large as max(min_dump_alloc, 16KiB (mac_recvmsg_len capped at * netlink_recvmsg())). dump will pack as many smaller messages as * could fit within the allocated skb. skb is typically allocated * with larger space than required (could be as much as near 2x the * requested size with align to next power of 2 approach). Allowing * dump to use the excess space makes it difficult for a user to have a * reasonable static buffer based on the expected largest dump of a * single netdev. The outcome is MSG_TRUNC error. */ skb_reserve(skb, skb_tailroom(skb) - alloc_size); /* Make sure malicious BPF programs can not read unitialized memory * from skb->head -> skb->data */ skb_reset_network_header(skb); skb_reset_mac_header(skb); netlink_skb_set_owner_r(skb, sk); if (nlk->dump_done_errno > 0) { cb->extack = &extack; nlk->dump_done_errno = cb->dump(skb, cb); cb->extack = NULL; } if (nlk->dump_done_errno > 0 || skb_tailroom(skb) < nlmsg_total_size(sizeof(nlk->dump_done_errno))) { mutex_unlock(nlk->cb_mutex); if (sk_filter(sk, skb)) kfree_skb(skb); else __netlink_sendskb(sk, skb); return 0; } if (netlink_dump_done(nlk, skb, cb, &extack)) goto errout_skb; #ifdef CONFIG_COMPAT_NETLINK_MESSAGES /* frag_list skb's data is used for compat tasks * and the regular skb's data for normal (non-compat) tasks. * See netlink_recvmsg(). */ if (unlikely(skb_shinfo(skb)->frag_list)) { if (netlink_dump_done(nlk, skb_shinfo(skb)->frag_list, cb, &extack)) goto errout_skb; } #endif if (sk_filter(sk, skb)) kfree_skb(skb); else __netlink_sendskb(sk, skb); if (cb->done) cb->done(cb); WRITE_ONCE(nlk->cb_running, false); module = cb->module; skb = cb->skb; mutex_unlock(nlk->cb_mutex); module_put(module); consume_skb(skb); return 0; errout_skb: mutex_unlock(nlk->cb_mutex); kfree_skb(skb); return err; } int __netlink_dump_start(struct sock *ssk, struct sk_buff *skb, const struct nlmsghdr *nlh, struct netlink_dump_control *control) { struct netlink_callback *cb; struct netlink_sock *nlk; struct sock *sk; int ret; refcount_inc(&skb->users); sk = netlink_lookup(sock_net(ssk), ssk->sk_protocol, NETLINK_CB(skb).portid); if (sk == NULL) { ret = -ECONNREFUSED; goto error_free; } nlk = nlk_sk(sk); mutex_lock(nlk->cb_mutex); /* A dump is in progress... */ if (nlk->cb_running) { ret = -EBUSY; goto error_unlock; } /* add reference of module which cb->dump belongs to */ if (!try_module_get(control->module)) { ret = -EPROTONOSUPPORT; goto error_unlock; } cb = &nlk->cb; memset(cb, 0, sizeof(*cb)); cb->dump = control->dump; cb->done = control->done; cb->nlh = nlh; cb->data = control->data; cb->module = control->module; cb->min_dump_alloc = control->min_dump_alloc; cb->skb = skb; cb->strict_check = nlk_test_bit(STRICT_CHK, NETLINK_CB(skb).sk); if (control->start) { cb->extack = control->extack; ret = control->start(cb); cb->extack = NULL; if (ret) goto error_put; } WRITE_ONCE(nlk->cb_running, true); nlk->dump_done_errno = INT_MAX; mutex_unlock(nlk->cb_mutex); ret = netlink_dump(sk); sock_put(sk); if (ret) return ret; /* We successfully started a dump, by returning -EINTR we * signal not to send ACK even if it was requested. */ return -EINTR; error_put: module_put(control->module); error_unlock: sock_put(sk); mutex_unlock(nlk->cb_mutex); error_free: kfree_skb(skb); return ret; } EXPORT_SYMBOL(__netlink_dump_start); static size_t netlink_ack_tlv_len(struct netlink_sock *nlk, int err, const struct netlink_ext_ack *extack) { size_t tlvlen; if (!extack || !test_bit(NETLINK_F_EXT_ACK, &nlk->flags)) return 0; tlvlen = 0; if (extack->_msg) tlvlen += nla_total_size(strlen(extack->_msg) + 1); if (extack->cookie_len) tlvlen += nla_total_size(extack->cookie_len); /* Following attributes are only reported as error (not warning) */ if (!err) return tlvlen; if (extack->bad_attr) tlvlen += nla_total_size(sizeof(u32)); if (extack->policy) tlvlen += netlink_policy_dump_attr_size_estimate(extack->policy); if (extack->miss_type) tlvlen += nla_total_size(sizeof(u32)); if (extack->miss_nest) tlvlen += nla_total_size(sizeof(u32)); return tlvlen; } static void netlink_ack_tlv_fill(struct sk_buff *in_skb, struct sk_buff *skb, struct nlmsghdr *nlh, int err, const struct netlink_ext_ack *extack) { if (extack->_msg) WARN_ON(nla_put_string(skb, NLMSGERR_ATTR_MSG, extack->_msg)); if (extack->cookie_len) WARN_ON(nla_put(skb, NLMSGERR_ATTR_COOKIE, extack->cookie_len, extack->cookie)); if (!err) return; if (extack->bad_attr && !WARN_ON((u8 *)extack->bad_attr < in_skb->data || (u8 *)extack->bad_attr >= in_skb->data + in_skb->len)) WARN_ON(nla_put_u32(skb, NLMSGERR_ATTR_OFFS, (u8 *)extack->bad_attr - (u8 *)nlh)); if (extack->policy) netlink_policy_dump_write_attr(skb, extack->policy, NLMSGERR_ATTR_POLICY); if (extack->miss_type) WARN_ON(nla_put_u32(skb, NLMSGERR_ATTR_MISS_TYPE, extack->miss_type)); if (extack->miss_nest && !WARN_ON((u8 *)extack->miss_nest < in_skb->data || (u8 *)extack->miss_nest > in_skb->data + in_skb->len)) WARN_ON(nla_put_u32(skb, NLMSGERR_ATTR_MISS_NEST, (u8 *)extack->miss_nest - (u8 *)nlh)); } void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err, const struct netlink_ext_ack *extack) { struct sk_buff *skb; struct nlmsghdr *rep; struct nlmsgerr *errmsg; size_t payload = sizeof(*errmsg); struct netlink_sock *nlk = nlk_sk(NETLINK_CB(in_skb).sk); unsigned int flags = 0; size_t tlvlen; /* Error messages get the original request appened, unless the user * requests to cap the error message, and get extra error data if * requested. */ if (err && !test_bit(NETLINK_F_CAP_ACK, &nlk->flags)) payload += nlmsg_len(nlh); else flags |= NLM_F_CAPPED; tlvlen = netlink_ack_tlv_len(nlk, err, extack); if (tlvlen) flags |= NLM_F_ACK_TLVS; skb = nlmsg_new(payload + tlvlen, GFP_KERNEL); if (!skb) goto err_skb; rep = nlmsg_put(skb, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, NLMSG_ERROR, sizeof(*errmsg), flags); if (!rep) goto err_bad_put; errmsg = nlmsg_data(rep); errmsg->error = err; errmsg->msg = *nlh; if (!(flags & NLM_F_CAPPED)) { if (!nlmsg_append(skb, nlmsg_len(nlh))) goto err_bad_put; memcpy(nlmsg_data(&errmsg->msg), nlmsg_data(nlh), nlmsg_len(nlh)); } if (tlvlen) netlink_ack_tlv_fill(in_skb, skb, nlh, err, extack); nlmsg_end(skb, rep); nlmsg_unicast(in_skb->sk, skb, NETLINK_CB(in_skb).portid); return; err_bad_put: nlmsg_free(skb); err_skb: WRITE_ONCE(NETLINK_CB(in_skb).sk->sk_err, ENOBUFS); sk_error_report(NETLINK_CB(in_skb).sk); } EXPORT_SYMBOL(netlink_ack); int netlink_rcv_skb(struct sk_buff *skb, int (*cb)(struct sk_buff *, struct nlmsghdr *, struct netlink_ext_ack *)) { struct netlink_ext_ack extack; struct nlmsghdr *nlh; int err; while (skb->len >= nlmsg_total_size(0)) { int msglen; memset(&extack, 0, sizeof(extack)); nlh = nlmsg_hdr(skb); err = 0; if (nlh->nlmsg_len < NLMSG_HDRLEN || skb->len < nlh->nlmsg_len) return 0; /* Only requests are handled by the kernel */ if (!(nlh->nlmsg_flags & NLM_F_REQUEST)) goto ack; /* Skip control messages */ if (nlh->nlmsg_type < NLMSG_MIN_TYPE) goto ack; err = cb(skb, nlh, &extack); if (err == -EINTR) goto skip; ack: if (nlh->nlmsg_flags & NLM_F_ACK || err) netlink_ack(skb, nlh, err, &extack); skip: msglen = NLMSG_ALIGN(nlh->nlmsg_len); if (msglen > skb->len) msglen = skb->len; skb_pull(skb, msglen); } return 0; } EXPORT_SYMBOL(netlink_rcv_skb); /** * nlmsg_notify - send a notification netlink message * @sk: netlink socket to use * @skb: notification message * @portid: destination netlink portid for reports or 0 * @group: destination multicast group or 0 * @report: 1 to report back, 0 to disable * @flags: allocation flags */ int nlmsg_notify(struct sock *sk, struct sk_buff *skb, u32 portid, unsigned int group, int report, gfp_t flags) { int err = 0; if (group) { int exclude_portid = 0; if (report) { refcount_inc(&skb->users); exclude_portid = portid; } /* errors reported via destination sk->sk_err, but propagate * delivery errors if NETLINK_BROADCAST_ERROR flag is set */ err = nlmsg_multicast(sk, skb, exclude_portid, group, flags); if (err == -ESRCH) err = 0; } if (report) { int err2; err2 = nlmsg_unicast(sk, skb, portid); if (!err) err = err2; } return err; } EXPORT_SYMBOL(nlmsg_notify); #ifdef CONFIG_PROC_FS struct nl_seq_iter { struct seq_net_private p; struct rhashtable_iter hti; int link; }; static void netlink_walk_start(struct nl_seq_iter *iter) { rhashtable_walk_enter(&nl_table[iter->link].hash, &iter->hti); rhashtable_walk_start(&iter->hti); } static void netlink_walk_stop(struct nl_seq_iter *iter) { rhashtable_walk_stop(&iter->hti); rhashtable_walk_exit(&iter->hti); } static void *__netlink_seq_next(struct seq_file *seq) { struct nl_seq_iter *iter = seq->private; struct netlink_sock *nlk; do { for (;;) { nlk = rhashtable_walk_next(&iter->hti); if (IS_ERR(nlk)) { if (PTR_ERR(nlk) == -EAGAIN) continue; return nlk; } if (nlk) break; netlink_walk_stop(iter); if (++iter->link >= MAX_LINKS) return NULL; netlink_walk_start(iter); } } while (sock_net(&nlk->sk) != seq_file_net(seq)); return nlk; } static void *netlink_seq_start(struct seq_file *seq, loff_t *posp) __acquires(RCU) { struct nl_seq_iter *iter = seq->private; void *obj = SEQ_START_TOKEN; loff_t pos; iter->link = 0; netlink_walk_start(iter); for (pos = *posp; pos && obj && !IS_ERR(obj); pos--) obj = __netlink_seq_next(seq); return obj; } static void *netlink_seq_next(struct seq_file *seq, void *v, loff_t *pos) { ++*pos; return __netlink_seq_next(seq); } static void netlink_native_seq_stop(struct seq_file *seq, void *v) { struct nl_seq_iter *iter = seq->private; if (iter->link >= MAX_LINKS) return; netlink_walk_stop(iter); } static int netlink_native_seq_show(struct seq_file *seq, void *v) { if (v == SEQ_START_TOKEN) { seq_puts(seq, "sk Eth Pid Groups " "Rmem Wmem Dump Locks Drops Inode\n"); } else { struct sock *s = v; struct netlink_sock *nlk = nlk_sk(s); seq_printf(seq, "%pK %-3d %-10u %08x %-8d %-8d %-5d %-8d %-8u %-8lu\n", s, s->sk_protocol, nlk->portid, nlk->groups ? (u32)nlk->groups[0] : 0, sk_rmem_alloc_get(s), sk_wmem_alloc_get(s), READ_ONCE(nlk->cb_running), refcount_read(&s->sk_refcnt), atomic_read(&s->sk_drops), sock_i_ino(s) ); } return 0; } #ifdef CONFIG_BPF_SYSCALL struct bpf_iter__netlink { __bpf_md_ptr(struct bpf_iter_meta *, meta); __bpf_md_ptr(struct netlink_sock *, sk); }; DEFINE_BPF_ITER_FUNC(netlink, struct bpf_iter_meta *meta, struct netlink_sock *sk) static int netlink_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta, void *v) { struct bpf_iter__netlink ctx; meta->seq_num--; /* skip SEQ_START_TOKEN */ ctx.meta = meta; ctx.sk = nlk_sk((struct sock *)v); return bpf_iter_run_prog(prog, &ctx); } static int netlink_seq_show(struct seq_file *seq, void *v) { struct bpf_iter_meta meta; struct bpf_prog *prog; meta.seq = seq; prog = bpf_iter_get_info(&meta, false); if (!prog) return netlink_native_seq_show(seq, v); if (v != SEQ_START_TOKEN) return netlink_prog_seq_show(prog, &meta, v); return 0; } static void netlink_seq_stop(struct seq_file *seq, void *v) { struct bpf_iter_meta meta; struct bpf_prog *prog; if (!v) { meta.seq = seq; prog = bpf_iter_get_info(&meta, true); if (prog) (void)netlink_prog_seq_show(prog, &meta, v); } netlink_native_seq_stop(seq, v); } #else static int netlink_seq_show(struct seq_file *seq, void *v) { return netlink_native_seq_show(seq, v); } static void netlink_seq_stop(struct seq_file *seq, void *v) { netlink_native_seq_stop(seq, v); } #endif static const struct seq_operations netlink_seq_ops = { .start = netlink_seq_start, .next = netlink_seq_next, .stop = netlink_seq_stop, .show = netlink_seq_show, }; #endif int netlink_register_notifier(struct notifier_block *nb) { return blocking_notifier_chain_register(&netlink_chain, nb); } EXPORT_SYMBOL(netlink_register_notifier); int netlink_unregister_notifier(struct notifier_block *nb) { return blocking_notifier_chain_unregister(&netlink_chain, nb); } EXPORT_SYMBOL(netlink_unregister_notifier); static const struct proto_ops netlink_ops = { .family = PF_NETLINK, .owner = THIS_MODULE, .release = netlink_release, .bind = netlink_bind, .connect = netlink_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = netlink_getname, .poll = datagram_poll, .ioctl = netlink_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .setsockopt = netlink_setsockopt, .getsockopt = netlink_getsockopt, .sendmsg = netlink_sendmsg, .recvmsg = netlink_recvmsg, .mmap = sock_no_mmap, }; static const struct net_proto_family netlink_family_ops = { .family = PF_NETLINK, .create = netlink_create, .owner = THIS_MODULE, /* for consistency 8) */ }; static int __net_init netlink_net_init(struct net *net) { #ifdef CONFIG_PROC_FS if (!proc_create_net("netlink", 0, net->proc_net, &netlink_seq_ops, sizeof(struct nl_seq_iter))) return -ENOMEM; #endif return 0; } static void __net_exit netlink_net_exit(struct net *net) { #ifdef CONFIG_PROC_FS remove_proc_entry("netlink", net->proc_net); #endif } static void __init netlink_add_usersock_entry(void) { struct listeners *listeners; int groups = 32; listeners = kzalloc(sizeof(*listeners) + NLGRPSZ(groups), GFP_KERNEL); if (!listeners) panic("netlink_add_usersock_entry: Cannot allocate listeners\n"); netlink_table_grab(); nl_table[NETLINK_USERSOCK].groups = groups; rcu_assign_pointer(nl_table[NETLINK_USERSOCK].listeners, listeners); nl_table[NETLINK_USERSOCK].module = THIS_MODULE; nl_table[NETLINK_USERSOCK].registered = 1; nl_table[NETLINK_USERSOCK].flags = NL_CFG_F_NONROOT_SEND; netlink_table_ungrab(); } static struct pernet_operations __net_initdata netlink_net_ops = { .init = netlink_net_init, .exit = netlink_net_exit, }; static inline u32 netlink_hash(const void *data, u32 len, u32 seed) { const struct netlink_sock *nlk = data; struct netlink_compare_arg arg; netlink_compare_arg_init(&arg, sock_net(&nlk->sk), nlk->portid); return jhash2((u32 *)&arg, netlink_compare_arg_len / sizeof(u32), seed); } static const struct rhashtable_params netlink_rhashtable_params = { .head_offset = offsetof(struct netlink_sock, node), .key_len = netlink_compare_arg_len, .obj_hashfn = netlink_hash, .obj_cmpfn = netlink_compare, .automatic_shrinking = true, }; #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) BTF_ID_LIST(btf_netlink_sock_id) BTF_ID(struct, netlink_sock) static const struct bpf_iter_seq_info netlink_seq_info = { .seq_ops = &netlink_seq_ops, .init_seq_private = bpf_iter_init_seq_net, .fini_seq_private = bpf_iter_fini_seq_net, .seq_priv_size = sizeof(struct nl_seq_iter), }; static struct bpf_iter_reg netlink_reg_info = { .target = "netlink", .ctx_arg_info_size = 1, .ctx_arg_info = { { offsetof(struct bpf_iter__netlink, sk), PTR_TO_BTF_ID_OR_NULL }, }, .seq_info = &netlink_seq_info, }; static int __init bpf_iter_register(void) { netlink_reg_info.ctx_arg_info[0].btf_id = *btf_netlink_sock_id; return bpf_iter_reg_target(&netlink_reg_info); } #endif static int __init netlink_proto_init(void) { int i; int err = proto_register(&netlink_proto, 0); if (err != 0) goto out; #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) err = bpf_iter_register(); if (err) goto out; #endif BUILD_BUG_ON(sizeof(struct netlink_skb_parms) > sizeof_field(struct sk_buff, cb)); nl_table = kcalloc(MAX_LINKS, sizeof(*nl_table), GFP_KERNEL); if (!nl_table) goto panic; for (i = 0; i < MAX_LINKS; i++) { if (rhashtable_init(&nl_table[i].hash, &netlink_rhashtable_params) < 0) { while (--i > 0) rhashtable_destroy(&nl_table[i].hash); kfree(nl_table); goto panic; } } netlink_add_usersock_entry(); sock_register(&netlink_family_ops); register_pernet_subsys(&netlink_net_ops); register_pernet_subsys(&netlink_tap_net_ops); /* The netlink device handler may be needed early. */ rtnetlink_init(); out: return err; panic: panic("netlink_init: Cannot allocate nl_table\n"); } core_initcall(netlink_proto_init); |
| 2 2 2 2 2 2 2 2 2 2 2 2 2 396 396 396 5 5 5 5 5 5 2 5 2 5 2 2 2 2 2 5 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 | // SPDX-License-Identifier: GPL-2.0-or-later /* * NET3: Garbage Collector For AF_UNIX sockets * * Garbage Collector: * Copyright (C) Barak A. Pearlmutter. * * Chopped about by Alan Cox 22/3/96 to make it fit the AF_UNIX socket problem. * If it doesn't work blame me, it worked when Barak sent it. * * Assumptions: * * - object w/ a bit * - free list * * Current optimizations: * * - explicit stack instead of recursion * - tail recurse on first born instead of immediate push/pop * - we gather the stuff that should not be killed into tree * and stack is just a path from root to the current pointer. * * Future optimizations: * * - don't just push entire root set; process in place * * Fixes: * Alan Cox 07 Sept 1997 Vmalloc internal stack as needed. * Cope with changing max_files. * Al Viro 11 Oct 1998 * Graph may have cycles. That is, we can send the descriptor * of foo to bar and vice versa. Current code chokes on that. * Fix: move SCM_RIGHTS ones into the separate list and then * skb_free() them all instead of doing explicit fput's. * Another problem: since fput() may block somebody may * create a new unix_socket when we are in the middle of sweep * phase. Fix: revert the logic wrt MARKED. Mark everything * upon the beginning and unmark non-junk ones. * * [12 Oct 1998] AAARGH! New code purges all SCM_RIGHTS * sent to connect()'ed but still not accept()'ed sockets. * Fixed. Old code had slightly different problem here: * extra fput() in situation when we passed the descriptor via * such socket and closed it (descriptor). That would happen on * each unix_gc() until the accept(). Since the struct file in * question would go to the free list and might be reused... * That might be the reason of random oopses on filp_close() * in unrelated processes. * * AV 28 Feb 1999 * Kill the explicit allocation of stack. Now we keep the tree * with root in dummy + pointer (gc_current) to one of the nodes. * Stack is represented as path from gc_current to dummy. Unmark * now means "add to tree". Push == "make it a son of gc_current". * Pop == "move gc_current to parent". We keep only pointers to * parents (->gc_tree). * AV 1 Mar 1999 * Damn. Added missing check for ->dead in listen queues scanning. * * Miklos Szeredi 25 Jun 2007 * Reimplement with a cycle collecting algorithm. This should * solve several problems with the previous code, like being racy * wrt receive and holding up unrelated socket operations. */ #include <linux/kernel.h> #include <linux/string.h> #include <linux/socket.h> #include <linux/un.h> #include <linux/net.h> #include <linux/fs.h> #include <linux/skbuff.h> #include <linux/netdevice.h> #include <linux/file.h> #include <linux/proc_fs.h> #include <linux/mutex.h> #include <linux/wait.h> #include <net/sock.h> #include <net/af_unix.h> #include <net/scm.h> #include <net/tcp_states.h> #include "scm.h" /* Internal data structures and random procedures: */ static LIST_HEAD(gc_candidates); static DECLARE_WAIT_QUEUE_HEAD(unix_gc_wait); static void scan_inflight(struct sock *x, void (*func)(struct unix_sock *), struct sk_buff_head *hitlist) { struct sk_buff *skb; struct sk_buff *next; spin_lock(&x->sk_receive_queue.lock); skb_queue_walk_safe(&x->sk_receive_queue, skb, next) { /* Do we have file descriptors ? */ if (UNIXCB(skb).fp) { bool hit = false; /* Process the descriptors of this socket */ int nfd = UNIXCB(skb).fp->count; struct file **fp = UNIXCB(skb).fp->fp; while (nfd--) { /* Get the socket the fd matches if it indeed does so */ struct sock *sk = unix_get_socket(*fp++); if (sk) { struct unix_sock *u = unix_sk(sk); /* Ignore non-candidates, they could * have been added to the queues after * starting the garbage collection */ if (test_bit(UNIX_GC_CANDIDATE, &u->gc_flags)) { hit = true; func(u); } } } if (hit && hitlist != NULL) { __skb_unlink(skb, &x->sk_receive_queue); __skb_queue_tail(hitlist, skb); } } } spin_unlock(&x->sk_receive_queue.lock); } static void scan_children(struct sock *x, void (*func)(struct unix_sock *), struct sk_buff_head *hitlist) { if (x->sk_state != TCP_LISTEN) { scan_inflight(x, func, hitlist); } else { struct sk_buff *skb; struct sk_buff *next; struct unix_sock *u; LIST_HEAD(embryos); /* For a listening socket collect the queued embryos * and perform a scan on them as well. */ spin_lock(&x->sk_receive_queue.lock); skb_queue_walk_safe(&x->sk_receive_queue, skb, next) { u = unix_sk(skb->sk); /* An embryo cannot be in-flight, so it's safe * to use the list link. */ BUG_ON(!list_empty(&u->link)); list_add_tail(&u->link, &embryos); } spin_unlock(&x->sk_receive_queue.lock); while (!list_empty(&embryos)) { u = list_entry(embryos.next, struct unix_sock, link); scan_inflight(&u->sk, func, hitlist); list_del_init(&u->link); } } } static void dec_inflight(struct unix_sock *usk) { atomic_long_dec(&usk->inflight); } static void inc_inflight(struct unix_sock *usk) { atomic_long_inc(&usk->inflight); } static void inc_inflight_move_tail(struct unix_sock *u) { atomic_long_inc(&u->inflight); /* If this still might be part of a cycle, move it to the end * of the list, so that it's checked even if it was already * passed over */ if (test_bit(UNIX_GC_MAYBE_CYCLE, &u->gc_flags)) list_move_tail(&u->link, &gc_candidates); } static bool gc_in_progress; #define UNIX_INFLIGHT_TRIGGER_GC 16000 void wait_for_unix_gc(void) { /* If number of inflight sockets is insane, * force a garbage collect right now. * Paired with the WRITE_ONCE() in unix_inflight(), * unix_notinflight() and gc_in_progress(). */ if (READ_ONCE(unix_tot_inflight) > UNIX_INFLIGHT_TRIGGER_GC && !READ_ONCE(gc_in_progress)) unix_gc(); wait_event(unix_gc_wait, gc_in_progress == false); } /* The external entry point: unix_gc() */ void unix_gc(void) { struct sk_buff *next_skb, *skb; struct unix_sock *u; struct unix_sock *next; struct sk_buff_head hitlist; struct list_head cursor; LIST_HEAD(not_cycle_list); spin_lock(&unix_gc_lock); /* Avoid a recursive GC. */ if (gc_in_progress) goto out; /* Paired with READ_ONCE() in wait_for_unix_gc(). */ WRITE_ONCE(gc_in_progress, true); /* First, select candidates for garbage collection. Only * in-flight sockets are considered, and from those only ones * which don't have any external reference. * * Holding unix_gc_lock will protect these candidates from * being detached, and hence from gaining an external * reference. Since there are no possible receivers, all * buffers currently on the candidates' queues stay there * during the garbage collection. * * We also know that no new candidate can be added onto the * receive queues. Other, non candidate sockets _can_ be * added to queue, so we must make sure only to touch * candidates. */ list_for_each_entry_safe(u, next, &gc_inflight_list, link) { long total_refs; long inflight_refs; total_refs = file_count(u->sk.sk_socket->file); inflight_refs = atomic_long_read(&u->inflight); BUG_ON(inflight_refs < 1); BUG_ON(total_refs < inflight_refs); if (total_refs == inflight_refs) { list_move_tail(&u->link, &gc_candidates); __set_bit(UNIX_GC_CANDIDATE, &u->gc_flags); __set_bit(UNIX_GC_MAYBE_CYCLE, &u->gc_flags); } } /* Now remove all internal in-flight reference to children of * the candidates. */ list_for_each_entry(u, &gc_candidates, link) scan_children(&u->sk, dec_inflight, NULL); /* Restore the references for children of all candidates, * which have remaining references. Do this recursively, so * only those remain, which form cyclic references. * * Use a "cursor" link, to make the list traversal safe, even * though elements might be moved about. */ list_add(&cursor, &gc_candidates); while (cursor.next != &gc_candidates) { u = list_entry(cursor.next, struct unix_sock, link); /* Move cursor to after the current position. */ list_move(&cursor, &u->link); if (atomic_long_read(&u->inflight) > 0) { list_move_tail(&u->link, ¬_cycle_list); __clear_bit(UNIX_GC_MAYBE_CYCLE, &u->gc_flags); scan_children(&u->sk, inc_inflight_move_tail, NULL); } } list_del(&cursor); /* Now gc_candidates contains only garbage. Restore original * inflight counters for these as well, and remove the skbuffs * which are creating the cycle(s). */ skb_queue_head_init(&hitlist); list_for_each_entry(u, &gc_candidates, link) scan_children(&u->sk, inc_inflight, &hitlist); /* not_cycle_list contains those sockets which do not make up a * cycle. Restore these to the inflight list. */ while (!list_empty(¬_cycle_list)) { u = list_entry(not_cycle_list.next, struct unix_sock, link); __clear_bit(UNIX_GC_CANDIDATE, &u->gc_flags); list_move_tail(&u->link, &gc_inflight_list); } spin_unlock(&unix_gc_lock); /* We need io_uring to clean its registered files, ignore all io_uring * originated skbs. It's fine as io_uring doesn't keep references to * other io_uring instances and so killing all other files in the cycle * will put all io_uring references forcing it to go through normal * release.path eventually putting registered files. */ skb_queue_walk_safe(&hitlist, skb, next_skb) { if (skb->destructor == io_uring_destruct_scm) { __skb_unlink(skb, &hitlist); skb_queue_tail(&skb->sk->sk_receive_queue, skb); } } /* Here we are. Hitlist is filled. Die. */ __skb_queue_purge(&hitlist); spin_lock(&unix_gc_lock); /* There could be io_uring registered files, just push them back to * the inflight list */ list_for_each_entry_safe(u, next, &gc_candidates, link) list_move_tail(&u->link, &gc_inflight_list); /* All candidates should have been detached by now. */ BUG_ON(!list_empty(&gc_candidates)); /* Paired with READ_ONCE() in wait_for_unix_gc(). */ WRITE_ONCE(gc_in_progress, false); wake_up(&unix_gc_wait); out: spin_unlock(&unix_gc_lock); } |
| 6 1503 1636 1566 1125 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 | /* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (c) 2009-2021 Christoph Hellwig * * NOTE: none of these tracepoints shall be considered a stable kernel ABI * as they can change at any time. * * Current conventions for printing numbers measuring specific units: * * offset: byte offset into a subcomponent of a file operation * pos: file offset, in bytes * length: length of a file operation, in bytes * ino: inode number * * Numbers describing space allocations should be formatted in hexadecimal. */ #undef TRACE_SYSTEM #define TRACE_SYSTEM iomap #if !defined(_IOMAP_TRACE_H) || defined(TRACE_HEADER_MULTI_READ) #define _IOMAP_TRACE_H #include <linux/tracepoint.h> struct inode; DECLARE_EVENT_CLASS(iomap_readpage_class, TP_PROTO(struct inode *inode, int nr_pages), TP_ARGS(inode, nr_pages), TP_STRUCT__entry( __field(dev_t, dev) __field(u64, ino) __field(int, nr_pages) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->nr_pages = nr_pages; ), TP_printk("dev %d:%d ino 0x%llx nr_pages %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->nr_pages) ) #define DEFINE_READPAGE_EVENT(name) \ DEFINE_EVENT(iomap_readpage_class, name, \ TP_PROTO(struct inode *inode, int nr_pages), \ TP_ARGS(inode, nr_pages)) DEFINE_READPAGE_EVENT(iomap_readpage); DEFINE_READPAGE_EVENT(iomap_readahead); DECLARE_EVENT_CLASS(iomap_range_class, TP_PROTO(struct inode *inode, loff_t off, u64 len), TP_ARGS(inode, off, len), TP_STRUCT__entry( __field(dev_t, dev) __field(u64, ino) __field(loff_t, size) __field(loff_t, offset) __field(u64, length) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->size = i_size_read(inode); __entry->offset = off; __entry->length = len; ), TP_printk("dev %d:%d ino 0x%llx size 0x%llx offset 0x%llx length 0x%llx", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->size, __entry->offset, __entry->length) ) #define DEFINE_RANGE_EVENT(name) \ DEFINE_EVENT(iomap_range_class, name, \ TP_PROTO(struct inode *inode, loff_t off, u64 len),\ TP_ARGS(inode, off, len)) DEFINE_RANGE_EVENT(iomap_writepage); DEFINE_RANGE_EVENT(iomap_release_folio); DEFINE_RANGE_EVENT(iomap_invalidate_folio); DEFINE_RANGE_EVENT(iomap_dio_invalidate_fail); DEFINE_RANGE_EVENT(iomap_dio_rw_queued); #define IOMAP_TYPE_STRINGS \ { IOMAP_HOLE, "HOLE" }, \ { IOMAP_DELALLOC, "DELALLOC" }, \ { IOMAP_MAPPED, "MAPPED" }, \ { IOMAP_UNWRITTEN, "UNWRITTEN" }, \ { IOMAP_INLINE, "INLINE" } #define IOMAP_FLAGS_STRINGS \ { IOMAP_WRITE, "WRITE" }, \ { IOMAP_ZERO, "ZERO" }, \ { IOMAP_REPORT, "REPORT" }, \ { IOMAP_FAULT, "FAULT" }, \ { IOMAP_DIRECT, "DIRECT" }, \ { IOMAP_NOWAIT, "NOWAIT" } #define IOMAP_F_FLAGS_STRINGS \ { IOMAP_F_NEW, "NEW" }, \ { IOMAP_F_DIRTY, "DIRTY" }, \ { IOMAP_F_SHARED, "SHARED" }, \ { IOMAP_F_MERGED, "MERGED" }, \ { IOMAP_F_BUFFER_HEAD, "BH" }, \ { IOMAP_F_SIZE_CHANGED, "SIZE_CHANGED" } #define IOMAP_DIO_STRINGS \ {IOMAP_DIO_FORCE_WAIT, "DIO_FORCE_WAIT" }, \ {IOMAP_DIO_OVERWRITE_ONLY, "DIO_OVERWRITE_ONLY" }, \ {IOMAP_DIO_PARTIAL, "DIO_PARTIAL" } DECLARE_EVENT_CLASS(iomap_class, TP_PROTO(struct inode *inode, struct iomap *iomap), TP_ARGS(inode, iomap), TP_STRUCT__entry( __field(dev_t, dev) __field(u64, ino) __field(u64, addr) __field(loff_t, offset) __field(u64, length) __field(u16, type) __field(u16, flags) __field(dev_t, bdev) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->addr = iomap->addr; __entry->offset = iomap->offset; __entry->length = iomap->length; __entry->type = iomap->type; __entry->flags = iomap->flags; __entry->bdev = iomap->bdev ? iomap->bdev->bd_dev : 0; ), TP_printk("dev %d:%d ino 0x%llx bdev %d:%d addr 0x%llx offset 0x%llx " "length 0x%llx type %s flags %s", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, MAJOR(__entry->bdev), MINOR(__entry->bdev), __entry->addr, __entry->offset, __entry->length, __print_symbolic(__entry->type, IOMAP_TYPE_STRINGS), __print_flags(__entry->flags, "|", IOMAP_F_FLAGS_STRINGS)) ) #define DEFINE_IOMAP_EVENT(name) \ DEFINE_EVENT(iomap_class, name, \ TP_PROTO(struct inode *inode, struct iomap *iomap), \ TP_ARGS(inode, iomap)) DEFINE_IOMAP_EVENT(iomap_iter_dstmap); DEFINE_IOMAP_EVENT(iomap_iter_srcmap); DEFINE_IOMAP_EVENT(iomap_writepage_map); TRACE_EVENT(iomap_iter, TP_PROTO(struct iomap_iter *iter, const void *ops, unsigned long caller), TP_ARGS(iter, ops, caller), TP_STRUCT__entry( __field(dev_t, dev) __field(u64, ino) __field(loff_t, pos) __field(u64, length) __field(unsigned int, flags) __field(const void *, ops) __field(unsigned long, caller) ), TP_fast_assign( __entry->dev = iter->inode->i_sb->s_dev; __entry->ino = iter->inode->i_ino; __entry->pos = iter->pos; __entry->length = iomap_length(iter); __entry->flags = iter->flags; __entry->ops = ops; __entry->caller = caller; ), TP_printk("dev %d:%d ino 0x%llx pos 0x%llx length 0x%llx flags %s (0x%x) ops %ps caller %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->pos, __entry->length, __print_flags(__entry->flags, "|", IOMAP_FLAGS_STRINGS), __entry->flags, __entry->ops, (void *)__entry->caller) ); TRACE_EVENT(iomap_dio_rw_begin, TP_PROTO(struct kiocb *iocb, struct iov_iter *iter, unsigned int dio_flags, size_t done_before), TP_ARGS(iocb, iter, dio_flags, done_before), TP_STRUCT__entry( __field(dev_t, dev) __field(ino_t, ino) __field(loff_t, isize) __field(loff_t, pos) __field(size_t, count) __field(size_t, done_before) __field(int, ki_flags) __field(unsigned int, dio_flags) __field(bool, aio) ), TP_fast_assign( __entry->dev = file_inode(iocb->ki_filp)->i_sb->s_dev; __entry->ino = file_inode(iocb->ki_filp)->i_ino; __entry->isize = file_inode(iocb->ki_filp)->i_size; __entry->pos = iocb->ki_pos; __entry->count = iov_iter_count(iter); __entry->done_before = done_before; __entry->ki_flags = iocb->ki_flags; __entry->dio_flags = dio_flags; __entry->aio = !is_sync_kiocb(iocb); ), TP_printk("dev %d:%d ino 0x%lx size 0x%llx offset 0x%llx length 0x%zx done_before 0x%zx flags %s dio_flags %s aio %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->isize, __entry->pos, __entry->count, __entry->done_before, __print_flags(__entry->ki_flags, "|", TRACE_IOCB_STRINGS), __print_flags(__entry->dio_flags, "|", IOMAP_DIO_STRINGS), __entry->aio) ); TRACE_EVENT(iomap_dio_complete, TP_PROTO(struct kiocb *iocb, int error, ssize_t ret), TP_ARGS(iocb, error, ret), TP_STRUCT__entry( __field(dev_t, dev) __field(ino_t, ino) __field(loff_t, isize) __field(loff_t, pos) __field(int, ki_flags) __field(bool, aio) __field(int, error) __field(ssize_t, ret) ), TP_fast_assign( __entry->dev = file_inode(iocb->ki_filp)->i_sb->s_dev; __entry->ino = file_inode(iocb->ki_filp)->i_ino; __entry->isize = file_inode(iocb->ki_filp)->i_size; __entry->pos = iocb->ki_pos; __entry->ki_flags = iocb->ki_flags; __entry->aio = !is_sync_kiocb(iocb); __entry->error = error; __entry->ret = ret; ), TP_printk("dev %d:%d ino 0x%lx size 0x%llx offset 0x%llx flags %s aio %d error %d ret %zd", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->isize, __entry->pos, __print_flags(__entry->ki_flags, "|", TRACE_IOCB_STRINGS), __entry->aio, __entry->error, __entry->ret) ); #endif /* _IOMAP_TRACE_H */ #undef TRACE_INCLUDE_PATH #define TRACE_INCLUDE_PATH . #define TRACE_INCLUDE_FILE trace #include <trace/define_trace.h> |
| 2 2164 2164 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 | // SPDX-License-Identifier: GPL-2.0 /* * Workingset detection * * Copyright (C) 2013 Red Hat, Inc., Johannes Weiner */ #include <linux/memcontrol.h> #include <linux/mm_inline.h> #include <linux/writeback.h> #include <linux/shmem_fs.h> #include <linux/pagemap.h> #include <linux/atomic.h> #include <linux/module.h> #include <linux/swap.h> #include <linux/dax.h> #include <linux/fs.h> #include <linux/mm.h> /* * Double CLOCK lists * * Per node, two clock lists are maintained for file pages: the * inactive and the active list. Freshly faulted pages start out at * the head of the inactive list and page reclaim scans pages from the * tail. Pages that are accessed multiple times on the inactive list * are promoted to the active list, to protect them from reclaim, * whereas active pages are demoted to the inactive list when the * active list grows too big. * * fault ------------------------+ * | * +--------------+ | +-------------+ * reclaim <- | inactive | <-+-- demotion | active | <--+ * +--------------+ +-------------+ | * | | * +-------------- promotion ------------------+ * * * Access frequency and refault distance * * A workload is thrashing when its pages are frequently used but they * are evicted from the inactive list every time before another access * would have promoted them to the active list. * * In cases where the average access distance between thrashing pages * is bigger than the size of memory there is nothing that can be * done - the thrashing set could never fit into memory under any * circumstance. * * However, the average access distance could be bigger than the * inactive list, yet smaller than the size of memory. In this case, * the set could fit into memory if it weren't for the currently * active pages - which may be used more, hopefully less frequently: * * +-memory available to cache-+ * | | * +-inactive------+-active----+ * a b | c d e f g h i | J K L M N | * +---------------+-----------+ * * It is prohibitively expensive to accurately track access frequency * of pages. But a reasonable approximation can be made to measure * thrashing on the inactive list, after which refaulting pages can be * activated optimistically to compete with the existing active pages. * * Approximating inactive page access frequency - Observations: * * 1. When a page is accessed for the first time, it is added to the * head of the inactive list, slides every existing inactive page * towards the tail by one slot, and pushes the current tail page * out of memory. * * 2. When a page is accessed for the second time, it is promoted to * the active list, shrinking the inactive list by one slot. This * also slides all inactive pages that were faulted into the cache * more recently than the activated page towards the tail of the * inactive list. * * Thus: * * 1. The sum of evictions and activations between any two points in * time indicate the minimum number of inactive pages accessed in * between. * * 2. Moving one inactive page N page slots towards the tail of the * list requires at least N inactive page accesses. * * Combining these: * * 1. When a page is finally evicted from memory, the number of * inactive pages accessed while the page was in cache is at least * the number of page slots on the inactive list. * * 2. In addition, measuring the sum of evictions and activations (E) * at the time of a page's eviction, and comparing it to another * reading (R) at the time the page faults back into memory tells * the minimum number of accesses while the page was not cached. * This is called the refault distance. * * Because the first access of the page was the fault and the second * access the refault, we combine the in-cache distance with the * out-of-cache distance to get the complete minimum access distance * of this page: * * NR_inactive + (R - E) * * And knowing the minimum access distance of a page, we can easily * tell if the page would be able to stay in cache assuming all page * slots in the cache were available: * * NR_inactive + (R - E) <= NR_inactive + NR_active * * If we have swap we should consider about NR_inactive_anon and * NR_active_anon, so for page cache and anonymous respectively: * * NR_inactive_file + (R - E) <= NR_inactive_file + NR_active_file * + NR_inactive_anon + NR_active_anon * * NR_inactive_anon + (R - E) <= NR_inactive_anon + NR_active_anon * + NR_inactive_file + NR_active_file * * Which can be further simplified to: * * (R - E) <= NR_active_file + NR_inactive_anon + NR_active_anon * * (R - E) <= NR_active_anon + NR_inactive_file + NR_active_file * * Put into words, the refault distance (out-of-cache) can be seen as * a deficit in inactive list space (in-cache). If the inactive list * had (R - E) more page slots, the page would not have been evicted * in between accesses, but activated instead. And on a full system, * the only thing eating into inactive list space is active pages. * * * Refaulting inactive pages * * All that is known about the active list is that the pages have been * accessed more than once in the past. This means that at any given * time there is actually a good chance that pages on the active list * are no longer in active use. * * So when a refault distance of (R - E) is observed and there are at * least (R - E) pages in the userspace workingset, the refaulting page * is activated optimistically in the hope that (R - E) pages are actually * used less frequently than the refaulting page - or even not used at * all anymore. * * That means if inactive cache is refaulting with a suitable refault * distance, we assume the cache workingset is transitioning and put * pressure on the current workingset. * * If this is wrong and demotion kicks in, the pages which are truly * used more frequently will be reactivated while the less frequently * used once will be evicted from memory. * * But if this is right, the stale pages will be pushed out of memory * and the used pages get to stay in cache. * * Refaulting active pages * * If on the other hand the refaulting pages have recently been * deactivated, it means that the active list is no longer protecting * actively used cache from reclaim. The cache is NOT transitioning to * a different workingset; the existing workingset is thrashing in the * space allocated to the page cache. * * * Implementation * * For each node's LRU lists, a counter for inactive evictions and * activations is maintained (node->nonresident_age). * * On eviction, a snapshot of this counter (along with some bits to * identify the node) is stored in the now empty page cache * slot of the evicted page. This is called a shadow entry. * * On cache misses for which there are shadow entries, an eligible * refault distance will immediately activate the refaulting page. */ #define WORKINGSET_SHIFT 1 #define EVICTION_SHIFT ((BITS_PER_LONG - BITS_PER_XA_VALUE) + \ WORKINGSET_SHIFT + NODES_SHIFT + \ MEM_CGROUP_ID_SHIFT) #define EVICTION_MASK (~0UL >> EVICTION_SHIFT) /* * Eviction timestamps need to be able to cover the full range of * actionable refaults. However, bits are tight in the xarray * entry, and after storing the identifier for the lruvec there might * not be enough left to represent every single actionable refault. In * that case, we have to sacrifice granularity for distance, and group * evictions into coarser buckets by shaving off lower timestamp bits. */ static unsigned int bucket_order __read_mostly; static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction, bool workingset) { eviction &= EVICTION_MASK; eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid; eviction = (eviction << NODES_SHIFT) | pgdat->node_id; eviction = (eviction << WORKINGSET_SHIFT) | workingset; return xa_mk_value(eviction); } static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat, unsigned long *evictionp, bool *workingsetp) { unsigned long entry = xa_to_value(shadow); int memcgid, nid; bool workingset; workingset = entry & ((1UL << WORKINGSET_SHIFT) - 1); entry >>= WORKINGSET_SHIFT; nid = entry & ((1UL << NODES_SHIFT) - 1); entry >>= NODES_SHIFT; memcgid = entry & ((1UL << MEM_CGROUP_ID_SHIFT) - 1); entry >>= MEM_CGROUP_ID_SHIFT; *memcgidp = memcgid; *pgdat = NODE_DATA(nid); *evictionp = entry; *workingsetp = workingset; } #ifdef CONFIG_LRU_GEN static void *lru_gen_eviction(struct folio *folio) { int hist; unsigned long token; unsigned long min_seq; struct lruvec *lruvec; struct lru_gen_folio *lrugen; int type = folio_is_file_lru(folio); int delta = folio_nr_pages(folio); int refs = folio_lru_refs(folio); int tier = lru_tier_from_refs(refs); struct mem_cgroup *memcg = folio_memcg(folio); struct pglist_data *pgdat = folio_pgdat(folio); BUILD_BUG_ON(LRU_GEN_WIDTH + LRU_REFS_WIDTH > BITS_PER_LONG - EVICTION_SHIFT); lruvec = mem_cgroup_lruvec(memcg, pgdat); lrugen = &lruvec->lrugen; min_seq = READ_ONCE(lrugen->min_seq[type]); token = (min_seq << LRU_REFS_WIDTH) | max(refs - 1, 0); hist = lru_hist_from_seq(min_seq); atomic_long_add(delta, &lrugen->evicted[hist][type][tier]); return pack_shadow(mem_cgroup_id(memcg), pgdat, token, refs); } /* * Tests if the shadow entry is for a folio that was recently evicted. * Fills in @lruvec, @token, @workingset with the values unpacked from shadow. */ static bool lru_gen_test_recent(void *shadow, bool file, struct lruvec **lruvec, unsigned long *token, bool *workingset) { int memcg_id; unsigned long min_seq; struct mem_cgroup *memcg; struct pglist_data *pgdat; unpack_shadow(shadow, &memcg_id, &pgdat, token, workingset); memcg = mem_cgroup_from_id(memcg_id); *lruvec = mem_cgroup_lruvec(memcg, pgdat); min_seq = READ_ONCE((*lruvec)->lrugen.min_seq[file]); return (*token >> LRU_REFS_WIDTH) == (min_seq & (EVICTION_MASK >> LRU_REFS_WIDTH)); } static void lru_gen_refault(struct folio *folio, void *shadow) { bool recent; int hist, tier, refs; bool workingset; unsigned long token; struct lruvec *lruvec; struct lru_gen_folio *lrugen; int type = folio_is_file_lru(folio); int delta = folio_nr_pages(folio); rcu_read_lock(); recent = lru_gen_test_recent(shadow, type, &lruvec, &token, &workingset); if (lruvec != folio_lruvec(folio)) goto unlock; mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + type, delta); if (!recent) goto unlock; lrugen = &lruvec->lrugen; hist = lru_hist_from_seq(READ_ONCE(lrugen->min_seq[type])); /* see the comment in folio_lru_refs() */ refs = (token & (BIT(LRU_REFS_WIDTH) - 1)) + workingset; tier = lru_tier_from_refs(refs); atomic_long_add(delta, &lrugen->refaulted[hist][type][tier]); mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + type, delta); /* * Count the following two cases as stalls: * 1. For pages accessed through page tables, hotter pages pushed out * hot pages which refaulted immediately. * 2. For pages accessed multiple times through file descriptors, * they would have been protected by sort_folio(). */ if (lru_gen_in_fault() || refs >= BIT(LRU_REFS_WIDTH) - 1) { set_mask_bits(&folio->flags, 0, LRU_REFS_MASK | BIT(PG_workingset)); mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + type, delta); } unlock: rcu_read_unlock(); } #else /* !CONFIG_LRU_GEN */ static void *lru_gen_eviction(struct folio *folio) { return NULL; } static bool lru_gen_test_recent(void *shadow, bool file, struct lruvec **lruvec, unsigned long *token, bool *workingset) { return false; } static void lru_gen_refault(struct folio *folio, void *shadow) { } #endif /* CONFIG_LRU_GEN */ /** * workingset_age_nonresident - age non-resident entries as LRU ages * @lruvec: the lruvec that was aged * @nr_pages: the number of pages to count * * As in-memory pages are aged, non-resident pages need to be aged as * well, in order for the refault distances later on to be comparable * to the in-memory dimensions. This function allows reclaim and LRU * operations to drive the non-resident aging along in parallel. */ void workingset_age_nonresident(struct lruvec *lruvec, unsigned long nr_pages) { /* * Reclaiming a cgroup means reclaiming all its children in a * round-robin fashion. That means that each cgroup has an LRU * order that is composed of the LRU orders of its child * cgroups; and every page has an LRU position not just in the * cgroup that owns it, but in all of that group's ancestors. * * So when the physical inactive list of a leaf cgroup ages, * the virtual inactive lists of all its parents, including * the root cgroup's, age as well. */ do { atomic_long_add(nr_pages, &lruvec->nonresident_age); } while ((lruvec = parent_lruvec(lruvec))); } /** * workingset_eviction - note the eviction of a folio from memory * @target_memcg: the cgroup that is causing the reclaim * @folio: the folio being evicted * * Return: a shadow entry to be stored in @folio->mapping->i_pages in place * of the evicted @folio so that a later refault can be detected. */ void *workingset_eviction(struct folio *folio, struct mem_cgroup *target_memcg) { struct pglist_data *pgdat = folio_pgdat(folio); unsigned long eviction; struct lruvec *lruvec; int memcgid; /* Folio is fully exclusive and pins folio's memory cgroup pointer */ VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); VM_BUG_ON_FOLIO(folio_ref_count(folio), folio); VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); if (lru_gen_enabled()) return lru_gen_eviction(folio); lruvec = mem_cgroup_lruvec(target_memcg, pgdat); /* XXX: target_memcg can be NULL, go through lruvec */ memcgid = mem_cgroup_id(lruvec_memcg(lruvec)); eviction = atomic_long_read(&lruvec->nonresident_age); eviction >>= bucket_order; workingset_age_nonresident(lruvec, folio_nr_pages(folio)); return pack_shadow(memcgid, pgdat, eviction, folio_test_workingset(folio)); } /** * workingset_test_recent - tests if the shadow entry is for a folio that was * recently evicted. Also fills in @workingset with the value unpacked from * shadow. * @shadow: the shadow entry to be tested. * @file: whether the corresponding folio is from the file lru. * @workingset: where the workingset value unpacked from shadow should * be stored. * * Return: true if the shadow is for a recently evicted folio; false otherwise. */ bool workingset_test_recent(void *shadow, bool file, bool *workingset) { struct mem_cgroup *eviction_memcg; struct lruvec *eviction_lruvec; unsigned long refault_distance; unsigned long workingset_size; unsigned long refault; int memcgid; struct pglist_data *pgdat; unsigned long eviction; rcu_read_lock(); if (lru_gen_enabled()) { bool recent = lru_gen_test_recent(shadow, file, &eviction_lruvec, &eviction, workingset); rcu_read_unlock(); return recent; } unpack_shadow(shadow, &memcgid, &pgdat, &eviction, workingset); eviction <<= bucket_order; /* * Look up the memcg associated with the stored ID. It might * have been deleted since the folio's eviction. * * Note that in rare events the ID could have been recycled * for a new cgroup that refaults a shared folio. This is * impossible to tell from the available data. However, this * should be a rare and limited disturbance, and activations * are always speculative anyway. Ultimately, it's the aging * algorithm's job to shake out the minimum access frequency * for the active cache. * * XXX: On !CONFIG_MEMCG, this will always return NULL; it * would be better if the root_mem_cgroup existed in all * configurations instead. */ eviction_memcg = mem_cgroup_from_id(memcgid); if (!mem_cgroup_disabled() && (!eviction_memcg || !mem_cgroup_tryget(eviction_memcg))) { rcu_read_unlock(); return false; } rcu_read_unlock(); /* * Flush stats (and potentially sleep) outside the RCU read section. * XXX: With per-memcg flushing and thresholding, is ratelimiting * still needed here? */ mem_cgroup_flush_stats_ratelimited(eviction_memcg); eviction_lruvec = mem_cgroup_lruvec(eviction_memcg, pgdat); refault = atomic_long_read(&eviction_lruvec->nonresident_age); /* * Calculate the refault distance * * The unsigned subtraction here gives an accurate distance * across nonresident_age overflows in most cases. There is a * special case: usually, shadow entries have a short lifetime * and are either refaulted or reclaimed along with the inode * before they get too old. But it is not impossible for the * nonresident_age to lap a shadow entry in the field, which * can then result in a false small refault distance, leading * to a false activation should this old entry actually * refault again. However, earlier kernels used to deactivate * unconditionally with *every* reclaim invocation for the * longest time, so the occasional inappropriate activation * leading to pressure on the active list is not a problem. */ refault_distance = (refault - eviction) & EVICTION_MASK; /* * Compare the distance to the existing workingset size. We * don't activate pages that couldn't stay resident even if * all the memory was available to the workingset. Whether * workingset competition needs to consider anon or not depends * on having free swap space. */ workingset_size = lruvec_page_state(eviction_lruvec, NR_ACTIVE_FILE); if (!file) { workingset_size += lruvec_page_state(eviction_lruvec, NR_INACTIVE_FILE); } if (mem_cgroup_get_nr_swap_pages(eviction_memcg) > 0) { workingset_size += lruvec_page_state(eviction_lruvec, NR_ACTIVE_ANON); if (file) { workingset_size += lruvec_page_state(eviction_lruvec, NR_INACTIVE_ANON); } } mem_cgroup_put(eviction_memcg); return refault_distance <= workingset_size; } /** * workingset_refault - Evaluate the refault of a previously evicted folio. * @folio: The freshly allocated replacement folio. * @shadow: Shadow entry of the evicted folio. * * Calculates and evaluates the refault distance of the previously * evicted folio in the context of the node and the memcg whose memory * pressure caused the eviction. */ void workingset_refault(struct folio *folio, void *shadow) { bool file = folio_is_file_lru(folio); struct pglist_data *pgdat; struct mem_cgroup *memcg; struct lruvec *lruvec; bool workingset; long nr; if (lru_gen_enabled()) { lru_gen_refault(folio, shadow); return; } /* * The activation decision for this folio is made at the level * where the eviction occurred, as that is where the LRU order * during folio reclaim is being determined. * * However, the cgroup that will own the folio is the one that * is actually experiencing the refault event. Make sure the folio is * locked to guarantee folio_memcg() stability throughout. */ VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); nr = folio_nr_pages(folio); memcg = folio_memcg(folio); pgdat = folio_pgdat(folio); lruvec = mem_cgroup_lruvec(memcg, pgdat); mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + file, nr); if (!workingset_test_recent(shadow, file, &workingset)) return; folio_set_active(folio); workingset_age_nonresident(lruvec, nr); mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + file, nr); /* Folio was active prior to eviction */ if (workingset) { folio_set_workingset(folio); /* * XXX: Move to folio_add_lru() when it supports new vs * putback */ lru_note_cost_refault(folio); mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + file, nr); } } /** * workingset_activation - note a page activation * @folio: Folio that is being activated. */ void workingset_activation(struct folio *folio) { struct mem_cgroup *memcg; rcu_read_lock(); /* * Filter non-memcg pages here, e.g. unmap can call * mark_page_accessed() on VDSO pages. * * XXX: See workingset_refault() - this should return * root_mem_cgroup even for !CONFIG_MEMCG. */ memcg = folio_memcg_rcu(folio); if (!mem_cgroup_disabled() && !memcg) goto out; workingset_age_nonresident(folio_lruvec(folio), folio_nr_pages(folio)); out: rcu_read_unlock(); } /* * Shadow entries reflect the share of the working set that does not * fit into memory, so their number depends on the access pattern of * the workload. In most cases, they will refault or get reclaimed * along with the inode, but a (malicious) workload that streams * through files with a total size several times that of available * memory, while preventing the inodes from being reclaimed, can * create excessive amounts of shadow nodes. To keep a lid on this, * track shadow nodes and reclaim them when they grow way past the * point where they would still be useful. */ struct list_lru shadow_nodes; void workingset_update_node(struct xa_node *node) { struct address_space *mapping; /* * Track non-empty nodes that contain only shadow entries; * unlink those that contain pages or are being freed. * * Avoid acquiring the list_lru lock when the nodes are * already where they should be. The list_empty() test is safe * as node->private_list is protected by the i_pages lock. */ mapping = container_of(node->array, struct address_space, i_pages); lockdep_assert_held(&mapping->i_pages.xa_lock); if (node->count && node->count == node->nr_values) { if (list_empty(&node->private_list)) { list_lru_add_obj(&shadow_nodes, &node->private_list); __inc_lruvec_kmem_state(node, WORKINGSET_NODES); } } else { if (!list_empty(&node->private_list)) { list_lru_del_obj(&shadow_nodes, &node->private_list); __dec_lruvec_kmem_state(node, WORKINGSET_NODES); } } } static unsigned long count_shadow_nodes(struct shrinker *shrinker, struct shrink_control *sc) { unsigned long max_nodes; unsigned long nodes; unsigned long pages; nodes = list_lru_shrink_count(&shadow_nodes, sc); if (!nodes) return SHRINK_EMPTY; /* * Approximate a reasonable limit for the nodes * containing shadow entries. We don't need to keep more * shadow entries than possible pages on the active list, * since refault distances bigger than that are dismissed. * * The size of the active list converges toward 100% of * overall page cache as memory grows, with only a tiny * inactive list. Assume the total cache size for that. * * Nodes might be sparsely populated, with only one shadow * entry in the extreme case. Obviously, we cannot keep one * node for every eligible shadow entry, so compromise on a * worst-case density of 1/8th. Below that, not all eligible * refaults can be detected anymore. * * On 64-bit with 7 xa_nodes per page and 64 slots * each, this will reclaim shadow entries when they consume * ~1.8% of available memory: * * PAGE_SIZE / xa_nodes / node_entries * 8 / PAGE_SIZE */ #ifdef CONFIG_MEMCG if (sc->memcg) { struct lruvec *lruvec; int i; mem_cgroup_flush_stats_ratelimited(sc->memcg); lruvec = mem_cgroup_lruvec(sc->memcg, NODE_DATA(sc->nid)); for (pages = 0, i = 0; i < NR_LRU_LISTS; i++) pages += lruvec_page_state_local(lruvec, NR_LRU_BASE + i); pages += lruvec_page_state_local( lruvec, NR_SLAB_RECLAIMABLE_B) >> PAGE_SHIFT; pages += lruvec_page_state_local( lruvec, NR_SLAB_UNRECLAIMABLE_B) >> PAGE_SHIFT; } else #endif pages = node_present_pages(sc->nid); max_nodes = pages >> (XA_CHUNK_SHIFT - 3); if (nodes <= max_nodes) return 0; return nodes - max_nodes; } static enum lru_status shadow_lru_isolate(struct list_head *item, struct list_lru_one *lru, spinlock_t *lru_lock, void *arg) __must_hold(lru_lock) { struct xa_node *node = container_of(item, struct xa_node, private_list); struct address_space *mapping; int ret; /* * Page cache insertions and deletions synchronously maintain * the shadow node LRU under the i_pages lock and the * lru_lock. Because the page cache tree is emptied before * the inode can be destroyed, holding the lru_lock pins any * address_space that has nodes on the LRU. * * We can then safely transition to the i_pages lock to * pin only the address_space of the particular node we want * to reclaim, take the node off-LRU, and drop the lru_lock. */ mapping = container_of(node->array, struct address_space, i_pages); /* Coming from the list, invert the lock order */ if (!xa_trylock(&mapping->i_pages)) { spin_unlock_irq(lru_lock); ret = LRU_RETRY; goto out; } /* For page cache we need to hold i_lock */ if (mapping->host != NULL) { if (!spin_trylock(&mapping->host->i_lock)) { xa_unlock(&mapping->i_pages); spin_unlock_irq(lru_lock); ret = LRU_RETRY; goto out; } } list_lru_isolate(lru, item); __dec_lruvec_kmem_state(node, WORKINGSET_NODES); spin_unlock(lru_lock); /* * The nodes should only contain one or more shadow entries, * no pages, so we expect to be able to remove them all and * delete and free the empty node afterwards. */ if (WARN_ON_ONCE(!node->nr_values)) goto out_invalid; if (WARN_ON_ONCE(node->count != node->nr_values)) goto out_invalid; xa_delete_node(node, workingset_update_node); __inc_lruvec_kmem_state(node, WORKINGSET_NODERECLAIM); out_invalid: xa_unlock_irq(&mapping->i_pages); if (mapping->host != NULL) { if (mapping_shrinkable(mapping)) inode_add_lru(mapping->host); spin_unlock(&mapping->host->i_lock); } ret = LRU_REMOVED_RETRY; out: cond_resched(); spin_lock_irq(lru_lock); return ret; } static unsigned long scan_shadow_nodes(struct shrinker *shrinker, struct shrink_control *sc) { /* list_lru lock nests inside the IRQ-safe i_pages lock */ return list_lru_shrink_walk_irq(&shadow_nodes, sc, shadow_lru_isolate, NULL); } /* * Our list_lru->lock is IRQ-safe as it nests inside the IRQ-safe * i_pages lock. */ static struct lock_class_key shadow_nodes_key; static int __init workingset_init(void) { struct shrinker *workingset_shadow_shrinker; unsigned int timestamp_bits; unsigned int max_order; int ret = -ENOMEM; BUILD_BUG_ON(BITS_PER_LONG < EVICTION_SHIFT); /* * Calculate the eviction bucket size to cover the longest * actionable refault distance, which is currently half of * memory (totalram_pages/2). However, memory hotplug may add * some more pages at runtime, so keep working with up to * double the initial memory by using totalram_pages as-is. */ timestamp_bits = BITS_PER_LONG - EVICTION_SHIFT; max_order = fls_long(totalram_pages() - 1); if (max_order > timestamp_bits) bucket_order = max_order - timestamp_bits; pr_info("workingset: timestamp_bits=%d max_order=%d bucket_order=%u\n", timestamp_bits, max_order, bucket_order); workingset_shadow_shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE, "mm-shadow"); if (!workingset_shadow_shrinker) goto err; ret = __list_lru_init(&shadow_nodes, true, &shadow_nodes_key, workingset_shadow_shrinker); if (ret) goto err_list_lru; workingset_shadow_shrinker->count_objects = count_shadow_nodes; workingset_shadow_shrinker->scan_objects = scan_shadow_nodes; /* ->count reports only fully expendable nodes */ workingset_shadow_shrinker->seeks = 0; shrinker_register(workingset_shadow_shrinker); return 0; err_list_lru: shrinker_free(workingset_shadow_shrinker); err: return ret; } module_init(workingset_init); |
| 147 147 147 149 6 2 7 2 5 4 149 149 148 148 148 148 148 148 148 148 148 148 148 148 147 29 148 148 148 148 6 6 2 4 4 148 148 148 147 147 147 148 148 29 23 9 29 29 29 30 30 12 150 150 150 27 27 27 27 27 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 | // SPDX-License-Identifier: GPL-2.0-or-later /* RxRPC individual remote procedure call handling * * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/slab.h> #include <linux/module.h> #include <linux/circ_buf.h> #include <linux/spinlock_types.h> #include <net/sock.h> #include <net/af_rxrpc.h> #include "ar-internal.h" const char *const rxrpc_call_states[NR__RXRPC_CALL_STATES] = { [RXRPC_CALL_UNINITIALISED] = "Uninit ", [RXRPC_CALL_CLIENT_AWAIT_CONN] = "ClWtConn", [RXRPC_CALL_CLIENT_SEND_REQUEST] = "ClSndReq", [RXRPC_CALL_CLIENT_AWAIT_REPLY] = "ClAwtRpl", [RXRPC_CALL_CLIENT_RECV_REPLY] = "ClRcvRpl", [RXRPC_CALL_SERVER_PREALLOC] = "SvPrealc", [RXRPC_CALL_SERVER_SECURING] = "SvSecure", [RXRPC_CALL_SERVER_RECV_REQUEST] = "SvRcvReq", [RXRPC_CALL_SERVER_ACK_REQUEST] = "SvAckReq", [RXRPC_CALL_SERVER_SEND_REPLY] = "SvSndRpl", [RXRPC_CALL_SERVER_AWAIT_ACK] = "SvAwtACK", [RXRPC_CALL_COMPLETE] = "Complete", }; const char *const rxrpc_call_completions[NR__RXRPC_CALL_COMPLETIONS] = { [RXRPC_CALL_SUCCEEDED] = "Complete", [RXRPC_CALL_REMOTELY_ABORTED] = "RmtAbort", [RXRPC_CALL_LOCALLY_ABORTED] = "LocAbort", [RXRPC_CALL_LOCAL_ERROR] = "LocError", [RXRPC_CALL_NETWORK_ERROR] = "NetError", }; struct kmem_cache *rxrpc_call_jar; static DEFINE_SEMAPHORE(rxrpc_call_limiter, 1000); static DEFINE_SEMAPHORE(rxrpc_kernel_call_limiter, 1000); void rxrpc_poke_call(struct rxrpc_call *call, enum rxrpc_call_poke_trace what) { struct rxrpc_local *local = call->local; bool busy; if (!test_bit(RXRPC_CALL_DISCONNECTED, &call->flags)) { spin_lock_bh(&local->lock); busy = !list_empty(&call->attend_link); trace_rxrpc_poke_call(call, busy, what); if (!busy && !rxrpc_try_get_call(call, rxrpc_call_get_poke)) busy = true; if (!busy) { list_add_tail(&call->attend_link, &local->call_attend_q); } spin_unlock_bh(&local->lock); if (!busy) rxrpc_wake_up_io_thread(local); } } static void rxrpc_call_timer_expired(struct timer_list *t) { struct rxrpc_call *call = from_timer(call, t, timer); _enter("%d", call->debug_id); if (!__rxrpc_call_is_complete(call)) { trace_rxrpc_timer_expired(call, jiffies); rxrpc_poke_call(call, rxrpc_call_poke_timer); } } void rxrpc_reduce_call_timer(struct rxrpc_call *call, unsigned long expire_at, unsigned long now, enum rxrpc_timer_trace why) { trace_rxrpc_timer(call, why, now); timer_reduce(&call->timer, expire_at); } static struct lock_class_key rxrpc_call_user_mutex_lock_class_key; static void rxrpc_destroy_call(struct work_struct *); /* * find an extant server call * - called in process context with IRQs enabled */ struct rxrpc_call *rxrpc_find_call_by_user_ID(struct rxrpc_sock *rx, unsigned long user_call_ID) { struct rxrpc_call *call; struct rb_node *p; _enter("%p,%lx", rx, user_call_ID); read_lock(&rx->call_lock); p = rx->calls.rb_node; while (p) { call = rb_entry(p, struct rxrpc_call, sock_node); if (user_call_ID < call->user_call_ID) p = p->rb_left; else if (user_call_ID > call->user_call_ID) p = p->rb_right; else goto found_extant_call; } read_unlock(&rx->call_lock); _leave(" = NULL"); return NULL; found_extant_call: rxrpc_get_call(call, rxrpc_call_get_sendmsg); read_unlock(&rx->call_lock); _leave(" = %p [%d]", call, refcount_read(&call->ref)); return call; } /* * allocate a new call */ struct rxrpc_call *rxrpc_alloc_call(struct rxrpc_sock *rx, gfp_t gfp, unsigned int debug_id) { struct rxrpc_call *call; struct rxrpc_net *rxnet = rxrpc_net(sock_net(&rx->sk)); call = kmem_cache_zalloc(rxrpc_call_jar, gfp); if (!call) return NULL; mutex_init(&call->user_mutex); /* Prevent lockdep reporting a deadlock false positive between the afs * filesystem and sys_sendmsg() via the mmap sem. */ if (rx->sk.sk_kern_sock) lockdep_set_class(&call->user_mutex, &rxrpc_call_user_mutex_lock_class_key); timer_setup(&call->timer, rxrpc_call_timer_expired, 0); INIT_WORK(&call->destroyer, rxrpc_destroy_call); INIT_LIST_HEAD(&call->link); INIT_LIST_HEAD(&call->wait_link); INIT_LIST_HEAD(&call->accept_link); INIT_LIST_HEAD(&call->recvmsg_link); INIT_LIST_HEAD(&call->sock_link); INIT_LIST_HEAD(&call->attend_link); INIT_LIST_HEAD(&call->tx_sendmsg); INIT_LIST_HEAD(&call->tx_buffer); skb_queue_head_init(&call->recvmsg_queue); skb_queue_head_init(&call->rx_oos_queue); init_waitqueue_head(&call->waitq); spin_lock_init(&call->notify_lock); spin_lock_init(&call->tx_lock); refcount_set(&call->ref, 1); call->debug_id = debug_id; call->tx_total_len = -1; call->next_rx_timo = 20 * HZ; call->next_req_timo = 1 * HZ; call->ackr_window = 1; call->ackr_wtop = 1; memset(&call->sock_node, 0xed, sizeof(call->sock_node)); call->rx_winsize = rxrpc_rx_window_size; call->tx_winsize = 16; if (RXRPC_TX_SMSS > 2190) call->cong_cwnd = 2; else if (RXRPC_TX_SMSS > 1095) call->cong_cwnd = 3; else call->cong_cwnd = 4; call->cong_ssthresh = RXRPC_TX_MAX_WINDOW; call->rxnet = rxnet; call->rtt_avail = RXRPC_CALL_RTT_AVAIL_MASK; atomic_inc(&rxnet->nr_calls); return call; } /* * Allocate a new client call. */ static struct rxrpc_call *rxrpc_alloc_client_call(struct rxrpc_sock *rx, struct rxrpc_conn_parameters *cp, struct rxrpc_call_params *p, gfp_t gfp, unsigned int debug_id) { struct rxrpc_call *call; ktime_t now; int ret; _enter(""); call = rxrpc_alloc_call(rx, gfp, debug_id); if (!call) return ERR_PTR(-ENOMEM); now = ktime_get_real(); call->acks_latest_ts = now; call->cong_tstamp = now; call->dest_srx = cp->peer->srx; call->dest_srx.srx_service = cp->service_id; call->interruptibility = p->interruptibility; call->tx_total_len = p->tx_total_len; call->key = key_get(cp->key); call->peer = rxrpc_get_peer(cp->peer, rxrpc_peer_get_call); call->local = rxrpc_get_local(cp->local, rxrpc_local_get_call); call->security_level = cp->security_level; if (p->kernel) __set_bit(RXRPC_CALL_KERNEL, &call->flags); if (cp->upgrade) __set_bit(RXRPC_CALL_UPGRADE, &call->flags); if (cp->exclusive) __set_bit(RXRPC_CALL_EXCLUSIVE, &call->flags); if (p->timeouts.normal) call->next_rx_timo = min(msecs_to_jiffies(p->timeouts.normal), 1UL); if (p->timeouts.idle) call->next_req_timo = min(msecs_to_jiffies(p->timeouts.idle), 1UL); if (p->timeouts.hard) call->hard_timo = p->timeouts.hard * HZ; ret = rxrpc_init_client_call_security(call); if (ret < 0) { rxrpc_prefail_call(call, RXRPC_CALL_LOCAL_ERROR, ret); rxrpc_put_call(call, rxrpc_call_put_discard_error); return ERR_PTR(ret); } rxrpc_set_call_state(call, RXRPC_CALL_CLIENT_AWAIT_CONN); trace_rxrpc_call(call->debug_id, refcount_read(&call->ref), p->user_call_ID, rxrpc_call_new_client); _leave(" = %p", call); return call; } /* * Initiate the call ack/resend/expiry timer. */ void rxrpc_start_call_timer(struct rxrpc_call *call) { unsigned long now = jiffies; unsigned long j = now + MAX_JIFFY_OFFSET; call->delay_ack_at = j; call->ack_lost_at = j; call->resend_at = j; call->ping_at = j; call->keepalive_at = j; call->expect_rx_by = j; call->expect_req_by = j; call->expect_term_by = j + call->hard_timo; call->timer.expires = now; } /* * Wait for a call slot to become available. */ static struct semaphore *rxrpc_get_call_slot(struct rxrpc_call_params *p, gfp_t gfp) { struct semaphore *limiter = &rxrpc_call_limiter; if (p->kernel) limiter = &rxrpc_kernel_call_limiter; if (p->interruptibility == RXRPC_UNINTERRUPTIBLE) { down(limiter); return limiter; } return down_interruptible(limiter) < 0 ? NULL : limiter; } /* * Release a call slot. */ static void rxrpc_put_call_slot(struct rxrpc_call *call) { struct semaphore *limiter = &rxrpc_call_limiter; if (test_bit(RXRPC_CALL_KERNEL, &call->flags)) limiter = &rxrpc_kernel_call_limiter; up(limiter); } /* * Start the process of connecting a call. We obtain a peer and a connection * bundle, but the actual association of a call with a connection is offloaded * to the I/O thread to simplify locking. */ static int rxrpc_connect_call(struct rxrpc_call *call, gfp_t gfp) { struct rxrpc_local *local = call->local; int ret = -ENOMEM; _enter("{%d,%lx},", call->debug_id, call->user_call_ID); ret = rxrpc_look_up_bundle(call, gfp); if (ret < 0) goto error; trace_rxrpc_client(NULL, -1, rxrpc_client_queue_new_call); rxrpc_get_call(call, rxrpc_call_get_io_thread); spin_lock(&local->client_call_lock); list_add_tail(&call->wait_link, &local->new_client_calls); spin_unlock(&local->client_call_lock); rxrpc_wake_up_io_thread(local); return 0; error: __set_bit(RXRPC_CALL_DISCONNECTED, &call->flags); return ret; } /* * Set up a call for the given parameters. * - Called with the socket lock held, which it must release. * - If it returns a call, the call's lock will need releasing by the caller. */ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx, struct rxrpc_conn_parameters *cp, struct rxrpc_call_params *p, gfp_t gfp, unsigned int debug_id) __releases(&rx->sk.sk_lock.slock) __acquires(&call->user_mutex) { struct rxrpc_call *call, *xcall; struct rxrpc_net *rxnet; struct semaphore *limiter; struct rb_node *parent, **pp; int ret; _enter("%p,%lx", rx, p->user_call_ID); if (WARN_ON_ONCE(!cp->peer)) { release_sock(&rx->sk); return ERR_PTR(-EIO); } limiter = rxrpc_get_call_slot(p, gfp); if (!limiter) { release_sock(&rx->sk); return ERR_PTR(-ERESTARTSYS); } call = rxrpc_alloc_client_call(rx, cp, p, gfp, debug_id); if (IS_ERR(call)) { release_sock(&rx->sk); up(limiter); _leave(" = %ld", PTR_ERR(call)); return call; } /* We need to protect a partially set up call against the user as we * will be acting outside the socket lock. */ mutex_lock(&call->user_mutex); /* Publish the call, even though it is incompletely set up as yet */ write_lock(&rx->call_lock); pp = &rx->calls.rb_node; parent = NULL; while (*pp) { parent = *pp; xcall = rb_entry(parent, struct rxrpc_call, sock_node); if (p->user_call_ID < xcall->user_call_ID) pp = &(*pp)->rb_left; else if (p->user_call_ID > xcall->user_call_ID) pp = &(*pp)->rb_right; else goto error_dup_user_ID; } rcu_assign_pointer(call->socket, rx); call->user_call_ID = p->user_call_ID; __set_bit(RXRPC_CALL_HAS_USERID, &call->flags); rxrpc_get_call(call, rxrpc_call_get_userid); rb_link_node(&call->sock_node, parent, pp); rb_insert_color(&call->sock_node, &rx->calls); list_add(&call->sock_link, &rx->sock_calls); write_unlock(&rx->call_lock); rxnet = call->rxnet; spin_lock(&rxnet->call_lock); list_add_tail_rcu(&call->link, &rxnet->calls); spin_unlock(&rxnet->call_lock); /* From this point on, the call is protected by its own lock. */ release_sock(&rx->sk); /* Set up or get a connection record and set the protocol parameters, * including channel number and call ID. */ ret = rxrpc_connect_call(call, gfp); if (ret < 0) goto error_attached_to_socket; _leave(" = %p [new]", call); return call; /* We unexpectedly found the user ID in the list after taking * the call_lock. This shouldn't happen unless the user races * with itself and tries to add the same user ID twice at the * same time in different threads. */ error_dup_user_ID: write_unlock(&rx->call_lock); release_sock(&rx->sk); rxrpc_prefail_call(call, RXRPC_CALL_LOCAL_ERROR, -EEXIST); trace_rxrpc_call(call->debug_id, refcount_read(&call->ref), 0, rxrpc_call_see_userid_exists); mutex_unlock(&call->user_mutex); rxrpc_put_call(call, rxrpc_call_put_userid_exists); _leave(" = -EEXIST"); return ERR_PTR(-EEXIST); /* We got an error, but the call is attached to the socket and is in * need of release. However, we might now race with recvmsg() when it * completion notifies the socket. Return 0 from sys_sendmsg() and * leave the error to recvmsg() to deal with. */ error_attached_to_socket: trace_rxrpc_call(call->debug_id, refcount_read(&call->ref), ret, rxrpc_call_see_connect_failed); rxrpc_set_call_completion(call, RXRPC_CALL_LOCAL_ERROR, 0, ret); _leave(" = c=%08x [err]", call->debug_id); return call; } /* * Set up an incoming call. call->conn points to the connection. * This is called in BH context and isn't allowed to fail. */ void rxrpc_incoming_call(struct rxrpc_sock *rx, struct rxrpc_call *call, struct sk_buff *skb) { struct rxrpc_connection *conn = call->conn; struct rxrpc_skb_priv *sp = rxrpc_skb(skb); u32 chan; _enter(",%d", call->conn->debug_id); rcu_assign_pointer(call->socket, rx); call->call_id = sp->hdr.callNumber; call->dest_srx.srx_service = sp->hdr.serviceId; call->cid = sp->hdr.cid; call->cong_tstamp = skb->tstamp; __set_bit(RXRPC_CALL_EXPOSED, &call->flags); rxrpc_set_call_state(call, RXRPC_CALL_SERVER_SECURING); spin_lock(&conn->state_lock); switch (conn->state) { case RXRPC_CONN_SERVICE_UNSECURED: case RXRPC_CONN_SERVICE_CHALLENGING: rxrpc_set_call_state(call, RXRPC_CALL_SERVER_SECURING); break; case RXRPC_CONN_SERVICE: rxrpc_set_call_state(call, RXRPC_CALL_SERVER_RECV_REQUEST); break; case RXRPC_CONN_ABORTED: rxrpc_set_call_completion(call, conn->completion, conn->abort_code, conn->error); break; default: BUG(); } rxrpc_get_call(call, rxrpc_call_get_io_thread); /* Set the channel for this call. We don't get channel_lock as we're * only defending against the data_ready handler (which we're called * from) and the RESPONSE packet parser (which is only really * interested in call_counter and can cope with a disagreement with the * call pointer). */ chan = sp->hdr.cid & RXRPC_CHANNELMASK; conn->channels[chan].call_counter = call->call_id; conn->channels[chan].call_id = call->call_id; conn->channels[chan].call = call; spin_unlock(&conn->state_lock); spin_lock(&conn->peer->lock); hlist_add_head(&call->error_link, &conn->peer->error_targets); spin_unlock(&conn->peer->lock); rxrpc_start_call_timer(call); _leave(""); } /* * Note the re-emergence of a call. */ void rxrpc_see_call(struct rxrpc_call *call, enum rxrpc_call_trace why) { if (call) { int r = refcount_read(&call->ref); trace_rxrpc_call(call->debug_id, r, 0, why); } } struct rxrpc_call *rxrpc_try_get_call(struct rxrpc_call *call, enum rxrpc_call_trace why) { int r; if (!call || !__refcount_inc_not_zero(&call->ref, &r)) return NULL; trace_rxrpc_call(call->debug_id, r + 1, 0, why); return call; } /* * Note the addition of a ref on a call. */ void rxrpc_get_call(struct rxrpc_call *call, enum rxrpc_call_trace why) { int r; __refcount_inc(&call->ref, &r); trace_rxrpc_call(call->debug_id, r + 1, 0, why); } /* * Clean up the Rx skb ring. */ static void rxrpc_cleanup_ring(struct rxrpc_call *call) { rxrpc_purge_queue(&call->recvmsg_queue); rxrpc_purge_queue(&call->rx_oos_queue); } /* * Detach a call from its owning socket. */ void rxrpc_release_call(struct rxrpc_sock *rx, struct rxrpc_call *call) { struct rxrpc_connection *conn = call->conn; bool put = false, putu = false; _enter("{%d,%d}", call->debug_id, refcount_read(&call->ref)); trace_rxrpc_call(call->debug_id, refcount_read(&call->ref), call->flags, rxrpc_call_see_release); if (test_and_set_bit(RXRPC_CALL_RELEASED, &call->flags)) BUG(); rxrpc_put_call_slot(call); /* Make sure we don't get any more notifications */ spin_lock(&rx->recvmsg_lock); if (!list_empty(&call->recvmsg_link)) { _debug("unlinking once-pending call %p { e=%lx f=%lx }", call, call->events, call->flags); list_del(&call->recvmsg_link); put = true; } /* list_empty() must return false in rxrpc_notify_socket() */ call->recvmsg_link.next = NULL; call->recvmsg_link.prev = NULL; spin_unlock(&rx->recvmsg_lock); if (put) rxrpc_put_call(call, rxrpc_call_put_unnotify); write_lock(&rx->call_lock); if (test_and_clear_bit(RXRPC_CALL_HAS_USERID, &call->flags)) { rb_erase(&call->sock_node, &rx->calls); memset(&call->sock_node, 0xdd, sizeof(call->sock_node)); putu = true; } list_del(&call->sock_link); write_unlock(&rx->call_lock); _debug("RELEASE CALL %p (%d CONN %p)", call, call->debug_id, conn); if (putu) rxrpc_put_call(call, rxrpc_call_put_userid); _leave(""); } /* * release all the calls associated with a socket */ void rxrpc_release_calls_on_socket(struct rxrpc_sock *rx) { struct rxrpc_call *call; _enter("%p", rx); while (!list_empty(&rx->to_be_accepted)) { call = list_entry(rx->to_be_accepted.next, struct rxrpc_call, accept_link); list_del(&call->accept_link); rxrpc_propose_abort(call, RX_CALL_DEAD, -ECONNRESET, rxrpc_abort_call_sock_release_tba); rxrpc_put_call(call, rxrpc_call_put_release_sock_tba); } while (!list_empty(&rx->sock_calls)) { call = list_entry(rx->sock_calls.next, struct rxrpc_call, sock_link); rxrpc_get_call(call, rxrpc_call_get_release_sock); rxrpc_propose_abort(call, RX_CALL_DEAD, -ECONNRESET, rxrpc_abort_call_sock_release); rxrpc_release_call(rx, call); rxrpc_put_call(call, rxrpc_call_put_release_sock); } _leave(""); } /* * release a call */ void rxrpc_put_call(struct rxrpc_call *call, enum rxrpc_call_trace why) { struct rxrpc_net *rxnet = call->rxnet; unsigned int debug_id = call->debug_id; bool dead; int r; ASSERT(call != NULL); dead = __refcount_dec_and_test(&call->ref, &r); trace_rxrpc_call(debug_id, r - 1, 0, why); if (dead) { ASSERTCMP(__rxrpc_call_state(call), ==, RXRPC_CALL_COMPLETE); if (!list_empty(&call->link)) { spin_lock(&rxnet->call_lock); list_del_init(&call->link); spin_unlock(&rxnet->call_lock); } rxrpc_cleanup_call(call); } } /* * Free up the call under RCU. */ static void rxrpc_rcu_free_call(struct rcu_head *rcu) { struct rxrpc_call *call = container_of(rcu, struct rxrpc_call, rcu); struct rxrpc_net *rxnet = READ_ONCE(call->rxnet); kmem_cache_free(rxrpc_call_jar, call); if (atomic_dec_and_test(&rxnet->nr_calls)) wake_up_var(&rxnet->nr_calls); } /* * Final call destruction - but must be done in process context. */ static void rxrpc_destroy_call(struct work_struct *work) { struct rxrpc_call *call = container_of(work, struct rxrpc_call, destroyer); struct rxrpc_txbuf *txb; del_timer_sync(&call->timer); rxrpc_cleanup_ring(call); while ((txb = list_first_entry_or_null(&call->tx_sendmsg, struct rxrpc_txbuf, call_link))) { list_del(&txb->call_link); rxrpc_put_txbuf(txb, rxrpc_txbuf_put_cleaned); } while ((txb = list_first_entry_or_null(&call->tx_buffer, struct rxrpc_txbuf, call_link))) { list_del(&txb->call_link); rxrpc_put_txbuf(txb, rxrpc_txbuf_put_cleaned); } rxrpc_put_txbuf(call->tx_pending, rxrpc_txbuf_put_cleaned); rxrpc_put_connection(call->conn, rxrpc_conn_put_call); rxrpc_deactivate_bundle(call->bundle); rxrpc_put_bundle(call->bundle, rxrpc_bundle_put_call); rxrpc_put_peer(call->peer, rxrpc_peer_put_call); rxrpc_put_local(call->local, rxrpc_local_put_call); call_rcu(&call->rcu, rxrpc_rcu_free_call); } /* * clean up a call */ void rxrpc_cleanup_call(struct rxrpc_call *call) { memset(&call->sock_node, 0xcd, sizeof(call->sock_node)); ASSERTCMP(__rxrpc_call_state(call), ==, RXRPC_CALL_COMPLETE); ASSERT(test_bit(RXRPC_CALL_RELEASED, &call->flags)); del_timer(&call->timer); if (rcu_read_lock_held()) /* Can't use the rxrpc workqueue as we need to cancel/flush * something that may be running/waiting there. */ schedule_work(&call->destroyer); else rxrpc_destroy_call(&call->destroyer); } /* * Make sure that all calls are gone from a network namespace. To reach this * point, any open UDP sockets in that namespace must have been closed, so any * outstanding calls cannot be doing I/O. */ void rxrpc_destroy_all_calls(struct rxrpc_net *rxnet) { struct rxrpc_call *call; _enter(""); if (!list_empty(&rxnet->calls)) { spin_lock(&rxnet->call_lock); while (!list_empty(&rxnet->calls)) { call = list_entry(rxnet->calls.next, struct rxrpc_call, link); _debug("Zapping call %p", call); rxrpc_see_call(call, rxrpc_call_see_zap); list_del_init(&call->link); pr_err("Call %p still in use (%d,%s,%lx,%lx)!\n", call, refcount_read(&call->ref), rxrpc_call_states[__rxrpc_call_state(call)], call->flags, call->events); spin_unlock(&rxnet->call_lock); cond_resched(); spin_lock(&rxnet->call_lock); } spin_unlock(&rxnet->call_lock); } atomic_dec(&rxnet->nr_calls); wait_var_event(&rxnet->nr_calls, !atomic_read(&rxnet->nr_calls)); } |
| 475 475 1 1 1 1 475 474 475 474 475 475 1 475 1 475 475 475 1 475 474 474 474 1 475 473 475 1 475 475 447 447 437 440 1 1 1 1 1 1 1 1 1 474 474 475 475 475 475 475 474 475 475 475 474 458 458 148 458 387 458 475 438 437 438 502 502 502 502 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 | // SPDX-License-Identifier: GPL-2.0-or-later /* SCTP kernel implementation * (C) Copyright IBM Corp. 2001, 2004 * Copyright (c) 1999-2000 Cisco, Inc. * Copyright (c) 1999-2001 Motorola, Inc. * Copyright (c) 2001-2002 Intel Corp. * * This file is part of the SCTP kernel implementation * * These functions work with the state functions in sctp_sm_statefuns.c * to implement the state operations. These functions implement the * steps which require modifying existing data structures. * * Please send any bug reports or fixes you make to the * email address(es): * lksctp developers <linux-sctp@vger.kernel.org> * * Written or modified by: * La Monte H.P. Yarroll <piggy@acm.org> * Karl Knutson <karl@athena.chicago.il.us> * C. Robin <chris@hundredacre.ac.uk> * Jon Grimm <jgrimm@us.ibm.com> * Xingang Guo <xingang.guo@intel.com> * Dajiang Zhang <dajiang.zhang@nokia.com> * Sridhar Samudrala <sri@us.ibm.com> * Daisy Chang <daisyc@us.ibm.com> * Ardelle Fan <ardelle.fan@intel.com> * Kevin Gao <kevin.gao@intel.com> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <crypto/hash.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/ip.h> #include <linux/ipv6.h> #include <linux/net.h> #include <linux/inet.h> #include <linux/scatterlist.h> #include <linux/slab.h> #include <net/sock.h> #include <linux/skbuff.h> #include <linux/random.h> /* for get_random_bytes */ #include <net/sctp/sctp.h> #include <net/sctp/sm.h> static struct sctp_chunk *sctp_make_control(const struct sctp_association *asoc, __u8 type, __u8 flags, int paylen, gfp_t gfp); static struct sctp_chunk *sctp_make_data(const struct sctp_association *asoc, __u8 flags, int paylen, gfp_t gfp); static struct sctp_chunk *_sctp_make_chunk(const struct sctp_association *asoc, __u8 type, __u8 flags, int paylen, gfp_t gfp); static struct sctp_cookie_param *sctp_pack_cookie( const struct sctp_endpoint *ep, const struct sctp_association *asoc, const struct sctp_chunk *init_chunk, int *cookie_len, const __u8 *raw_addrs, int addrs_len); static int sctp_process_param(struct sctp_association *asoc, union sctp_params param, const union sctp_addr *peer_addr, gfp_t gfp); static void *sctp_addto_param(struct sctp_chunk *chunk, int len, const void *data); /* Control chunk destructor */ static void sctp_control_release_owner(struct sk_buff *skb) { struct sctp_chunk *chunk = skb_shinfo(skb)->destructor_arg; if (chunk->shkey) { struct sctp_shared_key *shkey = chunk->shkey; struct sctp_association *asoc = chunk->asoc; /* refcnt == 2 and !list_empty mean after this release, it's * not being used anywhere, and it's time to notify userland * that this shkey can be freed if it's been deactivated. */ if (shkey->deactivated && !list_empty(&shkey->key_list) && refcount_read(&shkey->refcnt) == 2) { struct sctp_ulpevent *ev; ev = sctp_ulpevent_make_authkey(asoc, shkey->key_id, SCTP_AUTH_FREE_KEY, GFP_KERNEL); if (ev) asoc->stream.si->enqueue_event(&asoc->ulpq, ev); } sctp_auth_shkey_release(chunk->shkey); } } static void sctp_control_set_owner_w(struct sctp_chunk *chunk) { struct sctp_association *asoc = chunk->asoc; struct sk_buff *skb = chunk->skb; /* TODO: properly account for control chunks. * To do it right we'll need: * 1) endpoint if association isn't known. * 2) proper memory accounting. * * For now don't do anything for now. */ if (chunk->auth) { chunk->shkey = asoc->shkey; sctp_auth_shkey_hold(chunk->shkey); } skb->sk = asoc ? asoc->base.sk : NULL; skb_shinfo(skb)->destructor_arg = chunk; skb->destructor = sctp_control_release_owner; } /* What was the inbound interface for this chunk? */ int sctp_chunk_iif(const struct sctp_chunk *chunk) { struct sk_buff *skb = chunk->skb; return SCTP_INPUT_CB(skb)->af->skb_iif(skb); } /* RFC 2960 3.3.2 Initiation (INIT) (1) * * Note 2: The ECN capable field is reserved for future use of * Explicit Congestion Notification. */ static const struct sctp_paramhdr ecap_param = { SCTP_PARAM_ECN_CAPABLE, cpu_to_be16(sizeof(struct sctp_paramhdr)), }; static const struct sctp_paramhdr prsctp_param = { SCTP_PARAM_FWD_TSN_SUPPORT, cpu_to_be16(sizeof(struct sctp_paramhdr)), }; /* A helper to initialize an op error inside a provided chunk, as most * cause codes will be embedded inside an abort chunk. */ int sctp_init_cause(struct sctp_chunk *chunk, __be16 cause_code, size_t paylen) { struct sctp_errhdr err; __u16 len; /* Cause code constants are now defined in network order. */ err.cause = cause_code; len = sizeof(err) + paylen; err.length = htons(len); if (skb_tailroom(chunk->skb) < len) return -ENOSPC; chunk->subh.err_hdr = sctp_addto_chunk(chunk, sizeof(err), &err); return 0; } /* 3.3.2 Initiation (INIT) (1) * * This chunk is used to initiate a SCTP association between two * endpoints. The format of the INIT chunk is shown below: * * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Type = 1 | Chunk Flags | Chunk Length | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Initiate Tag | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Advertised Receiver Window Credit (a_rwnd) | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Number of Outbound Streams | Number of Inbound Streams | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Initial TSN | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * \ \ * / Optional/Variable-Length Parameters / * \ \ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * * * The INIT chunk contains the following parameters. Unless otherwise * noted, each parameter MUST only be included once in the INIT chunk. * * Fixed Parameters Status * ---------------------------------------------- * Initiate Tag Mandatory * Advertised Receiver Window Credit Mandatory * Number of Outbound Streams Mandatory * Number of Inbound Streams Mandatory * Initial TSN Mandatory * * Variable Parameters Status Type Value * ------------------------------------------------------------- * IPv4 Address (Note 1) Optional 5 * IPv6 Address (Note 1) Optional 6 * Cookie Preservative Optional 9 * Reserved for ECN Capable (Note 2) Optional 32768 (0x8000) * Host Name Address (Note 3) Optional 11 * Supported Address Types (Note 4) Optional 12 */ struct sctp_chunk *sctp_make_init(const struct sctp_association *asoc, const struct sctp_bind_addr *bp, gfp_t gfp, int vparam_len) { struct sctp_supported_ext_param ext_param; struct sctp_adaptation_ind_param aiparam; struct sctp_paramhdr *auth_chunks = NULL; struct sctp_paramhdr *auth_hmacs = NULL; struct sctp_supported_addrs_param sat; struct sctp_endpoint *ep = asoc->ep; struct sctp_chunk *retval = NULL; int num_types, addrs_len = 0; struct sctp_inithdr init; union sctp_params addrs; struct sctp_sock *sp; __u8 extensions[5]; size_t chunksize; __be16 types[2]; int num_ext = 0; /* RFC 2960 3.3.2 Initiation (INIT) (1) * * Note 1: The INIT chunks can contain multiple addresses that * can be IPv4 and/or IPv6 in any combination. */ /* Convert the provided bind address list to raw format. */ addrs = sctp_bind_addrs_to_raw(bp, &addrs_len, gfp); init.init_tag = htonl(asoc->c.my_vtag); init.a_rwnd = htonl(asoc->rwnd); init.num_outbound_streams = htons(asoc->c.sinit_num_ostreams); init.num_inbound_streams = htons(asoc->c.sinit_max_instreams); init.initial_tsn = htonl(asoc->c.initial_tsn); /* How many address types are needed? */ sp = sctp_sk(asoc->base.sk); num_types = sp->pf->supported_addrs(sp, types); chunksize = sizeof(init) + addrs_len; chunksize += SCTP_PAD4(SCTP_SAT_LEN(num_types)); if (asoc->ep->ecn_enable) chunksize += sizeof(ecap_param); if (asoc->ep->prsctp_enable) chunksize += sizeof(prsctp_param); /* ADDIP: Section 4.2.7: * An implementation supporting this extension [ADDIP] MUST list * the ASCONF,the ASCONF-ACK, and the AUTH chunks in its INIT and * INIT-ACK parameters. */ if (asoc->ep->asconf_enable) { extensions[num_ext] = SCTP_CID_ASCONF; extensions[num_ext+1] = SCTP_CID_ASCONF_ACK; num_ext += 2; } if (asoc->ep->reconf_enable) { extensions[num_ext] = SCTP_CID_RECONF; num_ext += 1; } if (sp->adaptation_ind) chunksize += sizeof(aiparam); if (asoc->ep->intl_enable) { extensions[num_ext] = SCTP_CID_I_DATA; num_ext += 1; } chunksize += vparam_len; /* Account for AUTH related parameters */ if (ep->auth_enable) { /* Add random parameter length*/ chunksize += sizeof(asoc->c.auth_random); /* Add HMACS parameter length if any were defined */ auth_hmacs = (struct sctp_paramhdr *)asoc->c.auth_hmacs; if (auth_hmacs->length) chunksize += SCTP_PAD4(ntohs(auth_hmacs->length)); else auth_hmacs = NULL; /* Add CHUNKS parameter length */ auth_chunks = (struct sctp_paramhdr *)asoc->c.auth_chunks; if (auth_chunks->length) chunksize += SCTP_PAD4(ntohs(auth_chunks->length)); else auth_chunks = NULL; extensions[num_ext] = SCTP_CID_AUTH; num_ext += 1; } /* If we have any extensions to report, account for that */ if (num_ext) chunksize += SCTP_PAD4(sizeof(ext_param) + num_ext); /* RFC 2960 3.3.2 Initiation (INIT) (1) * * Note 3: An INIT chunk MUST NOT contain more than one Host * Name address parameter. Moreover, the sender of the INIT * MUST NOT combine any other address types with the Host Name * address in the INIT. The receiver of INIT MUST ignore any * other address types if the Host Name address parameter is * present in the received INIT chunk. * * PLEASE DO NOT FIXME [This version does not support Host Name.] */ retval = sctp_make_control(asoc, SCTP_CID_INIT, 0, chunksize, gfp); if (!retval) goto nodata; retval->subh.init_hdr = sctp_addto_chunk(retval, sizeof(init), &init); retval->param_hdr.v = sctp_addto_chunk(retval, addrs_len, addrs.v); /* RFC 2960 3.3.2 Initiation (INIT) (1) * * Note 4: This parameter, when present, specifies all the * address types the sending endpoint can support. The absence * of this parameter indicates that the sending endpoint can * support any address type. */ sat.param_hdr.type = SCTP_PARAM_SUPPORTED_ADDRESS_TYPES; sat.param_hdr.length = htons(SCTP_SAT_LEN(num_types)); sctp_addto_chunk(retval, sizeof(sat), &sat); sctp_addto_chunk(retval, num_types * sizeof(__u16), &types); if (asoc->ep->ecn_enable) sctp_addto_chunk(retval, sizeof(ecap_param), &ecap_param); /* Add the supported extensions parameter. Be nice and add this * fist before addiding the parameters for the extensions themselves */ if (num_ext) { ext_param.param_hdr.type = SCTP_PARAM_SUPPORTED_EXT; ext_param.param_hdr.length = htons(sizeof(ext_param) + num_ext); sctp_addto_chunk(retval, sizeof(ext_param), &ext_param); sctp_addto_param(retval, num_ext, extensions); } if (asoc->ep->prsctp_enable) sctp_addto_chunk(retval, sizeof(prsctp_param), &prsctp_param); if (sp->adaptation_ind) { aiparam.param_hdr.type = SCTP_PARAM_ADAPTATION_LAYER_IND; aiparam.param_hdr.length = htons(sizeof(aiparam)); aiparam.adaptation_ind = htonl(sp->adaptation_ind); sctp_addto_chunk(retval, sizeof(aiparam), &aiparam); } /* Add SCTP-AUTH chunks to the parameter list */ if (ep->auth_enable) { sctp_addto_chunk(retval, sizeof(asoc->c.auth_random), asoc->c.auth_random); if (auth_hmacs) sctp_addto_chunk(retval, ntohs(auth_hmacs->length), auth_hmacs); if (auth_chunks) sctp_addto_chunk(retval, ntohs(auth_chunks->length), auth_chunks); } nodata: kfree(addrs.v); return retval; } struct sctp_chunk *sctp_make_init_ack(const struct sctp_association *asoc, const struct sctp_chunk *chunk, gfp_t gfp, int unkparam_len) { struct sctp_supported_ext_param ext_param; struct sctp_adaptation_ind_param aiparam; struct sctp_paramhdr *auth_chunks = NULL; struct sctp_paramhdr *auth_random = NULL; struct sctp_paramhdr *auth_hmacs = NULL; struct sctp_chunk *retval = NULL; struct sctp_cookie_param *cookie; struct sctp_inithdr initack; union sctp_params addrs; struct sctp_sock *sp; __u8 extensions[5]; size_t chunksize; int num_ext = 0; int cookie_len; int addrs_len; /* Note: there may be no addresses to embed. */ addrs = sctp_bind_addrs_to_raw(&asoc->base.bind_addr, &addrs_len, gfp); initack.init_tag = htonl(asoc->c.my_vtag); initack.a_rwnd = htonl(asoc->rwnd); initack.num_outbound_streams = htons(asoc->c.sinit_num_ostreams); initack.num_inbound_streams = htons(asoc->c.sinit_max_instreams); initack.initial_tsn = htonl(asoc->c.initial_tsn); /* FIXME: We really ought to build the cookie right * into the packet instead of allocating more fresh memory. */ cookie = sctp_pack_cookie(asoc->ep, asoc, chunk, &cookie_len, addrs.v, addrs_len); if (!cookie) goto nomem_cookie; /* Calculate the total size of allocation, include the reserved * space for reporting unknown parameters if it is specified. */ sp = sctp_sk(asoc->base.sk); chunksize = sizeof(initack) + addrs_len + cookie_len + unkparam_len; /* Tell peer that we'll do ECN only if peer advertised such cap. */ if (asoc->peer.ecn_capable) chunksize += sizeof(ecap_param); if (asoc->peer.prsctp_capable) chunksize += sizeof(prsctp_param); if (asoc->peer.asconf_capable) { extensions[num_ext] = SCTP_CID_ASCONF; extensions[num_ext+1] = SCTP_CID_ASCONF_ACK; num_ext += 2; } if (asoc->peer.reconf_capable) { extensions[num_ext] = SCTP_CID_RECONF; num_ext += 1; } if (sp->adaptation_ind) chunksize += sizeof(aiparam); if (asoc->peer.intl_capable) { extensions[num_ext] = SCTP_CID_I_DATA; num_ext += 1; } if (asoc->peer.auth_capable) { auth_random = (struct sctp_paramhdr *)asoc->c.auth_random; chunksize += ntohs(auth_random->length); auth_hmacs = (struct sctp_paramhdr *)asoc->c.auth_hmacs; if (auth_hmacs->length) chunksize += SCTP_PAD4(ntohs(auth_hmacs->length)); else auth_hmacs = NULL; auth_chunks = (struct sctp_paramhdr *)asoc->c.auth_chunks; if (auth_chunks->length) chunksize += SCTP_PAD4(ntohs(auth_chunks->length)); else auth_chunks = NULL; extensions[num_ext] = SCTP_CID_AUTH; num_ext += 1; } if (num_ext) chunksize += SCTP_PAD4(sizeof(ext_param) + num_ext); /* Now allocate and fill out the chunk. */ retval = sctp_make_control(asoc, SCTP_CID_INIT_ACK, 0, chunksize, gfp); if (!retval) goto nomem_chunk; /* RFC 2960 6.4 Multi-homed SCTP Endpoints * * An endpoint SHOULD transmit reply chunks (e.g., SACK, * HEARTBEAT ACK, * etc.) to the same destination transport * address from which it received the DATA or control chunk * to which it is replying. * * [INIT ACK back to where the INIT came from.] */ if (chunk->transport) retval->transport = sctp_assoc_lookup_paddr(asoc, &chunk->transport->ipaddr); retval->subh.init_hdr = sctp_addto_chunk(retval, sizeof(initack), &initack); retval->param_hdr.v = sctp_addto_chunk(retval, addrs_len, addrs.v); sctp_addto_chunk(retval, cookie_len, cookie); if (asoc->peer.ecn_capable) sctp_addto_chunk(retval, sizeof(ecap_param), &ecap_param); if (num_ext) { ext_param.param_hdr.type = SCTP_PARAM_SUPPORTED_EXT; ext_param.param_hdr.length = htons(sizeof(ext_param) + num_ext); sctp_addto_chunk(retval, sizeof(ext_param), &ext_param); sctp_addto_param(retval, num_ext, extensions); } if (asoc->peer.prsctp_capable) sctp_addto_chunk(retval, sizeof(prsctp_param), &prsctp_param); if (sp->adaptation_ind) { aiparam.param_hdr.type = SCTP_PARAM_ADAPTATION_LAYER_IND; aiparam.param_hdr.length = htons(sizeof(aiparam)); aiparam.adaptation_ind = htonl(sp->adaptation_ind); sctp_addto_chunk(retval, sizeof(aiparam), &aiparam); } if (asoc->peer.auth_capable) { sctp_addto_chunk(retval, ntohs(auth_random->length), auth_random); if (auth_hmacs) sctp_addto_chunk(retval, ntohs(auth_hmacs->length), auth_hmacs); if (auth_chunks) sctp_addto_chunk(retval, ntohs(auth_chunks->length), auth_chunks); } /* We need to remove the const qualifier at this point. */ retval->asoc = (struct sctp_association *) asoc; nomem_chunk: kfree(cookie); nomem_cookie: kfree(addrs.v); return retval; } /* 3.3.11 Cookie Echo (COOKIE ECHO) (10): * * This chunk is used only during the initialization of an association. * It is sent by the initiator of an association to its peer to complete * the initialization process. This chunk MUST precede any DATA chunk * sent within the association, but MAY be bundled with one or more DATA * chunks in the same packet. * * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Type = 10 |Chunk Flags | Length | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * / Cookie / * \ \ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * * Chunk Flags: 8 bit * * Set to zero on transmit and ignored on receipt. * * Length: 16 bits (unsigned integer) * * Set to the size of the chunk in bytes, including the 4 bytes of * the chunk header and the size of the Cookie. * * Cookie: variable size * * This field must contain the exact cookie received in the * State Cookie parameter from the previous INIT ACK. * * An implementation SHOULD make the cookie as small as possible * to insure interoperability. */ struct sctp_chunk *sctp_make_cookie_echo(const struct sctp_association *asoc, const struct sctp_chunk *chunk) { struct sctp_chunk *retval; int cookie_len; void *cookie; cookie = asoc->peer.cookie; cookie_len = asoc->peer.cookie_len; /* Build a cookie echo chunk. */ retval = sctp_make_control(asoc, SCTP_CID_COOKIE_ECHO, 0, cookie_len, GFP_ATOMIC); if (!retval) goto nodata; retval->subh.cookie_hdr = sctp_addto_chunk(retval, cookie_len, cookie); /* RFC 2960 6.4 Multi-homed SCTP Endpoints * * An endpoint SHOULD transmit reply chunks (e.g., SACK, * HEARTBEAT ACK, * etc.) to the same destination transport * address from which it * received the DATA or control chunk * to which it is replying. * * [COOKIE ECHO back to where the INIT ACK came from.] */ if (chunk) retval->transport = chunk->transport; nodata: return retval; } /* 3.3.12 Cookie Acknowledgement (COOKIE ACK) (11): * * This chunk is used only during the initialization of an * association. It is used to acknowledge the receipt of a COOKIE * ECHO chunk. This chunk MUST precede any DATA or SACK chunk sent * within the association, but MAY be bundled with one or more DATA * chunks or SACK chunk in the same SCTP packet. * * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Type = 11 |Chunk Flags | Length = 4 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * * Chunk Flags: 8 bits * * Set to zero on transmit and ignored on receipt. */ struct sctp_chunk *sctp_make_cookie_ack(const struct sctp_association *asoc, const struct sctp_chunk *chunk) { struct sctp_chunk *retval; retval = sctp_make_control(asoc, SCTP_CID_COOKIE_ACK, 0, 0, GFP_ATOMIC); /* RFC 2960 6.4 Multi-homed SCTP Endpoints * * An endpoint SHOULD transmit reply chunks (e.g., SACK, * HEARTBEAT ACK, * etc.) to the same destination transport * address from which it * received the DATA or control chunk * to which it is replying. * * [COOKIE ACK back to where the COOKIE ECHO came from.] */ if (retval && chunk && chunk->transport) retval->transport = sctp_assoc_lookup_paddr(asoc, &chunk->transport->ipaddr); return retval; } /* * Appendix A: Explicit Congestion Notification: * CWR: * * RFC 2481 details a specific bit for a sender to send in the header of * its next outbound TCP segment to indicate to its peer that it has * reduced its congestion window. This is termed the CWR bit. For * SCTP the same indication is made by including the CWR chunk. * This chunk contains one data element, i.e. the TSN number that * was sent in the ECNE chunk. This element represents the lowest * TSN number in the datagram that was originally marked with the * CE bit. * * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Chunk Type=13 | Flags=00000000| Chunk Length = 8 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Lowest TSN Number | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * * Note: The CWR is considered a Control chunk. */ struct sctp_chunk *sctp_make_cwr(const struct sctp_association *asoc, const __u32 lowest_tsn, const struct sctp_chunk *chunk) { struct sctp_chunk *retval; struct sctp_cwrhdr cwr; cwr.lowest_tsn = htonl(lowest_tsn); retval = sctp_make_control(asoc, SCTP_CID_ECN_CWR, 0, sizeof(cwr), GFP_ATOMIC); if (!retval) goto nodata; retval->subh.ecn_cwr_hdr = sctp_addto_chunk(retval, sizeof(cwr), &cwr); /* RFC 2960 6.4 Multi-homed SCTP Endpoints * * An endpoint SHOULD transmit reply chunks (e.g., SACK, * HEARTBEAT ACK, * etc.) to the same destination transport * address from which it * received the DATA or control chunk * to which it is replying. * * [Report a reduced congestion window back to where the ECNE * came from.] */ if (chunk) retval->transport = chunk->transport; nodata: return retval; } /* Make an ECNE chunk. This is a congestion experienced report. */ struct sctp_chunk *sctp_make_ecne(const struct sctp_association *asoc, const __u32 lowest_tsn) { struct sctp_chunk *retval; struct sctp_ecnehdr ecne; ecne.lowest_tsn = htonl(lowest_tsn); retval = sctp_make_control(asoc, SCTP_CID_ECN_ECNE, 0, sizeof(ecne), GFP_ATOMIC); if (!retval) goto nodata; retval->subh.ecne_hdr = sctp_addto_chunk(retval, sizeof(ecne), &ecne); nodata: return retval; } /* Make a DATA chunk for the given association from the provided * parameters. However, do not populate the data payload. */ struct sctp_chunk *sctp_make_datafrag_empty(const struct sctp_association *asoc, const struct sctp_sndrcvinfo *sinfo, int len, __u8 flags, gfp_t gfp) { struct sctp_chunk *retval; struct sctp_datahdr dp; /* We assign the TSN as LATE as possible, not here when * creating the chunk. */ memset(&dp, 0, sizeof(dp)); dp.ppid = sinfo->sinfo_ppid; dp.stream = htons(sinfo->sinfo_stream); /* Set the flags for an unordered send. */ if (sinfo->sinfo_flags & SCTP_UNORDERED) flags |= SCTP_DATA_UNORDERED; retval = sctp_make_data(asoc, flags, sizeof(dp) + len, gfp); if (!retval) return NULL; retval->subh.data_hdr = sctp_addto_chunk(retval, sizeof(dp), &dp); memcpy(&retval->sinfo, sinfo, sizeof(struct sctp_sndrcvinfo)); return retval; } /* Create a selective ackowledgement (SACK) for the given * association. This reports on which TSN's we've seen to date, * including duplicates and gaps. */ struct sctp_chunk *sctp_make_sack(struct sctp_association *asoc) { struct sctp_tsnmap *map = (struct sctp_tsnmap *)&asoc->peer.tsn_map; struct sctp_gap_ack_block gabs[SCTP_MAX_GABS]; __u16 num_gabs, num_dup_tsns; struct sctp_transport *trans; struct sctp_chunk *retval; struct sctp_sackhdr sack; __u32 ctsn; int len; memset(gabs, 0, sizeof(gabs)); ctsn = sctp_tsnmap_get_ctsn(map); pr_debug("%s: sackCTSNAck sent:0x%x\n", __func__, ctsn); /* How much room is needed in the chunk? */ num_gabs = sctp_tsnmap_num_gabs(map, gabs); num_dup_tsns = sctp_tsnmap_num_dups(map); /* Initialize the SACK header. */ sack.cum_tsn_ack = htonl(ctsn); sack.a_rwnd = htonl(asoc->a_rwnd); sack.num_gap_ack_blocks = htons(num_gabs); sack.num_dup_tsns = htons(num_dup_tsns); len = sizeof(sack) + sizeof(struct sctp_gap_ack_block) * num_gabs + sizeof(__u32) * num_dup_tsns; /* Create the chunk. */ retval = sctp_make_control(asoc, SCTP_CID_SACK, 0, len, GFP_ATOMIC); if (!retval) goto nodata; /* RFC 2960 6.4 Multi-homed SCTP Endpoints * * An endpoint SHOULD transmit reply chunks (e.g., SACK, * HEARTBEAT ACK, etc.) to the same destination transport * address from which it received the DATA or control chunk to * which it is replying. This rule should also be followed if * the endpoint is bundling DATA chunks together with the * reply chunk. * * However, when acknowledging multiple DATA chunks received * in packets from different source addresses in a single * SACK, the SACK chunk may be transmitted to one of the * destination transport addresses from which the DATA or * control chunks being acknowledged were received. * * [BUG: We do not implement the following paragraph. * Perhaps we should remember the last transport we used for a * SACK and avoid that (if possible) if we have seen any * duplicates. --piggy] * * When a receiver of a duplicate DATA chunk sends a SACK to a * multi- homed endpoint it MAY be beneficial to vary the * destination address and not use the source address of the * DATA chunk. The reason being that receiving a duplicate * from a multi-homed endpoint might indicate that the return * path (as specified in the source address of the DATA chunk) * for the SACK is broken. * * [Send to the address from which we last received a DATA chunk.] */ retval->transport = asoc->peer.last_data_from; retval->subh.sack_hdr = sctp_addto_chunk(retval, sizeof(sack), &sack); /* Add the gap ack block information. */ if (num_gabs) sctp_addto_chunk(retval, sizeof(__u32) * num_gabs, gabs); /* Add the duplicate TSN information. */ if (num_dup_tsns) { asoc->stats.idupchunks += num_dup_tsns; sctp_addto_chunk(retval, sizeof(__u32) * num_dup_tsns, sctp_tsnmap_get_dups(map)); } /* Once we have a sack generated, check to see what our sack * generation is, if its 0, reset the transports to 0, and reset * the association generation to 1 * * The idea is that zero is never used as a valid generation for the * association so no transport will match after a wrap event like this, * Until the next sack */ if (++asoc->peer.sack_generation == 0) { list_for_each_entry(trans, &asoc->peer.transport_addr_list, transports) trans->sack_generation = 0; asoc->peer.sack_generation = 1; } nodata: return retval; } /* Make a SHUTDOWN chunk. */ struct sctp_chunk *sctp_make_shutdown(const struct sctp_association *asoc, const struct sctp_chunk *chunk) { struct sctp_shutdownhdr shut; struct sctp_chunk *retval; __u32 ctsn; ctsn = sctp_tsnmap_get_ctsn(&asoc->peer.tsn_map); shut.cum_tsn_ack = htonl(ctsn); retval = sctp_make_control(asoc, SCTP_CID_SHUTDOWN, 0, sizeof(shut), GFP_ATOMIC); if (!retval) goto nodata; retval->subh.shutdown_hdr = sctp_addto_chunk(retval, sizeof(shut), &shut); if (chunk) retval->transport = chunk->transport; nodata: return retval; } struct sctp_chunk *sctp_make_shutdown_ack(const struct sctp_association *asoc, const struct sctp_chunk *chunk) { struct sctp_chunk *retval; retval = sctp_make_control(asoc, SCTP_CID_SHUTDOWN_ACK, 0, 0, GFP_ATOMIC); /* RFC 2960 6.4 Multi-homed SCTP Endpoints * * An endpoint SHOULD transmit reply chunks (e.g., SACK, * HEARTBEAT ACK, * etc.) to the same destination transport * address from which it * received the DATA or control chunk * to which it is replying. * * [ACK back to where the SHUTDOWN came from.] */ if (retval && chunk) retval->transport = chunk->transport; return retval; } struct sctp_chunk *sctp_make_shutdown_complete( const struct sctp_association *asoc, const struct sctp_chunk *chunk) { struct sctp_chunk *retval; __u8 flags = 0; /* Set the T-bit if we have no association (vtag will be * reflected) */ flags |= asoc ? 0 : SCTP_CHUNK_FLAG_T; retval = sctp_make_control(asoc, SCTP_CID_SHUTDOWN_COMPLETE, flags, 0, GFP_ATOMIC); /* RFC 2960 6.4 Multi-homed SCTP Endpoints * * An endpoint SHOULD transmit reply chunks (e.g., SACK, * HEARTBEAT ACK, * etc.) to the same destination transport * address from which it * received the DATA or control chunk * to which it is replying. * * [Report SHUTDOWN COMPLETE back to where the SHUTDOWN ACK * came from.] */ if (retval && chunk) retval->transport = chunk->transport; return retval; } /* Create an ABORT. Note that we set the T bit if we have no * association, except when responding to an INIT (sctpimpguide 2.41). */ struct sctp_chunk *sctp_make_abort(const struct sctp_association *asoc, const struct sctp_chunk *chunk, const size_t hint) { struct sctp_chunk *retval; __u8 flags = 0; /* Set the T-bit if we have no association and 'chunk' is not * an INIT (vtag will be reflected). */ if (!asoc) { if (chunk && chunk->chunk_hdr && chunk->chunk_hdr->type == SCTP_CID_INIT) flags = 0; else flags = SCTP_CHUNK_FLAG_T; } retval = sctp_make_control(asoc, SCTP_CID_ABORT, flags, hint, GFP_ATOMIC); /* RFC 2960 6.4 Multi-homed SCTP Endpoints * * An endpoint SHOULD transmit reply chunks (e.g., SACK, * HEARTBEAT ACK, * etc.) to the same destination transport * address from which it * received the DATA or control chunk * to which it is replying. * * [ABORT back to where the offender came from.] */ if (retval && chunk) retval->transport = chunk->transport; return retval; } /* Helper to create ABORT with a NO_USER_DATA error. */ struct sctp_chunk *sctp_make_abort_no_data( const struct sctp_association *asoc, const struct sctp_chunk *chunk, __u32 tsn) { struct sctp_chunk *retval; __be32 payload; retval = sctp_make_abort(asoc, chunk, sizeof(struct sctp_errhdr) + sizeof(tsn)); if (!retval) goto no_mem; /* Put the tsn back into network byte order. */ payload = htonl(tsn); sctp_init_cause(retval, SCTP_ERROR_NO_DATA, sizeof(payload)); sctp_addto_chunk(retval, sizeof(payload), (const void *)&payload); /* RFC 2960 6.4 Multi-homed SCTP Endpoints * * An endpoint SHOULD transmit reply chunks (e.g., SACK, * HEARTBEAT ACK, * etc.) to the same destination transport * address from which it * received the DATA or control chunk * to which it is replying. * * [ABORT back to where the offender came from.] */ if (chunk) retval->transport = chunk->transport; no_mem: return retval; } /* Helper to create ABORT with a SCTP_ERROR_USER_ABORT error. */ struct sctp_chunk *sctp_make_abort_user(const struct sctp_association *asoc, struct msghdr *msg, size_t paylen) { struct sctp_chunk *retval; void *payload = NULL; int err; retval = sctp_make_abort(asoc, NULL, sizeof(struct sctp_errhdr) + paylen); if (!retval) goto err_chunk; if (paylen) { /* Put the msg_iov together into payload. */ payload = kmalloc(paylen, GFP_KERNEL); if (!payload) goto err_payload; err = memcpy_from_msg(payload, msg, paylen); if (err < 0) goto err_copy; } sctp_init_cause(retval, SCTP_ERROR_USER_ABORT, paylen); sctp_addto_chunk(retval, paylen, payload); if (paylen) kfree(payload); return retval; err_copy: kfree(payload); err_payload: sctp_chunk_free(retval); retval = NULL; err_chunk: return retval; } /* Append bytes to the end of a parameter. Will panic if chunk is not big * enough. */ static void *sctp_addto_param(struct sctp_chunk *chunk, int len, const void *data) { int chunklen = ntohs(chunk->chunk_hdr->length); void *target; target = skb_put(chunk->skb, len); if (data) memcpy(target, data, len); else memset(target, 0, len); /* Adjust the chunk length field. */ chunk->chunk_hdr->length = htons(chunklen + len); chunk->chunk_end = skb_tail_pointer(chunk->skb); return target; } /* Make an ABORT chunk with a PROTOCOL VIOLATION cause code. */ struct sctp_chunk *sctp_make_abort_violation( const struct sctp_association *asoc, const struct sctp_chunk *chunk, const __u8 *payload, const size_t paylen) { struct sctp_chunk *retval; struct sctp_paramhdr phdr; retval = sctp_make_abort(asoc, chunk, sizeof(struct sctp_errhdr) + paylen + sizeof(phdr)); if (!retval) goto end; sctp_init_cause(retval, SCTP_ERROR_PROTO_VIOLATION, paylen + sizeof(phdr)); phdr.type = htons(chunk->chunk_hdr->type); phdr.length = chunk->chunk_hdr->length; sctp_addto_chunk(retval, paylen, payload); sctp_addto_param(retval, sizeof(phdr), &phdr); end: return retval; } struct sctp_chunk *sctp_make_violation_paramlen( const struct sctp_association *asoc, const struct sctp_chunk *chunk, struct sctp_paramhdr *param) { static const char error[] = "The following parameter had invalid length:"; size_t payload_len = sizeof(error) + sizeof(struct sctp_errhdr) + sizeof(*param); struct sctp_chunk *retval; retval = sctp_make_abort(asoc, chunk, payload_len); if (!retval) goto nodata; sctp_init_cause(retval, SCTP_ERROR_PROTO_VIOLATION, sizeof(error) + sizeof(*param)); sctp_addto_chunk(retval, sizeof(error), error); sctp_addto_param(retval, sizeof(*param), param); nodata: return retval; } struct sctp_chunk *sctp_make_violation_max_retrans( const struct sctp_association *asoc, const struct sctp_chunk *chunk) { static const char error[] = "Association exceeded its max_retrans count"; size_t payload_len = sizeof(error) + sizeof(struct sctp_errhdr); struct sctp_chunk *retval; retval = sctp_make_abort(asoc, chunk, payload_len); if (!retval) goto nodata; sctp_init_cause(retval, SCTP_ERROR_PROTO_VIOLATION, sizeof(error)); sctp_addto_chunk(retval, sizeof(error), error); nodata: return retval; } struct sctp_chunk *sctp_make_new_encap_port(const struct sctp_association *asoc, const struct sctp_chunk *chunk) { struct sctp_new_encap_port_hdr nep; struct sctp_chunk *retval; retval = sctp_make_abort(asoc, chunk, sizeof(struct sctp_errhdr) + sizeof(nep)); if (!retval) goto nodata; sctp_init_cause(retval, SCTP_ERROR_NEW_ENCAP_PORT, sizeof(nep)); nep.cur_port = SCTP_INPUT_CB(chunk->skb)->encap_port; nep.new_port = chunk->transport->encap_port; sctp_addto_chunk(retval, sizeof(nep), &nep); nodata: return retval; } /* Make a HEARTBEAT chunk. */ struct sctp_chunk *sctp_make_heartbeat(const struct sctp_association *asoc, const struct sctp_transport *transport, __u32 probe_size) { struct sctp_sender_hb_info hbinfo = {}; struct sctp_chunk *retval; retval = sctp_make_control(asoc, SCTP_CID_HEARTBEAT, 0, sizeof(hbinfo), GFP_ATOMIC); if (!retval) goto nodata; hbinfo.param_hdr.type = SCTP_PARAM_HEARTBEAT_INFO; hbinfo.param_hdr.length = htons(sizeof(hbinfo)); hbinfo.daddr = transport->ipaddr; hbinfo.sent_at = jiffies; hbinfo.hb_nonce = transport->hb_nonce; hbinfo.probe_size = probe_size; /* Cast away the 'const', as this is just telling the chunk * what transport it belongs to. */ retval->transport = (struct sctp_transport *) transport; retval->subh.hbs_hdr = sctp_addto_chunk(retval, sizeof(hbinfo), &hbinfo); retval->pmtu_probe = !!probe_size; nodata: return retval; } struct sctp_chunk *sctp_make_heartbeat_ack(const struct sctp_association *asoc, const struct sctp_chunk *chunk, const void *payload, const size_t paylen) { struct sctp_chunk *retval; retval = sctp_make_control(asoc, SCTP_CID_HEARTBEAT_ACK, 0, paylen, GFP_ATOMIC); if (!retval) goto nodata; retval->subh.hbs_hdr = sctp_addto_chunk(retval, paylen, payload); /* RFC 2960 6.4 Multi-homed SCTP Endpoints * * An endpoint SHOULD transmit reply chunks (e.g., SACK, * HEARTBEAT ACK, * etc.) to the same destination transport * address from which it * received the DATA or control chunk * to which it is replying. * * [HBACK back to where the HEARTBEAT came from.] */ if (chunk) retval->transport = chunk->transport; nodata: return retval; } /* RFC4820 3. Padding Chunk (PAD) * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Type = 0x84 | Flags=0 | Length | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | | * \ Padding Data / * / \ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ */ struct sctp_chunk *sctp_make_pad(const struct sctp_association *asoc, int len) { struct sctp_chunk *retval; retval = sctp_make_control(asoc, SCTP_CID_PAD, 0, len, GFP_ATOMIC); if (!retval) return NULL; skb_put_zero(retval->skb, len); retval->chunk_hdr->length = htons(ntohs(retval->chunk_hdr->length) + len); retval->chunk_end = skb_tail_pointer(retval->skb); return retval; } /* Create an Operation Error chunk with the specified space reserved. * This routine can be used for containing multiple causes in the chunk. */ static struct sctp_chunk *sctp_make_op_error_space( const struct sctp_association *asoc, const struct sctp_chunk *chunk, size_t size) { struct sctp_chunk *retval; retval = sctp_make_control(asoc, SCTP_CID_ERROR, 0, sizeof(struct sctp_errhdr) + size, GFP_ATOMIC); if (!retval) goto nodata; /* RFC 2960 6.4 Multi-homed SCTP Endpoints * * An endpoint SHOULD transmit reply chunks (e.g., SACK, * HEARTBEAT ACK, etc.) to the same destination transport * address from which it received the DATA or control chunk * to which it is replying. * */ if (chunk) retval->transport = chunk->transport; nodata: return retval; } /* Create an Operation Error chunk of a fixed size, specifically, * min(asoc->pathmtu, SCTP_DEFAULT_MAXSEGMENT) - overheads. * This is a helper function to allocate an error chunk for those * invalid parameter codes in which we may not want to report all the * errors, if the incoming chunk is large. If it can't fit in a single * packet, we ignore it. */ static inline struct sctp_chunk *sctp_make_op_error_limited( const struct sctp_association *asoc, const struct sctp_chunk *chunk) { size_t size = SCTP_DEFAULT_MAXSEGMENT; struct sctp_sock *sp = NULL; if (asoc) { size = min_t(size_t, size, asoc->pathmtu); sp = sctp_sk(asoc->base.sk); } size = sctp_mtu_payload(sp, size, sizeof(struct sctp_errhdr)); return sctp_make_op_error_space(asoc, chunk, size); } /* Create an Operation Error chunk. */ struct sctp_chunk *sctp_make_op_error(const struct sctp_association *asoc, const struct sctp_chunk *chunk, __be16 cause_code, const void *payload, size_t paylen, size_t reserve_tail) { struct sctp_chunk *retval; retval = sctp_make_op_error_space(asoc, chunk, paylen + reserve_tail); if (!retval) goto nodata; sctp_init_cause(retval, cause_code, paylen + reserve_tail); sctp_addto_chunk(retval, paylen, payload); if (reserve_tail) sctp_addto_param(retval, reserve_tail, NULL); nodata: return retval; } struct sctp_chunk *sctp_make_auth(const struct sctp_association *asoc, __u16 key_id) { struct sctp_authhdr auth_hdr; struct sctp_hmac *hmac_desc; struct sctp_chunk *retval; /* Get the first hmac that the peer told us to use */ hmac_desc = sctp_auth_asoc_get_hmac(asoc); if (unlikely(!hmac_desc)) return NULL; retval = sctp_make_control(asoc, SCTP_CID_AUTH, 0, hmac_desc->hmac_len + sizeof(auth_hdr), GFP_ATOMIC); if (!retval) return NULL; auth_hdr.hmac_id = htons(hmac_desc->hmac_id); auth_hdr.shkey_id = htons(key_id); retval->subh.auth_hdr = sctp_addto_chunk(retval, sizeof(auth_hdr), &auth_hdr); skb_put_zero(retval->skb, hmac_desc->hmac_len); /* Adjust the chunk header to include the empty MAC */ retval->chunk_hdr->length = htons(ntohs(retval->chunk_hdr->length) + hmac_desc->hmac_len); retval->chunk_end = skb_tail_pointer(retval->skb); return retval; } /******************************************************************** * 2nd Level Abstractions ********************************************************************/ /* Turn an skb into a chunk. * FIXME: Eventually move the structure directly inside the skb->cb[]. * * sctpimpguide-05.txt Section 2.8.2 * M1) Each time a new DATA chunk is transmitted * set the 'TSN.Missing.Report' count for that TSN to 0. The * 'TSN.Missing.Report' count will be used to determine missing chunks * and when to fast retransmit. * */ struct sctp_chunk *sctp_chunkify(struct sk_buff *skb, const struct sctp_association *asoc, struct sock *sk, gfp_t gfp) { struct sctp_chunk *retval; retval = kmem_cache_zalloc(sctp_chunk_cachep, gfp); if (!retval) goto nodata; if (!sk) pr_debug("%s: chunkifying skb:%p w/o an sk\n", __func__, skb); INIT_LIST_HEAD(&retval->list); retval->skb = skb; retval->asoc = (struct sctp_association *)asoc; retval->singleton = 1; retval->fast_retransmit = SCTP_CAN_FRTX; /* Polish the bead hole. */ INIT_LIST_HEAD(&retval->transmitted_list); INIT_LIST_HEAD(&retval->frag_list); SCTP_DBG_OBJCNT_INC(chunk); refcount_set(&retval->refcnt, 1); nodata: return retval; } /* Set chunk->source and dest based on the IP header in chunk->skb. */ void sctp_init_addrs(struct sctp_chunk *chunk, union sctp_addr *src, union sctp_addr *dest) { memcpy(&chunk->source, src, sizeof(union sctp_addr)); memcpy(&chunk->dest, dest, sizeof(union sctp_addr)); } /* Extract the source address from a chunk. */ const union sctp_addr *sctp_source(const struct sctp_chunk *chunk) { /* If we have a known transport, use that. */ if (chunk->transport) { return &chunk->transport->ipaddr; } else { /* Otherwise, extract it from the IP header. */ return &chunk->source; } } /* Create a new chunk, setting the type and flags headers from the * arguments, reserving enough space for a 'paylen' byte payload. */ static struct sctp_chunk *_sctp_make_chunk(const struct sctp_association *asoc, __u8 type, __u8 flags, int paylen, gfp_t gfp) { struct sctp_chunkhdr *chunk_hdr; struct sctp_chunk *retval; struct sk_buff *skb; struct sock *sk; int chunklen; chunklen = SCTP_PAD4(sizeof(*chunk_hdr) + paylen); if (chunklen > SCTP_MAX_CHUNK_LEN) goto nodata; /* No need to allocate LL here, as this is only a chunk. */ skb = alloc_skb(chunklen, gfp); if (!skb) goto nodata; /* Make room for the chunk header. */ chunk_hdr = (struct sctp_chunkhdr *)skb_put(skb, sizeof(*chunk_hdr)); chunk_hdr->type = type; chunk_hdr->flags = flags; chunk_hdr->length = htons(sizeof(*chunk_hdr)); sk = asoc ? asoc->base.sk : NULL; retval = sctp_chunkify(skb, asoc, sk, gfp); if (!retval) { kfree_skb(skb); goto nodata; } retval->chunk_hdr = chunk_hdr; retval->chunk_end = ((__u8 *)chunk_hdr) + sizeof(*chunk_hdr); /* Determine if the chunk needs to be authenticated */ if (sctp_auth_send_cid(type, asoc)) retval->auth = 1; return retval; nodata: return NULL; } static struct sctp_chunk *sctp_make_data(const struct sctp_association *asoc, __u8 flags, int paylen, gfp_t gfp) { return _sctp_make_chunk(asoc, SCTP_CID_DATA, flags, paylen, gfp); } struct sctp_chunk *sctp_make_idata(const struct sctp_association *asoc, __u8 flags, int paylen, gfp_t gfp) { return _sctp_make_chunk(asoc, SCTP_CID_I_DATA, flags, paylen, gfp); } static struct sctp_chunk *sctp_make_control(const struct sctp_association *asoc, __u8 type, __u8 flags, int paylen, gfp_t gfp) { struct sctp_chunk *chunk; chunk = _sctp_make_chunk(asoc, type, flags, paylen, gfp); if (chunk) sctp_control_set_owner_w(chunk); return chunk; } /* Release the memory occupied by a chunk. */ static void sctp_chunk_destroy(struct sctp_chunk *chunk) { BUG_ON(!list_empty(&chunk->list)); list_del_init(&chunk->transmitted_list); consume_skb(chunk->skb); consume_skb(chunk->auth_chunk); SCTP_DBG_OBJCNT_DEC(chunk); kmem_cache_free(sctp_chunk_cachep, chunk); } /* Possibly, free the chunk. */ void sctp_chunk_free(struct sctp_chunk *chunk) { /* Release our reference on the message tracker. */ if (chunk->msg) sctp_datamsg_put(chunk->msg); sctp_chunk_put(chunk); } /* Grab a reference to the chunk. */ void sctp_chunk_hold(struct sctp_chunk *ch) { refcount_inc(&ch->refcnt); } /* Release a reference to the chunk. */ void sctp_chunk_put(struct sctp_chunk *ch) { if (refcount_dec_and_test(&ch->refcnt)) sctp_chunk_destroy(ch); } /* Append bytes to the end of a chunk. Will panic if chunk is not big * enough. */ void *sctp_addto_chunk(struct sctp_chunk *chunk, int len, const void *data) { int chunklen = ntohs(chunk->chunk_hdr->length); int padlen = SCTP_PAD4(chunklen) - chunklen; void *target; skb_put_zero(chunk->skb, padlen); target = skb_put_data(chunk->skb, data, len); /* Adjust the chunk length field. */ chunk->chunk_hdr->length = htons(chunklen + padlen + len); chunk->chunk_end = skb_tail_pointer(chunk->skb); return target; } /* Append bytes from user space to the end of a chunk. Will panic if * chunk is not big enough. * Returns a kernel err value. */ int sctp_user_addto_chunk(struct sctp_chunk *chunk, int len, struct iov_iter *from) { void *target; /* Make room in chunk for data. */ target = skb_put(chunk->skb, len); /* Copy data (whole iovec) into chunk */ if (!copy_from_iter_full(target, len, from)) return -EFAULT; /* Adjust the chunk length field. */ chunk->chunk_hdr->length = htons(ntohs(chunk->chunk_hdr->length) + len); chunk->chunk_end = skb_tail_pointer(chunk->skb); return 0; } /* Helper function to assign a TSN if needed. This assumes that both * the data_hdr and association have already been assigned. */ void sctp_chunk_assign_ssn(struct sctp_chunk *chunk) { struct sctp_stream *stream; struct sctp_chunk *lchunk; struct sctp_datamsg *msg; __u16 ssn, sid; if (chunk->has_ssn) return; /* All fragments will be on the same stream */ sid = ntohs(chunk->subh.data_hdr->stream); stream = &chunk->asoc->stream; /* Now assign the sequence number to the entire message. * All fragments must have the same stream sequence number. */ msg = chunk->msg; list_for_each_entry(lchunk, &msg->chunks, frag_list) { if (lchunk->chunk_hdr->flags & SCTP_DATA_UNORDERED) { ssn = 0; } else { if (lchunk->chunk_hdr->flags & SCTP_DATA_LAST_FRAG) ssn = sctp_ssn_next(stream, out, sid); else ssn = sctp_ssn_peek(stream, out, sid); } lchunk->subh.data_hdr->ssn = htons(ssn); lchunk->has_ssn = 1; } } /* Helper function to assign a TSN if needed. This assumes that both * the data_hdr and association have already been assigned. */ void sctp_chunk_assign_tsn(struct sctp_chunk *chunk) { if (!chunk->has_tsn) { /* This is the last possible instant to * assign a TSN. */ chunk->subh.data_hdr->tsn = htonl(sctp_association_get_next_tsn(chunk->asoc)); chunk->has_tsn = 1; } } /* Create a CLOSED association to use with an incoming packet. */ struct sctp_association *sctp_make_temp_asoc(const struct sctp_endpoint *ep, struct sctp_chunk *chunk, gfp_t gfp) { struct sctp_association *asoc; enum sctp_scope scope; struct sk_buff *skb; /* Create the bare association. */ scope = sctp_scope(sctp_source(chunk)); asoc = sctp_association_new(ep, ep->base.sk, scope, gfp); if (!asoc) goto nodata; asoc->temp = 1; skb = chunk->skb; /* Create an entry for the source address of the packet. */ SCTP_INPUT_CB(skb)->af->from_skb(&asoc->c.peer_addr, skb, 1); nodata: return asoc; } /* Build a cookie representing asoc. * This INCLUDES the param header needed to put the cookie in the INIT ACK. */ static struct sctp_cookie_param *sctp_pack_cookie( const struct sctp_endpoint *ep, const struct sctp_association *asoc, const struct sctp_chunk *init_chunk, int *cookie_len, const __u8 *raw_addrs, int addrs_len) { struct sctp_signed_cookie *cookie; struct sctp_cookie_param *retval; int headersize, bodysize; /* Header size is static data prior to the actual cookie, including * any padding. */ headersize = sizeof(struct sctp_paramhdr) + (sizeof(struct sctp_signed_cookie) - sizeof(struct sctp_cookie)); bodysize = sizeof(struct sctp_cookie) + ntohs(init_chunk->chunk_hdr->length) + addrs_len; /* Pad out the cookie to a multiple to make the signature * functions simpler to write. */ if (bodysize % SCTP_COOKIE_MULTIPLE) bodysize += SCTP_COOKIE_MULTIPLE - (bodysize % SCTP_COOKIE_MULTIPLE); *cookie_len = headersize + bodysize; /* Clear this memory since we are sending this data structure * out on the network. */ retval = kzalloc(*cookie_len, GFP_ATOMIC); if (!retval) goto nodata; cookie = (struct sctp_signed_cookie *) retval->body; /* Set up the parameter header. */ retval->p.type = SCTP_PARAM_STATE_COOKIE; retval->p.length = htons(*cookie_len); /* Copy the cookie part of the association itself. */ cookie->c = asoc->c; /* Save the raw address list length in the cookie. */ cookie->c.raw_addr_list_len = addrs_len; /* Remember PR-SCTP capability. */ cookie->c.prsctp_capable = asoc->peer.prsctp_capable; /* Save adaptation indication in the cookie. */ cookie->c.adaptation_ind = asoc->peer.adaptation_ind; /* Set an expiration time for the cookie. */ cookie->c.expiration = ktime_add(asoc->cookie_life, ktime_get_real()); /* Copy the peer's init packet. */ memcpy(cookie + 1, init_chunk->chunk_hdr, ntohs(init_chunk->chunk_hdr->length)); /* Copy the raw local address list of the association. */ memcpy((__u8 *)(cookie + 1) + ntohs(init_chunk->chunk_hdr->length), raw_addrs, addrs_len); if (sctp_sk(ep->base.sk)->hmac) { struct crypto_shash *tfm = sctp_sk(ep->base.sk)->hmac; int err; /* Sign the message. */ err = crypto_shash_setkey(tfm, ep->secret_key, sizeof(ep->secret_key)) ?: crypto_shash_tfm_digest(tfm, (u8 *)&cookie->c, bodysize, cookie->signature); if (err) goto free_cookie; } return retval; free_cookie: kfree(retval); nodata: *cookie_len = 0; return NULL; } /* Unpack the cookie from COOKIE ECHO chunk, recreating the association. */ struct sctp_association *sctp_unpack_cookie( const struct sctp_endpoint *ep, const struct sctp_association *asoc, struct sctp_chunk *chunk, gfp_t gfp, int *error, struct sctp_chunk **errp) { struct sctp_association *retval = NULL; int headersize, bodysize, fixed_size; struct sctp_signed_cookie *cookie; struct sk_buff *skb = chunk->skb; struct sctp_cookie *bear_cookie; __u8 *digest = ep->digest; enum sctp_scope scope; unsigned int len; ktime_t kt; /* Header size is static data prior to the actual cookie, including * any padding. */ headersize = sizeof(struct sctp_chunkhdr) + (sizeof(struct sctp_signed_cookie) - sizeof(struct sctp_cookie)); bodysize = ntohs(chunk->chunk_hdr->length) - headersize; fixed_size = headersize + sizeof(struct sctp_cookie); /* Verify that the chunk looks like it even has a cookie. * There must be enough room for our cookie and our peer's * INIT chunk. */ len = ntohs(chunk->chunk_hdr->length); if (len < fixed_size + sizeof(struct sctp_chunkhdr)) goto malformed; /* Verify that the cookie has been padded out. */ if (bodysize % SCTP_COOKIE_MULTIPLE) goto malformed; /* Process the cookie. */ cookie = chunk->subh.cookie_hdr; bear_cookie = &cookie->c; if (!sctp_sk(ep->base.sk)->hmac) goto no_hmac; /* Check the signature. */ { struct crypto_shash *tfm = sctp_sk(ep->base.sk)->hmac; int err; err = crypto_shash_setkey(tfm, ep->secret_key, sizeof(ep->secret_key)) ?: crypto_shash_tfm_digest(tfm, (u8 *)bear_cookie, bodysize, digest); if (err) { *error = -SCTP_IERROR_NOMEM; goto fail; } } if (memcmp(digest, cookie->signature, SCTP_SIGNATURE_SIZE)) { *error = -SCTP_IERROR_BAD_SIG; goto fail; } no_hmac: /* IG Section 2.35.2: * 3) Compare the port numbers and the verification tag contained * within the COOKIE ECHO chunk to the actual port numbers and the * verification tag within the SCTP common header of the received * packet. If these values do not match the packet MUST be silently * discarded, */ if (ntohl(chunk->sctp_hdr->vtag) != bear_cookie->my_vtag) { *error = -SCTP_IERROR_BAD_TAG; goto fail; } if (chunk->sctp_hdr->source != bear_cookie->peer_addr.v4.sin_port || ntohs(chunk->sctp_hdr->dest) != bear_cookie->my_port) { *error = -SCTP_IERROR_BAD_PORTS; goto fail; } /* Check to see if the cookie is stale. If there is already * an association, there is no need to check cookie's expiration * for init collision case of lost COOKIE ACK. * If skb has been timestamped, then use the stamp, otherwise * use current time. This introduces a small possibility that * a cookie may be considered expired, but this would only slow * down the new association establishment instead of every packet. */ if (sock_flag(ep->base.sk, SOCK_TIMESTAMP)) kt = skb_get_ktime(skb); else kt = ktime_get_real(); if (!asoc && ktime_before(bear_cookie->expiration, kt)) { suseconds_t usecs = ktime_to_us(ktime_sub(kt, bear_cookie->expiration)); __be32 n = htonl(usecs); /* * Section 3.3.10.3 Stale Cookie Error (3) * * Cause of error * --------------- * Stale Cookie Error: Indicates the receipt of a valid State * Cookie that has expired. */ *errp = sctp_make_op_error(asoc, chunk, SCTP_ERROR_STALE_COOKIE, &n, sizeof(n), 0); if (*errp) *error = -SCTP_IERROR_STALE_COOKIE; else *error = -SCTP_IERROR_NOMEM; goto fail; } /* Make a new base association. */ scope = sctp_scope(sctp_source(chunk)); retval = sctp_association_new(ep, ep->base.sk, scope, gfp); if (!retval) { *error = -SCTP_IERROR_NOMEM; goto fail; } /* Set up our peer's port number. */ retval->peer.port = ntohs(chunk->sctp_hdr->source); /* Populate the association from the cookie. */ memcpy(&retval->c, bear_cookie, sizeof(*bear_cookie)); if (sctp_assoc_set_bind_addr_from_cookie(retval, bear_cookie, GFP_ATOMIC) < 0) { *error = -SCTP_IERROR_NOMEM; goto fail; } /* Also, add the destination address. */ if (list_empty(&retval->base.bind_addr.address_list)) { sctp_add_bind_addr(&retval->base.bind_addr, &chunk->dest, sizeof(chunk->dest), SCTP_ADDR_SRC, GFP_ATOMIC); } retval->next_tsn = retval->c.initial_tsn; retval->ctsn_ack_point = retval->next_tsn - 1; retval->addip_serial = retval->c.initial_tsn; retval->strreset_outseq = retval->c.initial_tsn; retval->adv_peer_ack_point = retval->ctsn_ack_point; retval->peer.prsctp_capable = retval->c.prsctp_capable; retval->peer.adaptation_ind = retval->c.adaptation_ind; /* The INIT stuff will be done by the side effects. */ return retval; fail: if (retval) sctp_association_free(retval); return NULL; malformed: /* Yikes! The packet is either corrupt or deliberately * malformed. */ *error = -SCTP_IERROR_MALFORMED; goto fail; } /******************************************************************** * 3rd Level Abstractions ********************************************************************/ struct __sctp_missing { __be32 num_missing; __be16 type; } __packed; /* * Report a missing mandatory parameter. */ static int sctp_process_missing_param(const struct sctp_association *asoc, enum sctp_param paramtype, struct sctp_chunk *chunk, struct sctp_chunk **errp) { struct __sctp_missing report; __u16 len; len = SCTP_PAD4(sizeof(report)); /* Make an ERROR chunk, preparing enough room for * returning multiple unknown parameters. */ if (!*errp) *errp = sctp_make_op_error_space(asoc, chunk, len); if (*errp) { report.num_missing = htonl(1); report.type = paramtype; sctp_init_cause(*errp, SCTP_ERROR_MISS_PARAM, sizeof(report)); sctp_addto_chunk(*errp, sizeof(report), &report); } /* Stop processing this chunk. */ return 0; } /* Report an Invalid Mandatory Parameter. */ static int sctp_process_inv_mandatory(const struct sctp_association *asoc, struct sctp_chunk *chunk, struct sctp_chunk **errp) { /* Invalid Mandatory Parameter Error has no payload. */ if (!*errp) *errp = sctp_make_op_error_space(asoc, chunk, 0); if (*errp) sctp_init_cause(*errp, SCTP_ERROR_INV_PARAM, 0); /* Stop processing this chunk. */ return 0; } static int sctp_process_inv_paramlength(const struct sctp_association *asoc, struct sctp_paramhdr *param, const struct sctp_chunk *chunk, struct sctp_chunk **errp) { /* This is a fatal error. Any accumulated non-fatal errors are * not reported. */ if (*errp) sctp_chunk_free(*errp); /* Create an error chunk and fill it in with our payload. */ *errp = sctp_make_violation_paramlen(asoc, chunk, param); return 0; } /* Do not attempt to handle the HOST_NAME parm. However, do * send back an indicator to the peer. */ static int sctp_process_hn_param(const struct sctp_association *asoc, union sctp_params param, struct sctp_chunk *chunk, struct sctp_chunk **errp) { __u16 len = ntohs(param.p->length); /* Processing of the HOST_NAME parameter will generate an * ABORT. If we've accumulated any non-fatal errors, they * would be unrecognized parameters and we should not include * them in the ABORT. */ if (*errp) sctp_chunk_free(*errp); *errp = sctp_make_op_error(asoc, chunk, SCTP_ERROR_DNS_FAILED, param.v, len, 0); /* Stop processing this chunk. */ return 0; } static int sctp_verify_ext_param(struct net *net, const struct sctp_endpoint *ep, union sctp_params param) { __u16 num_ext = ntohs(param.p->length) - sizeof(struct sctp_paramhdr); int have_asconf = 0; int have_auth = 0; int i; for (i = 0; i < num_ext; i++) { switch (param.ext->chunks[i]) { case SCTP_CID_AUTH: have_auth = 1; break; case SCTP_CID_ASCONF: case SCTP_CID_ASCONF_ACK: have_asconf = 1; break; } } /* ADD-IP Security: The draft requires us to ABORT or ignore the * INIT/INIT-ACK if ADD-IP is listed, but AUTH is not. Do this * only if ADD-IP is turned on and we are not backward-compatible * mode. */ if (net->sctp.addip_noauth) return 1; if (ep->asconf_enable && !have_auth && have_asconf) return 0; return 1; } static void sctp_process_ext_param(struct sctp_association *asoc, union sctp_params param) { __u16 num_ext = ntohs(param.p->length) - sizeof(struct sctp_paramhdr); int i; for (i = 0; i < num_ext; i++) { switch (param.ext->chunks[i]) { case SCTP_CID_RECONF: if (asoc->ep->reconf_enable) asoc->peer.reconf_capable = 1; break; case SCTP_CID_FWD_TSN: if (asoc->ep->prsctp_enable) asoc->peer.prsctp_capable = 1; break; case SCTP_CID_AUTH: /* if the peer reports AUTH, assume that he * supports AUTH. */ if (asoc->ep->auth_enable) asoc->peer.auth_capable = 1; break; case SCTP_CID_ASCONF: case SCTP_CID_ASCONF_ACK: if (asoc->ep->asconf_enable) asoc->peer.asconf_capable = 1; break; case SCTP_CID_I_DATA: if (asoc->ep->intl_enable) asoc->peer.intl_capable = 1; break; default: break; } } } /* RFC 3.2.1 & the Implementers Guide 2.2. * * The Parameter Types are encoded such that the * highest-order two bits specify the action that must be * taken if the processing endpoint does not recognize the * Parameter Type. * * 00 - Stop processing this parameter; do not process any further * parameters within this chunk * * 01 - Stop processing this parameter, do not process any further * parameters within this chunk, and report the unrecognized * parameter in an 'Unrecognized Parameter' ERROR chunk. * * 10 - Skip this parameter and continue processing. * * 11 - Skip this parameter and continue processing but * report the unrecognized parameter in an * 'Unrecognized Parameter' ERROR chunk. * * Return value: * SCTP_IERROR_NO_ERROR - continue with the chunk * SCTP_IERROR_ERROR - stop and report an error. * SCTP_IERROR_NOMEME - out of memory. */ static enum sctp_ierror sctp_process_unk_param( const struct sctp_association *asoc, union sctp_params param, struct sctp_chunk *chunk, struct sctp_chunk **errp) { int retval = SCTP_IERROR_NO_ERROR; switch (param.p->type & SCTP_PARAM_ACTION_MASK) { case SCTP_PARAM_ACTION_DISCARD: retval = SCTP_IERROR_ERROR; break; case SCTP_PARAM_ACTION_SKIP: break; case SCTP_PARAM_ACTION_DISCARD_ERR: retval = SCTP_IERROR_ERROR; fallthrough; case SCTP_PARAM_ACTION_SKIP_ERR: /* Make an ERROR chunk, preparing enough room for * returning multiple unknown parameters. */ if (!*errp) { *errp = sctp_make_op_error_limited(asoc, chunk); if (!*errp) { /* If there is no memory for generating the * ERROR report as specified, an ABORT will be * triggered to the peer and the association * won't be established. */ retval = SCTP_IERROR_NOMEM; break; } } if (!sctp_init_cause(*errp, SCTP_ERROR_UNKNOWN_PARAM, ntohs(param.p->length))) sctp_addto_chunk(*errp, ntohs(param.p->length), param.v); break; default: break; } return retval; } /* Verify variable length parameters * Return values: * SCTP_IERROR_ABORT - trigger an ABORT * SCTP_IERROR_NOMEM - out of memory (abort) * SCTP_IERROR_ERROR - stop processing, trigger an ERROR * SCTP_IERROR_NO_ERROR - continue with the chunk */ static enum sctp_ierror sctp_verify_param(struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, union sctp_params param, enum sctp_cid cid, struct sctp_chunk *chunk, struct sctp_chunk **err_chunk) { struct sctp_hmac_algo_param *hmacs; int retval = SCTP_IERROR_NO_ERROR; __u16 n_elt, id = 0; int i; /* FIXME - This routine is not looking at each parameter per the * chunk type, i.e., unrecognized parameters should be further * identified based on the chunk id. */ switch (param.p->type) { case SCTP_PARAM_IPV4_ADDRESS: case SCTP_PARAM_IPV6_ADDRESS: case SCTP_PARAM_COOKIE_PRESERVATIVE: case SCTP_PARAM_SUPPORTED_ADDRESS_TYPES: case SCTP_PARAM_STATE_COOKIE: case SCTP_PARAM_HEARTBEAT_INFO: case SCTP_PARAM_UNRECOGNIZED_PARAMETERS: case SCTP_PARAM_ECN_CAPABLE: case SCTP_PARAM_ADAPTATION_LAYER_IND: break; case SCTP_PARAM_SUPPORTED_EXT: if (!sctp_verify_ext_param(net, ep, param)) return SCTP_IERROR_ABORT; break; case SCTP_PARAM_SET_PRIMARY: if (!ep->asconf_enable) goto unhandled; if (ntohs(param.p->length) < sizeof(struct sctp_addip_param) + sizeof(struct sctp_paramhdr)) { sctp_process_inv_paramlength(asoc, param.p, chunk, err_chunk); retval = SCTP_IERROR_ABORT; } break; case SCTP_PARAM_HOST_NAME_ADDRESS: /* This param has been Deprecated, send ABORT. */ sctp_process_hn_param(asoc, param, chunk, err_chunk); retval = SCTP_IERROR_ABORT; break; case SCTP_PARAM_FWD_TSN_SUPPORT: if (ep->prsctp_enable) break; goto unhandled; case SCTP_PARAM_RANDOM: if (!ep->auth_enable) goto unhandled; /* SCTP-AUTH: Secion 6.1 * If the random number is not 32 byte long the association * MUST be aborted. The ABORT chunk SHOULD contain the error * cause 'Protocol Violation'. */ if (SCTP_AUTH_RANDOM_LENGTH != ntohs(param.p->length) - sizeof(struct sctp_paramhdr)) { sctp_process_inv_paramlength(asoc, param.p, chunk, err_chunk); retval = SCTP_IERROR_ABORT; } break; case SCTP_PARAM_CHUNKS: if (!ep->auth_enable) goto unhandled; /* SCTP-AUTH: Section 3.2 * The CHUNKS parameter MUST be included once in the INIT or * INIT-ACK chunk if the sender wants to receive authenticated * chunks. Its maximum length is 260 bytes. */ if (260 < ntohs(param.p->length)) { sctp_process_inv_paramlength(asoc, param.p, chunk, err_chunk); retval = SCTP_IERROR_ABORT; } break; case SCTP_PARAM_HMAC_ALGO: if (!ep->auth_enable) goto unhandled; hmacs = (struct sctp_hmac_algo_param *)param.p; n_elt = (ntohs(param.p->length) - sizeof(struct sctp_paramhdr)) >> 1; /* SCTP-AUTH: Section 6.1 * The HMAC algorithm based on SHA-1 MUST be supported and * included in the HMAC-ALGO parameter. */ for (i = 0; i < n_elt; i++) { id = ntohs(hmacs->hmac_ids[i]); if (id == SCTP_AUTH_HMAC_ID_SHA1) break; } if (id != SCTP_AUTH_HMAC_ID_SHA1) { sctp_process_inv_paramlength(asoc, param.p, chunk, err_chunk); retval = SCTP_IERROR_ABORT; } break; unhandled: default: pr_debug("%s: unrecognized param:%d for chunk:%d\n", __func__, ntohs(param.p->type), cid); retval = sctp_process_unk_param(asoc, param, chunk, err_chunk); break; } return retval; } /* Verify the INIT packet before we process it. */ int sctp_verify_init(struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, enum sctp_cid cid, struct sctp_init_chunk *peer_init, struct sctp_chunk *chunk, struct sctp_chunk **errp) { union sctp_params param; bool has_cookie = false; int result; /* Check for missing mandatory parameters. Note: Initial TSN is * also mandatory, but is not checked here since the valid range * is 0..2**32-1. RFC4960, section 3.3.3. */ if (peer_init->init_hdr.num_outbound_streams == 0 || peer_init->init_hdr.num_inbound_streams == 0 || peer_init->init_hdr.init_tag == 0 || ntohl(peer_init->init_hdr.a_rwnd) < SCTP_DEFAULT_MINWINDOW) return sctp_process_inv_mandatory(asoc, chunk, errp); sctp_walk_params(param, peer_init) { if (param.p->type == SCTP_PARAM_STATE_COOKIE) has_cookie = true; } /* There is a possibility that a parameter length was bad and * in that case we would have stoped walking the parameters. * The current param.p would point at the bad one. * Current consensus on the mailing list is to generate a PROTOCOL * VIOLATION error. We build the ERROR chunk here and let the normal * error handling code build and send the packet. */ if (param.v != (void *)chunk->chunk_end) return sctp_process_inv_paramlength(asoc, param.p, chunk, errp); /* The only missing mandatory param possible today is * the state cookie for an INIT-ACK chunk. */ if ((SCTP_CID_INIT_ACK == cid) && !has_cookie) return sctp_process_missing_param(asoc, SCTP_PARAM_STATE_COOKIE, chunk, errp); /* Verify all the variable length parameters */ sctp_walk_params(param, peer_init) { result = sctp_verify_param(net, ep, asoc, param, cid, chunk, errp); switch (result) { case SCTP_IERROR_ABORT: case SCTP_IERROR_NOMEM: return 0; case SCTP_IERROR_ERROR: return 1; case SCTP_IERROR_NO_ERROR: default: break; } } /* for (loop through all parameters) */ return 1; } /* Unpack the parameters in an INIT packet into an association. * Returns 0 on failure, else success. * FIXME: This is an association method. */ int sctp_process_init(struct sctp_association *asoc, struct sctp_chunk *chunk, const union sctp_addr *peer_addr, struct sctp_init_chunk *peer_init, gfp_t gfp) { struct sctp_transport *transport; struct list_head *pos, *temp; union sctp_params param; union sctp_addr addr; struct sctp_af *af; int src_match = 0; /* We must include the address that the INIT packet came from. * This is the only address that matters for an INIT packet. * When processing a COOKIE ECHO, we retrieve the from address * of the INIT from the cookie. */ /* This implementation defaults to making the first transport * added as the primary transport. The source address seems to * be a better choice than any of the embedded addresses. */ asoc->encap_port = SCTP_INPUT_CB(chunk->skb)->encap_port; if (!sctp_assoc_add_peer(asoc, peer_addr, gfp, SCTP_ACTIVE)) goto nomem; if (sctp_cmp_addr_exact(sctp_source(chunk), peer_addr)) src_match = 1; /* Process the initialization parameters. */ sctp_walk_params(param, peer_init) { if (!src_match && (param.p->type == SCTP_PARAM_IPV4_ADDRESS || param.p->type == SCTP_PARAM_IPV6_ADDRESS)) { af = sctp_get_af_specific(param_type2af(param.p->type)); if (!af->from_addr_param(&addr, param.addr, chunk->sctp_hdr->source, 0)) continue; if (sctp_cmp_addr_exact(sctp_source(chunk), &addr)) src_match = 1; } if (!sctp_process_param(asoc, param, peer_addr, gfp)) goto clean_up; } /* source address of chunk may not match any valid address */ if (!src_match) goto clean_up; /* AUTH: After processing the parameters, make sure that we * have all the required info to potentially do authentications. */ if (asoc->peer.auth_capable && (!asoc->peer.peer_random || !asoc->peer.peer_hmacs)) asoc->peer.auth_capable = 0; /* In a non-backward compatible mode, if the peer claims * support for ADD-IP but not AUTH, the ADD-IP spec states * that we MUST ABORT the association. Section 6. The section * also give us an option to silently ignore the packet, which * is what we'll do here. */ if (!asoc->base.net->sctp.addip_noauth && (asoc->peer.asconf_capable && !asoc->peer.auth_capable)) { asoc->peer.addip_disabled_mask |= (SCTP_PARAM_ADD_IP | SCTP_PARAM_DEL_IP | SCTP_PARAM_SET_PRIMARY); asoc->peer.asconf_capable = 0; goto clean_up; } /* Walk list of transports, removing transports in the UNKNOWN state. */ list_for_each_safe(pos, temp, &asoc->peer.transport_addr_list) { transport = list_entry(pos, struct sctp_transport, transports); if (transport->state == SCTP_UNKNOWN) { sctp_assoc_rm_peer(asoc, transport); } } /* The fixed INIT headers are always in network byte * order. */ asoc->peer.i.init_tag = ntohl(peer_init->init_hdr.init_tag); asoc->peer.i.a_rwnd = ntohl(peer_init->init_hdr.a_rwnd); asoc->peer.i.num_outbound_streams = ntohs(peer_init->init_hdr.num_outbound_streams); asoc->peer.i.num_inbound_streams = ntohs(peer_init->init_hdr.num_inbound_streams); asoc->peer.i.initial_tsn = ntohl(peer_init->init_hdr.initial_tsn); asoc->strreset_inseq = asoc->peer.i.initial_tsn; /* Apply the upper bounds for output streams based on peer's * number of inbound streams. */ if (asoc->c.sinit_num_ostreams > ntohs(peer_init->init_hdr.num_inbound_streams)) { asoc->c.sinit_num_ostreams = ntohs(peer_init->init_hdr.num_inbound_streams); } if (asoc->c.sinit_max_instreams > ntohs(peer_init->init_hdr.num_outbound_streams)) { asoc->c.sinit_max_instreams = ntohs(peer_init->init_hdr.num_outbound_streams); } /* Copy Initiation tag from INIT to VT_peer in cookie. */ asoc->c.peer_vtag = asoc->peer.i.init_tag; /* Peer Rwnd : Current calculated value of the peer's rwnd. */ asoc->peer.rwnd = asoc->peer.i.a_rwnd; /* RFC 2960 7.2.1 The initial value of ssthresh MAY be arbitrarily * high (for example, implementations MAY use the size of the receiver * advertised window). */ list_for_each_entry(transport, &asoc->peer.transport_addr_list, transports) { transport->ssthresh = asoc->peer.i.a_rwnd; } /* Set up the TSN tracking pieces. */ if (!sctp_tsnmap_init(&asoc->peer.tsn_map, SCTP_TSN_MAP_INITIAL, asoc->peer.i.initial_tsn, gfp)) goto clean_up; /* RFC 2960 6.5 Stream Identifier and Stream Sequence Number * * The stream sequence number in all the streams shall start * from 0 when the association is established. Also, when the * stream sequence number reaches the value 65535 the next * stream sequence number shall be set to 0. */ if (sctp_stream_init(&asoc->stream, asoc->c.sinit_num_ostreams, asoc->c.sinit_max_instreams, gfp)) goto clean_up; /* Update frag_point when stream_interleave may get changed. */ sctp_assoc_update_frag_point(asoc); if (!asoc->temp && sctp_assoc_set_id(asoc, gfp)) goto clean_up; /* ADDIP Section 4.1 ASCONF Chunk Procedures * * When an endpoint has an ASCONF signaled change to be sent to the * remote endpoint it should do the following: * ... * A2) A serial number should be assigned to the Chunk. The serial * number should be a monotonically increasing number. All serial * numbers are defined to be initialized at the start of the * association to the same value as the Initial TSN. */ asoc->peer.addip_serial = asoc->peer.i.initial_tsn - 1; return 1; clean_up: /* Release the transport structures. */ list_for_each_safe(pos, temp, &asoc->peer.transport_addr_list) { transport = list_entry(pos, struct sctp_transport, transports); if (transport->state != SCTP_ACTIVE) sctp_assoc_rm_peer(asoc, transport); } nomem: return 0; } /* Update asoc with the option described in param. * * RFC2960 3.3.2.1 Optional/Variable Length Parameters in INIT * * asoc is the association to update. * param is the variable length parameter to use for update. * cid tells us if this is an INIT, INIT ACK or COOKIE ECHO. * If the current packet is an INIT we want to minimize the amount of * work we do. In particular, we should not build transport * structures for the addresses. */ static int sctp_process_param(struct sctp_association *asoc, union sctp_params param, const union sctp_addr *peer_addr, gfp_t gfp) { struct sctp_endpoint *ep = asoc->ep; union sctp_addr_param *addr_param; struct net *net = asoc->base.net; struct sctp_transport *t; enum sctp_scope scope; union sctp_addr addr; struct sctp_af *af; int retval = 1, i; u32 stale; __u16 sat; /* We maintain all INIT parameters in network byte order all the * time. This allows us to not worry about whether the parameters * came from a fresh INIT, and INIT ACK, or were stored in a cookie. */ switch (param.p->type) { case SCTP_PARAM_IPV6_ADDRESS: if (PF_INET6 != asoc->base.sk->sk_family) break; goto do_addr_param; case SCTP_PARAM_IPV4_ADDRESS: /* v4 addresses are not allowed on v6-only socket */ if (ipv6_only_sock(asoc->base.sk)) break; do_addr_param: af = sctp_get_af_specific(param_type2af(param.p->type)); if (!af->from_addr_param(&addr, param.addr, htons(asoc->peer.port), 0)) break; scope = sctp_scope(peer_addr); if (sctp_in_scope(net, &addr, scope)) if (!sctp_assoc_add_peer(asoc, &addr, gfp, SCTP_UNCONFIRMED)) return 0; break; case SCTP_PARAM_COOKIE_PRESERVATIVE: if (!net->sctp.cookie_preserve_enable) break; stale = ntohl(param.life->lifespan_increment); /* Suggested Cookie Life span increment's unit is msec, * (1/1000sec). */ asoc->cookie_life = ktime_add_ms(asoc->cookie_life, stale); break; case SCTP_PARAM_SUPPORTED_ADDRESS_TYPES: /* Turn off the default values first so we'll know which * ones are really set by the peer. */ asoc->peer.ipv4_address = 0; asoc->peer.ipv6_address = 0; /* Assume that peer supports the address family * by which it sends a packet. */ if (peer_addr->sa.sa_family == AF_INET6) asoc->peer.ipv6_address = 1; else if (peer_addr->sa.sa_family == AF_INET) asoc->peer.ipv4_address = 1; /* Cycle through address types; avoid divide by 0. */ sat = ntohs(param.p->length) - sizeof(struct sctp_paramhdr); if (sat) sat /= sizeof(__u16); for (i = 0; i < sat; ++i) { switch (param.sat->types[i]) { case SCTP_PARAM_IPV4_ADDRESS: asoc->peer.ipv4_address = 1; break; case SCTP_PARAM_IPV6_ADDRESS: if (PF_INET6 == asoc->base.sk->sk_family) asoc->peer.ipv6_address = 1; break; default: /* Just ignore anything else. */ break; } } break; case SCTP_PARAM_STATE_COOKIE: asoc->peer.cookie_len = ntohs(param.p->length) - sizeof(struct sctp_paramhdr); kfree(asoc->peer.cookie); asoc->peer.cookie = kmemdup(param.cookie->body, asoc->peer.cookie_len, gfp); if (!asoc->peer.cookie) retval = 0; break; case SCTP_PARAM_HEARTBEAT_INFO: /* Would be odd to receive, but it causes no problems. */ break; case SCTP_PARAM_UNRECOGNIZED_PARAMETERS: /* Rejected during verify stage. */ break; case SCTP_PARAM_ECN_CAPABLE: if (asoc->ep->ecn_enable) { asoc->peer.ecn_capable = 1; break; } /* Fall Through */ goto fall_through; case SCTP_PARAM_ADAPTATION_LAYER_IND: asoc->peer.adaptation_ind = ntohl(param.aind->adaptation_ind); break; case SCTP_PARAM_SET_PRIMARY: if (!ep->asconf_enable) goto fall_through; addr_param = param.v + sizeof(struct sctp_addip_param); af = sctp_get_af_specific(param_type2af(addr_param->p.type)); if (!af) break; if (!af->from_addr_param(&addr, addr_param, htons(asoc->peer.port), 0)) break; if (!af->addr_valid(&addr, NULL, NULL)) break; t = sctp_assoc_lookup_paddr(asoc, &addr); if (!t) break; sctp_assoc_set_primary(asoc, t); break; case SCTP_PARAM_SUPPORTED_EXT: sctp_process_ext_param(asoc, param); break; case SCTP_PARAM_FWD_TSN_SUPPORT: if (asoc->ep->prsctp_enable) { asoc->peer.prsctp_capable = 1; break; } /* Fall Through */ goto fall_through; case SCTP_PARAM_RANDOM: if (!ep->auth_enable) goto fall_through; /* Save peer's random parameter */ kfree(asoc->peer.peer_random); asoc->peer.peer_random = kmemdup(param.p, ntohs(param.p->length), gfp); if (!asoc->peer.peer_random) { retval = 0; break; } break; case SCTP_PARAM_HMAC_ALGO: if (!ep->auth_enable) goto fall_through; /* Save peer's HMAC list */ kfree(asoc->peer.peer_hmacs); asoc->peer.peer_hmacs = kmemdup(param.p, ntohs(param.p->length), gfp); if (!asoc->peer.peer_hmacs) { retval = 0; break; } /* Set the default HMAC the peer requested*/ sctp_auth_asoc_set_default_hmac(asoc, param.hmac_algo); break; case SCTP_PARAM_CHUNKS: if (!ep->auth_enable) goto fall_through; kfree(asoc->peer.peer_chunks); asoc->peer.peer_chunks = kmemdup(param.p, ntohs(param.p->length), gfp); if (!asoc->peer.peer_chunks) retval = 0; break; fall_through: default: /* Any unrecognized parameters should have been caught * and handled by sctp_verify_param() which should be * called prior to this routine. Simply log the error * here. */ pr_debug("%s: ignoring param:%d for association:%p.\n", __func__, ntohs(param.p->type), asoc); break; } return retval; } /* Select a new verification tag. */ __u32 sctp_generate_tag(const struct sctp_endpoint *ep) { /* I believe that this random number generator complies with RFC1750. * A tag of 0 is reserved for special cases (e.g. INIT). */ __u32 x; do { get_random_bytes(&x, sizeof(__u32)); } while (x == 0); return x; } /* Select an initial TSN to send during startup. */ __u32 sctp_generate_tsn(const struct sctp_endpoint *ep) { __u32 retval; get_random_bytes(&retval, sizeof(__u32)); return retval; } /* * ADDIP 3.1.1 Address Configuration Change Chunk (ASCONF) * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Type = 0xC1 | Chunk Flags | Chunk Length | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Serial Number | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Address Parameter | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | ASCONF Parameter #1 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * \ \ * / .... / * \ \ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | ASCONF Parameter #N | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * * Address Parameter and other parameter will not be wrapped in this function */ static struct sctp_chunk *sctp_make_asconf(struct sctp_association *asoc, union sctp_addr *addr, int vparam_len) { struct sctp_addiphdr asconf; struct sctp_chunk *retval; int length = sizeof(asconf) + vparam_len; union sctp_addr_param addrparam; int addrlen; struct sctp_af *af = sctp_get_af_specific(addr->v4.sin_family); addrlen = af->to_addr_param(addr, &addrparam); if (!addrlen) return NULL; length += addrlen; /* Create the chunk. */ retval = sctp_make_control(asoc, SCTP_CID_ASCONF, 0, length, GFP_ATOMIC); if (!retval) return NULL; asconf.serial = htonl(asoc->addip_serial++); retval->subh.addip_hdr = sctp_addto_chunk(retval, sizeof(asconf), &asconf); retval->param_hdr.v = sctp_addto_chunk(retval, addrlen, &addrparam); return retval; } /* ADDIP * 3.2.1 Add IP Address * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Type = 0xC001 | Length = Variable | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | ASCONF-Request Correlation ID | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Address Parameter | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * * 3.2.2 Delete IP Address * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Type = 0xC002 | Length = Variable | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | ASCONF-Request Correlation ID | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Address Parameter | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * */ struct sctp_chunk *sctp_make_asconf_update_ip(struct sctp_association *asoc, union sctp_addr *laddr, struct sockaddr *addrs, int addrcnt, __be16 flags) { union sctp_addr_param addr_param; struct sctp_addip_param param; int paramlen = sizeof(param); struct sctp_chunk *retval; int addr_param_len = 0; union sctp_addr *addr; int totallen = 0, i; int del_pickup = 0; struct sctp_af *af; void *addr_buf; /* Get total length of all the address parameters. */ addr_buf = addrs; for (i = 0; i < addrcnt; i++) { addr = addr_buf; af = sctp_get_af_specific(addr->v4.sin_family); addr_param_len = af->to_addr_param(addr, &addr_param); totallen += paramlen; totallen += addr_param_len; addr_buf += af->sockaddr_len; if (asoc->asconf_addr_del_pending && !del_pickup) { /* reuse the parameter length from the same scope one */ totallen += paramlen; totallen += addr_param_len; del_pickup = 1; pr_debug("%s: picked same-scope del_pending addr, " "totallen for all addresses is %d\n", __func__, totallen); } } /* Create an asconf chunk with the required length. */ retval = sctp_make_asconf(asoc, laddr, totallen); if (!retval) return NULL; /* Add the address parameters to the asconf chunk. */ addr_buf = addrs; for (i = 0; i < addrcnt; i++) { addr = addr_buf; af = sctp_get_af_specific(addr->v4.sin_family); addr_param_len = af->to_addr_param(addr, &addr_param); param.param_hdr.type = flags; param.param_hdr.length = htons(paramlen + addr_param_len); param.crr_id = htonl(i); sctp_addto_chunk(retval, paramlen, ¶m); sctp_addto_chunk(retval, addr_param_len, &addr_param); addr_buf += af->sockaddr_len; } if (flags == SCTP_PARAM_ADD_IP && del_pickup) { addr = asoc->asconf_addr_del_pending; af = sctp_get_af_specific(addr->v4.sin_family); addr_param_len = af->to_addr_param(addr, &addr_param); param.param_hdr.type = SCTP_PARAM_DEL_IP; param.param_hdr.length = htons(paramlen + addr_param_len); param.crr_id = htonl(i); sctp_addto_chunk(retval, paramlen, ¶m); sctp_addto_chunk(retval, addr_param_len, &addr_param); } return retval; } /* ADDIP * 3.2.4 Set Primary IP Address * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Type =0xC004 | Length = Variable | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | ASCONF-Request Correlation ID | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Address Parameter | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * * Create an ASCONF chunk with Set Primary IP address parameter. */ struct sctp_chunk *sctp_make_asconf_set_prim(struct sctp_association *asoc, union sctp_addr *addr) { struct sctp_af *af = sctp_get_af_specific(addr->v4.sin_family); union sctp_addr_param addrparam; struct sctp_addip_param param; struct sctp_chunk *retval; int len = sizeof(param); int addrlen; addrlen = af->to_addr_param(addr, &addrparam); if (!addrlen) return NULL; len += addrlen; /* Create the chunk and make asconf header. */ retval = sctp_make_asconf(asoc, addr, len); if (!retval) return NULL; param.param_hdr.type = SCTP_PARAM_SET_PRIMARY; param.param_hdr.length = htons(len); param.crr_id = 0; sctp_addto_chunk(retval, sizeof(param), ¶m); sctp_addto_chunk(retval, addrlen, &addrparam); return retval; } /* ADDIP 3.1.2 Address Configuration Acknowledgement Chunk (ASCONF-ACK) * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Type = 0x80 | Chunk Flags | Chunk Length | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Serial Number | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | ASCONF Parameter Response#1 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * \ \ * / .... / * \ \ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | ASCONF Parameter Response#N | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * * Create an ASCONF_ACK chunk with enough space for the parameter responses. */ static struct sctp_chunk *sctp_make_asconf_ack(const struct sctp_association *asoc, __u32 serial, int vparam_len) { struct sctp_addiphdr asconf; struct sctp_chunk *retval; int length = sizeof(asconf) + vparam_len; /* Create the chunk. */ retval = sctp_make_control(asoc, SCTP_CID_ASCONF_ACK, 0, length, GFP_ATOMIC); if (!retval) return NULL; asconf.serial = htonl(serial); retval->subh.addip_hdr = sctp_addto_chunk(retval, sizeof(asconf), &asconf); return retval; } /* Add response parameters to an ASCONF_ACK chunk. */ static void sctp_add_asconf_response(struct sctp_chunk *chunk, __be32 crr_id, __be16 err_code, struct sctp_addip_param *asconf_param) { struct sctp_addip_param ack_param; struct sctp_errhdr err_param; int asconf_param_len = 0; int err_param_len = 0; __be16 response_type; if (SCTP_ERROR_NO_ERROR == err_code) { response_type = SCTP_PARAM_SUCCESS_REPORT; } else { response_type = SCTP_PARAM_ERR_CAUSE; err_param_len = sizeof(err_param); if (asconf_param) asconf_param_len = ntohs(asconf_param->param_hdr.length); } /* Add Success Indication or Error Cause Indication parameter. */ ack_param.param_hdr.type = response_type; ack_param.param_hdr.length = htons(sizeof(ack_param) + err_param_len + asconf_param_len); ack_param.crr_id = crr_id; sctp_addto_chunk(chunk, sizeof(ack_param), &ack_param); if (SCTP_ERROR_NO_ERROR == err_code) return; /* Add Error Cause parameter. */ err_param.cause = err_code; err_param.length = htons(err_param_len + asconf_param_len); sctp_addto_chunk(chunk, err_param_len, &err_param); /* Add the failed TLV copied from ASCONF chunk. */ if (asconf_param) sctp_addto_chunk(chunk, asconf_param_len, asconf_param); } /* Process a asconf parameter. */ static __be16 sctp_process_asconf_param(struct sctp_association *asoc, struct sctp_chunk *asconf, struct sctp_addip_param *asconf_param) { union sctp_addr_param *addr_param; struct sctp_transport *peer; union sctp_addr addr; struct sctp_af *af; addr_param = (void *)asconf_param + sizeof(*asconf_param); if (asconf_param->param_hdr.type != SCTP_PARAM_ADD_IP && asconf_param->param_hdr.type != SCTP_PARAM_DEL_IP && asconf_param->param_hdr.type != SCTP_PARAM_SET_PRIMARY) return SCTP_ERROR_UNKNOWN_PARAM; switch (addr_param->p.type) { case SCTP_PARAM_IPV6_ADDRESS: if (!asoc->peer.ipv6_address) return SCTP_ERROR_DNS_FAILED; break; case SCTP_PARAM_IPV4_ADDRESS: if (!asoc->peer.ipv4_address) return SCTP_ERROR_DNS_FAILED; break; default: return SCTP_ERROR_DNS_FAILED; } af = sctp_get_af_specific(param_type2af(addr_param->p.type)); if (unlikely(!af)) return SCTP_ERROR_DNS_FAILED; if (!af->from_addr_param(&addr, addr_param, htons(asoc->peer.port), 0)) return SCTP_ERROR_DNS_FAILED; /* ADDIP 4.2.1 This parameter MUST NOT contain a broadcast * or multicast address. * (note: wildcard is permitted and requires special handling so * make sure we check for that) */ if (!af->is_any(&addr) && !af->addr_valid(&addr, NULL, asconf->skb)) return SCTP_ERROR_DNS_FAILED; switch (asconf_param->param_hdr.type) { case SCTP_PARAM_ADD_IP: /* Section 4.2.1: * If the address 0.0.0.0 or ::0 is provided, the source * address of the packet MUST be added. */ if (af->is_any(&addr)) memcpy(&addr, &asconf->source, sizeof(addr)); if (security_sctp_bind_connect(asoc->ep->base.sk, SCTP_PARAM_ADD_IP, (struct sockaddr *)&addr, af->sockaddr_len)) return SCTP_ERROR_REQ_REFUSED; /* ADDIP 4.3 D9) If an endpoint receives an ADD IP address * request and does not have the local resources to add this * new address to the association, it MUST return an Error * Cause TLV set to the new error code 'Operation Refused * Due to Resource Shortage'. */ peer = sctp_assoc_add_peer(asoc, &addr, GFP_ATOMIC, SCTP_UNCONFIRMED); if (!peer) return SCTP_ERROR_RSRC_LOW; /* Start the heartbeat timer. */ sctp_transport_reset_hb_timer(peer); asoc->new_transport = peer; break; case SCTP_PARAM_DEL_IP: /* ADDIP 4.3 D7) If a request is received to delete the * last remaining IP address of a peer endpoint, the receiver * MUST send an Error Cause TLV with the error cause set to the * new error code 'Request to Delete Last Remaining IP Address'. */ if (asoc->peer.transport_count == 1) return SCTP_ERROR_DEL_LAST_IP; /* ADDIP 4.3 D8) If a request is received to delete an IP * address which is also the source address of the IP packet * which contained the ASCONF chunk, the receiver MUST reject * this request. To reject the request the receiver MUST send * an Error Cause TLV set to the new error code 'Request to * Delete Source IP Address' */ if (sctp_cmp_addr_exact(&asconf->source, &addr)) return SCTP_ERROR_DEL_SRC_IP; /* Section 4.2.2 * If the address 0.0.0.0 or ::0 is provided, all * addresses of the peer except the source address of the * packet MUST be deleted. */ if (af->is_any(&addr)) { sctp_assoc_set_primary(asoc, asconf->transport); sctp_assoc_del_nonprimary_peers(asoc, asconf->transport); return SCTP_ERROR_NO_ERROR; } /* If the address is not part of the association, the * ASCONF-ACK with Error Cause Indication Parameter * which including cause of Unresolvable Address should * be sent. */ peer = sctp_assoc_lookup_paddr(asoc, &addr); if (!peer) return SCTP_ERROR_DNS_FAILED; sctp_assoc_rm_peer(asoc, peer); break; case SCTP_PARAM_SET_PRIMARY: /* ADDIP Section 4.2.4 * If the address 0.0.0.0 or ::0 is provided, the receiver * MAY mark the source address of the packet as its * primary. */ if (af->is_any(&addr)) memcpy(&addr, sctp_source(asconf), sizeof(addr)); if (security_sctp_bind_connect(asoc->ep->base.sk, SCTP_PARAM_SET_PRIMARY, (struct sockaddr *)&addr, af->sockaddr_len)) return SCTP_ERROR_REQ_REFUSED; peer = sctp_assoc_lookup_paddr(asoc, &addr); if (!peer) return SCTP_ERROR_DNS_FAILED; sctp_assoc_set_primary(asoc, peer); break; } return SCTP_ERROR_NO_ERROR; } /* Verify the ASCONF packet before we process it. */ bool sctp_verify_asconf(const struct sctp_association *asoc, struct sctp_chunk *chunk, bool addr_param_needed, struct sctp_paramhdr **errp) { struct sctp_addip_chunk *addip; bool addr_param_seen = false; union sctp_params param; addip = (struct sctp_addip_chunk *)chunk->chunk_hdr; sctp_walk_params(param, addip) { size_t length = ntohs(param.p->length); *errp = param.p; switch (param.p->type) { case SCTP_PARAM_ERR_CAUSE: break; case SCTP_PARAM_IPV4_ADDRESS: if (length != sizeof(struct sctp_ipv4addr_param)) return false; /* ensure there is only one addr param and it's in the * beginning of addip_hdr params, or we reject it. */ if (param.v != (addip + 1)) return false; addr_param_seen = true; break; case SCTP_PARAM_IPV6_ADDRESS: if (length != sizeof(struct sctp_ipv6addr_param)) return false; if (param.v != (addip + 1)) return false; addr_param_seen = true; break; case SCTP_PARAM_ADD_IP: case SCTP_PARAM_DEL_IP: case SCTP_PARAM_SET_PRIMARY: /* In ASCONF chunks, these need to be first. */ if (addr_param_needed && !addr_param_seen) return false; length = ntohs(param.addip->param_hdr.length); if (length < sizeof(struct sctp_addip_param) + sizeof(**errp)) return false; break; case SCTP_PARAM_SUCCESS_REPORT: case SCTP_PARAM_ADAPTATION_LAYER_IND: if (length != sizeof(struct sctp_addip_param)) return false; break; default: /* This is unknown to us, reject! */ return false; } } /* Remaining sanity checks. */ if (addr_param_needed && !addr_param_seen) return false; if (!addr_param_needed && addr_param_seen) return false; if (param.v != chunk->chunk_end) return false; return true; } /* Process an incoming ASCONF chunk with the next expected serial no. and * return an ASCONF_ACK chunk to be sent in response. */ struct sctp_chunk *sctp_process_asconf(struct sctp_association *asoc, struct sctp_chunk *asconf) { union sctp_addr_param *addr_param; struct sctp_addip_chunk *addip; struct sctp_chunk *asconf_ack; bool all_param_pass = true; struct sctp_addiphdr *hdr; int length = 0, chunk_len; union sctp_params param; __be16 err_code; __u32 serial; addip = (struct sctp_addip_chunk *)asconf->chunk_hdr; chunk_len = ntohs(asconf->chunk_hdr->length) - sizeof(struct sctp_chunkhdr); hdr = (struct sctp_addiphdr *)asconf->skb->data; serial = ntohl(hdr->serial); /* Skip the addiphdr and store a pointer to address parameter. */ length = sizeof(*hdr); addr_param = (union sctp_addr_param *)(asconf->skb->data + length); chunk_len -= length; /* Skip the address parameter and store a pointer to the first * asconf parameter. */ length = ntohs(addr_param->p.length); chunk_len -= length; /* create an ASCONF_ACK chunk. * Based on the definitions of parameters, we know that the size of * ASCONF_ACK parameters are less than or equal to the fourfold of ASCONF * parameters. */ asconf_ack = sctp_make_asconf_ack(asoc, serial, chunk_len * 4); if (!asconf_ack) goto done; /* Process the TLVs contained within the ASCONF chunk. */ sctp_walk_params(param, addip) { /* Skip preceding address parameters. */ if (param.p->type == SCTP_PARAM_IPV4_ADDRESS || param.p->type == SCTP_PARAM_IPV6_ADDRESS) continue; err_code = sctp_process_asconf_param(asoc, asconf, param.addip); /* ADDIP 4.1 A7) * If an error response is received for a TLV parameter, * all TLVs with no response before the failed TLV are * considered successful if not reported. All TLVs after * the failed response are considered unsuccessful unless * a specific success indication is present for the parameter. */ if (err_code != SCTP_ERROR_NO_ERROR) all_param_pass = false; if (!all_param_pass) sctp_add_asconf_response(asconf_ack, param.addip->crr_id, err_code, param.addip); /* ADDIP 4.3 D11) When an endpoint receiving an ASCONF to add * an IP address sends an 'Out of Resource' in its response, it * MUST also fail any subsequent add or delete requests bundled * in the ASCONF. */ if (err_code == SCTP_ERROR_RSRC_LOW) goto done; } done: asoc->peer.addip_serial++; /* If we are sending a new ASCONF_ACK hold a reference to it in assoc * after freeing the reference to old asconf ack if any. */ if (asconf_ack) { sctp_chunk_hold(asconf_ack); list_add_tail(&asconf_ack->transmitted_list, &asoc->asconf_ack_list); } return asconf_ack; } /* Process a asconf parameter that is successfully acked. */ static void sctp_asconf_param_success(struct sctp_association *asoc, struct sctp_addip_param *asconf_param) { struct sctp_bind_addr *bp = &asoc->base.bind_addr; union sctp_addr_param *addr_param; struct sctp_sockaddr_entry *saddr; struct sctp_transport *transport; union sctp_addr addr; struct sctp_af *af; addr_param = (void *)asconf_param + sizeof(*asconf_param); /* We have checked the packet before, so we do not check again. */ af = sctp_get_af_specific(param_type2af(addr_param->p.type)); if (!af->from_addr_param(&addr, addr_param, htons(bp->port), 0)) return; switch (asconf_param->param_hdr.type) { case SCTP_PARAM_ADD_IP: /* This is always done in BH context with a socket lock * held, so the list can not change. */ local_bh_disable(); list_for_each_entry(saddr, &bp->address_list, list) { if (sctp_cmp_addr_exact(&saddr->a, &addr)) saddr->state = SCTP_ADDR_SRC; } local_bh_enable(); list_for_each_entry(transport, &asoc->peer.transport_addr_list, transports) { sctp_transport_dst_release(transport); } break; case SCTP_PARAM_DEL_IP: local_bh_disable(); sctp_del_bind_addr(bp, &addr); if (asoc->asconf_addr_del_pending != NULL && sctp_cmp_addr_exact(asoc->asconf_addr_del_pending, &addr)) { kfree(asoc->asconf_addr_del_pending); asoc->asconf_addr_del_pending = NULL; } local_bh_enable(); list_for_each_entry(transport, &asoc->peer.transport_addr_list, transports) { sctp_transport_dst_release(transport); } break; default: break; } } /* Get the corresponding ASCONF response error code from the ASCONF_ACK chunk * for the given asconf parameter. If there is no response for this parameter, * return the error code based on the third argument 'no_err'. * ADDIP 4.1 * A7) If an error response is received for a TLV parameter, all TLVs with no * response before the failed TLV are considered successful if not reported. * All TLVs after the failed response are considered unsuccessful unless a * specific success indication is present for the parameter. */ static __be16 sctp_get_asconf_response(struct sctp_chunk *asconf_ack, struct sctp_addip_param *asconf_param, int no_err) { struct sctp_addip_param *asconf_ack_param; struct sctp_errhdr *err_param; int asconf_ack_len; __be16 err_code; int length; if (no_err) err_code = SCTP_ERROR_NO_ERROR; else err_code = SCTP_ERROR_REQ_REFUSED; asconf_ack_len = ntohs(asconf_ack->chunk_hdr->length) - sizeof(struct sctp_chunkhdr); /* Skip the addiphdr from the asconf_ack chunk and store a pointer to * the first asconf_ack parameter. */ length = sizeof(struct sctp_addiphdr); asconf_ack_param = (struct sctp_addip_param *)(asconf_ack->skb->data + length); asconf_ack_len -= length; while (asconf_ack_len > 0) { if (asconf_ack_param->crr_id == asconf_param->crr_id) { switch (asconf_ack_param->param_hdr.type) { case SCTP_PARAM_SUCCESS_REPORT: return SCTP_ERROR_NO_ERROR; case SCTP_PARAM_ERR_CAUSE: length = sizeof(*asconf_ack_param); err_param = (void *)asconf_ack_param + length; asconf_ack_len -= length; if (asconf_ack_len > 0) return err_param->cause; else return SCTP_ERROR_INV_PARAM; break; default: return SCTP_ERROR_INV_PARAM; } } length = ntohs(asconf_ack_param->param_hdr.length); asconf_ack_param = (void *)asconf_ack_param + length; asconf_ack_len -= length; } return err_code; } /* Process an incoming ASCONF_ACK chunk against the cached last ASCONF chunk. */ int sctp_process_asconf_ack(struct sctp_association *asoc, struct sctp_chunk *asconf_ack) { struct sctp_chunk *asconf = asoc->addip_last_asconf; struct sctp_addip_param *asconf_param; __be16 err_code = SCTP_ERROR_NO_ERROR; union sctp_addr_param *addr_param; int asconf_len = asconf->skb->len; int all_param_pass = 0; int length = 0; int no_err = 1; int retval = 0; /* Skip the chunkhdr and addiphdr from the last asconf sent and store * a pointer to address parameter. */ length = sizeof(struct sctp_addip_chunk); addr_param = (union sctp_addr_param *)(asconf->skb->data + length); asconf_len -= length; /* Skip the address parameter in the last asconf sent and store a * pointer to the first asconf parameter. */ length = ntohs(addr_param->p.length); asconf_param = (void *)addr_param + length; asconf_len -= length; /* ADDIP 4.1 * A8) If there is no response(s) to specific TLV parameter(s), and no * failures are indicated, then all request(s) are considered * successful. */ if (asconf_ack->skb->len == sizeof(struct sctp_addiphdr)) all_param_pass = 1; /* Process the TLVs contained in the last sent ASCONF chunk. */ while (asconf_len > 0) { if (all_param_pass) err_code = SCTP_ERROR_NO_ERROR; else { err_code = sctp_get_asconf_response(asconf_ack, asconf_param, no_err); if (no_err && (SCTP_ERROR_NO_ERROR != err_code)) no_err = 0; } switch (err_code) { case SCTP_ERROR_NO_ERROR: sctp_asconf_param_success(asoc, asconf_param); break; case SCTP_ERROR_RSRC_LOW: retval = 1; break; case SCTP_ERROR_UNKNOWN_PARAM: /* Disable sending this type of asconf parameter in * future. */ asoc->peer.addip_disabled_mask |= asconf_param->param_hdr.type; break; case SCTP_ERROR_REQ_REFUSED: case SCTP_ERROR_DEL_LAST_IP: case SCTP_ERROR_DEL_SRC_IP: default: break; } /* Skip the processed asconf parameter and move to the next * one. */ length = ntohs(asconf_param->param_hdr.length); asconf_param = (void *)asconf_param + length; asconf_len -= length; } if (no_err && asoc->src_out_of_asoc_ok) { asoc->src_out_of_asoc_ok = 0; sctp_transport_immediate_rtx(asoc->peer.primary_path); } /* Free the cached last sent asconf chunk. */ list_del_init(&asconf->transmitted_list); sctp_chunk_free(asconf); asoc->addip_last_asconf = NULL; return retval; } /* Make a FWD TSN chunk. */ struct sctp_chunk *sctp_make_fwdtsn(const struct sctp_association *asoc, __u32 new_cum_tsn, size_t nstreams, struct sctp_fwdtsn_skip *skiplist) { struct sctp_chunk *retval = NULL; struct sctp_fwdtsn_hdr ftsn_hdr; struct sctp_fwdtsn_skip skip; size_t hint; int i; hint = (nstreams + 1) * sizeof(__u32); retval = sctp_make_control(asoc, SCTP_CID_FWD_TSN, 0, hint, GFP_ATOMIC); if (!retval) return NULL; ftsn_hdr.new_cum_tsn = htonl(new_cum_tsn); retval->subh.fwdtsn_hdr = sctp_addto_chunk(retval, sizeof(ftsn_hdr), &ftsn_hdr); for (i = 0; i < nstreams; i++) { skip.stream = skiplist[i].stream; skip.ssn = skiplist[i].ssn; sctp_addto_chunk(retval, sizeof(skip), &skip); } return retval; } struct sctp_chunk *sctp_make_ifwdtsn(const struct sctp_association *asoc, __u32 new_cum_tsn, size_t nstreams, struct sctp_ifwdtsn_skip *skiplist) { struct sctp_chunk *retval = NULL; struct sctp_ifwdtsn_hdr ftsn_hdr; size_t hint; hint = (nstreams + 1) * sizeof(__u32); retval = sctp_make_control(asoc, SCTP_CID_I_FWD_TSN, 0, hint, GFP_ATOMIC); if (!retval) return NULL; ftsn_hdr.new_cum_tsn = htonl(new_cum_tsn); retval->subh.ifwdtsn_hdr = sctp_addto_chunk(retval, sizeof(ftsn_hdr), &ftsn_hdr); sctp_addto_chunk(retval, nstreams * sizeof(skiplist[0]), skiplist); return retval; } /* RE-CONFIG 3.1 (RE-CONFIG chunk) * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Type = 130 | Chunk Flags | Chunk Length | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * \ \ * / Re-configuration Parameter / * \ \ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * \ \ * / Re-configuration Parameter (optional) / * \ \ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ */ static struct sctp_chunk *sctp_make_reconf(const struct sctp_association *asoc, int length) { struct sctp_reconf_chunk *reconf; struct sctp_chunk *retval; retval = sctp_make_control(asoc, SCTP_CID_RECONF, 0, length, GFP_ATOMIC); if (!retval) return NULL; reconf = (struct sctp_reconf_chunk *)retval->chunk_hdr; retval->param_hdr.v = (u8 *)(reconf + 1); return retval; } /* RE-CONFIG 4.1 (STREAM OUT RESET) * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Parameter Type = 13 | Parameter Length = 16 + 2 * N | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Re-configuration Request Sequence Number | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Re-configuration Response Sequence Number | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Sender's Last Assigned TSN | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Stream Number 1 (optional) | Stream Number 2 (optional) | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * / ...... / * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Stream Number N-1 (optional) | Stream Number N (optional) | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * * RE-CONFIG 4.2 (STREAM IN RESET) * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Parameter Type = 14 | Parameter Length = 8 + 2 * N | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Re-configuration Request Sequence Number | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Stream Number 1 (optional) | Stream Number 2 (optional) | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * / ...... / * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Stream Number N-1 (optional) | Stream Number N (optional) | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ */ struct sctp_chunk *sctp_make_strreset_req( const struct sctp_association *asoc, __u16 stream_num, __be16 *stream_list, bool out, bool in) { __u16 stream_len = stream_num * sizeof(__u16); struct sctp_strreset_outreq outreq; struct sctp_strreset_inreq inreq; struct sctp_chunk *retval; __u16 outlen, inlen; outlen = (sizeof(outreq) + stream_len) * out; inlen = (sizeof(inreq) + stream_len) * in; retval = sctp_make_reconf(asoc, SCTP_PAD4(outlen) + SCTP_PAD4(inlen)); if (!retval) return NULL; if (outlen) { outreq.param_hdr.type = SCTP_PARAM_RESET_OUT_REQUEST; outreq.param_hdr.length = htons(outlen); outreq.request_seq = htonl(asoc->strreset_outseq); outreq.response_seq = htonl(asoc->strreset_inseq - 1); outreq.send_reset_at_tsn = htonl(asoc->next_tsn - 1); sctp_addto_chunk(retval, sizeof(outreq), &outreq); if (stream_len) sctp_addto_chunk(retval, stream_len, stream_list); } if (inlen) { inreq.param_hdr.type = SCTP_PARAM_RESET_IN_REQUEST; inreq.param_hdr.length = htons(inlen); inreq.request_seq = htonl(asoc->strreset_outseq + out); sctp_addto_chunk(retval, sizeof(inreq), &inreq); if (stream_len) sctp_addto_chunk(retval, stream_len, stream_list); } return retval; } /* RE-CONFIG 4.3 (SSN/TSN RESET ALL) * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Parameter Type = 15 | Parameter Length = 8 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Re-configuration Request Sequence Number | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ */ struct sctp_chunk *sctp_make_strreset_tsnreq( const struct sctp_association *asoc) { struct sctp_strreset_tsnreq tsnreq; __u16 length = sizeof(tsnreq); struct sctp_chunk *retval; retval = sctp_make_reconf(asoc, length); if (!retval) return NULL; tsnreq.param_hdr.type = SCTP_PARAM_RESET_TSN_REQUEST; tsnreq.param_hdr.length = htons(length); tsnreq.request_seq = htonl(asoc->strreset_outseq); sctp_addto_chunk(retval, sizeof(tsnreq), &tsnreq); return retval; } /* RE-CONFIG 4.5/4.6 (ADD STREAM) * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Parameter Type = 17 | Parameter Length = 12 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Re-configuration Request Sequence Number | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Number of new streams | Reserved | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ */ struct sctp_chunk *sctp_make_strreset_addstrm( const struct sctp_association *asoc, __u16 out, __u16 in) { struct sctp_strreset_addstrm addstrm; __u16 size = sizeof(addstrm); struct sctp_chunk *retval; retval = sctp_make_reconf(asoc, (!!out + !!in) * size); if (!retval) return NULL; if (out) { addstrm.param_hdr.type = SCTP_PARAM_RESET_ADD_OUT_STREAMS; addstrm.param_hdr.length = htons(size); addstrm.number_of_streams = htons(out); addstrm.request_seq = htonl(asoc->strreset_outseq); addstrm.reserved = 0; sctp_addto_chunk(retval, size, &addstrm); } if (in) { addstrm.param_hdr.type = SCTP_PARAM_RESET_ADD_IN_STREAMS; addstrm.param_hdr.length = htons(size); addstrm.number_of_streams = htons(in); addstrm.request_seq = htonl(asoc->strreset_outseq + !!out); addstrm.reserved = 0; sctp_addto_chunk(retval, size, &addstrm); } return retval; } /* RE-CONFIG 4.4 (RESP) * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Parameter Type = 16 | Parameter Length | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Re-configuration Response Sequence Number | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Result | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ */ struct sctp_chunk *sctp_make_strreset_resp(const struct sctp_association *asoc, __u32 result, __u32 sn) { struct sctp_strreset_resp resp; __u16 length = sizeof(resp); struct sctp_chunk *retval; retval = sctp_make_reconf(asoc, length); if (!retval) return NULL; resp.param_hdr.type = SCTP_PARAM_RESET_RESPONSE; resp.param_hdr.length = htons(length); resp.response_seq = htonl(sn); resp.result = htonl(result); sctp_addto_chunk(retval, sizeof(resp), &resp); return retval; } /* RE-CONFIG 4.4 OPTIONAL (TSNRESP) * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Parameter Type = 16 | Parameter Length | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Re-configuration Response Sequence Number | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Result | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Sender's Next TSN (optional) | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Receiver's Next TSN (optional) | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ */ struct sctp_chunk *sctp_make_strreset_tsnresp(struct sctp_association *asoc, __u32 result, __u32 sn, __u32 sender_tsn, __u32 receiver_tsn) { struct sctp_strreset_resptsn tsnresp; __u16 length = sizeof(tsnresp); struct sctp_chunk *retval; retval = sctp_make_reconf(asoc, length); if (!retval) return NULL; tsnresp.param_hdr.type = SCTP_PARAM_RESET_RESPONSE; tsnresp.param_hdr.length = htons(length); tsnresp.response_seq = htonl(sn); tsnresp.result = htonl(result); tsnresp.senders_next_tsn = htonl(sender_tsn); tsnresp.receivers_next_tsn = htonl(receiver_tsn); sctp_addto_chunk(retval, sizeof(tsnresp), &tsnresp); return retval; } bool sctp_verify_reconf(const struct sctp_association *asoc, struct sctp_chunk *chunk, struct sctp_paramhdr **errp) { struct sctp_reconf_chunk *hdr; union sctp_params param; __be16 last = 0; __u16 cnt = 0; hdr = (struct sctp_reconf_chunk *)chunk->chunk_hdr; sctp_walk_params(param, hdr) { __u16 length = ntohs(param.p->length); *errp = param.p; if (cnt++ > 2) return false; switch (param.p->type) { case SCTP_PARAM_RESET_OUT_REQUEST: if (length < sizeof(struct sctp_strreset_outreq) || (last && last != SCTP_PARAM_RESET_RESPONSE && last != SCTP_PARAM_RESET_IN_REQUEST)) return false; break; case SCTP_PARAM_RESET_IN_REQUEST: if (length < sizeof(struct sctp_strreset_inreq) || (last && last != SCTP_PARAM_RESET_OUT_REQUEST)) return false; break; case SCTP_PARAM_RESET_RESPONSE: if ((length != sizeof(struct sctp_strreset_resp) && length != sizeof(struct sctp_strreset_resptsn)) || (last && last != SCTP_PARAM_RESET_RESPONSE && last != SCTP_PARAM_RESET_OUT_REQUEST)) return false; break; case SCTP_PARAM_RESET_TSN_REQUEST: if (length != sizeof(struct sctp_strreset_tsnreq) || last) return false; break; case SCTP_PARAM_RESET_ADD_IN_STREAMS: if (length != sizeof(struct sctp_strreset_addstrm) || (last && last != SCTP_PARAM_RESET_ADD_OUT_STREAMS)) return false; break; case SCTP_PARAM_RESET_ADD_OUT_STREAMS: if (length != sizeof(struct sctp_strreset_addstrm) || (last && last != SCTP_PARAM_RESET_ADD_IN_STREAMS)) return false; break; default: return false; } last = param.p->type; } return true; } |
| 1008 2 3989 112 181 106 235 106 55 124 6418 2960 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_BITMAP_H #define __LINUX_BITMAP_H #ifndef __ASSEMBLY__ #include <linux/align.h> #include <linux/bitops.h> #include <linux/errno.h> #include <linux/find.h> #include <linux/limits.h> #include <linux/string.h> #include <linux/types.h> #include <linux/bitmap-str.h> struct device; /* * bitmaps provide bit arrays that consume one or more unsigned * longs. The bitmap interface and available operations are listed * here, in bitmap.h * * Function implementations generic to all architectures are in * lib/bitmap.c. Functions implementations that are architecture * specific are in various include/asm-<arch>/bitops.h headers * and other arch/<arch> specific files. * * See lib/bitmap.c for more details. */ /** * DOC: bitmap overview * * The available bitmap operations and their rough meaning in the * case that the bitmap is a single unsigned long are thus: * * The generated code is more efficient when nbits is known at * compile-time and at most BITS_PER_LONG. * * :: * * bitmap_zero(dst, nbits) *dst = 0UL * bitmap_fill(dst, nbits) *dst = ~0UL * bitmap_copy(dst, src, nbits) *dst = *src * bitmap_and(dst, src1, src2, nbits) *dst = *src1 & *src2 * bitmap_or(dst, src1, src2, nbits) *dst = *src1 | *src2 * bitmap_xor(dst, src1, src2, nbits) *dst = *src1 ^ *src2 * bitmap_andnot(dst, src1, src2, nbits) *dst = *src1 & ~(*src2) * bitmap_complement(dst, src, nbits) *dst = ~(*src) * bitmap_equal(src1, src2, nbits) Are *src1 and *src2 equal? * bitmap_intersects(src1, src2, nbits) Do *src1 and *src2 overlap? * bitmap_subset(src1, src2, nbits) Is *src1 a subset of *src2? * bitmap_empty(src, nbits) Are all bits zero in *src? * bitmap_full(src, nbits) Are all bits set in *src? * bitmap_weight(src, nbits) Hamming Weight: number set bits * bitmap_weight_and(src1, src2, nbits) Hamming Weight of and'ed bitmap * bitmap_set(dst, pos, nbits) Set specified bit area * bitmap_clear(dst, pos, nbits) Clear specified bit area * bitmap_find_next_zero_area(buf, len, pos, n, mask) Find bit free area * bitmap_find_next_zero_area_off(buf, len, pos, n, mask, mask_off) as above * bitmap_shift_right(dst, src, n, nbits) *dst = *src >> n * bitmap_shift_left(dst, src, n, nbits) *dst = *src << n * bitmap_cut(dst, src, first, n, nbits) Cut n bits from first, copy rest * bitmap_replace(dst, old, new, mask, nbits) *dst = (*old & ~(*mask)) | (*new & *mask) * bitmap_remap(dst, src, old, new, nbits) *dst = map(old, new)(src) * bitmap_bitremap(oldbit, old, new, nbits) newbit = map(old, new)(oldbit) * bitmap_onto(dst, orig, relmap, nbits) *dst = orig relative to relmap * bitmap_fold(dst, orig, sz, nbits) dst bits = orig bits mod sz * bitmap_parse(buf, buflen, dst, nbits) Parse bitmap dst from kernel buf * bitmap_parse_user(ubuf, ulen, dst, nbits) Parse bitmap dst from user buf * bitmap_parselist(buf, dst, nbits) Parse bitmap dst from kernel buf * bitmap_parselist_user(buf, dst, nbits) Parse bitmap dst from user buf * bitmap_find_free_region(bitmap, bits, order) Find and allocate bit region * bitmap_release_region(bitmap, pos, order) Free specified bit region * bitmap_allocate_region(bitmap, pos, order) Allocate specified bit region * bitmap_from_arr32(dst, buf, nbits) Copy nbits from u32[] buf to dst * bitmap_from_arr64(dst, buf, nbits) Copy nbits from u64[] buf to dst * bitmap_to_arr32(buf, src, nbits) Copy nbits from buf to u32[] dst * bitmap_to_arr64(buf, src, nbits) Copy nbits from buf to u64[] dst * bitmap_get_value8(map, start) Get 8bit value from map at start * bitmap_set_value8(map, value, start) Set 8bit value to map at start * * Note, bitmap_zero() and bitmap_fill() operate over the region of * unsigned longs, that is, bits behind bitmap till the unsigned long * boundary will be zeroed or filled as well. Consider to use * bitmap_clear() or bitmap_set() to make explicit zeroing or filling * respectively. */ /** * DOC: bitmap bitops * * Also the following operations in asm/bitops.h apply to bitmaps.:: * * set_bit(bit, addr) *addr |= bit * clear_bit(bit, addr) *addr &= ~bit * change_bit(bit, addr) *addr ^= bit * test_bit(bit, addr) Is bit set in *addr? * test_and_set_bit(bit, addr) Set bit and return old value * test_and_clear_bit(bit, addr) Clear bit and return old value * test_and_change_bit(bit, addr) Change bit and return old value * find_first_zero_bit(addr, nbits) Position first zero bit in *addr * find_first_bit(addr, nbits) Position first set bit in *addr * find_next_zero_bit(addr, nbits, bit) * Position next zero bit in *addr >= bit * find_next_bit(addr, nbits, bit) Position next set bit in *addr >= bit * find_next_and_bit(addr1, addr2, nbits, bit) * Same as find_next_bit, but in * (*addr1 & *addr2) * */ /** * DOC: declare bitmap * The DECLARE_BITMAP(name,bits) macro, in linux/types.h, can be used * to declare an array named 'name' of just enough unsigned longs to * contain all bit positions from 0 to 'bits' - 1. */ /* * Allocation and deallocation of bitmap. * Provided in lib/bitmap.c to avoid circular dependency. */ unsigned long *bitmap_alloc(unsigned int nbits, gfp_t flags); unsigned long *bitmap_zalloc(unsigned int nbits, gfp_t flags); unsigned long *bitmap_alloc_node(unsigned int nbits, gfp_t flags, int node); unsigned long *bitmap_zalloc_node(unsigned int nbits, gfp_t flags, int node); void bitmap_free(const unsigned long *bitmap); /* Managed variants of the above. */ unsigned long *devm_bitmap_alloc(struct device *dev, unsigned int nbits, gfp_t flags); unsigned long *devm_bitmap_zalloc(struct device *dev, unsigned int nbits, gfp_t flags); /* * lib/bitmap.c provides these functions: */ bool __bitmap_equal(const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int nbits); bool __pure __bitmap_or_equal(const unsigned long *src1, const unsigned long *src2, const unsigned long *src3, unsigned int nbits); void __bitmap_complement(unsigned long *dst, const unsigned long *src, unsigned int nbits); void __bitmap_shift_right(unsigned long *dst, const unsigned long *src, unsigned int shift, unsigned int nbits); void __bitmap_shift_left(unsigned long *dst, const unsigned long *src, unsigned int shift, unsigned int nbits); void bitmap_cut(unsigned long *dst, const unsigned long *src, unsigned int first, unsigned int cut, unsigned int nbits); bool __bitmap_and(unsigned long *dst, const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int nbits); void __bitmap_or(unsigned long *dst, const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int nbits); void __bitmap_xor(unsigned long *dst, const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int nbits); bool __bitmap_andnot(unsigned long *dst, const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int nbits); void __bitmap_replace(unsigned long *dst, const unsigned long *old, const unsigned long *new, const unsigned long *mask, unsigned int nbits); bool __bitmap_intersects(const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int nbits); bool __bitmap_subset(const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int nbits); unsigned int __bitmap_weight(const unsigned long *bitmap, unsigned int nbits); unsigned int __bitmap_weight_and(const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int nbits); void __bitmap_set(unsigned long *map, unsigned int start, int len); void __bitmap_clear(unsigned long *map, unsigned int start, int len); unsigned long bitmap_find_next_zero_area_off(unsigned long *map, unsigned long size, unsigned long start, unsigned int nr, unsigned long align_mask, unsigned long align_offset); /** * bitmap_find_next_zero_area - find a contiguous aligned zero area * @map: The address to base the search on * @size: The bitmap size in bits * @start: The bitnumber to start searching at * @nr: The number of zeroed bits we're looking for * @align_mask: Alignment mask for zero area * * The @align_mask should be one less than a power of 2; the effect is that * the bit offset of all zero areas this function finds is multiples of that * power of 2. A @align_mask of 0 means no alignment is required. */ static inline unsigned long bitmap_find_next_zero_area(unsigned long *map, unsigned long size, unsigned long start, unsigned int nr, unsigned long align_mask) { return bitmap_find_next_zero_area_off(map, size, start, nr, align_mask, 0); } void bitmap_remap(unsigned long *dst, const unsigned long *src, const unsigned long *old, const unsigned long *new, unsigned int nbits); int bitmap_bitremap(int oldbit, const unsigned long *old, const unsigned long *new, int bits); void bitmap_onto(unsigned long *dst, const unsigned long *orig, const unsigned long *relmap, unsigned int bits); void bitmap_fold(unsigned long *dst, const unsigned long *orig, unsigned int sz, unsigned int nbits); #define BITMAP_FIRST_WORD_MASK(start) (~0UL << ((start) & (BITS_PER_LONG - 1))) #define BITMAP_LAST_WORD_MASK(nbits) (~0UL >> (-(nbits) & (BITS_PER_LONG - 1))) static inline void bitmap_zero(unsigned long *dst, unsigned int nbits) { unsigned int len = BITS_TO_LONGS(nbits) * sizeof(unsigned long); if (small_const_nbits(nbits)) *dst = 0; else memset(dst, 0, len); } static inline void bitmap_fill(unsigned long *dst, unsigned int nbits) { unsigned int len = BITS_TO_LONGS(nbits) * sizeof(unsigned long); if (small_const_nbits(nbits)) *dst = ~0UL; else memset(dst, 0xff, len); } static inline void bitmap_copy(unsigned long *dst, const unsigned long *src, unsigned int nbits) { unsigned int len = BITS_TO_LONGS(nbits) * sizeof(unsigned long); if (small_const_nbits(nbits)) *dst = *src; else memcpy(dst, src, len); } /* * Copy bitmap and clear tail bits in last word. */ static inline void bitmap_copy_clear_tail(unsigned long *dst, const unsigned long *src, unsigned int nbits) { bitmap_copy(dst, src, nbits); if (nbits % BITS_PER_LONG) dst[nbits / BITS_PER_LONG] &= BITMAP_LAST_WORD_MASK(nbits); } /* * On 32-bit systems bitmaps are represented as u32 arrays internally. On LE64 * machines the order of hi and lo parts of numbers match the bitmap structure. * In both cases conversion is not needed when copying data from/to arrays of * u32. But in LE64 case, typecast in bitmap_copy_clear_tail() may lead * to out-of-bound access. To avoid that, both LE and BE variants of 64-bit * architectures are not using bitmap_copy_clear_tail(). */ #if BITS_PER_LONG == 64 void bitmap_from_arr32(unsigned long *bitmap, const u32 *buf, unsigned int nbits); void bitmap_to_arr32(u32 *buf, const unsigned long *bitmap, unsigned int nbits); #else #define bitmap_from_arr32(bitmap, buf, nbits) \ bitmap_copy_clear_tail((unsigned long *) (bitmap), \ (const unsigned long *) (buf), (nbits)) #define bitmap_to_arr32(buf, bitmap, nbits) \ bitmap_copy_clear_tail((unsigned long *) (buf), \ (const unsigned long *) (bitmap), (nbits)) #endif /* * On 64-bit systems bitmaps are represented as u64 arrays internally. So, * the conversion is not needed when copying data from/to arrays of u64. */ #if BITS_PER_LONG == 32 void bitmap_from_arr64(unsigned long *bitmap, const u64 *buf, unsigned int nbits); void bitmap_to_arr64(u64 *buf, const unsigned long *bitmap, unsigned int nbits); #else #define bitmap_from_arr64(bitmap, buf, nbits) \ bitmap_copy_clear_tail((unsigned long *)(bitmap), (const unsigned long *)(buf), (nbits)) #define bitmap_to_arr64(buf, bitmap, nbits) \ bitmap_copy_clear_tail((unsigned long *)(buf), (const unsigned long *)(bitmap), (nbits)) #endif static inline bool bitmap_and(unsigned long *dst, const unsigned long *src1, const unsigned long *src2, unsigned int nbits) { if (small_const_nbits(nbits)) return (*dst = *src1 & *src2 & BITMAP_LAST_WORD_MASK(nbits)) != 0; return __bitmap_and(dst, src1, src2, nbits); } static inline void bitmap_or(unsigned long *dst, const unsigned long *src1, const unsigned long *src2, unsigned int nbits) { if (small_const_nbits(nbits)) *dst = *src1 | *src2; else __bitmap_or(dst, src1, src2, nbits); } static inline void bitmap_xor(unsigned long *dst, const unsigned long *src1, const unsigned long *src2, unsigned int nbits) { if (small_const_nbits(nbits)) *dst = *src1 ^ *src2; else __bitmap_xor(dst, src1, src2, nbits); } static inline bool bitmap_andnot(unsigned long *dst, const unsigned long *src1, const unsigned long *src2, unsigned int nbits) { if (small_const_nbits(nbits)) return (*dst = *src1 & ~(*src2) & BITMAP_LAST_WORD_MASK(nbits)) != 0; return __bitmap_andnot(dst, src1, src2, nbits); } static inline void bitmap_complement(unsigned long *dst, const unsigned long *src, unsigned int nbits) { if (small_const_nbits(nbits)) *dst = ~(*src); else __bitmap_complement(dst, src, nbits); } #ifdef __LITTLE_ENDIAN #define BITMAP_MEM_ALIGNMENT 8 #else #define BITMAP_MEM_ALIGNMENT (8 * sizeof(unsigned long)) #endif #define BITMAP_MEM_MASK (BITMAP_MEM_ALIGNMENT - 1) static inline bool bitmap_equal(const unsigned long *src1, const unsigned long *src2, unsigned int nbits) { if (small_const_nbits(nbits)) return !((*src1 ^ *src2) & BITMAP_LAST_WORD_MASK(nbits)); if (__builtin_constant_p(nbits & BITMAP_MEM_MASK) && IS_ALIGNED(nbits, BITMAP_MEM_ALIGNMENT)) return !memcmp(src1, src2, nbits / 8); return __bitmap_equal(src1, src2, nbits); } /** * bitmap_or_equal - Check whether the or of two bitmaps is equal to a third * @src1: Pointer to bitmap 1 * @src2: Pointer to bitmap 2 will be or'ed with bitmap 1 * @src3: Pointer to bitmap 3. Compare to the result of *@src1 | *@src2 * @nbits: number of bits in each of these bitmaps * * Returns: True if (*@src1 | *@src2) == *@src3, false otherwise */ static inline bool bitmap_or_equal(const unsigned long *src1, const unsigned long *src2, const unsigned long *src3, unsigned int nbits) { if (!small_const_nbits(nbits)) return __bitmap_or_equal(src1, src2, src3, nbits); return !(((*src1 | *src2) ^ *src3) & BITMAP_LAST_WORD_MASK(nbits)); } static inline bool bitmap_intersects(const unsigned long *src1, const unsigned long *src2, unsigned int nbits) { if (small_const_nbits(nbits)) return ((*src1 & *src2) & BITMAP_LAST_WORD_MASK(nbits)) != 0; else return __bitmap_intersects(src1, src2, nbits); } static inline bool bitmap_subset(const unsigned long *src1, const unsigned long *src2, unsigned int nbits) { if (small_const_nbits(nbits)) return ! ((*src1 & ~(*src2)) & BITMAP_LAST_WORD_MASK(nbits)); else return __bitmap_subset(src1, src2, nbits); } static inline bool bitmap_empty(const unsigned long *src, unsigned nbits) { if (small_const_nbits(nbits)) return ! (*src & BITMAP_LAST_WORD_MASK(nbits)); return find_first_bit(src, nbits) == nbits; } static inline bool bitmap_full(const unsigned long *src, unsigned int nbits) { if (small_const_nbits(nbits)) return ! (~(*src) & BITMAP_LAST_WORD_MASK(nbits)); return find_first_zero_bit(src, nbits) == nbits; } static __always_inline unsigned int bitmap_weight(const unsigned long *src, unsigned int nbits) { if (small_const_nbits(nbits)) return hweight_long(*src & BITMAP_LAST_WORD_MASK(nbits)); return __bitmap_weight(src, nbits); } static __always_inline unsigned long bitmap_weight_and(const unsigned long *src1, const unsigned long *src2, unsigned int nbits) { if (small_const_nbits(nbits)) return hweight_long(*src1 & *src2 & BITMAP_LAST_WORD_MASK(nbits)); return __bitmap_weight_and(src1, src2, nbits); } static __always_inline void bitmap_set(unsigned long *map, unsigned int start, unsigned int nbits) { if (__builtin_constant_p(nbits) && nbits == 1) __set_bit(start, map); else if (small_const_nbits(start + nbits)) *map |= GENMASK(start + nbits - 1, start); else if (__builtin_constant_p(start & BITMAP_MEM_MASK) && IS_ALIGNED(start, BITMAP_MEM_ALIGNMENT) && __builtin_constant_p(nbits & BITMAP_MEM_MASK) && IS_ALIGNED(nbits, BITMAP_MEM_ALIGNMENT)) memset((char *)map + start / 8, 0xff, nbits / 8); else __bitmap_set(map, start, nbits); } static __always_inline void bitmap_clear(unsigned long *map, unsigned int start, unsigned int nbits) { if (__builtin_constant_p(nbits) && nbits == 1) __clear_bit(start, map); else if (small_const_nbits(start + nbits)) *map &= ~GENMASK(start + nbits - 1, start); else if (__builtin_constant_p(start & BITMAP_MEM_MASK) && IS_ALIGNED(start, BITMAP_MEM_ALIGNMENT) && __builtin_constant_p(nbits & BITMAP_MEM_MASK) && IS_ALIGNED(nbits, BITMAP_MEM_ALIGNMENT)) memset((char *)map + start / 8, 0, nbits / 8); else __bitmap_clear(map, start, nbits); } static inline void bitmap_shift_right(unsigned long *dst, const unsigned long *src, unsigned int shift, unsigned int nbits) { if (small_const_nbits(nbits)) *dst = (*src & BITMAP_LAST_WORD_MASK(nbits)) >> shift; else __bitmap_shift_right(dst, src, shift, nbits); } static inline void bitmap_shift_left(unsigned long *dst, const unsigned long *src, unsigned int shift, unsigned int nbits) { if (small_const_nbits(nbits)) *dst = (*src << shift) & BITMAP_LAST_WORD_MASK(nbits); else __bitmap_shift_left(dst, src, shift, nbits); } static inline void bitmap_replace(unsigned long *dst, const unsigned long *old, const unsigned long *new, const unsigned long *mask, unsigned int nbits) { if (small_const_nbits(nbits)) *dst = (*old & ~(*mask)) | (*new & *mask); else __bitmap_replace(dst, old, new, mask, nbits); } static inline void bitmap_next_set_region(unsigned long *bitmap, unsigned int *rs, unsigned int *re, unsigned int end) { *rs = find_next_bit(bitmap, end, *rs); *re = find_next_zero_bit(bitmap, end, *rs + 1); } /** * bitmap_release_region - release allocated bitmap region * @bitmap: array of unsigned longs corresponding to the bitmap * @pos: beginning of bit region to release * @order: region size (log base 2 of number of bits) to release * * This is the complement to __bitmap_find_free_region() and releases * the found region (by clearing it in the bitmap). */ static inline void bitmap_release_region(unsigned long *bitmap, unsigned int pos, int order) { bitmap_clear(bitmap, pos, BIT(order)); } /** * bitmap_allocate_region - allocate bitmap region * @bitmap: array of unsigned longs corresponding to the bitmap * @pos: beginning of bit region to allocate * @order: region size (log base 2 of number of bits) to allocate * * Allocate (set bits in) a specified region of a bitmap. * * Returns: 0 on success, or %-EBUSY if specified region wasn't * free (not all bits were zero). */ static inline int bitmap_allocate_region(unsigned long *bitmap, unsigned int pos, int order) { unsigned int len = BIT(order); if (find_next_bit(bitmap, pos + len, pos) < pos + len) return -EBUSY; bitmap_set(bitmap, pos, len); return 0; } /** * bitmap_find_free_region - find a contiguous aligned mem region * @bitmap: array of unsigned longs corresponding to the bitmap * @bits: number of bits in the bitmap * @order: region size (log base 2 of number of bits) to find * * Find a region of free (zero) bits in a @bitmap of @bits bits and * allocate them (set them to one). Only consider regions of length * a power (@order) of two, aligned to that power of two, which * makes the search algorithm much faster. * * Returns: the bit offset in bitmap of the allocated region, * or -errno on failure. */ static inline int bitmap_find_free_region(unsigned long *bitmap, unsigned int bits, int order) { unsigned int pos, end; /* scans bitmap by regions of size order */ for (pos = 0; (end = pos + BIT(order)) <= bits; pos = end) { if (!bitmap_allocate_region(bitmap, pos, order)) return pos; } return -ENOMEM; } /** * BITMAP_FROM_U64() - Represent u64 value in the format suitable for bitmap. * @n: u64 value * * Linux bitmaps are internally arrays of unsigned longs, i.e. 32-bit * integers in 32-bit environment, and 64-bit integers in 64-bit one. * * There are four combinations of endianness and length of the word in linux * ABIs: LE64, BE64, LE32 and BE32. * * On 64-bit kernels 64-bit LE and BE numbers are naturally ordered in * bitmaps and therefore don't require any special handling. * * On 32-bit kernels 32-bit LE ABI orders lo word of 64-bit number in memory * prior to hi, and 32-bit BE orders hi word prior to lo. The bitmap on the * other hand is represented as an array of 32-bit words and the position of * bit N may therefore be calculated as: word #(N/32) and bit #(N%32) in that * word. For example, bit #42 is located at 10th position of 2nd word. * It matches 32-bit LE ABI, and we can simply let the compiler store 64-bit * values in memory as it usually does. But for BE we need to swap hi and lo * words manually. * * With all that, the macro BITMAP_FROM_U64() does explicit reordering of hi and * lo parts of u64. For LE32 it does nothing, and for BE environment it swaps * hi and lo words, as is expected by bitmap. */ #if __BITS_PER_LONG == 64 #define BITMAP_FROM_U64(n) (n) #else #define BITMAP_FROM_U64(n) ((unsigned long) ((u64)(n) & ULONG_MAX)), \ ((unsigned long) ((u64)(n) >> 32)) #endif /** * bitmap_from_u64 - Check and swap words within u64. * @mask: source bitmap * @dst: destination bitmap * * In 32-bit Big Endian kernel, when using ``(u32 *)(&val)[*]`` * to read u64 mask, we will get the wrong word. * That is ``(u32 *)(&val)[0]`` gets the upper 32 bits, * but we expect the lower 32-bits of u64. */ static inline void bitmap_from_u64(unsigned long *dst, u64 mask) { bitmap_from_arr64(dst, &mask, 64); } /** * bitmap_get_value8 - get an 8-bit value within a memory region * @map: address to the bitmap memory region * @start: bit offset of the 8-bit value; must be a multiple of 8 * * Returns the 8-bit value located at the @start bit offset within the @src * memory region. */ static inline unsigned long bitmap_get_value8(const unsigned long *map, unsigned long start) { const size_t index = BIT_WORD(start); const unsigned long offset = start % BITS_PER_LONG; return (map[index] >> offset) & 0xFF; } /** * bitmap_set_value8 - set an 8-bit value within a memory region * @map: address to the bitmap memory region * @value: the 8-bit value; values wider than 8 bits may clobber bitmap * @start: bit offset of the 8-bit value; must be a multiple of 8 */ static inline void bitmap_set_value8(unsigned long *map, unsigned long value, unsigned long start) { const size_t index = BIT_WORD(start); const unsigned long offset = start % BITS_PER_LONG; map[index] &= ~(0xFFUL << offset); map[index] |= value << offset; } #endif /* __ASSEMBLY__ */ #endif /* __LINUX_BITMAP_H */ |
| 3218 3218 44 44 3218 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 | // SPDX-License-Identifier: GPL-2.0 /* * Block rq-qos policy for assigning an I/O priority class to requests. * * Using an rq-qos policy for assigning I/O priority class has two advantages * over using the ioprio_set() system call: * * - This policy is cgroup based so it has all the advantages of cgroups. * - While ioprio_set() does not affect page cache writeback I/O, this rq-qos * controller affects page cache writeback I/O for filesystems that support * assiociating a cgroup with writeback I/O. See also * Documentation/admin-guide/cgroup-v2.rst. */ #include <linux/blk-mq.h> #include <linux/blk_types.h> #include <linux/kernel.h> #include <linux/module.h> #include "blk-cgroup.h" #include "blk-ioprio.h" #include "blk-rq-qos.h" /** * enum prio_policy - I/O priority class policy. * @POLICY_NO_CHANGE: (default) do not modify the I/O priority class. * @POLICY_PROMOTE_TO_RT: modify no-IOPRIO_CLASS_RT to IOPRIO_CLASS_RT. * @POLICY_RESTRICT_TO_BE: modify IOPRIO_CLASS_NONE and IOPRIO_CLASS_RT into * IOPRIO_CLASS_BE. * @POLICY_ALL_TO_IDLE: change the I/O priority class into IOPRIO_CLASS_IDLE. * @POLICY_NONE_TO_RT: an alias for POLICY_PROMOTE_TO_RT. * * See also <linux/ioprio.h>. */ enum prio_policy { POLICY_NO_CHANGE = 0, POLICY_PROMOTE_TO_RT = 1, POLICY_RESTRICT_TO_BE = 2, POLICY_ALL_TO_IDLE = 3, POLICY_NONE_TO_RT = 4, }; static const char *policy_name[] = { [POLICY_NO_CHANGE] = "no-change", [POLICY_PROMOTE_TO_RT] = "promote-to-rt", [POLICY_RESTRICT_TO_BE] = "restrict-to-be", [POLICY_ALL_TO_IDLE] = "idle", [POLICY_NONE_TO_RT] = "none-to-rt", }; static struct blkcg_policy ioprio_policy; /** * struct ioprio_blkg - Per (cgroup, request queue) data. * @pd: blkg_policy_data structure. */ struct ioprio_blkg { struct blkg_policy_data pd; }; /** * struct ioprio_blkcg - Per cgroup data. * @cpd: blkcg_policy_data structure. * @prio_policy: One of the IOPRIO_CLASS_* values. See also <linux/ioprio.h>. */ struct ioprio_blkcg { struct blkcg_policy_data cpd; enum prio_policy prio_policy; }; static inline struct ioprio_blkg *pd_to_ioprio(struct blkg_policy_data *pd) { return pd ? container_of(pd, struct ioprio_blkg, pd) : NULL; } static struct ioprio_blkcg *blkcg_to_ioprio_blkcg(struct blkcg *blkcg) { return container_of(blkcg_to_cpd(blkcg, &ioprio_policy), struct ioprio_blkcg, cpd); } static struct ioprio_blkcg * ioprio_blkcg_from_css(struct cgroup_subsys_state *css) { return blkcg_to_ioprio_blkcg(css_to_blkcg(css)); } static struct ioprio_blkcg *ioprio_blkcg_from_bio(struct bio *bio) { struct blkg_policy_data *pd = blkg_to_pd(bio->bi_blkg, &ioprio_policy); if (!pd) return NULL; return blkcg_to_ioprio_blkcg(pd->blkg->blkcg); } static int ioprio_show_prio_policy(struct seq_file *sf, void *v) { struct ioprio_blkcg *blkcg = ioprio_blkcg_from_css(seq_css(sf)); seq_printf(sf, "%s\n", policy_name[blkcg->prio_policy]); return 0; } static ssize_t ioprio_set_prio_policy(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct ioprio_blkcg *blkcg = ioprio_blkcg_from_css(of_css(of)); int ret; if (off != 0) return -EIO; /* kernfs_fop_write_iter() terminates 'buf' with '\0'. */ ret = sysfs_match_string(policy_name, buf); if (ret < 0) return ret; blkcg->prio_policy = ret; return nbytes; } static struct blkg_policy_data * ioprio_alloc_pd(struct gendisk *disk, struct blkcg *blkcg, gfp_t gfp) { struct ioprio_blkg *ioprio_blkg; ioprio_blkg = kzalloc(sizeof(*ioprio_blkg), gfp); if (!ioprio_blkg) return NULL; return &ioprio_blkg->pd; } static void ioprio_free_pd(struct blkg_policy_data *pd) { struct ioprio_blkg *ioprio_blkg = pd_to_ioprio(pd); kfree(ioprio_blkg); } static struct blkcg_policy_data *ioprio_alloc_cpd(gfp_t gfp) { struct ioprio_blkcg *blkcg; blkcg = kzalloc(sizeof(*blkcg), gfp); if (!blkcg) return NULL; blkcg->prio_policy = POLICY_NO_CHANGE; return &blkcg->cpd; } static void ioprio_free_cpd(struct blkcg_policy_data *cpd) { struct ioprio_blkcg *blkcg = container_of(cpd, typeof(*blkcg), cpd); kfree(blkcg); } #define IOPRIO_ATTRS \ { \ .name = "prio.class", \ .seq_show = ioprio_show_prio_policy, \ .write = ioprio_set_prio_policy, \ }, \ { } /* sentinel */ /* cgroup v2 attributes */ static struct cftype ioprio_files[] = { IOPRIO_ATTRS }; /* cgroup v1 attributes */ static struct cftype ioprio_legacy_files[] = { IOPRIO_ATTRS }; static struct blkcg_policy ioprio_policy = { .dfl_cftypes = ioprio_files, .legacy_cftypes = ioprio_legacy_files, .cpd_alloc_fn = ioprio_alloc_cpd, .cpd_free_fn = ioprio_free_cpd, .pd_alloc_fn = ioprio_alloc_pd, .pd_free_fn = ioprio_free_pd, }; void blkcg_set_ioprio(struct bio *bio) { struct ioprio_blkcg *blkcg = ioprio_blkcg_from_bio(bio); u16 prio; if (!blkcg || blkcg->prio_policy == POLICY_NO_CHANGE) return; if (blkcg->prio_policy == POLICY_PROMOTE_TO_RT || blkcg->prio_policy == POLICY_NONE_TO_RT) { /* * For RT threads, the default priority level is 4 because * task_nice is 0. By promoting non-RT io-priority to RT-class * and default level 4, those requests that are already * RT-class but need a higher io-priority can use ioprio_set() * to achieve this. */ if (IOPRIO_PRIO_CLASS(bio->bi_ioprio) != IOPRIO_CLASS_RT) bio->bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_RT, 4); return; } /* * Except for IOPRIO_CLASS_NONE, higher I/O priority numbers * correspond to a lower priority. Hence, the max_t() below selects * the lower priority of bi_ioprio and the cgroup I/O priority class. * If the bio I/O priority equals IOPRIO_CLASS_NONE, the cgroup I/O * priority is assigned to the bio. */ prio = max_t(u16, bio->bi_ioprio, IOPRIO_PRIO_VALUE(blkcg->prio_policy, 0)); if (prio > bio->bi_ioprio) bio->bi_ioprio = prio; } void blk_ioprio_exit(struct gendisk *disk) { blkcg_deactivate_policy(disk, &ioprio_policy); } int blk_ioprio_init(struct gendisk *disk) { return blkcg_activate_policy(disk, &ioprio_policy); } static int __init ioprio_init(void) { return blkcg_policy_register(&ioprio_policy); } static void __exit ioprio_exit(void) { blkcg_policy_unregister(&ioprio_policy); } module_init(ioprio_init); module_exit(ioprio_exit); |
| 564 858 564 564 858 167 167 541 74 1872 1870 858 167 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 | // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2018, Intel Corporation. */ /* A common module to handle registrations and notifications for paravirtual * drivers to enable accelerated datapath and support VF live migration. * * The notifier and event handling code is based on netvsc driver. */ #include <linux/module.h> #include <linux/etherdevice.h> #include <uapi/linux/if_arp.h> #include <linux/rtnetlink.h> #include <linux/if_vlan.h> #include <net/failover.h> static LIST_HEAD(failover_list); static DEFINE_SPINLOCK(failover_lock); static struct net_device *failover_get_bymac(u8 *mac, struct failover_ops **ops) { struct net_device *failover_dev; struct failover *failover; spin_lock(&failover_lock); list_for_each_entry(failover, &failover_list, list) { failover_dev = rtnl_dereference(failover->failover_dev); if (ether_addr_equal(failover_dev->perm_addr, mac)) { *ops = rtnl_dereference(failover->ops); spin_unlock(&failover_lock); return failover_dev; } } spin_unlock(&failover_lock); return NULL; } /** * failover_slave_register - Register a slave netdev * * @slave_dev: slave netdev that is being registered * * Registers a slave device to a failover instance. Only ethernet devices * are supported. */ static int failover_slave_register(struct net_device *slave_dev) { struct netdev_lag_upper_info lag_upper_info; struct net_device *failover_dev; struct failover_ops *fops; int err; if (slave_dev->type != ARPHRD_ETHER) goto done; ASSERT_RTNL(); failover_dev = failover_get_bymac(slave_dev->perm_addr, &fops); if (!failover_dev) goto done; if (fops && fops->slave_pre_register && fops->slave_pre_register(slave_dev, failover_dev)) goto done; err = netdev_rx_handler_register(slave_dev, fops->slave_handle_frame, failover_dev); if (err) { netdev_err(slave_dev, "can not register failover rx handler (err = %d)\n", err); goto done; } lag_upper_info.tx_type = NETDEV_LAG_TX_TYPE_ACTIVEBACKUP; err = netdev_master_upper_dev_link(slave_dev, failover_dev, NULL, &lag_upper_info, NULL); if (err) { netdev_err(slave_dev, "can not set failover device %s (err = %d)\n", failover_dev->name, err); goto err_upper_link; } slave_dev->priv_flags |= (IFF_FAILOVER_SLAVE | IFF_NO_ADDRCONF); if (fops && fops->slave_register && !fops->slave_register(slave_dev, failover_dev)) return NOTIFY_OK; netdev_upper_dev_unlink(slave_dev, failover_dev); slave_dev->priv_flags &= ~(IFF_FAILOVER_SLAVE | IFF_NO_ADDRCONF); err_upper_link: netdev_rx_handler_unregister(slave_dev); done: return NOTIFY_DONE; } /** * failover_slave_unregister - Unregister a slave netdev * * @slave_dev: slave netdev that is being unregistered * * Unregisters a slave device from a failover instance. */ int failover_slave_unregister(struct net_device *slave_dev) { struct net_device *failover_dev; struct failover_ops *fops; if (!netif_is_failover_slave(slave_dev)) goto done; ASSERT_RTNL(); failover_dev = failover_get_bymac(slave_dev->perm_addr, &fops); if (!failover_dev) goto done; if (fops && fops->slave_pre_unregister && fops->slave_pre_unregister(slave_dev, failover_dev)) goto done; netdev_rx_handler_unregister(slave_dev); netdev_upper_dev_unlink(slave_dev, failover_dev); slave_dev->priv_flags &= ~(IFF_FAILOVER_SLAVE | IFF_NO_ADDRCONF); if (fops && fops->slave_unregister && !fops->slave_unregister(slave_dev, failover_dev)) return NOTIFY_OK; done: return NOTIFY_DONE; } EXPORT_SYMBOL_GPL(failover_slave_unregister); static int failover_slave_link_change(struct net_device *slave_dev) { struct net_device *failover_dev; struct failover_ops *fops; if (!netif_is_failover_slave(slave_dev)) goto done; ASSERT_RTNL(); failover_dev = failover_get_bymac(slave_dev->perm_addr, &fops); if (!failover_dev) goto done; if (!netif_running(failover_dev)) goto done; if (fops && fops->slave_link_change && !fops->slave_link_change(slave_dev, failover_dev)) return NOTIFY_OK; done: return NOTIFY_DONE; } static int failover_slave_name_change(struct net_device *slave_dev) { struct net_device *failover_dev; struct failover_ops *fops; if (!netif_is_failover_slave(slave_dev)) goto done; ASSERT_RTNL(); failover_dev = failover_get_bymac(slave_dev->perm_addr, &fops); if (!failover_dev) goto done; if (!netif_running(failover_dev)) goto done; if (fops && fops->slave_name_change && !fops->slave_name_change(slave_dev, failover_dev)) return NOTIFY_OK; done: return NOTIFY_DONE; } static int failover_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *event_dev = netdev_notifier_info_to_dev(ptr); /* Skip parent events */ if (netif_is_failover(event_dev)) return NOTIFY_DONE; switch (event) { case NETDEV_REGISTER: return failover_slave_register(event_dev); case NETDEV_UNREGISTER: return failover_slave_unregister(event_dev); case NETDEV_UP: case NETDEV_DOWN: case NETDEV_CHANGE: return failover_slave_link_change(event_dev); case NETDEV_CHANGENAME: return failover_slave_name_change(event_dev); default: return NOTIFY_DONE; } } static struct notifier_block failover_notifier = { .notifier_call = failover_event, }; static void failover_existing_slave_register(struct net_device *failover_dev) { struct net *net = dev_net(failover_dev); struct net_device *dev; rtnl_lock(); for_each_netdev(net, dev) { if (netif_is_failover(dev)) continue; if (ether_addr_equal(failover_dev->perm_addr, dev->perm_addr)) failover_slave_register(dev); } rtnl_unlock(); } /** * failover_register - Register a failover instance * * @dev: failover netdev * @ops: failover ops * * Allocate and register a failover instance for a failover netdev. ops * provides handlers for slave device register/unregister/link change/ * name change events. * * Return: pointer to failover instance */ struct failover *failover_register(struct net_device *dev, struct failover_ops *ops) { struct failover *failover; if (dev->type != ARPHRD_ETHER) return ERR_PTR(-EINVAL); failover = kzalloc(sizeof(*failover), GFP_KERNEL); if (!failover) return ERR_PTR(-ENOMEM); rcu_assign_pointer(failover->ops, ops); netdev_hold(dev, &failover->dev_tracker, GFP_KERNEL); dev->priv_flags |= IFF_FAILOVER; rcu_assign_pointer(failover->failover_dev, dev); spin_lock(&failover_lock); list_add_tail(&failover->list, &failover_list); spin_unlock(&failover_lock); netdev_info(dev, "failover master:%s registered\n", dev->name); failover_existing_slave_register(dev); return failover; } EXPORT_SYMBOL_GPL(failover_register); /** * failover_unregister - Unregister a failover instance * * @failover: pointer to failover instance * * Unregisters and frees a failover instance. */ void failover_unregister(struct failover *failover) { struct net_device *failover_dev; failover_dev = rcu_dereference(failover->failover_dev); netdev_info(failover_dev, "failover master:%s unregistered\n", failover_dev->name); failover_dev->priv_flags &= ~IFF_FAILOVER; netdev_put(failover_dev, &failover->dev_tracker); spin_lock(&failover_lock); list_del(&failover->list); spin_unlock(&failover_lock); kfree(failover); } EXPORT_SYMBOL_GPL(failover_unregister); static __init int failover_init(void) { register_netdevice_notifier(&failover_notifier); return 0; } module_init(failover_init); static __exit void failover_exit(void) { unregister_netdevice_notifier(&failover_notifier); } module_exit(failover_exit); MODULE_DESCRIPTION("Generic failover infrastructure/interface"); MODULE_LICENSE("GPL v2"); |
| 291 11 6 7 6 1 3 1 1 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 | // SPDX-License-Identifier: GPL-2.0 #include <net/ip.h> #include <net/udp.h> #include <net/udplite.h> #include <asm/checksum.h> #ifndef _HAVE_ARCH_IPV6_CSUM __sum16 csum_ipv6_magic(const struct in6_addr *saddr, const struct in6_addr *daddr, __u32 len, __u8 proto, __wsum csum) { int carry; __u32 ulen; __u32 uproto; __u32 sum = (__force u32)csum; sum += (__force u32)saddr->s6_addr32[0]; carry = (sum < (__force u32)saddr->s6_addr32[0]); sum += carry; sum += (__force u32)saddr->s6_addr32[1]; carry = (sum < (__force u32)saddr->s6_addr32[1]); sum += carry; sum += (__force u32)saddr->s6_addr32[2]; carry = (sum < (__force u32)saddr->s6_addr32[2]); sum += carry; sum += (__force u32)saddr->s6_addr32[3]; carry = (sum < (__force u32)saddr->s6_addr32[3]); sum += carry; sum += (__force u32)daddr->s6_addr32[0]; carry = (sum < (__force u32)daddr->s6_addr32[0]); sum += carry; sum += (__force u32)daddr->s6_addr32[1]; carry = (sum < (__force u32)daddr->s6_addr32[1]); sum += carry; sum += (__force u32)daddr->s6_addr32[2]; carry = (sum < (__force u32)daddr->s6_addr32[2]); sum += carry; sum += (__force u32)daddr->s6_addr32[3]; carry = (sum < (__force u32)daddr->s6_addr32[3]); sum += carry; ulen = (__force u32)htonl((__u32) len); sum += ulen; carry = (sum < ulen); sum += carry; uproto = (__force u32)htonl(proto); sum += uproto; carry = (sum < uproto); sum += carry; return csum_fold((__force __wsum)sum); } EXPORT_SYMBOL(csum_ipv6_magic); #endif int udp6_csum_init(struct sk_buff *skb, struct udphdr *uh, int proto) { int err; UDP_SKB_CB(skb)->partial_cov = 0; UDP_SKB_CB(skb)->cscov = skb->len; if (proto == IPPROTO_UDPLITE) { err = udplite_checksum_init(skb, uh); if (err) return err; if (UDP_SKB_CB(skb)->partial_cov) { skb->csum = ip6_compute_pseudo(skb, proto); return 0; } } /* To support RFC 6936 (allow zero checksum in UDP/IPV6 for tunnels) * we accept a checksum of zero here. When we find the socket * for the UDP packet we'll check if that socket allows zero checksum * for IPv6 (set by socket option). * * Note, we are only interested in != 0 or == 0, thus the * force to int. */ err = (__force int)skb_checksum_init_zero_check(skb, proto, uh->check, ip6_compute_pseudo); if (err) return err; if (skb->ip_summed == CHECKSUM_COMPLETE && !skb->csum_valid) { /* If SW calculated the value, we know it's bad */ if (skb->csum_complete_sw) return 1; /* HW says the value is bad. Let's validate that. * skb->csum is no longer the full packet checksum, * so don't treat is as such. */ skb_checksum_complete_unset(skb); } return 0; } EXPORT_SYMBOL(udp6_csum_init); /* Function to set UDP checksum for an IPv6 UDP packet. This is intended * for the simple case like when setting the checksum for a UDP tunnel. */ void udp6_set_csum(bool nocheck, struct sk_buff *skb, const struct in6_addr *saddr, const struct in6_addr *daddr, int len) { struct udphdr *uh = udp_hdr(skb); if (nocheck) uh->check = 0; else if (skb_is_gso(skb)) uh->check = ~udp_v6_check(len, saddr, daddr, 0); else if (skb->ip_summed == CHECKSUM_PARTIAL) { uh->check = 0; uh->check = udp_v6_check(len, saddr, daddr, lco_csum(skb)); if (uh->check == 0) uh->check = CSUM_MANGLED_0; } else { skb->ip_summed = CHECKSUM_PARTIAL; skb->csum_start = skb_transport_header(skb) - skb->head; skb->csum_offset = offsetof(struct udphdr, check); uh->check = ~udp_v6_check(len, saddr, daddr, 0); } } EXPORT_SYMBOL(udp6_set_csum); |
| 43 3301 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 | /* SPDX-License-Identifier: GPL-2.0 */ /* * linux/cgroup-defs.h - basic definitions for cgroup * * This file provides basic type and interface. Include this file directly * only if necessary to avoid cyclic dependencies. */ #ifndef _LINUX_CGROUP_DEFS_H #define _LINUX_CGROUP_DEFS_H #include <linux/limits.h> #include <linux/list.h> #include <linux/idr.h> #include <linux/wait.h> #include <linux/mutex.h> #include <linux/rcupdate.h> #include <linux/refcount.h> #include <linux/percpu-refcount.h> #include <linux/percpu-rwsem.h> #include <linux/u64_stats_sync.h> #include <linux/workqueue.h> #include <linux/bpf-cgroup-defs.h> #include <linux/psi_types.h> #ifdef CONFIG_CGROUPS struct cgroup; struct cgroup_root; struct cgroup_subsys; struct cgroup_taskset; struct kernfs_node; struct kernfs_ops; struct kernfs_open_file; struct seq_file; struct poll_table_struct; #define MAX_CGROUP_TYPE_NAMELEN 32 #define MAX_CGROUP_ROOT_NAMELEN 64 #define MAX_CFTYPE_NAME 64 /* define the enumeration of all cgroup subsystems */ #define SUBSYS(_x) _x ## _cgrp_id, enum cgroup_subsys_id { #include <linux/cgroup_subsys.h> CGROUP_SUBSYS_COUNT, }; #undef SUBSYS /* bits in struct cgroup_subsys_state flags field */ enum { CSS_NO_REF = (1 << 0), /* no reference counting for this css */ CSS_ONLINE = (1 << 1), /* between ->css_online() and ->css_offline() */ CSS_RELEASED = (1 << 2), /* refcnt reached zero, released */ CSS_VISIBLE = (1 << 3), /* css is visible to userland */ CSS_DYING = (1 << 4), /* css is dying */ }; /* bits in struct cgroup flags field */ enum { /* Control Group requires release notifications to userspace */ CGRP_NOTIFY_ON_RELEASE, /* * Clone the parent's configuration when creating a new child * cpuset cgroup. For historical reasons, this option can be * specified at mount time and thus is implemented here. */ CGRP_CPUSET_CLONE_CHILDREN, /* Control group has to be frozen. */ CGRP_FREEZE, /* Cgroup is frozen. */ CGRP_FROZEN, /* Control group has to be killed. */ CGRP_KILL, }; /* cgroup_root->flags */ enum { CGRP_ROOT_NOPREFIX = (1 << 1), /* mounted subsystems have no named prefix */ CGRP_ROOT_XATTR = (1 << 2), /* supports extended attributes */ /* * Consider namespaces as delegation boundaries. If this flag is * set, controller specific interface files in a namespace root * aren't writeable from inside the namespace. */ CGRP_ROOT_NS_DELEGATE = (1 << 3), /* * Reduce latencies on dynamic cgroup modifications such as task * migrations and controller on/offs by disabling percpu operation on * cgroup_threadgroup_rwsem. This makes hot path operations such as * forks and exits into the slow path and more expensive. * * The static usage pattern of creating a cgroup, enabling controllers, * and then seeding it with CLONE_INTO_CGROUP doesn't require write * locking cgroup_threadgroup_rwsem and thus doesn't benefit from * favordynmod. */ CGRP_ROOT_FAVOR_DYNMODS = (1 << 4), /* * Enable cpuset controller in v1 cgroup to use v2 behavior. */ CGRP_ROOT_CPUSET_V2_MODE = (1 << 16), /* * Enable legacy local memory.events. */ CGRP_ROOT_MEMORY_LOCAL_EVENTS = (1 << 17), /* * Enable recursive subtree protection */ CGRP_ROOT_MEMORY_RECURSIVE_PROT = (1 << 18), /* * Enable hugetlb accounting for the memory controller. */ CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING = (1 << 19), }; /* cftype->flags */ enum { CFTYPE_ONLY_ON_ROOT = (1 << 0), /* only create on root cgrp */ CFTYPE_NOT_ON_ROOT = (1 << 1), /* don't create on root cgrp */ CFTYPE_NS_DELEGATABLE = (1 << 2), /* writeable beyond delegation boundaries */ CFTYPE_NO_PREFIX = (1 << 3), /* (DON'T USE FOR NEW FILES) no subsys prefix */ CFTYPE_WORLD_WRITABLE = (1 << 4), /* (DON'T USE FOR NEW FILES) S_IWUGO */ CFTYPE_DEBUG = (1 << 5), /* create when cgroup_debug */ /* internal flags, do not use outside cgroup core proper */ __CFTYPE_ONLY_ON_DFL = (1 << 16), /* only on default hierarchy */ __CFTYPE_NOT_ON_DFL = (1 << 17), /* not on default hierarchy */ __CFTYPE_ADDED = (1 << 18), }; /* * cgroup_file is the handle for a file instance created in a cgroup which * is used, for example, to generate file changed notifications. This can * be obtained by setting cftype->file_offset. */ struct cgroup_file { /* do not access any fields from outside cgroup core */ struct kernfs_node *kn; unsigned long notified_at; struct timer_list notify_timer; }; /* * Per-subsystem/per-cgroup state maintained by the system. This is the * fundamental structural building block that controllers deal with. * * Fields marked with "PI:" are public and immutable and may be accessed * directly without synchronization. */ struct cgroup_subsys_state { /* PI: the cgroup that this css is attached to */ struct cgroup *cgroup; /* PI: the cgroup subsystem that this css is attached to */ struct cgroup_subsys *ss; /* reference count - access via css_[try]get() and css_put() */ struct percpu_ref refcnt; /* siblings list anchored at the parent's ->children */ struct list_head sibling; struct list_head children; /* flush target list anchored at cgrp->rstat_css_list */ struct list_head rstat_css_node; /* * PI: Subsys-unique ID. 0 is unused and root is always 1. The * matching css can be looked up using css_from_id(). */ int id; unsigned int flags; /* * Monotonically increasing unique serial number which defines a * uniform order among all csses. It's guaranteed that all * ->children lists are in the ascending order of ->serial_nr and * used to allow interrupting and resuming iterations. */ u64 serial_nr; /* * Incremented by online self and children. Used to guarantee that * parents are not offlined before their children. */ atomic_t online_cnt; /* percpu_ref killing and RCU release */ struct work_struct destroy_work; struct rcu_work destroy_rwork; /* * PI: the parent css. Placed here for cache proximity to following * fields of the containing structure. */ struct cgroup_subsys_state *parent; }; /* * A css_set is a structure holding pointers to a set of * cgroup_subsys_state objects. This saves space in the task struct * object and speeds up fork()/exit(), since a single inc/dec and a * list_add()/del() can bump the reference count on the entire cgroup * set for a task. */ struct css_set { /* * Set of subsystem states, one for each subsystem. This array is * immutable after creation apart from the init_css_set during * subsystem registration (at boot time). */ struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT]; /* reference count */ refcount_t refcount; /* * For a domain cgroup, the following points to self. If threaded, * to the matching cset of the nearest domain ancestor. The * dom_cset provides access to the domain cgroup and its csses to * which domain level resource consumptions should be charged. */ struct css_set *dom_cset; /* the default cgroup associated with this css_set */ struct cgroup *dfl_cgrp; /* internal task count, protected by css_set_lock */ int nr_tasks; /* * Lists running through all tasks using this cgroup group. * mg_tasks lists tasks which belong to this cset but are in the * process of being migrated out or in. Protected by * css_set_lock, but, during migration, once tasks are moved to * mg_tasks, it can be read safely while holding cgroup_mutex. */ struct list_head tasks; struct list_head mg_tasks; struct list_head dying_tasks; /* all css_task_iters currently walking this cset */ struct list_head task_iters; /* * On the default hierarchy, ->subsys[ssid] may point to a css * attached to an ancestor instead of the cgroup this css_set is * associated with. The following node is anchored at * ->subsys[ssid]->cgroup->e_csets[ssid] and provides a way to * iterate through all css's attached to a given cgroup. */ struct list_head e_cset_node[CGROUP_SUBSYS_COUNT]; /* all threaded csets whose ->dom_cset points to this cset */ struct list_head threaded_csets; struct list_head threaded_csets_node; /* * List running through all cgroup groups in the same hash * slot. Protected by css_set_lock */ struct hlist_node hlist; /* * List of cgrp_cset_links pointing at cgroups referenced from this * css_set. Protected by css_set_lock. */ struct list_head cgrp_links; /* * List of csets participating in the on-going migration either as * source or destination. Protected by cgroup_mutex. */ struct list_head mg_src_preload_node; struct list_head mg_dst_preload_node; struct list_head mg_node; /* * If this cset is acting as the source of migration the following * two fields are set. mg_src_cgrp and mg_dst_cgrp are * respectively the source and destination cgroups of the on-going * migration. mg_dst_cset is the destination cset the target tasks * on this cset should be migrated to. Protected by cgroup_mutex. */ struct cgroup *mg_src_cgrp; struct cgroup *mg_dst_cgrp; struct css_set *mg_dst_cset; /* dead and being drained, ignore for migration */ bool dead; /* For RCU-protected deletion */ struct rcu_head rcu_head; }; struct cgroup_base_stat { struct task_cputime cputime; #ifdef CONFIG_SCHED_CORE u64 forceidle_sum; #endif }; /* * rstat - cgroup scalable recursive statistics. Accounting is done * per-cpu in cgroup_rstat_cpu which is then lazily propagated up the * hierarchy on reads. * * When a stat gets updated, the cgroup_rstat_cpu and its ancestors are * linked into the updated tree. On the following read, propagation only * considers and consumes the updated tree. This makes reading O(the * number of descendants which have been active since last read) instead of * O(the total number of descendants). * * This is important because there can be a lot of (draining) cgroups which * aren't active and stat may be read frequently. The combination can * become very expensive. By propagating selectively, increasing reading * frequency decreases the cost of each read. * * This struct hosts both the fields which implement the above - * updated_children and updated_next - and the fields which track basic * resource statistics on top of it - bsync, bstat and last_bstat. */ struct cgroup_rstat_cpu { /* * ->bsync protects ->bstat. These are the only fields which get * updated in the hot path. */ struct u64_stats_sync bsync; struct cgroup_base_stat bstat; /* * Snapshots at the last reading. These are used to calculate the * deltas to propagate to the global counters. */ struct cgroup_base_stat last_bstat; /* * This field is used to record the cumulative per-cpu time of * the cgroup and its descendants. Currently it can be read via * eBPF/drgn etc, and we are still trying to determine how to * expose it in the cgroupfs interface. */ struct cgroup_base_stat subtree_bstat; /* * Snapshots at the last reading. These are used to calculate the * deltas to propagate to the per-cpu subtree_bstat. */ struct cgroup_base_stat last_subtree_bstat; /* * Child cgroups with stat updates on this cpu since the last read * are linked on the parent's ->updated_children through * ->updated_next. * * In addition to being more compact, singly-linked list pointing * to the cgroup makes it unnecessary for each per-cpu struct to * point back to the associated cgroup. * * Protected by per-cpu cgroup_rstat_cpu_lock. */ struct cgroup *updated_children; /* terminated by self cgroup */ struct cgroup *updated_next; /* NULL iff not on the list */ }; struct cgroup_freezer_state { /* Should the cgroup and its descendants be frozen. */ bool freeze; /* Should the cgroup actually be frozen? */ int e_freeze; /* Fields below are protected by css_set_lock */ /* Number of frozen descendant cgroups */ int nr_frozen_descendants; /* * Number of tasks, which are counted as frozen: * frozen, SIGSTOPped, and PTRACEd. */ int nr_frozen_tasks; }; struct cgroup { /* self css with NULL ->ss, points back to this cgroup */ struct cgroup_subsys_state self; unsigned long flags; /* "unsigned long" so bitops work */ /* * The depth this cgroup is at. The root is at depth zero and each * step down the hierarchy increments the level. This along with * ancestors[] can determine whether a given cgroup is a * descendant of another without traversing the hierarchy. */ int level; /* Maximum allowed descent tree depth */ int max_depth; /* * Keep track of total numbers of visible and dying descent cgroups. * Dying cgroups are cgroups which were deleted by a user, * but are still existing because someone else is holding a reference. * max_descendants is a maximum allowed number of descent cgroups. * * nr_descendants and nr_dying_descendants are protected * by cgroup_mutex and css_set_lock. It's fine to read them holding * any of cgroup_mutex and css_set_lock; for writing both locks * should be held. */ int nr_descendants; int nr_dying_descendants; int max_descendants; /* * Each non-empty css_set associated with this cgroup contributes * one to nr_populated_csets. The counter is zero iff this cgroup * doesn't have any tasks. * * All children which have non-zero nr_populated_csets and/or * nr_populated_children of their own contribute one to either * nr_populated_domain_children or nr_populated_threaded_children * depending on their type. Each counter is zero iff all cgroups * of the type in the subtree proper don't have any tasks. */ int nr_populated_csets; int nr_populated_domain_children; int nr_populated_threaded_children; int nr_threaded_children; /* # of live threaded child cgroups */ struct kernfs_node *kn; /* cgroup kernfs entry */ struct cgroup_file procs_file; /* handle for "cgroup.procs" */ struct cgroup_file events_file; /* handle for "cgroup.events" */ /* handles for "{cpu,memory,io,irq}.pressure" */ struct cgroup_file psi_files[NR_PSI_RESOURCES]; /* * The bitmask of subsystems enabled on the child cgroups. * ->subtree_control is the one configured through * "cgroup.subtree_control" while ->subtree_ss_mask is the effective * one which may have more subsystems enabled. Controller knobs * are made available iff it's enabled in ->subtree_control. */ u16 subtree_control; u16 subtree_ss_mask; u16 old_subtree_control; u16 old_subtree_ss_mask; /* Private pointers for each registered subsystem */ struct cgroup_subsys_state __rcu *subsys[CGROUP_SUBSYS_COUNT]; struct cgroup_root *root; /* * List of cgrp_cset_links pointing at css_sets with tasks in this * cgroup. Protected by css_set_lock. */ struct list_head cset_links; /* * On the default hierarchy, a css_set for a cgroup with some * susbsys disabled will point to css's which are associated with * the closest ancestor which has the subsys enabled. The * following lists all css_sets which point to this cgroup's css * for the given subsystem. */ struct list_head e_csets[CGROUP_SUBSYS_COUNT]; /* * If !threaded, self. If threaded, it points to the nearest * domain ancestor. Inside a threaded subtree, cgroups are exempt * from process granularity and no-internal-task constraint. * Domain level resource consumptions which aren't tied to a * specific task are charged to the dom_cgrp. */ struct cgroup *dom_cgrp; struct cgroup *old_dom_cgrp; /* used while enabling threaded */ /* per-cpu recursive resource statistics */ struct cgroup_rstat_cpu __percpu *rstat_cpu; struct list_head rstat_css_list; /* * Add padding to separate the read mostly rstat_cpu and * rstat_css_list into a different cacheline from the following * rstat_flush_next and *bstat fields which can have frequent updates. */ CACHELINE_PADDING(_pad_); /* * A singly-linked list of cgroup structures to be rstat flushed. * This is a scratch field to be used exclusively by * cgroup_rstat_flush_locked() and protected by cgroup_rstat_lock. */ struct cgroup *rstat_flush_next; /* cgroup basic resource statistics */ struct cgroup_base_stat last_bstat; struct cgroup_base_stat bstat; struct prev_cputime prev_cputime; /* for printing out cputime */ /* * list of pidlists, up to two for each namespace (one for procs, one * for tasks); created on demand. */ struct list_head pidlists; struct mutex pidlist_mutex; /* used to wait for offlining of csses */ wait_queue_head_t offline_waitq; /* used to schedule release agent */ struct work_struct release_agent_work; /* used to track pressure stalls */ struct psi_group *psi; /* used to store eBPF programs */ struct cgroup_bpf bpf; /* If there is block congestion on this cgroup. */ atomic_t congestion_count; /* Used to store internal freezer state */ struct cgroup_freezer_state freezer; #ifdef CONFIG_BPF_SYSCALL struct bpf_local_storage __rcu *bpf_cgrp_storage; #endif /* All ancestors including self */ struct cgroup *ancestors[]; }; /* * A cgroup_root represents the root of a cgroup hierarchy, and may be * associated with a kernfs_root to form an active hierarchy. This is * internal to cgroup core. Don't access directly from controllers. */ struct cgroup_root { struct kernfs_root *kf_root; /* The bitmask of subsystems attached to this hierarchy */ unsigned int subsys_mask; /* Unique id for this hierarchy. */ int hierarchy_id; /* A list running through the active hierarchies */ struct list_head root_list; struct rcu_head rcu; /* Must be near the top */ /* * The root cgroup. The containing cgroup_root will be destroyed on its * release. cgrp->ancestors[0] will be used overflowing into the * following field. cgrp_ancestor_storage must immediately follow. */ struct cgroup cgrp; /* must follow cgrp for cgrp->ancestors[0], see above */ struct cgroup *cgrp_ancestor_storage; /* Number of cgroups in the hierarchy, used only for /proc/cgroups */ atomic_t nr_cgrps; /* Hierarchy-specific flags */ unsigned int flags; /* The path to use for release notifications. */ char release_agent_path[PATH_MAX]; /* The name for this hierarchy - may be empty */ char name[MAX_CGROUP_ROOT_NAMELEN]; }; /* * struct cftype: handler definitions for cgroup control files * * When reading/writing to a file: * - the cgroup to use is file->f_path.dentry->d_parent->d_fsdata * - the 'cftype' of the file is file->f_path.dentry->d_fsdata */ struct cftype { /* * By convention, the name should begin with the name of the * subsystem, followed by a period. Zero length string indicates * end of cftype array. */ char name[MAX_CFTYPE_NAME]; unsigned long private; /* * The maximum length of string, excluding trailing nul, that can * be passed to write. If < PAGE_SIZE-1, PAGE_SIZE-1 is assumed. */ size_t max_write_len; /* CFTYPE_* flags */ unsigned int flags; /* * If non-zero, should contain the offset from the start of css to * a struct cgroup_file field. cgroup will record the handle of * the created file into it. The recorded handle can be used as * long as the containing css remains accessible. */ unsigned int file_offset; /* * Fields used for internal bookkeeping. Initialized automatically * during registration. */ struct cgroup_subsys *ss; /* NULL for cgroup core files */ struct list_head node; /* anchored at ss->cfts */ struct kernfs_ops *kf_ops; int (*open)(struct kernfs_open_file *of); void (*release)(struct kernfs_open_file *of); /* * read_u64() is a shortcut for the common case of returning a * single integer. Use it in place of read() */ u64 (*read_u64)(struct cgroup_subsys_state *css, struct cftype *cft); /* * read_s64() is a signed version of read_u64() */ s64 (*read_s64)(struct cgroup_subsys_state *css, struct cftype *cft); /* generic seq_file read interface */ int (*seq_show)(struct seq_file *sf, void *v); /* optional ops, implement all or none */ void *(*seq_start)(struct seq_file *sf, loff_t *ppos); void *(*seq_next)(struct seq_file *sf, void *v, loff_t *ppos); void (*seq_stop)(struct seq_file *sf, void *v); /* * write_u64() is a shortcut for the common case of accepting * a single integer (as parsed by simple_strtoull) from * userspace. Use in place of write(); return 0 or error. */ int (*write_u64)(struct cgroup_subsys_state *css, struct cftype *cft, u64 val); /* * write_s64() is a signed version of write_u64() */ int (*write_s64)(struct cgroup_subsys_state *css, struct cftype *cft, s64 val); /* * write() is the generic write callback which maps directly to * kernfs write operation and overrides all other operations. * Maximum write size is determined by ->max_write_len. Use * of_css/cft() to access the associated css and cft. */ ssize_t (*write)(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off); __poll_t (*poll)(struct kernfs_open_file *of, struct poll_table_struct *pt); #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lock_class_key lockdep_key; #endif }; /* * Control Group subsystem type. * See Documentation/admin-guide/cgroup-v1/cgroups.rst for details */ struct cgroup_subsys { struct cgroup_subsys_state *(*css_alloc)(struct cgroup_subsys_state *parent_css); int (*css_online)(struct cgroup_subsys_state *css); void (*css_offline)(struct cgroup_subsys_state *css); void (*css_released)(struct cgroup_subsys_state *css); void (*css_free)(struct cgroup_subsys_state *css); void (*css_reset)(struct cgroup_subsys_state *css); void (*css_rstat_flush)(struct cgroup_subsys_state *css, int cpu); int (*css_extra_stat_show)(struct seq_file *seq, struct cgroup_subsys_state *css); int (*css_local_stat_show)(struct seq_file *seq, struct cgroup_subsys_state *css); int (*can_attach)(struct cgroup_taskset *tset); void (*cancel_attach)(struct cgroup_taskset *tset); void (*attach)(struct cgroup_taskset *tset); void (*post_attach)(void); int (*can_fork)(struct task_struct *task, struct css_set *cset); void (*cancel_fork)(struct task_struct *task, struct css_set *cset); void (*fork)(struct task_struct *task); void (*exit)(struct task_struct *task); void (*release)(struct task_struct *task); void (*bind)(struct cgroup_subsys_state *root_css); bool early_init:1; /* * If %true, the controller, on the default hierarchy, doesn't show * up in "cgroup.controllers" or "cgroup.subtree_control", is * implicitly enabled on all cgroups on the default hierarchy, and * bypasses the "no internal process" constraint. This is for * utility type controllers which is transparent to userland. * * An implicit controller can be stolen from the default hierarchy * anytime and thus must be okay with offline csses from previous * hierarchies coexisting with csses for the current one. */ bool implicit_on_dfl:1; /* * If %true, the controller, supports threaded mode on the default * hierarchy. In a threaded subtree, both process granularity and * no-internal-process constraint are ignored and a threaded * controllers should be able to handle that. * * Note that as an implicit controller is automatically enabled on * all cgroups on the default hierarchy, it should also be * threaded. implicit && !threaded is not supported. */ bool threaded:1; /* the following two fields are initialized automatically during boot */ int id; const char *name; /* optional, initialized automatically during boot if not set */ const char *legacy_name; /* link to parent, protected by cgroup_lock() */ struct cgroup_root *root; /* idr for css->id */ struct idr css_idr; /* * List of cftypes. Each entry is the first entry of an array * terminated by zero length name. */ struct list_head cfts; /* * Base cftypes which are automatically registered. The two can * point to the same array. */ struct cftype *dfl_cftypes; /* for the default hierarchy */ struct cftype *legacy_cftypes; /* for the legacy hierarchies */ /* * A subsystem may depend on other subsystems. When such subsystem * is enabled on a cgroup, the depended-upon subsystems are enabled * together if available. Subsystems enabled due to dependency are * not visible to userland until explicitly enabled. The following * specifies the mask of subsystems that this one depends on. */ unsigned int depends_on; }; extern struct percpu_rw_semaphore cgroup_threadgroup_rwsem; /** * cgroup_threadgroup_change_begin - threadgroup exclusion for cgroups * @tsk: target task * * Allows cgroup operations to synchronize against threadgroup changes * using a percpu_rw_semaphore. */ static inline void cgroup_threadgroup_change_begin(struct task_struct *tsk) { percpu_down_read(&cgroup_threadgroup_rwsem); } /** * cgroup_threadgroup_change_end - threadgroup exclusion for cgroups * @tsk: target task * * Counterpart of cgroup_threadcgroup_change_begin(). */ static inline void cgroup_threadgroup_change_end(struct task_struct *tsk) { percpu_up_read(&cgroup_threadgroup_rwsem); } #else /* CONFIG_CGROUPS */ #define CGROUP_SUBSYS_COUNT 0 static inline void cgroup_threadgroup_change_begin(struct task_struct *tsk) { might_sleep(); } static inline void cgroup_threadgroup_change_end(struct task_struct *tsk) {} #endif /* CONFIG_CGROUPS */ #ifdef CONFIG_SOCK_CGROUP_DATA /* * sock_cgroup_data is embedded at sock->sk_cgrp_data and contains * per-socket cgroup information except for memcg association. * * On legacy hierarchies, net_prio and net_cls controllers directly * set attributes on each sock which can then be tested by the network * layer. On the default hierarchy, each sock is associated with the * cgroup it was created in and the networking layer can match the * cgroup directly. */ struct sock_cgroup_data { struct cgroup *cgroup; /* v2 */ #ifdef CONFIG_CGROUP_NET_CLASSID u32 classid; /* v1 */ #endif #ifdef CONFIG_CGROUP_NET_PRIO u16 prioidx; /* v1 */ #endif }; static inline u16 sock_cgroup_prioidx(const struct sock_cgroup_data *skcd) { #ifdef CONFIG_CGROUP_NET_PRIO return READ_ONCE(skcd->prioidx); #else return 1; #endif } static inline u32 sock_cgroup_classid(const struct sock_cgroup_data *skcd) { #ifdef CONFIG_CGROUP_NET_CLASSID return READ_ONCE(skcd->classid); #else return 0; #endif } static inline void sock_cgroup_set_prioidx(struct sock_cgroup_data *skcd, u16 prioidx) { #ifdef CONFIG_CGROUP_NET_PRIO WRITE_ONCE(skcd->prioidx, prioidx); #endif } static inline void sock_cgroup_set_classid(struct sock_cgroup_data *skcd, u32 classid) { #ifdef CONFIG_CGROUP_NET_CLASSID WRITE_ONCE(skcd->classid, classid); #endif } #else /* CONFIG_SOCK_CGROUP_DATA */ struct sock_cgroup_data { }; #endif /* CONFIG_SOCK_CGROUP_DATA */ #endif /* _LINUX_CGROUP_DEFS_H */ |
| 17 13 11 10 17 9 9 1 9 8 6 5 5 3 3 2 3 37 11 11 10 10 42 42 8 42 42 42 1 42 8 8 7 7 39 39 1 39 39 42 40 40 40 40 1 1 1 1 1 42 6 6 6 6 2 2 1 1 3 2 1 1 3 3 4 3 1 3 4 2 2 1 4 10 10 9 8 1 6 1 10 47 1 20 45 52 2 50 52 52 5 47 1 46 46 45 10 35 47 52 39 16 41 21 16 41 1 1 8 8 3 31 3 31 31 35 35 34 5 31 31 34 7 2 2 7 6 3 7 29 27 24 23 29 22 21 21 20 20 20 1 10 10 10 2 1 56 17 39 12 4 12 2 3 3 2 4 3 10 1 5 12 6 4 3 1 6 6 9 1 8 10 10 10 10 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 | // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2017 - 2018 Covalent IO, Inc. http://covalent.io */ #include <linux/bpf.h> #include <linux/btf_ids.h> #include <linux/filter.h> #include <linux/errno.h> #include <linux/file.h> #include <linux/net.h> #include <linux/workqueue.h> #include <linux/skmsg.h> #include <linux/list.h> #include <linux/jhash.h> #include <linux/sock_diag.h> #include <net/udp.h> struct bpf_stab { struct bpf_map map; struct sock **sks; struct sk_psock_progs progs; spinlock_t lock; }; #define SOCK_CREATE_FLAG_MASK \ (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) static int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, struct bpf_prog *old, u32 which); static struct sk_psock_progs *sock_map_progs(struct bpf_map *map); static struct bpf_map *sock_map_alloc(union bpf_attr *attr) { struct bpf_stab *stab; if (attr->max_entries == 0 || attr->key_size != 4 || (attr->value_size != sizeof(u32) && attr->value_size != sizeof(u64)) || attr->map_flags & ~SOCK_CREATE_FLAG_MASK) return ERR_PTR(-EINVAL); stab = bpf_map_area_alloc(sizeof(*stab), NUMA_NO_NODE); if (!stab) return ERR_PTR(-ENOMEM); bpf_map_init_from_attr(&stab->map, attr); spin_lock_init(&stab->lock); stab->sks = bpf_map_area_alloc((u64) stab->map.max_entries * sizeof(struct sock *), stab->map.numa_node); if (!stab->sks) { bpf_map_area_free(stab); return ERR_PTR(-ENOMEM); } return &stab->map; } int sock_map_get_from_fd(const union bpf_attr *attr, struct bpf_prog *prog) { u32 ufd = attr->target_fd; struct bpf_map *map; struct fd f; int ret; if (attr->attach_flags || attr->replace_bpf_fd) return -EINVAL; f = fdget(ufd); map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); ret = sock_map_prog_update(map, prog, NULL, attr->attach_type); fdput(f); return ret; } int sock_map_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype) { u32 ufd = attr->target_fd; struct bpf_prog *prog; struct bpf_map *map; struct fd f; int ret; if (attr->attach_flags || attr->replace_bpf_fd) return -EINVAL; f = fdget(ufd); map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); prog = bpf_prog_get(attr->attach_bpf_fd); if (IS_ERR(prog)) { ret = PTR_ERR(prog); goto put_map; } if (prog->type != ptype) { ret = -EINVAL; goto put_prog; } ret = sock_map_prog_update(map, NULL, prog, attr->attach_type); put_prog: bpf_prog_put(prog); put_map: fdput(f); return ret; } static void sock_map_sk_acquire(struct sock *sk) __acquires(&sk->sk_lock.slock) { lock_sock(sk); rcu_read_lock(); } static void sock_map_sk_release(struct sock *sk) __releases(&sk->sk_lock.slock) { rcu_read_unlock(); release_sock(sk); } static void sock_map_add_link(struct sk_psock *psock, struct sk_psock_link *link, struct bpf_map *map, void *link_raw) { link->link_raw = link_raw; link->map = map; spin_lock_bh(&psock->link_lock); list_add_tail(&link->list, &psock->link); spin_unlock_bh(&psock->link_lock); } static void sock_map_del_link(struct sock *sk, struct sk_psock *psock, void *link_raw) { bool strp_stop = false, verdict_stop = false; struct sk_psock_link *link, *tmp; spin_lock_bh(&psock->link_lock); list_for_each_entry_safe(link, tmp, &psock->link, list) { if (link->link_raw == link_raw) { struct bpf_map *map = link->map; struct sk_psock_progs *progs = sock_map_progs(map); if (psock->saved_data_ready && progs->stream_parser) strp_stop = true; if (psock->saved_data_ready && progs->stream_verdict) verdict_stop = true; if (psock->saved_data_ready && progs->skb_verdict) verdict_stop = true; list_del(&link->list); sk_psock_free_link(link); } } spin_unlock_bh(&psock->link_lock); if (strp_stop || verdict_stop) { write_lock_bh(&sk->sk_callback_lock); if (strp_stop) sk_psock_stop_strp(sk, psock); if (verdict_stop) sk_psock_stop_verdict(sk, psock); if (psock->psock_update_sk_prot) psock->psock_update_sk_prot(sk, psock, false); write_unlock_bh(&sk->sk_callback_lock); } } static void sock_map_unref(struct sock *sk, void *link_raw) { struct sk_psock *psock = sk_psock(sk); if (likely(psock)) { sock_map_del_link(sk, psock, link_raw); sk_psock_put(sk, psock); } } static int sock_map_init_proto(struct sock *sk, struct sk_psock *psock) { if (!sk->sk_prot->psock_update_sk_prot) return -EINVAL; psock->psock_update_sk_prot = sk->sk_prot->psock_update_sk_prot; return sk->sk_prot->psock_update_sk_prot(sk, psock, false); } static struct sk_psock *sock_map_psock_get_checked(struct sock *sk) { struct sk_psock *psock; rcu_read_lock(); psock = sk_psock(sk); if (psock) { if (sk->sk_prot->close != sock_map_close) { psock = ERR_PTR(-EBUSY); goto out; } if (!refcount_inc_not_zero(&psock->refcnt)) psock = ERR_PTR(-EBUSY); } out: rcu_read_unlock(); return psock; } static int sock_map_link(struct bpf_map *map, struct sock *sk) { struct sk_psock_progs *progs = sock_map_progs(map); struct bpf_prog *stream_verdict = NULL; struct bpf_prog *stream_parser = NULL; struct bpf_prog *skb_verdict = NULL; struct bpf_prog *msg_parser = NULL; struct sk_psock *psock; int ret; stream_verdict = READ_ONCE(progs->stream_verdict); if (stream_verdict) { stream_verdict = bpf_prog_inc_not_zero(stream_verdict); if (IS_ERR(stream_verdict)) return PTR_ERR(stream_verdict); } stream_parser = READ_ONCE(progs->stream_parser); if (stream_parser) { stream_parser = bpf_prog_inc_not_zero(stream_parser); if (IS_ERR(stream_parser)) { ret = PTR_ERR(stream_parser); goto out_put_stream_verdict; } } msg_parser = READ_ONCE(progs->msg_parser); if (msg_parser) { msg_parser = bpf_prog_inc_not_zero(msg_parser); if (IS_ERR(msg_parser)) { ret = PTR_ERR(msg_parser); goto out_put_stream_parser; } } skb_verdict = READ_ONCE(progs->skb_verdict); if (skb_verdict) { skb_verdict = bpf_prog_inc_not_zero(skb_verdict); if (IS_ERR(skb_verdict)) { ret = PTR_ERR(skb_verdict); goto out_put_msg_parser; } } psock = sock_map_psock_get_checked(sk); if (IS_ERR(psock)) { ret = PTR_ERR(psock); goto out_progs; } if (psock) { if ((msg_parser && READ_ONCE(psock->progs.msg_parser)) || (stream_parser && READ_ONCE(psock->progs.stream_parser)) || (skb_verdict && READ_ONCE(psock->progs.skb_verdict)) || (skb_verdict && READ_ONCE(psock->progs.stream_verdict)) || (stream_verdict && READ_ONCE(psock->progs.skb_verdict)) || (stream_verdict && READ_ONCE(psock->progs.stream_verdict))) { sk_psock_put(sk, psock); ret = -EBUSY; goto out_progs; } } else { psock = sk_psock_init(sk, map->numa_node); if (IS_ERR(psock)) { ret = PTR_ERR(psock); goto out_progs; } } if (msg_parser) psock_set_prog(&psock->progs.msg_parser, msg_parser); if (stream_parser) psock_set_prog(&psock->progs.stream_parser, stream_parser); if (stream_verdict) psock_set_prog(&psock->progs.stream_verdict, stream_verdict); if (skb_verdict) psock_set_prog(&psock->progs.skb_verdict, skb_verdict); /* msg_* and stream_* programs references tracked in psock after this * point. Reference dec and cleanup will occur through psock destructor */ ret = sock_map_init_proto(sk, psock); if (ret < 0) { sk_psock_put(sk, psock); goto out; } write_lock_bh(&sk->sk_callback_lock); if (stream_parser && stream_verdict && !psock->saved_data_ready) { ret = sk_psock_init_strp(sk, psock); if (ret) { write_unlock_bh(&sk->sk_callback_lock); sk_psock_put(sk, psock); goto out; } sk_psock_start_strp(sk, psock); } else if (!stream_parser && stream_verdict && !psock->saved_data_ready) { sk_psock_start_verdict(sk,psock); } else if (!stream_verdict && skb_verdict && !psock->saved_data_ready) { sk_psock_start_verdict(sk, psock); } write_unlock_bh(&sk->sk_callback_lock); return 0; out_progs: if (skb_verdict) bpf_prog_put(skb_verdict); out_put_msg_parser: if (msg_parser) bpf_prog_put(msg_parser); out_put_stream_parser: if (stream_parser) bpf_prog_put(stream_parser); out_put_stream_verdict: if (stream_verdict) bpf_prog_put(stream_verdict); out: return ret; } static void sock_map_free(struct bpf_map *map) { struct bpf_stab *stab = container_of(map, struct bpf_stab, map); int i; /* After the sync no updates or deletes will be in-flight so it * is safe to walk map and remove entries without risking a race * in EEXIST update case. */ synchronize_rcu(); for (i = 0; i < stab->map.max_entries; i++) { struct sock **psk = &stab->sks[i]; struct sock *sk; sk = xchg(psk, NULL); if (sk) { sock_hold(sk); lock_sock(sk); rcu_read_lock(); sock_map_unref(sk, psk); rcu_read_unlock(); release_sock(sk); sock_put(sk); } } /* wait for psock readers accessing its map link */ synchronize_rcu(); bpf_map_area_free(stab->sks); bpf_map_area_free(stab); } static void sock_map_release_progs(struct bpf_map *map) { psock_progs_drop(&container_of(map, struct bpf_stab, map)->progs); } static struct sock *__sock_map_lookup_elem(struct bpf_map *map, u32 key) { struct bpf_stab *stab = container_of(map, struct bpf_stab, map); WARN_ON_ONCE(!rcu_read_lock_held()); if (unlikely(key >= map->max_entries)) return NULL; return READ_ONCE(stab->sks[key]); } static void *sock_map_lookup(struct bpf_map *map, void *key) { struct sock *sk; sk = __sock_map_lookup_elem(map, *(u32 *)key); if (!sk) return NULL; if (sk_is_refcounted(sk) && !refcount_inc_not_zero(&sk->sk_refcnt)) return NULL; return sk; } static void *sock_map_lookup_sys(struct bpf_map *map, void *key) { struct sock *sk; if (map->value_size != sizeof(u64)) return ERR_PTR(-ENOSPC); sk = __sock_map_lookup_elem(map, *(u32 *)key); if (!sk) return ERR_PTR(-ENOENT); __sock_gen_cookie(sk); return &sk->sk_cookie; } static int __sock_map_delete(struct bpf_stab *stab, struct sock *sk_test, struct sock **psk) { struct sock *sk; int err = 0; spin_lock_bh(&stab->lock); sk = *psk; if (!sk_test || sk_test == sk) sk = xchg(psk, NULL); if (likely(sk)) sock_map_unref(sk, psk); else err = -EINVAL; spin_unlock_bh(&stab->lock); return err; } static void sock_map_delete_from_link(struct bpf_map *map, struct sock *sk, void *link_raw) { struct bpf_stab *stab = container_of(map, struct bpf_stab, map); __sock_map_delete(stab, sk, link_raw); } static long sock_map_delete_elem(struct bpf_map *map, void *key) { struct bpf_stab *stab = container_of(map, struct bpf_stab, map); u32 i = *(u32 *)key; struct sock **psk; if (unlikely(i >= map->max_entries)) return -EINVAL; psk = &stab->sks[i]; return __sock_map_delete(stab, NULL, psk); } static int sock_map_get_next_key(struct bpf_map *map, void *key, void *next) { struct bpf_stab *stab = container_of(map, struct bpf_stab, map); u32 i = key ? *(u32 *)key : U32_MAX; u32 *key_next = next; if (i == stab->map.max_entries - 1) return -ENOENT; if (i >= stab->map.max_entries) *key_next = 0; else *key_next = i + 1; return 0; } static int sock_map_update_common(struct bpf_map *map, u32 idx, struct sock *sk, u64 flags) { struct bpf_stab *stab = container_of(map, struct bpf_stab, map); struct sk_psock_link *link; struct sk_psock *psock; struct sock *osk; int ret; WARN_ON_ONCE(!rcu_read_lock_held()); if (unlikely(flags > BPF_EXIST)) return -EINVAL; if (unlikely(idx >= map->max_entries)) return -E2BIG; link = sk_psock_init_link(); if (!link) return -ENOMEM; ret = sock_map_link(map, sk); if (ret < 0) goto out_free; psock = sk_psock(sk); WARN_ON_ONCE(!psock); spin_lock_bh(&stab->lock); osk = stab->sks[idx]; if (osk && flags == BPF_NOEXIST) { ret = -EEXIST; goto out_unlock; } else if (!osk && flags == BPF_EXIST) { ret = -ENOENT; goto out_unlock; } sock_map_add_link(psock, link, map, &stab->sks[idx]); stab->sks[idx] = sk; if (osk) sock_map_unref(osk, &stab->sks[idx]); spin_unlock_bh(&stab->lock); return 0; out_unlock: spin_unlock_bh(&stab->lock); if (psock) sk_psock_put(sk, psock); out_free: sk_psock_free_link(link); return ret; } static bool sock_map_op_okay(const struct bpf_sock_ops_kern *ops) { return ops->op == BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB || ops->op == BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB || ops->op == BPF_SOCK_OPS_TCP_LISTEN_CB; } static bool sock_map_redirect_allowed(const struct sock *sk) { if (sk_is_tcp(sk)) return sk->sk_state != TCP_LISTEN; else return sk->sk_state == TCP_ESTABLISHED; } static bool sock_map_sk_is_suitable(const struct sock *sk) { return !!sk->sk_prot->psock_update_sk_prot; } static bool sock_map_sk_state_allowed(const struct sock *sk) { if (sk_is_tcp(sk)) return (1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_LISTEN); if (sk_is_stream_unix(sk)) return (1 << sk->sk_state) & TCPF_ESTABLISHED; return true; } static int sock_hash_update_common(struct bpf_map *map, void *key, struct sock *sk, u64 flags); int sock_map_update_elem_sys(struct bpf_map *map, void *key, void *value, u64 flags) { struct socket *sock; struct sock *sk; int ret; u64 ufd; if (map->value_size == sizeof(u64)) ufd = *(u64 *)value; else ufd = *(u32 *)value; if (ufd > S32_MAX) return -EINVAL; sock = sockfd_lookup(ufd, &ret); if (!sock) return ret; sk = sock->sk; if (!sk) { ret = -EINVAL; goto out; } if (!sock_map_sk_is_suitable(sk)) { ret = -EOPNOTSUPP; goto out; } sock_map_sk_acquire(sk); if (!sock_map_sk_state_allowed(sk)) ret = -EOPNOTSUPP; else if (map->map_type == BPF_MAP_TYPE_SOCKMAP) ret = sock_map_update_common(map, *(u32 *)key, sk, flags); else ret = sock_hash_update_common(map, key, sk, flags); sock_map_sk_release(sk); out: sockfd_put(sock); return ret; } static long sock_map_update_elem(struct bpf_map *map, void *key, void *value, u64 flags) { struct sock *sk = (struct sock *)value; int ret; if (unlikely(!sk || !sk_fullsock(sk))) return -EINVAL; if (!sock_map_sk_is_suitable(sk)) return -EOPNOTSUPP; local_bh_disable(); bh_lock_sock(sk); if (!sock_map_sk_state_allowed(sk)) ret = -EOPNOTSUPP; else if (map->map_type == BPF_MAP_TYPE_SOCKMAP) ret = sock_map_update_common(map, *(u32 *)key, sk, flags); else ret = sock_hash_update_common(map, key, sk, flags); bh_unlock_sock(sk); local_bh_enable(); return ret; } BPF_CALL_4(bpf_sock_map_update, struct bpf_sock_ops_kern *, sops, struct bpf_map *, map, void *, key, u64, flags) { WARN_ON_ONCE(!rcu_read_lock_held()); if (likely(sock_map_sk_is_suitable(sops->sk) && sock_map_op_okay(sops))) return sock_map_update_common(map, *(u32 *)key, sops->sk, flags); return -EOPNOTSUPP; } const struct bpf_func_proto bpf_sock_map_update_proto = { .func = bpf_sock_map_update, .gpl_only = false, .pkt_access = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_PTR_TO_MAP_KEY, .arg4_type = ARG_ANYTHING, }; BPF_CALL_4(bpf_sk_redirect_map, struct sk_buff *, skb, struct bpf_map *, map, u32, key, u64, flags) { struct sock *sk; if (unlikely(flags & ~(BPF_F_INGRESS))) return SK_DROP; sk = __sock_map_lookup_elem(map, key); if (unlikely(!sk || !sock_map_redirect_allowed(sk))) return SK_DROP; skb_bpf_set_redir(skb, sk, flags & BPF_F_INGRESS); return SK_PASS; } const struct bpf_func_proto bpf_sk_redirect_map_proto = { .func = bpf_sk_redirect_map, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_ANYTHING, }; BPF_CALL_4(bpf_msg_redirect_map, struct sk_msg *, msg, struct bpf_map *, map, u32, key, u64, flags) { struct sock *sk; if (unlikely(flags & ~(BPF_F_INGRESS))) return SK_DROP; sk = __sock_map_lookup_elem(map, key); if (unlikely(!sk || !sock_map_redirect_allowed(sk))) return SK_DROP; if (!(flags & BPF_F_INGRESS) && !sk_is_tcp(sk)) return SK_DROP; msg->flags = flags; msg->sk_redir = sk; return SK_PASS; } const struct bpf_func_proto bpf_msg_redirect_map_proto = { .func = bpf_msg_redirect_map, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_ANYTHING, }; struct sock_map_seq_info { struct bpf_map *map; struct sock *sk; u32 index; }; struct bpf_iter__sockmap { __bpf_md_ptr(struct bpf_iter_meta *, meta); __bpf_md_ptr(struct bpf_map *, map); __bpf_md_ptr(void *, key); __bpf_md_ptr(struct sock *, sk); }; DEFINE_BPF_ITER_FUNC(sockmap, struct bpf_iter_meta *meta, struct bpf_map *map, void *key, struct sock *sk) static void *sock_map_seq_lookup_elem(struct sock_map_seq_info *info) { if (unlikely(info->index >= info->map->max_entries)) return NULL; info->sk = __sock_map_lookup_elem(info->map, info->index); /* can't return sk directly, since that might be NULL */ return info; } static void *sock_map_seq_start(struct seq_file *seq, loff_t *pos) __acquires(rcu) { struct sock_map_seq_info *info = seq->private; if (*pos == 0) ++*pos; /* pairs with sock_map_seq_stop */ rcu_read_lock(); return sock_map_seq_lookup_elem(info); } static void *sock_map_seq_next(struct seq_file *seq, void *v, loff_t *pos) __must_hold(rcu) { struct sock_map_seq_info *info = seq->private; ++*pos; ++info->index; return sock_map_seq_lookup_elem(info); } static int sock_map_seq_show(struct seq_file *seq, void *v) __must_hold(rcu) { struct sock_map_seq_info *info = seq->private; struct bpf_iter__sockmap ctx = {}; struct bpf_iter_meta meta; struct bpf_prog *prog; meta.seq = seq; prog = bpf_iter_get_info(&meta, !v); if (!prog) return 0; ctx.meta = &meta; ctx.map = info->map; if (v) { ctx.key = &info->index; ctx.sk = info->sk; } return bpf_iter_run_prog(prog, &ctx); } static void sock_map_seq_stop(struct seq_file *seq, void *v) __releases(rcu) { if (!v) (void)sock_map_seq_show(seq, NULL); /* pairs with sock_map_seq_start */ rcu_read_unlock(); } static const struct seq_operations sock_map_seq_ops = { .start = sock_map_seq_start, .next = sock_map_seq_next, .stop = sock_map_seq_stop, .show = sock_map_seq_show, }; static int sock_map_init_seq_private(void *priv_data, struct bpf_iter_aux_info *aux) { struct sock_map_seq_info *info = priv_data; bpf_map_inc_with_uref(aux->map); info->map = aux->map; return 0; } static void sock_map_fini_seq_private(void *priv_data) { struct sock_map_seq_info *info = priv_data; bpf_map_put_with_uref(info->map); } static u64 sock_map_mem_usage(const struct bpf_map *map) { u64 usage = sizeof(struct bpf_stab); usage += (u64)map->max_entries * sizeof(struct sock *); return usage; } static const struct bpf_iter_seq_info sock_map_iter_seq_info = { .seq_ops = &sock_map_seq_ops, .init_seq_private = sock_map_init_seq_private, .fini_seq_private = sock_map_fini_seq_private, .seq_priv_size = sizeof(struct sock_map_seq_info), }; BTF_ID_LIST_SINGLE(sock_map_btf_ids, struct, bpf_stab) const struct bpf_map_ops sock_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc = sock_map_alloc, .map_free = sock_map_free, .map_get_next_key = sock_map_get_next_key, .map_lookup_elem_sys_only = sock_map_lookup_sys, .map_update_elem = sock_map_update_elem, .map_delete_elem = sock_map_delete_elem, .map_lookup_elem = sock_map_lookup, .map_release_uref = sock_map_release_progs, .map_check_btf = map_check_no_btf, .map_mem_usage = sock_map_mem_usage, .map_btf_id = &sock_map_btf_ids[0], .iter_seq_info = &sock_map_iter_seq_info, }; struct bpf_shtab_elem { struct rcu_head rcu; u32 hash; struct sock *sk; struct hlist_node node; u8 key[]; }; struct bpf_shtab_bucket { struct hlist_head head; spinlock_t lock; }; struct bpf_shtab { struct bpf_map map; struct bpf_shtab_bucket *buckets; u32 buckets_num; u32 elem_size; struct sk_psock_progs progs; atomic_t count; }; static inline u32 sock_hash_bucket_hash(const void *key, u32 len) { return jhash(key, len, 0); } static struct bpf_shtab_bucket *sock_hash_select_bucket(struct bpf_shtab *htab, u32 hash) { return &htab->buckets[hash & (htab->buckets_num - 1)]; } static struct bpf_shtab_elem * sock_hash_lookup_elem_raw(struct hlist_head *head, u32 hash, void *key, u32 key_size) { struct bpf_shtab_elem *elem; hlist_for_each_entry_rcu(elem, head, node) { if (elem->hash == hash && !memcmp(&elem->key, key, key_size)) return elem; } return NULL; } static struct sock *__sock_hash_lookup_elem(struct bpf_map *map, void *key) { struct bpf_shtab *htab = container_of(map, struct bpf_shtab, map); u32 key_size = map->key_size, hash; struct bpf_shtab_bucket *bucket; struct bpf_shtab_elem *elem; WARN_ON_ONCE(!rcu_read_lock_held()); hash = sock_hash_bucket_hash(key, key_size); bucket = sock_hash_select_bucket(htab, hash); elem = sock_hash_lookup_elem_raw(&bucket->head, hash, key, key_size); return elem ? elem->sk : NULL; } static void sock_hash_free_elem(struct bpf_shtab *htab, struct bpf_shtab_elem *elem) { atomic_dec(&htab->count); kfree_rcu(elem, rcu); } static void sock_hash_delete_from_link(struct bpf_map *map, struct sock *sk, void *link_raw) { struct bpf_shtab *htab = container_of(map, struct bpf_shtab, map); struct bpf_shtab_elem *elem_probe, *elem = link_raw; struct bpf_shtab_bucket *bucket; WARN_ON_ONCE(!rcu_read_lock_held()); bucket = sock_hash_select_bucket(htab, elem->hash); /* elem may be deleted in parallel from the map, but access here * is okay since it's going away only after RCU grace period. * However, we need to check whether it's still present. */ spin_lock_bh(&bucket->lock); elem_probe = sock_hash_lookup_elem_raw(&bucket->head, elem->hash, elem->key, map->key_size); if (elem_probe && elem_probe == elem) { hlist_del_rcu(&elem->node); sock_map_unref(elem->sk, elem); sock_hash_free_elem(htab, elem); } spin_unlock_bh(&bucket->lock); } static long sock_hash_delete_elem(struct bpf_map *map, void *key) { struct bpf_shtab *htab = container_of(map, struct bpf_shtab, map); u32 hash, key_size = map->key_size; struct bpf_shtab_bucket *bucket; struct bpf_shtab_elem *elem; int ret = -ENOENT; hash = sock_hash_bucket_hash(key, key_size); bucket = sock_hash_select_bucket(htab, hash); spin_lock_bh(&bucket->lock); elem = sock_hash_lookup_elem_raw(&bucket->head, hash, key, key_size); if (elem) { hlist_del_rcu(&elem->node); sock_map_unref(elem->sk, elem); sock_hash_free_elem(htab, elem); ret = 0; } spin_unlock_bh(&bucket->lock); return ret; } static struct bpf_shtab_elem *sock_hash_alloc_elem(struct bpf_shtab *htab, void *key, u32 key_size, u32 hash, struct sock *sk, struct bpf_shtab_elem *old) { struct bpf_shtab_elem *new; if (atomic_inc_return(&htab->count) > htab->map.max_entries) { if (!old) { atomic_dec(&htab->count); return ERR_PTR(-E2BIG); } } new = bpf_map_kmalloc_node(&htab->map, htab->elem_size, GFP_ATOMIC | __GFP_NOWARN, htab->map.numa_node); if (!new) { atomic_dec(&htab->count); return ERR_PTR(-ENOMEM); } memcpy(new->key, key, key_size); new->sk = sk; new->hash = hash; return new; } static int sock_hash_update_common(struct bpf_map *map, void *key, struct sock *sk, u64 flags) { struct bpf_shtab *htab = container_of(map, struct bpf_shtab, map); u32 key_size = map->key_size, hash; struct bpf_shtab_elem *elem, *elem_new; struct bpf_shtab_bucket *bucket; struct sk_psock_link *link; struct sk_psock *psock; int ret; WARN_ON_ONCE(!rcu_read_lock_held()); if (unlikely(flags > BPF_EXIST)) return -EINVAL; link = sk_psock_init_link(); if (!link) return -ENOMEM; ret = sock_map_link(map, sk); if (ret < 0) goto out_free; psock = sk_psock(sk); WARN_ON_ONCE(!psock); hash = sock_hash_bucket_hash(key, key_size); bucket = sock_hash_select_bucket(htab, hash); spin_lock_bh(&bucket->lock); elem = sock_hash_lookup_elem_raw(&bucket->head, hash, key, key_size); if (elem && flags == BPF_NOEXIST) { ret = -EEXIST; goto out_unlock; } else if (!elem && flags == BPF_EXIST) { ret = -ENOENT; goto out_unlock; } elem_new = sock_hash_alloc_elem(htab, key, key_size, hash, sk, elem); if (IS_ERR(elem_new)) { ret = PTR_ERR(elem_new); goto out_unlock; } sock_map_add_link(psock, link, map, elem_new); /* Add new element to the head of the list, so that * concurrent search will find it before old elem. */ hlist_add_head_rcu(&elem_new->node, &bucket->head); if (elem) { hlist_del_rcu(&elem->node); sock_map_unref(elem->sk, elem); sock_hash_free_elem(htab, elem); } spin_unlock_bh(&bucket->lock); return 0; out_unlock: spin_unlock_bh(&bucket->lock); sk_psock_put(sk, psock); out_free: sk_psock_free_link(link); return ret; } static int sock_hash_get_next_key(struct bpf_map *map, void *key, void *key_next) { struct bpf_shtab *htab = container_of(map, struct bpf_shtab, map); struct bpf_shtab_elem *elem, *elem_next; u32 hash, key_size = map->key_size; struct hlist_head *head; int i = 0; if (!key) goto find_first_elem; hash = sock_hash_bucket_hash(key, key_size); head = &sock_hash_select_bucket(htab, hash)->head; elem = sock_hash_lookup_elem_raw(head, hash, key, key_size); if (!elem) goto find_first_elem; elem_next = hlist_entry_safe(rcu_dereference(hlist_next_rcu(&elem->node)), struct bpf_shtab_elem, node); if (elem_next) { memcpy(key_next, elem_next->key, key_size); return 0; } i = hash & (htab->buckets_num - 1); i++; find_first_elem: for (; i < htab->buckets_num; i++) { head = &sock_hash_select_bucket(htab, i)->head; elem_next = hlist_entry_safe(rcu_dereference(hlist_first_rcu(head)), struct bpf_shtab_elem, node); if (elem_next) { memcpy(key_next, elem_next->key, key_size); return 0; } } return -ENOENT; } static struct bpf_map *sock_hash_alloc(union bpf_attr *attr) { struct bpf_shtab *htab; int i, err; if (attr->max_entries == 0 || attr->key_size == 0 || (attr->value_size != sizeof(u32) && attr->value_size != sizeof(u64)) || attr->map_flags & ~SOCK_CREATE_FLAG_MASK) return ERR_PTR(-EINVAL); if (attr->key_size > MAX_BPF_STACK) return ERR_PTR(-E2BIG); htab = bpf_map_area_alloc(sizeof(*htab), NUMA_NO_NODE); if (!htab) return ERR_PTR(-ENOMEM); bpf_map_init_from_attr(&htab->map, attr); htab->buckets_num = roundup_pow_of_two(htab->map.max_entries); htab->elem_size = sizeof(struct bpf_shtab_elem) + round_up(htab->map.key_size, 8); if (htab->buckets_num == 0 || htab->buckets_num > U32_MAX / sizeof(struct bpf_shtab_bucket)) { err = -EINVAL; goto free_htab; } htab->buckets = bpf_map_area_alloc(htab->buckets_num * sizeof(struct bpf_shtab_bucket), htab->map.numa_node); if (!htab->buckets) { err = -ENOMEM; goto free_htab; } for (i = 0; i < htab->buckets_num; i++) { INIT_HLIST_HEAD(&htab->buckets[i].head); spin_lock_init(&htab->buckets[i].lock); } return &htab->map; free_htab: bpf_map_area_free(htab); return ERR_PTR(err); } static void sock_hash_free(struct bpf_map *map) { struct bpf_shtab *htab = container_of(map, struct bpf_shtab, map); struct bpf_shtab_bucket *bucket; struct hlist_head unlink_list; struct bpf_shtab_elem *elem; struct hlist_node *node; int i; /* After the sync no updates or deletes will be in-flight so it * is safe to walk map and remove entries without risking a race * in EEXIST update case. */ synchronize_rcu(); for (i = 0; i < htab->buckets_num; i++) { bucket = sock_hash_select_bucket(htab, i); /* We are racing with sock_hash_delete_from_link to * enter the spin-lock critical section. Every socket on * the list is still linked to sockhash. Since link * exists, psock exists and holds a ref to socket. That * lets us to grab a socket ref too. */ spin_lock_bh(&bucket->lock); hlist_for_each_entry(elem, &bucket->head, node) sock_hold(elem->sk); hlist_move_list(&bucket->head, &unlink_list); spin_unlock_bh(&bucket->lock); /* Process removed entries out of atomic context to * block for socket lock before deleting the psock's * link to sockhash. */ hlist_for_each_entry_safe(elem, node, &unlink_list, node) { hlist_del(&elem->node); lock_sock(elem->sk); rcu_read_lock(); sock_map_unref(elem->sk, elem); rcu_read_unlock(); release_sock(elem->sk); sock_put(elem->sk); sock_hash_free_elem(htab, elem); } } /* wait for psock readers accessing its map link */ synchronize_rcu(); bpf_map_area_free(htab->buckets); bpf_map_area_free(htab); } static void *sock_hash_lookup_sys(struct bpf_map *map, void *key) { struct sock *sk; if (map->value_size != sizeof(u64)) return ERR_PTR(-ENOSPC); sk = __sock_hash_lookup_elem(map, key); if (!sk) return ERR_PTR(-ENOENT); __sock_gen_cookie(sk); return &sk->sk_cookie; } static void *sock_hash_lookup(struct bpf_map *map, void *key) { struct sock *sk; sk = __sock_hash_lookup_elem(map, key); if (!sk) return NULL; if (sk_is_refcounted(sk) && !refcount_inc_not_zero(&sk->sk_refcnt)) return NULL; return sk; } static void sock_hash_release_progs(struct bpf_map *map) { psock_progs_drop(&container_of(map, struct bpf_shtab, map)->progs); } BPF_CALL_4(bpf_sock_hash_update, struct bpf_sock_ops_kern *, sops, struct bpf_map *, map, void *, key, u64, flags) { WARN_ON_ONCE(!rcu_read_lock_held()); if (likely(sock_map_sk_is_suitable(sops->sk) && sock_map_op_okay(sops))) return sock_hash_update_common(map, key, sops->sk, flags); return -EOPNOTSUPP; } const struct bpf_func_proto bpf_sock_hash_update_proto = { .func = bpf_sock_hash_update, .gpl_only = false, .pkt_access = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_PTR_TO_MAP_KEY, .arg4_type = ARG_ANYTHING, }; BPF_CALL_4(bpf_sk_redirect_hash, struct sk_buff *, skb, struct bpf_map *, map, void *, key, u64, flags) { struct sock *sk; if (unlikely(flags & ~(BPF_F_INGRESS))) return SK_DROP; sk = __sock_hash_lookup_elem(map, key); if (unlikely(!sk || !sock_map_redirect_allowed(sk))) return SK_DROP; skb_bpf_set_redir(skb, sk, flags & BPF_F_INGRESS); return SK_PASS; } const struct bpf_func_proto bpf_sk_redirect_hash_proto = { .func = bpf_sk_redirect_hash, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_PTR_TO_MAP_KEY, .arg4_type = ARG_ANYTHING, }; BPF_CALL_4(bpf_msg_redirect_hash, struct sk_msg *, msg, struct bpf_map *, map, void *, key, u64, flags) { struct sock *sk; if (unlikely(flags & ~(BPF_F_INGRESS))) return SK_DROP; sk = __sock_hash_lookup_elem(map, key); if (unlikely(!sk || !sock_map_redirect_allowed(sk))) return SK_DROP; if (!(flags & BPF_F_INGRESS) && !sk_is_tcp(sk)) return SK_DROP; msg->flags = flags; msg->sk_redir = sk; return SK_PASS; } const struct bpf_func_proto bpf_msg_redirect_hash_proto = { .func = bpf_msg_redirect_hash, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_PTR_TO_MAP_KEY, .arg4_type = ARG_ANYTHING, }; struct sock_hash_seq_info { struct bpf_map *map; struct bpf_shtab *htab; u32 bucket_id; }; static void *sock_hash_seq_find_next(struct sock_hash_seq_info *info, struct bpf_shtab_elem *prev_elem) { const struct bpf_shtab *htab = info->htab; struct bpf_shtab_bucket *bucket; struct bpf_shtab_elem *elem; struct hlist_node *node; /* try to find next elem in the same bucket */ if (prev_elem) { node = rcu_dereference(hlist_next_rcu(&prev_elem->node)); elem = hlist_entry_safe(node, struct bpf_shtab_elem, node); if (elem) return elem; /* no more elements, continue in the next bucket */ info->bucket_id++; } for (; info->bucket_id < htab->buckets_num; info->bucket_id++) { bucket = &htab->buckets[info->bucket_id]; node = rcu_dereference(hlist_first_rcu(&bucket->head)); elem = hlist_entry_safe(node, struct bpf_shtab_elem, node); if (elem) return elem; } return NULL; } static void *sock_hash_seq_start(struct seq_file *seq, loff_t *pos) __acquires(rcu) { struct sock_hash_seq_info *info = seq->private; if (*pos == 0) ++*pos; /* pairs with sock_hash_seq_stop */ rcu_read_lock(); return sock_hash_seq_find_next(info, NULL); } static void *sock_hash_seq_next(struct seq_file *seq, void *v, loff_t *pos) __must_hold(rcu) { struct sock_hash_seq_info *info = seq->private; ++*pos; return sock_hash_seq_find_next(info, v); } static int sock_hash_seq_show(struct seq_file *seq, void *v) __must_hold(rcu) { struct sock_hash_seq_info *info = seq->private; struct bpf_iter__sockmap ctx = {}; struct bpf_shtab_elem *elem = v; struct bpf_iter_meta meta; struct bpf_prog *prog; meta.seq = seq; prog = bpf_iter_get_info(&meta, !elem); if (!prog) return 0; ctx.meta = &meta; ctx.map = info->map; if (elem) { ctx.key = elem->key; ctx.sk = elem->sk; } return bpf_iter_run_prog(prog, &ctx); } static void sock_hash_seq_stop(struct seq_file *seq, void *v) __releases(rcu) { if (!v) (void)sock_hash_seq_show(seq, NULL); /* pairs with sock_hash_seq_start */ rcu_read_unlock(); } static const struct seq_operations sock_hash_seq_ops = { .start = sock_hash_seq_start, .next = sock_hash_seq_next, .stop = sock_hash_seq_stop, .show = sock_hash_seq_show, }; static int sock_hash_init_seq_private(void *priv_data, struct bpf_iter_aux_info *aux) { struct sock_hash_seq_info *info = priv_data; bpf_map_inc_with_uref(aux->map); info->map = aux->map; info->htab = container_of(aux->map, struct bpf_shtab, map); return 0; } static void sock_hash_fini_seq_private(void *priv_data) { struct sock_hash_seq_info *info = priv_data; bpf_map_put_with_uref(info->map); } static u64 sock_hash_mem_usage(const struct bpf_map *map) { struct bpf_shtab *htab = container_of(map, struct bpf_shtab, map); u64 usage = sizeof(*htab); usage += htab->buckets_num * sizeof(struct bpf_shtab_bucket); usage += atomic_read(&htab->count) * (u64)htab->elem_size; return usage; } static const struct bpf_iter_seq_info sock_hash_iter_seq_info = { .seq_ops = &sock_hash_seq_ops, .init_seq_private = sock_hash_init_seq_private, .fini_seq_private = sock_hash_fini_seq_private, .seq_priv_size = sizeof(struct sock_hash_seq_info), }; BTF_ID_LIST_SINGLE(sock_hash_map_btf_ids, struct, bpf_shtab) const struct bpf_map_ops sock_hash_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc = sock_hash_alloc, .map_free = sock_hash_free, .map_get_next_key = sock_hash_get_next_key, .map_update_elem = sock_map_update_elem, .map_delete_elem = sock_hash_delete_elem, .map_lookup_elem = sock_hash_lookup, .map_lookup_elem_sys_only = sock_hash_lookup_sys, .map_release_uref = sock_hash_release_progs, .map_check_btf = map_check_no_btf, .map_mem_usage = sock_hash_mem_usage, .map_btf_id = &sock_hash_map_btf_ids[0], .iter_seq_info = &sock_hash_iter_seq_info, }; static struct sk_psock_progs *sock_map_progs(struct bpf_map *map) { switch (map->map_type) { case BPF_MAP_TYPE_SOCKMAP: return &container_of(map, struct bpf_stab, map)->progs; case BPF_MAP_TYPE_SOCKHASH: return &container_of(map, struct bpf_shtab, map)->progs; default: break; } return NULL; } static int sock_map_prog_lookup(struct bpf_map *map, struct bpf_prog ***pprog, u32 which) { struct sk_psock_progs *progs = sock_map_progs(map); if (!progs) return -EOPNOTSUPP; switch (which) { case BPF_SK_MSG_VERDICT: *pprog = &progs->msg_parser; break; #if IS_ENABLED(CONFIG_BPF_STREAM_PARSER) case BPF_SK_SKB_STREAM_PARSER: *pprog = &progs->stream_parser; break; #endif case BPF_SK_SKB_STREAM_VERDICT: if (progs->skb_verdict) return -EBUSY; *pprog = &progs->stream_verdict; break; case BPF_SK_SKB_VERDICT: if (progs->stream_verdict) return -EBUSY; *pprog = &progs->skb_verdict; break; default: return -EOPNOTSUPP; } return 0; } static int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, struct bpf_prog *old, u32 which) { struct bpf_prog **pprog; int ret; ret = sock_map_prog_lookup(map, &pprog, which); if (ret) return ret; if (old) return psock_replace_prog(pprog, prog, old); psock_set_prog(pprog, prog); return 0; } int sock_map_bpf_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr) { __u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids); u32 prog_cnt = 0, flags = 0, ufd = attr->target_fd; struct bpf_prog **pprog; struct bpf_prog *prog; struct bpf_map *map; struct fd f; u32 id = 0; int ret; if (attr->query.query_flags) return -EINVAL; f = fdget(ufd); map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); rcu_read_lock(); ret = sock_map_prog_lookup(map, &pprog, attr->query.attach_type); if (ret) goto end; prog = *pprog; prog_cnt = !prog ? 0 : 1; if (!attr->query.prog_cnt || !prog_ids || !prog_cnt) goto end; /* we do not hold the refcnt, the bpf prog may be released * asynchronously and the id would be set to 0. */ id = data_race(prog->aux->id); if (id == 0) prog_cnt = 0; end: rcu_read_unlock(); if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags)) || (id != 0 && copy_to_user(prog_ids, &id, sizeof(u32))) || copy_to_user(&uattr->query.prog_cnt, &prog_cnt, sizeof(prog_cnt))) ret = -EFAULT; fdput(f); return ret; } static void sock_map_unlink(struct sock *sk, struct sk_psock_link *link) { switch (link->map->map_type) { case BPF_MAP_TYPE_SOCKMAP: return sock_map_delete_from_link(link->map, sk, link->link_raw); case BPF_MAP_TYPE_SOCKHASH: return sock_hash_delete_from_link(link->map, sk, link->link_raw); default: break; } } static void sock_map_remove_links(struct sock *sk, struct sk_psock *psock) { struct sk_psock_link *link; while ((link = sk_psock_link_pop(psock))) { sock_map_unlink(sk, link); sk_psock_free_link(link); } } void sock_map_unhash(struct sock *sk) { void (*saved_unhash)(struct sock *sk); struct sk_psock *psock; rcu_read_lock(); psock = sk_psock(sk); if (unlikely(!psock)) { rcu_read_unlock(); saved_unhash = READ_ONCE(sk->sk_prot)->unhash; } else { saved_unhash = psock->saved_unhash; sock_map_remove_links(sk, psock); rcu_read_unlock(); } if (WARN_ON_ONCE(saved_unhash == sock_map_unhash)) return; if (saved_unhash) saved_unhash(sk); } EXPORT_SYMBOL_GPL(sock_map_unhash); void sock_map_destroy(struct sock *sk) { void (*saved_destroy)(struct sock *sk); struct sk_psock *psock; rcu_read_lock(); psock = sk_psock_get(sk); if (unlikely(!psock)) { rcu_read_unlock(); saved_destroy = READ_ONCE(sk->sk_prot)->destroy; } else { saved_destroy = psock->saved_destroy; sock_map_remove_links(sk, psock); rcu_read_unlock(); sk_psock_stop(psock); sk_psock_put(sk, psock); } if (WARN_ON_ONCE(saved_destroy == sock_map_destroy)) return; if (saved_destroy) saved_destroy(sk); } EXPORT_SYMBOL_GPL(sock_map_destroy); void sock_map_close(struct sock *sk, long timeout) { void (*saved_close)(struct sock *sk, long timeout); struct sk_psock *psock; lock_sock(sk); rcu_read_lock(); psock = sk_psock_get(sk); if (unlikely(!psock)) { rcu_read_unlock(); release_sock(sk); saved_close = READ_ONCE(sk->sk_prot)->close; } else { saved_close = psock->saved_close; sock_map_remove_links(sk, psock); rcu_read_unlock(); sk_psock_stop(psock); release_sock(sk); cancel_delayed_work_sync(&psock->work); sk_psock_put(sk, psock); } /* Make sure we do not recurse. This is a bug. * Leak the socket instead of crashing on a stack overflow. */ if (WARN_ON_ONCE(saved_close == sock_map_close)) return; saved_close(sk, timeout); } EXPORT_SYMBOL_GPL(sock_map_close); static int sock_map_iter_attach_target(struct bpf_prog *prog, union bpf_iter_link_info *linfo, struct bpf_iter_aux_info *aux) { struct bpf_map *map; int err = -EINVAL; if (!linfo->map.map_fd) return -EBADF; map = bpf_map_get_with_uref(linfo->map.map_fd); if (IS_ERR(map)) return PTR_ERR(map); if (map->map_type != BPF_MAP_TYPE_SOCKMAP && map->map_type != BPF_MAP_TYPE_SOCKHASH) goto put_map; if (prog->aux->max_rdonly_access > map->key_size) { err = -EACCES; goto put_map; } aux->map = map; return 0; put_map: bpf_map_put_with_uref(map); return err; } static void sock_map_iter_detach_target(struct bpf_iter_aux_info *aux) { bpf_map_put_with_uref(aux->map); } static struct bpf_iter_reg sock_map_iter_reg = { .target = "sockmap", .attach_target = sock_map_iter_attach_target, .detach_target = sock_map_iter_detach_target, .show_fdinfo = bpf_iter_map_show_fdinfo, .fill_link_info = bpf_iter_map_fill_link_info, .ctx_arg_info_size = 2, .ctx_arg_info = { { offsetof(struct bpf_iter__sockmap, key), PTR_TO_BUF | PTR_MAYBE_NULL | MEM_RDONLY }, { offsetof(struct bpf_iter__sockmap, sk), PTR_TO_BTF_ID_OR_NULL }, }, }; static int __init bpf_sockmap_iter_init(void) { sock_map_iter_reg.ctx_arg_info[1].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK]; return bpf_iter_reg_target(&sock_map_iter_reg); } late_initcall(bpf_sockmap_iter_init); |
| 73 73 73 73 73 73 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 | /* * Cryptographic API. * * Glue code for the SHA512 Secure Hash Algorithm assembler * implementation using supplemental SSE3 / AVX / AVX2 instructions. * * This file is based on sha512_generic.c * * Copyright (C) 2013 Intel Corporation * Author: Tim Chen <tim.c.chen@linux.intel.com> * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the Free * Software Foundation; either version 2 of the License, or (at your option) * any later version. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. * */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <crypto/internal/hash.h> #include <crypto/internal/simd.h> #include <linux/init.h> #include <linux/module.h> #include <linux/mm.h> #include <linux/string.h> #include <linux/types.h> #include <crypto/sha2.h> #include <crypto/sha512_base.h> #include <asm/cpu_device_id.h> #include <asm/simd.h> asmlinkage void sha512_transform_ssse3(struct sha512_state *state, const u8 *data, int blocks); static int sha512_update(struct shash_desc *desc, const u8 *data, unsigned int len, sha512_block_fn *sha512_xform) { struct sha512_state *sctx = shash_desc_ctx(desc); if (!crypto_simd_usable() || (sctx->count[0] % SHA512_BLOCK_SIZE) + len < SHA512_BLOCK_SIZE) return crypto_sha512_update(desc, data, len); /* * Make sure struct sha512_state begins directly with the SHA512 * 512-bit internal state, as this is what the asm functions expect. */ BUILD_BUG_ON(offsetof(struct sha512_state, state) != 0); kernel_fpu_begin(); sha512_base_do_update(desc, data, len, sha512_xform); kernel_fpu_end(); return 0; } static int sha512_finup(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *out, sha512_block_fn *sha512_xform) { if (!crypto_simd_usable()) return crypto_sha512_finup(desc, data, len, out); kernel_fpu_begin(); if (len) sha512_base_do_update(desc, data, len, sha512_xform); sha512_base_do_finalize(desc, sha512_xform); kernel_fpu_end(); return sha512_base_finish(desc, out); } static int sha512_ssse3_update(struct shash_desc *desc, const u8 *data, unsigned int len) { return sha512_update(desc, data, len, sha512_transform_ssse3); } static int sha512_ssse3_finup(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *out) { return sha512_finup(desc, data, len, out, sha512_transform_ssse3); } /* Add padding and return the message digest. */ static int sha512_ssse3_final(struct shash_desc *desc, u8 *out) { return sha512_ssse3_finup(desc, NULL, 0, out); } static struct shash_alg sha512_ssse3_algs[] = { { .digestsize = SHA512_DIGEST_SIZE, .init = sha512_base_init, .update = sha512_ssse3_update, .final = sha512_ssse3_final, .finup = sha512_ssse3_finup, .descsize = sizeof(struct sha512_state), .base = { .cra_name = "sha512", .cra_driver_name = "sha512-ssse3", .cra_priority = 150, .cra_blocksize = SHA512_BLOCK_SIZE, .cra_module = THIS_MODULE, } }, { .digestsize = SHA384_DIGEST_SIZE, .init = sha384_base_init, .update = sha512_ssse3_update, .final = sha512_ssse3_final, .finup = sha512_ssse3_finup, .descsize = sizeof(struct sha512_state), .base = { .cra_name = "sha384", .cra_driver_name = "sha384-ssse3", .cra_priority = 150, .cra_blocksize = SHA384_BLOCK_SIZE, .cra_module = THIS_MODULE, } } }; static int register_sha512_ssse3(void) { if (boot_cpu_has(X86_FEATURE_SSSE3)) return crypto_register_shashes(sha512_ssse3_algs, ARRAY_SIZE(sha512_ssse3_algs)); return 0; } static void unregister_sha512_ssse3(void) { if (boot_cpu_has(X86_FEATURE_SSSE3)) crypto_unregister_shashes(sha512_ssse3_algs, ARRAY_SIZE(sha512_ssse3_algs)); } asmlinkage void sha512_transform_avx(struct sha512_state *state, const u8 *data, int blocks); static bool avx_usable(void) { if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) { if (boot_cpu_has(X86_FEATURE_AVX)) pr_info("AVX detected but unusable.\n"); return false; } return true; } static int sha512_avx_update(struct shash_desc *desc, const u8 *data, unsigned int len) { return sha512_update(desc, data, len, sha512_transform_avx); } static int sha512_avx_finup(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *out) { return sha512_finup(desc, data, len, out, sha512_transform_avx); } /* Add padding and return the message digest. */ static int sha512_avx_final(struct shash_desc *desc, u8 *out) { return sha512_avx_finup(desc, NULL, 0, out); } static struct shash_alg sha512_avx_algs[] = { { .digestsize = SHA512_DIGEST_SIZE, .init = sha512_base_init, .update = sha512_avx_update, .final = sha512_avx_final, .finup = sha512_avx_finup, .descsize = sizeof(struct sha512_state), .base = { .cra_name = "sha512", .cra_driver_name = "sha512-avx", .cra_priority = 160, .cra_blocksize = SHA512_BLOCK_SIZE, .cra_module = THIS_MODULE, } }, { .digestsize = SHA384_DIGEST_SIZE, .init = sha384_base_init, .update = sha512_avx_update, .final = sha512_avx_final, .finup = sha512_avx_finup, .descsize = sizeof(struct sha512_state), .base = { .cra_name = "sha384", .cra_driver_name = "sha384-avx", .cra_priority = 160, .cra_blocksize = SHA384_BLOCK_SIZE, .cra_module = THIS_MODULE, } } }; static int register_sha512_avx(void) { if (avx_usable()) return crypto_register_shashes(sha512_avx_algs, ARRAY_SIZE(sha512_avx_algs)); return 0; } static void unregister_sha512_avx(void) { if (avx_usable()) crypto_unregister_shashes(sha512_avx_algs, ARRAY_SIZE(sha512_avx_algs)); } asmlinkage void sha512_transform_rorx(struct sha512_state *state, const u8 *data, int blocks); static int sha512_avx2_update(struct shash_desc *desc, const u8 *data, unsigned int len) { return sha512_update(desc, data, len, sha512_transform_rorx); } static int sha512_avx2_finup(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *out) { return sha512_finup(desc, data, len, out, sha512_transform_rorx); } /* Add padding and return the message digest. */ static int sha512_avx2_final(struct shash_desc *desc, u8 *out) { return sha512_avx2_finup(desc, NULL, 0, out); } static struct shash_alg sha512_avx2_algs[] = { { .digestsize = SHA512_DIGEST_SIZE, .init = sha512_base_init, .update = sha512_avx2_update, .final = sha512_avx2_final, .finup = sha512_avx2_finup, .descsize = sizeof(struct sha512_state), .base = { .cra_name = "sha512", .cra_driver_name = "sha512-avx2", .cra_priority = 170, .cra_blocksize = SHA512_BLOCK_SIZE, .cra_module = THIS_MODULE, } }, { .digestsize = SHA384_DIGEST_SIZE, .init = sha384_base_init, .update = sha512_avx2_update, .final = sha512_avx2_final, .finup = sha512_avx2_finup, .descsize = sizeof(struct sha512_state), .base = { .cra_name = "sha384", .cra_driver_name = "sha384-avx2", .cra_priority = 170, .cra_blocksize = SHA384_BLOCK_SIZE, .cra_module = THIS_MODULE, } } }; static bool avx2_usable(void) { if (avx_usable() && boot_cpu_has(X86_FEATURE_AVX2) && boot_cpu_has(X86_FEATURE_BMI2)) return true; return false; } static int register_sha512_avx2(void) { if (avx2_usable()) return crypto_register_shashes(sha512_avx2_algs, ARRAY_SIZE(sha512_avx2_algs)); return 0; } static const struct x86_cpu_id module_cpu_ids[] = { X86_MATCH_FEATURE(X86_FEATURE_AVX2, NULL), X86_MATCH_FEATURE(X86_FEATURE_AVX, NULL), X86_MATCH_FEATURE(X86_FEATURE_SSSE3, NULL), {} }; MODULE_DEVICE_TABLE(x86cpu, module_cpu_ids); static void unregister_sha512_avx2(void) { if (avx2_usable()) crypto_unregister_shashes(sha512_avx2_algs, ARRAY_SIZE(sha512_avx2_algs)); } static int __init sha512_ssse3_mod_init(void) { if (!x86_match_cpu(module_cpu_ids)) return -ENODEV; if (register_sha512_ssse3()) goto fail; if (register_sha512_avx()) { unregister_sha512_ssse3(); goto fail; } if (register_sha512_avx2()) { unregister_sha512_avx(); unregister_sha512_ssse3(); goto fail; } return 0; fail: return -ENODEV; } static void __exit sha512_ssse3_mod_fini(void) { unregister_sha512_avx2(); unregister_sha512_avx(); unregister_sha512_ssse3(); } module_init(sha512_ssse3_mod_init); module_exit(sha512_ssse3_mod_fini); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("SHA512 Secure Hash Algorithm, Supplemental SSE3 accelerated"); MODULE_ALIAS_CRYPTO("sha512"); MODULE_ALIAS_CRYPTO("sha512-ssse3"); MODULE_ALIAS_CRYPTO("sha512-avx"); MODULE_ALIAS_CRYPTO("sha512-avx2"); MODULE_ALIAS_CRYPTO("sha384"); MODULE_ALIAS_CRYPTO("sha384-ssse3"); MODULE_ALIAS_CRYPTO("sha384-avx"); MODULE_ALIAS_CRYPTO("sha384-avx2"); |
| 1865 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 | // SPDX-License-Identifier: GPL-2.0-or-later /* * LAPB release 002 * * This code REQUIRES 2.1.15 or higher/ NET3.038 * * History * LAPB 001 Jonathan Naylor Started Coding * LAPB 002 Jonathan Naylor New timer architecture. * 2000-10-29 Henner Eisen lapb_data_indication() return status. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/errno.h> #include <linux/types.h> #include <linux/socket.h> #include <linux/in.h> #include <linux/kernel.h> #include <linux/jiffies.h> #include <linux/timer.h> #include <linux/string.h> #include <linux/sockios.h> #include <linux/net.h> #include <linux/inet.h> #include <linux/if_arp.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <net/sock.h> #include <linux/uaccess.h> #include <linux/fcntl.h> #include <linux/mm.h> #include <linux/interrupt.h> #include <linux/stat.h> #include <linux/init.h> #include <net/lapb.h> static LIST_HEAD(lapb_list); static DEFINE_RWLOCK(lapb_list_lock); /* * Free an allocated lapb control block. */ static void lapb_free_cb(struct lapb_cb *lapb) { kfree(lapb); } static __inline__ void lapb_hold(struct lapb_cb *lapb) { refcount_inc(&lapb->refcnt); } static __inline__ void lapb_put(struct lapb_cb *lapb) { if (refcount_dec_and_test(&lapb->refcnt)) lapb_free_cb(lapb); } /* * Socket removal during an interrupt is now safe. */ static void __lapb_remove_cb(struct lapb_cb *lapb) { if (lapb->node.next) { list_del(&lapb->node); lapb_put(lapb); } } /* * Add a socket to the bound sockets list. */ static void __lapb_insert_cb(struct lapb_cb *lapb) { list_add(&lapb->node, &lapb_list); lapb_hold(lapb); } static struct lapb_cb *__lapb_devtostruct(struct net_device *dev) { struct lapb_cb *lapb, *use = NULL; list_for_each_entry(lapb, &lapb_list, node) { if (lapb->dev == dev) { use = lapb; break; } } if (use) lapb_hold(use); return use; } static struct lapb_cb *lapb_devtostruct(struct net_device *dev) { struct lapb_cb *rc; read_lock_bh(&lapb_list_lock); rc = __lapb_devtostruct(dev); read_unlock_bh(&lapb_list_lock); return rc; } /* * Create an empty LAPB control block. */ static struct lapb_cb *lapb_create_cb(void) { struct lapb_cb *lapb = kzalloc(sizeof(*lapb), GFP_ATOMIC); if (!lapb) goto out; skb_queue_head_init(&lapb->write_queue); skb_queue_head_init(&lapb->ack_queue); timer_setup(&lapb->t1timer, NULL, 0); timer_setup(&lapb->t2timer, NULL, 0); lapb->t1timer_running = false; lapb->t2timer_running = false; lapb->t1 = LAPB_DEFAULT_T1; lapb->t2 = LAPB_DEFAULT_T2; lapb->n2 = LAPB_DEFAULT_N2; lapb->mode = LAPB_DEFAULT_MODE; lapb->window = LAPB_DEFAULT_WINDOW; lapb->state = LAPB_STATE_0; spin_lock_init(&lapb->lock); refcount_set(&lapb->refcnt, 1); out: return lapb; } int lapb_register(struct net_device *dev, const struct lapb_register_struct *callbacks) { struct lapb_cb *lapb; int rc = LAPB_BADTOKEN; write_lock_bh(&lapb_list_lock); lapb = __lapb_devtostruct(dev); if (lapb) { lapb_put(lapb); goto out; } lapb = lapb_create_cb(); rc = LAPB_NOMEM; if (!lapb) goto out; lapb->dev = dev; lapb->callbacks = callbacks; __lapb_insert_cb(lapb); lapb_start_t1timer(lapb); rc = LAPB_OK; out: write_unlock_bh(&lapb_list_lock); return rc; } EXPORT_SYMBOL(lapb_register); int lapb_unregister(struct net_device *dev) { struct lapb_cb *lapb; int rc = LAPB_BADTOKEN; write_lock_bh(&lapb_list_lock); lapb = __lapb_devtostruct(dev); if (!lapb) goto out; lapb_put(lapb); /* Wait for other refs to "lapb" to drop */ while (refcount_read(&lapb->refcnt) > 2) usleep_range(1, 10); spin_lock_bh(&lapb->lock); lapb_stop_t1timer(lapb); lapb_stop_t2timer(lapb); lapb_clear_queues(lapb); spin_unlock_bh(&lapb->lock); /* Wait for running timers to stop */ del_timer_sync(&lapb->t1timer); del_timer_sync(&lapb->t2timer); __lapb_remove_cb(lapb); lapb_put(lapb); rc = LAPB_OK; out: write_unlock_bh(&lapb_list_lock); return rc; } EXPORT_SYMBOL(lapb_unregister); int lapb_getparms(struct net_device *dev, struct lapb_parms_struct *parms) { int rc = LAPB_BADTOKEN; struct lapb_cb *lapb = lapb_devtostruct(dev); if (!lapb) goto out; spin_lock_bh(&lapb->lock); parms->t1 = lapb->t1 / HZ; parms->t2 = lapb->t2 / HZ; parms->n2 = lapb->n2; parms->n2count = lapb->n2count; parms->state = lapb->state; parms->window = lapb->window; parms->mode = lapb->mode; if (!timer_pending(&lapb->t1timer)) parms->t1timer = 0; else parms->t1timer = (lapb->t1timer.expires - jiffies) / HZ; if (!timer_pending(&lapb->t2timer)) parms->t2timer = 0; else parms->t2timer = (lapb->t2timer.expires - jiffies) / HZ; spin_unlock_bh(&lapb->lock); lapb_put(lapb); rc = LAPB_OK; out: return rc; } EXPORT_SYMBOL(lapb_getparms); int lapb_setparms(struct net_device *dev, struct lapb_parms_struct *parms) { int rc = LAPB_BADTOKEN; struct lapb_cb *lapb = lapb_devtostruct(dev); if (!lapb) goto out; spin_lock_bh(&lapb->lock); rc = LAPB_INVALUE; if (parms->t1 < 1 || parms->t2 < 1 || parms->n2 < 1) goto out_put; if (lapb->state == LAPB_STATE_0) { if (parms->mode & LAPB_EXTENDED) { if (parms->window < 1 || parms->window > 127) goto out_put; } else { if (parms->window < 1 || parms->window > 7) goto out_put; } lapb->mode = parms->mode; lapb->window = parms->window; } lapb->t1 = parms->t1 * HZ; lapb->t2 = parms->t2 * HZ; lapb->n2 = parms->n2; rc = LAPB_OK; out_put: spin_unlock_bh(&lapb->lock); lapb_put(lapb); out: return rc; } EXPORT_SYMBOL(lapb_setparms); int lapb_connect_request(struct net_device *dev) { struct lapb_cb *lapb = lapb_devtostruct(dev); int rc = LAPB_BADTOKEN; if (!lapb) goto out; spin_lock_bh(&lapb->lock); rc = LAPB_OK; if (lapb->state == LAPB_STATE_1) goto out_put; rc = LAPB_CONNECTED; if (lapb->state == LAPB_STATE_3 || lapb->state == LAPB_STATE_4) goto out_put; lapb_establish_data_link(lapb); lapb_dbg(0, "(%p) S0 -> S1\n", lapb->dev); lapb->state = LAPB_STATE_1; rc = LAPB_OK; out_put: spin_unlock_bh(&lapb->lock); lapb_put(lapb); out: return rc; } EXPORT_SYMBOL(lapb_connect_request); static int __lapb_disconnect_request(struct lapb_cb *lapb) { switch (lapb->state) { case LAPB_STATE_0: return LAPB_NOTCONNECTED; case LAPB_STATE_1: lapb_dbg(1, "(%p) S1 TX DISC(1)\n", lapb->dev); lapb_dbg(0, "(%p) S1 -> S0\n", lapb->dev); lapb_send_control(lapb, LAPB_DISC, LAPB_POLLON, LAPB_COMMAND); lapb->state = LAPB_STATE_0; lapb_start_t1timer(lapb); return LAPB_NOTCONNECTED; case LAPB_STATE_2: return LAPB_OK; } lapb_clear_queues(lapb); lapb->n2count = 0; lapb_send_control(lapb, LAPB_DISC, LAPB_POLLON, LAPB_COMMAND); lapb_start_t1timer(lapb); lapb_stop_t2timer(lapb); lapb->state = LAPB_STATE_2; lapb_dbg(1, "(%p) S3 DISC(1)\n", lapb->dev); lapb_dbg(0, "(%p) S3 -> S2\n", lapb->dev); return LAPB_OK; } int lapb_disconnect_request(struct net_device *dev) { struct lapb_cb *lapb = lapb_devtostruct(dev); int rc = LAPB_BADTOKEN; if (!lapb) goto out; spin_lock_bh(&lapb->lock); rc = __lapb_disconnect_request(lapb); spin_unlock_bh(&lapb->lock); lapb_put(lapb); out: return rc; } EXPORT_SYMBOL(lapb_disconnect_request); int lapb_data_request(struct net_device *dev, struct sk_buff *skb) { struct lapb_cb *lapb = lapb_devtostruct(dev); int rc = LAPB_BADTOKEN; if (!lapb) goto out; spin_lock_bh(&lapb->lock); rc = LAPB_NOTCONNECTED; if (lapb->state != LAPB_STATE_3 && lapb->state != LAPB_STATE_4) goto out_put; skb_queue_tail(&lapb->write_queue, skb); lapb_kick(lapb); rc = LAPB_OK; out_put: spin_unlock_bh(&lapb->lock); lapb_put(lapb); out: return rc; } EXPORT_SYMBOL(lapb_data_request); int lapb_data_received(struct net_device *dev, struct sk_buff *skb) { struct lapb_cb *lapb = lapb_devtostruct(dev); int rc = LAPB_BADTOKEN; if (lapb) { spin_lock_bh(&lapb->lock); lapb_data_input(lapb, skb); spin_unlock_bh(&lapb->lock); lapb_put(lapb); rc = LAPB_OK; } return rc; } EXPORT_SYMBOL(lapb_data_received); void lapb_connect_confirmation(struct lapb_cb *lapb, int reason) { if (lapb->callbacks->connect_confirmation) lapb->callbacks->connect_confirmation(lapb->dev, reason); } void lapb_connect_indication(struct lapb_cb *lapb, int reason) { if (lapb->callbacks->connect_indication) lapb->callbacks->connect_indication(lapb->dev, reason); } void lapb_disconnect_confirmation(struct lapb_cb *lapb, int reason) { if (lapb->callbacks->disconnect_confirmation) lapb->callbacks->disconnect_confirmation(lapb->dev, reason); } void lapb_disconnect_indication(struct lapb_cb *lapb, int reason) { if (lapb->callbacks->disconnect_indication) lapb->callbacks->disconnect_indication(lapb->dev, reason); } int lapb_data_indication(struct lapb_cb *lapb, struct sk_buff *skb) { if (lapb->callbacks->data_indication) return lapb->callbacks->data_indication(lapb->dev, skb); kfree_skb(skb); return NET_RX_SUCCESS; /* For now; must be != NET_RX_DROP */ } int lapb_data_transmit(struct lapb_cb *lapb, struct sk_buff *skb) { int used = 0; if (lapb->callbacks->data_transmit) { lapb->callbacks->data_transmit(lapb->dev, skb); used = 1; } return used; } /* Handle device status changes. */ static int lapb_device_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct lapb_cb *lapb; if (!net_eq(dev_net(dev), &init_net)) return NOTIFY_DONE; if (dev->type != ARPHRD_X25) return NOTIFY_DONE; lapb = lapb_devtostruct(dev); if (!lapb) return NOTIFY_DONE; spin_lock_bh(&lapb->lock); switch (event) { case NETDEV_UP: lapb_dbg(0, "(%p) Interface up: %s\n", dev, dev->name); if (netif_carrier_ok(dev)) { lapb_dbg(0, "(%p): Carrier is already up: %s\n", dev, dev->name); if (lapb->mode & LAPB_DCE) { lapb_start_t1timer(lapb); } else { if (lapb->state == LAPB_STATE_0) { lapb->state = LAPB_STATE_1; lapb_establish_data_link(lapb); } } } break; case NETDEV_GOING_DOWN: if (netif_carrier_ok(dev)) __lapb_disconnect_request(lapb); break; case NETDEV_DOWN: lapb_dbg(0, "(%p) Interface down: %s\n", dev, dev->name); lapb_dbg(0, "(%p) S%d -> S0\n", dev, lapb->state); lapb_clear_queues(lapb); lapb->state = LAPB_STATE_0; lapb->n2count = 0; lapb_stop_t1timer(lapb); lapb_stop_t2timer(lapb); break; case NETDEV_CHANGE: if (netif_carrier_ok(dev)) { lapb_dbg(0, "(%p): Carrier detected: %s\n", dev, dev->name); if (lapb->mode & LAPB_DCE) { lapb_start_t1timer(lapb); } else { if (lapb->state == LAPB_STATE_0) { lapb->state = LAPB_STATE_1; lapb_establish_data_link(lapb); } } } else { lapb_dbg(0, "(%p) Carrier lost: %s\n", dev, dev->name); lapb_dbg(0, "(%p) S%d -> S0\n", dev, lapb->state); lapb_clear_queues(lapb); lapb->state = LAPB_STATE_0; lapb->n2count = 0; lapb_stop_t1timer(lapb); lapb_stop_t2timer(lapb); } break; } spin_unlock_bh(&lapb->lock); lapb_put(lapb); return NOTIFY_DONE; } static struct notifier_block lapb_dev_notifier = { .notifier_call = lapb_device_event, }; static int __init lapb_init(void) { return register_netdevice_notifier(&lapb_dev_notifier); } static void __exit lapb_exit(void) { WARN_ON(!list_empty(&lapb_list)); unregister_netdevice_notifier(&lapb_dev_notifier); } MODULE_AUTHOR("Jonathan Naylor <g4klx@g4klx.demon.co.uk>"); MODULE_DESCRIPTION("The X.25 Link Access Procedure B link layer protocol"); MODULE_LICENSE("GPL"); module_init(lapb_init); module_exit(lapb_exit); |
| 399 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 | /* SPDX-License-Identifier: GPL-2.0 */ /* * Because linux/module.h has tracepoints in the header, and ftrace.h * used to include this file, define_trace.h includes linux/module.h * But we do not want the module.h to override the TRACE_SYSTEM macro * variable that define_trace.h is processing, so we only set it * when module events are being processed, which would happen when * CREATE_TRACE_POINTS is defined. */ #ifdef CREATE_TRACE_POINTS #undef TRACE_SYSTEM #define TRACE_SYSTEM module #endif #if !defined(_TRACE_MODULE_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_MODULE_H #include <linux/tracepoint.h> #ifdef CONFIG_MODULES struct module; #define show_module_flags(flags) __print_flags(flags, "", \ { (1UL << TAINT_PROPRIETARY_MODULE), "P" }, \ { (1UL << TAINT_OOT_MODULE), "O" }, \ { (1UL << TAINT_FORCED_MODULE), "F" }, \ { (1UL << TAINT_CRAP), "C" }, \ { (1UL << TAINT_UNSIGNED_MODULE), "E" }) TRACE_EVENT(module_load, TP_PROTO(struct module *mod), TP_ARGS(mod), TP_STRUCT__entry( __field( unsigned int, taints ) __string( name, mod->name ) ), TP_fast_assign( __entry->taints = mod->taints; __assign_str(name, mod->name); ), TP_printk("%s %s", __get_str(name), show_module_flags(__entry->taints)) ); TRACE_EVENT(module_free, TP_PROTO(struct module *mod), TP_ARGS(mod), TP_STRUCT__entry( __string( name, mod->name ) ), TP_fast_assign( __assign_str(name, mod->name); ), TP_printk("%s", __get_str(name)) ); #ifdef CONFIG_MODULE_UNLOAD /* trace_module_get/put are only used if CONFIG_MODULE_UNLOAD is defined */ DECLARE_EVENT_CLASS(module_refcnt, TP_PROTO(struct module *mod, unsigned long ip), TP_ARGS(mod, ip), TP_STRUCT__entry( __field( unsigned long, ip ) __field( int, refcnt ) __string( name, mod->name ) ), TP_fast_assign( __entry->ip = ip; __entry->refcnt = atomic_read(&mod->refcnt); __assign_str(name, mod->name); ), TP_printk("%s call_site=%ps refcnt=%d", __get_str(name), (void *)__entry->ip, __entry->refcnt) ); DEFINE_EVENT(module_refcnt, module_get, TP_PROTO(struct module *mod, unsigned long ip), TP_ARGS(mod, ip) ); DEFINE_EVENT(module_refcnt, module_put, TP_PROTO(struct module *mod, unsigned long ip), TP_ARGS(mod, ip) ); #endif /* CONFIG_MODULE_UNLOAD */ TRACE_EVENT(module_request, TP_PROTO(char *name, bool wait, unsigned long ip), TP_ARGS(name, wait, ip), TP_STRUCT__entry( __field( unsigned long, ip ) __field( bool, wait ) __string( name, name ) ), TP_fast_assign( __entry->ip = ip; __entry->wait = wait; __assign_str(name, name); ), TP_printk("%s wait=%d call_site=%ps", __get_str(name), (int)__entry->wait, (void *)__entry->ip) ); #endif /* CONFIG_MODULES */ #endif /* _TRACE_MODULE_H */ /* This part must be outside protection */ #include <trace/define_trace.h> |
| 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 | /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * Stubs for the Network PHY library */ #include <linux/rtnetlink.h> struct kernel_hwtstamp_config; struct netlink_ext_ack; struct phy_device; #if IS_ENABLED(CONFIG_PHYLIB) extern const struct phylib_stubs *phylib_stubs; struct phylib_stubs { int (*hwtstamp_get)(struct phy_device *phydev, struct kernel_hwtstamp_config *config); int (*hwtstamp_set)(struct phy_device *phydev, struct kernel_hwtstamp_config *config, struct netlink_ext_ack *extack); }; static inline int phy_hwtstamp_get(struct phy_device *phydev, struct kernel_hwtstamp_config *config) { /* phylib_register_stubs() and phylib_unregister_stubs() * also run under rtnl_lock(). */ ASSERT_RTNL(); if (!phylib_stubs) return -EOPNOTSUPP; return phylib_stubs->hwtstamp_get(phydev, config); } static inline int phy_hwtstamp_set(struct phy_device *phydev, struct kernel_hwtstamp_config *config, struct netlink_ext_ack *extack) { /* phylib_register_stubs() and phylib_unregister_stubs() * also run under rtnl_lock(). */ ASSERT_RTNL(); if (!phylib_stubs) return -EOPNOTSUPP; return phylib_stubs->hwtstamp_set(phydev, config, extack); } #else static inline int phy_hwtstamp_get(struct phy_device *phydev, struct kernel_hwtstamp_config *config) { return -EOPNOTSUPP; } static inline int phy_hwtstamp_set(struct phy_device *phydev, struct kernel_hwtstamp_config *config, struct netlink_ext_ack *extack) { return -EOPNOTSUPP; } #endif |
| 818 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 | /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM fib6 #if !defined(_TRACE_FIB6_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_FIB6_H #include <linux/in6.h> #include <net/flow.h> #include <net/ip6_fib.h> #include <linux/tracepoint.h> TRACE_EVENT(fib6_table_lookup, TP_PROTO(const struct net *net, const struct fib6_result *res, struct fib6_table *table, const struct flowi6 *flp), TP_ARGS(net, res, table, flp), TP_STRUCT__entry( __field( u32, tb_id ) __field( int, err ) __field( int, oif ) __field( int, iif ) __field( __u8, tos ) __field( __u8, scope ) __field( __u8, flags ) __array( __u8, src, 16 ) __array( __u8, dst, 16 ) __field( u16, sport ) __field( u16, dport ) __field( u8, proto ) __field( u8, rt_type ) __array( char, name, IFNAMSIZ ) __array( __u8, gw, 16 ) ), TP_fast_assign( struct in6_addr *in6; __entry->tb_id = table->tb6_id; __entry->err = ip6_rt_type_to_error(res->fib6_type); __entry->oif = flp->flowi6_oif; __entry->iif = flp->flowi6_iif; __entry->tos = ip6_tclass(flp->flowlabel); __entry->scope = flp->flowi6_scope; __entry->flags = flp->flowi6_flags; in6 = (struct in6_addr *)__entry->src; *in6 = flp->saddr; in6 = (struct in6_addr *)__entry->dst; *in6 = flp->daddr; __entry->proto = flp->flowi6_proto; if (__entry->proto == IPPROTO_TCP || __entry->proto == IPPROTO_UDP) { __entry->sport = ntohs(flp->fl6_sport); __entry->dport = ntohs(flp->fl6_dport); } else { __entry->sport = 0; __entry->dport = 0; } if (res->nh && res->nh->fib_nh_dev) { strscpy(__entry->name, res->nh->fib_nh_dev->name, IFNAMSIZ); } else { strcpy(__entry->name, "-"); } if (res->f6i == net->ipv6.fib6_null_entry) { in6 = (struct in6_addr *)__entry->gw; *in6 = in6addr_any; } else if (res->nh) { in6 = (struct in6_addr *)__entry->gw; *in6 = res->nh->fib_nh_gw6; } ), TP_printk("table %3u oif %d iif %d proto %u %pI6c/%u -> %pI6c/%u tos %d scope %d flags %x ==> dev %s gw %pI6c err %d", __entry->tb_id, __entry->oif, __entry->iif, __entry->proto, __entry->src, __entry->sport, __entry->dst, __entry->dport, __entry->tos, __entry->scope, __entry->flags, __entry->name, __entry->gw, __entry->err) ); #endif /* _TRACE_FIB6_H */ /* This part must be outside protection */ #include <trace/define_trace.h> |
| 50 14 14 14 14 14 14 14 52 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 34 24 24 34 34 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 | // SPDX-License-Identifier: GPL-2.0-or-later /* * drivers/net/bond/bond_netlink.c - Netlink interface for bonding * Copyright (c) 2013 Jiri Pirko <jiri@resnulli.us> * Copyright (c) 2013 Scott Feldman <sfeldma@cumulusnetworks.com> */ #include <linux/module.h> #include <linux/errno.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/if_link.h> #include <linux/if_ether.h> #include <net/netlink.h> #include <net/rtnetlink.h> #include <net/bonding.h> #include <net/ipv6.h> static size_t bond_get_slave_size(const struct net_device *bond_dev, const struct net_device *slave_dev) { return nla_total_size(sizeof(u8)) + /* IFLA_BOND_SLAVE_STATE */ nla_total_size(sizeof(u8)) + /* IFLA_BOND_SLAVE_MII_STATUS */ nla_total_size(sizeof(u32)) + /* IFLA_BOND_SLAVE_LINK_FAILURE_COUNT */ nla_total_size(MAX_ADDR_LEN) + /* IFLA_BOND_SLAVE_PERM_HWADDR */ nla_total_size(sizeof(u16)) + /* IFLA_BOND_SLAVE_QUEUE_ID */ nla_total_size(sizeof(u16)) + /* IFLA_BOND_SLAVE_AD_AGGREGATOR_ID */ nla_total_size(sizeof(u8)) + /* IFLA_BOND_SLAVE_AD_ACTOR_OPER_PORT_STATE */ nla_total_size(sizeof(u16)) + /* IFLA_BOND_SLAVE_AD_PARTNER_OPER_PORT_STATE */ nla_total_size(sizeof(s32)) + /* IFLA_BOND_SLAVE_PRIO */ 0; } static int bond_fill_slave_info(struct sk_buff *skb, const struct net_device *bond_dev, const struct net_device *slave_dev) { struct slave *slave = bond_slave_get_rtnl(slave_dev); if (nla_put_u8(skb, IFLA_BOND_SLAVE_STATE, bond_slave_state(slave))) goto nla_put_failure; if (nla_put_u8(skb, IFLA_BOND_SLAVE_MII_STATUS, slave->link)) goto nla_put_failure; if (nla_put_u32(skb, IFLA_BOND_SLAVE_LINK_FAILURE_COUNT, slave->link_failure_count)) goto nla_put_failure; if (nla_put(skb, IFLA_BOND_SLAVE_PERM_HWADDR, slave_dev->addr_len, slave->perm_hwaddr)) goto nla_put_failure; if (nla_put_u16(skb, IFLA_BOND_SLAVE_QUEUE_ID, slave->queue_id)) goto nla_put_failure; if (nla_put_s32(skb, IFLA_BOND_SLAVE_PRIO, slave->prio)) goto nla_put_failure; if (BOND_MODE(slave->bond) == BOND_MODE_8023AD) { const struct aggregator *agg; const struct port *ad_port; ad_port = &SLAVE_AD_INFO(slave)->port; agg = SLAVE_AD_INFO(slave)->port.aggregator; if (agg) { if (nla_put_u16(skb, IFLA_BOND_SLAVE_AD_AGGREGATOR_ID, agg->aggregator_identifier)) goto nla_put_failure; if (nla_put_u8(skb, IFLA_BOND_SLAVE_AD_ACTOR_OPER_PORT_STATE, ad_port->actor_oper_port_state)) goto nla_put_failure; if (nla_put_u16(skb, IFLA_BOND_SLAVE_AD_PARTNER_OPER_PORT_STATE, ad_port->partner_oper.port_state)) goto nla_put_failure; } } return 0; nla_put_failure: return -EMSGSIZE; } /* Limit the max delay range to 300s */ static const struct netlink_range_validation delay_range = { .max = 300000, }; static const struct nla_policy bond_policy[IFLA_BOND_MAX + 1] = { [IFLA_BOND_MODE] = { .type = NLA_U8 }, [IFLA_BOND_ACTIVE_SLAVE] = { .type = NLA_U32 }, [IFLA_BOND_MIIMON] = { .type = NLA_U32 }, [IFLA_BOND_UPDELAY] = { .type = NLA_U32 }, [IFLA_BOND_DOWNDELAY] = { .type = NLA_U32 }, [IFLA_BOND_USE_CARRIER] = { .type = NLA_U8 }, [IFLA_BOND_ARP_INTERVAL] = { .type = NLA_U32 }, [IFLA_BOND_ARP_IP_TARGET] = { .type = NLA_NESTED }, [IFLA_BOND_ARP_VALIDATE] = { .type = NLA_U32 }, [IFLA_BOND_ARP_ALL_TARGETS] = { .type = NLA_U32 }, [IFLA_BOND_PRIMARY] = { .type = NLA_U32 }, [IFLA_BOND_PRIMARY_RESELECT] = { .type = NLA_U8 }, [IFLA_BOND_FAIL_OVER_MAC] = { .type = NLA_U8 }, [IFLA_BOND_XMIT_HASH_POLICY] = { .type = NLA_U8 }, [IFLA_BOND_RESEND_IGMP] = { .type = NLA_U32 }, [IFLA_BOND_NUM_PEER_NOTIF] = { .type = NLA_U8 }, [IFLA_BOND_ALL_SLAVES_ACTIVE] = { .type = NLA_U8 }, [IFLA_BOND_MIN_LINKS] = { .type = NLA_U32 }, [IFLA_BOND_LP_INTERVAL] = { .type = NLA_U32 }, [IFLA_BOND_PACKETS_PER_SLAVE] = { .type = NLA_U32 }, [IFLA_BOND_AD_LACP_ACTIVE] = { .type = NLA_U8 }, [IFLA_BOND_AD_LACP_RATE] = { .type = NLA_U8 }, [IFLA_BOND_AD_SELECT] = { .type = NLA_U8 }, [IFLA_BOND_AD_INFO] = { .type = NLA_NESTED }, [IFLA_BOND_AD_ACTOR_SYS_PRIO] = { .type = NLA_U16 }, [IFLA_BOND_AD_USER_PORT_KEY] = { .type = NLA_U16 }, [IFLA_BOND_AD_ACTOR_SYSTEM] = { .type = NLA_BINARY, .len = ETH_ALEN }, [IFLA_BOND_TLB_DYNAMIC_LB] = { .type = NLA_U8 }, [IFLA_BOND_PEER_NOTIF_DELAY] = NLA_POLICY_FULL_RANGE(NLA_U32, &delay_range), [IFLA_BOND_MISSED_MAX] = { .type = NLA_U8 }, [IFLA_BOND_NS_IP6_TARGET] = { .type = NLA_NESTED }, }; static const struct nla_policy bond_slave_policy[IFLA_BOND_SLAVE_MAX + 1] = { [IFLA_BOND_SLAVE_QUEUE_ID] = { .type = NLA_U16 }, [IFLA_BOND_SLAVE_PRIO] = { .type = NLA_S32 }, }; static int bond_validate(struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { if (tb[IFLA_ADDRESS]) { if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) return -EINVAL; if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) return -EADDRNOTAVAIL; } return 0; } static int bond_slave_changelink(struct net_device *bond_dev, struct net_device *slave_dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct bonding *bond = netdev_priv(bond_dev); struct bond_opt_value newval; int err; if (!data) return 0; if (data[IFLA_BOND_SLAVE_QUEUE_ID]) { u16 queue_id = nla_get_u16(data[IFLA_BOND_SLAVE_QUEUE_ID]); char queue_id_str[IFNAMSIZ + 7]; /* queue_id option setting expects slave_name:queue_id */ snprintf(queue_id_str, sizeof(queue_id_str), "%s:%u\n", slave_dev->name, queue_id); bond_opt_initstr(&newval, queue_id_str); err = __bond_opt_set(bond, BOND_OPT_QUEUE_ID, &newval, data[IFLA_BOND_SLAVE_QUEUE_ID], extack); if (err) return err; } if (data[IFLA_BOND_SLAVE_PRIO]) { int prio = nla_get_s32(data[IFLA_BOND_SLAVE_PRIO]); bond_opt_slave_initval(&newval, &slave_dev, prio); err = __bond_opt_set(bond, BOND_OPT_PRIO, &newval, data[IFLA_BOND_SLAVE_PRIO], extack); if (err) return err; } return 0; } static int bond_changelink(struct net_device *bond_dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct bonding *bond = netdev_priv(bond_dev); struct bond_opt_value newval; int miimon = 0; int err; if (!data) return 0; if (data[IFLA_BOND_MODE]) { int mode = nla_get_u8(data[IFLA_BOND_MODE]); bond_opt_initval(&newval, mode); err = __bond_opt_set(bond, BOND_OPT_MODE, &newval, data[IFLA_BOND_MODE], extack); if (err) return err; } if (data[IFLA_BOND_ACTIVE_SLAVE]) { int ifindex = nla_get_u32(data[IFLA_BOND_ACTIVE_SLAVE]); struct net_device *slave_dev; char *active_slave = ""; if (ifindex != 0) { slave_dev = __dev_get_by_index(dev_net(bond_dev), ifindex); if (!slave_dev) return -ENODEV; active_slave = slave_dev->name; } bond_opt_initstr(&newval, active_slave); err = __bond_opt_set(bond, BOND_OPT_ACTIVE_SLAVE, &newval, data[IFLA_BOND_ACTIVE_SLAVE], extack); if (err) return err; } if (data[IFLA_BOND_MIIMON]) { miimon = nla_get_u32(data[IFLA_BOND_MIIMON]); bond_opt_initval(&newval, miimon); err = __bond_opt_set(bond, BOND_OPT_MIIMON, &newval, data[IFLA_BOND_MIIMON], extack); if (err) return err; } if (data[IFLA_BOND_UPDELAY]) { int updelay = nla_get_u32(data[IFLA_BOND_UPDELAY]); bond_opt_initval(&newval, updelay); err = __bond_opt_set(bond, BOND_OPT_UPDELAY, &newval, data[IFLA_BOND_UPDELAY], extack); if (err) return err; } if (data[IFLA_BOND_DOWNDELAY]) { int downdelay = nla_get_u32(data[IFLA_BOND_DOWNDELAY]); bond_opt_initval(&newval, downdelay); err = __bond_opt_set(bond, BOND_OPT_DOWNDELAY, &newval, data[IFLA_BOND_DOWNDELAY], extack); if (err) return err; } if (data[IFLA_BOND_PEER_NOTIF_DELAY]) { int delay = nla_get_u32(data[IFLA_BOND_PEER_NOTIF_DELAY]); bond_opt_initval(&newval, delay); err = __bond_opt_set(bond, BOND_OPT_PEER_NOTIF_DELAY, &newval, data[IFLA_BOND_PEER_NOTIF_DELAY], extack); if (err) return err; } if (data[IFLA_BOND_USE_CARRIER]) { int use_carrier = nla_get_u8(data[IFLA_BOND_USE_CARRIER]); bond_opt_initval(&newval, use_carrier); err = __bond_opt_set(bond, BOND_OPT_USE_CARRIER, &newval, data[IFLA_BOND_USE_CARRIER], extack); if (err) return err; } if (data[IFLA_BOND_ARP_INTERVAL]) { int arp_interval = nla_get_u32(data[IFLA_BOND_ARP_INTERVAL]); if (arp_interval && miimon) { NL_SET_ERR_MSG_ATTR(extack, data[IFLA_BOND_ARP_INTERVAL], "ARP monitoring cannot be used with MII monitoring"); return -EINVAL; } bond_opt_initval(&newval, arp_interval); err = __bond_opt_set(bond, BOND_OPT_ARP_INTERVAL, &newval, data[IFLA_BOND_ARP_INTERVAL], extack); if (err) return err; } if (data[IFLA_BOND_ARP_IP_TARGET]) { struct nlattr *attr; int i = 0, rem; bond_option_arp_ip_targets_clear(bond); nla_for_each_nested(attr, data[IFLA_BOND_ARP_IP_TARGET], rem) { __be32 target; if (nla_len(attr) < sizeof(target)) return -EINVAL; target = nla_get_be32(attr); bond_opt_initval(&newval, (__force u64)target); err = __bond_opt_set(bond, BOND_OPT_ARP_TARGETS, &newval, data[IFLA_BOND_ARP_IP_TARGET], extack); if (err) break; i++; } if (i == 0 && bond->params.arp_interval) netdev_warn(bond->dev, "Removing last arp target with arp_interval on\n"); if (err) return err; } #if IS_ENABLED(CONFIG_IPV6) if (data[IFLA_BOND_NS_IP6_TARGET]) { struct nlattr *attr; int i = 0, rem; bond_option_ns_ip6_targets_clear(bond); nla_for_each_nested(attr, data[IFLA_BOND_NS_IP6_TARGET], rem) { struct in6_addr addr6; if (nla_len(attr) < sizeof(addr6)) { NL_SET_ERR_MSG(extack, "Invalid IPv6 address"); return -EINVAL; } addr6 = nla_get_in6_addr(attr); bond_opt_initextra(&newval, &addr6, sizeof(addr6)); err = __bond_opt_set(bond, BOND_OPT_NS_TARGETS, &newval, data[IFLA_BOND_NS_IP6_TARGET], extack); if (err) break; i++; } if (i == 0 && bond->params.arp_interval) netdev_warn(bond->dev, "Removing last ns target with arp_interval on\n"); if (err) return err; } #endif if (data[IFLA_BOND_ARP_VALIDATE]) { int arp_validate = nla_get_u32(data[IFLA_BOND_ARP_VALIDATE]); if (arp_validate && miimon) { NL_SET_ERR_MSG_ATTR(extack, data[IFLA_BOND_ARP_INTERVAL], "ARP validating cannot be used with MII monitoring"); return -EINVAL; } bond_opt_initval(&newval, arp_validate); err = __bond_opt_set(bond, BOND_OPT_ARP_VALIDATE, &newval, data[IFLA_BOND_ARP_VALIDATE], extack); if (err) return err; } if (data[IFLA_BOND_ARP_ALL_TARGETS]) { int arp_all_targets = nla_get_u32(data[IFLA_BOND_ARP_ALL_TARGETS]); bond_opt_initval(&newval, arp_all_targets); err = __bond_opt_set(bond, BOND_OPT_ARP_ALL_TARGETS, &newval, data[IFLA_BOND_ARP_ALL_TARGETS], extack); if (err) return err; } if (data[IFLA_BOND_PRIMARY]) { int ifindex = nla_get_u32(data[IFLA_BOND_PRIMARY]); struct net_device *dev; char *primary = ""; dev = __dev_get_by_index(dev_net(bond_dev), ifindex); if (dev) primary = dev->name; bond_opt_initstr(&newval, primary); err = __bond_opt_set(bond, BOND_OPT_PRIMARY, &newval, data[IFLA_BOND_PRIMARY], extack); if (err) return err; } if (data[IFLA_BOND_PRIMARY_RESELECT]) { int primary_reselect = nla_get_u8(data[IFLA_BOND_PRIMARY_RESELECT]); bond_opt_initval(&newval, primary_reselect); err = __bond_opt_set(bond, BOND_OPT_PRIMARY_RESELECT, &newval, data[IFLA_BOND_PRIMARY_RESELECT], extack); if (err) return err; } if (data[IFLA_BOND_FAIL_OVER_MAC]) { int fail_over_mac = nla_get_u8(data[IFLA_BOND_FAIL_OVER_MAC]); bond_opt_initval(&newval, fail_over_mac); err = __bond_opt_set(bond, BOND_OPT_FAIL_OVER_MAC, &newval, data[IFLA_BOND_FAIL_OVER_MAC], extack); if (err) return err; } if (data[IFLA_BOND_XMIT_HASH_POLICY]) { int xmit_hash_policy = nla_get_u8(data[IFLA_BOND_XMIT_HASH_POLICY]); bond_opt_initval(&newval, xmit_hash_policy); err = __bond_opt_set(bond, BOND_OPT_XMIT_HASH, &newval, data[IFLA_BOND_XMIT_HASH_POLICY], extack); if (err) return err; } if (data[IFLA_BOND_RESEND_IGMP]) { int resend_igmp = nla_get_u32(data[IFLA_BOND_RESEND_IGMP]); bond_opt_initval(&newval, resend_igmp); err = __bond_opt_set(bond, BOND_OPT_RESEND_IGMP, &newval, data[IFLA_BOND_RESEND_IGMP], extack); if (err) return err; } if (data[IFLA_BOND_NUM_PEER_NOTIF]) { int num_peer_notif = nla_get_u8(data[IFLA_BOND_NUM_PEER_NOTIF]); bond_opt_initval(&newval, num_peer_notif); err = __bond_opt_set(bond, BOND_OPT_NUM_PEER_NOTIF, &newval, data[IFLA_BOND_NUM_PEER_NOTIF], extack); if (err) return err; } if (data[IFLA_BOND_ALL_SLAVES_ACTIVE]) { int all_slaves_active = nla_get_u8(data[IFLA_BOND_ALL_SLAVES_ACTIVE]); bond_opt_initval(&newval, all_slaves_active); err = __bond_opt_set(bond, BOND_OPT_ALL_SLAVES_ACTIVE, &newval, data[IFLA_BOND_ALL_SLAVES_ACTIVE], extack); if (err) return err; } if (data[IFLA_BOND_MIN_LINKS]) { int min_links = nla_get_u32(data[IFLA_BOND_MIN_LINKS]); bond_opt_initval(&newval, min_links); err = __bond_opt_set(bond, BOND_OPT_MINLINKS, &newval, data[IFLA_BOND_MIN_LINKS], extack); if (err) return err; } if (data[IFLA_BOND_LP_INTERVAL]) { int lp_interval = nla_get_u32(data[IFLA_BOND_LP_INTERVAL]); bond_opt_initval(&newval, lp_interval); err = __bond_opt_set(bond, BOND_OPT_LP_INTERVAL, &newval, data[IFLA_BOND_LP_INTERVAL], extack); if (err) return err; } if (data[IFLA_BOND_PACKETS_PER_SLAVE]) { int packets_per_slave = nla_get_u32(data[IFLA_BOND_PACKETS_PER_SLAVE]); bond_opt_initval(&newval, packets_per_slave); err = __bond_opt_set(bond, BOND_OPT_PACKETS_PER_SLAVE, &newval, data[IFLA_BOND_PACKETS_PER_SLAVE], extack); if (err) return err; } if (data[IFLA_BOND_AD_LACP_ACTIVE]) { int lacp_active = nla_get_u8(data[IFLA_BOND_AD_LACP_ACTIVE]); bond_opt_initval(&newval, lacp_active); err = __bond_opt_set(bond, BOND_OPT_LACP_ACTIVE, &newval, data[IFLA_BOND_AD_LACP_ACTIVE], extack); if (err) return err; } if (data[IFLA_BOND_AD_LACP_RATE]) { int lacp_rate = nla_get_u8(data[IFLA_BOND_AD_LACP_RATE]); bond_opt_initval(&newval, lacp_rate); err = __bond_opt_set(bond, BOND_OPT_LACP_RATE, &newval, data[IFLA_BOND_AD_LACP_RATE], extack); if (err) return err; } if (data[IFLA_BOND_AD_SELECT]) { int ad_select = nla_get_u8(data[IFLA_BOND_AD_SELECT]); bond_opt_initval(&newval, ad_select); err = __bond_opt_set(bond, BOND_OPT_AD_SELECT, &newval, data[IFLA_BOND_AD_SELECT], extack); if (err) return err; } if (data[IFLA_BOND_AD_ACTOR_SYS_PRIO]) { int actor_sys_prio = nla_get_u16(data[IFLA_BOND_AD_ACTOR_SYS_PRIO]); bond_opt_initval(&newval, actor_sys_prio); err = __bond_opt_set(bond, BOND_OPT_AD_ACTOR_SYS_PRIO, &newval, data[IFLA_BOND_AD_ACTOR_SYS_PRIO], extack); if (err) return err; } if (data[IFLA_BOND_AD_USER_PORT_KEY]) { int port_key = nla_get_u16(data[IFLA_BOND_AD_USER_PORT_KEY]); bond_opt_initval(&newval, port_key); err = __bond_opt_set(bond, BOND_OPT_AD_USER_PORT_KEY, &newval, data[IFLA_BOND_AD_USER_PORT_KEY], extack); if (err) return err; } if (data[IFLA_BOND_AD_ACTOR_SYSTEM]) { if (nla_len(data[IFLA_BOND_AD_ACTOR_SYSTEM]) != ETH_ALEN) return -EINVAL; bond_opt_initval(&newval, nla_get_u64(data[IFLA_BOND_AD_ACTOR_SYSTEM])); err = __bond_opt_set(bond, BOND_OPT_AD_ACTOR_SYSTEM, &newval, data[IFLA_BOND_AD_ACTOR_SYSTEM], extack); if (err) return err; } if (data[IFLA_BOND_TLB_DYNAMIC_LB]) { int dynamic_lb = nla_get_u8(data[IFLA_BOND_TLB_DYNAMIC_LB]); bond_opt_initval(&newval, dynamic_lb); err = __bond_opt_set(bond, BOND_OPT_TLB_DYNAMIC_LB, &newval, data[IFLA_BOND_TLB_DYNAMIC_LB], extack); if (err) return err; } if (data[IFLA_BOND_MISSED_MAX]) { int missed_max = nla_get_u8(data[IFLA_BOND_MISSED_MAX]); bond_opt_initval(&newval, missed_max); err = __bond_opt_set(bond, BOND_OPT_MISSED_MAX, &newval, data[IFLA_BOND_MISSED_MAX], extack); if (err) return err; } return 0; } static int bond_newlink(struct net *src_net, struct net_device *bond_dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { int err; err = bond_changelink(bond_dev, tb, data, extack); if (err < 0) return err; err = register_netdevice(bond_dev); if (!err) { struct bonding *bond = netdev_priv(bond_dev); netif_carrier_off(bond_dev); bond_work_init_all(bond); } return err; } static size_t bond_get_size(const struct net_device *bond_dev) { return nla_total_size(sizeof(u8)) + /* IFLA_BOND_MODE */ nla_total_size(sizeof(u32)) + /* IFLA_BOND_ACTIVE_SLAVE */ nla_total_size(sizeof(u32)) + /* IFLA_BOND_MIIMON */ nla_total_size(sizeof(u32)) + /* IFLA_BOND_UPDELAY */ nla_total_size(sizeof(u32)) + /* IFLA_BOND_DOWNDELAY */ nla_total_size(sizeof(u8)) + /* IFLA_BOND_USE_CARRIER */ nla_total_size(sizeof(u32)) + /* IFLA_BOND_ARP_INTERVAL */ /* IFLA_BOND_ARP_IP_TARGET */ nla_total_size(sizeof(struct nlattr)) + nla_total_size(sizeof(u32)) * BOND_MAX_ARP_TARGETS + nla_total_size(sizeof(u32)) + /* IFLA_BOND_ARP_VALIDATE */ nla_total_size(sizeof(u32)) + /* IFLA_BOND_ARP_ALL_TARGETS */ nla_total_size(sizeof(u32)) + /* IFLA_BOND_PRIMARY */ nla_total_size(sizeof(u8)) + /* IFLA_BOND_PRIMARY_RESELECT */ nla_total_size(sizeof(u8)) + /* IFLA_BOND_FAIL_OVER_MAC */ nla_total_size(sizeof(u8)) + /* IFLA_BOND_XMIT_HASH_POLICY */ nla_total_size(sizeof(u32)) + /* IFLA_BOND_RESEND_IGMP */ nla_total_size(sizeof(u8)) + /* IFLA_BOND_NUM_PEER_NOTIF */ nla_total_size(sizeof(u8)) + /* IFLA_BOND_ALL_SLAVES_ACTIVE */ nla_total_size(sizeof(u32)) + /* IFLA_BOND_MIN_LINKS */ nla_total_size(sizeof(u32)) + /* IFLA_BOND_LP_INTERVAL */ nla_total_size(sizeof(u32)) + /* IFLA_BOND_PACKETS_PER_SLAVE */ nla_total_size(sizeof(u8)) + /* IFLA_BOND_AD_LACP_ACTIVE */ nla_total_size(sizeof(u8)) + /* IFLA_BOND_AD_LACP_RATE */ nla_total_size(sizeof(u8)) + /* IFLA_BOND_AD_SELECT */ nla_total_size(sizeof(struct nlattr)) + /* IFLA_BOND_AD_INFO */ nla_total_size(sizeof(u16)) + /* IFLA_BOND_AD_INFO_AGGREGATOR */ nla_total_size(sizeof(u16)) + /* IFLA_BOND_AD_INFO_NUM_PORTS */ nla_total_size(sizeof(u16)) + /* IFLA_BOND_AD_INFO_ACTOR_KEY */ nla_total_size(sizeof(u16)) + /* IFLA_BOND_AD_INFO_PARTNER_KEY*/ nla_total_size(ETH_ALEN) + /* IFLA_BOND_AD_INFO_PARTNER_MAC*/ nla_total_size(sizeof(u16)) + /* IFLA_BOND_AD_ACTOR_SYS_PRIO */ nla_total_size(sizeof(u16)) + /* IFLA_BOND_AD_USER_PORT_KEY */ nla_total_size(ETH_ALEN) + /* IFLA_BOND_AD_ACTOR_SYSTEM */ nla_total_size(sizeof(u8)) + /* IFLA_BOND_TLB_DYNAMIC_LB */ nla_total_size(sizeof(u32)) + /* IFLA_BOND_PEER_NOTIF_DELAY */ nla_total_size(sizeof(u8)) + /* IFLA_BOND_MISSED_MAX */ /* IFLA_BOND_NS_IP6_TARGET */ nla_total_size(sizeof(struct nlattr)) + nla_total_size(sizeof(struct in6_addr)) * BOND_MAX_NS_TARGETS + 0; } static int bond_option_active_slave_get_ifindex(struct bonding *bond) { const struct net_device *slave; int ifindex; rcu_read_lock(); slave = bond_option_active_slave_get_rcu(bond); ifindex = slave ? slave->ifindex : 0; rcu_read_unlock(); return ifindex; } static int bond_fill_info(struct sk_buff *skb, const struct net_device *bond_dev) { struct bonding *bond = netdev_priv(bond_dev); unsigned int packets_per_slave; int ifindex, i, targets_added; struct nlattr *targets; struct slave *primary; if (nla_put_u8(skb, IFLA_BOND_MODE, BOND_MODE(bond))) goto nla_put_failure; ifindex = bond_option_active_slave_get_ifindex(bond); if (ifindex && nla_put_u32(skb, IFLA_BOND_ACTIVE_SLAVE, ifindex)) goto nla_put_failure; if (nla_put_u32(skb, IFLA_BOND_MIIMON, bond->params.miimon)) goto nla_put_failure; if (nla_put_u32(skb, IFLA_BOND_UPDELAY, bond->params.updelay * bond->params.miimon)) goto nla_put_failure; if (nla_put_u32(skb, IFLA_BOND_DOWNDELAY, bond->params.downdelay * bond->params.miimon)) goto nla_put_failure; if (nla_put_u32(skb, IFLA_BOND_PEER_NOTIF_DELAY, bond->params.peer_notif_delay * bond->params.miimon)) goto nla_put_failure; if (nla_put_u8(skb, IFLA_BOND_USE_CARRIER, bond->params.use_carrier)) goto nla_put_failure; if (nla_put_u32(skb, IFLA_BOND_ARP_INTERVAL, bond->params.arp_interval)) goto nla_put_failure; targets = nla_nest_start_noflag(skb, IFLA_BOND_ARP_IP_TARGET); if (!targets) goto nla_put_failure; targets_added = 0; for (i = 0; i < BOND_MAX_ARP_TARGETS; i++) { if (bond->params.arp_targets[i]) { if (nla_put_be32(skb, i, bond->params.arp_targets[i])) goto nla_put_failure; targets_added = 1; } } if (targets_added) nla_nest_end(skb, targets); else nla_nest_cancel(skb, targets); if (nla_put_u32(skb, IFLA_BOND_ARP_VALIDATE, bond->params.arp_validate)) goto nla_put_failure; if (nla_put_u32(skb, IFLA_BOND_ARP_ALL_TARGETS, bond->params.arp_all_targets)) goto nla_put_failure; #if IS_ENABLED(CONFIG_IPV6) targets = nla_nest_start(skb, IFLA_BOND_NS_IP6_TARGET); if (!targets) goto nla_put_failure; targets_added = 0; for (i = 0; i < BOND_MAX_NS_TARGETS; i++) { if (!ipv6_addr_any(&bond->params.ns_targets[i])) { if (nla_put_in6_addr(skb, i, &bond->params.ns_targets[i])) goto nla_put_failure; targets_added = 1; } } if (targets_added) nla_nest_end(skb, targets); else nla_nest_cancel(skb, targets); #endif primary = rtnl_dereference(bond->primary_slave); if (primary && nla_put_u32(skb, IFLA_BOND_PRIMARY, primary->dev->ifindex)) goto nla_put_failure; if (nla_put_u8(skb, IFLA_BOND_PRIMARY_RESELECT, bond->params.primary_reselect)) goto nla_put_failure; if (nla_put_u8(skb, IFLA_BOND_FAIL_OVER_MAC, bond->params.fail_over_mac)) goto nla_put_failure; if (nla_put_u8(skb, IFLA_BOND_XMIT_HASH_POLICY, bond->params.xmit_policy)) goto nla_put_failure; if (nla_put_u32(skb, IFLA_BOND_RESEND_IGMP, bond->params.resend_igmp)) goto nla_put_failure; if (nla_put_u8(skb, IFLA_BOND_NUM_PEER_NOTIF, bond->params.num_peer_notif)) goto nla_put_failure; if (nla_put_u8(skb, IFLA_BOND_ALL_SLAVES_ACTIVE, bond->params.all_slaves_active)) goto nla_put_failure; if (nla_put_u32(skb, IFLA_BOND_MIN_LINKS, bond->params.min_links)) goto nla_put_failure; if (nla_put_u32(skb, IFLA_BOND_LP_INTERVAL, bond->params.lp_interval)) goto nla_put_failure; packets_per_slave = bond->params.packets_per_slave; if (nla_put_u32(skb, IFLA_BOND_PACKETS_PER_SLAVE, packets_per_slave)) goto nla_put_failure; if (nla_put_u8(skb, IFLA_BOND_AD_LACP_ACTIVE, bond->params.lacp_active)) goto nla_put_failure; if (nla_put_u8(skb, IFLA_BOND_AD_LACP_RATE, bond->params.lacp_fast)) goto nla_put_failure; if (nla_put_u8(skb, IFLA_BOND_AD_SELECT, bond->params.ad_select)) goto nla_put_failure; if (nla_put_u8(skb, IFLA_BOND_TLB_DYNAMIC_LB, bond->params.tlb_dynamic_lb)) goto nla_put_failure; if (nla_put_u8(skb, IFLA_BOND_MISSED_MAX, bond->params.missed_max)) goto nla_put_failure; if (BOND_MODE(bond) == BOND_MODE_8023AD) { struct ad_info info; if (capable(CAP_NET_ADMIN)) { if (nla_put_u16(skb, IFLA_BOND_AD_ACTOR_SYS_PRIO, bond->params.ad_actor_sys_prio)) goto nla_put_failure; if (nla_put_u16(skb, IFLA_BOND_AD_USER_PORT_KEY, bond->params.ad_user_port_key)) goto nla_put_failure; if (nla_put(skb, IFLA_BOND_AD_ACTOR_SYSTEM, ETH_ALEN, &bond->params.ad_actor_system)) goto nla_put_failure; } if (!bond_3ad_get_active_agg_info(bond, &info)) { struct nlattr *nest; nest = nla_nest_start_noflag(skb, IFLA_BOND_AD_INFO); if (!nest) goto nla_put_failure; if (nla_put_u16(skb, IFLA_BOND_AD_INFO_AGGREGATOR, info.aggregator_id)) goto nla_put_failure; if (nla_put_u16(skb, IFLA_BOND_AD_INFO_NUM_PORTS, info.ports)) goto nla_put_failure; if (nla_put_u16(skb, IFLA_BOND_AD_INFO_ACTOR_KEY, info.actor_key)) goto nla_put_failure; if (nla_put_u16(skb, IFLA_BOND_AD_INFO_PARTNER_KEY, info.partner_key)) goto nla_put_failure; if (nla_put(skb, IFLA_BOND_AD_INFO_PARTNER_MAC, sizeof(info.partner_system), &info.partner_system)) goto nla_put_failure; nla_nest_end(skb, nest); } } return 0; nla_put_failure: return -EMSGSIZE; } static size_t bond_get_linkxstats_size(const struct net_device *dev, int attr) { switch (attr) { case IFLA_STATS_LINK_XSTATS: case IFLA_STATS_LINK_XSTATS_SLAVE: break; default: return 0; } return bond_3ad_stats_size() + nla_total_size(0); } static int bond_fill_linkxstats(struct sk_buff *skb, const struct net_device *dev, int *prividx, int attr) { struct nlattr *nla __maybe_unused; struct slave *slave = NULL; struct nlattr *nest, *nest2; struct bonding *bond; switch (attr) { case IFLA_STATS_LINK_XSTATS: bond = netdev_priv(dev); break; case IFLA_STATS_LINK_XSTATS_SLAVE: slave = bond_slave_get_rtnl(dev); if (!slave) return 0; bond = slave->bond; break; default: return -EINVAL; } nest = nla_nest_start_noflag(skb, LINK_XSTATS_TYPE_BOND); if (!nest) return -EMSGSIZE; if (BOND_MODE(bond) == BOND_MODE_8023AD) { struct bond_3ad_stats *stats; if (slave) stats = &SLAVE_AD_INFO(slave)->stats; else stats = &BOND_AD_INFO(bond).stats; nest2 = nla_nest_start_noflag(skb, BOND_XSTATS_3AD); if (!nest2) { nla_nest_end(skb, nest); return -EMSGSIZE; } if (bond_3ad_stats_fill(skb, stats)) { nla_nest_cancel(skb, nest2); nla_nest_end(skb, nest); return -EMSGSIZE; } nla_nest_end(skb, nest2); } nla_nest_end(skb, nest); return 0; } struct rtnl_link_ops bond_link_ops __read_mostly = { .kind = "bond", .priv_size = sizeof(struct bonding), .setup = bond_setup, .maxtype = IFLA_BOND_MAX, .policy = bond_policy, .validate = bond_validate, .newlink = bond_newlink, .changelink = bond_changelink, .get_size = bond_get_size, .fill_info = bond_fill_info, .get_num_tx_queues = bond_get_num_tx_queues, .get_num_rx_queues = bond_get_num_tx_queues, /* Use the same number as for TX queues */ .fill_linkxstats = bond_fill_linkxstats, .get_linkxstats_size = bond_get_linkxstats_size, .slave_maxtype = IFLA_BOND_SLAVE_MAX, .slave_policy = bond_slave_policy, .slave_changelink = bond_slave_changelink, .get_slave_size = bond_get_slave_size, .fill_slave_info = bond_fill_slave_info, }; int __init bond_netlink_init(void) { return rtnl_link_register(&bond_link_ops); } void bond_netlink_fini(void) { rtnl_link_unregister(&bond_link_ops); } MODULE_ALIAS_RTNL_LINK("bond"); |
| 1523 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_RTNETLINK_H #define __LINUX_RTNETLINK_H #include <linux/mutex.h> #include <linux/netdevice.h> #include <linux/wait.h> #include <linux/refcount.h> #include <uapi/linux/rtnetlink.h> extern int rtnetlink_send(struct sk_buff *skb, struct net *net, u32 pid, u32 group, int echo); static inline int rtnetlink_maybe_send(struct sk_buff *skb, struct net *net, u32 pid, u32 group, int echo) { return !skb ? 0 : rtnetlink_send(skb, net, pid, group, echo); } extern int rtnl_unicast(struct sk_buff *skb, struct net *net, u32 pid); extern void rtnl_notify(struct sk_buff *skb, struct net *net, u32 pid, u32 group, const struct nlmsghdr *nlh, gfp_t flags); extern void rtnl_set_sk_err(struct net *net, u32 group, int error); extern int rtnetlink_put_metrics(struct sk_buff *skb, u32 *metrics); extern int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst, u32 id, long expires, u32 error); void rtmsg_ifinfo(int type, struct net_device *dev, unsigned int change, gfp_t flags, u32 portid, const struct nlmsghdr *nlh); void rtmsg_ifinfo_newnet(int type, struct net_device *dev, unsigned int change, gfp_t flags, int *new_nsid, int new_ifindex); struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev, unsigned change, u32 event, gfp_t flags, int *new_nsid, int new_ifindex, u32 portid, const struct nlmsghdr *nlh); void rtmsg_ifinfo_send(struct sk_buff *skb, struct net_device *dev, gfp_t flags, u32 portid, const struct nlmsghdr *nlh); /* RTNL is used as a global lock for all changes to network configuration */ extern void rtnl_lock(void); extern void rtnl_unlock(void); extern int rtnl_trylock(void); extern int rtnl_is_locked(void); extern int rtnl_lock_killable(void); extern bool refcount_dec_and_rtnl_lock(refcount_t *r); extern wait_queue_head_t netdev_unregistering_wq; extern struct rw_semaphore pernet_ops_rwsem; extern struct rw_semaphore net_rwsem; #ifdef CONFIG_PROVE_LOCKING extern bool lockdep_rtnl_is_held(void); #else static inline bool lockdep_rtnl_is_held(void) { return true; } #endif /* #ifdef CONFIG_PROVE_LOCKING */ /** * rcu_dereference_rtnl - rcu_dereference with debug checking * @p: The pointer to read, prior to dereferencing * * Do an rcu_dereference(p), but check caller either holds rcu_read_lock() * or RTNL. Note : Please prefer rtnl_dereference() or rcu_dereference() */ #define rcu_dereference_rtnl(p) \ rcu_dereference_check(p, lockdep_rtnl_is_held()) /** * rtnl_dereference - fetch RCU pointer when updates are prevented by RTNL * @p: The pointer to read, prior to dereferencing * * Return the value of the specified RCU-protected pointer, but omit * the READ_ONCE(), because caller holds RTNL. */ #define rtnl_dereference(p) \ rcu_dereference_protected(p, lockdep_rtnl_is_held()) /** * rcu_replace_pointer_rtnl - replace an RCU pointer under rtnl_lock, returning * its old value * @rp: RCU pointer, whose value is returned * @p: regular pointer * * Perform a replacement under rtnl_lock, where @rp is an RCU-annotated * pointer. The old value of @rp is returned, and @rp is set to @p */ #define rcu_replace_pointer_rtnl(rp, p) \ rcu_replace_pointer(rp, p, lockdep_rtnl_is_held()) static inline struct netdev_queue *dev_ingress_queue(struct net_device *dev) { return rtnl_dereference(dev->ingress_queue); } static inline struct netdev_queue *dev_ingress_queue_rcu(struct net_device *dev) { return rcu_dereference(dev->ingress_queue); } struct netdev_queue *dev_ingress_queue_create(struct net_device *dev); #ifdef CONFIG_NET_INGRESS void net_inc_ingress_queue(void); void net_dec_ingress_queue(void); #endif #ifdef CONFIG_NET_EGRESS void net_inc_egress_queue(void); void net_dec_egress_queue(void); void netdev_xmit_skip_txqueue(bool skip); #endif void rtnetlink_init(void); void __rtnl_unlock(void); void rtnl_kfree_skbs(struct sk_buff *head, struct sk_buff *tail); #define ASSERT_RTNL() \ WARN_ONCE(!rtnl_is_locked(), \ "RTNL: assertion failed at %s (%d)\n", __FILE__, __LINE__) extern int ndo_dflt_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb, struct net_device *dev, struct net_device *filter_dev, int *idx); extern int ndo_dflt_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *dev, const unsigned char *addr, u16 vid, u16 flags); extern int ndo_dflt_fdb_del(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *dev, const unsigned char *addr, u16 vid); extern int ndo_dflt_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq, struct net_device *dev, u16 mode, u32 flags, u32 mask, int nlflags, u32 filter_mask, int (*vlan_fill)(struct sk_buff *skb, struct net_device *dev, u32 filter_mask)); extern void rtnl_offload_xstats_notify(struct net_device *dev); static inline int rtnl_has_listeners(const struct net *net, u32 group) { struct sock *rtnl = net->rtnl; return netlink_has_listeners(rtnl, group); } /** * rtnl_notify_needed - check if notification is needed * @net: Pointer to the net namespace * @nlflags: netlink ingress message flags * @group: rtnl group * * Based on the ingress message flags and rtnl group, returns true * if a notification is needed, false otherwise. */ static inline bool rtnl_notify_needed(const struct net *net, u16 nlflags, u32 group) { return (nlflags & NLM_F_ECHO) || rtnl_has_listeners(net, group); } #endif /* __LINUX_RTNETLINK_H */ |
| 34 33 34 32 3 1 1 31 2 2 2 1 1 1 1 9 1 2 92 88 6 6 84 32 3 2 68 92 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 | // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C)2003,2004 USAGI/WIDE Project * * Author: * Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp> */ #include <linux/types.h> #include <linux/timer.h> #include <linux/module.h> #include <linux/netfilter.h> #include <linux/in6.h> #include <linux/icmpv6.h> #include <linux/ipv6.h> #include <net/ipv6.h> #include <net/ip6_checksum.h> #include <linux/seq_file.h> #include <linux/netfilter_ipv6.h> #include <net/netfilter/nf_conntrack_tuple.h> #include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/nf_conntrack_timeout.h> #include <net/netfilter/nf_conntrack_zones.h> #include <net/netfilter/nf_log.h> #include "nf_internals.h" static const unsigned int nf_ct_icmpv6_timeout = 30*HZ; bool icmpv6_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, struct net *net, struct nf_conntrack_tuple *tuple) { const struct icmp6hdr *hp; struct icmp6hdr _hdr; hp = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr); if (hp == NULL) return false; tuple->dst.u.icmp.type = hp->icmp6_type; tuple->src.u.icmp.id = hp->icmp6_identifier; tuple->dst.u.icmp.code = hp->icmp6_code; return true; } /* Add 1; spaces filled with 0. */ static const u_int8_t invmap[] = { [ICMPV6_ECHO_REQUEST - 128] = ICMPV6_ECHO_REPLY + 1, [ICMPV6_ECHO_REPLY - 128] = ICMPV6_ECHO_REQUEST + 1, [ICMPV6_NI_QUERY - 128] = ICMPV6_NI_REPLY + 1, [ICMPV6_NI_REPLY - 128] = ICMPV6_NI_QUERY + 1 }; static const u_int8_t noct_valid_new[] = { [ICMPV6_MGM_QUERY - 130] = 1, [ICMPV6_MGM_REPORT - 130] = 1, [ICMPV6_MGM_REDUCTION - 130] = 1, [NDISC_ROUTER_SOLICITATION - 130] = 1, [NDISC_ROUTER_ADVERTISEMENT - 130] = 1, [NDISC_NEIGHBOUR_SOLICITATION - 130] = 1, [NDISC_NEIGHBOUR_ADVERTISEMENT - 130] = 1, [ICMPV6_MLD2_REPORT - 130] = 1 }; bool nf_conntrack_invert_icmpv6_tuple(struct nf_conntrack_tuple *tuple, const struct nf_conntrack_tuple *orig) { int type = orig->dst.u.icmp.type - 128; if (type < 0 || type >= sizeof(invmap) || !invmap[type]) return false; tuple->src.u.icmp.id = orig->src.u.icmp.id; tuple->dst.u.icmp.type = invmap[type] - 1; tuple->dst.u.icmp.code = orig->dst.u.icmp.code; return true; } static unsigned int *icmpv6_get_timeouts(struct net *net) { return &nf_icmpv6_pernet(net)->timeout; } /* Returns verdict for packet, or -1 for invalid. */ int nf_conntrack_icmpv6_packet(struct nf_conn *ct, struct sk_buff *skb, enum ip_conntrack_info ctinfo, const struct nf_hook_state *state) { unsigned int *timeout = nf_ct_timeout_lookup(ct); static const u8 valid_new[] = { [ICMPV6_ECHO_REQUEST - 128] = 1, [ICMPV6_NI_QUERY - 128] = 1 }; if (state->pf != NFPROTO_IPV6) return -NF_ACCEPT; if (!nf_ct_is_confirmed(ct)) { int type = ct->tuplehash[0].tuple.dst.u.icmp.type - 128; if (type < 0 || type >= sizeof(valid_new) || !valid_new[type]) { /* Can't create a new ICMPv6 `conn' with this. */ pr_debug("icmpv6: can't create new conn with type %u\n", type + 128); nf_ct_dump_tuple_ipv6(&ct->tuplehash[0].tuple); return -NF_ACCEPT; } } if (!timeout) timeout = icmpv6_get_timeouts(nf_ct_net(ct)); /* Do not immediately delete the connection after the first successful reply to avoid excessive conntrackd traffic and also to handle correctly ICMP echo reply duplicates. */ nf_ct_refresh_acct(ct, ctinfo, skb, *timeout); return NF_ACCEPT; } static void icmpv6_error_log(const struct sk_buff *skb, const struct nf_hook_state *state, const char *msg) { nf_l4proto_log_invalid(skb, state, IPPROTO_ICMPV6, "%s", msg); } static noinline_for_stack int nf_conntrack_icmpv6_redirect(struct nf_conn *tmpl, struct sk_buff *skb, unsigned int dataoff, const struct nf_hook_state *state) { u8 hl = ipv6_hdr(skb)->hop_limit; union nf_inet_addr outer_daddr; union { struct nd_opt_hdr nd_opt; struct rd_msg rd_msg; } tmp; const struct nd_opt_hdr *nd_opt; const struct rd_msg *rd_msg; rd_msg = skb_header_pointer(skb, dataoff, sizeof(*rd_msg), &tmp.rd_msg); if (!rd_msg) { icmpv6_error_log(skb, state, "short redirect"); return -NF_ACCEPT; } if (rd_msg->icmph.icmp6_code != 0) return NF_ACCEPT; if (hl != 255 || !(ipv6_addr_type(&ipv6_hdr(skb)->saddr) & IPV6_ADDR_LINKLOCAL)) { icmpv6_error_log(skb, state, "invalid saddr or hoplimit for redirect"); return -NF_ACCEPT; } dataoff += sizeof(*rd_msg); /* warning: rd_msg no longer usable after this call */ nd_opt = skb_header_pointer(skb, dataoff, sizeof(*nd_opt), &tmp.nd_opt); if (!nd_opt || nd_opt->nd_opt_len == 0) { icmpv6_error_log(skb, state, "redirect without options"); return -NF_ACCEPT; } /* We could call ndisc_parse_options(), but it would need * skb_linearize() and a bit more work. */ if (nd_opt->nd_opt_type != ND_OPT_REDIRECT_HDR) return NF_ACCEPT; memcpy(&outer_daddr.ip6, &ipv6_hdr(skb)->daddr, sizeof(outer_daddr.ip6)); dataoff += 8; return nf_conntrack_inet_error(tmpl, skb, dataoff, state, IPPROTO_ICMPV6, &outer_daddr); } int nf_conntrack_icmpv6_error(struct nf_conn *tmpl, struct sk_buff *skb, unsigned int dataoff, const struct nf_hook_state *state) { union nf_inet_addr outer_daddr; const struct icmp6hdr *icmp6h; struct icmp6hdr _ih; int type; icmp6h = skb_header_pointer(skb, dataoff, sizeof(_ih), &_ih); if (icmp6h == NULL) { icmpv6_error_log(skb, state, "short packet"); return -NF_ACCEPT; } if (state->hook == NF_INET_PRE_ROUTING && state->net->ct.sysctl_checksum && nf_ip6_checksum(skb, state->hook, dataoff, IPPROTO_ICMPV6)) { icmpv6_error_log(skb, state, "ICMPv6 checksum failed"); return -NF_ACCEPT; } type = icmp6h->icmp6_type - 130; if (type >= 0 && type < sizeof(noct_valid_new) && noct_valid_new[type]) { nf_ct_set(skb, NULL, IP_CT_UNTRACKED); return NF_ACCEPT; } if (icmp6h->icmp6_type == NDISC_REDIRECT) return nf_conntrack_icmpv6_redirect(tmpl, skb, dataoff, state); /* is not error message ? */ if (icmp6h->icmp6_type >= 128) return NF_ACCEPT; memcpy(&outer_daddr.ip6, &ipv6_hdr(skb)->daddr, sizeof(outer_daddr.ip6)); dataoff += sizeof(*icmp6h); return nf_conntrack_inet_error(tmpl, skb, dataoff, state, IPPROTO_ICMPV6, &outer_daddr); } #if IS_ENABLED(CONFIG_NF_CT_NETLINK) #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/nfnetlink_conntrack.h> static int icmpv6_tuple_to_nlattr(struct sk_buff *skb, const struct nf_conntrack_tuple *t) { if (nla_put_be16(skb, CTA_PROTO_ICMPV6_ID, t->src.u.icmp.id) || nla_put_u8(skb, CTA_PROTO_ICMPV6_TYPE, t->dst.u.icmp.type) || nla_put_u8(skb, CTA_PROTO_ICMPV6_CODE, t->dst.u.icmp.code)) goto nla_put_failure; return 0; nla_put_failure: return -1; } static const struct nla_policy icmpv6_nla_policy[CTA_PROTO_MAX+1] = { [CTA_PROTO_ICMPV6_TYPE] = { .type = NLA_U8 }, [CTA_PROTO_ICMPV6_CODE] = { .type = NLA_U8 }, [CTA_PROTO_ICMPV6_ID] = { .type = NLA_U16 }, }; static int icmpv6_nlattr_to_tuple(struct nlattr *tb[], struct nf_conntrack_tuple *tuple, u_int32_t flags) { if (flags & CTA_FILTER_FLAG(CTA_PROTO_ICMPV6_TYPE)) { if (!tb[CTA_PROTO_ICMPV6_TYPE]) return -EINVAL; tuple->dst.u.icmp.type = nla_get_u8(tb[CTA_PROTO_ICMPV6_TYPE]); if (tuple->dst.u.icmp.type < 128 || tuple->dst.u.icmp.type - 128 >= sizeof(invmap) || !invmap[tuple->dst.u.icmp.type - 128]) return -EINVAL; } if (flags & CTA_FILTER_FLAG(CTA_PROTO_ICMPV6_CODE)) { if (!tb[CTA_PROTO_ICMPV6_CODE]) return -EINVAL; tuple->dst.u.icmp.code = nla_get_u8(tb[CTA_PROTO_ICMPV6_CODE]); } if (flags & CTA_FILTER_FLAG(CTA_PROTO_ICMPV6_ID)) { if (!tb[CTA_PROTO_ICMPV6_ID]) return -EINVAL; tuple->src.u.icmp.id = nla_get_be16(tb[CTA_PROTO_ICMPV6_ID]); } return 0; } static unsigned int icmpv6_nlattr_tuple_size(void) { static unsigned int size __read_mostly; if (!size) size = nla_policy_len(icmpv6_nla_policy, CTA_PROTO_MAX + 1); return size; } #endif #ifdef CONFIG_NF_CONNTRACK_TIMEOUT #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/nfnetlink_cttimeout.h> static int icmpv6_timeout_nlattr_to_obj(struct nlattr *tb[], struct net *net, void *data) { unsigned int *timeout = data; struct nf_icmp_net *in = nf_icmpv6_pernet(net); if (!timeout) timeout = icmpv6_get_timeouts(net); if (tb[CTA_TIMEOUT_ICMPV6_TIMEOUT]) { *timeout = ntohl(nla_get_be32(tb[CTA_TIMEOUT_ICMPV6_TIMEOUT])) * HZ; } else { /* Set default ICMPv6 timeout. */ *timeout = in->timeout; } return 0; } static int icmpv6_timeout_obj_to_nlattr(struct sk_buff *skb, const void *data) { const unsigned int *timeout = data; if (nla_put_be32(skb, CTA_TIMEOUT_ICMPV6_TIMEOUT, htonl(*timeout / HZ))) goto nla_put_failure; return 0; nla_put_failure: return -ENOSPC; } static const struct nla_policy icmpv6_timeout_nla_policy[CTA_TIMEOUT_ICMPV6_MAX+1] = { [CTA_TIMEOUT_ICMPV6_TIMEOUT] = { .type = NLA_U32 }, }; #endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ void nf_conntrack_icmpv6_init_net(struct net *net) { struct nf_icmp_net *in = nf_icmpv6_pernet(net); in->timeout = nf_ct_icmpv6_timeout; } const struct nf_conntrack_l4proto nf_conntrack_l4proto_icmpv6 = { .l4proto = IPPROTO_ICMPV6, #if IS_ENABLED(CONFIG_NF_CT_NETLINK) .tuple_to_nlattr = icmpv6_tuple_to_nlattr, .nlattr_tuple_size = icmpv6_nlattr_tuple_size, .nlattr_to_tuple = icmpv6_nlattr_to_tuple, .nla_policy = icmpv6_nla_policy, #endif #ifdef CONFIG_NF_CONNTRACK_TIMEOUT .ctnl_timeout = { .nlattr_to_obj = icmpv6_timeout_nlattr_to_obj, .obj_to_nlattr = icmpv6_timeout_obj_to_nlattr, .nlattr_max = CTA_TIMEOUT_ICMP_MAX, .obj_size = sizeof(unsigned int), .nla_policy = icmpv6_timeout_nla_policy, }, #endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ }; |
| 11 12 13 13 13 1 13 13 13 13 13 13 13 13 13 13 13 12 13 13 13 14 12 12 12 2 1 14 3 3 16 15 14 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 | // SPDX-License-Identifier: GPL-2.0-only #include <linux/module.h> #include <net/sock.h> #include <linux/netlink.h> #include <linux/sock_diag.h> #include <linux/netlink_diag.h> #include <linux/rhashtable.h> #include "af_netlink.h" static int sk_diag_dump_groups(struct sock *sk, struct sk_buff *nlskb) { struct netlink_sock *nlk = nlk_sk(sk); if (nlk->groups == NULL) return 0; return nla_put(nlskb, NETLINK_DIAG_GROUPS, NLGRPSZ(nlk->ngroups), nlk->groups); } static int sk_diag_put_flags(struct sock *sk, struct sk_buff *skb) { struct netlink_sock *nlk = nlk_sk(sk); u32 flags = 0; if (nlk->cb_running) flags |= NDIAG_FLAG_CB_RUNNING; if (nlk_test_bit(RECV_PKTINFO, sk)) flags |= NDIAG_FLAG_PKTINFO; if (nlk_test_bit(BROADCAST_SEND_ERROR, sk)) flags |= NDIAG_FLAG_BROADCAST_ERROR; if (nlk_test_bit(RECV_NO_ENOBUFS, sk)) flags |= NDIAG_FLAG_NO_ENOBUFS; if (nlk_test_bit(LISTEN_ALL_NSID, sk)) flags |= NDIAG_FLAG_LISTEN_ALL_NSID; if (nlk_test_bit(CAP_ACK, sk)) flags |= NDIAG_FLAG_CAP_ACK; return nla_put_u32(skb, NETLINK_DIAG_FLAGS, flags); } static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, struct netlink_diag_req *req, u32 portid, u32 seq, u32 flags, int sk_ino) { struct nlmsghdr *nlh; struct netlink_diag_msg *rep; struct netlink_sock *nlk = nlk_sk(sk); nlh = nlmsg_put(skb, portid, seq, SOCK_DIAG_BY_FAMILY, sizeof(*rep), flags); if (!nlh) return -EMSGSIZE; rep = nlmsg_data(nlh); rep->ndiag_family = AF_NETLINK; rep->ndiag_type = sk->sk_type; rep->ndiag_protocol = sk->sk_protocol; rep->ndiag_state = sk->sk_state; rep->ndiag_ino = sk_ino; rep->ndiag_portid = nlk->portid; rep->ndiag_dst_portid = nlk->dst_portid; rep->ndiag_dst_group = nlk->dst_group; sock_diag_save_cookie(sk, rep->ndiag_cookie); if ((req->ndiag_show & NDIAG_SHOW_GROUPS) && sk_diag_dump_groups(sk, skb)) goto out_nlmsg_trim; if ((req->ndiag_show & NDIAG_SHOW_MEMINFO) && sock_diag_put_meminfo(sk, skb, NETLINK_DIAG_MEMINFO)) goto out_nlmsg_trim; if ((req->ndiag_show & NDIAG_SHOW_FLAGS) && sk_diag_put_flags(sk, skb)) goto out_nlmsg_trim; nlmsg_end(skb, nlh); return 0; out_nlmsg_trim: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } static int __netlink_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, int protocol, int s_num) { struct rhashtable_iter *hti = (void *)cb->args[2]; struct netlink_table *tbl = &nl_table[protocol]; struct net *net = sock_net(skb->sk); struct netlink_diag_req *req; struct netlink_sock *nlsk; unsigned long flags; struct sock *sk; int num = 2; int ret = 0; req = nlmsg_data(cb->nlh); if (s_num > 1) goto mc_list; num--; if (!hti) { hti = kmalloc(sizeof(*hti), GFP_KERNEL); if (!hti) return -ENOMEM; cb->args[2] = (long)hti; } if (!s_num) rhashtable_walk_enter(&tbl->hash, hti); rhashtable_walk_start(hti); while ((nlsk = rhashtable_walk_next(hti))) { if (IS_ERR(nlsk)) { ret = PTR_ERR(nlsk); if (ret == -EAGAIN) { ret = 0; continue; } break; } sk = (struct sock *)nlsk; if (!net_eq(sock_net(sk), net)) continue; if (sk_diag_fill(sk, skb, req, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, sock_i_ino(sk)) < 0) { ret = 1; break; } } rhashtable_walk_stop(hti); if (ret) goto done; rhashtable_walk_exit(hti); num++; mc_list: read_lock_irqsave(&nl_table_lock, flags); sk_for_each_bound(sk, &tbl->mc_list) { if (sk_hashed(sk)) continue; if (!net_eq(sock_net(sk), net)) continue; if (num < s_num) { num++; continue; } if (sk_diag_fill(sk, skb, req, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, __sock_i_ino(sk)) < 0) { ret = 1; break; } num++; } read_unlock_irqrestore(&nl_table_lock, flags); done: cb->args[0] = num; return ret; } static int netlink_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) { struct netlink_diag_req *req; int s_num = cb->args[0]; int err = 0; req = nlmsg_data(cb->nlh); if (req->sdiag_protocol == NDIAG_PROTO_ALL) { int i; for (i = cb->args[1]; i < MAX_LINKS; i++) { err = __netlink_diag_dump(skb, cb, i, s_num); if (err) break; s_num = 0; } cb->args[1] = i; } else { if (req->sdiag_protocol >= MAX_LINKS) return -ENOENT; err = __netlink_diag_dump(skb, cb, req->sdiag_protocol, s_num); } return err < 0 ? err : skb->len; } static int netlink_diag_dump_done(struct netlink_callback *cb) { struct rhashtable_iter *hti = (void *)cb->args[2]; if (cb->args[0] == 1) rhashtable_walk_exit(hti); kfree(hti); return 0; } static int netlink_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h) { int hdrlen = sizeof(struct netlink_diag_req); struct net *net = sock_net(skb->sk); if (nlmsg_len(h) < hdrlen) return -EINVAL; if (h->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { .dump = netlink_diag_dump, .done = netlink_diag_dump_done, }; return netlink_dump_start(net->diag_nlsk, skb, h, &c); } else return -EOPNOTSUPP; } static const struct sock_diag_handler netlink_diag_handler = { .family = AF_NETLINK, .dump = netlink_diag_handler_dump, }; static int __init netlink_diag_init(void) { return sock_diag_register(&netlink_diag_handler); } static void __exit netlink_diag_exit(void) { sock_diag_unregister(&netlink_diag_handler); } module_init(netlink_diag_init); module_exit(netlink_diag_exit); MODULE_DESCRIPTION("Netlink-based socket monitoring/diagnostic interface (sock_diag)"); MODULE_LICENSE("GPL"); MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 16 /* AF_NETLINK */); |
| 156 676 2 156 120 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_LOCAL_H #define _ASM_X86_LOCAL_H #include <linux/percpu.h> #include <linux/atomic.h> #include <asm/asm.h> typedef struct { atomic_long_t a; } local_t; #define LOCAL_INIT(i) { ATOMIC_LONG_INIT(i) } #define local_read(l) atomic_long_read(&(l)->a) #define local_set(l, i) atomic_long_set(&(l)->a, (i)) static inline void local_inc(local_t *l) { asm volatile(_ASM_INC "%0" : "+m" (l->a.counter)); } static inline void local_dec(local_t *l) { asm volatile(_ASM_DEC "%0" : "+m" (l->a.counter)); } static inline void local_add(long i, local_t *l) { asm volatile(_ASM_ADD "%1,%0" : "+m" (l->a.counter) : "ir" (i)); } static inline void local_sub(long i, local_t *l) { asm volatile(_ASM_SUB "%1,%0" : "+m" (l->a.counter) : "ir" (i)); } /** * local_sub_and_test - subtract value from variable and test result * @i: integer value to subtract * @l: pointer to type local_t * * Atomically subtracts @i from @l and returns * true if the result is zero, or false for all * other cases. */ static inline bool local_sub_and_test(long i, local_t *l) { return GEN_BINARY_RMWcc(_ASM_SUB, l->a.counter, e, "er", i); } /** * local_dec_and_test - decrement and test * @l: pointer to type local_t * * Atomically decrements @l by 1 and * returns true if the result is 0, or false for all other * cases. */ static inline bool local_dec_and_test(local_t *l) { return GEN_UNARY_RMWcc(_ASM_DEC, l->a.counter, e); } /** * local_inc_and_test - increment and test * @l: pointer to type local_t * * Atomically increments @l by 1 * and returns true if the result is zero, or false for all * other cases. */ static inline bool local_inc_and_test(local_t *l) { return GEN_UNARY_RMWcc(_ASM_INC, l->a.counter, e); } /** * local_add_negative - add and test if negative * @i: integer value to add * @l: pointer to type local_t * * Atomically adds @i to @l and returns true * if the result is negative, or false when * result is greater than or equal to zero. */ static inline bool local_add_negative(long i, local_t *l) { return GEN_BINARY_RMWcc(_ASM_ADD, l->a.counter, s, "er", i); } /** * local_add_return - add and return * @i: integer value to add * @l: pointer to type local_t * * Atomically adds @i to @l and returns @i + @l */ static inline long local_add_return(long i, local_t *l) { long __i = i; asm volatile(_ASM_XADD "%0, %1;" : "+r" (i), "+m" (l->a.counter) : : "memory"); return i + __i; } static inline long local_sub_return(long i, local_t *l) { return local_add_return(-i, l); } #define local_inc_return(l) (local_add_return(1, l)) #define local_dec_return(l) (local_sub_return(1, l)) static inline long local_cmpxchg(local_t *l, long old, long new) { return cmpxchg_local(&l->a.counter, old, new); } static inline bool local_try_cmpxchg(local_t *l, long *old, long new) { return try_cmpxchg_local(&l->a.counter, (typeof(l->a.counter) *) old, new); } /* Always has a lock prefix */ #define local_xchg(l, n) (xchg(&((l)->a.counter), (n))) /** * local_add_unless - add unless the number is already a given value * @l: pointer of type local_t * @a: the amount to add to l... * @u: ...unless l is equal to u. * * Atomically adds @a to @l, if @v was not already @u. * Returns true if the addition was done. */ static __always_inline bool local_add_unless(local_t *l, long a, long u) { long c = local_read(l); do { if (unlikely(c == u)) return false; } while (!local_try_cmpxchg(l, &c, c + a)); return true; } #define local_inc_not_zero(l) local_add_unless((l), 1, 0) /* On x86_32, these are no better than the atomic variants. * On x86-64 these are better than the atomic variants on SMP kernels * because they dont use a lock prefix. */ #define __local_inc(l) local_inc(l) #define __local_dec(l) local_dec(l) #define __local_add(i, l) local_add((i), (l)) #define __local_sub(i, l) local_sub((i), (l)) #endif /* _ASM_X86_LOCAL_H */ |
| 165 2071 5 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 | /* SPDX-License-Identifier: GPL-2.0-only */ /* * include/linux/idr.h * * 2002-10-18 written by Jim Houston jim.houston@ccur.com * Copyright (C) 2002 by Concurrent Computer Corporation * * Small id to pointer translation service avoiding fixed sized * tables. */ #ifndef __IDR_H__ #define __IDR_H__ #include <linux/radix-tree.h> #include <linux/gfp.h> #include <linux/percpu.h> struct idr { struct radix_tree_root idr_rt; unsigned int idr_base; unsigned int idr_next; }; /* * The IDR API does not expose the tagging functionality of the radix tree * to users. Use tag 0 to track whether a node has free space below it. */ #define IDR_FREE 0 /* Set the IDR flag and the IDR_FREE tag */ #define IDR_RT_MARKER (ROOT_IS_IDR | (__force gfp_t) \ (1 << (ROOT_TAG_SHIFT + IDR_FREE))) #define IDR_INIT_BASE(name, base) { \ .idr_rt = RADIX_TREE_INIT(name, IDR_RT_MARKER), \ .idr_base = (base), \ .idr_next = 0, \ } /** * IDR_INIT() - Initialise an IDR. * @name: Name of IDR. * * A freshly-initialised IDR contains no IDs. */ #define IDR_INIT(name) IDR_INIT_BASE(name, 0) /** * DEFINE_IDR() - Define a statically-allocated IDR. * @name: Name of IDR. * * An IDR defined using this macro is ready for use with no additional * initialisation required. It contains no IDs. */ #define DEFINE_IDR(name) struct idr name = IDR_INIT(name) /** * idr_get_cursor - Return the current position of the cyclic allocator * @idr: idr handle * * The value returned is the value that will be next returned from * idr_alloc_cyclic() if it is free (otherwise the search will start from * this position). */ static inline unsigned int idr_get_cursor(const struct idr *idr) { return READ_ONCE(idr->idr_next); } /** * idr_set_cursor - Set the current position of the cyclic allocator * @idr: idr handle * @val: new position * * The next call to idr_alloc_cyclic() will return @val if it is free * (otherwise the search will start from this position). */ static inline void idr_set_cursor(struct idr *idr, unsigned int val) { WRITE_ONCE(idr->idr_next, val); } /** * DOC: idr sync * idr synchronization (stolen from radix-tree.h) * * idr_find() is able to be called locklessly, using RCU. The caller must * ensure calls to this function are made within rcu_read_lock() regions. * Other readers (lock-free or otherwise) and modifications may be running * concurrently. * * It is still required that the caller manage the synchronization and * lifetimes of the items. So if RCU lock-free lookups are used, typically * this would mean that the items have their own locks, or are amenable to * lock-free access; and that the items are freed by RCU (or only freed after * having been deleted from the idr tree *and* a synchronize_rcu() grace * period). */ #define idr_lock(idr) xa_lock(&(idr)->idr_rt) #define idr_unlock(idr) xa_unlock(&(idr)->idr_rt) #define idr_lock_bh(idr) xa_lock_bh(&(idr)->idr_rt) #define idr_unlock_bh(idr) xa_unlock_bh(&(idr)->idr_rt) #define idr_lock_irq(idr) xa_lock_irq(&(idr)->idr_rt) #define idr_unlock_irq(idr) xa_unlock_irq(&(idr)->idr_rt) #define idr_lock_irqsave(idr, flags) \ xa_lock_irqsave(&(idr)->idr_rt, flags) #define idr_unlock_irqrestore(idr, flags) \ xa_unlock_irqrestore(&(idr)->idr_rt, flags) void idr_preload(gfp_t gfp_mask); int idr_alloc(struct idr *, void *ptr, int start, int end, gfp_t); int __must_check idr_alloc_u32(struct idr *, void *ptr, u32 *id, unsigned long max, gfp_t); int idr_alloc_cyclic(struct idr *, void *ptr, int start, int end, gfp_t); void *idr_remove(struct idr *, unsigned long id); void *idr_find(const struct idr *, unsigned long id); int idr_for_each(const struct idr *, int (*fn)(int id, void *p, void *data), void *data); void *idr_get_next(struct idr *, int *nextid); void *idr_get_next_ul(struct idr *, unsigned long *nextid); void *idr_replace(struct idr *, void *, unsigned long id); void idr_destroy(struct idr *); /** * idr_init_base() - Initialise an IDR. * @idr: IDR handle. * @base: The base value for the IDR. * * This variation of idr_init() creates an IDR which will allocate IDs * starting at %base. */ static inline void idr_init_base(struct idr *idr, int base) { INIT_RADIX_TREE(&idr->idr_rt, IDR_RT_MARKER); idr->idr_base = base; idr->idr_next = 0; } /** * idr_init() - Initialise an IDR. * @idr: IDR handle. * * Initialise a dynamically allocated IDR. To initialise a * statically allocated IDR, use DEFINE_IDR(). */ static inline void idr_init(struct idr *idr) { idr_init_base(idr, 0); } /** * idr_is_empty() - Are there any IDs allocated? * @idr: IDR handle. * * Return: %true if any IDs have been allocated from this IDR. */ static inline bool idr_is_empty(const struct idr *idr) { return radix_tree_empty(&idr->idr_rt) && radix_tree_tagged(&idr->idr_rt, IDR_FREE); } /** * idr_preload_end - end preload section started with idr_preload() * * Each idr_preload() should be matched with an invocation of this * function. See idr_preload() for details. */ static inline void idr_preload_end(void) { local_unlock(&radix_tree_preloads.lock); } /** * idr_for_each_entry() - Iterate over an IDR's elements of a given type. * @idr: IDR handle. * @entry: The type * to use as cursor * @id: Entry ID. * * @entry and @id do not need to be initialized before the loop, and * after normal termination @entry is left with the value NULL. This * is convenient for a "not found" value. */ #define idr_for_each_entry(idr, entry, id) \ for (id = 0; ((entry) = idr_get_next(idr, &(id))) != NULL; id += 1U) /** * idr_for_each_entry_ul() - Iterate over an IDR's elements of a given type. * @idr: IDR handle. * @entry: The type * to use as cursor. * @tmp: A temporary placeholder for ID. * @id: Entry ID. * * @entry and @id do not need to be initialized before the loop, and * after normal termination @entry is left with the value NULL. This * is convenient for a "not found" value. */ #define idr_for_each_entry_ul(idr, entry, tmp, id) \ for (tmp = 0, id = 0; \ ((entry) = tmp <= id ? idr_get_next_ul(idr, &(id)) : NULL) != NULL; \ tmp = id, ++id) /** * idr_for_each_entry_continue() - Continue iteration over an IDR's elements of a given type * @idr: IDR handle. * @entry: The type * to use as a cursor. * @id: Entry ID. * * Continue to iterate over entries, continuing after the current position. */ #define idr_for_each_entry_continue(idr, entry, id) \ for ((entry) = idr_get_next((idr), &(id)); \ entry; \ ++id, (entry) = idr_get_next((idr), &(id))) /** * idr_for_each_entry_continue_ul() - Continue iteration over an IDR's elements of a given type * @idr: IDR handle. * @entry: The type * to use as a cursor. * @tmp: A temporary placeholder for ID. * @id: Entry ID. * * Continue to iterate over entries, continuing after the current position. * After normal termination @entry is left with the value NULL. This * is convenient for a "not found" value. */ #define idr_for_each_entry_continue_ul(idr, entry, tmp, id) \ for (tmp = id; \ ((entry) = tmp <= id ? idr_get_next_ul(idr, &(id)) : NULL) != NULL; \ tmp = id, ++id) /* * IDA - ID Allocator, use when translation from id to pointer isn't necessary. */ #define IDA_CHUNK_SIZE 128 /* 128 bytes per chunk */ #define IDA_BITMAP_LONGS (IDA_CHUNK_SIZE / sizeof(long)) #define IDA_BITMAP_BITS (IDA_BITMAP_LONGS * sizeof(long) * 8) struct ida_bitmap { unsigned long bitmap[IDA_BITMAP_LONGS]; }; struct ida { struct xarray xa; }; #define IDA_INIT_FLAGS (XA_FLAGS_LOCK_IRQ | XA_FLAGS_ALLOC) #define IDA_INIT(name) { \ .xa = XARRAY_INIT(name, IDA_INIT_FLAGS) \ } #define DEFINE_IDA(name) struct ida name = IDA_INIT(name) int ida_alloc_range(struct ida *, unsigned int min, unsigned int max, gfp_t); void ida_free(struct ida *, unsigned int id); void ida_destroy(struct ida *ida); /** * ida_alloc() - Allocate an unused ID. * @ida: IDA handle. * @gfp: Memory allocation flags. * * Allocate an ID between 0 and %INT_MAX, inclusive. * * Context: Any context. It is safe to call this function without * locking in your code. * Return: The allocated ID, or %-ENOMEM if memory could not be allocated, * or %-ENOSPC if there are no free IDs. */ static inline int ida_alloc(struct ida *ida, gfp_t gfp) { return ida_alloc_range(ida, 0, ~0, gfp); } /** * ida_alloc_min() - Allocate an unused ID. * @ida: IDA handle. * @min: Lowest ID to allocate. * @gfp: Memory allocation flags. * * Allocate an ID between @min and %INT_MAX, inclusive. * * Context: Any context. It is safe to call this function without * locking in your code. * Return: The allocated ID, or %-ENOMEM if memory could not be allocated, * or %-ENOSPC if there are no free IDs. */ static inline int ida_alloc_min(struct ida *ida, unsigned int min, gfp_t gfp) { return ida_alloc_range(ida, min, ~0, gfp); } /** * ida_alloc_max() - Allocate an unused ID. * @ida: IDA handle. * @max: Highest ID to allocate. * @gfp: Memory allocation flags. * * Allocate an ID between 0 and @max, inclusive. * * Context: Any context. It is safe to call this function without * locking in your code. * Return: The allocated ID, or %-ENOMEM if memory could not be allocated, * or %-ENOSPC if there are no free IDs. */ static inline int ida_alloc_max(struct ida *ida, unsigned int max, gfp_t gfp) { return ida_alloc_range(ida, 0, max, gfp); } static inline void ida_init(struct ida *ida) { xa_init_flags(&ida->xa, IDA_INIT_FLAGS); } /* * ida_simple_get() and ida_simple_remove() are deprecated. Use * ida_alloc() and ida_free() instead respectively. */ #define ida_simple_get(ida, start, end, gfp) \ ida_alloc_range(ida, start, (end) - 1, gfp) #define ida_simple_remove(ida, id) ida_free(ida, id) static inline bool ida_is_empty(const struct ida *ida) { return xa_empty(&ida->xa); } #endif /* __IDR_H__ */ |
| 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 | /* SPDX-License-Identifier: GPL-2.0-only */ /* * VMware vSockets Driver * * Copyright (C) 2007-2013 VMware, Inc. All rights reserved. */ #ifndef __AF_VSOCK_H__ #define __AF_VSOCK_H__ #include <linux/kernel.h> #include <linux/workqueue.h> #include <net/sock.h> #include <uapi/linux/vm_sockets.h> #include "vsock_addr.h" #define LAST_RESERVED_PORT 1023 #define VSOCK_HASH_SIZE 251 extern struct list_head vsock_bind_table[VSOCK_HASH_SIZE + 1]; extern struct list_head vsock_connected_table[VSOCK_HASH_SIZE]; extern spinlock_t vsock_table_lock; #define vsock_sk(__sk) ((struct vsock_sock *)__sk) #define sk_vsock(__vsk) (&(__vsk)->sk) struct vsock_sock { /* sk must be the first member. */ struct sock sk; const struct vsock_transport *transport; struct sockaddr_vm local_addr; struct sockaddr_vm remote_addr; /* Links for the global tables of bound and connected sockets. */ struct list_head bound_table; struct list_head connected_table; /* Accessed without the socket lock held. This means it can never be * modified outsided of socket create or destruct. */ bool trusted; bool cached_peer_allow_dgram; /* Dgram communication allowed to * cached peer? */ u32 cached_peer; /* Context ID of last dgram destination check. */ const struct cred *owner; /* Rest are SOCK_STREAM only. */ long connect_timeout; /* Listening socket that this came from. */ struct sock *listener; /* Used for pending list and accept queue during connection handshake. * The listening socket is the head for both lists. Sockets created * for connection requests are placed in the pending list until they * are connected, at which point they are put in the accept queue list * so they can be accepted in accept(). If accept() cannot accept the * connection, it is marked as rejected so the cleanup function knows * to clean up the socket. */ struct list_head pending_links; struct list_head accept_queue; bool rejected; struct delayed_work connect_work; struct delayed_work pending_work; struct delayed_work close_work; bool close_work_scheduled; u32 peer_shutdown; bool sent_request; bool ignore_connecting_rst; /* Protected by lock_sock(sk) */ u64 buffer_size; u64 buffer_min_size; u64 buffer_max_size; /* Private to transport. */ void *trans; }; s64 vsock_connectible_has_data(struct vsock_sock *vsk); s64 vsock_stream_has_data(struct vsock_sock *vsk); s64 vsock_stream_has_space(struct vsock_sock *vsk); struct sock *vsock_create_connected(struct sock *parent); void vsock_data_ready(struct sock *sk); /**** TRANSPORT ****/ struct vsock_transport_recv_notify_data { u64 data1; /* Transport-defined. */ u64 data2; /* Transport-defined. */ bool notify_on_block; }; struct vsock_transport_send_notify_data { u64 data1; /* Transport-defined. */ u64 data2; /* Transport-defined. */ }; /* Transport features flags */ /* Transport provides host->guest communication */ #define VSOCK_TRANSPORT_F_H2G 0x00000001 /* Transport provides guest->host communication */ #define VSOCK_TRANSPORT_F_G2H 0x00000002 /* Transport provides DGRAM communication */ #define VSOCK_TRANSPORT_F_DGRAM 0x00000004 /* Transport provides local (loopback) communication */ #define VSOCK_TRANSPORT_F_LOCAL 0x00000008 struct vsock_transport { struct module *module; /* Initialize/tear-down socket. */ int (*init)(struct vsock_sock *, struct vsock_sock *); void (*destruct)(struct vsock_sock *); void (*release)(struct vsock_sock *); /* Cancel all pending packets sent on vsock. */ int (*cancel_pkt)(struct vsock_sock *vsk); /* Connections. */ int (*connect)(struct vsock_sock *); /* DGRAM. */ int (*dgram_bind)(struct vsock_sock *, struct sockaddr_vm *); int (*dgram_dequeue)(struct vsock_sock *vsk, struct msghdr *msg, size_t len, int flags); int (*dgram_enqueue)(struct vsock_sock *, struct sockaddr_vm *, struct msghdr *, size_t len); bool (*dgram_allow)(u32 cid, u32 port); /* STREAM. */ /* TODO: stream_bind() */ ssize_t (*stream_dequeue)(struct vsock_sock *, struct msghdr *, size_t len, int flags); ssize_t (*stream_enqueue)(struct vsock_sock *, struct msghdr *, size_t len); s64 (*stream_has_data)(struct vsock_sock *); s64 (*stream_has_space)(struct vsock_sock *); u64 (*stream_rcvhiwat)(struct vsock_sock *); bool (*stream_is_active)(struct vsock_sock *); bool (*stream_allow)(u32 cid, u32 port); /* SEQ_PACKET. */ ssize_t (*seqpacket_dequeue)(struct vsock_sock *vsk, struct msghdr *msg, int flags); int (*seqpacket_enqueue)(struct vsock_sock *vsk, struct msghdr *msg, size_t len); bool (*seqpacket_allow)(u32 remote_cid); u32 (*seqpacket_has_data)(struct vsock_sock *vsk); /* Notification. */ int (*notify_poll_in)(struct vsock_sock *, size_t, bool *); int (*notify_poll_out)(struct vsock_sock *, size_t, bool *); int (*notify_recv_init)(struct vsock_sock *, size_t, struct vsock_transport_recv_notify_data *); int (*notify_recv_pre_block)(struct vsock_sock *, size_t, struct vsock_transport_recv_notify_data *); int (*notify_recv_pre_dequeue)(struct vsock_sock *, size_t, struct vsock_transport_recv_notify_data *); int (*notify_recv_post_dequeue)(struct vsock_sock *, size_t, ssize_t, bool, struct vsock_transport_recv_notify_data *); int (*notify_send_init)(struct vsock_sock *, struct vsock_transport_send_notify_data *); int (*notify_send_pre_block)(struct vsock_sock *, struct vsock_transport_send_notify_data *); int (*notify_send_pre_enqueue)(struct vsock_sock *, struct vsock_transport_send_notify_data *); int (*notify_send_post_enqueue)(struct vsock_sock *, ssize_t, struct vsock_transport_send_notify_data *); /* sk_lock held by the caller */ void (*notify_buffer_size)(struct vsock_sock *, u64 *); int (*notify_set_rcvlowat)(struct vsock_sock *vsk, int val); /* Shutdown. */ int (*shutdown)(struct vsock_sock *, int); /* Addressing. */ u32 (*get_local_cid)(void); /* Read a single skb */ int (*read_skb)(struct vsock_sock *, skb_read_actor_t); /* Zero-copy. */ bool (*msgzerocopy_allow)(void); }; /**** CORE ****/ int vsock_core_register(const struct vsock_transport *t, int features); void vsock_core_unregister(const struct vsock_transport *t); /* The transport may downcast this to access transport-specific functions */ const struct vsock_transport *vsock_core_get_transport(struct vsock_sock *vsk); /**** UTILS ****/ /* vsock_table_lock must be held */ static inline bool __vsock_in_bound_table(struct vsock_sock *vsk) { return !list_empty(&vsk->bound_table); } /* vsock_table_lock must be held */ static inline bool __vsock_in_connected_table(struct vsock_sock *vsk) { return !list_empty(&vsk->connected_table); } void vsock_add_pending(struct sock *listener, struct sock *pending); void vsock_remove_pending(struct sock *listener, struct sock *pending); void vsock_enqueue_accept(struct sock *listener, struct sock *connected); void vsock_insert_connected(struct vsock_sock *vsk); void vsock_remove_bound(struct vsock_sock *vsk); void vsock_remove_connected(struct vsock_sock *vsk); struct sock *vsock_find_bound_socket(struct sockaddr_vm *addr); struct sock *vsock_find_connected_socket(struct sockaddr_vm *src, struct sockaddr_vm *dst); void vsock_remove_sock(struct vsock_sock *vsk); void vsock_for_each_connected_socket(struct vsock_transport *transport, void (*fn)(struct sock *sk)); int vsock_assign_transport(struct vsock_sock *vsk, struct vsock_sock *psk); bool vsock_find_cid(unsigned int cid); /**** TAP ****/ struct vsock_tap { struct net_device *dev; struct module *module; struct list_head list; }; int vsock_add_tap(struct vsock_tap *vt); int vsock_remove_tap(struct vsock_tap *vt); void vsock_deliver_tap(struct sk_buff *build_skb(void *opaque), void *opaque); int vsock_connectible_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int flags); int vsock_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int flags); #ifdef CONFIG_BPF_SYSCALL extern struct proto vsock_proto; int vsock_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore); void __init vsock_bpf_build_proto(void); #else static inline void __init vsock_bpf_build_proto(void) {} #endif static inline bool vsock_msgzerocopy_allow(const struct vsock_transport *t) { return t->msgzerocopy_allow && t->msgzerocopy_allow(); } #endif /* __AF_VSOCK_H__ */ |
| 11846 19 47 47 47 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 | /* SPDX-License-Identifier: GPL-2.0-only */ /* * AppArmor security module * * This file contains AppArmor file mediation function definitions. * * Copyright (C) 1998-2008 Novell/SUSE * Copyright 2009-2010 Canonical Ltd. */ #ifndef __AA_FILE_H #define __AA_FILE_H #include <linux/spinlock.h> #include "domain.h" #include "match.h" #include "perms.h" struct aa_policydb; struct aa_profile; struct path; #define mask_mode_t(X) (X & (MAY_EXEC | MAY_WRITE | MAY_READ | MAY_APPEND)) #define AA_AUDIT_FILE_MASK (MAY_READ | MAY_WRITE | MAY_EXEC | MAY_APPEND |\ AA_MAY_CREATE | AA_MAY_DELETE | \ AA_MAY_GETATTR | AA_MAY_SETATTR | \ AA_MAY_CHMOD | AA_MAY_CHOWN | AA_MAY_LOCK | \ AA_EXEC_MMAP | AA_MAY_LINK) static inline struct aa_file_ctx *file_ctx(struct file *file) { return file->f_security + apparmor_blob_sizes.lbs_file; } /* struct aa_file_ctx - the AppArmor context the file was opened in * @lock: lock to update the ctx * @label: label currently cached on the ctx * @perms: the permission the file was opened with */ struct aa_file_ctx { spinlock_t lock; struct aa_label __rcu *label; u32 allow; }; /* * The xindex is broken into 3 parts * - index - an index into either the exec name table or the variable table * - exec type - which determines how the executable name and index are used * - flags - which modify how the destination name is applied */ #define AA_X_INDEX_MASK AA_INDEX_MASK #define AA_X_TYPE_MASK 0x0c000000 #define AA_X_NONE AA_INDEX_NONE #define AA_X_NAME 0x04000000 /* use executable name px */ #define AA_X_TABLE 0x08000000 /* use a specified name ->n# */ #define AA_X_UNSAFE 0x10000000 #define AA_X_CHILD 0x20000000 #define AA_X_INHERIT 0x40000000 #define AA_X_UNCONFINED 0x80000000 /* need to make conditional which ones are being set */ struct path_cond { kuid_t uid; umode_t mode; }; #define COMBINED_PERM_MASK(X) ((X).allow | (X).audit | (X).quiet | (X).kill) int aa_audit_file(const struct cred *cred, struct aa_profile *profile, struct aa_perms *perms, const char *op, u32 request, const char *name, const char *target, struct aa_label *tlabel, kuid_t ouid, const char *info, int error); struct aa_perms *aa_lookup_fperms(struct aa_policydb *file_rules, aa_state_t state, struct path_cond *cond); aa_state_t aa_str_perms(struct aa_policydb *file_rules, aa_state_t start, const char *name, struct path_cond *cond, struct aa_perms *perms); int aa_path_perm(const char *op, const struct cred *subj_cred, struct aa_label *label, const struct path *path, int flags, u32 request, struct path_cond *cond); int aa_path_link(const struct cred *subj_cred, struct aa_label *label, struct dentry *old_dentry, const struct path *new_dir, struct dentry *new_dentry); int aa_file_perm(const char *op, const struct cred *subj_cred, struct aa_label *label, struct file *file, u32 request, bool in_atomic); void aa_inherit_files(const struct cred *cred, struct files_struct *files); /** * aa_map_file_perms - map file flags to AppArmor permissions * @file: open file to map flags to AppArmor permissions * * Returns: apparmor permission set for the file */ static inline u32 aa_map_file_to_perms(struct file *file) { int flags = file->f_flags; u32 perms = 0; if (file->f_mode & FMODE_WRITE) perms |= MAY_WRITE; if (file->f_mode & FMODE_READ) perms |= MAY_READ; if ((flags & O_APPEND) && (perms & MAY_WRITE)) perms = (perms & ~MAY_WRITE) | MAY_APPEND; /* trunc implies write permission */ if (flags & O_TRUNC) perms |= MAY_WRITE; if (flags & O_CREAT) perms |= AA_MAY_CREATE; return perms; } #endif /* __AA_FILE_H */ |
| 2422 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 | // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/bad_inode.c * * Copyright (C) 1997, Stephen Tweedie * * Provide stub functions for unreadable inodes * * Fabian Frederick : August 2003 - All file operations assigned to EIO */ #include <linux/fs.h> #include <linux/export.h> #include <linux/stat.h> #include <linux/time.h> #include <linux/namei.h> #include <linux/poll.h> #include <linux/fiemap.h> static int bad_file_open(struct inode *inode, struct file *filp) { return -EIO; } static const struct file_operations bad_file_ops = { .open = bad_file_open, }; static int bad_inode_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { return -EIO; } static struct dentry *bad_inode_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { return ERR_PTR(-EIO); } static int bad_inode_link (struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) { return -EIO; } static int bad_inode_unlink(struct inode *dir, struct dentry *dentry) { return -EIO; } static int bad_inode_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *symname) { return -EIO; } static int bad_inode_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { return -EIO; } static int bad_inode_rmdir (struct inode *dir, struct dentry *dentry) { return -EIO; } static int bad_inode_mknod(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) { return -EIO; } static int bad_inode_rename2(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { return -EIO; } static int bad_inode_readlink(struct dentry *dentry, char __user *buffer, int buflen) { return -EIO; } static int bad_inode_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { return -EIO; } static int bad_inode_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { return -EIO; } static int bad_inode_setattr(struct mnt_idmap *idmap, struct dentry *direntry, struct iattr *attrs) { return -EIO; } static ssize_t bad_inode_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) { return -EIO; } static const char *bad_inode_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *done) { return ERR_PTR(-EIO); } static struct posix_acl *bad_inode_get_acl(struct inode *inode, int type, bool rcu) { return ERR_PTR(-EIO); } static int bad_inode_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len) { return -EIO; } static int bad_inode_update_time(struct inode *inode, int flags) { return -EIO; } static int bad_inode_atomic_open(struct inode *inode, struct dentry *dentry, struct file *file, unsigned int open_flag, umode_t create_mode) { return -EIO; } static int bad_inode_tmpfile(struct mnt_idmap *idmap, struct inode *inode, struct file *file, umode_t mode) { return -EIO; } static int bad_inode_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type) { return -EIO; } static const struct inode_operations bad_inode_ops = { .create = bad_inode_create, .lookup = bad_inode_lookup, .link = bad_inode_link, .unlink = bad_inode_unlink, .symlink = bad_inode_symlink, .mkdir = bad_inode_mkdir, .rmdir = bad_inode_rmdir, .mknod = bad_inode_mknod, .rename = bad_inode_rename2, .readlink = bad_inode_readlink, .permission = bad_inode_permission, .getattr = bad_inode_getattr, .setattr = bad_inode_setattr, .listxattr = bad_inode_listxattr, .get_link = bad_inode_get_link, .get_inode_acl = bad_inode_get_acl, .fiemap = bad_inode_fiemap, .update_time = bad_inode_update_time, .atomic_open = bad_inode_atomic_open, .tmpfile = bad_inode_tmpfile, .set_acl = bad_inode_set_acl, }; /* * When a filesystem is unable to read an inode due to an I/O error in * its read_inode() function, it can call make_bad_inode() to return a * set of stubs which will return EIO errors as required. * * We only need to do limited initialisation: all other fields are * preinitialised to zero automatically. */ /** * make_bad_inode - mark an inode bad due to an I/O error * @inode: Inode to mark bad * * When an inode cannot be read due to a media or remote network * failure this function makes the inode "bad" and causes I/O operations * on it to fail from this point on. */ void make_bad_inode(struct inode *inode) { remove_inode_hash(inode); inode->i_mode = S_IFREG; simple_inode_init_ts(inode); inode->i_op = &bad_inode_ops; inode->i_opflags &= ~IOP_XATTR; inode->i_fop = &bad_file_ops; } EXPORT_SYMBOL(make_bad_inode); /* * This tests whether an inode has been flagged as bad. The test uses * &bad_inode_ops to cover the case of invalidated inodes as well as * those created by make_bad_inode() above. */ /** * is_bad_inode - is an inode errored * @inode: inode to test * * Returns true if the inode in question has been marked as bad. */ bool is_bad_inode(struct inode *inode) { return (inode->i_op == &bad_inode_ops); } EXPORT_SYMBOL(is_bad_inode); /** * iget_failed - Mark an under-construction inode as dead and release it * @inode: The inode to discard * * Mark an under-construction inode as dead and release it. */ void iget_failed(struct inode *inode) { make_bad_inode(inode); unlock_new_inode(inode); iput(inode); } EXPORT_SYMBOL(iget_failed); |
| 1151 3 3 3 1148 1148 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 | /* SPDX-License-Identifier: GPL-2.0-only */ /* * AppArmor security module * * This file contains AppArmor task related definitions and mediation * * Copyright 2017 Canonical Ltd. */ #ifndef __AA_TASK_H #define __AA_TASK_H static inline struct aa_task_ctx *task_ctx(struct task_struct *task) { return task->security + apparmor_blob_sizes.lbs_task; } /* * struct aa_task_ctx - information for current task label change * @nnp: snapshot of label at time of no_new_privs * @onexec: profile to transition to on next exec (MAY BE NULL) * @previous: profile the task may return to (MAY BE NULL) * @token: magic value the task must know for returning to @previous_profile */ struct aa_task_ctx { struct aa_label *nnp; struct aa_label *onexec; struct aa_label *previous; u64 token; }; int aa_replace_current_label(struct aa_label *label); void aa_set_current_onexec(struct aa_label *label, bool stack); int aa_set_current_hat(struct aa_label *label, u64 token); int aa_restore_previous_label(u64 cookie); struct aa_label *aa_get_task_label(struct task_struct *task); /** * aa_free_task_ctx - free a task_ctx * @ctx: task_ctx to free (MAYBE NULL) */ static inline void aa_free_task_ctx(struct aa_task_ctx *ctx) { if (ctx) { aa_put_label(ctx->nnp); aa_put_label(ctx->previous); aa_put_label(ctx->onexec); } } /** * aa_dup_task_ctx - duplicate a task context, incrementing reference counts * @new: a blank task context (NOT NULL) * @old: the task context to copy (NOT NULL) */ static inline void aa_dup_task_ctx(struct aa_task_ctx *new, const struct aa_task_ctx *old) { *new = *old; aa_get_label(new->nnp); aa_get_label(new->previous); aa_get_label(new->onexec); } /** * aa_clear_task_ctx_trans - clear transition tracking info from the ctx * @ctx: task context to clear (NOT NULL) */ static inline void aa_clear_task_ctx_trans(struct aa_task_ctx *ctx) { AA_BUG(!ctx); aa_put_label(ctx->previous); aa_put_label(ctx->onexec); ctx->previous = NULL; ctx->onexec = NULL; ctx->token = 0; } #define AA_PTRACE_TRACE MAY_WRITE #define AA_PTRACE_READ MAY_READ #define AA_MAY_BE_TRACED AA_MAY_APPEND #define AA_MAY_BE_READ AA_MAY_CREATE #define PTRACE_PERM_SHIFT 2 #define AA_PTRACE_PERM_MASK (AA_PTRACE_READ | AA_PTRACE_TRACE | \ AA_MAY_BE_READ | AA_MAY_BE_TRACED) #define AA_SIGNAL_PERM_MASK (MAY_READ | MAY_WRITE) #define AA_SFS_SIG_MASK "hup int quit ill trap abrt bus fpe kill usr1 " \ "segv usr2 pipe alrm term stkflt chld cont stop stp ttin ttou urg " \ "xcpu xfsz vtalrm prof winch io pwr sys emt lost" int aa_may_ptrace(const struct cred *tracer_cred, struct aa_label *tracer, const struct cred *tracee_cred, struct aa_label *tracee, u32 request); #define AA_USERNS_CREATE 8 int aa_profile_ns_perm(struct aa_profile *profile, struct apparmor_audit_data *ad, u32 request); #endif /* __AA_TASK_H */ |
| 10 1 2 3 4 5 6 10 10 10 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 | // SPDX-License-Identifier: GPL-2.0-only /* * linux/lib/cmdline.c * Helper functions generally used for parsing kernel command line * and module options. * * Code and copyrights come from init/main.c and arch/i386/kernel/setup.c. * * GNU Indent formatting options for this file: -kr -i8 -npsl -pcs */ #include <linux/export.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/ctype.h> /* * If a hyphen was found in get_option, this will handle the * range of numbers, M-N. This will expand the range and insert * the values[M, M+1, ..., N] into the ints array in get_options. */ static int get_range(char **str, int *pint, int n) { int x, inc_counter, upper_range; (*str)++; upper_range = simple_strtol((*str), NULL, 0); inc_counter = upper_range - *pint; for (x = *pint; n && x < upper_range; x++, n--) *pint++ = x; return inc_counter; } /** * get_option - Parse integer from an option string * @str: option string * @pint: (optional output) integer value parsed from @str * * Read an int from an option string; if available accept a subsequent * comma as well. * * When @pint is NULL the function can be used as a validator of * the current option in the string. * * Return values: * 0 - no int in string * 1 - int found, no subsequent comma * 2 - int found including a subsequent comma * 3 - hyphen found to denote a range * * Leading hyphen without integer is no integer case, but we consume it * for the sake of simplification. */ int get_option(char **str, int *pint) { char *cur = *str; int value; if (!cur || !(*cur)) return 0; if (*cur == '-') value = -simple_strtoull(++cur, str, 0); else value = simple_strtoull(cur, str, 0); if (pint) *pint = value; if (cur == *str) return 0; if (**str == ',') { (*str)++; return 2; } if (**str == '-') return 3; return 1; } EXPORT_SYMBOL(get_option); /** * get_options - Parse a string into a list of integers * @str: String to be parsed * @nints: size of integer array * @ints: integer array (must have room for at least one element) * * This function parses a string containing a comma-separated * list of integers, a hyphen-separated range of _positive_ integers, * or a combination of both. The parse halts when the array is * full, or when no more numbers can be retrieved from the * string. * * When @nints is 0, the function just validates the given @str and * returns the amount of parseable integers as described below. * * Returns: * * The first element is filled by the number of collected integers * in the range. The rest is what was parsed from the @str. * * Return value is the character in the string which caused * the parse to end (typically a null terminator, if @str is * completely parseable). */ char *get_options(const char *str, int nints, int *ints) { bool validate = (nints == 0); int res, i = 1; while (i < nints || validate) { int *pint = validate ? ints : ints + i; res = get_option((char **)&str, pint); if (res == 0) break; if (res == 3) { int n = validate ? 0 : nints - i; int range_nums; range_nums = get_range((char **)&str, pint, n); if (range_nums < 0) break; /* * Decrement the result by one to leave out the * last number in the range. The next iteration * will handle the upper number in the range */ i += (range_nums - 1); } i++; if (res == 1) break; } ints[0] = i - 1; return (char *)str; } EXPORT_SYMBOL(get_options); /** * memparse - parse a string with mem suffixes into a number * @ptr: Where parse begins * @retptr: (output) Optional pointer to next char after parse completes * * Parses a string into a number. The number stored at @ptr is * potentially suffixed with K, M, G, T, P, E. */ unsigned long long memparse(const char *ptr, char **retptr) { char *endptr; /* local pointer to end of parsed string */ unsigned long long ret = simple_strtoull(ptr, &endptr, 0); switch (*endptr) { case 'E': case 'e': ret <<= 10; fallthrough; case 'P': case 'p': ret <<= 10; fallthrough; case 'T': case 't': ret <<= 10; fallthrough; case 'G': case 'g': ret <<= 10; fallthrough; case 'M': case 'm': ret <<= 10; fallthrough; case 'K': case 'k': ret <<= 10; endptr++; fallthrough; default: break; } if (retptr) *retptr = endptr; return ret; } EXPORT_SYMBOL(memparse); /** * parse_option_str - Parse a string and check an option is set or not * @str: String to be parsed * @option: option name * * This function parses a string containing a comma-separated list of * strings like a=b,c. * * Return true if there's such option in the string, or return false. */ bool parse_option_str(const char *str, const char *option) { while (*str) { if (!strncmp(str, option, strlen(option))) { str += strlen(option); if (!*str || *str == ',') return true; } while (*str && *str != ',') str++; if (*str == ',') str++; } return false; } /* * Parse a string to get a param value pair. * You can use " around spaces, but can't escape ". * Hyphens and underscores equivalent in parameter names. */ char *next_arg(char *args, char **param, char **val) { unsigned int i, equals = 0; int in_quote = 0, quoted = 0; if (*args == '"') { args++; in_quote = 1; quoted = 1; } for (i = 0; args[i]; i++) { if (isspace(args[i]) && !in_quote) break; if (equals == 0) { if (args[i] == '=') equals = i; } if (args[i] == '"') in_quote = !in_quote; } *param = args; if (!equals) *val = NULL; else { args[equals] = '\0'; *val = args + equals + 1; /* Don't include quotes in value. */ if (**val == '"') { (*val)++; if (args[i-1] == '"') args[i-1] = '\0'; } } if (quoted && i > 0 && args[i-1] == '"') args[i-1] = '\0'; if (args[i]) { args[i] = '\0'; args += i + 1; } else args += i; /* Chew up trailing spaces. */ return skip_spaces(args); } EXPORT_SYMBOL(next_arg); |
| 39 39 39 39 39 78 78 78 1 42 42 78 78 78 78 67 67 67 67 67 39 39 39 67 67 66 67 67 67 67 67 67 67 67 67 67 67 67 67 67 67 67 67 67 67 66 67 67 67 67 67 67 67 67 66 67 67 67 67 39 39 39 39 39 39 39 39 39 39 39 39 39 39 39 21 21 21 21 21 21 5 5 5 5 5 5 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 | // SPDX-License-Identifier: GPL-2.0-or-later /* Keyring handling * * Copyright (C) 2004-2005, 2008, 2013 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #include <linux/export.h> #include <linux/init.h> #include <linux/sched.h> #include <linux/slab.h> #include <linux/security.h> #include <linux/seq_file.h> #include <linux/err.h> #include <linux/user_namespace.h> #include <linux/nsproxy.h> #include <keys/keyring-type.h> #include <keys/user-type.h> #include <linux/assoc_array_priv.h> #include <linux/uaccess.h> #include <net/net_namespace.h> #include "internal.h" /* * When plumbing the depths of the key tree, this sets a hard limit * set on how deep we're willing to go. */ #define KEYRING_SEARCH_MAX_DEPTH 6 /* * We mark pointers we pass to the associative array with bit 1 set if * they're keyrings and clear otherwise. */ #define KEYRING_PTR_SUBTYPE 0x2UL static inline bool keyring_ptr_is_keyring(const struct assoc_array_ptr *x) { return (unsigned long)x & KEYRING_PTR_SUBTYPE; } static inline struct key *keyring_ptr_to_key(const struct assoc_array_ptr *x) { void *object = assoc_array_ptr_to_leaf(x); return (struct key *)((unsigned long)object & ~KEYRING_PTR_SUBTYPE); } static inline void *keyring_key_to_ptr(struct key *key) { if (key->type == &key_type_keyring) return (void *)((unsigned long)key | KEYRING_PTR_SUBTYPE); return key; } static DEFINE_RWLOCK(keyring_name_lock); /* * Clean up the bits of user_namespace that belong to us. */ void key_free_user_ns(struct user_namespace *ns) { write_lock(&keyring_name_lock); list_del_init(&ns->keyring_name_list); write_unlock(&keyring_name_lock); key_put(ns->user_keyring_register); #ifdef CONFIG_PERSISTENT_KEYRINGS key_put(ns->persistent_keyring_register); #endif } /* * The keyring key type definition. Keyrings are simply keys of this type and * can be treated as ordinary keys in addition to having their own special * operations. */ static int keyring_preparse(struct key_preparsed_payload *prep); static void keyring_free_preparse(struct key_preparsed_payload *prep); static int keyring_instantiate(struct key *keyring, struct key_preparsed_payload *prep); static void keyring_revoke(struct key *keyring); static void keyring_destroy(struct key *keyring); static void keyring_describe(const struct key *keyring, struct seq_file *m); static long keyring_read(const struct key *keyring, char *buffer, size_t buflen); struct key_type key_type_keyring = { .name = "keyring", .def_datalen = 0, .preparse = keyring_preparse, .free_preparse = keyring_free_preparse, .instantiate = keyring_instantiate, .revoke = keyring_revoke, .destroy = keyring_destroy, .describe = keyring_describe, .read = keyring_read, }; EXPORT_SYMBOL(key_type_keyring); /* * Semaphore to serialise link/link calls to prevent two link calls in parallel * introducing a cycle. */ static DEFINE_MUTEX(keyring_serialise_link_lock); /* * Publish the name of a keyring so that it can be found by name (if it has * one and it doesn't begin with a dot). */ static void keyring_publish_name(struct key *keyring) { struct user_namespace *ns = current_user_ns(); if (keyring->description && keyring->description[0] && keyring->description[0] != '.') { write_lock(&keyring_name_lock); list_add_tail(&keyring->name_link, &ns->keyring_name_list); write_unlock(&keyring_name_lock); } } /* * Preparse a keyring payload */ static int keyring_preparse(struct key_preparsed_payload *prep) { return prep->datalen != 0 ? -EINVAL : 0; } /* * Free a preparse of a user defined key payload */ static void keyring_free_preparse(struct key_preparsed_payload *prep) { } /* * Initialise a keyring. * * Returns 0 on success, -EINVAL if given any data. */ static int keyring_instantiate(struct key *keyring, struct key_preparsed_payload *prep) { assoc_array_init(&keyring->keys); /* make the keyring available by name if it has one */ keyring_publish_name(keyring); return 0; } /* * Multiply 64-bits by 32-bits to 96-bits and fold back to 64-bit. Ideally we'd * fold the carry back too, but that requires inline asm. */ static u64 mult_64x32_and_fold(u64 x, u32 y) { u64 hi = (u64)(u32)(x >> 32) * y; u64 lo = (u64)(u32)(x) * y; return lo + ((u64)(u32)hi << 32) + (u32)(hi >> 32); } /* * Hash a key type and description. */ static void hash_key_type_and_desc(struct keyring_index_key *index_key) { const unsigned level_shift = ASSOC_ARRAY_LEVEL_STEP; const unsigned long fan_mask = ASSOC_ARRAY_FAN_MASK; const char *description = index_key->description; unsigned long hash, type; u32 piece; u64 acc; int n, desc_len = index_key->desc_len; type = (unsigned long)index_key->type; acc = mult_64x32_and_fold(type, desc_len + 13); acc = mult_64x32_and_fold(acc, 9207); piece = (unsigned long)index_key->domain_tag; acc = mult_64x32_and_fold(acc, piece); acc = mult_64x32_and_fold(acc, 9207); for (;;) { n = desc_len; if (n <= 0) break; if (n > 4) n = 4; piece = 0; memcpy(&piece, description, n); description += n; desc_len -= n; acc = mult_64x32_and_fold(acc, piece); acc = mult_64x32_and_fold(acc, 9207); } /* Fold the hash down to 32 bits if need be. */ hash = acc; if (ASSOC_ARRAY_KEY_CHUNK_SIZE == 32) hash ^= acc >> 32; /* Squidge all the keyrings into a separate part of the tree to * ordinary keys by making sure the lowest level segment in the hash is * zero for keyrings and non-zero otherwise. */ if (index_key->type != &key_type_keyring && (hash & fan_mask) == 0) hash |= (hash >> (ASSOC_ARRAY_KEY_CHUNK_SIZE - level_shift)) | 1; else if (index_key->type == &key_type_keyring && (hash & fan_mask) != 0) hash = (hash + (hash << level_shift)) & ~fan_mask; index_key->hash = hash; } /* * Finalise an index key to include a part of the description actually in the * index key, to set the domain tag and to calculate the hash. */ void key_set_index_key(struct keyring_index_key *index_key) { static struct key_tag default_domain_tag = { .usage = REFCOUNT_INIT(1), }; size_t n = min_t(size_t, index_key->desc_len, sizeof(index_key->desc)); memcpy(index_key->desc, index_key->description, n); if (!index_key->domain_tag) { if (index_key->type->flags & KEY_TYPE_NET_DOMAIN) index_key->domain_tag = current->nsproxy->net_ns->key_domain; else index_key->domain_tag = &default_domain_tag; } hash_key_type_and_desc(index_key); } /** * key_put_tag - Release a ref on a tag. * @tag: The tag to release. * * This releases a reference the given tag and returns true if that ref was the * last one. */ bool key_put_tag(struct key_tag *tag) { if (refcount_dec_and_test(&tag->usage)) { kfree_rcu(tag, rcu); return true; } return false; } /** * key_remove_domain - Kill off a key domain and gc its keys * @domain_tag: The domain tag to release. * * This marks a domain tag as being dead and releases a ref on it. If that * wasn't the last reference, the garbage collector is poked to try and delete * all keys that were in the domain. */ void key_remove_domain(struct key_tag *domain_tag) { domain_tag->removed = true; if (!key_put_tag(domain_tag)) key_schedule_gc_links(); } /* * Build the next index key chunk. * * We return it one word-sized chunk at a time. */ static unsigned long keyring_get_key_chunk(const void *data, int level) { const struct keyring_index_key *index_key = data; unsigned long chunk = 0; const u8 *d; int desc_len = index_key->desc_len, n = sizeof(chunk); level /= ASSOC_ARRAY_KEY_CHUNK_SIZE; switch (level) { case 0: return index_key->hash; case 1: return index_key->x; case 2: return (unsigned long)index_key->type; case 3: return (unsigned long)index_key->domain_tag; default: level -= 4; if (desc_len <= sizeof(index_key->desc)) return 0; d = index_key->description + sizeof(index_key->desc); d += level * sizeof(long); desc_len -= sizeof(index_key->desc); if (desc_len > n) desc_len = n; do { chunk <<= 8; chunk |= *d++; } while (--desc_len > 0); return chunk; } } static unsigned long keyring_get_object_key_chunk(const void *object, int level) { const struct key *key = keyring_ptr_to_key(object); return keyring_get_key_chunk(&key->index_key, level); } static bool keyring_compare_object(const void *object, const void *data) { const struct keyring_index_key *index_key = data; const struct key *key = keyring_ptr_to_key(object); return key->index_key.type == index_key->type && key->index_key.domain_tag == index_key->domain_tag && key->index_key.desc_len == index_key->desc_len && memcmp(key->index_key.description, index_key->description, index_key->desc_len) == 0; } /* * Compare the index keys of a pair of objects and determine the bit position * at which they differ - if they differ. */ static int keyring_diff_objects(const void *object, const void *data) { const struct key *key_a = keyring_ptr_to_key(object); const struct keyring_index_key *a = &key_a->index_key; const struct keyring_index_key *b = data; unsigned long seg_a, seg_b; int level, i; level = 0; seg_a = a->hash; seg_b = b->hash; if ((seg_a ^ seg_b) != 0) goto differ; level += ASSOC_ARRAY_KEY_CHUNK_SIZE / 8; /* The number of bits contributed by the hash is controlled by a * constant in the assoc_array headers. Everything else thereafter we * can deal with as being machine word-size dependent. */ seg_a = a->x; seg_b = b->x; if ((seg_a ^ seg_b) != 0) goto differ; level += sizeof(unsigned long); /* The next bit may not work on big endian */ seg_a = (unsigned long)a->type; seg_b = (unsigned long)b->type; if ((seg_a ^ seg_b) != 0) goto differ; level += sizeof(unsigned long); seg_a = (unsigned long)a->domain_tag; seg_b = (unsigned long)b->domain_tag; if ((seg_a ^ seg_b) != 0) goto differ; level += sizeof(unsigned long); i = sizeof(a->desc); if (a->desc_len <= i) goto same; for (; i < a->desc_len; i++) { seg_a = *(unsigned char *)(a->description + i); seg_b = *(unsigned char *)(b->description + i); if ((seg_a ^ seg_b) != 0) goto differ_plus_i; } same: return -1; differ_plus_i: level += i; differ: i = level * 8 + __ffs(seg_a ^ seg_b); return i; } /* * Free an object after stripping the keyring flag off of the pointer. */ static void keyring_free_object(void *object) { key_put(keyring_ptr_to_key(object)); } /* * Operations for keyring management by the index-tree routines. */ static const struct assoc_array_ops keyring_assoc_array_ops = { .get_key_chunk = keyring_get_key_chunk, .get_object_key_chunk = keyring_get_object_key_chunk, .compare_object = keyring_compare_object, .diff_objects = keyring_diff_objects, .free_object = keyring_free_object, }; /* * Clean up a keyring when it is destroyed. Unpublish its name if it had one * and dispose of its data. * * The garbage collector detects the final key_put(), removes the keyring from * the serial number tree and then does RCU synchronisation before coming here, * so we shouldn't need to worry about code poking around here with the RCU * readlock held by this time. */ static void keyring_destroy(struct key *keyring) { if (keyring->description) { write_lock(&keyring_name_lock); if (keyring->name_link.next != NULL && !list_empty(&keyring->name_link)) list_del(&keyring->name_link); write_unlock(&keyring_name_lock); } if (keyring->restrict_link) { struct key_restriction *keyres = keyring->restrict_link; key_put(keyres->key); kfree(keyres); } assoc_array_destroy(&keyring->keys, &keyring_assoc_array_ops); } /* * Describe a keyring for /proc. */ static void keyring_describe(const struct key *keyring, struct seq_file *m) { if (keyring->description) seq_puts(m, keyring->description); else seq_puts(m, "[anon]"); if (key_is_positive(keyring)) { if (keyring->keys.nr_leaves_on_tree != 0) seq_printf(m, ": %lu", keyring->keys.nr_leaves_on_tree); else seq_puts(m, ": empty"); } } struct keyring_read_iterator_context { size_t buflen; size_t count; key_serial_t *buffer; }; static int keyring_read_iterator(const void *object, void *data) { struct keyring_read_iterator_context *ctx = data; const struct key *key = keyring_ptr_to_key(object); kenter("{%s,%d},,{%zu/%zu}", key->type->name, key->serial, ctx->count, ctx->buflen); if (ctx->count >= ctx->buflen) return 1; *ctx->buffer++ = key->serial; ctx->count += sizeof(key->serial); return 0; } /* * Read a list of key IDs from the keyring's contents in binary form * * The keyring's semaphore is read-locked by the caller. This prevents someone * from modifying it under us - which could cause us to read key IDs multiple * times. */ static long keyring_read(const struct key *keyring, char *buffer, size_t buflen) { struct keyring_read_iterator_context ctx; long ret; kenter("{%d},,%zu", key_serial(keyring), buflen); if (buflen & (sizeof(key_serial_t) - 1)) return -EINVAL; /* Copy as many key IDs as fit into the buffer */ if (buffer && buflen) { ctx.buffer = (key_serial_t *)buffer; ctx.buflen = buflen; ctx.count = 0; ret = assoc_array_iterate(&keyring->keys, keyring_read_iterator, &ctx); if (ret < 0) { kleave(" = %ld [iterate]", ret); return ret; } } /* Return the size of the buffer needed */ ret = keyring->keys.nr_leaves_on_tree * sizeof(key_serial_t); if (ret <= buflen) kleave("= %ld [ok]", ret); else kleave("= %ld [buffer too small]", ret); return ret; } /* * Allocate a keyring and link into the destination keyring. */ struct key *keyring_alloc(const char *description, kuid_t uid, kgid_t gid, const struct cred *cred, key_perm_t perm, unsigned long flags, struct key_restriction *restrict_link, struct key *dest) { struct key *keyring; int ret; keyring = key_alloc(&key_type_keyring, description, uid, gid, cred, perm, flags, restrict_link); if (!IS_ERR(keyring)) { ret = key_instantiate_and_link(keyring, NULL, 0, dest, NULL); if (ret < 0) { key_put(keyring); keyring = ERR_PTR(ret); } } return keyring; } EXPORT_SYMBOL(keyring_alloc); /** * restrict_link_reject - Give -EPERM to restrict link * @keyring: The keyring being added to. * @type: The type of key being added. * @payload: The payload of the key intended to be added. * @restriction_key: Keys providing additional data for evaluating restriction. * * Reject the addition of any links to a keyring. It can be overridden by * passing KEY_ALLOC_BYPASS_RESTRICTION to key_instantiate_and_link() when * adding a key to a keyring. * * This is meant to be stored in a key_restriction structure which is passed * in the restrict_link parameter to keyring_alloc(). */ int restrict_link_reject(struct key *keyring, const struct key_type *type, const union key_payload *payload, struct key *restriction_key) { return -EPERM; } /* * By default, we keys found by getting an exact match on their descriptions. */ bool key_default_cmp(const struct key *key, const struct key_match_data *match_data) { return strcmp(key->description, match_data->raw_data) == 0; } /* * Iteration function to consider each key found. */ static int keyring_search_iterator(const void *object, void *iterator_data) { struct keyring_search_context *ctx = iterator_data; const struct key *key = keyring_ptr_to_key(object); unsigned long kflags = READ_ONCE(key->flags); short state = READ_ONCE(key->state); kenter("{%d}", key->serial); /* ignore keys not of this type */ if (key->type != ctx->index_key.type) { kleave(" = 0 [!type]"); return 0; } /* skip invalidated, revoked and expired keys */ if (ctx->flags & KEYRING_SEARCH_DO_STATE_CHECK) { time64_t expiry = READ_ONCE(key->expiry); if (kflags & ((1 << KEY_FLAG_INVALIDATED) | (1 << KEY_FLAG_REVOKED))) { ctx->result = ERR_PTR(-EKEYREVOKED); kleave(" = %d [invrev]", ctx->skipped_ret); goto skipped; } if (expiry && ctx->now >= expiry) { if (!(ctx->flags & KEYRING_SEARCH_SKIP_EXPIRED)) ctx->result = ERR_PTR(-EKEYEXPIRED); kleave(" = %d [expire]", ctx->skipped_ret); goto skipped; } } /* keys that don't match */ if (!ctx->match_data.cmp(key, &ctx->match_data)) { kleave(" = 0 [!match]"); return 0; } /* key must have search permissions */ if (!(ctx->flags & KEYRING_SEARCH_NO_CHECK_PERM) && key_task_permission(make_key_ref(key, ctx->possessed), ctx->cred, KEY_NEED_SEARCH) < 0) { ctx->result = ERR_PTR(-EACCES); kleave(" = %d [!perm]", ctx->skipped_ret); goto skipped; } if (ctx->flags & KEYRING_SEARCH_DO_STATE_CHECK) { /* we set a different error code if we pass a negative key */ if (state < 0) { ctx->result = ERR_PTR(state); kleave(" = %d [neg]", ctx->skipped_ret); goto skipped; } } /* Found */ ctx->result = make_key_ref(key, ctx->possessed); kleave(" = 1 [found]"); return 1; skipped: return ctx->skipped_ret; } /* * Search inside a keyring for a key. We can search by walking to it * directly based on its index-key or we can iterate over the entire * tree looking for it, based on the match function. */ static int search_keyring(struct key *keyring, struct keyring_search_context *ctx) { if (ctx->match_data.lookup_type == KEYRING_SEARCH_LOOKUP_DIRECT) { const void *object; object = assoc_array_find(&keyring->keys, &keyring_assoc_array_ops, &ctx->index_key); return object ? ctx->iterator(object, ctx) : 0; } return assoc_array_iterate(&keyring->keys, ctx->iterator, ctx); } /* * Search a tree of keyrings that point to other keyrings up to the maximum * depth. */ static bool search_nested_keyrings(struct key *keyring, struct keyring_search_context *ctx) { struct { struct key *keyring; struct assoc_array_node *node; int slot; } stack[KEYRING_SEARCH_MAX_DEPTH]; struct assoc_array_shortcut *shortcut; struct assoc_array_node *node; struct assoc_array_ptr *ptr; struct key *key; int sp = 0, slot; kenter("{%d},{%s,%s}", keyring->serial, ctx->index_key.type->name, ctx->index_key.description); #define STATE_CHECKS (KEYRING_SEARCH_NO_STATE_CHECK | KEYRING_SEARCH_DO_STATE_CHECK) BUG_ON((ctx->flags & STATE_CHECKS) == 0 || (ctx->flags & STATE_CHECKS) == STATE_CHECKS); if (ctx->index_key.description) key_set_index_key(&ctx->index_key); /* Check to see if this top-level keyring is what we are looking for * and whether it is valid or not. */ if (ctx->match_data.lookup_type == KEYRING_SEARCH_LOOKUP_ITERATE || keyring_compare_object(keyring, &ctx->index_key)) { ctx->skipped_ret = 2; switch (ctx->iterator(keyring_key_to_ptr(keyring), ctx)) { case 1: goto found; case 2: return false; default: break; } } ctx->skipped_ret = 0; /* Start processing a new keyring */ descend_to_keyring: kdebug("descend to %d", keyring->serial); if (keyring->flags & ((1 << KEY_FLAG_INVALIDATED) | (1 << KEY_FLAG_REVOKED))) goto not_this_keyring; /* Search through the keys in this keyring before its searching its * subtrees. */ if (search_keyring(keyring, ctx)) goto found; /* Then manually iterate through the keyrings nested in this one. * * Start from the root node of the index tree. Because of the way the * hash function has been set up, keyrings cluster on the leftmost * branch of the root node (root slot 0) or in the root node itself. * Non-keyrings avoid the leftmost branch of the root entirely (root * slots 1-15). */ if (!(ctx->flags & KEYRING_SEARCH_RECURSE)) goto not_this_keyring; ptr = READ_ONCE(keyring->keys.root); if (!ptr) goto not_this_keyring; if (assoc_array_ptr_is_shortcut(ptr)) { /* If the root is a shortcut, either the keyring only contains * keyring pointers (everything clusters behind root slot 0) or * doesn't contain any keyring pointers. */ shortcut = assoc_array_ptr_to_shortcut(ptr); if ((shortcut->index_key[0] & ASSOC_ARRAY_FAN_MASK) != 0) goto not_this_keyring; ptr = READ_ONCE(shortcut->next_node); node = assoc_array_ptr_to_node(ptr); goto begin_node; } node = assoc_array_ptr_to_node(ptr); ptr = node->slots[0]; if (!assoc_array_ptr_is_meta(ptr)) goto begin_node; descend_to_node: /* Descend to a more distal node in this keyring's content tree and go * through that. */ kdebug("descend"); if (assoc_array_ptr_is_shortcut(ptr)) { shortcut = assoc_array_ptr_to_shortcut(ptr); ptr = READ_ONCE(shortcut->next_node); BUG_ON(!assoc_array_ptr_is_node(ptr)); } node = assoc_array_ptr_to_node(ptr); begin_node: kdebug("begin_node"); slot = 0; ascend_to_node: /* Go through the slots in a node */ for (; slot < ASSOC_ARRAY_FAN_OUT; slot++) { ptr = READ_ONCE(node->slots[slot]); if (assoc_array_ptr_is_meta(ptr) && node->back_pointer) goto descend_to_node; if (!keyring_ptr_is_keyring(ptr)) continue; key = keyring_ptr_to_key(ptr); if (sp >= KEYRING_SEARCH_MAX_DEPTH) { if (ctx->flags & KEYRING_SEARCH_DETECT_TOO_DEEP) { ctx->result = ERR_PTR(-ELOOP); return false; } goto not_this_keyring; } /* Search a nested keyring */ if (!(ctx->flags & KEYRING_SEARCH_NO_CHECK_PERM) && key_task_permission(make_key_ref(key, ctx->possessed), ctx->cred, KEY_NEED_SEARCH) < 0) continue; /* stack the current position */ stack[sp].keyring = keyring; stack[sp].node = node; stack[sp].slot = slot; sp++; /* begin again with the new keyring */ keyring = key; goto descend_to_keyring; } /* We've dealt with all the slots in the current node, so now we need * to ascend to the parent and continue processing there. */ ptr = READ_ONCE(node->back_pointer); slot = node->parent_slot; if (ptr && assoc_array_ptr_is_shortcut(ptr)) { shortcut = assoc_array_ptr_to_shortcut(ptr); ptr = READ_ONCE(shortcut->back_pointer); slot = shortcut->parent_slot; } if (!ptr) goto not_this_keyring; node = assoc_array_ptr_to_node(ptr); slot++; /* If we've ascended to the root (zero backpointer), we must have just * finished processing the leftmost branch rather than the root slots - * so there can't be any more keyrings for us to find. */ if (node->back_pointer) { kdebug("ascend %d", slot); goto ascend_to_node; } /* The keyring we're looking at was disqualified or didn't contain a * matching key. */ not_this_keyring: kdebug("not_this_keyring %d", sp); if (sp <= 0) { kleave(" = false"); return false; } /* Resume the processing of a keyring higher up in the tree */ sp--; keyring = stack[sp].keyring; node = stack[sp].node; slot = stack[sp].slot + 1; kdebug("ascend to %d [%d]", keyring->serial, slot); goto ascend_to_node; /* We found a viable match */ found: key = key_ref_to_ptr(ctx->result); key_check(key); if (!(ctx->flags & KEYRING_SEARCH_NO_UPDATE_TIME)) { key->last_used_at = ctx->now; keyring->last_used_at = ctx->now; while (sp > 0) stack[--sp].keyring->last_used_at = ctx->now; } kleave(" = true"); return true; } /** * keyring_search_rcu - Search a keyring tree for a matching key under RCU * @keyring_ref: A pointer to the keyring with possession indicator. * @ctx: The keyring search context. * * Search the supplied keyring tree for a key that matches the criteria given. * The root keyring and any linked keyrings must grant Search permission to the * caller to be searchable and keys can only be found if they too grant Search * to the caller. The possession flag on the root keyring pointer controls use * of the possessor bits in permissions checking of the entire tree. In * addition, the LSM gets to forbid keyring searches and key matches. * * The search is performed as a breadth-then-depth search up to the prescribed * limit (KEYRING_SEARCH_MAX_DEPTH). The caller must hold the RCU read lock to * prevent keyrings from being destroyed or rearranged whilst they are being * searched. * * Keys are matched to the type provided and are then filtered by the match * function, which is given the description to use in any way it sees fit. The * match function may use any attributes of a key that it wishes to * determine the match. Normally the match function from the key type would be * used. * * RCU can be used to prevent the keyring key lists from disappearing without * the need to take lots of locks. * * Returns a pointer to the found key and increments the key usage count if * successful; -EAGAIN if no matching keys were found, or if expired or revoked * keys were found; -ENOKEY if only negative keys were found; -ENOTDIR if the * specified keyring wasn't a keyring. * * In the case of a successful return, the possession attribute from * @keyring_ref is propagated to the returned key reference. */ key_ref_t keyring_search_rcu(key_ref_t keyring_ref, struct keyring_search_context *ctx) { struct key *keyring; long err; ctx->iterator = keyring_search_iterator; ctx->possessed = is_key_possessed(keyring_ref); ctx->result = ERR_PTR(-EAGAIN); keyring = key_ref_to_ptr(keyring_ref); key_check(keyring); if (keyring->type != &key_type_keyring) return ERR_PTR(-ENOTDIR); if (!(ctx->flags & KEYRING_SEARCH_NO_CHECK_PERM)) { err = key_task_permission(keyring_ref, ctx->cred, KEY_NEED_SEARCH); if (err < 0) return ERR_PTR(err); } ctx->now = ktime_get_real_seconds(); if (search_nested_keyrings(keyring, ctx)) __key_get(key_ref_to_ptr(ctx->result)); return ctx->result; } /** * keyring_search - Search the supplied keyring tree for a matching key * @keyring: The root of the keyring tree to be searched. * @type: The type of keyring we want to find. * @description: The name of the keyring we want to find. * @recurse: True to search the children of @keyring also * * As keyring_search_rcu() above, but using the current task's credentials and * type's default matching function and preferred search method. */ key_ref_t keyring_search(key_ref_t keyring, struct key_type *type, const char *description, bool recurse) { struct keyring_search_context ctx = { .index_key.type = type, .index_key.description = description, .index_key.desc_len = strlen(description), .cred = current_cred(), .match_data.cmp = key_default_cmp, .match_data.raw_data = description, .match_data.lookup_type = KEYRING_SEARCH_LOOKUP_DIRECT, .flags = KEYRING_SEARCH_DO_STATE_CHECK, }; key_ref_t key; int ret; if (recurse) ctx.flags |= KEYRING_SEARCH_RECURSE; if (type->match_preparse) { ret = type->match_preparse(&ctx.match_data); if (ret < 0) return ERR_PTR(ret); } rcu_read_lock(); key = keyring_search_rcu(keyring, &ctx); rcu_read_unlock(); if (type->match_free) type->match_free(&ctx.match_data); return key; } EXPORT_SYMBOL(keyring_search); static struct key_restriction *keyring_restriction_alloc( key_restrict_link_func_t check) { struct key_restriction *keyres = kzalloc(sizeof(struct key_restriction), GFP_KERNEL); if (!keyres) return ERR_PTR(-ENOMEM); keyres->check = check; return keyres; } /* * Semaphore to serialise restriction setup to prevent reference count * cycles through restriction key pointers. */ static DECLARE_RWSEM(keyring_serialise_restrict_sem); /* * Check for restriction cycles that would prevent keyring garbage collection. * keyring_serialise_restrict_sem must be held. */ static bool keyring_detect_restriction_cycle(const struct key *dest_keyring, struct key_restriction *keyres) { while (keyres && keyres->key && keyres->key->type == &key_type_keyring) { if (keyres->key == dest_keyring) return true; keyres = keyres->key->restrict_link; } return false; } /** * keyring_restrict - Look up and apply a restriction to a keyring * @keyring_ref: The keyring to be restricted * @type: The key type that will provide the restriction checker. * @restriction: The restriction options to apply to the keyring * * Look up a keyring and apply a restriction to it. The restriction is managed * by the specific key type, but can be configured by the options specified in * the restriction string. */ int keyring_restrict(key_ref_t keyring_ref, const char *type, const char *restriction) { struct key *keyring; struct key_type *restrict_type = NULL; struct key_restriction *restrict_link; int ret = 0; keyring = key_ref_to_ptr(keyring_ref); key_check(keyring); if (keyring->type != &key_type_keyring) return -ENOTDIR; if (!type) { restrict_link = keyring_restriction_alloc(restrict_link_reject); } else { restrict_type = key_type_lookup(type); if (IS_ERR(restrict_type)) return PTR_ERR(restrict_type); if (!restrict_type->lookup_restriction) { ret = -ENOENT; goto error; } restrict_link = restrict_type->lookup_restriction(restriction); } if (IS_ERR(restrict_link)) { ret = PTR_ERR(restrict_link); goto error; } down_write(&keyring->sem); down_write(&keyring_serialise_restrict_sem); if (keyring->restrict_link) { ret = -EEXIST; } else if (keyring_detect_restriction_cycle(keyring, restrict_link)) { ret = -EDEADLK; } else { keyring->restrict_link = restrict_link; notify_key(keyring, NOTIFY_KEY_SETATTR, 0); } up_write(&keyring_serialise_restrict_sem); up_write(&keyring->sem); if (ret < 0) { key_put(restrict_link->key); kfree(restrict_link); } error: if (restrict_type) key_type_put(restrict_type); return ret; } EXPORT_SYMBOL(keyring_restrict); /* * Search the given keyring for a key that might be updated. * * The caller must guarantee that the keyring is a keyring and that the * permission is granted to modify the keyring as no check is made here. The * caller must also hold a lock on the keyring semaphore. * * Returns a pointer to the found key with usage count incremented if * successful and returns NULL if not found. Revoked and invalidated keys are * skipped over. * * If successful, the possession indicator is propagated from the keyring ref * to the returned key reference. */ key_ref_t find_key_to_update(key_ref_t keyring_ref, const struct keyring_index_key *index_key) { struct key *keyring, *key; const void *object; keyring = key_ref_to_ptr(keyring_ref); kenter("{%d},{%s,%s}", keyring->serial, index_key->type->name, index_key->description); object = assoc_array_find(&keyring->keys, &keyring_assoc_array_ops, index_key); if (object) goto found; kleave(" = NULL"); return NULL; found: key = keyring_ptr_to_key(object); if (key->flags & ((1 << KEY_FLAG_INVALIDATED) | (1 << KEY_FLAG_REVOKED))) { kleave(" = NULL [x]"); return NULL; } __key_get(key); kleave(" = {%d}", key->serial); return make_key_ref(key, is_key_possessed(keyring_ref)); } /* * Find a keyring with the specified name. * * Only keyrings that have nonzero refcount, are not revoked, and are owned by a * user in the current user namespace are considered. If @uid_keyring is %true, * the keyring additionally must have been allocated as a user or user session * keyring; otherwise, it must grant Search permission directly to the caller. * * Returns a pointer to the keyring with the keyring's refcount having being * incremented on success. -ENOKEY is returned if a key could not be found. */ struct key *find_keyring_by_name(const char *name, bool uid_keyring) { struct user_namespace *ns = current_user_ns(); struct key *keyring; if (!name) return ERR_PTR(-EINVAL); read_lock(&keyring_name_lock); /* Search this hash bucket for a keyring with a matching name that * grants Search permission and that hasn't been revoked */ list_for_each_entry(keyring, &ns->keyring_name_list, name_link) { if (!kuid_has_mapping(ns, keyring->user->uid)) continue; if (test_bit(KEY_FLAG_REVOKED, &keyring->flags)) continue; if (strcmp(keyring->description, name) != 0) continue; if (uid_keyring) { if (!test_bit(KEY_FLAG_UID_KEYRING, &keyring->flags)) continue; } else { if (key_permission(make_key_ref(keyring, 0), KEY_NEED_SEARCH) < 0) continue; } /* we've got a match but we might end up racing with * key_cleanup() if the keyring is currently 'dead' * (ie. it has a zero usage count) */ if (!refcount_inc_not_zero(&keyring->usage)) continue; keyring->last_used_at = ktime_get_real_seconds(); goto out; } keyring = ERR_PTR(-ENOKEY); out: read_unlock(&keyring_name_lock); return keyring; } static int keyring_detect_cycle_iterator(const void *object, void *iterator_data) { struct keyring_search_context *ctx = iterator_data; const struct key *key = keyring_ptr_to_key(object); kenter("{%d}", key->serial); /* We might get a keyring with matching index-key that is nonetheless a * different keyring. */ if (key != ctx->match_data.raw_data) return 0; ctx->result = ERR_PTR(-EDEADLK); return 1; } /* * See if a cycle will be created by inserting acyclic tree B in acyclic * tree A at the topmost level (ie: as a direct child of A). * * Since we are adding B to A at the top level, checking for cycles should just * be a matter of seeing if node A is somewhere in tree B. */ static int keyring_detect_cycle(struct key *A, struct key *B) { struct keyring_search_context ctx = { .index_key = A->index_key, .match_data.raw_data = A, .match_data.lookup_type = KEYRING_SEARCH_LOOKUP_DIRECT, .iterator = keyring_detect_cycle_iterator, .flags = (KEYRING_SEARCH_NO_STATE_CHECK | KEYRING_SEARCH_NO_UPDATE_TIME | KEYRING_SEARCH_NO_CHECK_PERM | KEYRING_SEARCH_DETECT_TOO_DEEP | KEYRING_SEARCH_RECURSE), }; rcu_read_lock(); search_nested_keyrings(B, &ctx); rcu_read_unlock(); return PTR_ERR(ctx.result) == -EAGAIN ? 0 : PTR_ERR(ctx.result); } /* * Lock keyring for link. */ int __key_link_lock(struct key *keyring, const struct keyring_index_key *index_key) __acquires(&keyring->sem) __acquires(&keyring_serialise_link_lock) { if (keyring->type != &key_type_keyring) return -ENOTDIR; down_write(&keyring->sem); /* Serialise link/link calls to prevent parallel calls causing a cycle * when linking two keyring in opposite orders. */ if (index_key->type == &key_type_keyring) mutex_lock(&keyring_serialise_link_lock); return 0; } /* * Lock keyrings for move (link/unlink combination). */ int __key_move_lock(struct key *l_keyring, struct key *u_keyring, const struct keyring_index_key *index_key) __acquires(&l_keyring->sem) __acquires(&u_keyring->sem) __acquires(&keyring_serialise_link_lock) { if (l_keyring->type != &key_type_keyring || u_keyring->type != &key_type_keyring) return -ENOTDIR; /* We have to be very careful here to take the keyring locks in the * right order, lest we open ourselves to deadlocking against another * move operation. */ if (l_keyring < u_keyring) { down_write(&l_keyring->sem); down_write_nested(&u_keyring->sem, 1); } else { down_write(&u_keyring->sem); down_write_nested(&l_keyring->sem, 1); } /* Serialise link/link calls to prevent parallel calls causing a cycle * when linking two keyring in opposite orders. */ if (index_key->type == &key_type_keyring) mutex_lock(&keyring_serialise_link_lock); return 0; } /* * Preallocate memory so that a key can be linked into to a keyring. */ int __key_link_begin(struct key *keyring, const struct keyring_index_key *index_key, struct assoc_array_edit **_edit) { struct assoc_array_edit *edit; int ret; kenter("%d,%s,%s,", keyring->serial, index_key->type->name, index_key->description); BUG_ON(index_key->desc_len == 0); BUG_ON(*_edit != NULL); *_edit = NULL; ret = -EKEYREVOKED; if (test_bit(KEY_FLAG_REVOKED, &keyring->flags)) goto error; /* Create an edit script that will insert/replace the key in the * keyring tree. */ edit = assoc_array_insert(&keyring->keys, &keyring_assoc_array_ops, index_key, NULL); if (IS_ERR(edit)) { ret = PTR_ERR(edit); goto error; } /* If we're not replacing a link in-place then we're going to need some * extra quota. */ if (!edit->dead_leaf) { ret = key_payload_reserve(keyring, keyring->datalen + KEYQUOTA_LINK_BYTES); if (ret < 0) goto error_cancel; } *_edit = edit; kleave(" = 0"); return 0; error_cancel: assoc_array_cancel_edit(edit); error: kleave(" = %d", ret); return ret; } /* * Check already instantiated keys aren't going to be a problem. * * The caller must have called __key_link_begin(). Don't need to call this for * keys that were created since __key_link_begin() was called. */ int __key_link_check_live_key(struct key *keyring, struct key *key) { if (key->type == &key_type_keyring) /* check that we aren't going to create a cycle by linking one * keyring to another */ return keyring_detect_cycle(keyring, key); return 0; } /* * Link a key into to a keyring. * * Must be called with __key_link_begin() having being called. Discards any * already extant link to matching key if there is one, so that each keyring * holds at most one link to any given key of a particular type+description * combination. */ void __key_link(struct key *keyring, struct key *key, struct assoc_array_edit **_edit) { __key_get(key); assoc_array_insert_set_object(*_edit, keyring_key_to_ptr(key)); assoc_array_apply_edit(*_edit); *_edit = NULL; notify_key(keyring, NOTIFY_KEY_LINKED, key_serial(key)); } /* * Finish linking a key into to a keyring. * * Must be called with __key_link_begin() having being called. */ void __key_link_end(struct key *keyring, const struct keyring_index_key *index_key, struct assoc_array_edit *edit) __releases(&keyring->sem) __releases(&keyring_serialise_link_lock) { BUG_ON(index_key->type == NULL); kenter("%d,%s,", keyring->serial, index_key->type->name); if (edit) { if (!edit->dead_leaf) { key_payload_reserve(keyring, keyring->datalen - KEYQUOTA_LINK_BYTES); } assoc_array_cancel_edit(edit); } up_write(&keyring->sem); if (index_key->type == &key_type_keyring) mutex_unlock(&keyring_serialise_link_lock); } /* * Check addition of keys to restricted keyrings. */ static int __key_link_check_restriction(struct key *keyring, struct key *key) { if (!keyring->restrict_link || !keyring->restrict_link->check) return 0; return keyring->restrict_link->check(keyring, key->type, &key->payload, keyring->restrict_link->key); } /** * key_link - Link a key to a keyring * @keyring: The keyring to make the link in. * @key: The key to link to. * * Make a link in a keyring to a key, such that the keyring holds a reference * on that key and the key can potentially be found by searching that keyring. * * This function will write-lock the keyring's semaphore and will consume some * of the user's key data quota to hold the link. * * Returns 0 if successful, -ENOTDIR if the keyring isn't a keyring, * -EKEYREVOKED if the keyring has been revoked, -ENFILE if the keyring is * full, -EDQUOT if there is insufficient key data quota remaining to add * another link or -ENOMEM if there's insufficient memory. * * It is assumed that the caller has checked that it is permitted for a link to * be made (the keyring should have Write permission and the key Link * permission). */ int key_link(struct key *keyring, struct key *key) { struct assoc_array_edit *edit = NULL; int ret; kenter("{%d,%d}", keyring->serial, refcount_read(&keyring->usage)); key_check(keyring); key_check(key); ret = __key_link_lock(keyring, &key->index_key); if (ret < 0) goto error; ret = __key_link_begin(keyring, &key->index_key, &edit); if (ret < 0) goto error_end; kdebug("begun {%d,%d}", keyring->serial, refcount_read(&keyring->usage)); ret = __key_link_check_restriction(keyring, key); if (ret == 0) ret = __key_link_check_live_key(keyring, key); if (ret == 0) __key_link(keyring, key, &edit); error_end: __key_link_end(keyring, &key->index_key, edit); error: kleave(" = %d {%d,%d}", ret, keyring->serial, refcount_read(&keyring->usage)); return ret; } EXPORT_SYMBOL(key_link); /* * Lock a keyring for unlink. */ static int __key_unlink_lock(struct key *keyring) __acquires(&keyring->sem) { if (keyring->type != &key_type_keyring) return -ENOTDIR; down_write(&keyring->sem); return 0; } /* * Begin the process of unlinking a key from a keyring. */ static int __key_unlink_begin(struct key *keyring, struct key *key, struct assoc_array_edit **_edit) { struct assoc_array_edit *edit; BUG_ON(*_edit != NULL); edit = assoc_array_delete(&keyring->keys, &keyring_assoc_array_ops, &key->index_key); if (IS_ERR(edit)) return PTR_ERR(edit); if (!edit) return -ENOENT; *_edit = edit; return 0; } /* * Apply an unlink change. */ static void __key_unlink(struct key *keyring, struct key *key, struct assoc_array_edit **_edit) { assoc_array_apply_edit(*_edit); notify_key(keyring, NOTIFY_KEY_UNLINKED, key_serial(key)); *_edit = NULL; key_payload_reserve(keyring, keyring->datalen - KEYQUOTA_LINK_BYTES); } /* * Finish unlinking a key from to a keyring. */ static void __key_unlink_end(struct key *keyring, struct key *key, struct assoc_array_edit *edit) __releases(&keyring->sem) { if (edit) assoc_array_cancel_edit(edit); up_write(&keyring->sem); } /** * key_unlink - Unlink the first link to a key from a keyring. * @keyring: The keyring to remove the link from. * @key: The key the link is to. * * Remove a link from a keyring to a key. * * This function will write-lock the keyring's semaphore. * * Returns 0 if successful, -ENOTDIR if the keyring isn't a keyring, -ENOENT if * the key isn't linked to by the keyring or -ENOMEM if there's insufficient * memory. * * It is assumed that the caller has checked that it is permitted for a link to * be removed (the keyring should have Write permission; no permissions are * required on the key). */ int key_unlink(struct key *keyring, struct key *key) { struct assoc_array_edit *edit = NULL; int ret; key_check(keyring); key_check(key); ret = __key_unlink_lock(keyring); if (ret < 0) return ret; ret = __key_unlink_begin(keyring, key, &edit); if (ret == 0) __key_unlink(keyring, key, &edit); __key_unlink_end(keyring, key, edit); return ret; } EXPORT_SYMBOL(key_unlink); /** * key_move - Move a key from one keyring to another * @key: The key to move * @from_keyring: The keyring to remove the link from. * @to_keyring: The keyring to make the link in. * @flags: Qualifying flags, such as KEYCTL_MOVE_EXCL. * * Make a link in @to_keyring to a key, such that the keyring holds a reference * on that key and the key can potentially be found by searching that keyring * whilst simultaneously removing a link to the key from @from_keyring. * * This function will write-lock both keyring's semaphores and will consume * some of the user's key data quota to hold the link on @to_keyring. * * Returns 0 if successful, -ENOTDIR if either keyring isn't a keyring, * -EKEYREVOKED if either keyring has been revoked, -ENFILE if the second * keyring is full, -EDQUOT if there is insufficient key data quota remaining * to add another link or -ENOMEM if there's insufficient memory. If * KEYCTL_MOVE_EXCL is set, then -EEXIST will be returned if there's already a * matching key in @to_keyring. * * It is assumed that the caller has checked that it is permitted for a link to * be made (the keyring should have Write permission and the key Link * permission). */ int key_move(struct key *key, struct key *from_keyring, struct key *to_keyring, unsigned int flags) { struct assoc_array_edit *from_edit = NULL, *to_edit = NULL; int ret; kenter("%d,%d,%d", key->serial, from_keyring->serial, to_keyring->serial); if (from_keyring == to_keyring) return 0; key_check(key); key_check(from_keyring); key_check(to_keyring); ret = __key_move_lock(from_keyring, to_keyring, &key->index_key); if (ret < 0) goto out; ret = __key_unlink_begin(from_keyring, key, &from_edit); if (ret < 0) goto error; ret = __key_link_begin(to_keyring, &key->index_key, &to_edit); if (ret < 0) goto error; ret = -EEXIST; if (to_edit->dead_leaf && (flags & KEYCTL_MOVE_EXCL)) goto error; ret = __key_link_check_restriction(to_keyring, key); if (ret < 0) goto error; ret = __key_link_check_live_key(to_keyring, key); if (ret < 0) goto error; __key_unlink(from_keyring, key, &from_edit); __key_link(to_keyring, key, &to_edit); error: __key_link_end(to_keyring, &key->index_key, to_edit); __key_unlink_end(from_keyring, key, from_edit); out: kleave(" = %d", ret); return ret; } EXPORT_SYMBOL(key_move); /** * keyring_clear - Clear a keyring * @keyring: The keyring to clear. * * Clear the contents of the specified keyring. * * Returns 0 if successful or -ENOTDIR if the keyring isn't a keyring. */ int keyring_clear(struct key *keyring) { struct assoc_array_edit *edit; int ret; if (keyring->type != &key_type_keyring) return -ENOTDIR; down_write(&keyring->sem); edit = assoc_array_clear(&keyring->keys, &keyring_assoc_array_ops); if (IS_ERR(edit)) { ret = PTR_ERR(edit); } else { if (edit) assoc_array_apply_edit(edit); notify_key(keyring, NOTIFY_KEY_CLEARED, 0); key_payload_reserve(keyring, 0); ret = 0; } up_write(&keyring->sem); return ret; } EXPORT_SYMBOL(keyring_clear); /* * Dispose of the links from a revoked keyring. * * This is called with the key sem write-locked. */ static void keyring_revoke(struct key *keyring) { struct assoc_array_edit *edit; edit = assoc_array_clear(&keyring->keys, &keyring_assoc_array_ops); if (!IS_ERR(edit)) { if (edit) assoc_array_apply_edit(edit); key_payload_reserve(keyring, 0); } } static bool keyring_gc_select_iterator(void *object, void *iterator_data) { struct key *key = keyring_ptr_to_key(object); time64_t *limit = iterator_data; if (key_is_dead(key, *limit)) return false; key_get(key); return true; } static int keyring_gc_check_iterator(const void *object, void *iterator_data) { const struct key *key = keyring_ptr_to_key(object); time64_t *limit = iterator_data; key_check(key); return key_is_dead(key, *limit); } /* * Garbage collect pointers from a keyring. * * Not called with any locks held. The keyring's key struct will not be * deallocated under us as only our caller may deallocate it. */ void keyring_gc(struct key *keyring, time64_t limit) { int result; kenter("%x{%s}", keyring->serial, keyring->description ?: ""); if (keyring->flags & ((1 << KEY_FLAG_INVALIDATED) | (1 << KEY_FLAG_REVOKED))) goto dont_gc; /* scan the keyring looking for dead keys */ rcu_read_lock(); result = assoc_array_iterate(&keyring->keys, keyring_gc_check_iterator, &limit); rcu_read_unlock(); if (result == true) goto do_gc; dont_gc: kleave(" [no gc]"); return; do_gc: down_write(&keyring->sem); assoc_array_gc(&keyring->keys, &keyring_assoc_array_ops, keyring_gc_select_iterator, &limit); up_write(&keyring->sem); kleave(" [gc]"); } /* * Garbage collect restriction pointers from a keyring. * * Keyring restrictions are associated with a key type, and must be cleaned * up if the key type is unregistered. The restriction is altered to always * reject additional keys so a keyring cannot be opened up by unregistering * a key type. * * Not called with any keyring locks held. The keyring's key struct will not * be deallocated under us as only our caller may deallocate it. * * The caller is required to hold key_types_sem and dead_type->sem. This is * fulfilled by key_gc_keytype() holding the locks on behalf of * key_garbage_collector(), which it invokes on a workqueue. */ void keyring_restriction_gc(struct key *keyring, struct key_type *dead_type) { struct key_restriction *keyres; kenter("%x{%s}", keyring->serial, keyring->description ?: ""); /* * keyring->restrict_link is only assigned at key allocation time * or with the key type locked, so the only values that could be * concurrently assigned to keyring->restrict_link are for key * types other than dead_type. Given this, it's ok to check * the key type before acquiring keyring->sem. */ if (!dead_type || !keyring->restrict_link || keyring->restrict_link->keytype != dead_type) { kleave(" [no restriction gc]"); return; } /* Lock the keyring to ensure that a link is not in progress */ down_write(&keyring->sem); keyres = keyring->restrict_link; keyres->check = restrict_link_reject; key_put(keyres->key); keyres->key = NULL; keyres->keytype = NULL; up_write(&keyring->sem); kleave(" [restriction gc]"); } |
| 1872 1872 94 76 70 1 1 104 1 1 103 103 101 101 12 12 94 94 94 78 76 37 76 76 70 69 4 14 14 14 4 4 4 4 4 4 39 39 13 13 39 30 30 138 138 3 1 1 1 1 127 127 4 4 4 4 4 29 29 29 105 2 16 2 2 2 1 1 1 38 38 38 38 38 38 38 38 103 2 2 3 1 1 1 1 4 4 4 4 4 4 4 6 4 4 1 1873 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 | // SPDX-License-Identifier: GPL-2.0-or-later /* * drivers/net/team/team.c - Network team device driver * Copyright (c) 2011 Jiri Pirko <jpirko@redhat.com> */ #include <linux/ethtool.h> #include <linux/kernel.h> #include <linux/types.h> #include <linux/module.h> #include <linux/init.h> #include <linux/slab.h> #include <linux/rcupdate.h> #include <linux/errno.h> #include <linux/ctype.h> #include <linux/notifier.h> #include <linux/netdevice.h> #include <linux/netpoll.h> #include <linux/if_vlan.h> #include <linux/if_arp.h> #include <linux/socket.h> #include <linux/etherdevice.h> #include <linux/rtnetlink.h> #include <net/rtnetlink.h> #include <net/genetlink.h> #include <net/netlink.h> #include <net/sch_generic.h> #include <generated/utsrelease.h> #include <linux/if_team.h> #define DRV_NAME "team" /********** * Helpers **********/ static struct team_port *team_port_get_rtnl(const struct net_device *dev) { struct team_port *port = rtnl_dereference(dev->rx_handler_data); return netif_is_team_port(dev) ? port : NULL; } /* * Since the ability to change device address for open port device is tested in * team_port_add, this function can be called without control of return value */ static int __set_port_dev_addr(struct net_device *port_dev, const unsigned char *dev_addr) { struct sockaddr_storage addr; memcpy(addr.__data, dev_addr, port_dev->addr_len); addr.ss_family = port_dev->type; return dev_set_mac_address(port_dev, (struct sockaddr *)&addr, NULL); } static int team_port_set_orig_dev_addr(struct team_port *port) { return __set_port_dev_addr(port->dev, port->orig.dev_addr); } static int team_port_set_team_dev_addr(struct team *team, struct team_port *port) { return __set_port_dev_addr(port->dev, team->dev->dev_addr); } int team_modeop_port_enter(struct team *team, struct team_port *port) { return team_port_set_team_dev_addr(team, port); } EXPORT_SYMBOL(team_modeop_port_enter); void team_modeop_port_change_dev_addr(struct team *team, struct team_port *port) { team_port_set_team_dev_addr(team, port); } EXPORT_SYMBOL(team_modeop_port_change_dev_addr); static void team_lower_state_changed(struct team_port *port) { struct netdev_lag_lower_state_info info; info.link_up = port->linkup; info.tx_enabled = team_port_enabled(port); netdev_lower_state_changed(port->dev, &info); } static void team_refresh_port_linkup(struct team_port *port) { bool new_linkup = port->user.linkup_enabled ? port->user.linkup : port->state.linkup; if (port->linkup != new_linkup) { port->linkup = new_linkup; team_lower_state_changed(port); } } /******************* * Options handling *******************/ struct team_option_inst { /* One for each option instance */ struct list_head list; struct list_head tmp_list; struct team_option *option; struct team_option_inst_info info; bool changed; bool removed; }; static struct team_option *__team_find_option(struct team *team, const char *opt_name) { struct team_option *option; list_for_each_entry(option, &team->option_list, list) { if (strcmp(option->name, opt_name) == 0) return option; } return NULL; } static void __team_option_inst_del(struct team_option_inst *opt_inst) { list_del(&opt_inst->list); kfree(opt_inst); } static void __team_option_inst_del_option(struct team *team, struct team_option *option) { struct team_option_inst *opt_inst, *tmp; list_for_each_entry_safe(opt_inst, tmp, &team->option_inst_list, list) { if (opt_inst->option == option) __team_option_inst_del(opt_inst); } } static int __team_option_inst_add(struct team *team, struct team_option *option, struct team_port *port) { struct team_option_inst *opt_inst; unsigned int array_size; unsigned int i; array_size = option->array_size; if (!array_size) array_size = 1; /* No array but still need one instance */ for (i = 0; i < array_size; i++) { opt_inst = kmalloc(sizeof(*opt_inst), GFP_KERNEL); if (!opt_inst) return -ENOMEM; opt_inst->option = option; opt_inst->info.port = port; opt_inst->info.array_index = i; opt_inst->changed = true; opt_inst->removed = false; list_add_tail(&opt_inst->list, &team->option_inst_list); if (option->init) option->init(team, &opt_inst->info); } return 0; } static int __team_option_inst_add_option(struct team *team, struct team_option *option) { int err; if (!option->per_port) { err = __team_option_inst_add(team, option, NULL); if (err) goto inst_del_option; } return 0; inst_del_option: __team_option_inst_del_option(team, option); return err; } static void __team_option_inst_mark_removed_option(struct team *team, struct team_option *option) { struct team_option_inst *opt_inst; list_for_each_entry(opt_inst, &team->option_inst_list, list) { if (opt_inst->option == option) { opt_inst->changed = true; opt_inst->removed = true; } } } static void __team_option_inst_del_port(struct team *team, struct team_port *port) { struct team_option_inst *opt_inst, *tmp; list_for_each_entry_safe(opt_inst, tmp, &team->option_inst_list, list) { if (opt_inst->option->per_port && opt_inst->info.port == port) __team_option_inst_del(opt_inst); } } static int __team_option_inst_add_port(struct team *team, struct team_port *port) { struct team_option *option; int err; list_for_each_entry(option, &team->option_list, list) { if (!option->per_port) continue; err = __team_option_inst_add(team, option, port); if (err) goto inst_del_port; } return 0; inst_del_port: __team_option_inst_del_port(team, port); return err; } static void __team_option_inst_mark_removed_port(struct team *team, struct team_port *port) { struct team_option_inst *opt_inst; list_for_each_entry(opt_inst, &team->option_inst_list, list) { if (opt_inst->info.port == port) { opt_inst->changed = true; opt_inst->removed = true; } } } static int __team_options_register(struct team *team, const struct team_option *option, size_t option_count) { int i; struct team_option **dst_opts; int err; dst_opts = kcalloc(option_count, sizeof(struct team_option *), GFP_KERNEL); if (!dst_opts) return -ENOMEM; for (i = 0; i < option_count; i++, option++) { if (__team_find_option(team, option->name)) { err = -EEXIST; goto alloc_rollback; } dst_opts[i] = kmemdup(option, sizeof(*option), GFP_KERNEL); if (!dst_opts[i]) { err = -ENOMEM; goto alloc_rollback; } } for (i = 0; i < option_count; i++) { err = __team_option_inst_add_option(team, dst_opts[i]); if (err) goto inst_rollback; list_add_tail(&dst_opts[i]->list, &team->option_list); } kfree(dst_opts); return 0; inst_rollback: for (i--; i >= 0; i--) { __team_option_inst_del_option(team, dst_opts[i]); list_del(&dst_opts[i]->list); } i = option_count; alloc_rollback: for (i--; i >= 0; i--) kfree(dst_opts[i]); kfree(dst_opts); return err; } static void __team_options_mark_removed(struct team *team, const struct team_option *option, size_t option_count) { int i; for (i = 0; i < option_count; i++, option++) { struct team_option *del_opt; del_opt = __team_find_option(team, option->name); if (del_opt) __team_option_inst_mark_removed_option(team, del_opt); } } static void __team_options_unregister(struct team *team, const struct team_option *option, size_t option_count) { int i; for (i = 0; i < option_count; i++, option++) { struct team_option *del_opt; del_opt = __team_find_option(team, option->name); if (del_opt) { __team_option_inst_del_option(team, del_opt); list_del(&del_opt->list); kfree(del_opt); } } } static void __team_options_change_check(struct team *team); int team_options_register(struct team *team, const struct team_option *option, size_t option_count) { int err; err = __team_options_register(team, option, option_count); if (err) return err; __team_options_change_check(team); return 0; } EXPORT_SYMBOL(team_options_register); void team_options_unregister(struct team *team, const struct team_option *option, size_t option_count) { __team_options_mark_removed(team, option, option_count); __team_options_change_check(team); __team_options_unregister(team, option, option_count); } EXPORT_SYMBOL(team_options_unregister); static int team_option_get(struct team *team, struct team_option_inst *opt_inst, struct team_gsetter_ctx *ctx) { if (!opt_inst->option->getter) return -EOPNOTSUPP; opt_inst->option->getter(team, ctx); return 0; } static int team_option_set(struct team *team, struct team_option_inst *opt_inst, struct team_gsetter_ctx *ctx) { if (!opt_inst->option->setter) return -EOPNOTSUPP; return opt_inst->option->setter(team, ctx); } void team_option_inst_set_change(struct team_option_inst_info *opt_inst_info) { struct team_option_inst *opt_inst; opt_inst = container_of(opt_inst_info, struct team_option_inst, info); opt_inst->changed = true; } EXPORT_SYMBOL(team_option_inst_set_change); void team_options_change_check(struct team *team) { __team_options_change_check(team); } EXPORT_SYMBOL(team_options_change_check); /**************** * Mode handling ****************/ static LIST_HEAD(mode_list); static DEFINE_SPINLOCK(mode_list_lock); struct team_mode_item { struct list_head list; const struct team_mode *mode; }; static struct team_mode_item *__find_mode(const char *kind) { struct team_mode_item *mitem; list_for_each_entry(mitem, &mode_list, list) { if (strcmp(mitem->mode->kind, kind) == 0) return mitem; } return NULL; } static bool is_good_mode_name(const char *name) { while (*name != '\0') { if (!isalpha(*name) && !isdigit(*name) && *name != '_') return false; name++; } return true; } int team_mode_register(const struct team_mode *mode) { int err = 0; struct team_mode_item *mitem; if (!is_good_mode_name(mode->kind) || mode->priv_size > TEAM_MODE_PRIV_SIZE) return -EINVAL; mitem = kmalloc(sizeof(*mitem), GFP_KERNEL); if (!mitem) return -ENOMEM; spin_lock(&mode_list_lock); if (__find_mode(mode->kind)) { err = -EEXIST; kfree(mitem); goto unlock; } mitem->mode = mode; list_add_tail(&mitem->list, &mode_list); unlock: spin_unlock(&mode_list_lock); return err; } EXPORT_SYMBOL(team_mode_register); void team_mode_unregister(const struct team_mode *mode) { struct team_mode_item *mitem; spin_lock(&mode_list_lock); mitem = __find_mode(mode->kind); if (mitem) { list_del_init(&mitem->list); kfree(mitem); } spin_unlock(&mode_list_lock); } EXPORT_SYMBOL(team_mode_unregister); static const struct team_mode *team_mode_get(const char *kind) { struct team_mode_item *mitem; const struct team_mode *mode = NULL; if (!try_module_get(THIS_MODULE)) return NULL; spin_lock(&mode_list_lock); mitem = __find_mode(kind); if (!mitem) { spin_unlock(&mode_list_lock); request_module("team-mode-%s", kind); spin_lock(&mode_list_lock); mitem = __find_mode(kind); } if (mitem) { mode = mitem->mode; if (!try_module_get(mode->owner)) mode = NULL; } spin_unlock(&mode_list_lock); module_put(THIS_MODULE); return mode; } static void team_mode_put(const struct team_mode *mode) { module_put(mode->owner); } static bool team_dummy_transmit(struct team *team, struct sk_buff *skb) { dev_kfree_skb_any(skb); return false; } static rx_handler_result_t team_dummy_receive(struct team *team, struct team_port *port, struct sk_buff *skb) { return RX_HANDLER_ANOTHER; } static const struct team_mode __team_no_mode = { .kind = "*NOMODE*", }; static bool team_is_mode_set(struct team *team) { return team->mode != &__team_no_mode; } static void team_set_no_mode(struct team *team) { team->user_carrier_enabled = false; team->mode = &__team_no_mode; } static void team_adjust_ops(struct team *team) { /* * To avoid checks in rx/tx skb paths, ensure here that non-null and * correct ops are always set. */ if (!team->en_port_count || !team_is_mode_set(team) || !team->mode->ops->transmit) team->ops.transmit = team_dummy_transmit; else team->ops.transmit = team->mode->ops->transmit; if (!team->en_port_count || !team_is_mode_set(team) || !team->mode->ops->receive) team->ops.receive = team_dummy_receive; else team->ops.receive = team->mode->ops->receive; } /* * We can benefit from the fact that it's ensured no port is present * at the time of mode change. Therefore no packets are in fly so there's no * need to set mode operations in any special way. */ static int __team_change_mode(struct team *team, const struct team_mode *new_mode) { /* Check if mode was previously set and do cleanup if so */ if (team_is_mode_set(team)) { void (*exit_op)(struct team *team) = team->ops.exit; /* Clear ops area so no callback is called any longer */ memset(&team->ops, 0, sizeof(struct team_mode_ops)); team_adjust_ops(team); if (exit_op) exit_op(team); team_mode_put(team->mode); team_set_no_mode(team); /* zero private data area */ memset(&team->mode_priv, 0, sizeof(struct team) - offsetof(struct team, mode_priv)); } if (!new_mode) return 0; if (new_mode->ops->init) { int err; err = new_mode->ops->init(team); if (err) return err; } team->mode = new_mode; memcpy(&team->ops, new_mode->ops, sizeof(struct team_mode_ops)); team_adjust_ops(team); return 0; } static int team_change_mode(struct team *team, const char *kind) { const struct team_mode *new_mode; struct net_device *dev = team->dev; int err; if (!list_empty(&team->port_list)) { netdev_err(dev, "No ports can be present during mode change\n"); return -EBUSY; } if (team_is_mode_set(team) && strcmp(team->mode->kind, kind) == 0) { netdev_err(dev, "Unable to change to the same mode the team is in\n"); return -EINVAL; } new_mode = team_mode_get(kind); if (!new_mode) { netdev_err(dev, "Mode \"%s\" not found\n", kind); return -EINVAL; } err = __team_change_mode(team, new_mode); if (err) { netdev_err(dev, "Failed to change to mode \"%s\"\n", kind); team_mode_put(new_mode); return err; } netdev_info(dev, "Mode changed to \"%s\"\n", kind); return 0; } /********************* * Peers notification *********************/ static void team_notify_peers_work(struct work_struct *work) { struct team *team; int val; team = container_of(work, struct team, notify_peers.dw.work); if (!rtnl_trylock()) { schedule_delayed_work(&team->notify_peers.dw, 0); return; } val = atomic_dec_if_positive(&team->notify_peers.count_pending); if (val < 0) { rtnl_unlock(); return; } call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, team->dev); rtnl_unlock(); if (val) schedule_delayed_work(&team->notify_peers.dw, msecs_to_jiffies(team->notify_peers.interval)); } static void team_notify_peers(struct team *team) { if (!team->notify_peers.count || !netif_running(team->dev)) return; atomic_add(team->notify_peers.count, &team->notify_peers.count_pending); schedule_delayed_work(&team->notify_peers.dw, 0); } static void team_notify_peers_init(struct team *team) { INIT_DELAYED_WORK(&team->notify_peers.dw, team_notify_peers_work); } static void team_notify_peers_fini(struct team *team) { cancel_delayed_work_sync(&team->notify_peers.dw); } /******************************* * Send multicast group rejoins *******************************/ static void team_mcast_rejoin_work(struct work_struct *work) { struct team *team; int val; team = container_of(work, struct team, mcast_rejoin.dw.work); if (!rtnl_trylock()) { schedule_delayed_work(&team->mcast_rejoin.dw, 0); return; } val = atomic_dec_if_positive(&team->mcast_rejoin.count_pending); if (val < 0) { rtnl_unlock(); return; } call_netdevice_notifiers(NETDEV_RESEND_IGMP, team->dev); rtnl_unlock(); if (val) schedule_delayed_work(&team->mcast_rejoin.dw, msecs_to_jiffies(team->mcast_rejoin.interval)); } static void team_mcast_rejoin(struct team *team) { if (!team->mcast_rejoin.count || !netif_running(team->dev)) return; atomic_add(team->mcast_rejoin.count, &team->mcast_rejoin.count_pending); schedule_delayed_work(&team->mcast_rejoin.dw, 0); } static void team_mcast_rejoin_init(struct team *team) { INIT_DELAYED_WORK(&team->mcast_rejoin.dw, team_mcast_rejoin_work); } static void team_mcast_rejoin_fini(struct team *team) { cancel_delayed_work_sync(&team->mcast_rejoin.dw); } /************************ * Rx path frame handler ************************/ /* note: already called with rcu_read_lock */ static rx_handler_result_t team_handle_frame(struct sk_buff **pskb) { struct sk_buff *skb = *pskb; struct team_port *port; struct team *team; rx_handler_result_t res; skb = skb_share_check(skb, GFP_ATOMIC); if (!skb) return RX_HANDLER_CONSUMED; *pskb = skb; port = team_port_get_rcu(skb->dev); team = port->team; if (!team_port_enabled(port)) { if (is_link_local_ether_addr(eth_hdr(skb)->h_dest)) /* link-local packets are mostly useful when stack receives them * with the link they arrive on. */ return RX_HANDLER_PASS; /* allow exact match delivery for disabled ports */ res = RX_HANDLER_EXACT; } else { res = team->ops.receive(team, port, skb); } if (res == RX_HANDLER_ANOTHER) { struct team_pcpu_stats *pcpu_stats; pcpu_stats = this_cpu_ptr(team->pcpu_stats); u64_stats_update_begin(&pcpu_stats->syncp); u64_stats_inc(&pcpu_stats->rx_packets); u64_stats_add(&pcpu_stats->rx_bytes, skb->len); if (skb->pkt_type == PACKET_MULTICAST) u64_stats_inc(&pcpu_stats->rx_multicast); u64_stats_update_end(&pcpu_stats->syncp); skb->dev = team->dev; } else if (res == RX_HANDLER_EXACT) { this_cpu_inc(team->pcpu_stats->rx_nohandler); } else { this_cpu_inc(team->pcpu_stats->rx_dropped); } return res; } /************************************* * Multiqueue Tx port select override *************************************/ static int team_queue_override_init(struct team *team) { struct list_head *listarr; unsigned int queue_cnt = team->dev->num_tx_queues - 1; unsigned int i; if (!queue_cnt) return 0; listarr = kmalloc_array(queue_cnt, sizeof(struct list_head), GFP_KERNEL); if (!listarr) return -ENOMEM; team->qom_lists = listarr; for (i = 0; i < queue_cnt; i++) INIT_LIST_HEAD(listarr++); return 0; } static void team_queue_override_fini(struct team *team) { kfree(team->qom_lists); } static struct list_head *__team_get_qom_list(struct team *team, u16 queue_id) { return &team->qom_lists[queue_id - 1]; } /* * note: already called with rcu_read_lock */ static bool team_queue_override_transmit(struct team *team, struct sk_buff *skb) { struct list_head *qom_list; struct team_port *port; if (!team->queue_override_enabled || !skb->queue_mapping) return false; qom_list = __team_get_qom_list(team, skb->queue_mapping); list_for_each_entry_rcu(port, qom_list, qom_list) { if (!team_dev_queue_xmit(team, port, skb)) return true; } return false; } static void __team_queue_override_port_del(struct team *team, struct team_port *port) { if (!port->queue_id) return; list_del_rcu(&port->qom_list); } static bool team_queue_override_port_has_gt_prio_than(struct team_port *port, struct team_port *cur) { if (port->priority < cur->priority) return true; if (port->priority > cur->priority) return false; if (port->index < cur->index) return true; return false; } static void __team_queue_override_port_add(struct team *team, struct team_port *port) { struct team_port *cur; struct list_head *qom_list; struct list_head *node; if (!port->queue_id) return; qom_list = __team_get_qom_list(team, port->queue_id); node = qom_list; list_for_each_entry(cur, qom_list, qom_list) { if (team_queue_override_port_has_gt_prio_than(port, cur)) break; node = &cur->qom_list; } list_add_tail_rcu(&port->qom_list, node); } static void __team_queue_override_enabled_check(struct team *team) { struct team_port *port; bool enabled = false; list_for_each_entry(port, &team->port_list, list) { if (port->queue_id) { enabled = true; break; } } if (enabled == team->queue_override_enabled) return; netdev_dbg(team->dev, "%s queue override\n", enabled ? "Enabling" : "Disabling"); team->queue_override_enabled = enabled; } static void team_queue_override_port_prio_changed(struct team *team, struct team_port *port) { if (!port->queue_id || team_port_enabled(port)) return; __team_queue_override_port_del(team, port); __team_queue_override_port_add(team, port); __team_queue_override_enabled_check(team); } static void team_queue_override_port_change_queue_id(struct team *team, struct team_port *port, u16 new_queue_id) { if (team_port_enabled(port)) { __team_queue_override_port_del(team, port); port->queue_id = new_queue_id; __team_queue_override_port_add(team, port); __team_queue_override_enabled_check(team); } else { port->queue_id = new_queue_id; } } static void team_queue_override_port_add(struct team *team, struct team_port *port) { __team_queue_override_port_add(team, port); __team_queue_override_enabled_check(team); } static void team_queue_override_port_del(struct team *team, struct team_port *port) { __team_queue_override_port_del(team, port); __team_queue_override_enabled_check(team); } /**************** * Port handling ****************/ static bool team_port_find(const struct team *team, const struct team_port *port) { struct team_port *cur; list_for_each_entry(cur, &team->port_list, list) if (cur == port) return true; return false; } /* * Enable/disable port by adding to enabled port hashlist and setting * port->index (Might be racy so reader could see incorrect ifindex when * processing a flying packet, but that is not a problem). Write guarded * by team->lock. */ static void team_port_enable(struct team *team, struct team_port *port) { if (team_port_enabled(port)) return; port->index = team->en_port_count++; hlist_add_head_rcu(&port->hlist, team_port_index_hash(team, port->index)); team_adjust_ops(team); team_queue_override_port_add(team, port); if (team->ops.port_enabled) team->ops.port_enabled(team, port); team_notify_peers(team); team_mcast_rejoin(team); team_lower_state_changed(port); } static void __reconstruct_port_hlist(struct team *team, int rm_index) { int i; struct team_port *port; for (i = rm_index + 1; i < team->en_port_count; i++) { port = team_get_port_by_index(team, i); hlist_del_rcu(&port->hlist); port->index--; hlist_add_head_rcu(&port->hlist, team_port_index_hash(team, port->index)); } } static void team_port_disable(struct team *team, struct team_port *port) { if (!team_port_enabled(port)) return; if (team->ops.port_disabled) team->ops.port_disabled(team, port); hlist_del_rcu(&port->hlist); __reconstruct_port_hlist(team, port->index); port->index = -1; team->en_port_count--; team_queue_override_port_del(team, port); team_adjust_ops(team); team_lower_state_changed(port); } #define TEAM_VLAN_FEATURES (NETIF_F_HW_CSUM | NETIF_F_SG | \ NETIF_F_FRAGLIST | NETIF_F_GSO_SOFTWARE | \ NETIF_F_HIGHDMA | NETIF_F_LRO) #define TEAM_ENC_FEATURES (NETIF_F_HW_CSUM | NETIF_F_SG | \ NETIF_F_RXCSUM | NETIF_F_GSO_SOFTWARE) static void __team_compute_features(struct team *team) { struct team_port *port; netdev_features_t vlan_features = TEAM_VLAN_FEATURES & NETIF_F_ALL_FOR_ALL; netdev_features_t enc_features = TEAM_ENC_FEATURES; unsigned short max_hard_header_len = ETH_HLEN; unsigned int dst_release_flag = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM; rcu_read_lock(); list_for_each_entry_rcu(port, &team->port_list, list) { vlan_features = netdev_increment_features(vlan_features, port->dev->vlan_features, TEAM_VLAN_FEATURES); enc_features = netdev_increment_features(enc_features, port->dev->hw_enc_features, TEAM_ENC_FEATURES); dst_release_flag &= port->dev->priv_flags; if (port->dev->hard_header_len > max_hard_header_len) max_hard_header_len = port->dev->hard_header_len; } rcu_read_unlock(); team->dev->vlan_features = vlan_features; team->dev->hw_enc_features = enc_features | NETIF_F_GSO_ENCAP_ALL | NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX; team->dev->hard_header_len = max_hard_header_len; team->dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; if (dst_release_flag == (IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM)) team->dev->priv_flags |= IFF_XMIT_DST_RELEASE; } static void team_compute_features(struct team *team) { __team_compute_features(team); netdev_change_features(team->dev); } static int team_port_enter(struct team *team, struct team_port *port) { int err = 0; dev_hold(team->dev); if (team->ops.port_enter) { err = team->ops.port_enter(team, port); if (err) { netdev_err(team->dev, "Device %s failed to enter team mode\n", port->dev->name); goto err_port_enter; } } return 0; err_port_enter: dev_put(team->dev); return err; } static void team_port_leave(struct team *team, struct team_port *port) { if (team->ops.port_leave) team->ops.port_leave(team, port); dev_put(team->dev); } #ifdef CONFIG_NET_POLL_CONTROLLER static int __team_port_enable_netpoll(struct team_port *port) { struct netpoll *np; int err; np = kzalloc(sizeof(*np), GFP_KERNEL); if (!np) return -ENOMEM; err = __netpoll_setup(np, port->dev); if (err) { kfree(np); return err; } port->np = np; return err; } static int team_port_enable_netpoll(struct team_port *port) { if (!port->team->dev->npinfo) return 0; return __team_port_enable_netpoll(port); } static void team_port_disable_netpoll(struct team_port *port) { struct netpoll *np = port->np; if (!np) return; port->np = NULL; __netpoll_free(np); } #else static int team_port_enable_netpoll(struct team_port *port) { return 0; } static void team_port_disable_netpoll(struct team_port *port) { } #endif static int team_upper_dev_link(struct team *team, struct team_port *port, struct netlink_ext_ack *extack) { struct netdev_lag_upper_info lag_upper_info; int err; lag_upper_info.tx_type = team->mode->lag_tx_type; lag_upper_info.hash_type = NETDEV_LAG_HASH_UNKNOWN; err = netdev_master_upper_dev_link(port->dev, team->dev, NULL, &lag_upper_info, extack); if (err) return err; port->dev->priv_flags |= IFF_TEAM_PORT; return 0; } static void team_upper_dev_unlink(struct team *team, struct team_port *port) { netdev_upper_dev_unlink(port->dev, team->dev); port->dev->priv_flags &= ~IFF_TEAM_PORT; } static void __team_port_change_port_added(struct team_port *port, bool linkup); static int team_dev_type_check_change(struct net_device *dev, struct net_device *port_dev); static int team_port_add(struct team *team, struct net_device *port_dev, struct netlink_ext_ack *extack) { struct net_device *dev = team->dev; struct team_port *port; char *portname = port_dev->name; int err; if (port_dev->flags & IFF_LOOPBACK) { NL_SET_ERR_MSG(extack, "Loopback device can't be added as a team port"); netdev_err(dev, "Device %s is loopback device. Loopback devices can't be added as a team port\n", portname); return -EINVAL; } if (netif_is_team_port(port_dev)) { NL_SET_ERR_MSG(extack, "Device is already a port of a team device"); netdev_err(dev, "Device %s is already a port " "of a team device\n", portname); return -EBUSY; } if (dev == port_dev) { NL_SET_ERR_MSG(extack, "Cannot enslave team device to itself"); netdev_err(dev, "Cannot enslave team device to itself\n"); return -EINVAL; } if (netdev_has_upper_dev(dev, port_dev)) { NL_SET_ERR_MSG(extack, "Device is already an upper device of the team interface"); netdev_err(dev, "Device %s is already an upper device of the team interface\n", portname); return -EBUSY; } if (port_dev->features & NETIF_F_VLAN_CHALLENGED && vlan_uses_dev(dev)) { NL_SET_ERR_MSG(extack, "Device is VLAN challenged and team device has VLAN set up"); netdev_err(dev, "Device %s is VLAN challenged and team device has VLAN set up\n", portname); return -EPERM; } err = team_dev_type_check_change(dev, port_dev); if (err) return err; if (port_dev->flags & IFF_UP) { NL_SET_ERR_MSG(extack, "Device is up. Set it down before adding it as a team port"); netdev_err(dev, "Device %s is up. Set it down before adding it as a team port\n", portname); return -EBUSY; } port = kzalloc(sizeof(struct team_port) + team->mode->port_priv_size, GFP_KERNEL); if (!port) return -ENOMEM; port->dev = port_dev; port->team = team; INIT_LIST_HEAD(&port->qom_list); port->orig.mtu = port_dev->mtu; err = dev_set_mtu(port_dev, dev->mtu); if (err) { netdev_dbg(dev, "Error %d calling dev_set_mtu\n", err); goto err_set_mtu; } memcpy(port->orig.dev_addr, port_dev->dev_addr, port_dev->addr_len); err = team_port_enter(team, port); if (err) { netdev_err(dev, "Device %s failed to enter team mode\n", portname); goto err_port_enter; } err = dev_open(port_dev, extack); if (err) { netdev_dbg(dev, "Device %s opening failed\n", portname); goto err_dev_open; } err = vlan_vids_add_by_dev(port_dev, dev); if (err) { netdev_err(dev, "Failed to add vlan ids to device %s\n", portname); goto err_vids_add; } err = team_port_enable_netpoll(port); if (err) { netdev_err(dev, "Failed to enable netpoll on device %s\n", portname); goto err_enable_netpoll; } if (!(dev->features & NETIF_F_LRO)) dev_disable_lro(port_dev); err = netdev_rx_handler_register(port_dev, team_handle_frame, port); if (err) { netdev_err(dev, "Device %s failed to register rx_handler\n", portname); goto err_handler_register; } err = team_upper_dev_link(team, port, extack); if (err) { netdev_err(dev, "Device %s failed to set upper link\n", portname); goto err_set_upper_link; } err = __team_option_inst_add_port(team, port); if (err) { netdev_err(dev, "Device %s failed to add per-port options\n", portname); goto err_option_port_add; } /* set promiscuity level to new slave */ if (dev->flags & IFF_PROMISC) { err = dev_set_promiscuity(port_dev, 1); if (err) goto err_set_slave_promisc; } /* set allmulti level to new slave */ if (dev->flags & IFF_ALLMULTI) { err = dev_set_allmulti(port_dev, 1); if (err) { if (dev->flags & IFF_PROMISC) dev_set_promiscuity(port_dev, -1); goto err_set_slave_promisc; } } if (dev->flags & IFF_UP) { netif_addr_lock_bh(dev); dev_uc_sync_multiple(port_dev, dev); dev_mc_sync_multiple(port_dev, dev); netif_addr_unlock_bh(dev); } port->index = -1; list_add_tail_rcu(&port->list, &team->port_list); team_port_enable(team, port); __team_compute_features(team); __team_port_change_port_added(port, !!netif_oper_up(port_dev)); __team_options_change_check(team); netdev_info(dev, "Port device %s added\n", portname); return 0; err_set_slave_promisc: __team_option_inst_del_port(team, port); err_option_port_add: team_upper_dev_unlink(team, port); err_set_upper_link: netdev_rx_handler_unregister(port_dev); err_handler_register: team_port_disable_netpoll(port); err_enable_netpoll: vlan_vids_del_by_dev(port_dev, dev); err_vids_add: dev_close(port_dev); err_dev_open: team_port_leave(team, port); team_port_set_orig_dev_addr(port); err_port_enter: dev_set_mtu(port_dev, port->orig.mtu); err_set_mtu: kfree(port); return err; } static void __team_port_change_port_removed(struct team_port *port); static int team_port_del(struct team *team, struct net_device *port_dev) { struct net_device *dev = team->dev; struct team_port *port; char *portname = port_dev->name; port = team_port_get_rtnl(port_dev); if (!port || !team_port_find(team, port)) { netdev_err(dev, "Device %s does not act as a port of this team\n", portname); return -ENOENT; } team_port_disable(team, port); list_del_rcu(&port->list); if (dev->flags & IFF_PROMISC) dev_set_promiscuity(port_dev, -1); if (dev->flags & IFF_ALLMULTI) dev_set_allmulti(port_dev, -1); team_upper_dev_unlink(team, port); netdev_rx_handler_unregister(port_dev); team_port_disable_netpoll(port); vlan_vids_del_by_dev(port_dev, dev); if (dev->flags & IFF_UP) { dev_uc_unsync(port_dev, dev); dev_mc_unsync(port_dev, dev); } dev_close(port_dev); team_port_leave(team, port); __team_option_inst_mark_removed_port(team, port); __team_options_change_check(team); __team_option_inst_del_port(team, port); __team_port_change_port_removed(port); team_port_set_orig_dev_addr(port); dev_set_mtu(port_dev, port->orig.mtu); kfree_rcu(port, rcu); netdev_info(dev, "Port device %s removed\n", portname); __team_compute_features(team); return 0; } /***************** * Net device ops *****************/ static void team_mode_option_get(struct team *team, struct team_gsetter_ctx *ctx) { ctx->data.str_val = team->mode->kind; } static int team_mode_option_set(struct team *team, struct team_gsetter_ctx *ctx) { return team_change_mode(team, ctx->data.str_val); } static void team_notify_peers_count_get(struct team *team, struct team_gsetter_ctx *ctx) { ctx->data.u32_val = team->notify_peers.count; } static int team_notify_peers_count_set(struct team *team, struct team_gsetter_ctx *ctx) { team->notify_peers.count = ctx->data.u32_val; return 0; } static void team_notify_peers_interval_get(struct team *team, struct team_gsetter_ctx *ctx) { ctx->data.u32_val = team->notify_peers.interval; } static int team_notify_peers_interval_set(struct team *team, struct team_gsetter_ctx *ctx) { team->notify_peers.interval = ctx->data.u32_val; return 0; } static void team_mcast_rejoin_count_get(struct team *team, struct team_gsetter_ctx *ctx) { ctx->data.u32_val = team->mcast_rejoin.count; } static int team_mcast_rejoin_count_set(struct team *team, struct team_gsetter_ctx *ctx) { team->mcast_rejoin.count = ctx->data.u32_val; return 0; } static void team_mcast_rejoin_interval_get(struct team *team, struct team_gsetter_ctx *ctx) { ctx->data.u32_val = team->mcast_rejoin.interval; } static int team_mcast_rejoin_interval_set(struct team *team, struct team_gsetter_ctx *ctx) { team->mcast_rejoin.interval = ctx->data.u32_val; return 0; } static void team_port_en_option_get(struct team *team, struct team_gsetter_ctx *ctx) { struct team_port *port = ctx->info->port; ctx->data.bool_val = team_port_enabled(port); } static int team_port_en_option_set(struct team *team, struct team_gsetter_ctx *ctx) { struct team_port *port = ctx->info->port; if (ctx->data.bool_val) team_port_enable(team, port); else team_port_disable(team, port); return 0; } static void team_user_linkup_option_get(struct team *team, struct team_gsetter_ctx *ctx) { struct team_port *port = ctx->info->port; ctx->data.bool_val = port->user.linkup; } static void __team_carrier_check(struct team *team); static int team_user_linkup_option_set(struct team *team, struct team_gsetter_ctx *ctx) { struct team_port *port = ctx->info->port; port->user.linkup = ctx->data.bool_val; team_refresh_port_linkup(port); __team_carrier_check(port->team); return 0; } static void team_user_linkup_en_option_get(struct team *team, struct team_gsetter_ctx *ctx) { struct team_port *port = ctx->info->port; ctx->data.bool_val = port->user.linkup_enabled; } static int team_user_linkup_en_option_set(struct team *team, struct team_gsetter_ctx *ctx) { struct team_port *port = ctx->info->port; port->user.linkup_enabled = ctx->data.bool_val; team_refresh_port_linkup(port); __team_carrier_check(port->team); return 0; } static void team_priority_option_get(struct team *team, struct team_gsetter_ctx *ctx) { struct team_port *port = ctx->info->port; ctx->data.s32_val = port->priority; } static int team_priority_option_set(struct team *team, struct team_gsetter_ctx *ctx) { struct team_port *port = ctx->info->port; s32 priority = ctx->data.s32_val; if (port->priority == priority) return 0; port->priority = priority; team_queue_override_port_prio_changed(team, port); return 0; } static void team_queue_id_option_get(struct team *team, struct team_gsetter_ctx *ctx) { struct team_port *port = ctx->info->port; ctx->data.u32_val = port->queue_id; } static int team_queue_id_option_set(struct team *team, struct team_gsetter_ctx *ctx) { struct team_port *port = ctx->info->port; u16 new_queue_id = ctx->data.u32_val; if (port->queue_id == new_queue_id) return 0; if (new_queue_id >= team->dev->real_num_tx_queues) return -EINVAL; team_queue_override_port_change_queue_id(team, port, new_queue_id); return 0; } static const struct team_option team_options[] = { { .name = "mode", .type = TEAM_OPTION_TYPE_STRING, .getter = team_mode_option_get, .setter = team_mode_option_set, }, { .name = "notify_peers_count", .type = TEAM_OPTION_TYPE_U32, .getter = team_notify_peers_count_get, .setter = team_notify_peers_count_set, }, { .name = "notify_peers_interval", .type = TEAM_OPTION_TYPE_U32, .getter = team_notify_peers_interval_get, .setter = team_notify_peers_interval_set, }, { .name = "mcast_rejoin_count", .type = TEAM_OPTION_TYPE_U32, .getter = team_mcast_rejoin_count_get, .setter = team_mcast_rejoin_count_set, }, { .name = "mcast_rejoin_interval", .type = TEAM_OPTION_TYPE_U32, .getter = team_mcast_rejoin_interval_get, .setter = team_mcast_rejoin_interval_set, }, { .name = "enabled", .type = TEAM_OPTION_TYPE_BOOL, .per_port = true, .getter = team_port_en_option_get, .setter = team_port_en_option_set, }, { .name = "user_linkup", .type = TEAM_OPTION_TYPE_BOOL, .per_port = true, .getter = team_user_linkup_option_get, .setter = team_user_linkup_option_set, }, { .name = "user_linkup_enabled", .type = TEAM_OPTION_TYPE_BOOL, .per_port = true, .getter = team_user_linkup_en_option_get, .setter = team_user_linkup_en_option_set, }, { .name = "priority", .type = TEAM_OPTION_TYPE_S32, .per_port = true, .getter = team_priority_option_get, .setter = team_priority_option_set, }, { .name = "queue_id", .type = TEAM_OPTION_TYPE_U32, .per_port = true, .getter = team_queue_id_option_get, .setter = team_queue_id_option_set, }, }; static int team_init(struct net_device *dev) { struct team *team = netdev_priv(dev); int i; int err; team->dev = dev; team_set_no_mode(team); team->notifier_ctx = false; team->pcpu_stats = netdev_alloc_pcpu_stats(struct team_pcpu_stats); if (!team->pcpu_stats) return -ENOMEM; for (i = 0; i < TEAM_PORT_HASHENTRIES; i++) INIT_HLIST_HEAD(&team->en_port_hlist[i]); INIT_LIST_HEAD(&team->port_list); err = team_queue_override_init(team); if (err) goto err_team_queue_override_init; team_adjust_ops(team); INIT_LIST_HEAD(&team->option_list); INIT_LIST_HEAD(&team->option_inst_list); team_notify_peers_init(team); team_mcast_rejoin_init(team); err = team_options_register(team, team_options, ARRAY_SIZE(team_options)); if (err) goto err_options_register; netif_carrier_off(dev); lockdep_register_key(&team->team_lock_key); __mutex_init(&team->lock, "team->team_lock_key", &team->team_lock_key); netdev_lockdep_set_classes(dev); return 0; err_options_register: team_mcast_rejoin_fini(team); team_notify_peers_fini(team); team_queue_override_fini(team); err_team_queue_override_init: free_percpu(team->pcpu_stats); return err; } static void team_uninit(struct net_device *dev) { struct team *team = netdev_priv(dev); struct team_port *port; struct team_port *tmp; mutex_lock(&team->lock); list_for_each_entry_safe(port, tmp, &team->port_list, list) team_port_del(team, port->dev); __team_change_mode(team, NULL); /* cleanup */ __team_options_unregister(team, team_options, ARRAY_SIZE(team_options)); team_mcast_rejoin_fini(team); team_notify_peers_fini(team); team_queue_override_fini(team); mutex_unlock(&team->lock); netdev_change_features(dev); lockdep_unregister_key(&team->team_lock_key); } static void team_destructor(struct net_device *dev) { struct team *team = netdev_priv(dev); free_percpu(team->pcpu_stats); } static int team_open(struct net_device *dev) { return 0; } static int team_close(struct net_device *dev) { struct team *team = netdev_priv(dev); struct team_port *port; list_for_each_entry(port, &team->port_list, list) { dev_uc_unsync(port->dev, dev); dev_mc_unsync(port->dev, dev); } return 0; } /* * note: already called with rcu_read_lock */ static netdev_tx_t team_xmit(struct sk_buff *skb, struct net_device *dev) { struct team *team = netdev_priv(dev); bool tx_success; unsigned int len = skb->len; tx_success = team_queue_override_transmit(team, skb); if (!tx_success) tx_success = team->ops.transmit(team, skb); if (tx_success) { struct team_pcpu_stats *pcpu_stats; pcpu_stats = this_cpu_ptr(team->pcpu_stats); u64_stats_update_begin(&pcpu_stats->syncp); u64_stats_inc(&pcpu_stats->tx_packets); u64_stats_add(&pcpu_stats->tx_bytes, len); u64_stats_update_end(&pcpu_stats->syncp); } else { this_cpu_inc(team->pcpu_stats->tx_dropped); } return NETDEV_TX_OK; } static u16 team_select_queue(struct net_device *dev, struct sk_buff *skb, struct net_device *sb_dev) { /* * This helper function exists to help dev_pick_tx get the correct * destination queue. Using a helper function skips a call to * skb_tx_hash and will put the skbs in the queue we expect on their * way down to the team driver. */ u16 txq = skb_rx_queue_recorded(skb) ? skb_get_rx_queue(skb) : 0; /* * Save the original txq to restore before passing to the driver */ qdisc_skb_cb(skb)->slave_dev_queue_mapping = skb->queue_mapping; if (unlikely(txq >= dev->real_num_tx_queues)) { do { txq -= dev->real_num_tx_queues; } while (txq >= dev->real_num_tx_queues); } return txq; } static void team_change_rx_flags(struct net_device *dev, int change) { struct team *team = netdev_priv(dev); struct team_port *port; int inc; rcu_read_lock(); list_for_each_entry_rcu(port, &team->port_list, list) { if (change & IFF_PROMISC) { inc = dev->flags & IFF_PROMISC ? 1 : -1; dev_set_promiscuity(port->dev, inc); } if (change & IFF_ALLMULTI) { inc = dev->flags & IFF_ALLMULTI ? 1 : -1; dev_set_allmulti(port->dev, inc); } } rcu_read_unlock(); } static void team_set_rx_mode(struct net_device *dev) { struct team *team = netdev_priv(dev); struct team_port *port; rcu_read_lock(); list_for_each_entry_rcu(port, &team->port_list, list) { dev_uc_sync_multiple(port->dev, dev); dev_mc_sync_multiple(port->dev, dev); } rcu_read_unlock(); } static int team_set_mac_address(struct net_device *dev, void *p) { struct sockaddr *addr = p; struct team *team = netdev_priv(dev); struct team_port *port; if (dev->type == ARPHRD_ETHER && !is_valid_ether_addr(addr->sa_data)) return -EADDRNOTAVAIL; dev_addr_set(dev, addr->sa_data); mutex_lock(&team->lock); list_for_each_entry(port, &team->port_list, list) if (team->ops.port_change_dev_addr) team->ops.port_change_dev_addr(team, port); mutex_unlock(&team->lock); return 0; } static int team_change_mtu(struct net_device *dev, int new_mtu) { struct team *team = netdev_priv(dev); struct team_port *port; int err; /* * Alhough this is reader, it's guarded by team lock. It's not possible * to traverse list in reverse under rcu_read_lock */ mutex_lock(&team->lock); team->port_mtu_change_allowed = true; list_for_each_entry(port, &team->port_list, list) { err = dev_set_mtu(port->dev, new_mtu); if (err) { netdev_err(dev, "Device %s failed to change mtu", port->dev->name); goto unwind; } } team->port_mtu_change_allowed = false; mutex_unlock(&team->lock); dev->mtu = new_mtu; return 0; unwind: list_for_each_entry_continue_reverse(port, &team->port_list, list) dev_set_mtu(port->dev, dev->mtu); team->port_mtu_change_allowed = false; mutex_unlock(&team->lock); return err; } static void team_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats) { struct team *team = netdev_priv(dev); struct team_pcpu_stats *p; u64 rx_packets, rx_bytes, rx_multicast, tx_packets, tx_bytes; u32 rx_dropped = 0, tx_dropped = 0, rx_nohandler = 0; unsigned int start; int i; for_each_possible_cpu(i) { p = per_cpu_ptr(team->pcpu_stats, i); do { start = u64_stats_fetch_begin(&p->syncp); rx_packets = u64_stats_read(&p->rx_packets); rx_bytes = u64_stats_read(&p->rx_bytes); rx_multicast = u64_stats_read(&p->rx_multicast); tx_packets = u64_stats_read(&p->tx_packets); tx_bytes = u64_stats_read(&p->tx_bytes); } while (u64_stats_fetch_retry(&p->syncp, start)); stats->rx_packets += rx_packets; stats->rx_bytes += rx_bytes; stats->multicast += rx_multicast; stats->tx_packets += tx_packets; stats->tx_bytes += tx_bytes; /* * rx_dropped, tx_dropped & rx_nohandler are u32, * updated without syncp protection. */ rx_dropped += READ_ONCE(p->rx_dropped); tx_dropped += READ_ONCE(p->tx_dropped); rx_nohandler += READ_ONCE(p->rx_nohandler); } stats->rx_dropped = rx_dropped; stats->tx_dropped = tx_dropped; stats->rx_nohandler = rx_nohandler; } static int team_vlan_rx_add_vid(struct net_device *dev, __be16 proto, u16 vid) { struct team *team = netdev_priv(dev); struct team_port *port; int err; /* * Alhough this is reader, it's guarded by team lock. It's not possible * to traverse list in reverse under rcu_read_lock */ mutex_lock(&team->lock); list_for_each_entry(port, &team->port_list, list) { err = vlan_vid_add(port->dev, proto, vid); if (err) goto unwind; } mutex_unlock(&team->lock); return 0; unwind: list_for_each_entry_continue_reverse(port, &team->port_list, list) vlan_vid_del(port->dev, proto, vid); mutex_unlock(&team->lock); return err; } static int team_vlan_rx_kill_vid(struct net_device *dev, __be16 proto, u16 vid) { struct team *team = netdev_priv(dev); struct team_port *port; mutex_lock(&team->lock); list_for_each_entry(port, &team->port_list, list) vlan_vid_del(port->dev, proto, vid); mutex_unlock(&team->lock); return 0; } #ifdef CONFIG_NET_POLL_CONTROLLER static void team_poll_controller(struct net_device *dev) { } static void __team_netpoll_cleanup(struct team *team) { struct team_port *port; list_for_each_entry(port, &team->port_list, list) team_port_disable_netpoll(port); } static void team_netpoll_cleanup(struct net_device *dev) { struct team *team = netdev_priv(dev); mutex_lock(&team->lock); __team_netpoll_cleanup(team); mutex_unlock(&team->lock); } static int team_netpoll_setup(struct net_device *dev, struct netpoll_info *npifo) { struct team *team = netdev_priv(dev); struct team_port *port; int err = 0; mutex_lock(&team->lock); list_for_each_entry(port, &team->port_list, list) { err = __team_port_enable_netpoll(port); if (err) { __team_netpoll_cleanup(team); break; } } mutex_unlock(&team->lock); return err; } #endif static int team_add_slave(struct net_device *dev, struct net_device *port_dev, struct netlink_ext_ack *extack) { struct team *team = netdev_priv(dev); int err; mutex_lock(&team->lock); err = team_port_add(team, port_dev, extack); mutex_unlock(&team->lock); if (!err) netdev_change_features(dev); return err; } static int team_del_slave(struct net_device *dev, struct net_device *port_dev) { struct team *team = netdev_priv(dev); int err; mutex_lock(&team->lock); err = team_port_del(team, port_dev); mutex_unlock(&team->lock); if (err) return err; if (netif_is_team_master(port_dev)) { lockdep_unregister_key(&team->team_lock_key); lockdep_register_key(&team->team_lock_key); lockdep_set_class(&team->lock, &team->team_lock_key); } netdev_change_features(dev); return err; } static netdev_features_t team_fix_features(struct net_device *dev, netdev_features_t features) { struct team_port *port; struct team *team = netdev_priv(dev); netdev_features_t mask; mask = features; features &= ~NETIF_F_ONE_FOR_ALL; features |= NETIF_F_ALL_FOR_ALL; rcu_read_lock(); list_for_each_entry_rcu(port, &team->port_list, list) { features = netdev_increment_features(features, port->dev->features, mask); } rcu_read_unlock(); features = netdev_add_tso_features(features, mask); return features; } static int team_change_carrier(struct net_device *dev, bool new_carrier) { struct team *team = netdev_priv(dev); team->user_carrier_enabled = true; if (new_carrier) netif_carrier_on(dev); else netif_carrier_off(dev); return 0; } static const struct net_device_ops team_netdev_ops = { .ndo_init = team_init, .ndo_uninit = team_uninit, .ndo_open = team_open, .ndo_stop = team_close, .ndo_start_xmit = team_xmit, .ndo_select_queue = team_select_queue, .ndo_change_rx_flags = team_change_rx_flags, .ndo_set_rx_mode = team_set_rx_mode, .ndo_set_mac_address = team_set_mac_address, .ndo_change_mtu = team_change_mtu, .ndo_get_stats64 = team_get_stats64, .ndo_vlan_rx_add_vid = team_vlan_rx_add_vid, .ndo_vlan_rx_kill_vid = team_vlan_rx_kill_vid, #ifdef CONFIG_NET_POLL_CONTROLLER .ndo_poll_controller = team_poll_controller, .ndo_netpoll_setup = team_netpoll_setup, .ndo_netpoll_cleanup = team_netpoll_cleanup, #endif .ndo_add_slave = team_add_slave, .ndo_del_slave = team_del_slave, .ndo_fix_features = team_fix_features, .ndo_change_carrier = team_change_carrier, .ndo_features_check = passthru_features_check, }; /*********************** * ethtool interface ***********************/ static void team_ethtool_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *drvinfo) { strscpy(drvinfo->driver, DRV_NAME, sizeof(drvinfo->driver)); strscpy(drvinfo->version, UTS_RELEASE, sizeof(drvinfo->version)); } static int team_ethtool_get_link_ksettings(struct net_device *dev, struct ethtool_link_ksettings *cmd) { struct team *team= netdev_priv(dev); unsigned long speed = 0; struct team_port *port; cmd->base.duplex = DUPLEX_UNKNOWN; cmd->base.port = PORT_OTHER; rcu_read_lock(); list_for_each_entry_rcu(port, &team->port_list, list) { if (team_port_txable(port)) { if (port->state.speed != SPEED_UNKNOWN) speed += port->state.speed; if (cmd->base.duplex == DUPLEX_UNKNOWN && port->state.duplex != DUPLEX_UNKNOWN) cmd->base.duplex = port->state.duplex; } } rcu_read_unlock(); cmd->base.speed = speed ? : SPEED_UNKNOWN; return 0; } static const struct ethtool_ops team_ethtool_ops = { .get_drvinfo = team_ethtool_get_drvinfo, .get_link = ethtool_op_get_link, .get_link_ksettings = team_ethtool_get_link_ksettings, }; /*********************** * rt netlink interface ***********************/ static void team_setup_by_port(struct net_device *dev, struct net_device *port_dev) { struct team *team = netdev_priv(dev); if (port_dev->type == ARPHRD_ETHER) dev->header_ops = team->header_ops_cache; else dev->header_ops = port_dev->header_ops; dev->type = port_dev->type; dev->hard_header_len = port_dev->hard_header_len; dev->needed_headroom = port_dev->needed_headroom; dev->addr_len = port_dev->addr_len; dev->mtu = port_dev->mtu; memcpy(dev->broadcast, port_dev->broadcast, port_dev->addr_len); eth_hw_addr_inherit(dev, port_dev); if (port_dev->flags & IFF_POINTOPOINT) { dev->flags &= ~(IFF_BROADCAST | IFF_MULTICAST); dev->flags |= (IFF_POINTOPOINT | IFF_NOARP); } else if ((port_dev->flags & (IFF_BROADCAST | IFF_MULTICAST)) == (IFF_BROADCAST | IFF_MULTICAST)) { dev->flags |= (IFF_BROADCAST | IFF_MULTICAST); dev->flags &= ~(IFF_POINTOPOINT | IFF_NOARP); } } static int team_dev_type_check_change(struct net_device *dev, struct net_device *port_dev) { struct team *team = netdev_priv(dev); char *portname = port_dev->name; int err; if (dev->type == port_dev->type) return 0; if (!list_empty(&team->port_list)) { netdev_err(dev, "Device %s is of different type\n", portname); return -EBUSY; } err = call_netdevice_notifiers(NETDEV_PRE_TYPE_CHANGE, dev); err = notifier_to_errno(err); if (err) { netdev_err(dev, "Refused to change device type\n"); return err; } dev_uc_flush(dev); dev_mc_flush(dev); team_setup_by_port(dev, port_dev); call_netdevice_notifiers(NETDEV_POST_TYPE_CHANGE, dev); return 0; } static void team_setup(struct net_device *dev) { struct team *team = netdev_priv(dev); ether_setup(dev); dev->max_mtu = ETH_MAX_MTU; team->header_ops_cache = dev->header_ops; dev->netdev_ops = &team_netdev_ops; dev->ethtool_ops = &team_ethtool_ops; dev->needs_free_netdev = true; dev->priv_destructor = team_destructor; dev->priv_flags &= ~(IFF_XMIT_DST_RELEASE | IFF_TX_SKB_SHARING); dev->priv_flags |= IFF_NO_QUEUE; dev->priv_flags |= IFF_TEAM; /* * Indicate we support unicast address filtering. That way core won't * bring us to promisc mode in case a unicast addr is added. * Let this up to underlay drivers. */ dev->priv_flags |= IFF_UNICAST_FLT | IFF_LIVE_ADDR_CHANGE; dev->features |= NETIF_F_LLTX; dev->features |= NETIF_F_GRO; /* Don't allow team devices to change network namespaces. */ dev->features |= NETIF_F_NETNS_LOCAL; dev->hw_features = TEAM_VLAN_FEATURES | NETIF_F_HW_VLAN_CTAG_RX | NETIF_F_HW_VLAN_CTAG_FILTER | NETIF_F_HW_VLAN_STAG_RX | NETIF_F_HW_VLAN_STAG_FILTER; dev->hw_features |= NETIF_F_GSO_ENCAP_ALL; dev->features |= dev->hw_features; dev->features |= NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX; } static int team_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { if (tb[IFLA_ADDRESS] == NULL) eth_hw_addr_random(dev); return register_netdevice(dev); } static int team_validate(struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { if (tb[IFLA_ADDRESS]) { if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) return -EINVAL; if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) return -EADDRNOTAVAIL; } return 0; } static unsigned int team_get_num_tx_queues(void) { return TEAM_DEFAULT_NUM_TX_QUEUES; } static unsigned int team_get_num_rx_queues(void) { return TEAM_DEFAULT_NUM_RX_QUEUES; } static struct rtnl_link_ops team_link_ops __read_mostly = { .kind = DRV_NAME, .priv_size = sizeof(struct team), .setup = team_setup, .newlink = team_newlink, .validate = team_validate, .get_num_tx_queues = team_get_num_tx_queues, .get_num_rx_queues = team_get_num_rx_queues, }; /*********************************** * Generic netlink custom interface ***********************************/ static struct genl_family team_nl_family; static const struct nla_policy team_nl_policy[TEAM_ATTR_MAX + 1] = { [TEAM_ATTR_UNSPEC] = { .type = NLA_UNSPEC, }, [TEAM_ATTR_TEAM_IFINDEX] = { .type = NLA_U32 }, [TEAM_ATTR_LIST_OPTION] = { .type = NLA_NESTED }, [TEAM_ATTR_LIST_PORT] = { .type = NLA_NESTED }, }; static const struct nla_policy team_nl_option_policy[TEAM_ATTR_OPTION_MAX + 1] = { [TEAM_ATTR_OPTION_UNSPEC] = { .type = NLA_UNSPEC, }, [TEAM_ATTR_OPTION_NAME] = { .type = NLA_STRING, .len = TEAM_STRING_MAX_LEN, }, [TEAM_ATTR_OPTION_CHANGED] = { .type = NLA_FLAG }, [TEAM_ATTR_OPTION_TYPE] = { .type = NLA_U8 }, [TEAM_ATTR_OPTION_DATA] = { .type = NLA_BINARY }, [TEAM_ATTR_OPTION_PORT_IFINDEX] = { .type = NLA_U32 }, [TEAM_ATTR_OPTION_ARRAY_INDEX] = { .type = NLA_U32 }, }; static int team_nl_cmd_noop(struct sk_buff *skb, struct genl_info *info) { struct sk_buff *msg; void *hdr; int err; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; hdr = genlmsg_put(msg, info->snd_portid, info->snd_seq, &team_nl_family, 0, TEAM_CMD_NOOP); if (!hdr) { err = -EMSGSIZE; goto err_msg_put; } genlmsg_end(msg, hdr); return genlmsg_unicast(genl_info_net(info), msg, info->snd_portid); err_msg_put: nlmsg_free(msg); return err; } /* * Netlink cmd functions should be locked by following two functions. * Since dev gets held here, that ensures dev won't disappear in between. */ static struct team *team_nl_team_get(struct genl_info *info) { struct net *net = genl_info_net(info); int ifindex; struct net_device *dev; struct team *team; if (!info->attrs[TEAM_ATTR_TEAM_IFINDEX]) return NULL; ifindex = nla_get_u32(info->attrs[TEAM_ATTR_TEAM_IFINDEX]); dev = dev_get_by_index(net, ifindex); if (!dev || dev->netdev_ops != &team_netdev_ops) { dev_put(dev); return NULL; } team = netdev_priv(dev); mutex_lock(&team->lock); return team; } static void team_nl_team_put(struct team *team) { mutex_unlock(&team->lock); dev_put(team->dev); } typedef int team_nl_send_func_t(struct sk_buff *skb, struct team *team, u32 portid); static int team_nl_send_unicast(struct sk_buff *skb, struct team *team, u32 portid) { return genlmsg_unicast(dev_net(team->dev), skb, portid); } static int team_nl_fill_one_option_get(struct sk_buff *skb, struct team *team, struct team_option_inst *opt_inst) { struct nlattr *option_item; struct team_option *option = opt_inst->option; struct team_option_inst_info *opt_inst_info = &opt_inst->info; struct team_gsetter_ctx ctx; int err; ctx.info = opt_inst_info; err = team_option_get(team, opt_inst, &ctx); if (err) return err; option_item = nla_nest_start_noflag(skb, TEAM_ATTR_ITEM_OPTION); if (!option_item) return -EMSGSIZE; if (nla_put_string(skb, TEAM_ATTR_OPTION_NAME, option->name)) goto nest_cancel; if (opt_inst_info->port && nla_put_u32(skb, TEAM_ATTR_OPTION_PORT_IFINDEX, opt_inst_info->port->dev->ifindex)) goto nest_cancel; if (opt_inst->option->array_size && nla_put_u32(skb, TEAM_ATTR_OPTION_ARRAY_INDEX, opt_inst_info->array_index)) goto nest_cancel; switch (option->type) { case TEAM_OPTION_TYPE_U32: if (nla_put_u8(skb, TEAM_ATTR_OPTION_TYPE, NLA_U32)) goto nest_cancel; if (nla_put_u32(skb, TEAM_ATTR_OPTION_DATA, ctx.data.u32_val)) goto nest_cancel; break; case TEAM_OPTION_TYPE_STRING: if (nla_put_u8(skb, TEAM_ATTR_OPTION_TYPE, NLA_STRING)) goto nest_cancel; if (nla_put_string(skb, TEAM_ATTR_OPTION_DATA, ctx.data.str_val)) goto nest_cancel; break; case TEAM_OPTION_TYPE_BINARY: if (nla_put_u8(skb, TEAM_ATTR_OPTION_TYPE, NLA_BINARY)) goto nest_cancel; if (nla_put(skb, TEAM_ATTR_OPTION_DATA, ctx.data.bin_val.len, ctx.data.bin_val.ptr)) goto nest_cancel; break; case TEAM_OPTION_TYPE_BOOL: if (nla_put_u8(skb, TEAM_ATTR_OPTION_TYPE, NLA_FLAG)) goto nest_cancel; if (ctx.data.bool_val && nla_put_flag(skb, TEAM_ATTR_OPTION_DATA)) goto nest_cancel; break; case TEAM_OPTION_TYPE_S32: if (nla_put_u8(skb, TEAM_ATTR_OPTION_TYPE, NLA_S32)) goto nest_cancel; if (nla_put_s32(skb, TEAM_ATTR_OPTION_DATA, ctx.data.s32_val)) goto nest_cancel; break; default: BUG(); } if (opt_inst->removed && nla_put_flag(skb, TEAM_ATTR_OPTION_REMOVED)) goto nest_cancel; if (opt_inst->changed) { if (nla_put_flag(skb, TEAM_ATTR_OPTION_CHANGED)) goto nest_cancel; opt_inst->changed = false; } nla_nest_end(skb, option_item); return 0; nest_cancel: nla_nest_cancel(skb, option_item); return -EMSGSIZE; } static int __send_and_alloc_skb(struct sk_buff **pskb, struct team *team, u32 portid, team_nl_send_func_t *send_func) { int err; if (*pskb) { err = send_func(*pskb, team, portid); if (err) return err; } *pskb = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!*pskb) return -ENOMEM; return 0; } static int team_nl_send_options_get(struct team *team, u32 portid, u32 seq, int flags, team_nl_send_func_t *send_func, struct list_head *sel_opt_inst_list) { struct nlattr *option_list; struct nlmsghdr *nlh; void *hdr; struct team_option_inst *opt_inst; int err; struct sk_buff *skb = NULL; bool incomplete; int i; opt_inst = list_first_entry(sel_opt_inst_list, struct team_option_inst, tmp_list); start_again: err = __send_and_alloc_skb(&skb, team, portid, send_func); if (err) return err; hdr = genlmsg_put(skb, portid, seq, &team_nl_family, flags | NLM_F_MULTI, TEAM_CMD_OPTIONS_GET); if (!hdr) { nlmsg_free(skb); return -EMSGSIZE; } if (nla_put_u32(skb, TEAM_ATTR_TEAM_IFINDEX, team->dev->ifindex)) goto nla_put_failure; option_list = nla_nest_start_noflag(skb, TEAM_ATTR_LIST_OPTION); if (!option_list) goto nla_put_failure; i = 0; incomplete = false; list_for_each_entry_from(opt_inst, sel_opt_inst_list, tmp_list) { err = team_nl_fill_one_option_get(skb, team, opt_inst); if (err) { if (err == -EMSGSIZE) { if (!i) goto errout; incomplete = true; break; } goto errout; } i++; } nla_nest_end(skb, option_list); genlmsg_end(skb, hdr); if (incomplete) goto start_again; send_done: nlh = nlmsg_put(skb, portid, seq, NLMSG_DONE, 0, flags | NLM_F_MULTI); if (!nlh) { err = __send_and_alloc_skb(&skb, team, portid, send_func); if (err) return err; goto send_done; } return send_func(skb, team, portid); nla_put_failure: err = -EMSGSIZE; errout: nlmsg_free(skb); return err; } static int team_nl_cmd_options_get(struct sk_buff *skb, struct genl_info *info) { struct team *team; struct team_option_inst *opt_inst; int err; LIST_HEAD(sel_opt_inst_list); team = team_nl_team_get(info); if (!team) return -EINVAL; list_for_each_entry(opt_inst, &team->option_inst_list, list) list_add_tail(&opt_inst->tmp_list, &sel_opt_inst_list); err = team_nl_send_options_get(team, info->snd_portid, info->snd_seq, NLM_F_ACK, team_nl_send_unicast, &sel_opt_inst_list); team_nl_team_put(team); return err; } static int team_nl_send_event_options_get(struct team *team, struct list_head *sel_opt_inst_list); static int team_nl_cmd_options_set(struct sk_buff *skb, struct genl_info *info) { struct team *team; int err = 0; int i; struct nlattr *nl_option; rtnl_lock(); team = team_nl_team_get(info); if (!team) { err = -EINVAL; goto rtnl_unlock; } err = -EINVAL; if (!info->attrs[TEAM_ATTR_LIST_OPTION]) { err = -EINVAL; goto team_put; } nla_for_each_nested(nl_option, info->attrs[TEAM_ATTR_LIST_OPTION], i) { struct nlattr *opt_attrs[TEAM_ATTR_OPTION_MAX + 1]; struct nlattr *attr; struct nlattr *attr_data; LIST_HEAD(opt_inst_list); enum team_option_type opt_type; int opt_port_ifindex = 0; /* != 0 for per-port options */ u32 opt_array_index = 0; bool opt_is_array = false; struct team_option_inst *opt_inst; char *opt_name; bool opt_found = false; if (nla_type(nl_option) != TEAM_ATTR_ITEM_OPTION) { err = -EINVAL; goto team_put; } err = nla_parse_nested_deprecated(opt_attrs, TEAM_ATTR_OPTION_MAX, nl_option, team_nl_option_policy, info->extack); if (err) goto team_put; if (!opt_attrs[TEAM_ATTR_OPTION_NAME] || !opt_attrs[TEAM_ATTR_OPTION_TYPE]) { err = -EINVAL; goto team_put; } switch (nla_get_u8(opt_attrs[TEAM_ATTR_OPTION_TYPE])) { case NLA_U32: opt_type = TEAM_OPTION_TYPE_U32; break; case NLA_STRING: opt_type = TEAM_OPTION_TYPE_STRING; break; case NLA_BINARY: opt_type = TEAM_OPTION_TYPE_BINARY; break; case NLA_FLAG: opt_type = TEAM_OPTION_TYPE_BOOL; break; case NLA_S32: opt_type = TEAM_OPTION_TYPE_S32; break; default: goto team_put; } attr_data = opt_attrs[TEAM_ATTR_OPTION_DATA]; if (opt_type != TEAM_OPTION_TYPE_BOOL && !attr_data) { err = -EINVAL; goto team_put; } opt_name = nla_data(opt_attrs[TEAM_ATTR_OPTION_NAME]); attr = opt_attrs[TEAM_ATTR_OPTION_PORT_IFINDEX]; if (attr) opt_port_ifindex = nla_get_u32(attr); attr = opt_attrs[TEAM_ATTR_OPTION_ARRAY_INDEX]; if (attr) { opt_is_array = true; opt_array_index = nla_get_u32(attr); } list_for_each_entry(opt_inst, &team->option_inst_list, list) { struct team_option *option = opt_inst->option; struct team_gsetter_ctx ctx; struct team_option_inst_info *opt_inst_info; int tmp_ifindex; opt_inst_info = &opt_inst->info; tmp_ifindex = opt_inst_info->port ? opt_inst_info->port->dev->ifindex : 0; if (option->type != opt_type || strcmp(option->name, opt_name) || tmp_ifindex != opt_port_ifindex || (option->array_size && !opt_is_array) || opt_inst_info->array_index != opt_array_index) continue; opt_found = true; ctx.info = opt_inst_info; switch (opt_type) { case TEAM_OPTION_TYPE_U32: ctx.data.u32_val = nla_get_u32(attr_data); break; case TEAM_OPTION_TYPE_STRING: if (nla_len(attr_data) > TEAM_STRING_MAX_LEN) { err = -EINVAL; goto team_put; } ctx.data.str_val = nla_data(attr_data); break; case TEAM_OPTION_TYPE_BINARY: ctx.data.bin_val.len = nla_len(attr_data); ctx.data.bin_val.ptr = nla_data(attr_data); break; case TEAM_OPTION_TYPE_BOOL: ctx.data.bool_val = attr_data ? true : false; break; case TEAM_OPTION_TYPE_S32: ctx.data.s32_val = nla_get_s32(attr_data); break; default: BUG(); } err = team_option_set(team, opt_inst, &ctx); if (err) goto team_put; opt_inst->changed = true; list_add(&opt_inst->tmp_list, &opt_inst_list); } if (!opt_found) { err = -ENOENT; goto team_put; } err = team_nl_send_event_options_get(team, &opt_inst_list); if (err) break; } team_put: team_nl_team_put(team); rtnl_unlock: rtnl_unlock(); return err; } static int team_nl_fill_one_port_get(struct sk_buff *skb, struct team_port *port) { struct nlattr *port_item; port_item = nla_nest_start_noflag(skb, TEAM_ATTR_ITEM_PORT); if (!port_item) goto nest_cancel; if (nla_put_u32(skb, TEAM_ATTR_PORT_IFINDEX, port->dev->ifindex)) goto nest_cancel; if (port->changed) { if (nla_put_flag(skb, TEAM_ATTR_PORT_CHANGED)) goto nest_cancel; port->changed = false; } if ((port->removed && nla_put_flag(skb, TEAM_ATTR_PORT_REMOVED)) || (port->state.linkup && nla_put_flag(skb, TEAM_ATTR_PORT_LINKUP)) || nla_put_u32(skb, TEAM_ATTR_PORT_SPEED, port->state.speed) || nla_put_u8(skb, TEAM_ATTR_PORT_DUPLEX, port->state.duplex)) goto nest_cancel; nla_nest_end(skb, port_item); return 0; nest_cancel: nla_nest_cancel(skb, port_item); return -EMSGSIZE; } static int team_nl_send_port_list_get(struct team *team, u32 portid, u32 seq, int flags, team_nl_send_func_t *send_func, struct team_port *one_port) { struct nlattr *port_list; struct nlmsghdr *nlh; void *hdr; struct team_port *port; int err; struct sk_buff *skb = NULL; bool incomplete; int i; port = list_first_entry_or_null(&team->port_list, struct team_port, list); start_again: err = __send_and_alloc_skb(&skb, team, portid, send_func); if (err) return err; hdr = genlmsg_put(skb, portid, seq, &team_nl_family, flags | NLM_F_MULTI, TEAM_CMD_PORT_LIST_GET); if (!hdr) { nlmsg_free(skb); return -EMSGSIZE; } if (nla_put_u32(skb, TEAM_ATTR_TEAM_IFINDEX, team->dev->ifindex)) goto nla_put_failure; port_list = nla_nest_start_noflag(skb, TEAM_ATTR_LIST_PORT); if (!port_list) goto nla_put_failure; i = 0; incomplete = false; /* If one port is selected, called wants to send port list containing * only this port. Otherwise go through all listed ports and send all */ if (one_port) { err = team_nl_fill_one_port_get(skb, one_port); if (err) goto errout; } else if (port) { list_for_each_entry_from(port, &team->port_list, list) { err = team_nl_fill_one_port_get(skb, port); if (err) { if (err == -EMSGSIZE) { if (!i) goto errout; incomplete = true; break; } goto errout; } i++; } } nla_nest_end(skb, port_list); genlmsg_end(skb, hdr); if (incomplete) goto start_again; send_done: nlh = nlmsg_put(skb, portid, seq, NLMSG_DONE, 0, flags | NLM_F_MULTI); if (!nlh) { err = __send_and_alloc_skb(&skb, team, portid, send_func); if (err) return err; goto send_done; } return send_func(skb, team, portid); nla_put_failure: err = -EMSGSIZE; errout: nlmsg_free(skb); return err; } static int team_nl_cmd_port_list_get(struct sk_buff *skb, struct genl_info *info) { struct team *team; int err; team = team_nl_team_get(info); if (!team) return -EINVAL; err = team_nl_send_port_list_get(team, info->snd_portid, info->snd_seq, NLM_F_ACK, team_nl_send_unicast, NULL); team_nl_team_put(team); return err; } static const struct genl_small_ops team_nl_ops[] = { { .cmd = TEAM_CMD_NOOP, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = team_nl_cmd_noop, }, { .cmd = TEAM_CMD_OPTIONS_SET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = team_nl_cmd_options_set, .flags = GENL_ADMIN_PERM, }, { .cmd = TEAM_CMD_OPTIONS_GET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = team_nl_cmd_options_get, .flags = GENL_ADMIN_PERM, }, { .cmd = TEAM_CMD_PORT_LIST_GET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = team_nl_cmd_port_list_get, .flags = GENL_ADMIN_PERM, }, }; static const struct genl_multicast_group team_nl_mcgrps[] = { { .name = TEAM_GENL_CHANGE_EVENT_MC_GRP_NAME, }, }; static struct genl_family team_nl_family __ro_after_init = { .name = TEAM_GENL_NAME, .version = TEAM_GENL_VERSION, .maxattr = TEAM_ATTR_MAX, .policy = team_nl_policy, .netnsok = true, .module = THIS_MODULE, .small_ops = team_nl_ops, .n_small_ops = ARRAY_SIZE(team_nl_ops), .resv_start_op = TEAM_CMD_PORT_LIST_GET + 1, .mcgrps = team_nl_mcgrps, .n_mcgrps = ARRAY_SIZE(team_nl_mcgrps), }; static int team_nl_send_multicast(struct sk_buff *skb, struct team *team, u32 portid) { return genlmsg_multicast_netns(&team_nl_family, dev_net(team->dev), skb, 0, 0, GFP_KERNEL); } static int team_nl_send_event_options_get(struct team *team, struct list_head *sel_opt_inst_list) { return team_nl_send_options_get(team, 0, 0, 0, team_nl_send_multicast, sel_opt_inst_list); } static int team_nl_send_event_port_get(struct team *team, struct team_port *port) { return team_nl_send_port_list_get(team, 0, 0, 0, team_nl_send_multicast, port); } static int __init team_nl_init(void) { return genl_register_family(&team_nl_family); } static void __exit team_nl_fini(void) { genl_unregister_family(&team_nl_family); } /****************** * Change checkers ******************/ static void __team_options_change_check(struct team *team) { int err; struct team_option_inst *opt_inst; LIST_HEAD(sel_opt_inst_list); list_for_each_entry(opt_inst, &team->option_inst_list, list) { if (opt_inst->changed) list_add_tail(&opt_inst->tmp_list, &sel_opt_inst_list); } err = team_nl_send_event_options_get(team, &sel_opt_inst_list); if (err && err != -ESRCH) netdev_warn(team->dev, "Failed to send options change via netlink (err %d)\n", err); } /* rtnl lock is held */ static void __team_port_change_send(struct team_port *port, bool linkup) { int err; port->changed = true; port->state.linkup = linkup; team_refresh_port_linkup(port); if (linkup) { struct ethtool_link_ksettings ecmd; err = __ethtool_get_link_ksettings(port->dev, &ecmd); if (!err) { port->state.speed = ecmd.base.speed; port->state.duplex = ecmd.base.duplex; goto send_event; } } port->state.speed = 0; port->state.duplex = 0; send_event: err = team_nl_send_event_port_get(port->team, port); if (err && err != -ESRCH) netdev_warn(port->team->dev, "Failed to send port change of device %s via netlink (err %d)\n", port->dev->name, err); } static void __team_carrier_check(struct team *team) { struct team_port *port; bool team_linkup; if (team->user_carrier_enabled) return; team_linkup = false; list_for_each_entry(port, &team->port_list, list) { if (port->linkup) { team_linkup = true; break; } } if (team_linkup) netif_carrier_on(team->dev); else netif_carrier_off(team->dev); } static void __team_port_change_check(struct team_port *port, bool linkup) { if (port->state.linkup != linkup) __team_port_change_send(port, linkup); __team_carrier_check(port->team); } static void __team_port_change_port_added(struct team_port *port, bool linkup) { __team_port_change_send(port, linkup); __team_carrier_check(port->team); } static void __team_port_change_port_removed(struct team_port *port) { port->removed = true; __team_port_change_send(port, false); __team_carrier_check(port->team); } static void team_port_change_check(struct team_port *port, bool linkup) { struct team *team = port->team; mutex_lock(&team->lock); __team_port_change_check(port, linkup); mutex_unlock(&team->lock); } /************************************ * Net device notifier event handler ************************************/ static int team_device_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct team_port *port; port = team_port_get_rtnl(dev); if (!port) return NOTIFY_DONE; switch (event) { case NETDEV_UP: if (netif_oper_up(dev)) team_port_change_check(port, true); break; case NETDEV_DOWN: team_port_change_check(port, false); break; case NETDEV_CHANGE: if (netif_running(port->dev)) team_port_change_check(port, !!netif_oper_up(port->dev)); break; case NETDEV_UNREGISTER: team_del_slave(port->team->dev, dev); break; case NETDEV_FEAT_CHANGE: if (!port->team->notifier_ctx) { port->team->notifier_ctx = true; team_compute_features(port->team); port->team->notifier_ctx = false; } break; case NETDEV_PRECHANGEMTU: /* Forbid to change mtu of underlaying device */ if (!port->team->port_mtu_change_allowed) return NOTIFY_BAD; break; case NETDEV_PRE_TYPE_CHANGE: /* Forbid to change type of underlaying device */ return NOTIFY_BAD; case NETDEV_RESEND_IGMP: /* Propagate to master device */ call_netdevice_notifiers(event, port->team->dev); break; } return NOTIFY_DONE; } static struct notifier_block team_notifier_block __read_mostly = { .notifier_call = team_device_event, }; /*********************** * Module init and exit ***********************/ static int __init team_module_init(void) { int err; register_netdevice_notifier(&team_notifier_block); err = rtnl_link_register(&team_link_ops); if (err) goto err_rtnl_reg; err = team_nl_init(); if (err) goto err_nl_init; return 0; err_nl_init: rtnl_link_unregister(&team_link_ops); err_rtnl_reg: unregister_netdevice_notifier(&team_notifier_block); return err; } static void __exit team_module_exit(void) { team_nl_fini(); rtnl_link_unregister(&team_link_ops); unregister_netdevice_notifier(&team_notifier_block); } module_init(team_module_init); module_exit(team_module_exit); MODULE_LICENSE("GPL v2"); MODULE_AUTHOR("Jiri Pirko <jpirko@redhat.com>"); MODULE_DESCRIPTION("Ethernet team device driver"); MODULE_ALIAS_RTNL_LINK(DRV_NAME); |
| 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 | /* BlueZ - Bluetooth protocol stack for Linux Copyright (C) 2000-2001 Qualcomm Incorporated Written 2000,2001 by Maxim Krasnyansky <maxk@qualcomm.com> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License version 2 as published by the Free Software Foundation; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS SOFTWARE IS DISCLAIMED. */ /* Bluetooth SCO sockets. */ #include <linux/module.h> #include <linux/debugfs.h> #include <linux/seq_file.h> #include <linux/sched/signal.h> #include <net/bluetooth/bluetooth.h> #include <net/bluetooth/hci_core.h> #include <net/bluetooth/sco.h> static bool disable_esco; static const struct proto_ops sco_sock_ops; static struct bt_sock_list sco_sk_list = { .lock = __RW_LOCK_UNLOCKED(sco_sk_list.lock) }; /* ---- SCO connections ---- */ struct sco_conn { struct hci_conn *hcon; spinlock_t lock; struct sock *sk; struct delayed_work timeout_work; unsigned int mtu; }; #define sco_conn_lock(c) spin_lock(&c->lock) #define sco_conn_unlock(c) spin_unlock(&c->lock) static void sco_sock_close(struct sock *sk); static void sco_sock_kill(struct sock *sk); /* ----- SCO socket info ----- */ #define sco_pi(sk) ((struct sco_pinfo *) sk) struct sco_pinfo { struct bt_sock bt; bdaddr_t src; bdaddr_t dst; __u32 flags; __u16 setting; struct bt_codec codec; struct sco_conn *conn; }; /* ---- SCO timers ---- */ #define SCO_CONN_TIMEOUT (HZ * 40) #define SCO_DISCONN_TIMEOUT (HZ * 2) static void sco_sock_timeout(struct work_struct *work) { struct sco_conn *conn = container_of(work, struct sco_conn, timeout_work.work); struct sock *sk; sco_conn_lock(conn); sk = conn->sk; if (sk) sock_hold(sk); sco_conn_unlock(conn); if (!sk) return; BT_DBG("sock %p state %d", sk, sk->sk_state); lock_sock(sk); sk->sk_err = ETIMEDOUT; sk->sk_state_change(sk); release_sock(sk); sock_put(sk); } static void sco_sock_set_timer(struct sock *sk, long timeout) { if (!sco_pi(sk)->conn) return; BT_DBG("sock %p state %d timeout %ld", sk, sk->sk_state, timeout); cancel_delayed_work(&sco_pi(sk)->conn->timeout_work); schedule_delayed_work(&sco_pi(sk)->conn->timeout_work, timeout); } static void sco_sock_clear_timer(struct sock *sk) { if (!sco_pi(sk)->conn) return; BT_DBG("sock %p state %d", sk, sk->sk_state); cancel_delayed_work(&sco_pi(sk)->conn->timeout_work); } /* ---- SCO connections ---- */ static struct sco_conn *sco_conn_add(struct hci_conn *hcon) { struct hci_dev *hdev = hcon->hdev; struct sco_conn *conn = hcon->sco_data; if (conn) { if (!conn->hcon) conn->hcon = hcon; return conn; } conn = kzalloc(sizeof(struct sco_conn), GFP_KERNEL); if (!conn) return NULL; spin_lock_init(&conn->lock); INIT_DELAYED_WORK(&conn->timeout_work, sco_sock_timeout); hcon->sco_data = conn; conn->hcon = hcon; if (hdev->sco_mtu > 0) conn->mtu = hdev->sco_mtu; else conn->mtu = 60; BT_DBG("hcon %p conn %p", hcon, conn); return conn; } /* Delete channel. * Must be called on the locked socket. */ static void sco_chan_del(struct sock *sk, int err) { struct sco_conn *conn; conn = sco_pi(sk)->conn; BT_DBG("sk %p, conn %p, err %d", sk, conn, err); if (conn) { sco_conn_lock(conn); conn->sk = NULL; sco_pi(sk)->conn = NULL; sco_conn_unlock(conn); if (conn->hcon) hci_conn_drop(conn->hcon); } sk->sk_state = BT_CLOSED; sk->sk_err = err; sk->sk_state_change(sk); sock_set_flag(sk, SOCK_ZAPPED); } static void sco_conn_del(struct hci_conn *hcon, int err) { struct sco_conn *conn = hcon->sco_data; struct sock *sk; if (!conn) return; BT_DBG("hcon %p conn %p, err %d", hcon, conn, err); /* Kill socket */ sco_conn_lock(conn); sk = conn->sk; if (sk) sock_hold(sk); sco_conn_unlock(conn); if (sk) { lock_sock(sk); sco_sock_clear_timer(sk); sco_chan_del(sk, err); release_sock(sk); sock_put(sk); } /* Ensure no more work items will run before freeing conn. */ cancel_delayed_work_sync(&conn->timeout_work); hcon->sco_data = NULL; kfree(conn); } static void __sco_chan_add(struct sco_conn *conn, struct sock *sk, struct sock *parent) { BT_DBG("conn %p", conn); sco_pi(sk)->conn = conn; conn->sk = sk; if (parent) bt_accept_enqueue(parent, sk, true); } static int sco_chan_add(struct sco_conn *conn, struct sock *sk, struct sock *parent) { int err = 0; sco_conn_lock(conn); if (conn->sk) err = -EBUSY; else __sco_chan_add(conn, sk, parent); sco_conn_unlock(conn); return err; } static int sco_connect(struct sock *sk) { struct sco_conn *conn; struct hci_conn *hcon; struct hci_dev *hdev; int err, type; BT_DBG("%pMR -> %pMR", &sco_pi(sk)->src, &sco_pi(sk)->dst); hdev = hci_get_route(&sco_pi(sk)->dst, &sco_pi(sk)->src, BDADDR_BREDR); if (!hdev) return -EHOSTUNREACH; hci_dev_lock(hdev); if (lmp_esco_capable(hdev) && !disable_esco) type = ESCO_LINK; else type = SCO_LINK; if (sco_pi(sk)->setting == BT_VOICE_TRANSPARENT && (!lmp_transp_capable(hdev) || !lmp_esco_capable(hdev))) { err = -EOPNOTSUPP; goto unlock; } hcon = hci_connect_sco(hdev, type, &sco_pi(sk)->dst, sco_pi(sk)->setting, &sco_pi(sk)->codec); if (IS_ERR(hcon)) { err = PTR_ERR(hcon); goto unlock; } conn = sco_conn_add(hcon); if (!conn) { hci_conn_drop(hcon); err = -ENOMEM; goto unlock; } lock_sock(sk); err = sco_chan_add(conn, sk, NULL); if (err) { release_sock(sk); goto unlock; } /* Update source addr of the socket */ bacpy(&sco_pi(sk)->src, &hcon->src); if (hcon->state == BT_CONNECTED) { sco_sock_clear_timer(sk); sk->sk_state = BT_CONNECTED; } else { sk->sk_state = BT_CONNECT; sco_sock_set_timer(sk, sk->sk_sndtimeo); } release_sock(sk); unlock: hci_dev_unlock(hdev); hci_dev_put(hdev); return err; } static int sco_send_frame(struct sock *sk, struct sk_buff *skb) { struct sco_conn *conn = sco_pi(sk)->conn; int len = skb->len; /* Check outgoing MTU */ if (len > conn->mtu) return -EINVAL; BT_DBG("sk %p len %d", sk, len); hci_send_sco(conn->hcon, skb); return len; } static void sco_recv_frame(struct sco_conn *conn, struct sk_buff *skb) { struct sock *sk; sco_conn_lock(conn); sk = conn->sk; sco_conn_unlock(conn); if (!sk) goto drop; BT_DBG("sk %p len %u", sk, skb->len); if (sk->sk_state != BT_CONNECTED) goto drop; if (!sock_queue_rcv_skb(sk, skb)) return; drop: kfree_skb(skb); } /* -------- Socket interface ---------- */ static struct sock *__sco_get_sock_listen_by_addr(bdaddr_t *ba) { struct sock *sk; sk_for_each(sk, &sco_sk_list.head) { if (sk->sk_state != BT_LISTEN) continue; if (!bacmp(&sco_pi(sk)->src, ba)) return sk; } return NULL; } /* Find socket listening on source bdaddr. * Returns closest match. */ static struct sock *sco_get_sock_listen(bdaddr_t *src) { struct sock *sk = NULL, *sk1 = NULL; read_lock(&sco_sk_list.lock); sk_for_each(sk, &sco_sk_list.head) { if (sk->sk_state != BT_LISTEN) continue; /* Exact match. */ if (!bacmp(&sco_pi(sk)->src, src)) break; /* Closest match */ if (!bacmp(&sco_pi(sk)->src, BDADDR_ANY)) sk1 = sk; } read_unlock(&sco_sk_list.lock); return sk ? sk : sk1; } static void sco_sock_destruct(struct sock *sk) { BT_DBG("sk %p", sk); skb_queue_purge(&sk->sk_receive_queue); skb_queue_purge(&sk->sk_write_queue); } static void sco_sock_cleanup_listen(struct sock *parent) { struct sock *sk; BT_DBG("parent %p", parent); /* Close not yet accepted channels */ while ((sk = bt_accept_dequeue(parent, NULL))) { sco_sock_close(sk); sco_sock_kill(sk); } parent->sk_state = BT_CLOSED; sock_set_flag(parent, SOCK_ZAPPED); } /* Kill socket (only if zapped and orphan) * Must be called on unlocked socket. */ static void sco_sock_kill(struct sock *sk) { if (!sock_flag(sk, SOCK_ZAPPED) || sk->sk_socket) return; BT_DBG("sk %p state %d", sk, sk->sk_state); /* Kill poor orphan */ bt_sock_unlink(&sco_sk_list, sk); sock_set_flag(sk, SOCK_DEAD); sock_put(sk); } static void __sco_sock_close(struct sock *sk) { BT_DBG("sk %p state %d socket %p", sk, sk->sk_state, sk->sk_socket); switch (sk->sk_state) { case BT_LISTEN: sco_sock_cleanup_listen(sk); break; case BT_CONNECTED: case BT_CONFIG: if (sco_pi(sk)->conn->hcon) { sk->sk_state = BT_DISCONN; sco_sock_set_timer(sk, SCO_DISCONN_TIMEOUT); sco_conn_lock(sco_pi(sk)->conn); hci_conn_drop(sco_pi(sk)->conn->hcon); sco_pi(sk)->conn->hcon = NULL; sco_conn_unlock(sco_pi(sk)->conn); } else sco_chan_del(sk, ECONNRESET); break; case BT_CONNECT2: case BT_CONNECT: case BT_DISCONN: sco_chan_del(sk, ECONNRESET); break; default: sock_set_flag(sk, SOCK_ZAPPED); break; } } /* Must be called on unlocked socket. */ static void sco_sock_close(struct sock *sk) { lock_sock(sk); sco_sock_clear_timer(sk); __sco_sock_close(sk); release_sock(sk); } static void sco_sock_init(struct sock *sk, struct sock *parent) { BT_DBG("sk %p", sk); if (parent) { sk->sk_type = parent->sk_type; bt_sk(sk)->flags = bt_sk(parent)->flags; security_sk_clone(parent, sk); } } static struct proto sco_proto = { .name = "SCO", .owner = THIS_MODULE, .obj_size = sizeof(struct sco_pinfo) }; static struct sock *sco_sock_alloc(struct net *net, struct socket *sock, int proto, gfp_t prio, int kern) { struct sock *sk; sk = bt_sock_alloc(net, sock, &sco_proto, proto, prio, kern); if (!sk) return NULL; sk->sk_destruct = sco_sock_destruct; sk->sk_sndtimeo = SCO_CONN_TIMEOUT; sco_pi(sk)->setting = BT_VOICE_CVSD_16BIT; sco_pi(sk)->codec.id = BT_CODEC_CVSD; sco_pi(sk)->codec.cid = 0xffff; sco_pi(sk)->codec.vid = 0xffff; sco_pi(sk)->codec.data_path = 0x00; bt_sock_link(&sco_sk_list, sk); return sk; } static int sco_sock_create(struct net *net, struct socket *sock, int protocol, int kern) { struct sock *sk; BT_DBG("sock %p", sock); sock->state = SS_UNCONNECTED; if (sock->type != SOCK_SEQPACKET) return -ESOCKTNOSUPPORT; sock->ops = &sco_sock_ops; sk = sco_sock_alloc(net, sock, protocol, GFP_ATOMIC, kern); if (!sk) return -ENOMEM; sco_sock_init(sk, NULL); return 0; } static int sco_sock_bind(struct socket *sock, struct sockaddr *addr, int addr_len) { struct sockaddr_sco *sa = (struct sockaddr_sco *) addr; struct sock *sk = sock->sk; int err = 0; if (!addr || addr_len < sizeof(struct sockaddr_sco) || addr->sa_family != AF_BLUETOOTH) return -EINVAL; BT_DBG("sk %p %pMR", sk, &sa->sco_bdaddr); lock_sock(sk); if (sk->sk_state != BT_OPEN) { err = -EBADFD; goto done; } if (sk->sk_type != SOCK_SEQPACKET) { err = -EINVAL; goto done; } bacpy(&sco_pi(sk)->src, &sa->sco_bdaddr); sk->sk_state = BT_BOUND; done: release_sock(sk); return err; } static int sco_sock_connect(struct socket *sock, struct sockaddr *addr, int alen, int flags) { struct sockaddr_sco *sa = (struct sockaddr_sco *) addr; struct sock *sk = sock->sk; int err; BT_DBG("sk %p", sk); if (alen < sizeof(struct sockaddr_sco) || addr->sa_family != AF_BLUETOOTH) return -EINVAL; if (sk->sk_state != BT_OPEN && sk->sk_state != BT_BOUND) return -EBADFD; if (sk->sk_type != SOCK_SEQPACKET) err = -EINVAL; lock_sock(sk); /* Set destination address and psm */ bacpy(&sco_pi(sk)->dst, &sa->sco_bdaddr); release_sock(sk); err = sco_connect(sk); if (err) return err; lock_sock(sk); err = bt_sock_wait_state(sk, BT_CONNECTED, sock_sndtimeo(sk, flags & O_NONBLOCK)); release_sock(sk); return err; } static int sco_sock_listen(struct socket *sock, int backlog) { struct sock *sk = sock->sk; bdaddr_t *src = &sco_pi(sk)->src; int err = 0; BT_DBG("sk %p backlog %d", sk, backlog); lock_sock(sk); if (sk->sk_state != BT_BOUND) { err = -EBADFD; goto done; } if (sk->sk_type != SOCK_SEQPACKET) { err = -EINVAL; goto done; } write_lock(&sco_sk_list.lock); if (__sco_get_sock_listen_by_addr(src)) { err = -EADDRINUSE; goto unlock; } sk->sk_max_ack_backlog = backlog; sk->sk_ack_backlog = 0; sk->sk_state = BT_LISTEN; unlock: write_unlock(&sco_sk_list.lock); done: release_sock(sk); return err; } static int sco_sock_accept(struct socket *sock, struct socket *newsock, int flags, bool kern) { DEFINE_WAIT_FUNC(wait, woken_wake_function); struct sock *sk = sock->sk, *ch; long timeo; int err = 0; lock_sock(sk); timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK); BT_DBG("sk %p timeo %ld", sk, timeo); /* Wait for an incoming connection. (wake-one). */ add_wait_queue_exclusive(sk_sleep(sk), &wait); while (1) { if (sk->sk_state != BT_LISTEN) { err = -EBADFD; break; } ch = bt_accept_dequeue(sk, newsock); if (ch) break; if (!timeo) { err = -EAGAIN; break; } if (signal_pending(current)) { err = sock_intr_errno(timeo); break; } release_sock(sk); timeo = wait_woken(&wait, TASK_INTERRUPTIBLE, timeo); lock_sock(sk); } remove_wait_queue(sk_sleep(sk), &wait); if (err) goto done; newsock->state = SS_CONNECTED; BT_DBG("new socket %p", ch); done: release_sock(sk); return err; } static int sco_sock_getname(struct socket *sock, struct sockaddr *addr, int peer) { struct sockaddr_sco *sa = (struct sockaddr_sco *) addr; struct sock *sk = sock->sk; BT_DBG("sock %p, sk %p", sock, sk); addr->sa_family = AF_BLUETOOTH; if (peer) bacpy(&sa->sco_bdaddr, &sco_pi(sk)->dst); else bacpy(&sa->sco_bdaddr, &sco_pi(sk)->src); return sizeof(struct sockaddr_sco); } static int sco_sock_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { struct sock *sk = sock->sk; struct sk_buff *skb; int err; BT_DBG("sock %p, sk %p", sock, sk); err = sock_error(sk); if (err) return err; if (msg->msg_flags & MSG_OOB) return -EOPNOTSUPP; skb = bt_skb_sendmsg(sk, msg, len, len, 0, 0); if (IS_ERR(skb)) return PTR_ERR(skb); lock_sock(sk); if (sk->sk_state == BT_CONNECTED) err = sco_send_frame(sk, skb); else err = -ENOTCONN; release_sock(sk); if (err < 0) kfree_skb(skb); return err; } static void sco_conn_defer_accept(struct hci_conn *conn, u16 setting) { struct hci_dev *hdev = conn->hdev; BT_DBG("conn %p", conn); conn->state = BT_CONFIG; if (!lmp_esco_capable(hdev)) { struct hci_cp_accept_conn_req cp; bacpy(&cp.bdaddr, &conn->dst); cp.role = 0x00; /* Ignored */ hci_send_cmd(hdev, HCI_OP_ACCEPT_CONN_REQ, sizeof(cp), &cp); } else { struct hci_cp_accept_sync_conn_req cp; bacpy(&cp.bdaddr, &conn->dst); cp.pkt_type = cpu_to_le16(conn->pkt_type); cp.tx_bandwidth = cpu_to_le32(0x00001f40); cp.rx_bandwidth = cpu_to_le32(0x00001f40); cp.content_format = cpu_to_le16(setting); switch (setting & SCO_AIRMODE_MASK) { case SCO_AIRMODE_TRANSP: if (conn->pkt_type & ESCO_2EV3) cp.max_latency = cpu_to_le16(0x0008); else cp.max_latency = cpu_to_le16(0x000D); cp.retrans_effort = 0x02; break; case SCO_AIRMODE_CVSD: cp.max_latency = cpu_to_le16(0xffff); cp.retrans_effort = 0xff; break; default: /* use CVSD settings as fallback */ cp.max_latency = cpu_to_le16(0xffff); cp.retrans_effort = 0xff; break; } hci_send_cmd(hdev, HCI_OP_ACCEPT_SYNC_CONN_REQ, sizeof(cp), &cp); } } static int sco_sock_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int flags) { struct sock *sk = sock->sk; struct sco_pinfo *pi = sco_pi(sk); lock_sock(sk); if (sk->sk_state == BT_CONNECT2 && test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags)) { sco_conn_defer_accept(pi->conn->hcon, pi->setting); sk->sk_state = BT_CONFIG; release_sock(sk); return 0; } release_sock(sk); return bt_sock_recvmsg(sock, msg, len, flags); } static int sco_sock_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; int len, err = 0; struct bt_voice voice; u32 opt; struct bt_codecs *codecs; struct hci_dev *hdev; __u8 buffer[255]; BT_DBG("sk %p", sk); lock_sock(sk); switch (optname) { case BT_DEFER_SETUP: if (sk->sk_state != BT_BOUND && sk->sk_state != BT_LISTEN) { err = -EINVAL; break; } if (copy_from_sockptr(&opt, optval, sizeof(u32))) { err = -EFAULT; break; } if (opt) set_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags); else clear_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags); break; case BT_VOICE: if (sk->sk_state != BT_OPEN && sk->sk_state != BT_BOUND && sk->sk_state != BT_CONNECT2) { err = -EINVAL; break; } voice.setting = sco_pi(sk)->setting; len = min_t(unsigned int, sizeof(voice), optlen); if (copy_from_sockptr(&voice, optval, len)) { err = -EFAULT; break; } /* Explicitly check for these values */ if (voice.setting != BT_VOICE_TRANSPARENT && voice.setting != BT_VOICE_CVSD_16BIT) { err = -EINVAL; break; } sco_pi(sk)->setting = voice.setting; hdev = hci_get_route(&sco_pi(sk)->dst, &sco_pi(sk)->src, BDADDR_BREDR); if (!hdev) { err = -EBADFD; break; } if (enhanced_sync_conn_capable(hdev) && voice.setting == BT_VOICE_TRANSPARENT) sco_pi(sk)->codec.id = BT_CODEC_TRANSPARENT; hci_dev_put(hdev); break; case BT_PKT_STATUS: if (copy_from_sockptr(&opt, optval, sizeof(u32))) { err = -EFAULT; break; } if (opt) set_bit(BT_SK_PKT_STATUS, &bt_sk(sk)->flags); else clear_bit(BT_SK_PKT_STATUS, &bt_sk(sk)->flags); break; case BT_CODEC: if (sk->sk_state != BT_OPEN && sk->sk_state != BT_BOUND && sk->sk_state != BT_CONNECT2) { err = -EINVAL; break; } hdev = hci_get_route(&sco_pi(sk)->dst, &sco_pi(sk)->src, BDADDR_BREDR); if (!hdev) { err = -EBADFD; break; } if (!hci_dev_test_flag(hdev, HCI_OFFLOAD_CODECS_ENABLED)) { hci_dev_put(hdev); err = -EOPNOTSUPP; break; } if (!hdev->get_data_path_id) { hci_dev_put(hdev); err = -EOPNOTSUPP; break; } if (optlen < sizeof(struct bt_codecs) || optlen > sizeof(buffer)) { hci_dev_put(hdev); err = -EINVAL; break; } if (copy_from_sockptr(buffer, optval, optlen)) { hci_dev_put(hdev); err = -EFAULT; break; } codecs = (void *)buffer; if (codecs->num_codecs > 1) { hci_dev_put(hdev); err = -EINVAL; break; } sco_pi(sk)->codec = codecs->codecs[0]; hci_dev_put(hdev); break; default: err = -ENOPROTOOPT; break; } release_sock(sk); return err; } static int sco_sock_getsockopt_old(struct socket *sock, int optname, char __user *optval, int __user *optlen) { struct sock *sk = sock->sk; struct sco_options opts; struct sco_conninfo cinfo; int len, err = 0; BT_DBG("sk %p", sk); if (get_user(len, optlen)) return -EFAULT; lock_sock(sk); switch (optname) { case SCO_OPTIONS: if (sk->sk_state != BT_CONNECTED && !(sk->sk_state == BT_CONNECT2 && test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags))) { err = -ENOTCONN; break; } opts.mtu = sco_pi(sk)->conn->mtu; BT_DBG("mtu %u", opts.mtu); len = min_t(unsigned int, len, sizeof(opts)); if (copy_to_user(optval, (char *)&opts, len)) err = -EFAULT; break; case SCO_CONNINFO: if (sk->sk_state != BT_CONNECTED && !(sk->sk_state == BT_CONNECT2 && test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags))) { err = -ENOTCONN; break; } memset(&cinfo, 0, sizeof(cinfo)); cinfo.hci_handle = sco_pi(sk)->conn->hcon->handle; memcpy(cinfo.dev_class, sco_pi(sk)->conn->hcon->dev_class, 3); len = min_t(unsigned int, len, sizeof(cinfo)); if (copy_to_user(optval, (char *)&cinfo, len)) err = -EFAULT; break; default: err = -ENOPROTOOPT; break; } release_sock(sk); return err; } static int sco_sock_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) { struct sock *sk = sock->sk; int len, err = 0; struct bt_voice voice; u32 phys; int buf_len; struct codec_list *c; u8 num_codecs, i, __user *ptr; struct hci_dev *hdev; struct hci_codec_caps *caps; struct bt_codec codec; BT_DBG("sk %p", sk); if (level == SOL_SCO) return sco_sock_getsockopt_old(sock, optname, optval, optlen); if (get_user(len, optlen)) return -EFAULT; lock_sock(sk); switch (optname) { case BT_DEFER_SETUP: if (sk->sk_state != BT_BOUND && sk->sk_state != BT_LISTEN) { err = -EINVAL; break; } if (put_user(test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags), (u32 __user *)optval)) err = -EFAULT; break; case BT_VOICE: voice.setting = sco_pi(sk)->setting; len = min_t(unsigned int, len, sizeof(voice)); if (copy_to_user(optval, (char *)&voice, len)) err = -EFAULT; break; case BT_PHY: if (sk->sk_state != BT_CONNECTED) { err = -ENOTCONN; break; } phys = hci_conn_get_phy(sco_pi(sk)->conn->hcon); if (put_user(phys, (u32 __user *) optval)) err = -EFAULT; break; case BT_PKT_STATUS: if (put_user(test_bit(BT_SK_PKT_STATUS, &bt_sk(sk)->flags), (int __user *)optval)) err = -EFAULT; break; case BT_SNDMTU: case BT_RCVMTU: if (sk->sk_state != BT_CONNECTED) { err = -ENOTCONN; break; } if (put_user(sco_pi(sk)->conn->mtu, (u32 __user *)optval)) err = -EFAULT; break; case BT_CODEC: num_codecs = 0; buf_len = 0; hdev = hci_get_route(&sco_pi(sk)->dst, &sco_pi(sk)->src, BDADDR_BREDR); if (!hdev) { err = -EBADFD; break; } if (!hci_dev_test_flag(hdev, HCI_OFFLOAD_CODECS_ENABLED)) { hci_dev_put(hdev); err = -EOPNOTSUPP; break; } if (!hdev->get_data_path_id) { hci_dev_put(hdev); err = -EOPNOTSUPP; break; } release_sock(sk); /* find total buffer size required to copy codec + caps */ hci_dev_lock(hdev); list_for_each_entry(c, &hdev->local_codecs, list) { if (c->transport != HCI_TRANSPORT_SCO_ESCO) continue; num_codecs++; for (i = 0, caps = c->caps; i < c->num_caps; i++) { buf_len += 1 + caps->len; caps = (void *)&caps->data[caps->len]; } buf_len += sizeof(struct bt_codec); } hci_dev_unlock(hdev); buf_len += sizeof(struct bt_codecs); if (buf_len > len) { hci_dev_put(hdev); return -ENOBUFS; } ptr = optval; if (put_user(num_codecs, ptr)) { hci_dev_put(hdev); return -EFAULT; } ptr += sizeof(num_codecs); /* Iterate all the codecs supported over SCO and populate * codec data */ hci_dev_lock(hdev); list_for_each_entry(c, &hdev->local_codecs, list) { if (c->transport != HCI_TRANSPORT_SCO_ESCO) continue; codec.id = c->id; codec.cid = c->cid; codec.vid = c->vid; err = hdev->get_data_path_id(hdev, &codec.data_path); if (err < 0) break; codec.num_caps = c->num_caps; if (copy_to_user(ptr, &codec, sizeof(codec))) { err = -EFAULT; break; } ptr += sizeof(codec); /* find codec capabilities data length */ len = 0; for (i = 0, caps = c->caps; i < c->num_caps; i++) { len += 1 + caps->len; caps = (void *)&caps->data[caps->len]; } /* copy codec capabilities data */ if (len && copy_to_user(ptr, c->caps, len)) { err = -EFAULT; break; } ptr += len; } hci_dev_unlock(hdev); hci_dev_put(hdev); lock_sock(sk); if (!err && put_user(buf_len, optlen)) err = -EFAULT; break; default: err = -ENOPROTOOPT; break; } release_sock(sk); return err; } static int sco_sock_shutdown(struct socket *sock, int how) { struct sock *sk = sock->sk; int err = 0; BT_DBG("sock %p, sk %p", sock, sk); if (!sk) return 0; sock_hold(sk); lock_sock(sk); if (!sk->sk_shutdown) { sk->sk_shutdown = SHUTDOWN_MASK; sco_sock_clear_timer(sk); __sco_sock_close(sk); if (sock_flag(sk, SOCK_LINGER) && sk->sk_lingertime && !(current->flags & PF_EXITING)) err = bt_sock_wait_state(sk, BT_CLOSED, sk->sk_lingertime); } release_sock(sk); sock_put(sk); return err; } static int sco_sock_release(struct socket *sock) { struct sock *sk = sock->sk; int err = 0; BT_DBG("sock %p, sk %p", sock, sk); if (!sk) return 0; sco_sock_close(sk); if (sock_flag(sk, SOCK_LINGER) && READ_ONCE(sk->sk_lingertime) && !(current->flags & PF_EXITING)) { lock_sock(sk); err = bt_sock_wait_state(sk, BT_CLOSED, sk->sk_lingertime); release_sock(sk); } sock_orphan(sk); sco_sock_kill(sk); return err; } static void sco_conn_ready(struct sco_conn *conn) { struct sock *parent; struct sock *sk = conn->sk; BT_DBG("conn %p", conn); if (sk) { lock_sock(sk); sco_sock_clear_timer(sk); sk->sk_state = BT_CONNECTED; sk->sk_state_change(sk); release_sock(sk); } else { sco_conn_lock(conn); if (!conn->hcon) { sco_conn_unlock(conn); return; } parent = sco_get_sock_listen(&conn->hcon->src); if (!parent) { sco_conn_unlock(conn); return; } lock_sock(parent); sk = sco_sock_alloc(sock_net(parent), NULL, BTPROTO_SCO, GFP_ATOMIC, 0); if (!sk) { release_sock(parent); sco_conn_unlock(conn); return; } sco_sock_init(sk, parent); bacpy(&sco_pi(sk)->src, &conn->hcon->src); bacpy(&sco_pi(sk)->dst, &conn->hcon->dst); hci_conn_hold(conn->hcon); __sco_chan_add(conn, sk, parent); if (test_bit(BT_SK_DEFER_SETUP, &bt_sk(parent)->flags)) sk->sk_state = BT_CONNECT2; else sk->sk_state = BT_CONNECTED; /* Wake up parent */ parent->sk_data_ready(parent); release_sock(parent); sco_conn_unlock(conn); } } /* ----- SCO interface with lower layer (HCI) ----- */ int sco_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 *flags) { struct sock *sk; int lm = 0; BT_DBG("hdev %s, bdaddr %pMR", hdev->name, bdaddr); /* Find listening sockets */ read_lock(&sco_sk_list.lock); sk_for_each(sk, &sco_sk_list.head) { if (sk->sk_state != BT_LISTEN) continue; if (!bacmp(&sco_pi(sk)->src, &hdev->bdaddr) || !bacmp(&sco_pi(sk)->src, BDADDR_ANY)) { lm |= HCI_LM_ACCEPT; if (test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags)) *flags |= HCI_PROTO_DEFER; break; } } read_unlock(&sco_sk_list.lock); return lm; } static void sco_connect_cfm(struct hci_conn *hcon, __u8 status) { if (hcon->type != SCO_LINK && hcon->type != ESCO_LINK) return; BT_DBG("hcon %p bdaddr %pMR status %u", hcon, &hcon->dst, status); if (!status) { struct sco_conn *conn; conn = sco_conn_add(hcon); if (conn) sco_conn_ready(conn); } else sco_conn_del(hcon, bt_to_errno(status)); } static void sco_disconn_cfm(struct hci_conn *hcon, __u8 reason) { if (hcon->type != SCO_LINK && hcon->type != ESCO_LINK) return; BT_DBG("hcon %p reason %d", hcon, reason); sco_conn_del(hcon, bt_to_errno(reason)); } void sco_recv_scodata(struct hci_conn *hcon, struct sk_buff *skb) { struct sco_conn *conn = hcon->sco_data; if (!conn) goto drop; BT_DBG("conn %p len %u", conn, skb->len); if (skb->len) { sco_recv_frame(conn, skb); return; } drop: kfree_skb(skb); } static struct hci_cb sco_cb = { .name = "SCO", .connect_cfm = sco_connect_cfm, .disconn_cfm = sco_disconn_cfm, }; static int sco_debugfs_show(struct seq_file *f, void *p) { struct sock *sk; read_lock(&sco_sk_list.lock); sk_for_each(sk, &sco_sk_list.head) { seq_printf(f, "%pMR %pMR %d\n", &sco_pi(sk)->src, &sco_pi(sk)->dst, sk->sk_state); } read_unlock(&sco_sk_list.lock); return 0; } DEFINE_SHOW_ATTRIBUTE(sco_debugfs); static struct dentry *sco_debugfs; static const struct proto_ops sco_sock_ops = { .family = PF_BLUETOOTH, .owner = THIS_MODULE, .release = sco_sock_release, .bind = sco_sock_bind, .connect = sco_sock_connect, .listen = sco_sock_listen, .accept = sco_sock_accept, .getname = sco_sock_getname, .sendmsg = sco_sock_sendmsg, .recvmsg = sco_sock_recvmsg, .poll = bt_sock_poll, .ioctl = bt_sock_ioctl, .gettstamp = sock_gettstamp, .mmap = sock_no_mmap, .socketpair = sock_no_socketpair, .shutdown = sco_sock_shutdown, .setsockopt = sco_sock_setsockopt, .getsockopt = sco_sock_getsockopt }; static const struct net_proto_family sco_sock_family_ops = { .family = PF_BLUETOOTH, .owner = THIS_MODULE, .create = sco_sock_create, }; int __init sco_init(void) { int err; BUILD_BUG_ON(sizeof(struct sockaddr_sco) > sizeof(struct sockaddr)); err = proto_register(&sco_proto, 0); if (err < 0) return err; err = bt_sock_register(BTPROTO_SCO, &sco_sock_family_ops); if (err < 0) { BT_ERR("SCO socket registration failed"); goto error; } err = bt_procfs_init(&init_net, "sco", &sco_sk_list, NULL); if (err < 0) { BT_ERR("Failed to create SCO proc file"); bt_sock_unregister(BTPROTO_SCO); goto error; } BT_INFO("SCO socket layer initialized"); hci_register_cb(&sco_cb); if (IS_ERR_OR_NULL(bt_debugfs)) return 0; sco_debugfs = debugfs_create_file("sco", 0444, bt_debugfs, NULL, &sco_debugfs_fops); return 0; error: proto_unregister(&sco_proto); return err; } void sco_exit(void) { bt_procfs_cleanup(&init_net, "sco"); debugfs_remove(sco_debugfs); hci_unregister_cb(&sco_cb); bt_sock_unregister(BTPROTO_SCO); proto_unregister(&sco_proto); } module_param(disable_esco, bool, 0644); MODULE_PARM_DESC(disable_esco, "Disable eSCO connection creation"); |
| 98 10 10 207 204 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 | #undef TRACE_SYSTEM #define TRACE_SYSTEM neigh #if !defined(_TRACE_NEIGH_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_NEIGH_H #include <linux/skbuff.h> #include <linux/netdevice.h> #include <linux/tracepoint.h> #include <net/neighbour.h> #define neigh_state_str(state) \ __print_symbolic(state, \ { NUD_INCOMPLETE, "incomplete" }, \ { NUD_REACHABLE, "reachable" }, \ { NUD_STALE, "stale" }, \ { NUD_DELAY, "delay" }, \ { NUD_PROBE, "probe" }, \ { NUD_FAILED, "failed" }, \ { NUD_NOARP, "noarp" }, \ { NUD_PERMANENT, "permanent"}) TRACE_EVENT(neigh_create, TP_PROTO(struct neigh_table *tbl, struct net_device *dev, const void *pkey, const struct neighbour *n, bool exempt_from_gc), TP_ARGS(tbl, dev, pkey, n, exempt_from_gc), TP_STRUCT__entry( __field(u32, family) __string(dev, dev ? dev->name : "NULL") __field(int, entries) __field(u8, created) __field(u8, gc_exempt) __array(u8, primary_key4, 4) __array(u8, primary_key6, 16) ), TP_fast_assign( __be32 *p32; __entry->family = tbl->family; __assign_str(dev, (dev ? dev->name : "NULL")); __entry->entries = atomic_read(&tbl->gc_entries); __entry->created = n != NULL; __entry->gc_exempt = exempt_from_gc; p32 = (__be32 *)__entry->primary_key4; if (tbl->family == AF_INET) *p32 = *(__be32 *)pkey; else *p32 = 0; #if IS_ENABLED(CONFIG_IPV6) if (tbl->family == AF_INET6) { struct in6_addr *pin6; pin6 = (struct in6_addr *)__entry->primary_key6; *pin6 = *(struct in6_addr *)pkey; } #endif ), TP_printk("family %d dev %s entries %d primary_key4 %pI4 primary_key6 %pI6c created %d gc_exempt %d", __entry->family, __get_str(dev), __entry->entries, __entry->primary_key4, __entry->primary_key6, __entry->created, __entry->gc_exempt) ); TRACE_EVENT(neigh_update, TP_PROTO(struct neighbour *n, const u8 *lladdr, u8 new, u32 flags, u32 nlmsg_pid), TP_ARGS(n, lladdr, new, flags, nlmsg_pid), TP_STRUCT__entry( __field(u32, family) __string(dev, (n->dev ? n->dev->name : "NULL")) __array(u8, lladdr, MAX_ADDR_LEN) __field(u8, lladdr_len) __field(u8, flags) __field(u8, nud_state) __field(u8, type) __field(u8, dead) __field(int, refcnt) __array(__u8, primary_key4, 4) __array(__u8, primary_key6, 16) __field(unsigned long, confirmed) __field(unsigned long, updated) __field(unsigned long, used) __array(u8, new_lladdr, MAX_ADDR_LEN) __field(u8, new_state) __field(u32, update_flags) __field(u32, pid) ), TP_fast_assign( int lladdr_len = (n->dev ? n->dev->addr_len : MAX_ADDR_LEN); struct in6_addr *pin6; __be32 *p32; __entry->family = n->tbl->family; __assign_str(dev, (n->dev ? n->dev->name : "NULL")); __entry->lladdr_len = lladdr_len; memcpy(__entry->lladdr, n->ha, lladdr_len); __entry->flags = n->flags; __entry->nud_state = n->nud_state; __entry->type = n->type; __entry->dead = n->dead; __entry->refcnt = refcount_read(&n->refcnt); pin6 = (struct in6_addr *)__entry->primary_key6; p32 = (__be32 *)__entry->primary_key4; if (n->tbl->family == AF_INET) *p32 = *(__be32 *)n->primary_key; else *p32 = 0; #if IS_ENABLED(CONFIG_IPV6) if (n->tbl->family == AF_INET6) { pin6 = (struct in6_addr *)__entry->primary_key6; *pin6 = *(struct in6_addr *)n->primary_key; } else #endif { ipv6_addr_set_v4mapped(*p32, pin6); } __entry->confirmed = n->confirmed; __entry->updated = n->updated; __entry->used = n->used; if (lladdr) memcpy(__entry->new_lladdr, lladdr, lladdr_len); __entry->new_state = new; __entry->update_flags = flags; __entry->pid = nlmsg_pid; ), TP_printk("family %d dev %s lladdr %s flags %02x nud_state %s type %02x " "dead %d refcnt %d primary_key4 %pI4 primary_key6 %pI6c " "confirmed %lu updated %lu used %lu new_lladdr %s " "new_state %s update_flags %02x pid %d", __entry->family, __get_str(dev), __print_hex_str(__entry->lladdr, __entry->lladdr_len), __entry->flags, neigh_state_str(__entry->nud_state), __entry->type, __entry->dead, __entry->refcnt, __entry->primary_key4, __entry->primary_key6, __entry->confirmed, __entry->updated, __entry->used, __print_hex_str(__entry->new_lladdr, __entry->lladdr_len), neigh_state_str(__entry->new_state), __entry->update_flags, __entry->pid) ); DECLARE_EVENT_CLASS(neigh__update, TP_PROTO(struct neighbour *n, int err), TP_ARGS(n, err), TP_STRUCT__entry( __field(u32, family) __string(dev, (n->dev ? n->dev->name : "NULL")) __array(u8, lladdr, MAX_ADDR_LEN) __field(u8, lladdr_len) __field(u8, flags) __field(u8, nud_state) __field(u8, type) __field(u8, dead) __field(int, refcnt) __array(__u8, primary_key4, 4) __array(__u8, primary_key6, 16) __field(unsigned long, confirmed) __field(unsigned long, updated) __field(unsigned long, used) __field(u32, err) ), TP_fast_assign( int lladdr_len = (n->dev ? n->dev->addr_len : MAX_ADDR_LEN); struct in6_addr *pin6; __be32 *p32; __entry->family = n->tbl->family; __assign_str(dev, (n->dev ? n->dev->name : "NULL")); __entry->lladdr_len = lladdr_len; memcpy(__entry->lladdr, n->ha, lladdr_len); __entry->flags = n->flags; __entry->nud_state = n->nud_state; __entry->type = n->type; __entry->dead = n->dead; __entry->refcnt = refcount_read(&n->refcnt); pin6 = (struct in6_addr *)__entry->primary_key6; p32 = (__be32 *)__entry->primary_key4; if (n->tbl->family == AF_INET) *p32 = *(__be32 *)n->primary_key; else *p32 = 0; #if IS_ENABLED(CONFIG_IPV6) if (n->tbl->family == AF_INET6) { pin6 = (struct in6_addr *)__entry->primary_key6; *pin6 = *(struct in6_addr *)n->primary_key; } else #endif { ipv6_addr_set_v4mapped(*p32, pin6); } __entry->confirmed = n->confirmed; __entry->updated = n->updated; __entry->used = n->used; __entry->err = err; ), TP_printk("family %d dev %s lladdr %s flags %02x nud_state %s type %02x " "dead %d refcnt %d primary_key4 %pI4 primary_key6 %pI6c " "confirmed %lu updated %lu used %lu err %d", __entry->family, __get_str(dev), __print_hex_str(__entry->lladdr, __entry->lladdr_len), __entry->flags, neigh_state_str(__entry->nud_state), __entry->type, __entry->dead, __entry->refcnt, __entry->primary_key4, __entry->primary_key6, __entry->confirmed, __entry->updated, __entry->used, __entry->err) ); DEFINE_EVENT(neigh__update, neigh_update_done, TP_PROTO(struct neighbour *neigh, int err), TP_ARGS(neigh, err) ); DEFINE_EVENT(neigh__update, neigh_timer_handler, TP_PROTO(struct neighbour *neigh, int err), TP_ARGS(neigh, err) ); DEFINE_EVENT(neigh__update, neigh_event_send_done, TP_PROTO(struct neighbour *neigh, int err), TP_ARGS(neigh, err) ); DEFINE_EVENT(neigh__update, neigh_event_send_dead, TP_PROTO(struct neighbour *neigh, int err), TP_ARGS(neigh, err) ); DEFINE_EVENT(neigh__update, neigh_cleanup_and_release, TP_PROTO(struct neighbour *neigh, int rc), TP_ARGS(neigh, rc) ); #endif /* _TRACE_NEIGH_H */ /* This part must be outside protection */ #include <trace/define_trace.h> |
| 372 158 21 57 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __CGROUP_INTERNAL_H #define __CGROUP_INTERNAL_H #include <linux/cgroup.h> #include <linux/kernfs.h> #include <linux/workqueue.h> #include <linux/list.h> #include <linux/refcount.h> #include <linux/fs_parser.h> #define TRACE_CGROUP_PATH_LEN 1024 extern spinlock_t trace_cgroup_path_lock; extern char trace_cgroup_path[TRACE_CGROUP_PATH_LEN]; extern void __init enable_debug_cgroup(void); /* * cgroup_path() takes a spin lock. It is good practice not to take * spin locks within trace point handlers, as they are mostly hidden * from normal view. As cgroup_path() can take the kernfs_rename_lock * spin lock, it is best to not call that function from the trace event * handler. * * Note: trace_cgroup_##type##_enabled() is a static branch that will only * be set when the trace event is enabled. */ #define TRACE_CGROUP_PATH(type, cgrp, ...) \ do { \ if (trace_cgroup_##type##_enabled()) { \ unsigned long flags; \ spin_lock_irqsave(&trace_cgroup_path_lock, \ flags); \ cgroup_path(cgrp, trace_cgroup_path, \ TRACE_CGROUP_PATH_LEN); \ trace_cgroup_##type(cgrp, trace_cgroup_path, \ ##__VA_ARGS__); \ spin_unlock_irqrestore(&trace_cgroup_path_lock, \ flags); \ } \ } while (0) /* * The cgroup filesystem superblock creation/mount context. */ struct cgroup_fs_context { struct kernfs_fs_context kfc; struct cgroup_root *root; struct cgroup_namespace *ns; unsigned int flags; /* CGRP_ROOT_* flags */ /* cgroup1 bits */ bool cpuset_clone_children; bool none; /* User explicitly requested empty subsystem */ bool all_ss; /* Seen 'all' option */ u16 subsys_mask; /* Selected subsystems */ char *name; /* Hierarchy name */ char *release_agent; /* Path for release notifications */ }; static inline struct cgroup_fs_context *cgroup_fc2context(struct fs_context *fc) { struct kernfs_fs_context *kfc = fc->fs_private; return container_of(kfc, struct cgroup_fs_context, kfc); } struct cgroup_pidlist; struct cgroup_file_ctx { struct cgroup_namespace *ns; struct { void *trigger; } psi; struct { bool started; struct css_task_iter iter; } procs; struct { struct cgroup_pidlist *pidlist; } procs1; }; /* * A cgroup can be associated with multiple css_sets as different tasks may * belong to different cgroups on different hierarchies. In the other * direction, a css_set is naturally associated with multiple cgroups. * This M:N relationship is represented by the following link structure * which exists for each association and allows traversing the associations * from both sides. */ struct cgrp_cset_link { /* the cgroup and css_set this link associates */ struct cgroup *cgrp; struct css_set *cset; /* list of cgrp_cset_links anchored at cgrp->cset_links */ struct list_head cset_link; /* list of cgrp_cset_links anchored at css_set->cgrp_links */ struct list_head cgrp_link; }; /* used to track tasks and csets during migration */ struct cgroup_taskset { /* the src and dst cset list running through cset->mg_node */ struct list_head src_csets; struct list_head dst_csets; /* the number of tasks in the set */ int nr_tasks; /* the subsys currently being processed */ int ssid; /* * Fields for cgroup_taskset_*() iteration. * * Before migration is committed, the target migration tasks are on * ->mg_tasks of the csets on ->src_csets. After, on ->mg_tasks of * the csets on ->dst_csets. ->csets point to either ->src_csets * or ->dst_csets depending on whether migration is committed. * * ->cur_csets and ->cur_task point to the current task position * during iteration. */ struct list_head *csets; struct css_set *cur_cset; struct task_struct *cur_task; }; /* migration context also tracks preloading */ struct cgroup_mgctx { /* * Preloaded source and destination csets. Used to guarantee * atomic success or failure on actual migration. */ struct list_head preloaded_src_csets; struct list_head preloaded_dst_csets; /* tasks and csets to migrate */ struct cgroup_taskset tset; /* subsystems affected by migration */ u16 ss_mask; }; #define CGROUP_TASKSET_INIT(tset) \ { \ .src_csets = LIST_HEAD_INIT(tset.src_csets), \ .dst_csets = LIST_HEAD_INIT(tset.dst_csets), \ .csets = &tset.src_csets, \ } #define CGROUP_MGCTX_INIT(name) \ { \ LIST_HEAD_INIT(name.preloaded_src_csets), \ LIST_HEAD_INIT(name.preloaded_dst_csets), \ CGROUP_TASKSET_INIT(name.tset), \ } #define DEFINE_CGROUP_MGCTX(name) \ struct cgroup_mgctx name = CGROUP_MGCTX_INIT(name) extern struct cgroup_subsys *cgroup_subsys[]; extern struct list_head cgroup_roots; /* iterate across the hierarchies */ #define for_each_root(root) \ list_for_each_entry_rcu((root), &cgroup_roots, root_list, \ lockdep_is_held(&cgroup_mutex)) /** * for_each_subsys - iterate all enabled cgroup subsystems * @ss: the iteration cursor * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end */ #define for_each_subsys(ss, ssid) \ for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT && \ (((ss) = cgroup_subsys[ssid]) || true); (ssid)++) static inline bool cgroup_is_dead(const struct cgroup *cgrp) { return !(cgrp->self.flags & CSS_ONLINE); } static inline bool notify_on_release(const struct cgroup *cgrp) { return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); } void put_css_set_locked(struct css_set *cset); static inline void put_css_set(struct css_set *cset) { unsigned long flags; /* * Ensure that the refcount doesn't hit zero while any readers * can see it. Similar to atomic_dec_and_lock(), but for an * rwlock */ if (refcount_dec_not_one(&cset->refcount)) return; spin_lock_irqsave(&css_set_lock, flags); put_css_set_locked(cset); spin_unlock_irqrestore(&css_set_lock, flags); } /* * refcounted get/put for css_set objects */ static inline void get_css_set(struct css_set *cset) { refcount_inc(&cset->refcount); } bool cgroup_ssid_enabled(int ssid); bool cgroup_on_dfl(const struct cgroup *cgrp); struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root); struct cgroup *task_cgroup_from_root(struct task_struct *task, struct cgroup_root *root); struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn, bool drain_offline); void cgroup_kn_unlock(struct kernfs_node *kn); int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen, struct cgroup_namespace *ns); void cgroup_favor_dynmods(struct cgroup_root *root, bool favor); void cgroup_free_root(struct cgroup_root *root); void init_cgroup_root(struct cgroup_fs_context *ctx); int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask); int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask); int cgroup_do_get_tree(struct fs_context *fc); int cgroup_migrate_vet_dst(struct cgroup *dst_cgrp); void cgroup_migrate_finish(struct cgroup_mgctx *mgctx); void cgroup_migrate_add_src(struct css_set *src_cset, struct cgroup *dst_cgrp, struct cgroup_mgctx *mgctx); int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx); int cgroup_migrate(struct task_struct *leader, bool threadgroup, struct cgroup_mgctx *mgctx); int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader, bool threadgroup); void cgroup_attach_lock(bool lock_threadgroup); void cgroup_attach_unlock(bool lock_threadgroup); struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup, bool *locked) __acquires(&cgroup_threadgroup_rwsem); void cgroup_procs_write_finish(struct task_struct *task, bool locked) __releases(&cgroup_threadgroup_rwsem); void cgroup_lock_and_drain_offline(struct cgroup *cgrp); int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode); int cgroup_rmdir(struct kernfs_node *kn); int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node, struct kernfs_root *kf_root); int __cgroup_task_count(const struct cgroup *cgrp); int cgroup_task_count(const struct cgroup *cgrp); /* * rstat.c */ int cgroup_rstat_init(struct cgroup *cgrp); void cgroup_rstat_exit(struct cgroup *cgrp); void cgroup_rstat_boot(void); void cgroup_base_stat_cputime_show(struct seq_file *seq); /* * namespace.c */ extern const struct proc_ns_operations cgroupns_operations; /* * cgroup-v1.c */ extern struct cftype cgroup1_base_files[]; extern struct kernfs_syscall_ops cgroup1_kf_syscall_ops; extern const struct fs_parameter_spec cgroup1_fs_parameters[]; int proc_cgroupstats_show(struct seq_file *m, void *v); bool cgroup1_ssid_disabled(int ssid); void cgroup1_pidlist_destroy_all(struct cgroup *cgrp); void cgroup1_release_agent(struct work_struct *work); void cgroup1_check_for_release(struct cgroup *cgrp); int cgroup1_parse_param(struct fs_context *fc, struct fs_parameter *param); int cgroup1_get_tree(struct fs_context *fc); int cgroup1_reconfigure(struct fs_context *ctx); #endif /* __CGROUP_INTERNAL_H */ |
| 4 130 107 227 178 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 | /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM tcp #if !defined(_TRACE_TCP_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_TCP_H #include <linux/ipv6.h> #include <linux/tcp.h> #include <linux/tracepoint.h> #include <net/ipv6.h> #include <net/tcp.h> #include <linux/sock_diag.h> #define TP_STORE_V4MAPPED(__entry, saddr, daddr) \ do { \ struct in6_addr *pin6; \ \ pin6 = (struct in6_addr *)__entry->saddr_v6; \ ipv6_addr_set_v4mapped(saddr, pin6); \ pin6 = (struct in6_addr *)__entry->daddr_v6; \ ipv6_addr_set_v4mapped(daddr, pin6); \ } while (0) #if IS_ENABLED(CONFIG_IPV6) #define TP_STORE_ADDRS(__entry, saddr, daddr, saddr6, daddr6) \ do { \ if (sk->sk_family == AF_INET6) { \ struct in6_addr *pin6; \ \ pin6 = (struct in6_addr *)__entry->saddr_v6; \ *pin6 = saddr6; \ pin6 = (struct in6_addr *)__entry->daddr_v6; \ *pin6 = daddr6; \ } else { \ TP_STORE_V4MAPPED(__entry, saddr, daddr); \ } \ } while (0) #else #define TP_STORE_ADDRS(__entry, saddr, daddr, saddr6, daddr6) \ TP_STORE_V4MAPPED(__entry, saddr, daddr) #endif /* * tcp event with arguments sk and skb * * Note: this class requires a valid sk pointer; while skb pointer could * be NULL. */ DECLARE_EVENT_CLASS(tcp_event_sk_skb, TP_PROTO(const struct sock *sk, const struct sk_buff *skb), TP_ARGS(sk, skb), TP_STRUCT__entry( __field(const void *, skbaddr) __field(const void *, skaddr) __field(int, state) __field(__u16, sport) __field(__u16, dport) __field(__u16, family) __array(__u8, saddr, 4) __array(__u8, daddr, 4) __array(__u8, saddr_v6, 16) __array(__u8, daddr_v6, 16) ), TP_fast_assign( const struct inet_sock *inet = inet_sk(sk); __be32 *p32; __entry->skbaddr = skb; __entry->skaddr = sk; __entry->state = sk->sk_state; __entry->sport = ntohs(inet->inet_sport); __entry->dport = ntohs(inet->inet_dport); __entry->family = sk->sk_family; p32 = (__be32 *) __entry->saddr; *p32 = inet->inet_saddr; p32 = (__be32 *) __entry->daddr; *p32 = inet->inet_daddr; TP_STORE_ADDRS(__entry, inet->inet_saddr, inet->inet_daddr, sk->sk_v6_rcv_saddr, sk->sk_v6_daddr); ), TP_printk("family=%s sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c state=%s", show_family_name(__entry->family), __entry->sport, __entry->dport, __entry->saddr, __entry->daddr, __entry->saddr_v6, __entry->daddr_v6, show_tcp_state_name(__entry->state)) ); DEFINE_EVENT(tcp_event_sk_skb, tcp_retransmit_skb, TP_PROTO(const struct sock *sk, const struct sk_buff *skb), TP_ARGS(sk, skb) ); /* * skb of trace_tcp_send_reset is the skb that caused RST. In case of * active reset, skb should be NULL */ DEFINE_EVENT(tcp_event_sk_skb, tcp_send_reset, TP_PROTO(const struct sock *sk, const struct sk_buff *skb), TP_ARGS(sk, skb) ); /* * tcp event with arguments sk * * Note: this class requires a valid sk pointer. */ DECLARE_EVENT_CLASS(tcp_event_sk, TP_PROTO(struct sock *sk), TP_ARGS(sk), TP_STRUCT__entry( __field(const void *, skaddr) __field(__u16, sport) __field(__u16, dport) __field(__u16, family) __array(__u8, saddr, 4) __array(__u8, daddr, 4) __array(__u8, saddr_v6, 16) __array(__u8, daddr_v6, 16) __field(__u64, sock_cookie) ), TP_fast_assign( struct inet_sock *inet = inet_sk(sk); __be32 *p32; __entry->skaddr = sk; __entry->sport = ntohs(inet->inet_sport); __entry->dport = ntohs(inet->inet_dport); __entry->family = sk->sk_family; p32 = (__be32 *) __entry->saddr; *p32 = inet->inet_saddr; p32 = (__be32 *) __entry->daddr; *p32 = inet->inet_daddr; TP_STORE_ADDRS(__entry, inet->inet_saddr, inet->inet_daddr, sk->sk_v6_rcv_saddr, sk->sk_v6_daddr); __entry->sock_cookie = sock_gen_cookie(sk); ), TP_printk("family=%s sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c sock_cookie=%llx", show_family_name(__entry->family), __entry->sport, __entry->dport, __entry->saddr, __entry->daddr, __entry->saddr_v6, __entry->daddr_v6, __entry->sock_cookie) ); DEFINE_EVENT(tcp_event_sk, tcp_receive_reset, TP_PROTO(struct sock *sk), TP_ARGS(sk) ); DEFINE_EVENT(tcp_event_sk, tcp_destroy_sock, TP_PROTO(struct sock *sk), TP_ARGS(sk) ); DEFINE_EVENT(tcp_event_sk, tcp_rcv_space_adjust, TP_PROTO(struct sock *sk), TP_ARGS(sk) ); TRACE_EVENT(tcp_retransmit_synack, TP_PROTO(const struct sock *sk, const struct request_sock *req), TP_ARGS(sk, req), TP_STRUCT__entry( __field(const void *, skaddr) __field(const void *, req) __field(__u16, sport) __field(__u16, dport) __field(__u16, family) __array(__u8, saddr, 4) __array(__u8, daddr, 4) __array(__u8, saddr_v6, 16) __array(__u8, daddr_v6, 16) ), TP_fast_assign( struct inet_request_sock *ireq = inet_rsk(req); __be32 *p32; __entry->skaddr = sk; __entry->req = req; __entry->sport = ireq->ir_num; __entry->dport = ntohs(ireq->ir_rmt_port); __entry->family = sk->sk_family; p32 = (__be32 *) __entry->saddr; *p32 = ireq->ir_loc_addr; p32 = (__be32 *) __entry->daddr; *p32 = ireq->ir_rmt_addr; TP_STORE_ADDRS(__entry, ireq->ir_loc_addr, ireq->ir_rmt_addr, ireq->ir_v6_loc_addr, ireq->ir_v6_rmt_addr); ), TP_printk("family=%s sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c", show_family_name(__entry->family), __entry->sport, __entry->dport, __entry->saddr, __entry->daddr, __entry->saddr_v6, __entry->daddr_v6) ); #include <trace/events/net_probe_common.h> TRACE_EVENT(tcp_probe, TP_PROTO(struct sock *sk, struct sk_buff *skb), TP_ARGS(sk, skb), TP_STRUCT__entry( /* sockaddr_in6 is always bigger than sockaddr_in */ __array(__u8, saddr, sizeof(struct sockaddr_in6)) __array(__u8, daddr, sizeof(struct sockaddr_in6)) __field(__u16, sport) __field(__u16, dport) __field(__u16, family) __field(__u32, mark) __field(__u16, data_len) __field(__u32, snd_nxt) __field(__u32, snd_una) __field(__u32, snd_cwnd) __field(__u32, ssthresh) __field(__u32, snd_wnd) __field(__u32, srtt) __field(__u32, rcv_wnd) __field(__u64, sock_cookie) ), TP_fast_assign( const struct tcphdr *th = (const struct tcphdr *)skb->data; const struct inet_sock *inet = inet_sk(sk); const struct tcp_sock *tp = tcp_sk(sk); memset(__entry->saddr, 0, sizeof(struct sockaddr_in6)); memset(__entry->daddr, 0, sizeof(struct sockaddr_in6)); TP_STORE_ADDR_PORTS(__entry, inet, sk); /* For filtering use */ __entry->sport = ntohs(inet->inet_sport); __entry->dport = ntohs(inet->inet_dport); __entry->mark = skb->mark; __entry->family = sk->sk_family; __entry->data_len = skb->len - __tcp_hdrlen(th); __entry->snd_nxt = tp->snd_nxt; __entry->snd_una = tp->snd_una; __entry->snd_cwnd = tcp_snd_cwnd(tp); __entry->snd_wnd = tp->snd_wnd; __entry->rcv_wnd = tp->rcv_wnd; __entry->ssthresh = tcp_current_ssthresh(sk); __entry->srtt = tp->srtt_us >> 3; __entry->sock_cookie = sock_gen_cookie(sk); ), TP_printk("family=%s src=%pISpc dest=%pISpc mark=%#x data_len=%d snd_nxt=%#x snd_una=%#x snd_cwnd=%u ssthresh=%u snd_wnd=%u srtt=%u rcv_wnd=%u sock_cookie=%llx", show_family_name(__entry->family), __entry->saddr, __entry->daddr, __entry->mark, __entry->data_len, __entry->snd_nxt, __entry->snd_una, __entry->snd_cwnd, __entry->ssthresh, __entry->snd_wnd, __entry->srtt, __entry->rcv_wnd, __entry->sock_cookie) ); #define TP_STORE_ADDR_PORTS_SKB_V4(__entry, skb) \ do { \ const struct tcphdr *th = (const struct tcphdr *)skb->data; \ struct sockaddr_in *v4 = (void *)__entry->saddr; \ \ v4->sin_family = AF_INET; \ v4->sin_port = th->source; \ v4->sin_addr.s_addr = ip_hdr(skb)->saddr; \ v4 = (void *)__entry->daddr; \ v4->sin_family = AF_INET; \ v4->sin_port = th->dest; \ v4->sin_addr.s_addr = ip_hdr(skb)->daddr; \ } while (0) #if IS_ENABLED(CONFIG_IPV6) #define TP_STORE_ADDR_PORTS_SKB(__entry, skb) \ do { \ const struct iphdr *iph = ip_hdr(skb); \ \ if (iph->version == 6) { \ const struct tcphdr *th = (const struct tcphdr *)skb->data; \ struct sockaddr_in6 *v6 = (void *)__entry->saddr; \ \ v6->sin6_family = AF_INET6; \ v6->sin6_port = th->source; \ v6->sin6_addr = ipv6_hdr(skb)->saddr; \ v6 = (void *)__entry->daddr; \ v6->sin6_family = AF_INET6; \ v6->sin6_port = th->dest; \ v6->sin6_addr = ipv6_hdr(skb)->daddr; \ } else \ TP_STORE_ADDR_PORTS_SKB_V4(__entry, skb); \ } while (0) #else #define TP_STORE_ADDR_PORTS_SKB(__entry, skb) \ TP_STORE_ADDR_PORTS_SKB_V4(__entry, skb) #endif /* * tcp event with only skb */ DECLARE_EVENT_CLASS(tcp_event_skb, TP_PROTO(const struct sk_buff *skb), TP_ARGS(skb), TP_STRUCT__entry( __field(const void *, skbaddr) __array(__u8, saddr, sizeof(struct sockaddr_in6)) __array(__u8, daddr, sizeof(struct sockaddr_in6)) ), TP_fast_assign( __entry->skbaddr = skb; memset(__entry->saddr, 0, sizeof(struct sockaddr_in6)); memset(__entry->daddr, 0, sizeof(struct sockaddr_in6)); TP_STORE_ADDR_PORTS_SKB(__entry, skb); ), TP_printk("src=%pISpc dest=%pISpc", __entry->saddr, __entry->daddr) ); DEFINE_EVENT(tcp_event_skb, tcp_bad_csum, TP_PROTO(const struct sk_buff *skb), TP_ARGS(skb) ); TRACE_EVENT(tcp_cong_state_set, TP_PROTO(struct sock *sk, const u8 ca_state), TP_ARGS(sk, ca_state), TP_STRUCT__entry( __field(const void *, skaddr) __field(__u16, sport) __field(__u16, dport) __field(__u16, family) __array(__u8, saddr, 4) __array(__u8, daddr, 4) __array(__u8, saddr_v6, 16) __array(__u8, daddr_v6, 16) __field(__u8, cong_state) ), TP_fast_assign( struct inet_sock *inet = inet_sk(sk); __be32 *p32; __entry->skaddr = sk; __entry->sport = ntohs(inet->inet_sport); __entry->dport = ntohs(inet->inet_dport); __entry->family = sk->sk_family; p32 = (__be32 *) __entry->saddr; *p32 = inet->inet_saddr; p32 = (__be32 *) __entry->daddr; *p32 = inet->inet_daddr; TP_STORE_ADDRS(__entry, inet->inet_saddr, inet->inet_daddr, sk->sk_v6_rcv_saddr, sk->sk_v6_daddr); __entry->cong_state = ca_state; ), TP_printk("family=%s sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c cong_state=%u", show_family_name(__entry->family), __entry->sport, __entry->dport, __entry->saddr, __entry->daddr, __entry->saddr_v6, __entry->daddr_v6, __entry->cong_state) ); #endif /* _TRACE_TCP_H */ /* This part must be outside protection */ #include <trace/define_trace.h> |
| 54 45 3 1 1 5 8 8 8 4 4 4 4 4 4 4 47 47 47 47 47 47 3 3 3 3 3 3 3 3 3 6 6 6 6 4 4 51 51 1 1 8 8 4 4 4 45 45 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 | /* SPDX-License-Identifier: GPL-2.0 */ /* * Portions of this file * Copyright(c) 2016 Intel Deutschland GmbH * Copyright (C) 2018 - 2019, 2021 - 2023 Intel Corporation */ #ifndef __MAC80211_DRIVER_OPS #define __MAC80211_DRIVER_OPS #include <net/mac80211.h> #include "ieee80211_i.h" #include "trace.h" #define check_sdata_in_driver(sdata) ({ \ WARN_ONCE(!sdata->local->reconfig_failure && \ !(sdata->flags & IEEE80211_SDATA_IN_DRIVER), \ "%s: Failed check-sdata-in-driver check, flags: 0x%x\n", \ sdata->dev ? sdata->dev->name : sdata->name, sdata->flags); \ !!(sdata->flags & IEEE80211_SDATA_IN_DRIVER); \ }) static inline struct ieee80211_sub_if_data * get_bss_sdata(struct ieee80211_sub_if_data *sdata) { if (sdata && sdata->vif.type == NL80211_IFTYPE_AP_VLAN) sdata = container_of(sdata->bss, struct ieee80211_sub_if_data, u.ap); return sdata; } static inline void drv_tx(struct ieee80211_local *local, struct ieee80211_tx_control *control, struct sk_buff *skb) { local->ops->tx(&local->hw, control, skb); } static inline void drv_sync_rx_queues(struct ieee80211_local *local, struct sta_info *sta) { might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); if (local->ops->sync_rx_queues) { trace_drv_sync_rx_queues(local, sta->sdata, &sta->sta); local->ops->sync_rx_queues(&local->hw); trace_drv_return_void(local); } } static inline void drv_get_et_strings(struct ieee80211_sub_if_data *sdata, u32 sset, u8 *data) { struct ieee80211_local *local = sdata->local; if (local->ops->get_et_strings) { trace_drv_get_et_strings(local, sset); local->ops->get_et_strings(&local->hw, &sdata->vif, sset, data); trace_drv_return_void(local); } } static inline void drv_get_et_stats(struct ieee80211_sub_if_data *sdata, struct ethtool_stats *stats, u64 *data) { struct ieee80211_local *local = sdata->local; if (local->ops->get_et_stats) { trace_drv_get_et_stats(local); local->ops->get_et_stats(&local->hw, &sdata->vif, stats, data); trace_drv_return_void(local); } } static inline int drv_get_et_sset_count(struct ieee80211_sub_if_data *sdata, int sset) { struct ieee80211_local *local = sdata->local; int rv = 0; if (local->ops->get_et_sset_count) { trace_drv_get_et_sset_count(local, sset); rv = local->ops->get_et_sset_count(&local->hw, &sdata->vif, sset); trace_drv_return_int(local, rv); } return rv; } int drv_start(struct ieee80211_local *local); void drv_stop(struct ieee80211_local *local); #ifdef CONFIG_PM static inline int drv_suspend(struct ieee80211_local *local, struct cfg80211_wowlan *wowlan) { int ret; might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); trace_drv_suspend(local); ret = local->ops->suspend(&local->hw, wowlan); trace_drv_return_int(local, ret); return ret; } static inline int drv_resume(struct ieee80211_local *local) { int ret; might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); trace_drv_resume(local); ret = local->ops->resume(&local->hw); trace_drv_return_int(local, ret); return ret; } static inline void drv_set_wakeup(struct ieee80211_local *local, bool enabled) { might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); if (!local->ops->set_wakeup) return; trace_drv_set_wakeup(local, enabled); local->ops->set_wakeup(&local->hw, enabled); trace_drv_return_void(local); } #endif int drv_add_interface(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata); int drv_change_interface(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, enum nl80211_iftype type, bool p2p); void drv_remove_interface(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata); static inline int drv_config(struct ieee80211_local *local, u32 changed) { int ret; might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); trace_drv_config(local, changed); ret = local->ops->config(&local->hw, changed); trace_drv_return_int(local, ret); return ret; } static inline void drv_vif_cfg_changed(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, u64 changed) { might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); if (!check_sdata_in_driver(sdata)) return; trace_drv_vif_cfg_changed(local, sdata, changed); if (local->ops->vif_cfg_changed) local->ops->vif_cfg_changed(&local->hw, &sdata->vif, changed); else if (local->ops->bss_info_changed) local->ops->bss_info_changed(&local->hw, &sdata->vif, &sdata->vif.bss_conf, changed); trace_drv_return_void(local); } void drv_link_info_changed(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_bss_conf *info, int link_id, u64 changed); static inline u64 drv_prepare_multicast(struct ieee80211_local *local, struct netdev_hw_addr_list *mc_list) { u64 ret = 0; trace_drv_prepare_multicast(local, mc_list->count); if (local->ops->prepare_multicast) ret = local->ops->prepare_multicast(&local->hw, mc_list); trace_drv_return_u64(local, ret); return ret; } static inline void drv_configure_filter(struct ieee80211_local *local, unsigned int changed_flags, unsigned int *total_flags, u64 multicast) { might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); trace_drv_configure_filter(local, changed_flags, total_flags, multicast); local->ops->configure_filter(&local->hw, changed_flags, total_flags, multicast); trace_drv_return_void(local); } static inline void drv_config_iface_filter(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, unsigned int filter_flags, unsigned int changed_flags) { might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); trace_drv_config_iface_filter(local, sdata, filter_flags, changed_flags); if (local->ops->config_iface_filter) local->ops->config_iface_filter(&local->hw, &sdata->vif, filter_flags, changed_flags); trace_drv_return_void(local); } static inline int drv_set_tim(struct ieee80211_local *local, struct ieee80211_sta *sta, bool set) { int ret = 0; trace_drv_set_tim(local, sta, set); if (local->ops->set_tim) ret = local->ops->set_tim(&local->hw, sta, set); trace_drv_return_int(local, ret); return ret; } int drv_set_key(struct ieee80211_local *local, enum set_key_cmd cmd, struct ieee80211_sub_if_data *sdata, struct ieee80211_sta *sta, struct ieee80211_key_conf *key); static inline void drv_update_tkip_key(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_key_conf *conf, struct sta_info *sta, u32 iv32, u16 *phase1key) { struct ieee80211_sta *ista = NULL; if (sta) ista = &sta->sta; sdata = get_bss_sdata(sdata); if (!check_sdata_in_driver(sdata)) return; trace_drv_update_tkip_key(local, sdata, conf, ista, iv32); if (local->ops->update_tkip_key) local->ops->update_tkip_key(&local->hw, &sdata->vif, conf, ista, iv32, phase1key); trace_drv_return_void(local); } static inline int drv_hw_scan(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_scan_request *req) { int ret; might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); if (!check_sdata_in_driver(sdata)) return -EIO; trace_drv_hw_scan(local, sdata); ret = local->ops->hw_scan(&local->hw, &sdata->vif, req); trace_drv_return_int(local, ret); return ret; } static inline void drv_cancel_hw_scan(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata) { might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); if (!check_sdata_in_driver(sdata)) return; trace_drv_cancel_hw_scan(local, sdata); local->ops->cancel_hw_scan(&local->hw, &sdata->vif); trace_drv_return_void(local); } static inline int drv_sched_scan_start(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct cfg80211_sched_scan_request *req, struct ieee80211_scan_ies *ies) { int ret; might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); if (!check_sdata_in_driver(sdata)) return -EIO; trace_drv_sched_scan_start(local, sdata); ret = local->ops->sched_scan_start(&local->hw, &sdata->vif, req, ies); trace_drv_return_int(local, ret); return ret; } static inline int drv_sched_scan_stop(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata) { int ret; might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); if (!check_sdata_in_driver(sdata)) return -EIO; trace_drv_sched_scan_stop(local, sdata); ret = local->ops->sched_scan_stop(&local->hw, &sdata->vif); trace_drv_return_int(local, ret); return ret; } static inline void drv_sw_scan_start(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, const u8 *mac_addr) { might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); trace_drv_sw_scan_start(local, sdata, mac_addr); if (local->ops->sw_scan_start) local->ops->sw_scan_start(&local->hw, &sdata->vif, mac_addr); trace_drv_return_void(local); } static inline void drv_sw_scan_complete(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata) { might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); trace_drv_sw_scan_complete(local, sdata); if (local->ops->sw_scan_complete) local->ops->sw_scan_complete(&local->hw, &sdata->vif); trace_drv_return_void(local); } static inline int drv_get_stats(struct ieee80211_local *local, struct ieee80211_low_level_stats *stats) { int ret = -EOPNOTSUPP; might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); if (local->ops->get_stats) ret = local->ops->get_stats(&local->hw, stats); trace_drv_get_stats(local, stats, ret); return ret; } static inline void drv_get_key_seq(struct ieee80211_local *local, struct ieee80211_key *key, struct ieee80211_key_seq *seq) { if (local->ops->get_key_seq) local->ops->get_key_seq(&local->hw, &key->conf, seq); trace_drv_get_key_seq(local, &key->conf); } static inline int drv_set_frag_threshold(struct ieee80211_local *local, u32 value) { int ret = 0; might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); trace_drv_set_frag_threshold(local, value); if (local->ops->set_frag_threshold) ret = local->ops->set_frag_threshold(&local->hw, value); trace_drv_return_int(local, ret); return ret; } static inline int drv_set_rts_threshold(struct ieee80211_local *local, u32 value) { int ret = 0; might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); trace_drv_set_rts_threshold(local, value); if (local->ops->set_rts_threshold) ret = local->ops->set_rts_threshold(&local->hw, value); trace_drv_return_int(local, ret); return ret; } static inline int drv_set_coverage_class(struct ieee80211_local *local, s16 value) { int ret = 0; might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); trace_drv_set_coverage_class(local, value); if (local->ops->set_coverage_class) local->ops->set_coverage_class(&local->hw, value); else ret = -EOPNOTSUPP; trace_drv_return_int(local, ret); return ret; } static inline void drv_sta_notify(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, enum sta_notify_cmd cmd, struct ieee80211_sta *sta) { sdata = get_bss_sdata(sdata); if (!check_sdata_in_driver(sdata)) return; trace_drv_sta_notify(local, sdata, cmd, sta); if (local->ops->sta_notify) local->ops->sta_notify(&local->hw, &sdata->vif, cmd, sta); trace_drv_return_void(local); } static inline int drv_sta_add(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_sta *sta) { int ret = 0; might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); sdata = get_bss_sdata(sdata); if (!check_sdata_in_driver(sdata)) return -EIO; trace_drv_sta_add(local, sdata, sta); if (local->ops->sta_add) ret = local->ops->sta_add(&local->hw, &sdata->vif, sta); trace_drv_return_int(local, ret); return ret; } static inline void drv_sta_remove(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_sta *sta) { might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); sdata = get_bss_sdata(sdata); if (!check_sdata_in_driver(sdata)) return; trace_drv_sta_remove(local, sdata, sta); if (local->ops->sta_remove) local->ops->sta_remove(&local->hw, &sdata->vif, sta); trace_drv_return_void(local); } #ifdef CONFIG_MAC80211_DEBUGFS static inline void drv_vif_add_debugfs(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata) { might_sleep(); if (sdata->vif.type == NL80211_IFTYPE_MONITOR || WARN_ON(!sdata->vif.debugfs_dir)) return; sdata = get_bss_sdata(sdata); if (!check_sdata_in_driver(sdata)) return; if (local->ops->vif_add_debugfs) local->ops->vif_add_debugfs(&local->hw, &sdata->vif); } static inline void drv_link_add_debugfs(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_bss_conf *link_conf, struct dentry *dir) { might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); sdata = get_bss_sdata(sdata); if (!check_sdata_in_driver(sdata)) return; if (local->ops->link_add_debugfs) local->ops->link_add_debugfs(&local->hw, &sdata->vif, link_conf, dir); } static inline void drv_sta_add_debugfs(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_sta *sta, struct dentry *dir) { might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); sdata = get_bss_sdata(sdata); if (!check_sdata_in_driver(sdata)) return; if (local->ops->sta_add_debugfs) local->ops->sta_add_debugfs(&local->hw, &sdata->vif, sta, dir); } static inline void drv_link_sta_add_debugfs(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_link_sta *link_sta, struct dentry *dir) { might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); sdata = get_bss_sdata(sdata); if (!check_sdata_in_driver(sdata)) return; if (local->ops->link_sta_add_debugfs) local->ops->link_sta_add_debugfs(&local->hw, &sdata->vif, link_sta, dir); } #else static inline void drv_vif_add_debugfs(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata) { might_sleep(); } #endif static inline void drv_sta_pre_rcu_remove(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct sta_info *sta) { might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); sdata = get_bss_sdata(sdata); if (!check_sdata_in_driver(sdata)) return; trace_drv_sta_pre_rcu_remove(local, sdata, &sta->sta); if (local->ops->sta_pre_rcu_remove) local->ops->sta_pre_rcu_remove(&local->hw, &sdata->vif, &sta->sta); trace_drv_return_void(local); } __must_check int drv_sta_state(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct sta_info *sta, enum ieee80211_sta_state old_state, enum ieee80211_sta_state new_state); __must_check int drv_sta_set_txpwr(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct sta_info *sta); void drv_sta_rc_update(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_sta *sta, u32 changed); static inline void drv_sta_rate_tbl_update(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_sta *sta) { sdata = get_bss_sdata(sdata); if (!check_sdata_in_driver(sdata)) return; trace_drv_sta_rate_tbl_update(local, sdata, sta); if (local->ops->sta_rate_tbl_update) local->ops->sta_rate_tbl_update(&local->hw, &sdata->vif, sta); trace_drv_return_void(local); } static inline void drv_sta_statistics(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_sta *sta, struct station_info *sinfo) { might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); sdata = get_bss_sdata(sdata); if (!check_sdata_in_driver(sdata)) return; trace_drv_sta_statistics(local, sdata, sta); if (local->ops->sta_statistics) local->ops->sta_statistics(&local->hw, &sdata->vif, sta, sinfo); trace_drv_return_void(local); } int drv_conf_tx(struct ieee80211_local *local, struct ieee80211_link_data *link, u16 ac, const struct ieee80211_tx_queue_params *params); u64 drv_get_tsf(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata); void drv_set_tsf(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, u64 tsf); void drv_offset_tsf(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, s64 offset); void drv_reset_tsf(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata); static inline int drv_tx_last_beacon(struct ieee80211_local *local) { int ret = 0; /* default unsupported op for less congestion */ might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); trace_drv_tx_last_beacon(local); if (local->ops->tx_last_beacon) ret = local->ops->tx_last_beacon(&local->hw); trace_drv_return_int(local, ret); return ret; } int drv_ampdu_action(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_ampdu_params *params); static inline int drv_get_survey(struct ieee80211_local *local, int idx, struct survey_info *survey) { int ret = -EOPNOTSUPP; might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); trace_drv_get_survey(local, idx, survey); if (local->ops->get_survey) ret = local->ops->get_survey(&local->hw, idx, survey); trace_drv_return_int(local, ret); return ret; } static inline void drv_rfkill_poll(struct ieee80211_local *local) { might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); if (local->ops->rfkill_poll) local->ops->rfkill_poll(&local->hw); } static inline void drv_flush(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, u32 queues, bool drop) { struct ieee80211_vif *vif; might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); sdata = get_bss_sdata(sdata); vif = sdata ? &sdata->vif : NULL; if (sdata && !check_sdata_in_driver(sdata)) return; trace_drv_flush(local, queues, drop); if (local->ops->flush) local->ops->flush(&local->hw, vif, queues, drop); trace_drv_return_void(local); } static inline void drv_flush_sta(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct sta_info *sta) { might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); sdata = get_bss_sdata(sdata); if (sdata && !check_sdata_in_driver(sdata)) return; trace_drv_flush_sta(local, sdata, &sta->sta); if (local->ops->flush_sta) local->ops->flush_sta(&local->hw, &sdata->vif, &sta->sta); trace_drv_return_void(local); } static inline void drv_channel_switch(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_channel_switch *ch_switch) { might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); trace_drv_channel_switch(local, sdata, ch_switch); local->ops->channel_switch(&local->hw, &sdata->vif, ch_switch); trace_drv_return_void(local); } static inline int drv_set_antenna(struct ieee80211_local *local, u32 tx_ant, u32 rx_ant) { int ret = -EOPNOTSUPP; might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); if (local->ops->set_antenna) ret = local->ops->set_antenna(&local->hw, tx_ant, rx_ant); trace_drv_set_antenna(local, tx_ant, rx_ant, ret); return ret; } static inline int drv_get_antenna(struct ieee80211_local *local, u32 *tx_ant, u32 *rx_ant) { int ret = -EOPNOTSUPP; might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); if (local->ops->get_antenna) ret = local->ops->get_antenna(&local->hw, tx_ant, rx_ant); trace_drv_get_antenna(local, *tx_ant, *rx_ant, ret); return ret; } static inline int drv_remain_on_channel(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_channel *chan, unsigned int duration, enum ieee80211_roc_type type) { int ret; might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); trace_drv_remain_on_channel(local, sdata, chan, duration, type); ret = local->ops->remain_on_channel(&local->hw, &sdata->vif, chan, duration, type); trace_drv_return_int(local, ret); return ret; } static inline int drv_cancel_remain_on_channel(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata) { int ret; might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); trace_drv_cancel_remain_on_channel(local, sdata); ret = local->ops->cancel_remain_on_channel(&local->hw, &sdata->vif); trace_drv_return_int(local, ret); return ret; } static inline int drv_set_ringparam(struct ieee80211_local *local, u32 tx, u32 rx) { int ret = -EOPNOTSUPP; might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); trace_drv_set_ringparam(local, tx, rx); if (local->ops->set_ringparam) ret = local->ops->set_ringparam(&local->hw, tx, rx); trace_drv_return_int(local, ret); return ret; } static inline void drv_get_ringparam(struct ieee80211_local *local, u32 *tx, u32 *tx_max, u32 *rx, u32 *rx_max) { might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); trace_drv_get_ringparam(local, tx, tx_max, rx, rx_max); if (local->ops->get_ringparam) local->ops->get_ringparam(&local->hw, tx, tx_max, rx, rx_max); trace_drv_return_void(local); } static inline bool drv_tx_frames_pending(struct ieee80211_local *local) { bool ret = false; might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); trace_drv_tx_frames_pending(local); if (local->ops->tx_frames_pending) ret = local->ops->tx_frames_pending(&local->hw); trace_drv_return_bool(local, ret); return ret; } static inline int drv_set_bitrate_mask(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, const struct cfg80211_bitrate_mask *mask) { int ret = -EOPNOTSUPP; might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); if (!check_sdata_in_driver(sdata)) return -EIO; trace_drv_set_bitrate_mask(local, sdata, mask); if (local->ops->set_bitrate_mask) ret = local->ops->set_bitrate_mask(&local->hw, &sdata->vif, mask); trace_drv_return_int(local, ret); return ret; } static inline void drv_set_rekey_data(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct cfg80211_gtk_rekey_data *data) { might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); if (!check_sdata_in_driver(sdata)) return; trace_drv_set_rekey_data(local, sdata, data); if (local->ops->set_rekey_data) local->ops->set_rekey_data(&local->hw, &sdata->vif, data); trace_drv_return_void(local); } static inline void drv_event_callback(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, const struct ieee80211_event *event) { trace_drv_event_callback(local, sdata, event); if (local->ops->event_callback) local->ops->event_callback(&local->hw, &sdata->vif, event); trace_drv_return_void(local); } static inline void drv_release_buffered_frames(struct ieee80211_local *local, struct sta_info *sta, u16 tids, int num_frames, enum ieee80211_frame_release_type reason, bool more_data) { trace_drv_release_buffered_frames(local, &sta->sta, tids, num_frames, reason, more_data); if (local->ops->release_buffered_frames) local->ops->release_buffered_frames(&local->hw, &sta->sta, tids, num_frames, reason, more_data); trace_drv_return_void(local); } static inline void drv_allow_buffered_frames(struct ieee80211_local *local, struct sta_info *sta, u16 tids, int num_frames, enum ieee80211_frame_release_type reason, bool more_data) { trace_drv_allow_buffered_frames(local, &sta->sta, tids, num_frames, reason, more_data); if (local->ops->allow_buffered_frames) local->ops->allow_buffered_frames(&local->hw, &sta->sta, tids, num_frames, reason, more_data); trace_drv_return_void(local); } static inline void drv_mgd_prepare_tx(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_prep_tx_info *info) { might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); if (!check_sdata_in_driver(sdata)) return; WARN_ON_ONCE(sdata->vif.type != NL80211_IFTYPE_STATION); info->link_id = info->link_id < 0 ? 0 : info->link_id; trace_drv_mgd_prepare_tx(local, sdata, info->duration, info->subtype, info->success); if (local->ops->mgd_prepare_tx) local->ops->mgd_prepare_tx(&local->hw, &sdata->vif, info); trace_drv_return_void(local); } static inline void drv_mgd_complete_tx(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_prep_tx_info *info) { might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); if (!check_sdata_in_driver(sdata)) return; WARN_ON_ONCE(sdata->vif.type != NL80211_IFTYPE_STATION); trace_drv_mgd_complete_tx(local, sdata, info->duration, info->subtype, info->success); if (local->ops->mgd_complete_tx) local->ops->mgd_complete_tx(&local->hw, &sdata->vif, info); trace_drv_return_void(local); } static inline void drv_mgd_protect_tdls_discover(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, int link_id) { might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); if (!check_sdata_in_driver(sdata)) return; WARN_ON_ONCE(sdata->vif.type != NL80211_IFTYPE_STATION); link_id = link_id > 0 ? link_id : 0; trace_drv_mgd_protect_tdls_discover(local, sdata); if (local->ops->mgd_protect_tdls_discover) local->ops->mgd_protect_tdls_discover(&local->hw, &sdata->vif, link_id); trace_drv_return_void(local); } static inline int drv_add_chanctx(struct ieee80211_local *local, struct ieee80211_chanctx *ctx) { int ret = -EOPNOTSUPP; might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); trace_drv_add_chanctx(local, ctx); if (local->ops->add_chanctx) ret = local->ops->add_chanctx(&local->hw, &ctx->conf); trace_drv_return_int(local, ret); if (!ret) ctx->driver_present = true; return ret; } static inline void drv_remove_chanctx(struct ieee80211_local *local, struct ieee80211_chanctx *ctx) { might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); if (WARN_ON(!ctx->driver_present)) return; trace_drv_remove_chanctx(local, ctx); if (local->ops->remove_chanctx) local->ops->remove_chanctx(&local->hw, &ctx->conf); trace_drv_return_void(local); ctx->driver_present = false; } static inline void drv_change_chanctx(struct ieee80211_local *local, struct ieee80211_chanctx *ctx, u32 changed) { might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); trace_drv_change_chanctx(local, ctx, changed); if (local->ops->change_chanctx) { WARN_ON_ONCE(!ctx->driver_present); local->ops->change_chanctx(&local->hw, &ctx->conf, changed); } trace_drv_return_void(local); } int drv_assign_vif_chanctx(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_bss_conf *link_conf, struct ieee80211_chanctx *ctx); void drv_unassign_vif_chanctx(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_bss_conf *link_conf, struct ieee80211_chanctx *ctx); int drv_switch_vif_chanctx(struct ieee80211_local *local, struct ieee80211_vif_chanctx_switch *vifs, int n_vifs, enum ieee80211_chanctx_switch_mode mode); static inline int drv_start_ap(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_bss_conf *link_conf) { int ret = 0; might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); if (!check_sdata_in_driver(sdata)) return -EIO; trace_drv_start_ap(local, sdata, link_conf); if (local->ops->start_ap) ret = local->ops->start_ap(&local->hw, &sdata->vif, link_conf); trace_drv_return_int(local, ret); return ret; } static inline void drv_stop_ap(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_bss_conf *link_conf) { might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); if (!check_sdata_in_driver(sdata)) return; trace_drv_stop_ap(local, sdata, link_conf); if (local->ops->stop_ap) local->ops->stop_ap(&local->hw, &sdata->vif, link_conf); trace_drv_return_void(local); } static inline void drv_reconfig_complete(struct ieee80211_local *local, enum ieee80211_reconfig_type reconfig_type) { might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); trace_drv_reconfig_complete(local, reconfig_type); if (local->ops->reconfig_complete) local->ops->reconfig_complete(&local->hw, reconfig_type); trace_drv_return_void(local); } static inline void drv_set_default_unicast_key(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, int key_idx) { might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); if (!check_sdata_in_driver(sdata)) return; WARN_ON_ONCE(key_idx < -1 || key_idx > 3); trace_drv_set_default_unicast_key(local, sdata, key_idx); if (local->ops->set_default_unicast_key) local->ops->set_default_unicast_key(&local->hw, &sdata->vif, key_idx); trace_drv_return_void(local); } #if IS_ENABLED(CONFIG_IPV6) static inline void drv_ipv6_addr_change(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct inet6_dev *idev) { trace_drv_ipv6_addr_change(local, sdata); if (local->ops->ipv6_addr_change) local->ops->ipv6_addr_change(&local->hw, &sdata->vif, idev); trace_drv_return_void(local); } #endif static inline void drv_channel_switch_beacon(struct ieee80211_sub_if_data *sdata, struct cfg80211_chan_def *chandef) { struct ieee80211_local *local = sdata->local; might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); if (local->ops->channel_switch_beacon) { trace_drv_channel_switch_beacon(local, sdata, chandef); local->ops->channel_switch_beacon(&local->hw, &sdata->vif, chandef); } } static inline int drv_pre_channel_switch(struct ieee80211_sub_if_data *sdata, struct ieee80211_channel_switch *ch_switch) { struct ieee80211_local *local = sdata->local; int ret = 0; might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); if (!check_sdata_in_driver(sdata)) return -EIO; trace_drv_pre_channel_switch(local, sdata, ch_switch); if (local->ops->pre_channel_switch) ret = local->ops->pre_channel_switch(&local->hw, &sdata->vif, ch_switch); trace_drv_return_int(local, ret); return ret; } static inline int drv_post_channel_switch(struct ieee80211_link_data *link) { struct ieee80211_sub_if_data *sdata = link->sdata; struct ieee80211_local *local = sdata->local; int ret = 0; might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); if (!check_sdata_in_driver(sdata)) return -EIO; trace_drv_post_channel_switch(local, sdata); if (local->ops->post_channel_switch) ret = local->ops->post_channel_switch(&local->hw, &sdata->vif, link->conf); trace_drv_return_int(local, ret); return ret; } static inline void drv_abort_channel_switch(struct ieee80211_sub_if_data *sdata) { struct ieee80211_local *local = sdata->local; might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); if (!check_sdata_in_driver(sdata)) return; trace_drv_abort_channel_switch(local, sdata); if (local->ops->abort_channel_switch) local->ops->abort_channel_switch(&local->hw, &sdata->vif); } static inline void drv_channel_switch_rx_beacon(struct ieee80211_sub_if_data *sdata, struct ieee80211_channel_switch *ch_switch) { struct ieee80211_local *local = sdata->local; might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); if (!check_sdata_in_driver(sdata)) return; trace_drv_channel_switch_rx_beacon(local, sdata, ch_switch); if (local->ops->channel_switch_rx_beacon) local->ops->channel_switch_rx_beacon(&local->hw, &sdata->vif, ch_switch); } static inline int drv_join_ibss(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata) { int ret = 0; might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); if (!check_sdata_in_driver(sdata)) return -EIO; trace_drv_join_ibss(local, sdata, &sdata->vif.bss_conf); if (local->ops->join_ibss) ret = local->ops->join_ibss(&local->hw, &sdata->vif); trace_drv_return_int(local, ret); return ret; } static inline void drv_leave_ibss(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata) { might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); if (!check_sdata_in_driver(sdata)) return; trace_drv_leave_ibss(local, sdata); if (local->ops->leave_ibss) local->ops->leave_ibss(&local->hw, &sdata->vif); trace_drv_return_void(local); } static inline u32 drv_get_expected_throughput(struct ieee80211_local *local, struct sta_info *sta) { u32 ret = 0; trace_drv_get_expected_throughput(&sta->sta); if (local->ops->get_expected_throughput && sta->uploaded) ret = local->ops->get_expected_throughput(&local->hw, &sta->sta); trace_drv_return_u32(local, ret); return ret; } static inline int drv_get_txpower(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, int *dbm) { int ret; might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); if (!local->ops->get_txpower) return -EOPNOTSUPP; ret = local->ops->get_txpower(&local->hw, &sdata->vif, dbm); trace_drv_get_txpower(local, sdata, *dbm, ret); return ret; } static inline int drv_tdls_channel_switch(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_sta *sta, u8 oper_class, struct cfg80211_chan_def *chandef, struct sk_buff *tmpl_skb, u32 ch_sw_tm_ie) { int ret; might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); if (!check_sdata_in_driver(sdata)) return -EIO; if (!local->ops->tdls_channel_switch) return -EOPNOTSUPP; trace_drv_tdls_channel_switch(local, sdata, sta, oper_class, chandef); ret = local->ops->tdls_channel_switch(&local->hw, &sdata->vif, sta, oper_class, chandef, tmpl_skb, ch_sw_tm_ie); trace_drv_return_int(local, ret); return ret; } static inline void drv_tdls_cancel_channel_switch(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_sta *sta) { might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); if (!check_sdata_in_driver(sdata)) return; if (!local->ops->tdls_cancel_channel_switch) return; trace_drv_tdls_cancel_channel_switch(local, sdata, sta); local->ops->tdls_cancel_channel_switch(&local->hw, &sdata->vif, sta); trace_drv_return_void(local); } static inline void drv_tdls_recv_channel_switch(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_tdls_ch_sw_params *params) { trace_drv_tdls_recv_channel_switch(local, sdata, params); if (local->ops->tdls_recv_channel_switch) local->ops->tdls_recv_channel_switch(&local->hw, &sdata->vif, params); trace_drv_return_void(local); } static inline void drv_wake_tx_queue(struct ieee80211_local *local, struct txq_info *txq) { struct ieee80211_sub_if_data *sdata = vif_to_sdata(txq->txq.vif); /* In reconfig don't transmit now, but mark for waking later */ if (local->in_reconfig) { set_bit(IEEE80211_TXQ_DIRTY, &txq->flags); return; } if (!check_sdata_in_driver(sdata)) return; trace_drv_wake_tx_queue(local, sdata, txq); local->ops->wake_tx_queue(&local->hw, &txq->txq); } static inline void schedule_and_wake_txq(struct ieee80211_local *local, struct txq_info *txqi) { ieee80211_schedule_txq(&local->hw, &txqi->txq); drv_wake_tx_queue(local, txqi); } static inline int drv_can_aggregate_in_amsdu(struct ieee80211_local *local, struct sk_buff *head, struct sk_buff *skb) { if (!local->ops->can_aggregate_in_amsdu) return true; return local->ops->can_aggregate_in_amsdu(&local->hw, head, skb); } static inline int drv_get_ftm_responder_stats(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct cfg80211_ftm_responder_stats *ftm_stats) { u32 ret = -EOPNOTSUPP; might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); if (!check_sdata_in_driver(sdata)) return -EIO; if (local->ops->get_ftm_responder_stats) ret = local->ops->get_ftm_responder_stats(&local->hw, &sdata->vif, ftm_stats); trace_drv_get_ftm_responder_stats(local, sdata, ftm_stats); return ret; } static inline int drv_start_pmsr(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct cfg80211_pmsr_request *request) { int ret = -EOPNOTSUPP; might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); if (!check_sdata_in_driver(sdata)) return -EIO; trace_drv_start_pmsr(local, sdata); if (local->ops->start_pmsr) ret = local->ops->start_pmsr(&local->hw, &sdata->vif, request); trace_drv_return_int(local, ret); return ret; } static inline void drv_abort_pmsr(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct cfg80211_pmsr_request *request) { trace_drv_abort_pmsr(local, sdata); might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); if (!check_sdata_in_driver(sdata)) return; if (local->ops->abort_pmsr) local->ops->abort_pmsr(&local->hw, &sdata->vif, request); trace_drv_return_void(local); } static inline int drv_start_nan(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct cfg80211_nan_conf *conf) { int ret; might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); check_sdata_in_driver(sdata); trace_drv_start_nan(local, sdata, conf); ret = local->ops->start_nan(&local->hw, &sdata->vif, conf); trace_drv_return_int(local, ret); return ret; } static inline void drv_stop_nan(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata) { might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); check_sdata_in_driver(sdata); trace_drv_stop_nan(local, sdata); local->ops->stop_nan(&local->hw, &sdata->vif); trace_drv_return_void(local); } static inline int drv_nan_change_conf(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct cfg80211_nan_conf *conf, u32 changes) { int ret; might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); check_sdata_in_driver(sdata); if (!local->ops->nan_change_conf) return -EOPNOTSUPP; trace_drv_nan_change_conf(local, sdata, conf, changes); ret = local->ops->nan_change_conf(&local->hw, &sdata->vif, conf, changes); trace_drv_return_int(local, ret); return ret; } static inline int drv_add_nan_func(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, const struct cfg80211_nan_func *nan_func) { int ret; might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); check_sdata_in_driver(sdata); if (!local->ops->add_nan_func) return -EOPNOTSUPP; trace_drv_add_nan_func(local, sdata, nan_func); ret = local->ops->add_nan_func(&local->hw, &sdata->vif, nan_func); trace_drv_return_int(local, ret); return ret; } static inline void drv_del_nan_func(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, u8 instance_id) { might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); check_sdata_in_driver(sdata); trace_drv_del_nan_func(local, sdata, instance_id); if (local->ops->del_nan_func) local->ops->del_nan_func(&local->hw, &sdata->vif, instance_id); trace_drv_return_void(local); } static inline int drv_set_tid_config(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_sta *sta, struct cfg80211_tid_config *tid_conf) { int ret; might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); ret = local->ops->set_tid_config(&local->hw, &sdata->vif, sta, tid_conf); trace_drv_return_int(local, ret); return ret; } static inline int drv_reset_tid_config(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_sta *sta, u8 tids) { int ret; might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); ret = local->ops->reset_tid_config(&local->hw, &sdata->vif, sta, tids); trace_drv_return_int(local, ret); return ret; } static inline void drv_update_vif_offload(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata) { might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); check_sdata_in_driver(sdata); if (!local->ops->update_vif_offload) return; trace_drv_update_vif_offload(local, sdata); local->ops->update_vif_offload(&local->hw, &sdata->vif); trace_drv_return_void(local); } static inline void drv_sta_set_4addr(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_sta *sta, bool enabled) { sdata = get_bss_sdata(sdata); might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); if (!check_sdata_in_driver(sdata)) return; trace_drv_sta_set_4addr(local, sdata, sta, enabled); if (local->ops->sta_set_4addr) local->ops->sta_set_4addr(&local->hw, &sdata->vif, sta, enabled); trace_drv_return_void(local); } static inline void drv_sta_set_decap_offload(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_sta *sta, bool enabled) { sdata = get_bss_sdata(sdata); might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); if (!check_sdata_in_driver(sdata)) return; trace_drv_sta_set_decap_offload(local, sdata, sta, enabled); if (local->ops->sta_set_decap_offload) local->ops->sta_set_decap_offload(&local->hw, &sdata->vif, sta, enabled); trace_drv_return_void(local); } static inline void drv_add_twt_setup(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_sta *sta, struct ieee80211_twt_setup *twt) { struct ieee80211_twt_params *twt_agrt; might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); if (!check_sdata_in_driver(sdata)) return; twt_agrt = (void *)twt->params; trace_drv_add_twt_setup(local, sta, twt, twt_agrt); local->ops->add_twt_setup(&local->hw, sta, twt); trace_drv_return_void(local); } static inline void drv_twt_teardown_request(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_sta *sta, u8 flowid) { might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); if (!check_sdata_in_driver(sdata)) return; if (!local->ops->twt_teardown_request) return; trace_drv_twt_teardown_request(local, sta, flowid); local->ops->twt_teardown_request(&local->hw, sta, flowid); trace_drv_return_void(local); } static inline int drv_net_fill_forward_path(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_sta *sta, struct net_device_path_ctx *ctx, struct net_device_path *path) { int ret = -EOPNOTSUPP; sdata = get_bss_sdata(sdata); if (!check_sdata_in_driver(sdata)) return -EIO; trace_drv_net_fill_forward_path(local, sdata, sta); if (local->ops->net_fill_forward_path) ret = local->ops->net_fill_forward_path(&local->hw, &sdata->vif, sta, ctx, path); trace_drv_return_int(local, ret); return ret; } static inline int drv_net_setup_tc(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct net_device *dev, enum tc_setup_type type, void *type_data) { int ret = -EOPNOTSUPP; might_sleep(); sdata = get_bss_sdata(sdata); trace_drv_net_setup_tc(local, sdata, type); if (local->ops->net_setup_tc) ret = local->ops->net_setup_tc(&local->hw, &sdata->vif, dev, type, type_data); trace_drv_return_int(local, ret); return ret; } static inline bool drv_can_activate_links(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, u16 active_links) { bool ret = true; lockdep_assert_wiphy(local->hw.wiphy); if (!check_sdata_in_driver(sdata)) return false; trace_drv_can_activate_links(local, sdata, active_links); if (local->ops->can_activate_links) ret = local->ops->can_activate_links(&local->hw, &sdata->vif, active_links); trace_drv_return_bool(local, ret); return ret; } int drv_change_vif_links(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, u16 old_links, u16 new_links, struct ieee80211_bss_conf *old[IEEE80211_MLD_MAX_NUM_LINKS]); int drv_change_sta_links(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_sta *sta, u16 old_links, u16 new_links); #endif /* __MAC80211_DRIVER_OPS */ |
| 43 37 1251 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 | /* * include/linux/ktime.h * * ktime_t - nanosecond-resolution time format. * * Copyright(C) 2005, Thomas Gleixner <tglx@linutronix.de> * Copyright(C) 2005, Red Hat, Inc., Ingo Molnar * * data type definitions, declarations, prototypes and macros. * * Started by: Thomas Gleixner and Ingo Molnar * * Credits: * * Roman Zippel provided the ideas and primary code snippets of * the ktime_t union and further simplifications of the original * code. * * For licencing details see kernel-base/COPYING */ #ifndef _LINUX_KTIME_H #define _LINUX_KTIME_H #include <asm/bug.h> #include <linux/jiffies.h> #include <linux/time.h> #include <linux/types.h> /** * ktime_set - Set a ktime_t variable from a seconds/nanoseconds value * @secs: seconds to set * @nsecs: nanoseconds to set * * Return: The ktime_t representation of the value. */ static inline ktime_t ktime_set(const s64 secs, const unsigned long nsecs) { if (unlikely(secs >= KTIME_SEC_MAX)) return KTIME_MAX; return secs * NSEC_PER_SEC + (s64)nsecs; } /* Subtract two ktime_t variables. rem = lhs -rhs: */ #define ktime_sub(lhs, rhs) ((lhs) - (rhs)) /* Add two ktime_t variables. res = lhs + rhs: */ #define ktime_add(lhs, rhs) ((lhs) + (rhs)) /* * Same as ktime_add(), but avoids undefined behaviour on overflow; however, * this means that you must check the result for overflow yourself. */ #define ktime_add_unsafe(lhs, rhs) ((u64) (lhs) + (rhs)) /* * Add a ktime_t variable and a scalar nanosecond value. * res = kt + nsval: */ #define ktime_add_ns(kt, nsval) ((kt) + (nsval)) /* * Subtract a scalar nanosecod from a ktime_t variable * res = kt - nsval: */ #define ktime_sub_ns(kt, nsval) ((kt) - (nsval)) /* convert a timespec64 to ktime_t format: */ static inline ktime_t timespec64_to_ktime(struct timespec64 ts) { return ktime_set(ts.tv_sec, ts.tv_nsec); } /* Map the ktime_t to timespec conversion to ns_to_timespec function */ #define ktime_to_timespec64(kt) ns_to_timespec64((kt)) /* Convert ktime_t to nanoseconds */ static inline s64 ktime_to_ns(const ktime_t kt) { return kt; } /** * ktime_compare - Compares two ktime_t variables for less, greater or equal * @cmp1: comparable1 * @cmp2: comparable2 * * Return: ... * cmp1 < cmp2: return <0 * cmp1 == cmp2: return 0 * cmp1 > cmp2: return >0 */ static inline int ktime_compare(const ktime_t cmp1, const ktime_t cmp2) { if (cmp1 < cmp2) return -1; if (cmp1 > cmp2) return 1; return 0; } /** * ktime_after - Compare if a ktime_t value is bigger than another one. * @cmp1: comparable1 * @cmp2: comparable2 * * Return: true if cmp1 happened after cmp2. */ static inline bool ktime_after(const ktime_t cmp1, const ktime_t cmp2) { return ktime_compare(cmp1, cmp2) > 0; } /** * ktime_before - Compare if a ktime_t value is smaller than another one. * @cmp1: comparable1 * @cmp2: comparable2 * * Return: true if cmp1 happened before cmp2. */ static inline bool ktime_before(const ktime_t cmp1, const ktime_t cmp2) { return ktime_compare(cmp1, cmp2) < 0; } #if BITS_PER_LONG < 64 extern s64 __ktime_divns(const ktime_t kt, s64 div); static inline s64 ktime_divns(const ktime_t kt, s64 div) { /* * Negative divisors could cause an inf loop, * so bug out here. */ BUG_ON(div < 0); if (__builtin_constant_p(div) && !(div >> 32)) { s64 ns = kt; u64 tmp = ns < 0 ? -ns : ns; do_div(tmp, div); return ns < 0 ? -tmp : tmp; } else { return __ktime_divns(kt, div); } } #else /* BITS_PER_LONG < 64 */ static inline s64 ktime_divns(const ktime_t kt, s64 div) { /* * 32-bit implementation cannot handle negative divisors, * so catch them on 64bit as well. */ WARN_ON(div < 0); return kt / div; } #endif static inline s64 ktime_to_us(const ktime_t kt) { return ktime_divns(kt, NSEC_PER_USEC); } static inline s64 ktime_to_ms(const ktime_t kt) { return ktime_divns(kt, NSEC_PER_MSEC); } static inline s64 ktime_us_delta(const ktime_t later, const ktime_t earlier) { return ktime_to_us(ktime_sub(later, earlier)); } static inline s64 ktime_ms_delta(const ktime_t later, const ktime_t earlier) { return ktime_to_ms(ktime_sub(later, earlier)); } static inline ktime_t ktime_add_us(const ktime_t kt, const u64 usec) { return ktime_add_ns(kt, usec * NSEC_PER_USEC); } static inline ktime_t ktime_add_ms(const ktime_t kt, const u64 msec) { return ktime_add_ns(kt, msec * NSEC_PER_MSEC); } static inline ktime_t ktime_sub_us(const ktime_t kt, const u64 usec) { return ktime_sub_ns(kt, usec * NSEC_PER_USEC); } static inline ktime_t ktime_sub_ms(const ktime_t kt, const u64 msec) { return ktime_sub_ns(kt, msec * NSEC_PER_MSEC); } extern ktime_t ktime_add_safe(const ktime_t lhs, const ktime_t rhs); /** * ktime_to_timespec64_cond - convert a ktime_t variable to timespec64 * format only if the variable contains data * @kt: the ktime_t variable to convert * @ts: the timespec variable to store the result in * * Return: %true if there was a successful conversion, %false if kt was 0. */ static inline __must_check bool ktime_to_timespec64_cond(const ktime_t kt, struct timespec64 *ts) { if (kt) { *ts = ktime_to_timespec64(kt); return true; } else { return false; } } #include <vdso/ktime.h> static inline ktime_t ns_to_ktime(u64 ns) { return ns; } static inline ktime_t ms_to_ktime(u64 ms) { return ms * NSEC_PER_MSEC; } # include <linux/timekeeping.h> #endif |
| 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 | // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) ST-Ericsson AB 2010 * Author: Sjur Brendeland */ #define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__ #include <linux/string.h> #include <linux/skbuff.h> #include <linux/export.h> #include <net/caif/cfpkt.h> #define PKT_PREFIX 48 #define PKT_POSTFIX 2 #define PKT_LEN_WHEN_EXTENDING 128 #define PKT_ERROR(pkt, errmsg) \ do { \ cfpkt_priv(pkt)->erronous = true; \ skb_reset_tail_pointer(&pkt->skb); \ pr_warn(errmsg); \ } while (0) struct cfpktq { struct sk_buff_head head; atomic_t count; /* Lock protects count updates */ spinlock_t lock; }; /* * net/caif/ is generic and does not * understand SKB, so we do this typecast */ struct cfpkt { struct sk_buff skb; }; /* Private data inside SKB */ struct cfpkt_priv_data { struct dev_info dev_info; bool erronous; }; static inline struct cfpkt_priv_data *cfpkt_priv(struct cfpkt *pkt) { return (struct cfpkt_priv_data *) pkt->skb.cb; } static inline bool is_erronous(struct cfpkt *pkt) { return cfpkt_priv(pkt)->erronous; } static inline struct sk_buff *pkt_to_skb(struct cfpkt *pkt) { return &pkt->skb; } static inline struct cfpkt *skb_to_pkt(struct sk_buff *skb) { return (struct cfpkt *) skb; } struct cfpkt *cfpkt_fromnative(enum caif_direction dir, void *nativepkt) { struct cfpkt *pkt = skb_to_pkt(nativepkt); cfpkt_priv(pkt)->erronous = false; return pkt; } EXPORT_SYMBOL(cfpkt_fromnative); void *cfpkt_tonative(struct cfpkt *pkt) { return (void *) pkt; } EXPORT_SYMBOL(cfpkt_tonative); static struct cfpkt *cfpkt_create_pfx(u16 len, u16 pfx) { struct sk_buff *skb; skb = alloc_skb(len + pfx, GFP_ATOMIC); if (unlikely(skb == NULL)) return NULL; skb_reserve(skb, pfx); return skb_to_pkt(skb); } inline struct cfpkt *cfpkt_create(u16 len) { return cfpkt_create_pfx(len + PKT_POSTFIX, PKT_PREFIX); } void cfpkt_destroy(struct cfpkt *pkt) { struct sk_buff *skb = pkt_to_skb(pkt); kfree_skb(skb); } inline bool cfpkt_more(struct cfpkt *pkt) { struct sk_buff *skb = pkt_to_skb(pkt); return skb->len > 0; } int cfpkt_peek_head(struct cfpkt *pkt, void *data, u16 len) { struct sk_buff *skb = pkt_to_skb(pkt); if (skb_headlen(skb) >= len) { memcpy(data, skb->data, len); return 0; } return !cfpkt_extr_head(pkt, data, len) && !cfpkt_add_head(pkt, data, len); } int cfpkt_extr_head(struct cfpkt *pkt, void *data, u16 len) { struct sk_buff *skb = pkt_to_skb(pkt); u8 *from; if (unlikely(is_erronous(pkt))) return -EPROTO; if (unlikely(len > skb->len)) { PKT_ERROR(pkt, "read beyond end of packet\n"); return -EPROTO; } if (unlikely(len > skb_headlen(skb))) { if (unlikely(skb_linearize(skb) != 0)) { PKT_ERROR(pkt, "linearize failed\n"); return -EPROTO; } } from = skb_pull(skb, len); from -= len; if (data) memcpy(data, from, len); return 0; } EXPORT_SYMBOL(cfpkt_extr_head); int cfpkt_extr_trail(struct cfpkt *pkt, void *dta, u16 len) { struct sk_buff *skb = pkt_to_skb(pkt); u8 *data = dta; u8 *from; if (unlikely(is_erronous(pkt))) return -EPROTO; if (unlikely(skb_linearize(skb) != 0)) { PKT_ERROR(pkt, "linearize failed\n"); return -EPROTO; } if (unlikely(skb->data + len > skb_tail_pointer(skb))) { PKT_ERROR(pkt, "read beyond end of packet\n"); return -EPROTO; } from = skb_tail_pointer(skb) - len; skb_trim(skb, skb->len - len); memcpy(data, from, len); return 0; } int cfpkt_pad_trail(struct cfpkt *pkt, u16 len) { return cfpkt_add_body(pkt, NULL, len); } int cfpkt_add_body(struct cfpkt *pkt, const void *data, u16 len) { struct sk_buff *skb = pkt_to_skb(pkt); struct sk_buff *lastskb; u8 *to; u16 addlen = 0; if (unlikely(is_erronous(pkt))) return -EPROTO; lastskb = skb; /* Check whether we need to add space at the tail */ if (unlikely(skb_tailroom(skb) < len)) { if (likely(len < PKT_LEN_WHEN_EXTENDING)) addlen = PKT_LEN_WHEN_EXTENDING; else addlen = len; } /* Check whether we need to change the SKB before writing to the tail */ if (unlikely((addlen > 0) || skb_cloned(skb) || skb_shared(skb))) { /* Make sure data is writable */ if (unlikely(skb_cow_data(skb, addlen, &lastskb) < 0)) { PKT_ERROR(pkt, "cow failed\n"); return -EPROTO; } } /* All set to put the last SKB and optionally write data there. */ to = pskb_put(skb, lastskb, len); if (likely(data)) memcpy(to, data, len); return 0; } inline int cfpkt_addbdy(struct cfpkt *pkt, u8 data) { return cfpkt_add_body(pkt, &data, 1); } int cfpkt_add_head(struct cfpkt *pkt, const void *data2, u16 len) { struct sk_buff *skb = pkt_to_skb(pkt); struct sk_buff *lastskb; u8 *to; const u8 *data = data2; int ret; if (unlikely(is_erronous(pkt))) return -EPROTO; if (unlikely(skb_headroom(skb) < len)) { PKT_ERROR(pkt, "no headroom\n"); return -EPROTO; } /* Make sure data is writable */ ret = skb_cow_data(skb, 0, &lastskb); if (unlikely(ret < 0)) { PKT_ERROR(pkt, "cow failed\n"); return ret; } to = skb_push(skb, len); memcpy(to, data, len); return 0; } EXPORT_SYMBOL(cfpkt_add_head); inline int cfpkt_add_trail(struct cfpkt *pkt, const void *data, u16 len) { return cfpkt_add_body(pkt, data, len); } inline u16 cfpkt_getlen(struct cfpkt *pkt) { struct sk_buff *skb = pkt_to_skb(pkt); return skb->len; } int cfpkt_iterate(struct cfpkt *pkt, u16 (*iter_func)(u16, void *, u16), u16 data) { /* * Don't care about the performance hit of linearizing, * Checksum should not be used on high-speed interfaces anyway. */ if (unlikely(is_erronous(pkt))) return -EPROTO; if (unlikely(skb_linearize(&pkt->skb) != 0)) { PKT_ERROR(pkt, "linearize failed\n"); return -EPROTO; } return iter_func(data, pkt->skb.data, cfpkt_getlen(pkt)); } int cfpkt_setlen(struct cfpkt *pkt, u16 len) { struct sk_buff *skb = pkt_to_skb(pkt); if (unlikely(is_erronous(pkt))) return -EPROTO; if (likely(len <= skb->len)) { if (unlikely(skb->data_len)) ___pskb_trim(skb, len); else skb_trim(skb, len); return cfpkt_getlen(pkt); } /* Need to expand SKB */ if (unlikely(!cfpkt_pad_trail(pkt, len - skb->len))) PKT_ERROR(pkt, "skb_pad_trail failed\n"); return cfpkt_getlen(pkt); } struct cfpkt *cfpkt_append(struct cfpkt *dstpkt, struct cfpkt *addpkt, u16 expectlen) { struct sk_buff *dst = pkt_to_skb(dstpkt); struct sk_buff *add = pkt_to_skb(addpkt); u16 addlen = skb_headlen(add); u16 neededtailspace; struct sk_buff *tmp; u16 dstlen; u16 createlen; if (unlikely(is_erronous(dstpkt) || is_erronous(addpkt))) { return dstpkt; } if (expectlen > addlen) neededtailspace = expectlen; else neededtailspace = addlen; if (dst->tail + neededtailspace > dst->end) { /* Create a dumplicate of 'dst' with more tail space */ struct cfpkt *tmppkt; dstlen = skb_headlen(dst); createlen = dstlen + neededtailspace; tmppkt = cfpkt_create(createlen + PKT_PREFIX + PKT_POSTFIX); if (tmppkt == NULL) return NULL; tmp = pkt_to_skb(tmppkt); skb_put_data(tmp, dst->data, dstlen); cfpkt_destroy(dstpkt); dst = tmp; } skb_put_data(dst, add->data, skb_headlen(add)); cfpkt_destroy(addpkt); return skb_to_pkt(dst); } struct cfpkt *cfpkt_split(struct cfpkt *pkt, u16 pos) { struct sk_buff *skb2; struct sk_buff *skb = pkt_to_skb(pkt); struct cfpkt *tmppkt; u8 *split = skb->data + pos; u16 len2nd = skb_tail_pointer(skb) - split; if (unlikely(is_erronous(pkt))) return NULL; if (skb->data + pos > skb_tail_pointer(skb)) { PKT_ERROR(pkt, "trying to split beyond end of packet\n"); return NULL; } /* Create a new packet for the second part of the data */ tmppkt = cfpkt_create_pfx(len2nd + PKT_PREFIX + PKT_POSTFIX, PKT_PREFIX); if (tmppkt == NULL) return NULL; skb2 = pkt_to_skb(tmppkt); if (skb2 == NULL) return NULL; skb_put_data(skb2, split, len2nd); /* Reduce the length of the original packet */ skb_trim(skb, pos); skb2->priority = skb->priority; return skb_to_pkt(skb2); } bool cfpkt_erroneous(struct cfpkt *pkt) { return cfpkt_priv(pkt)->erronous; } struct caif_payload_info *cfpkt_info(struct cfpkt *pkt) { return (struct caif_payload_info *)&pkt_to_skb(pkt)->cb; } EXPORT_SYMBOL(cfpkt_info); void cfpkt_set_prio(struct cfpkt *pkt, int prio) { pkt_to_skb(pkt)->priority = prio; } EXPORT_SYMBOL(cfpkt_set_prio); |
| 915 915 2799 2975 2975 2602 2975 915 2951 2809 110 915 2952 2952 2799 2527 248 2495 53 2483 2952 25 2951 915 915 915 915 2952 25 915 1148 6334 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef LINUX_MM_INLINE_H #define LINUX_MM_INLINE_H #include <linux/atomic.h> #include <linux/huge_mm.h> #include <linux/mm_types.h> #include <linux/swap.h> #include <linux/string.h> #include <linux/userfaultfd_k.h> #include <linux/swapops.h> /** * folio_is_file_lru - Should the folio be on a file LRU or anon LRU? * @folio: The folio to test. * * We would like to get this info without a page flag, but the state * needs to survive until the folio is last deleted from the LRU, which * could be as far down as __page_cache_release. * * Return: An integer (not a boolean!) used to sort a folio onto the * right LRU list and to account folios correctly. * 1 if @folio is a regular filesystem backed page cache folio * or a lazily freed anonymous folio (e.g. via MADV_FREE). * 0 if @folio is a normal anonymous folio, a tmpfs folio or otherwise * ram or swap backed folio. */ static inline int folio_is_file_lru(struct folio *folio) { return !folio_test_swapbacked(folio); } static inline int page_is_file_lru(struct page *page) { return folio_is_file_lru(page_folio(page)); } static __always_inline void __update_lru_size(struct lruvec *lruvec, enum lru_list lru, enum zone_type zid, long nr_pages) { struct pglist_data *pgdat = lruvec_pgdat(lruvec); lockdep_assert_held(&lruvec->lru_lock); WARN_ON_ONCE(nr_pages != (int)nr_pages); __mod_lruvec_state(lruvec, NR_LRU_BASE + lru, nr_pages); __mod_zone_page_state(&pgdat->node_zones[zid], NR_ZONE_LRU_BASE + lru, nr_pages); } static __always_inline void update_lru_size(struct lruvec *lruvec, enum lru_list lru, enum zone_type zid, long nr_pages) { __update_lru_size(lruvec, lru, zid, nr_pages); #ifdef CONFIG_MEMCG mem_cgroup_update_lru_size(lruvec, lru, zid, nr_pages); #endif } /** * __folio_clear_lru_flags - Clear page lru flags before releasing a page. * @folio: The folio that was on lru and now has a zero reference. */ static __always_inline void __folio_clear_lru_flags(struct folio *folio) { VM_BUG_ON_FOLIO(!folio_test_lru(folio), folio); __folio_clear_lru(folio); /* this shouldn't happen, so leave the flags to bad_page() */ if (folio_test_active(folio) && folio_test_unevictable(folio)) return; __folio_clear_active(folio); __folio_clear_unevictable(folio); } /** * folio_lru_list - Which LRU list should a folio be on? * @folio: The folio to test. * * Return: The LRU list a folio should be on, as an index * into the array of LRU lists. */ static __always_inline enum lru_list folio_lru_list(struct folio *folio) { enum lru_list lru; VM_BUG_ON_FOLIO(folio_test_active(folio) && folio_test_unevictable(folio), folio); if (folio_test_unevictable(folio)) return LRU_UNEVICTABLE; lru = folio_is_file_lru(folio) ? LRU_INACTIVE_FILE : LRU_INACTIVE_ANON; if (folio_test_active(folio)) lru += LRU_ACTIVE; return lru; } #ifdef CONFIG_LRU_GEN #ifdef CONFIG_LRU_GEN_ENABLED static inline bool lru_gen_enabled(void) { DECLARE_STATIC_KEY_TRUE(lru_gen_caps[NR_LRU_GEN_CAPS]); return static_branch_likely(&lru_gen_caps[LRU_GEN_CORE]); } #else static inline bool lru_gen_enabled(void) { DECLARE_STATIC_KEY_FALSE(lru_gen_caps[NR_LRU_GEN_CAPS]); return static_branch_unlikely(&lru_gen_caps[LRU_GEN_CORE]); } #endif static inline bool lru_gen_in_fault(void) { return current->in_lru_fault; } static inline int lru_gen_from_seq(unsigned long seq) { return seq % MAX_NR_GENS; } static inline int lru_hist_from_seq(unsigned long seq) { return seq % NR_HIST_GENS; } static inline int lru_tier_from_refs(int refs) { VM_WARN_ON_ONCE(refs > BIT(LRU_REFS_WIDTH)); /* see the comment in folio_lru_refs() */ return order_base_2(refs + 1); } static inline int folio_lru_refs(struct folio *folio) { unsigned long flags = READ_ONCE(folio->flags); bool workingset = flags & BIT(PG_workingset); /* * Return the number of accesses beyond PG_referenced, i.e., N-1 if the * total number of accesses is N>1, since N=0,1 both map to the first * tier. lru_tier_from_refs() will account for this off-by-one. Also see * the comment on MAX_NR_TIERS. */ return ((flags & LRU_REFS_MASK) >> LRU_REFS_PGOFF) + workingset; } static inline int folio_lru_gen(struct folio *folio) { unsigned long flags = READ_ONCE(folio->flags); return ((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; } static inline bool lru_gen_is_active(struct lruvec *lruvec, int gen) { unsigned long max_seq = lruvec->lrugen.max_seq; VM_WARN_ON_ONCE(gen >= MAX_NR_GENS); /* see the comment on MIN_NR_GENS */ return gen == lru_gen_from_seq(max_seq) || gen == lru_gen_from_seq(max_seq - 1); } static inline void lru_gen_update_size(struct lruvec *lruvec, struct folio *folio, int old_gen, int new_gen) { int type = folio_is_file_lru(folio); int zone = folio_zonenum(folio); int delta = folio_nr_pages(folio); enum lru_list lru = type * LRU_INACTIVE_FILE; struct lru_gen_folio *lrugen = &lruvec->lrugen; VM_WARN_ON_ONCE(old_gen != -1 && old_gen >= MAX_NR_GENS); VM_WARN_ON_ONCE(new_gen != -1 && new_gen >= MAX_NR_GENS); VM_WARN_ON_ONCE(old_gen == -1 && new_gen == -1); if (old_gen >= 0) WRITE_ONCE(lrugen->nr_pages[old_gen][type][zone], lrugen->nr_pages[old_gen][type][zone] - delta); if (new_gen >= 0) WRITE_ONCE(lrugen->nr_pages[new_gen][type][zone], lrugen->nr_pages[new_gen][type][zone] + delta); /* addition */ if (old_gen < 0) { if (lru_gen_is_active(lruvec, new_gen)) lru += LRU_ACTIVE; __update_lru_size(lruvec, lru, zone, delta); return; } /* deletion */ if (new_gen < 0) { if (lru_gen_is_active(lruvec, old_gen)) lru += LRU_ACTIVE; __update_lru_size(lruvec, lru, zone, -delta); return; } /* promotion */ if (!lru_gen_is_active(lruvec, old_gen) && lru_gen_is_active(lruvec, new_gen)) { __update_lru_size(lruvec, lru, zone, -delta); __update_lru_size(lruvec, lru + LRU_ACTIVE, zone, delta); } /* demotion requires isolation, e.g., lru_deactivate_fn() */ VM_WARN_ON_ONCE(lru_gen_is_active(lruvec, old_gen) && !lru_gen_is_active(lruvec, new_gen)); } static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming) { unsigned long seq; unsigned long flags; int gen = folio_lru_gen(folio); int type = folio_is_file_lru(folio); int zone = folio_zonenum(folio); struct lru_gen_folio *lrugen = &lruvec->lrugen; VM_WARN_ON_ONCE_FOLIO(gen != -1, folio); if (folio_test_unevictable(folio) || !lrugen->enabled) return false; /* * There are four common cases for this page: * 1. If it's hot, i.e., freshly faulted in, add it to the youngest * generation, and it's protected over the rest below. * 2. If it can't be evicted immediately, i.e., a dirty page pending * writeback, add it to the second youngest generation. * 3. If it should be evicted first, e.g., cold and clean from * folio_rotate_reclaimable(), add it to the oldest generation. * 4. Everything else falls between 2 & 3 above and is added to the * second oldest generation if it's considered inactive, or the * oldest generation otherwise. See lru_gen_is_active(). */ if (folio_test_active(folio)) seq = lrugen->max_seq; else if ((type == LRU_GEN_ANON && !folio_test_swapcache(folio)) || (folio_test_reclaim(folio) && (folio_test_dirty(folio) || folio_test_writeback(folio)))) seq = lrugen->max_seq - 1; else if (reclaiming || lrugen->min_seq[type] + MIN_NR_GENS >= lrugen->max_seq) seq = lrugen->min_seq[type]; else seq = lrugen->min_seq[type] + 1; gen = lru_gen_from_seq(seq); flags = (gen + 1UL) << LRU_GEN_PGOFF; /* see the comment on MIN_NR_GENS about PG_active */ set_mask_bits(&folio->flags, LRU_GEN_MASK | BIT(PG_active), flags); lru_gen_update_size(lruvec, folio, -1, gen); /* for folio_rotate_reclaimable() */ if (reclaiming) list_add_tail(&folio->lru, &lrugen->folios[gen][type][zone]); else list_add(&folio->lru, &lrugen->folios[gen][type][zone]); return true; } static inline bool lru_gen_del_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming) { unsigned long flags; int gen = folio_lru_gen(folio); if (gen < 0) return false; VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio), folio); VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio); /* for folio_migrate_flags() */ flags = !reclaiming && lru_gen_is_active(lruvec, gen) ? BIT(PG_active) : 0; flags = set_mask_bits(&folio->flags, LRU_GEN_MASK, flags); gen = ((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; lru_gen_update_size(lruvec, folio, gen, -1); list_del(&folio->lru); return true; } #else /* !CONFIG_LRU_GEN */ static inline bool lru_gen_enabled(void) { return false; } static inline bool lru_gen_in_fault(void) { return false; } static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming) { return false; } static inline bool lru_gen_del_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming) { return false; } #endif /* CONFIG_LRU_GEN */ static __always_inline void lruvec_add_folio(struct lruvec *lruvec, struct folio *folio) { enum lru_list lru = folio_lru_list(folio); if (lru_gen_add_folio(lruvec, folio, false)) return; update_lru_size(lruvec, lru, folio_zonenum(folio), folio_nr_pages(folio)); if (lru != LRU_UNEVICTABLE) list_add(&folio->lru, &lruvec->lists[lru]); } static __always_inline void lruvec_add_folio_tail(struct lruvec *lruvec, struct folio *folio) { enum lru_list lru = folio_lru_list(folio); if (lru_gen_add_folio(lruvec, folio, true)) return; update_lru_size(lruvec, lru, folio_zonenum(folio), folio_nr_pages(folio)); /* This is not expected to be used on LRU_UNEVICTABLE */ list_add_tail(&folio->lru, &lruvec->lists[lru]); } static __always_inline void lruvec_del_folio(struct lruvec *lruvec, struct folio *folio) { enum lru_list lru = folio_lru_list(folio); if (lru_gen_del_folio(lruvec, folio, false)) return; if (lru != LRU_UNEVICTABLE) list_del(&folio->lru); update_lru_size(lruvec, lru, folio_zonenum(folio), -folio_nr_pages(folio)); } #ifdef CONFIG_ANON_VMA_NAME /* mmap_lock should be read-locked */ static inline void anon_vma_name_get(struct anon_vma_name *anon_name) { if (anon_name) kref_get(&anon_name->kref); } static inline void anon_vma_name_put(struct anon_vma_name *anon_name) { if (anon_name) kref_put(&anon_name->kref, anon_vma_name_free); } static inline struct anon_vma_name *anon_vma_name_reuse(struct anon_vma_name *anon_name) { /* Prevent anon_name refcount saturation early on */ if (kref_read(&anon_name->kref) < REFCOUNT_MAX) { anon_vma_name_get(anon_name); return anon_name; } return anon_vma_name_alloc(anon_name->name); } static inline void dup_anon_vma_name(struct vm_area_struct *orig_vma, struct vm_area_struct *new_vma) { struct anon_vma_name *anon_name = anon_vma_name(orig_vma); if (anon_name) new_vma->anon_name = anon_vma_name_reuse(anon_name); } static inline void free_anon_vma_name(struct vm_area_struct *vma) { /* * Not using anon_vma_name because it generates a warning if mmap_lock * is not held, which might be the case here. */ anon_vma_name_put(vma->anon_name); } static inline bool anon_vma_name_eq(struct anon_vma_name *anon_name1, struct anon_vma_name *anon_name2) { if (anon_name1 == anon_name2) return true; return anon_name1 && anon_name2 && !strcmp(anon_name1->name, anon_name2->name); } #else /* CONFIG_ANON_VMA_NAME */ static inline void anon_vma_name_get(struct anon_vma_name *anon_name) {} static inline void anon_vma_name_put(struct anon_vma_name *anon_name) {} static inline void dup_anon_vma_name(struct vm_area_struct *orig_vma, struct vm_area_struct *new_vma) {} static inline void free_anon_vma_name(struct vm_area_struct *vma) {} static inline bool anon_vma_name_eq(struct anon_vma_name *anon_name1, struct anon_vma_name *anon_name2) { return true; } #endif /* CONFIG_ANON_VMA_NAME */ static inline void init_tlb_flush_pending(struct mm_struct *mm) { atomic_set(&mm->tlb_flush_pending, 0); } static inline void inc_tlb_flush_pending(struct mm_struct *mm) { atomic_inc(&mm->tlb_flush_pending); /* * The only time this value is relevant is when there are indeed pages * to flush. And we'll only flush pages after changing them, which * requires the PTL. * * So the ordering here is: * * atomic_inc(&mm->tlb_flush_pending); * spin_lock(&ptl); * ... * set_pte_at(); * spin_unlock(&ptl); * * spin_lock(&ptl) * mm_tlb_flush_pending(); * .... * spin_unlock(&ptl); * * flush_tlb_range(); * atomic_dec(&mm->tlb_flush_pending); * * Where the increment if constrained by the PTL unlock, it thus * ensures that the increment is visible if the PTE modification is * visible. After all, if there is no PTE modification, nobody cares * about TLB flushes either. * * This very much relies on users (mm_tlb_flush_pending() and * mm_tlb_flush_nested()) only caring about _specific_ PTEs (and * therefore specific PTLs), because with SPLIT_PTE_PTLOCKS and RCpc * locks (PPC) the unlock of one doesn't order against the lock of * another PTL. * * The decrement is ordered by the flush_tlb_range(), such that * mm_tlb_flush_pending() will not return false unless all flushes have * completed. */ } static inline void dec_tlb_flush_pending(struct mm_struct *mm) { /* * See inc_tlb_flush_pending(). * * This cannot be smp_mb__before_atomic() because smp_mb() simply does * not order against TLB invalidate completion, which is what we need. * * Therefore we must rely on tlb_flush_*() to guarantee order. */ atomic_dec(&mm->tlb_flush_pending); } static inline bool mm_tlb_flush_pending(struct mm_struct *mm) { /* * Must be called after having acquired the PTL; orders against that * PTLs release and therefore ensures that if we observe the modified * PTE we must also observe the increment from inc_tlb_flush_pending(). * * That is, it only guarantees to return true if there is a flush * pending for _this_ PTL. */ return atomic_read(&mm->tlb_flush_pending); } static inline bool mm_tlb_flush_nested(struct mm_struct *mm) { /* * Similar to mm_tlb_flush_pending(), we must have acquired the PTL * for which there is a TLB flush pending in order to guarantee * we've seen both that PTE modification and the increment. * * (no requirement on actually still holding the PTL, that is irrelevant) */ return atomic_read(&mm->tlb_flush_pending) > 1; } #ifdef CONFIG_MMU /* * Computes the pte marker to copy from the given source entry into dst_vma. * If no marker should be copied, returns 0. * The caller should insert a new pte created with make_pte_marker(). */ static inline pte_marker copy_pte_marker( swp_entry_t entry, struct vm_area_struct *dst_vma) { pte_marker srcm = pte_marker_get(entry); /* Always copy error entries. */ pte_marker dstm = srcm & PTE_MARKER_POISONED; /* Only copy PTE markers if UFFD register matches. */ if ((srcm & PTE_MARKER_UFFD_WP) && userfaultfd_wp(dst_vma)) dstm |= PTE_MARKER_UFFD_WP; return dstm; } #endif /* * If this pte is wr-protected by uffd-wp in any form, arm the special pte to * replace a none pte. NOTE! This should only be called when *pte is already * cleared so we will never accidentally replace something valuable. Meanwhile * none pte also means we are not demoting the pte so tlb flushed is not needed. * E.g., when pte cleared the caller should have taken care of the tlb flush. * * Must be called with pgtable lock held so that no thread will see the none * pte, and if they see it, they'll fault and serialize at the pgtable lock. * * This function is a no-op if PTE_MARKER_UFFD_WP is not enabled. */ static inline void pte_install_uffd_wp_if_needed(struct vm_area_struct *vma, unsigned long addr, pte_t *pte, pte_t pteval) { #ifdef CONFIG_PTE_MARKER_UFFD_WP bool arm_uffd_pte = false; /* The current status of the pte should be "cleared" before calling */ WARN_ON_ONCE(!pte_none(ptep_get(pte))); /* * NOTE: userfaultfd_wp_unpopulated() doesn't need this whole * thing, because when zapping either it means it's dropping the * page, or in TTU where the present pte will be quickly replaced * with a swap pte. There's no way of leaking the bit. */ if (vma_is_anonymous(vma) || !userfaultfd_wp(vma)) return; /* A uffd-wp wr-protected normal pte */ if (unlikely(pte_present(pteval) && pte_uffd_wp(pteval))) arm_uffd_pte = true; /* * A uffd-wp wr-protected swap pte. Note: this should even cover an * existing pte marker with uffd-wp bit set. */ if (unlikely(pte_swp_uffd_wp_any(pteval))) arm_uffd_pte = true; if (unlikely(arm_uffd_pte)) set_pte_at(vma->vm_mm, addr, pte, make_pte_marker(PTE_MARKER_UFFD_WP)); #endif } static inline bool vma_has_recency(struct vm_area_struct *vma) { if (vma->vm_flags & (VM_SEQ_READ | VM_RAND_READ)) return false; if (vma->vm_file && (vma->vm_file->f_mode & FMODE_NOREUSE)) return false; return true; } #endif |
| 9 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 | /* SPDX-License-Identifier: GPL-2.0 */ /* Copyright (C) B.A.T.M.A.N. contributors: * * Marek Lindner */ #ifndef _NET_BATMAN_ADV_SOFT_INTERFACE_H_ #define _NET_BATMAN_ADV_SOFT_INTERFACE_H_ #include "main.h" #include <linux/kref.h> #include <linux/netdevice.h> #include <linux/skbuff.h> #include <linux/types.h> #include <net/rtnetlink.h> int batadv_skb_head_push(struct sk_buff *skb, unsigned int len); void batadv_interface_rx(struct net_device *soft_iface, struct sk_buff *skb, int hdr_size, struct batadv_orig_node *orig_node); bool batadv_softif_is_valid(const struct net_device *net_dev); extern struct rtnl_link_ops batadv_link_ops; int batadv_softif_create_vlan(struct batadv_priv *bat_priv, unsigned short vid); void batadv_softif_vlan_release(struct kref *ref); struct batadv_softif_vlan *batadv_softif_vlan_get(struct batadv_priv *bat_priv, unsigned short vid); /** * batadv_softif_vlan_put() - decrease the vlan object refcounter and * possibly release it * @vlan: the vlan object to release */ static inline void batadv_softif_vlan_put(struct batadv_softif_vlan *vlan) { if (!vlan) return; kref_put(&vlan->refcount, batadv_softif_vlan_release); } #endif /* _NET_BATMAN_ADV_SOFT_INTERFACE_H_ */ |
| 1117 1116 1117 1117 1117 1115 1116 216 216 76 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 | // SPDX-License-Identifier: GPL-2.0 /* * fs/sysfs/dir.c - sysfs core and dir operation implementation * * Copyright (c) 2001-3 Patrick Mochel * Copyright (c) 2007 SUSE Linux Products GmbH * Copyright (c) 2007 Tejun Heo <teheo@suse.de> * * Please see Documentation/filesystems/sysfs.rst for more information. */ #define pr_fmt(fmt) "sysfs: " fmt #include <linux/fs.h> #include <linux/kobject.h> #include <linux/slab.h> #include "sysfs.h" DEFINE_SPINLOCK(sysfs_symlink_target_lock); void sysfs_warn_dup(struct kernfs_node *parent, const char *name) { char *buf; buf = kzalloc(PATH_MAX, GFP_KERNEL); if (buf) kernfs_path(parent, buf, PATH_MAX); pr_warn("cannot create duplicate filename '%s/%s'\n", buf, name); dump_stack(); kfree(buf); } /** * sysfs_create_dir_ns - create a directory for an object with a namespace tag * @kobj: object we're creating directory for * @ns: the namespace tag to use */ int sysfs_create_dir_ns(struct kobject *kobj, const void *ns) { struct kernfs_node *parent, *kn; kuid_t uid; kgid_t gid; if (WARN_ON(!kobj)) return -EINVAL; if (kobj->parent) parent = kobj->parent->sd; else parent = sysfs_root_kn; if (!parent) return -ENOENT; kobject_get_ownership(kobj, &uid, &gid); kn = kernfs_create_dir_ns(parent, kobject_name(kobj), 0755, uid, gid, kobj, ns); if (IS_ERR(kn)) { if (PTR_ERR(kn) == -EEXIST) sysfs_warn_dup(parent, kobject_name(kobj)); return PTR_ERR(kn); } kobj->sd = kn; return 0; } /** * sysfs_remove_dir - remove an object's directory. * @kobj: object. * * The only thing special about this is that we remove any files in * the directory before we remove the directory, and we've inlined * what used to be sysfs_rmdir() below, instead of calling separately. */ void sysfs_remove_dir(struct kobject *kobj) { struct kernfs_node *kn = kobj->sd; /* * In general, kboject owner is responsible for ensuring removal * doesn't race with other operations and sysfs doesn't provide any * protection; however, when @kobj is used as a symlink target, the * symlinking entity usually doesn't own @kobj and thus has no * control over removal. @kobj->sd may be removed anytime * and symlink code may end up dereferencing an already freed node. * * sysfs_symlink_target_lock synchronizes @kobj->sd * disassociation against symlink operations so that symlink code * can safely dereference @kobj->sd. */ spin_lock(&sysfs_symlink_target_lock); kobj->sd = NULL; spin_unlock(&sysfs_symlink_target_lock); if (kn) { WARN_ON_ONCE(kernfs_type(kn) != KERNFS_DIR); kernfs_remove(kn); } } int sysfs_rename_dir_ns(struct kobject *kobj, const char *new_name, const void *new_ns) { struct kernfs_node *parent; int ret; parent = kernfs_get_parent(kobj->sd); ret = kernfs_rename_ns(kobj->sd, parent, new_name, new_ns); kernfs_put(parent); return ret; } int sysfs_move_dir_ns(struct kobject *kobj, struct kobject *new_parent_kobj, const void *new_ns) { struct kernfs_node *kn = kobj->sd; struct kernfs_node *new_parent; new_parent = new_parent_kobj && new_parent_kobj->sd ? new_parent_kobj->sd : sysfs_root_kn; return kernfs_rename_ns(kn, new_parent, kn->name, new_ns); } /** * sysfs_create_mount_point - create an always empty directory * @parent_kobj: kobject that will contain this always empty directory * @name: The name of the always empty directory to add */ int sysfs_create_mount_point(struct kobject *parent_kobj, const char *name) { struct kernfs_node *kn, *parent = parent_kobj->sd; kn = kernfs_create_empty_dir(parent, name); if (IS_ERR(kn)) { if (PTR_ERR(kn) == -EEXIST) sysfs_warn_dup(parent, name); return PTR_ERR(kn); } return 0; } EXPORT_SYMBOL_GPL(sysfs_create_mount_point); /** * sysfs_remove_mount_point - remove an always empty directory. * @parent_kobj: kobject that will contain this always empty directory * @name: The name of the always empty directory to remove * */ void sysfs_remove_mount_point(struct kobject *parent_kobj, const char *name) { struct kernfs_node *parent = parent_kobj->sd; kernfs_remove_by_name_ns(parent, name, NULL); } EXPORT_SYMBOL_GPL(sysfs_remove_mount_point); |
| 1343 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 | // SPDX-License-Identifier: GPL-2.0 /* * Copyright IBM Corporation, 2021 * * Author: Mike Rapoport <rppt@linux.ibm.com> */ #include <linux/mm.h> #include <linux/fs.h> #include <linux/swap.h> #include <linux/mount.h> #include <linux/memfd.h> #include <linux/bitops.h> #include <linux/printk.h> #include <linux/pagemap.h> #include <linux/syscalls.h> #include <linux/pseudo_fs.h> #include <linux/secretmem.h> #include <linux/set_memory.h> #include <linux/sched/signal.h> #include <uapi/linux/magic.h> #include <asm/tlbflush.h> #include "internal.h" #undef pr_fmt #define pr_fmt(fmt) "secretmem: " fmt /* * Define mode and flag masks to allow validation of the system call * parameters. */ #define SECRETMEM_MODE_MASK (0x0) #define SECRETMEM_FLAGS_MASK SECRETMEM_MODE_MASK static bool secretmem_enable __ro_after_init = 1; module_param_named(enable, secretmem_enable, bool, 0400); MODULE_PARM_DESC(secretmem_enable, "Enable secretmem and memfd_secret(2) system call"); static atomic_t secretmem_users; bool secretmem_active(void) { return !!atomic_read(&secretmem_users); } static vm_fault_t secretmem_fault(struct vm_fault *vmf) { struct address_space *mapping = vmf->vma->vm_file->f_mapping; struct inode *inode = file_inode(vmf->vma->vm_file); pgoff_t offset = vmf->pgoff; gfp_t gfp = vmf->gfp_mask; unsigned long addr; struct page *page; struct folio *folio; vm_fault_t ret; int err; if (((loff_t)vmf->pgoff << PAGE_SHIFT) >= i_size_read(inode)) return vmf_error(-EINVAL); filemap_invalidate_lock_shared(mapping); retry: page = find_lock_page(mapping, offset); if (!page) { folio = folio_alloc(gfp | __GFP_ZERO, 0); if (!folio) { ret = VM_FAULT_OOM; goto out; } page = &folio->page; err = set_direct_map_invalid_noflush(page); if (err) { folio_put(folio); ret = vmf_error(err); goto out; } __folio_mark_uptodate(folio); err = filemap_add_folio(mapping, folio, offset, gfp); if (unlikely(err)) { folio_put(folio); /* * If a split of large page was required, it * already happened when we marked the page invalid * which guarantees that this call won't fail */ set_direct_map_default_noflush(page); if (err == -EEXIST) goto retry; ret = vmf_error(err); goto out; } addr = (unsigned long)page_address(page); flush_tlb_kernel_range(addr, addr + PAGE_SIZE); } vmf->page = page; ret = VM_FAULT_LOCKED; out: filemap_invalidate_unlock_shared(mapping); return ret; } static const struct vm_operations_struct secretmem_vm_ops = { .fault = secretmem_fault, }; static int secretmem_release(struct inode *inode, struct file *file) { atomic_dec(&secretmem_users); return 0; } static int secretmem_mmap(struct file *file, struct vm_area_struct *vma) { unsigned long len = vma->vm_end - vma->vm_start; if ((vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) == 0) return -EINVAL; if (!mlock_future_ok(vma->vm_mm, vma->vm_flags | VM_LOCKED, len)) return -EAGAIN; vm_flags_set(vma, VM_LOCKED | VM_DONTDUMP); vma->vm_ops = &secretmem_vm_ops; return 0; } bool vma_is_secretmem(struct vm_area_struct *vma) { return vma->vm_ops == &secretmem_vm_ops; } static const struct file_operations secretmem_fops = { .release = secretmem_release, .mmap = secretmem_mmap, }; static int secretmem_migrate_folio(struct address_space *mapping, struct folio *dst, struct folio *src, enum migrate_mode mode) { return -EBUSY; } static void secretmem_free_folio(struct folio *folio) { set_direct_map_default_noflush(&folio->page); folio_zero_segment(folio, 0, folio_size(folio)); } const struct address_space_operations secretmem_aops = { .dirty_folio = noop_dirty_folio, .free_folio = secretmem_free_folio, .migrate_folio = secretmem_migrate_folio, }; static int secretmem_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *iattr) { struct inode *inode = d_inode(dentry); struct address_space *mapping = inode->i_mapping; unsigned int ia_valid = iattr->ia_valid; int ret; filemap_invalidate_lock(mapping); if ((ia_valid & ATTR_SIZE) && inode->i_size) ret = -EINVAL; else ret = simple_setattr(idmap, dentry, iattr); filemap_invalidate_unlock(mapping); return ret; } static const struct inode_operations secretmem_iops = { .setattr = secretmem_setattr, }; static struct vfsmount *secretmem_mnt; static struct file *secretmem_file_create(unsigned long flags) { struct file *file; struct inode *inode; const char *anon_name = "[secretmem]"; const struct qstr qname = QSTR_INIT(anon_name, strlen(anon_name)); int err; inode = alloc_anon_inode(secretmem_mnt->mnt_sb); if (IS_ERR(inode)) return ERR_CAST(inode); err = security_inode_init_security_anon(inode, &qname, NULL); if (err) { file = ERR_PTR(err); goto err_free_inode; } file = alloc_file_pseudo(inode, secretmem_mnt, "secretmem", O_RDWR, &secretmem_fops); if (IS_ERR(file)) goto err_free_inode; mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER); mapping_set_unevictable(inode->i_mapping); inode->i_op = &secretmem_iops; inode->i_mapping->a_ops = &secretmem_aops; /* pretend we are a normal file with zero size */ inode->i_mode |= S_IFREG; inode->i_size = 0; return file; err_free_inode: iput(inode); return file; } SYSCALL_DEFINE1(memfd_secret, unsigned int, flags) { struct file *file; int fd, err; /* make sure local flags do not confict with global fcntl.h */ BUILD_BUG_ON(SECRETMEM_FLAGS_MASK & O_CLOEXEC); if (!secretmem_enable) return -ENOSYS; if (flags & ~(SECRETMEM_FLAGS_MASK | O_CLOEXEC)) return -EINVAL; if (atomic_read(&secretmem_users) < 0) return -ENFILE; fd = get_unused_fd_flags(flags & O_CLOEXEC); if (fd < 0) return fd; file = secretmem_file_create(flags); if (IS_ERR(file)) { err = PTR_ERR(file); goto err_put_fd; } file->f_flags |= O_LARGEFILE; atomic_inc(&secretmem_users); fd_install(fd, file); return fd; err_put_fd: put_unused_fd(fd); return err; } static int secretmem_init_fs_context(struct fs_context *fc) { return init_pseudo(fc, SECRETMEM_MAGIC) ? 0 : -ENOMEM; } static struct file_system_type secretmem_fs = { .name = "secretmem", .init_fs_context = secretmem_init_fs_context, .kill_sb = kill_anon_super, }; static int __init secretmem_init(void) { if (!secretmem_enable) return 0; secretmem_mnt = kern_mount(&secretmem_fs); if (IS_ERR(secretmem_mnt)) return PTR_ERR(secretmem_mnt); /* prevent secretmem mappings from ever getting PROT_EXEC */ secretmem_mnt->mnt_flags |= MNT_NOEXEC; return 0; } fs_initcall(secretmem_init); |
| 1 539 178 3064 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 | /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM sock #if !defined(_TRACE_SOCK_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_SOCK_H #include <net/sock.h> #include <net/ipv6.h> #include <linux/tracepoint.h> #include <linux/ipv6.h> #include <linux/tcp.h> #define family_names \ EM(AF_INET) \ EMe(AF_INET6) /* The protocol traced by inet_sock_set_state */ #define inet_protocol_names \ EM(IPPROTO_TCP) \ EM(IPPROTO_DCCP) \ EM(IPPROTO_SCTP) \ EMe(IPPROTO_MPTCP) #define tcp_state_names \ EM(TCP_ESTABLISHED) \ EM(TCP_SYN_SENT) \ EM(TCP_SYN_RECV) \ EM(TCP_FIN_WAIT1) \ EM(TCP_FIN_WAIT2) \ EM(TCP_TIME_WAIT) \ EM(TCP_CLOSE) \ EM(TCP_CLOSE_WAIT) \ EM(TCP_LAST_ACK) \ EM(TCP_LISTEN) \ EM(TCP_CLOSING) \ EMe(TCP_NEW_SYN_RECV) #define skmem_kind_names \ EM(SK_MEM_SEND) \ EMe(SK_MEM_RECV) /* enums need to be exported to user space */ #undef EM #undef EMe #define EM(a) TRACE_DEFINE_ENUM(a); #define EMe(a) TRACE_DEFINE_ENUM(a); family_names inet_protocol_names tcp_state_names skmem_kind_names #undef EM #undef EMe #define EM(a) { a, #a }, #define EMe(a) { a, #a } #define show_family_name(val) \ __print_symbolic(val, family_names) #define show_inet_protocol_name(val) \ __print_symbolic(val, inet_protocol_names) #define show_tcp_state_name(val) \ __print_symbolic(val, tcp_state_names) #define show_skmem_kind_names(val) \ __print_symbolic(val, skmem_kind_names) TRACE_EVENT(sock_rcvqueue_full, TP_PROTO(struct sock *sk, struct sk_buff *skb), TP_ARGS(sk, skb), TP_STRUCT__entry( __field(int, rmem_alloc) __field(unsigned int, truesize) __field(int, sk_rcvbuf) ), TP_fast_assign( __entry->rmem_alloc = atomic_read(&sk->sk_rmem_alloc); __entry->truesize = skb->truesize; __entry->sk_rcvbuf = READ_ONCE(sk->sk_rcvbuf); ), TP_printk("rmem_alloc=%d truesize=%u sk_rcvbuf=%d", __entry->rmem_alloc, __entry->truesize, __entry->sk_rcvbuf) ); TRACE_EVENT(sock_exceed_buf_limit, TP_PROTO(struct sock *sk, struct proto *prot, long allocated, int kind), TP_ARGS(sk, prot, allocated, kind), TP_STRUCT__entry( __array(char, name, 32) __array(long, sysctl_mem, 3) __field(long, allocated) __field(int, sysctl_rmem) __field(int, rmem_alloc) __field(int, sysctl_wmem) __field(int, wmem_alloc) __field(int, wmem_queued) __field(int, kind) ), TP_fast_assign( strncpy(__entry->name, prot->name, 32); __entry->sysctl_mem[0] = READ_ONCE(prot->sysctl_mem[0]); __entry->sysctl_mem[1] = READ_ONCE(prot->sysctl_mem[1]); __entry->sysctl_mem[2] = READ_ONCE(prot->sysctl_mem[2]); __entry->allocated = allocated; __entry->sysctl_rmem = sk_get_rmem0(sk, prot); __entry->rmem_alloc = atomic_read(&sk->sk_rmem_alloc); __entry->sysctl_wmem = sk_get_wmem0(sk, prot); __entry->wmem_alloc = refcount_read(&sk->sk_wmem_alloc); __entry->wmem_queued = READ_ONCE(sk->sk_wmem_queued); __entry->kind = kind; ), TP_printk("proto:%s sysctl_mem=%ld,%ld,%ld allocated=%ld sysctl_rmem=%d rmem_alloc=%d sysctl_wmem=%d wmem_alloc=%d wmem_queued=%d kind=%s", __entry->name, __entry->sysctl_mem[0], __entry->sysctl_mem[1], __entry->sysctl_mem[2], __entry->allocated, __entry->sysctl_rmem, __entry->rmem_alloc, __entry->sysctl_wmem, __entry->wmem_alloc, __entry->wmem_queued, show_skmem_kind_names(__entry->kind) ) ); TRACE_EVENT(inet_sock_set_state, TP_PROTO(const struct sock *sk, const int oldstate, const int newstate), TP_ARGS(sk, oldstate, newstate), TP_STRUCT__entry( __field(const void *, skaddr) __field(int, oldstate) __field(int, newstate) __field(__u16, sport) __field(__u16, dport) __field(__u16, family) __field(__u16, protocol) __array(__u8, saddr, 4) __array(__u8, daddr, 4) __array(__u8, saddr_v6, 16) __array(__u8, daddr_v6, 16) ), TP_fast_assign( const struct inet_sock *inet = inet_sk(sk); struct in6_addr *pin6; __be32 *p32; __entry->skaddr = sk; __entry->oldstate = oldstate; __entry->newstate = newstate; __entry->family = sk->sk_family; __entry->protocol = sk->sk_protocol; __entry->sport = ntohs(inet->inet_sport); __entry->dport = ntohs(inet->inet_dport); p32 = (__be32 *) __entry->saddr; *p32 = inet->inet_saddr; p32 = (__be32 *) __entry->daddr; *p32 = inet->inet_daddr; #if IS_ENABLED(CONFIG_IPV6) if (sk->sk_family == AF_INET6) { pin6 = (struct in6_addr *)__entry->saddr_v6; *pin6 = sk->sk_v6_rcv_saddr; pin6 = (struct in6_addr *)__entry->daddr_v6; *pin6 = sk->sk_v6_daddr; } else #endif { pin6 = (struct in6_addr *)__entry->saddr_v6; ipv6_addr_set_v4mapped(inet->inet_saddr, pin6); pin6 = (struct in6_addr *)__entry->daddr_v6; ipv6_addr_set_v4mapped(inet->inet_daddr, pin6); } ), TP_printk("family=%s protocol=%s sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c oldstate=%s newstate=%s", show_family_name(__entry->family), show_inet_protocol_name(__entry->protocol), __entry->sport, __entry->dport, __entry->saddr, __entry->daddr, __entry->saddr_v6, __entry->daddr_v6, show_tcp_state_name(__entry->oldstate), show_tcp_state_name(__entry->newstate)) ); TRACE_EVENT(inet_sk_error_report, TP_PROTO(const struct sock *sk), TP_ARGS(sk), TP_STRUCT__entry( __field(int, error) __field(__u16, sport) __field(__u16, dport) __field(__u16, family) __field(__u16, protocol) __array(__u8, saddr, 4) __array(__u8, daddr, 4) __array(__u8, saddr_v6, 16) __array(__u8, daddr_v6, 16) ), TP_fast_assign( const struct inet_sock *inet = inet_sk(sk); struct in6_addr *pin6; __be32 *p32; __entry->error = sk->sk_err; __entry->family = sk->sk_family; __entry->protocol = sk->sk_protocol; __entry->sport = ntohs(inet->inet_sport); __entry->dport = ntohs(inet->inet_dport); p32 = (__be32 *) __entry->saddr; *p32 = inet->inet_saddr; p32 = (__be32 *) __entry->daddr; *p32 = inet->inet_daddr; #if IS_ENABLED(CONFIG_IPV6) if (sk->sk_family == AF_INET6) { pin6 = (struct in6_addr *)__entry->saddr_v6; *pin6 = sk->sk_v6_rcv_saddr; pin6 = (struct in6_addr *)__entry->daddr_v6; *pin6 = sk->sk_v6_daddr; } else #endif { pin6 = (struct in6_addr *)__entry->saddr_v6; ipv6_addr_set_v4mapped(inet->inet_saddr, pin6); pin6 = (struct in6_addr *)__entry->daddr_v6; ipv6_addr_set_v4mapped(inet->inet_daddr, pin6); } ), TP_printk("family=%s protocol=%s sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c error=%d", show_family_name(__entry->family), show_inet_protocol_name(__entry->protocol), __entry->sport, __entry->dport, __entry->saddr, __entry->daddr, __entry->saddr_v6, __entry->daddr_v6, __entry->error) ); TRACE_EVENT(sk_data_ready, TP_PROTO(const struct sock *sk), TP_ARGS(sk), TP_STRUCT__entry( __field(const void *, skaddr) __field(__u16, family) __field(__u16, protocol) __field(unsigned long, ip) ), TP_fast_assign( __entry->skaddr = sk; __entry->family = sk->sk_family; __entry->protocol = sk->sk_protocol; __entry->ip = _RET_IP_; ), TP_printk("family=%u protocol=%u func=%ps", __entry->family, __entry->protocol, (void *)__entry->ip) ); /* * sock send/recv msg length */ DECLARE_EVENT_CLASS(sock_msg_length, TP_PROTO(struct sock *sk, int ret, int flags), TP_ARGS(sk, ret, flags), TP_STRUCT__entry( __field(void *, sk) __field(__u16, family) __field(__u16, protocol) __field(int, ret) __field(int, flags) ), TP_fast_assign( __entry->sk = sk; __entry->family = sk->sk_family; __entry->protocol = sk->sk_protocol; __entry->ret = ret; __entry->flags = flags; ), TP_printk("sk address = %p, family = %s protocol = %s, length = %d, error = %d, flags = 0x%x", __entry->sk, show_family_name(__entry->family), show_inet_protocol_name(__entry->protocol), !(__entry->flags & MSG_PEEK) ? (__entry->ret > 0 ? __entry->ret : 0) : 0, __entry->ret < 0 ? __entry->ret : 0, __entry->flags) ); DEFINE_EVENT(sock_msg_length, sock_send_length, TP_PROTO(struct sock *sk, int ret, int flags), TP_ARGS(sk, ret, flags) ); DEFINE_EVENT(sock_msg_length, sock_recv_length, TP_PROTO(struct sock *sk, int ret, int flags), TP_ARGS(sk, ret, flags) ); #endif /* _TRACE_SOCK_H */ /* This part must be outside protection */ #include <trace/define_trace.h> |
| 73 73 73 73 73 73 73 73 73 73 73 73 73 73 73 73 88 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 | // SPDX-License-Identifier: GPL-2.0 /* * Implementation of HKDF ("HMAC-based Extract-and-Expand Key Derivation * Function"), aka RFC 5869. See also the original paper (Krawczyk 2010): * "Cryptographic Extraction and Key Derivation: The HKDF Scheme". * * This is used to derive keys from the fscrypt master keys. * * Copyright 2019 Google LLC */ #include <crypto/hash.h> #include <crypto/sha2.h> #include "fscrypt_private.h" /* * HKDF supports any unkeyed cryptographic hash algorithm, but fscrypt uses * SHA-512 because it is well-established, secure, and reasonably efficient. * * HKDF-SHA256 was also considered, as its 256-bit security strength would be * sufficient here. A 512-bit security strength is "nice to have", though. * Also, on 64-bit CPUs, SHA-512 is usually just as fast as SHA-256. In the * common case of deriving an AES-256-XTS key (512 bits), that can result in * HKDF-SHA512 being much faster than HKDF-SHA256, as the longer digest size of * SHA-512 causes HKDF-Expand to only need to do one iteration rather than two. */ #define HKDF_HMAC_ALG "hmac(sha512)" #define HKDF_HASHLEN SHA512_DIGEST_SIZE /* * HKDF consists of two steps: * * 1. HKDF-Extract: extract a pseudorandom key of length HKDF_HASHLEN bytes from * the input keying material and optional salt. * 2. HKDF-Expand: expand the pseudorandom key into output keying material of * any length, parameterized by an application-specific info string. * * HKDF-Extract can be skipped if the input is already a pseudorandom key of * length HKDF_HASHLEN bytes. However, cipher modes other than AES-256-XTS take * shorter keys, and we don't want to force users of those modes to provide * unnecessarily long master keys. Thus fscrypt still does HKDF-Extract. No * salt is used, since fscrypt master keys should already be pseudorandom and * there's no way to persist a random salt per master key from kernel mode. */ /* HKDF-Extract (RFC 5869 section 2.2), unsalted */ static int hkdf_extract(struct crypto_shash *hmac_tfm, const u8 *ikm, unsigned int ikmlen, u8 prk[HKDF_HASHLEN]) { static const u8 default_salt[HKDF_HASHLEN]; int err; err = crypto_shash_setkey(hmac_tfm, default_salt, HKDF_HASHLEN); if (err) return err; return crypto_shash_tfm_digest(hmac_tfm, ikm, ikmlen, prk); } /* * Compute HKDF-Extract using the given master key as the input keying material, * and prepare an HMAC transform object keyed by the resulting pseudorandom key. * * Afterwards, the keyed HMAC transform object can be used for HKDF-Expand many * times without having to recompute HKDF-Extract each time. */ int fscrypt_init_hkdf(struct fscrypt_hkdf *hkdf, const u8 *master_key, unsigned int master_key_size) { struct crypto_shash *hmac_tfm; u8 prk[HKDF_HASHLEN]; int err; hmac_tfm = crypto_alloc_shash(HKDF_HMAC_ALG, 0, 0); if (IS_ERR(hmac_tfm)) { fscrypt_err(NULL, "Error allocating " HKDF_HMAC_ALG ": %ld", PTR_ERR(hmac_tfm)); return PTR_ERR(hmac_tfm); } if (WARN_ON_ONCE(crypto_shash_digestsize(hmac_tfm) != sizeof(prk))) { err = -EINVAL; goto err_free_tfm; } err = hkdf_extract(hmac_tfm, master_key, master_key_size, prk); if (err) goto err_free_tfm; err = crypto_shash_setkey(hmac_tfm, prk, sizeof(prk)); if (err) goto err_free_tfm; hkdf->hmac_tfm = hmac_tfm; goto out; err_free_tfm: crypto_free_shash(hmac_tfm); out: memzero_explicit(prk, sizeof(prk)); return err; } /* * HKDF-Expand (RFC 5869 section 2.3). This expands the pseudorandom key, which * was already keyed into 'hkdf->hmac_tfm' by fscrypt_init_hkdf(), into 'okmlen' * bytes of output keying material parameterized by the application-specific * 'info' of length 'infolen' bytes, prefixed by "fscrypt\0" and the 'context' * byte. This is thread-safe and may be called by multiple threads in parallel. * * ('context' isn't part of the HKDF specification; it's just a prefix fscrypt * adds to its application-specific info strings to guarantee that it doesn't * accidentally repeat an info string when using HKDF for different purposes.) */ int fscrypt_hkdf_expand(const struct fscrypt_hkdf *hkdf, u8 context, const u8 *info, unsigned int infolen, u8 *okm, unsigned int okmlen) { SHASH_DESC_ON_STACK(desc, hkdf->hmac_tfm); u8 prefix[9]; unsigned int i; int err; const u8 *prev = NULL; u8 counter = 1; u8 tmp[HKDF_HASHLEN]; if (WARN_ON_ONCE(okmlen > 255 * HKDF_HASHLEN)) return -EINVAL; desc->tfm = hkdf->hmac_tfm; memcpy(prefix, "fscrypt\0", 8); prefix[8] = context; for (i = 0; i < okmlen; i += HKDF_HASHLEN) { err = crypto_shash_init(desc); if (err) goto out; if (prev) { err = crypto_shash_update(desc, prev, HKDF_HASHLEN); if (err) goto out; } err = crypto_shash_update(desc, prefix, sizeof(prefix)); if (err) goto out; err = crypto_shash_update(desc, info, infolen); if (err) goto out; BUILD_BUG_ON(sizeof(counter) != 1); if (okmlen - i < HKDF_HASHLEN) { err = crypto_shash_finup(desc, &counter, 1, tmp); if (err) goto out; memcpy(&okm[i], tmp, okmlen - i); memzero_explicit(tmp, sizeof(tmp)); } else { err = crypto_shash_finup(desc, &counter, 1, &okm[i]); if (err) goto out; } counter++; prev = &okm[i]; } err = 0; out: if (unlikely(err)) memzero_explicit(okm, okmlen); /* so caller doesn't need to */ shash_desc_zero(desc); return err; } void fscrypt_destroy_hkdf(struct fscrypt_hkdf *hkdf) { crypto_free_shash(hkdf->hmac_tfm); } |
| 2 537 537 10 10 2958 3293 3296 3295 3305 333 10 333 2944 538 537 2943 2943 536 2939 2944 2186 2187 803 32 2094 2094 489 346 362 278 340 2 278 278 278 278 67 67 67 67 1242 1242 1243 1243 1243 1243 340 340 1243 1242 2753 1206 938 46 938 543 543 543 2 543 9 7 7 2 2 2 2 2 2 2 2 2 505 1224 959 959 488 879 1224 2287 731 2763 2763 731 720 2762 2761 2762 2757 2751 381 505 2762 490 490 490 487 487 487 487 487 487 487 487 490 490 487 2187 2185 814 814 32 32 5 803 803 2187 2187 2187 2185 814 814 32 32 5 803 803 19 802 326 47 47 47 835 835 834 835 835 835 38 327 826 346 346 2364 2363 2364 499 5 5 5 1024 1002 1001 1001 1679 953 377 367 1678 1678 491 491 159 159 159 159 1679 938 1676 814 803 19 32 47 2363 490 1228 1136 2363 3 1 703 457 703 536 146 619 2 2354 490 490 487 487 2355 2355 2106 1064 1049 29 1064 272 1422 2363 2363 2363 6 5 2 3 2 3 3 6 3 3 3 3 21 4 18 18 18 2227 12 2227 1 2227 62 61 62 62 62 62 21 21 21 2987 312 312 311 312 312 312 312 312 312 312 1114 64 1114 1100 1114 1114 337 312 312 9 9 7 312 309 312 10 312 311 7 12 312 312 311 11 326 1112 62 66 66 64 1117 1112 1117 18 1 1 17 17 16 8 8 9 1116 17 1102 1100 66 1114 1112 1112 66 66 64 64 64 64 64 64 64 64 1114 1112 1112 1084 1084 1111 1112 7 405 405 405 405 405 41 6 41 385 373 385 385 385 4 4 51 24 12 24 24 13 24 27 51 51 12 39 51 51 7 7 7 7 7 2 2 2 2 5 2 2 6 6 6 6 46 46 46 34 34 46 46 46 46 15 3 15 12 12 15 15 336 336 34 34 34 327 46 282 37 245 238 7 40 77 46 336 2737 2735 2477 1193 1191 520 314 15 299 2558 145 2225 2185 2186 2187 2187 2187 398 1933 398 1933 2187 2145 44 2187 1014 2187 1014 2187 2177 2177 964 2177 11 11 11 2177 2177 1005 1407 1406 27 2177 2735 2735 2735 1109 1108 1102 1097 452 452 51 452 452 325 452 452 114 452 452 452 452 452 165 79 80 80 80 80 80 79 79 79 79 70 70 49 48 48 40 39 39 39 39 39 2 376 498 498 49 454 454 128 454 376 375 375 375 15 1 21 46 46 46 29 46 46 29 46 40 29 40 40 18 18 18 18 12 12 12 2 2 2 24 80 98 97 96 5 98 91 2 2 91 90 2 91 9 9 1 1 9 7 6 6 358 358 358 357 357 357 357 357 24 24 24 10 356 356 356 347 356 353 356 356 356 356 356 356 356 347 353 357 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 | // SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com * Written by Alex Tomas <alex@clusterfs.com> * * Architecture independence: * Copyright (c) 2005, Bull S.A. * Written by Pierre Peiffer <pierre.peiffer@bull.net> */ /* * Extents support for EXT4 * * TODO: * - ext4*_error() should be used in some situations * - analyze all BUG()/BUG_ON(), use -EIO where appropriate * - smart tree reduction */ #include <linux/fs.h> #include <linux/time.h> #include <linux/jbd2.h> #include <linux/highuid.h> #include <linux/pagemap.h> #include <linux/quotaops.h> #include <linux/string.h> #include <linux/slab.h> #include <linux/uaccess.h> #include <linux/fiemap.h> #include <linux/iomap.h> #include <linux/sched/mm.h> #include "ext4_jbd2.h" #include "ext4_extents.h" #include "xattr.h" #include <trace/events/ext4.h> /* * used by extent splitting. */ #define EXT4_EXT_MAY_ZEROOUT 0x1 /* safe to zeroout if split fails \ due to ENOSPC */ #define EXT4_EXT_MARK_UNWRIT1 0x2 /* mark first half unwritten */ #define EXT4_EXT_MARK_UNWRIT2 0x4 /* mark second half unwritten */ #define EXT4_EXT_DATA_VALID1 0x8 /* first half contains valid data */ #define EXT4_EXT_DATA_VALID2 0x10 /* second half contains valid data */ static __le32 ext4_extent_block_csum(struct inode *inode, struct ext4_extent_header *eh) { struct ext4_inode_info *ei = EXT4_I(inode); struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); __u32 csum; csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)eh, EXT4_EXTENT_TAIL_OFFSET(eh)); return cpu_to_le32(csum); } static int ext4_extent_block_csum_verify(struct inode *inode, struct ext4_extent_header *eh) { struct ext4_extent_tail *et; if (!ext4_has_metadata_csum(inode->i_sb)) return 1; et = find_ext4_extent_tail(eh); if (et->et_checksum != ext4_extent_block_csum(inode, eh)) return 0; return 1; } static void ext4_extent_block_csum_set(struct inode *inode, struct ext4_extent_header *eh) { struct ext4_extent_tail *et; if (!ext4_has_metadata_csum(inode->i_sb)) return; et = find_ext4_extent_tail(eh); et->et_checksum = ext4_extent_block_csum(inode, eh); } static int ext4_split_extent_at(handle_t *handle, struct inode *inode, struct ext4_ext_path **ppath, ext4_lblk_t split, int split_flag, int flags); static int ext4_ext_trunc_restart_fn(struct inode *inode, int *dropped) { /* * Drop i_data_sem to avoid deadlock with ext4_map_blocks. At this * moment, get_block can be called only for blocks inside i_size since * page cache has been already dropped and writes are blocked by * i_rwsem. So we can safely drop the i_data_sem here. */ BUG_ON(EXT4_JOURNAL(inode) == NULL); ext4_discard_preallocations(inode, 0); up_write(&EXT4_I(inode)->i_data_sem); *dropped = 1; return 0; } static void ext4_ext_drop_refs(struct ext4_ext_path *path) { int depth, i; if (!path) return; depth = path->p_depth; for (i = 0; i <= depth; i++, path++) { brelse(path->p_bh); path->p_bh = NULL; } } void ext4_free_ext_path(struct ext4_ext_path *path) { ext4_ext_drop_refs(path); kfree(path); } /* * Make sure 'handle' has at least 'check_cred' credits. If not, restart * transaction with 'restart_cred' credits. The function drops i_data_sem * when restarting transaction and gets it after transaction is restarted. * * The function returns 0 on success, 1 if transaction had to be restarted, * and < 0 in case of fatal error. */ int ext4_datasem_ensure_credits(handle_t *handle, struct inode *inode, int check_cred, int restart_cred, int revoke_cred) { int ret; int dropped = 0; ret = ext4_journal_ensure_credits_fn(handle, check_cred, restart_cred, revoke_cred, ext4_ext_trunc_restart_fn(inode, &dropped)); if (dropped) down_write(&EXT4_I(inode)->i_data_sem); return ret; } /* * could return: * - EROFS * - ENOMEM */ static int ext4_ext_get_access(handle_t *handle, struct inode *inode, struct ext4_ext_path *path) { int err = 0; if (path->p_bh) { /* path points to block */ BUFFER_TRACE(path->p_bh, "get_write_access"); err = ext4_journal_get_write_access(handle, inode->i_sb, path->p_bh, EXT4_JTR_NONE); /* * The extent buffer's verified bit will be set again in * __ext4_ext_dirty(). We could leave an inconsistent * buffer if the extents updating procudure break off du * to some error happens, force to check it again. */ if (!err) clear_buffer_verified(path->p_bh); } /* path points to leaf/index in inode body */ /* we use in-core data, no need to protect them */ return err; } /* * could return: * - EROFS * - ENOMEM * - EIO */ static int __ext4_ext_dirty(const char *where, unsigned int line, handle_t *handle, struct inode *inode, struct ext4_ext_path *path) { int err; WARN_ON(!rwsem_is_locked(&EXT4_I(inode)->i_data_sem)); if (path->p_bh) { ext4_extent_block_csum_set(inode, ext_block_hdr(path->p_bh)); /* path points to block */ err = __ext4_handle_dirty_metadata(where, line, handle, inode, path->p_bh); /* Extents updating done, re-set verified flag */ if (!err) set_buffer_verified(path->p_bh); } else { /* path points to leaf/index in inode body */ err = ext4_mark_inode_dirty(handle, inode); } return err; } #define ext4_ext_dirty(handle, inode, path) \ __ext4_ext_dirty(__func__, __LINE__, (handle), (inode), (path)) static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode, struct ext4_ext_path *path, ext4_lblk_t block) { if (path) { int depth = path->p_depth; struct ext4_extent *ex; /* * Try to predict block placement assuming that we are * filling in a file which will eventually be * non-sparse --- i.e., in the case of libbfd writing * an ELF object sections out-of-order but in a way * the eventually results in a contiguous object or * executable file, or some database extending a table * space file. However, this is actually somewhat * non-ideal if we are writing a sparse file such as * qemu or KVM writing a raw image file that is going * to stay fairly sparse, since it will end up * fragmenting the file system's free space. Maybe we * should have some hueristics or some way to allow * userspace to pass a hint to file system, * especially if the latter case turns out to be * common. */ ex = path[depth].p_ext; if (ex) { ext4_fsblk_t ext_pblk = ext4_ext_pblock(ex); ext4_lblk_t ext_block = le32_to_cpu(ex->ee_block); if (block > ext_block) return ext_pblk + (block - ext_block); else return ext_pblk - (ext_block - block); } /* it looks like index is empty; * try to find starting block from index itself */ if (path[depth].p_bh) return path[depth].p_bh->b_blocknr; } /* OK. use inode's group */ return ext4_inode_to_goal_block(inode); } /* * Allocation for a meta data block */ static ext4_fsblk_t ext4_ext_new_meta_block(handle_t *handle, struct inode *inode, struct ext4_ext_path *path, struct ext4_extent *ex, int *err, unsigned int flags) { ext4_fsblk_t goal, newblock; goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block)); newblock = ext4_new_meta_blocks(handle, inode, goal, flags, NULL, err); return newblock; } static inline int ext4_ext_space_block(struct inode *inode, int check) { int size; size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header)) / sizeof(struct ext4_extent); #ifdef AGGRESSIVE_TEST if (!check && size > 6) size = 6; #endif return size; } static inline int ext4_ext_space_block_idx(struct inode *inode, int check) { int size; size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header)) / sizeof(struct ext4_extent_idx); #ifdef AGGRESSIVE_TEST if (!check && size > 5) size = 5; #endif return size; } static inline int ext4_ext_space_root(struct inode *inode, int check) { int size; size = sizeof(EXT4_I(inode)->i_data); size -= sizeof(struct ext4_extent_header); size /= sizeof(struct ext4_extent); #ifdef AGGRESSIVE_TEST if (!check && size > 3) size = 3; #endif return size; } static inline int ext4_ext_space_root_idx(struct inode *inode, int check) { int size; size = sizeof(EXT4_I(inode)->i_data); size -= sizeof(struct ext4_extent_header); size /= sizeof(struct ext4_extent_idx); #ifdef AGGRESSIVE_TEST if (!check && size > 4) size = 4; #endif return size; } static inline int ext4_force_split_extent_at(handle_t *handle, struct inode *inode, struct ext4_ext_path **ppath, ext4_lblk_t lblk, int nofail) { struct ext4_ext_path *path = *ppath; int unwritten = ext4_ext_is_unwritten(path[path->p_depth].p_ext); int flags = EXT4_EX_NOCACHE | EXT4_GET_BLOCKS_PRE_IO; if (nofail) flags |= EXT4_GET_BLOCKS_METADATA_NOFAIL | EXT4_EX_NOFAIL; return ext4_split_extent_at(handle, inode, ppath, lblk, unwritten ? EXT4_EXT_MARK_UNWRIT1|EXT4_EXT_MARK_UNWRIT2 : 0, flags); } static int ext4_ext_max_entries(struct inode *inode, int depth) { int max; if (depth == ext_depth(inode)) { if (depth == 0) max = ext4_ext_space_root(inode, 1); else max = ext4_ext_space_root_idx(inode, 1); } else { if (depth == 0) max = ext4_ext_space_block(inode, 1); else max = ext4_ext_space_block_idx(inode, 1); } return max; } static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext) { ext4_fsblk_t block = ext4_ext_pblock(ext); int len = ext4_ext_get_actual_len(ext); ext4_lblk_t lblock = le32_to_cpu(ext->ee_block); /* * We allow neither: * - zero length * - overflow/wrap-around */ if (lblock + len <= lblock) return 0; return ext4_inode_block_valid(inode, block, len); } static int ext4_valid_extent_idx(struct inode *inode, struct ext4_extent_idx *ext_idx) { ext4_fsblk_t block = ext4_idx_pblock(ext_idx); return ext4_inode_block_valid(inode, block, 1); } static int ext4_valid_extent_entries(struct inode *inode, struct ext4_extent_header *eh, ext4_lblk_t lblk, ext4_fsblk_t *pblk, int depth) { unsigned short entries; ext4_lblk_t lblock = 0; ext4_lblk_t cur = 0; if (eh->eh_entries == 0) return 1; entries = le16_to_cpu(eh->eh_entries); if (depth == 0) { /* leaf entries */ struct ext4_extent *ext = EXT_FIRST_EXTENT(eh); /* * The logical block in the first entry should equal to * the number in the index block. */ if (depth != ext_depth(inode) && lblk != le32_to_cpu(ext->ee_block)) return 0; while (entries) { if (!ext4_valid_extent(inode, ext)) return 0; /* Check for overlapping extents */ lblock = le32_to_cpu(ext->ee_block); if (lblock < cur) { *pblk = ext4_ext_pblock(ext); return 0; } cur = lblock + ext4_ext_get_actual_len(ext); ext++; entries--; } } else { struct ext4_extent_idx *ext_idx = EXT_FIRST_INDEX(eh); /* * The logical block in the first entry should equal to * the number in the parent index block. */ if (depth != ext_depth(inode) && lblk != le32_to_cpu(ext_idx->ei_block)) return 0; while (entries) { if (!ext4_valid_extent_idx(inode, ext_idx)) return 0; /* Check for overlapping index extents */ lblock = le32_to_cpu(ext_idx->ei_block); if (lblock < cur) { *pblk = ext4_idx_pblock(ext_idx); return 0; } ext_idx++; entries--; cur = lblock + 1; } } return 1; } static int __ext4_ext_check(const char *function, unsigned int line, struct inode *inode, struct ext4_extent_header *eh, int depth, ext4_fsblk_t pblk, ext4_lblk_t lblk) { const char *error_msg; int max = 0, err = -EFSCORRUPTED; if (unlikely(eh->eh_magic != EXT4_EXT_MAGIC)) { error_msg = "invalid magic"; goto corrupted; } if (unlikely(le16_to_cpu(eh->eh_depth) != depth)) { error_msg = "unexpected eh_depth"; goto corrupted; } if (unlikely(eh->eh_max == 0)) { error_msg = "invalid eh_max"; goto corrupted; } max = ext4_ext_max_entries(inode, depth); if (unlikely(le16_to_cpu(eh->eh_max) > max)) { error_msg = "too large eh_max"; goto corrupted; } if (unlikely(le16_to_cpu(eh->eh_entries) > le16_to_cpu(eh->eh_max))) { error_msg = "invalid eh_entries"; goto corrupted; } if (unlikely((eh->eh_entries == 0) && (depth > 0))) { error_msg = "eh_entries is 0 but eh_depth is > 0"; goto corrupted; } if (!ext4_valid_extent_entries(inode, eh, lblk, &pblk, depth)) { error_msg = "invalid extent entries"; goto corrupted; } if (unlikely(depth > 32)) { error_msg = "too large eh_depth"; goto corrupted; } /* Verify checksum on non-root extent tree nodes */ if (ext_depth(inode) != depth && !ext4_extent_block_csum_verify(inode, eh)) { error_msg = "extent tree corrupted"; err = -EFSBADCRC; goto corrupted; } return 0; corrupted: ext4_error_inode_err(inode, function, line, 0, -err, "pblk %llu bad header/extent: %s - magic %x, " "entries %u, max %u(%u), depth %u(%u)", (unsigned long long) pblk, error_msg, le16_to_cpu(eh->eh_magic), le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max), max, le16_to_cpu(eh->eh_depth), depth); return err; } #define ext4_ext_check(inode, eh, depth, pblk) \ __ext4_ext_check(__func__, __LINE__, (inode), (eh), (depth), (pblk), 0) int ext4_ext_check_inode(struct inode *inode) { return ext4_ext_check(inode, ext_inode_hdr(inode), ext_depth(inode), 0); } static void ext4_cache_extents(struct inode *inode, struct ext4_extent_header *eh) { struct ext4_extent *ex = EXT_FIRST_EXTENT(eh); ext4_lblk_t prev = 0; int i; for (i = le16_to_cpu(eh->eh_entries); i > 0; i--, ex++) { unsigned int status = EXTENT_STATUS_WRITTEN; ext4_lblk_t lblk = le32_to_cpu(ex->ee_block); int len = ext4_ext_get_actual_len(ex); if (prev && (prev != lblk)) ext4_es_cache_extent(inode, prev, lblk - prev, ~0, EXTENT_STATUS_HOLE); if (ext4_ext_is_unwritten(ex)) status = EXTENT_STATUS_UNWRITTEN; ext4_es_cache_extent(inode, lblk, len, ext4_ext_pblock(ex), status); prev = lblk + len; } } static struct buffer_head * __read_extent_tree_block(const char *function, unsigned int line, struct inode *inode, struct ext4_extent_idx *idx, int depth, int flags) { struct buffer_head *bh; int err; gfp_t gfp_flags = __GFP_MOVABLE | GFP_NOFS; ext4_fsblk_t pblk; if (flags & EXT4_EX_NOFAIL) gfp_flags |= __GFP_NOFAIL; pblk = ext4_idx_pblock(idx); bh = sb_getblk_gfp(inode->i_sb, pblk, gfp_flags); if (unlikely(!bh)) return ERR_PTR(-ENOMEM); if (!bh_uptodate_or_lock(bh)) { trace_ext4_ext_load_extent(inode, pblk, _RET_IP_); err = ext4_read_bh(bh, 0, NULL); if (err < 0) goto errout; } if (buffer_verified(bh) && !(flags & EXT4_EX_FORCE_CACHE)) return bh; err = __ext4_ext_check(function, line, inode, ext_block_hdr(bh), depth, pblk, le32_to_cpu(idx->ei_block)); if (err) goto errout; set_buffer_verified(bh); /* * If this is a leaf block, cache all of its entries */ if (!(flags & EXT4_EX_NOCACHE) && depth == 0) { struct ext4_extent_header *eh = ext_block_hdr(bh); ext4_cache_extents(inode, eh); } return bh; errout: put_bh(bh); return ERR_PTR(err); } #define read_extent_tree_block(inode, idx, depth, flags) \ __read_extent_tree_block(__func__, __LINE__, (inode), (idx), \ (depth), (flags)) /* * This function is called to cache a file's extent information in the * extent status tree */ int ext4_ext_precache(struct inode *inode) { struct ext4_inode_info *ei = EXT4_I(inode); struct ext4_ext_path *path = NULL; struct buffer_head *bh; int i = 0, depth, ret = 0; if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) return 0; /* not an extent-mapped inode */ down_read(&ei->i_data_sem); depth = ext_depth(inode); /* Don't cache anything if there are no external extent blocks */ if (!depth) { up_read(&ei->i_data_sem); return ret; } path = kcalloc(depth + 1, sizeof(struct ext4_ext_path), GFP_NOFS); if (path == NULL) { up_read(&ei->i_data_sem); return -ENOMEM; } path[0].p_hdr = ext_inode_hdr(inode); ret = ext4_ext_check(inode, path[0].p_hdr, depth, 0); if (ret) goto out; path[0].p_idx = EXT_FIRST_INDEX(path[0].p_hdr); while (i >= 0) { /* * If this is a leaf block or we've reached the end of * the index block, go up */ if ((i == depth) || path[i].p_idx > EXT_LAST_INDEX(path[i].p_hdr)) { brelse(path[i].p_bh); path[i].p_bh = NULL; i--; continue; } bh = read_extent_tree_block(inode, path[i].p_idx++, depth - i - 1, EXT4_EX_FORCE_CACHE); if (IS_ERR(bh)) { ret = PTR_ERR(bh); break; } i++; path[i].p_bh = bh; path[i].p_hdr = ext_block_hdr(bh); path[i].p_idx = EXT_FIRST_INDEX(path[i].p_hdr); } ext4_set_inode_state(inode, EXT4_STATE_EXT_PRECACHED); out: up_read(&ei->i_data_sem); ext4_free_ext_path(path); return ret; } #ifdef EXT_DEBUG static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path) { int k, l = path->p_depth; ext_debug(inode, "path:"); for (k = 0; k <= l; k++, path++) { if (path->p_idx) { ext_debug(inode, " %d->%llu", le32_to_cpu(path->p_idx->ei_block), ext4_idx_pblock(path->p_idx)); } else if (path->p_ext) { ext_debug(inode, " %d:[%d]%d:%llu ", le32_to_cpu(path->p_ext->ee_block), ext4_ext_is_unwritten(path->p_ext), ext4_ext_get_actual_len(path->p_ext), ext4_ext_pblock(path->p_ext)); } else ext_debug(inode, " []"); } ext_debug(inode, "\n"); } static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path) { int depth = ext_depth(inode); struct ext4_extent_header *eh; struct ext4_extent *ex; int i; if (!path) return; eh = path[depth].p_hdr; ex = EXT_FIRST_EXTENT(eh); ext_debug(inode, "Displaying leaf extents\n"); for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ex++) { ext_debug(inode, "%d:[%d]%d:%llu ", le32_to_cpu(ex->ee_block), ext4_ext_is_unwritten(ex), ext4_ext_get_actual_len(ex), ext4_ext_pblock(ex)); } ext_debug(inode, "\n"); } static void ext4_ext_show_move(struct inode *inode, struct ext4_ext_path *path, ext4_fsblk_t newblock, int level) { int depth = ext_depth(inode); struct ext4_extent *ex; if (depth != level) { struct ext4_extent_idx *idx; idx = path[level].p_idx; while (idx <= EXT_MAX_INDEX(path[level].p_hdr)) { ext_debug(inode, "%d: move %d:%llu in new index %llu\n", level, le32_to_cpu(idx->ei_block), ext4_idx_pblock(idx), newblock); idx++; } return; } ex = path[depth].p_ext; while (ex <= EXT_MAX_EXTENT(path[depth].p_hdr)) { ext_debug(inode, "move %d:%llu:[%d]%d in new leaf %llu\n", le32_to_cpu(ex->ee_block), ext4_ext_pblock(ex), ext4_ext_is_unwritten(ex), ext4_ext_get_actual_len(ex), newblock); ex++; } } #else #define ext4_ext_show_path(inode, path) #define ext4_ext_show_leaf(inode, path) #define ext4_ext_show_move(inode, path, newblock, level) #endif /* * ext4_ext_binsearch_idx: * binary search for the closest index of the given block * the header must be checked before calling this */ static void ext4_ext_binsearch_idx(struct inode *inode, struct ext4_ext_path *path, ext4_lblk_t block) { struct ext4_extent_header *eh = path->p_hdr; struct ext4_extent_idx *r, *l, *m; ext_debug(inode, "binsearch for %u(idx): ", block); l = EXT_FIRST_INDEX(eh) + 1; r = EXT_LAST_INDEX(eh); while (l <= r) { m = l + (r - l) / 2; ext_debug(inode, "%p(%u):%p(%u):%p(%u) ", l, le32_to_cpu(l->ei_block), m, le32_to_cpu(m->ei_block), r, le32_to_cpu(r->ei_block)); if (block < le32_to_cpu(m->ei_block)) r = m - 1; else l = m + 1; } path->p_idx = l - 1; ext_debug(inode, " -> %u->%lld ", le32_to_cpu(path->p_idx->ei_block), ext4_idx_pblock(path->p_idx)); #ifdef CHECK_BINSEARCH { struct ext4_extent_idx *chix, *ix; int k; chix = ix = EXT_FIRST_INDEX(eh); for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ix++) { if (k != 0 && le32_to_cpu(ix->ei_block) <= le32_to_cpu(ix[-1].ei_block)) { printk(KERN_DEBUG "k=%d, ix=0x%p, " "first=0x%p\n", k, ix, EXT_FIRST_INDEX(eh)); printk(KERN_DEBUG "%u <= %u\n", le32_to_cpu(ix->ei_block), le32_to_cpu(ix[-1].ei_block)); } BUG_ON(k && le32_to_cpu(ix->ei_block) <= le32_to_cpu(ix[-1].ei_block)); if (block < le32_to_cpu(ix->ei_block)) break; chix = ix; } BUG_ON(chix != path->p_idx); } #endif } /* * ext4_ext_binsearch: * binary search for closest extent of the given block * the header must be checked before calling this */ static void ext4_ext_binsearch(struct inode *inode, struct ext4_ext_path *path, ext4_lblk_t block) { struct ext4_extent_header *eh = path->p_hdr; struct ext4_extent *r, *l, *m; if (eh->eh_entries == 0) { /* * this leaf is empty: * we get such a leaf in split/add case */ return; } ext_debug(inode, "binsearch for %u: ", block); l = EXT_FIRST_EXTENT(eh) + 1; r = EXT_LAST_EXTENT(eh); while (l <= r) { m = l + (r - l) / 2; ext_debug(inode, "%p(%u):%p(%u):%p(%u) ", l, le32_to_cpu(l->ee_block), m, le32_to_cpu(m->ee_block), r, le32_to_cpu(r->ee_block)); if (block < le32_to_cpu(m->ee_block)) r = m - 1; else l = m + 1; } path->p_ext = l - 1; ext_debug(inode, " -> %d:%llu:[%d]%d ", le32_to_cpu(path->p_ext->ee_block), ext4_ext_pblock(path->p_ext), ext4_ext_is_unwritten(path->p_ext), ext4_ext_get_actual_len(path->p_ext)); #ifdef CHECK_BINSEARCH { struct ext4_extent *chex, *ex; int k; chex = ex = EXT_FIRST_EXTENT(eh); for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ex++) { BUG_ON(k && le32_to_cpu(ex->ee_block) <= le32_to_cpu(ex[-1].ee_block)); if (block < le32_to_cpu(ex->ee_block)) break; chex = ex; } BUG_ON(chex != path->p_ext); } #endif } void ext4_ext_tree_init(handle_t *handle, struct inode *inode) { struct ext4_extent_header *eh; eh = ext_inode_hdr(inode); eh->eh_depth = 0; eh->eh_entries = 0; eh->eh_magic = EXT4_EXT_MAGIC; eh->eh_max = cpu_to_le16(ext4_ext_space_root(inode, 0)); eh->eh_generation = 0; ext4_mark_inode_dirty(handle, inode); } struct ext4_ext_path * ext4_find_extent(struct inode *inode, ext4_lblk_t block, struct ext4_ext_path **orig_path, int flags) { struct ext4_extent_header *eh; struct buffer_head *bh; struct ext4_ext_path *path = orig_path ? *orig_path : NULL; short int depth, i, ppos = 0; int ret; gfp_t gfp_flags = GFP_NOFS; if (flags & EXT4_EX_NOFAIL) gfp_flags |= __GFP_NOFAIL; eh = ext_inode_hdr(inode); depth = ext_depth(inode); if (depth < 0 || depth > EXT4_MAX_EXTENT_DEPTH) { EXT4_ERROR_INODE(inode, "inode has invalid extent depth: %d", depth); ret = -EFSCORRUPTED; goto err; } if (path) { ext4_ext_drop_refs(path); if (depth > path[0].p_maxdepth) { kfree(path); *orig_path = path = NULL; } } if (!path) { /* account possible depth increase */ path = kcalloc(depth + 2, sizeof(struct ext4_ext_path), gfp_flags); if (unlikely(!path)) return ERR_PTR(-ENOMEM); path[0].p_maxdepth = depth + 1; } path[0].p_hdr = eh; path[0].p_bh = NULL; i = depth; if (!(flags & EXT4_EX_NOCACHE) && depth == 0) ext4_cache_extents(inode, eh); /* walk through the tree */ while (i) { ext_debug(inode, "depth %d: num %d, max %d\n", ppos, le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max)); ext4_ext_binsearch_idx(inode, path + ppos, block); path[ppos].p_block = ext4_idx_pblock(path[ppos].p_idx); path[ppos].p_depth = i; path[ppos].p_ext = NULL; bh = read_extent_tree_block(inode, path[ppos].p_idx, --i, flags); if (IS_ERR(bh)) { ret = PTR_ERR(bh); goto err; } eh = ext_block_hdr(bh); ppos++; path[ppos].p_bh = bh; path[ppos].p_hdr = eh; } path[ppos].p_depth = i; path[ppos].p_ext = NULL; path[ppos].p_idx = NULL; /* find extent */ ext4_ext_binsearch(inode, path + ppos, block); /* if not an empty leaf */ if (path[ppos].p_ext) path[ppos].p_block = ext4_ext_pblock(path[ppos].p_ext); ext4_ext_show_path(inode, path); return path; err: ext4_free_ext_path(path); if (orig_path) *orig_path = NULL; return ERR_PTR(ret); } /* * ext4_ext_insert_index: * insert new index [@logical;@ptr] into the block at @curp; * check where to insert: before @curp or after @curp */ static int ext4_ext_insert_index(handle_t *handle, struct inode *inode, struct ext4_ext_path *curp, int logical, ext4_fsblk_t ptr) { struct ext4_extent_idx *ix; int len, err; err = ext4_ext_get_access(handle, inode, curp); if (err) return err; if (unlikely(logical == le32_to_cpu(curp->p_idx->ei_block))) { EXT4_ERROR_INODE(inode, "logical %d == ei_block %d!", logical, le32_to_cpu(curp->p_idx->ei_block)); return -EFSCORRUPTED; } if (unlikely(le16_to_cpu(curp->p_hdr->eh_entries) >= le16_to_cpu(curp->p_hdr->eh_max))) { EXT4_ERROR_INODE(inode, "eh_entries %d >= eh_max %d!", le16_to_cpu(curp->p_hdr->eh_entries), le16_to_cpu(curp->p_hdr->eh_max)); return -EFSCORRUPTED; } if (logical > le32_to_cpu(curp->p_idx->ei_block)) { /* insert after */ ext_debug(inode, "insert new index %d after: %llu\n", logical, ptr); ix = curp->p_idx + 1; } else { /* insert before */ ext_debug(inode, "insert new index %d before: %llu\n", logical, ptr); ix = curp->p_idx; } if (unlikely(ix > EXT_MAX_INDEX(curp->p_hdr))) { EXT4_ERROR_INODE(inode, "ix > EXT_MAX_INDEX!"); return -EFSCORRUPTED; } len = EXT_LAST_INDEX(curp->p_hdr) - ix + 1; BUG_ON(len < 0); if (len > 0) { ext_debug(inode, "insert new index %d: " "move %d indices from 0x%p to 0x%p\n", logical, len, ix, ix + 1); memmove(ix + 1, ix, len * sizeof(struct ext4_extent_idx)); } ix->ei_block = cpu_to_le32(logical); ext4_idx_store_pblock(ix, ptr); le16_add_cpu(&curp->p_hdr->eh_entries, 1); if (unlikely(ix > EXT_LAST_INDEX(curp->p_hdr))) { EXT4_ERROR_INODE(inode, "ix > EXT_LAST_INDEX!"); return -EFSCORRUPTED; } err = ext4_ext_dirty(handle, inode, curp); ext4_std_error(inode->i_sb, err); return err; } /* * ext4_ext_split: * inserts new subtree into the path, using free index entry * at depth @at: * - allocates all needed blocks (new leaf and all intermediate index blocks) * - makes decision where to split * - moves remaining extents and index entries (right to the split point) * into the newly allocated blocks * - initializes subtree */ static int ext4_ext_split(handle_t *handle, struct inode *inode, unsigned int flags, struct ext4_ext_path *path, struct ext4_extent *newext, int at) { struct buffer_head *bh = NULL; int depth = ext_depth(inode); struct ext4_extent_header *neh; struct ext4_extent_idx *fidx; int i = at, k, m, a; ext4_fsblk_t newblock, oldblock; __le32 border; ext4_fsblk_t *ablocks = NULL; /* array of allocated blocks */ gfp_t gfp_flags = GFP_NOFS; int err = 0; size_t ext_size = 0; if (flags & EXT4_EX_NOFAIL) gfp_flags |= __GFP_NOFAIL; /* make decision: where to split? */ /* FIXME: now decision is simplest: at current extent */ /* if current leaf will be split, then we should use * border from split point */ if (unlikely(path[depth].p_ext > EXT_MAX_EXTENT(path[depth].p_hdr))) { EXT4_ERROR_INODE(inode, "p_ext > EXT_MAX_EXTENT!"); return -EFSCORRUPTED; } if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) { border = path[depth].p_ext[1].ee_block; ext_debug(inode, "leaf will be split." " next leaf starts at %d\n", le32_to_cpu(border)); } else { border = newext->ee_block; ext_debug(inode, "leaf will be added." " next leaf starts at %d\n", le32_to_cpu(border)); } /* * If error occurs, then we break processing * and mark filesystem read-only. index won't * be inserted and tree will be in consistent * state. Next mount will repair buffers too. */ /* * Get array to track all allocated blocks. * We need this to handle errors and free blocks * upon them. */ ablocks = kcalloc(depth, sizeof(ext4_fsblk_t), gfp_flags); if (!ablocks) return -ENOMEM; /* allocate all needed blocks */ ext_debug(inode, "allocate %d blocks for indexes/leaf\n", depth - at); for (a = 0; a < depth - at; a++) { newblock = ext4_ext_new_meta_block(handle, inode, path, newext, &err, flags); if (newblock == 0) goto cleanup; ablocks[a] = newblock; } /* initialize new leaf */ newblock = ablocks[--a]; if (unlikely(newblock == 0)) { EXT4_ERROR_INODE(inode, "newblock == 0!"); err = -EFSCORRUPTED; goto cleanup; } bh = sb_getblk_gfp(inode->i_sb, newblock, __GFP_MOVABLE | GFP_NOFS); if (unlikely(!bh)) { err = -ENOMEM; goto cleanup; } lock_buffer(bh); err = ext4_journal_get_create_access(handle, inode->i_sb, bh, EXT4_JTR_NONE); if (err) goto cleanup; neh = ext_block_hdr(bh); neh->eh_entries = 0; neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0)); neh->eh_magic = EXT4_EXT_MAGIC; neh->eh_depth = 0; neh->eh_generation = 0; /* move remainder of path[depth] to the new leaf */ if (unlikely(path[depth].p_hdr->eh_entries != path[depth].p_hdr->eh_max)) { EXT4_ERROR_INODE(inode, "eh_entries %d != eh_max %d!", path[depth].p_hdr->eh_entries, path[depth].p_hdr->eh_max); err = -EFSCORRUPTED; goto cleanup; } /* start copy from next extent */ m = EXT_MAX_EXTENT(path[depth].p_hdr) - path[depth].p_ext++; ext4_ext_show_move(inode, path, newblock, depth); if (m) { struct ext4_extent *ex; ex = EXT_FIRST_EXTENT(neh); memmove(ex, path[depth].p_ext, sizeof(struct ext4_extent) * m); le16_add_cpu(&neh->eh_entries, m); } /* zero out unused area in the extent block */ ext_size = sizeof(struct ext4_extent_header) + sizeof(struct ext4_extent) * le16_to_cpu(neh->eh_entries); memset(bh->b_data + ext_size, 0, inode->i_sb->s_blocksize - ext_size); ext4_extent_block_csum_set(inode, neh); set_buffer_uptodate(bh); unlock_buffer(bh); err = ext4_handle_dirty_metadata(handle, inode, bh); if (err) goto cleanup; brelse(bh); bh = NULL; /* correct old leaf */ if (m) { err = ext4_ext_get_access(handle, inode, path + depth); if (err) goto cleanup; le16_add_cpu(&path[depth].p_hdr->eh_entries, -m); err = ext4_ext_dirty(handle, inode, path + depth); if (err) goto cleanup; } /* create intermediate indexes */ k = depth - at - 1; if (unlikely(k < 0)) { EXT4_ERROR_INODE(inode, "k %d < 0!", k); err = -EFSCORRUPTED; goto cleanup; } if (k) ext_debug(inode, "create %d intermediate indices\n", k); /* insert new index into current index block */ /* current depth stored in i var */ i = depth - 1; while (k--) { oldblock = newblock; newblock = ablocks[--a]; bh = sb_getblk(inode->i_sb, newblock); if (unlikely(!bh)) { err = -ENOMEM; goto cleanup; } lock_buffer(bh); err = ext4_journal_get_create_access(handle, inode->i_sb, bh, EXT4_JTR_NONE); if (err) goto cleanup; neh = ext_block_hdr(bh); neh->eh_entries = cpu_to_le16(1); neh->eh_magic = EXT4_EXT_MAGIC; neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0)); neh->eh_depth = cpu_to_le16(depth - i); neh->eh_generation = 0; fidx = EXT_FIRST_INDEX(neh); fidx->ei_block = border; ext4_idx_store_pblock(fidx, oldblock); ext_debug(inode, "int.index at %d (block %llu): %u -> %llu\n", i, newblock, le32_to_cpu(border), oldblock); /* move remainder of path[i] to the new index block */ if (unlikely(EXT_MAX_INDEX(path[i].p_hdr) != EXT_LAST_INDEX(path[i].p_hdr))) { EXT4_ERROR_INODE(inode, "EXT_MAX_INDEX != EXT_LAST_INDEX ee_block %d!", le32_to_cpu(path[i].p_ext->ee_block)); err = -EFSCORRUPTED; goto cleanup; } /* start copy indexes */ m = EXT_MAX_INDEX(path[i].p_hdr) - path[i].p_idx++; ext_debug(inode, "cur 0x%p, last 0x%p\n", path[i].p_idx, EXT_MAX_INDEX(path[i].p_hdr)); ext4_ext_show_move(inode, path, newblock, i); if (m) { memmove(++fidx, path[i].p_idx, sizeof(struct ext4_extent_idx) * m); le16_add_cpu(&neh->eh_entries, m); } /* zero out unused area in the extent block */ ext_size = sizeof(struct ext4_extent_header) + (sizeof(struct ext4_extent) * le16_to_cpu(neh->eh_entries)); memset(bh->b_data + ext_size, 0, inode->i_sb->s_blocksize - ext_size); ext4_extent_block_csum_set(inode, neh); set_buffer_uptodate(bh); unlock_buffer(bh); err = ext4_handle_dirty_metadata(handle, inode, bh); if (err) goto cleanup; brelse(bh); bh = NULL; /* correct old index */ if (m) { err = ext4_ext_get_access(handle, inode, path + i); if (err) goto cleanup; le16_add_cpu(&path[i].p_hdr->eh_entries, -m); err = ext4_ext_dirty(handle, inode, path + i); if (err) goto cleanup; } i--; } /* insert new index */ err = ext4_ext_insert_index(handle, inode, path + at, le32_to_cpu(border), newblock); cleanup: if (bh) { if (buffer_locked(bh)) unlock_buffer(bh); brelse(bh); } if (err) { /* free all allocated blocks in error case */ for (i = 0; i < depth; i++) { if (!ablocks[i]) continue; ext4_free_blocks(handle, inode, NULL, ablocks[i], 1, EXT4_FREE_BLOCKS_METADATA); } } kfree(ablocks); return err; } /* * ext4_ext_grow_indepth: * implements tree growing procedure: * - allocates new block * - moves top-level data (index block or leaf) into the new block * - initializes new top-level, creating index that points to the * just created block */ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode, unsigned int flags) { struct ext4_extent_header *neh; struct buffer_head *bh; ext4_fsblk_t newblock, goal = 0; struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es; int err = 0; size_t ext_size = 0; /* Try to prepend new index to old one */ if (ext_depth(inode)) goal = ext4_idx_pblock(EXT_FIRST_INDEX(ext_inode_hdr(inode))); if (goal > le32_to_cpu(es->s_first_data_block)) { flags |= EXT4_MB_HINT_TRY_GOAL; goal--; } else goal = ext4_inode_to_goal_block(inode); newblock = ext4_new_meta_blocks(handle, inode, goal, flags, NULL, &err); if (newblock == 0) return err; bh = sb_getblk_gfp(inode->i_sb, newblock, __GFP_MOVABLE | GFP_NOFS); if (unlikely(!bh)) return -ENOMEM; lock_buffer(bh); err = ext4_journal_get_create_access(handle, inode->i_sb, bh, EXT4_JTR_NONE); if (err) { unlock_buffer(bh); goto out; } ext_size = sizeof(EXT4_I(inode)->i_data); /* move top-level index/leaf into new block */ memmove(bh->b_data, EXT4_I(inode)->i_data, ext_size); /* zero out unused area in the extent block */ memset(bh->b_data + ext_size, 0, inode->i_sb->s_blocksize - ext_size); /* set size of new block */ neh = ext_block_hdr(bh); /* old root could have indexes or leaves * so calculate e_max right way */ if (ext_depth(inode)) neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0)); else neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0)); neh->eh_magic = EXT4_EXT_MAGIC; ext4_extent_block_csum_set(inode, neh); set_buffer_uptodate(bh); set_buffer_verified(bh); unlock_buffer(bh); err = ext4_handle_dirty_metadata(handle, inode, bh); if (err) goto out; /* Update top-level index: num,max,pointer */ neh = ext_inode_hdr(inode); neh->eh_entries = cpu_to_le16(1); ext4_idx_store_pblock(EXT_FIRST_INDEX(neh), newblock); if (neh->eh_depth == 0) { /* Root extent block becomes index block */ neh->eh_max = cpu_to_le16(ext4_ext_space_root_idx(inode, 0)); EXT_FIRST_INDEX(neh)->ei_block = EXT_FIRST_EXTENT(neh)->ee_block; } ext_debug(inode, "new root: num %d(%d), lblock %d, ptr %llu\n", le16_to_cpu(neh->eh_entries), le16_to_cpu(neh->eh_max), le32_to_cpu(EXT_FIRST_INDEX(neh)->ei_block), ext4_idx_pblock(EXT_FIRST_INDEX(neh))); le16_add_cpu(&neh->eh_depth, 1); err = ext4_mark_inode_dirty(handle, inode); out: brelse(bh); return err; } /* * ext4_ext_create_new_leaf: * finds empty index and adds new leaf. * if no free index is found, then it requests in-depth growing. */ static int ext4_ext_create_new_leaf(handle_t *handle, struct inode *inode, unsigned int mb_flags, unsigned int gb_flags, struct ext4_ext_path **ppath, struct ext4_extent *newext) { struct ext4_ext_path *path = *ppath; struct ext4_ext_path *curp; int depth, i, err = 0; repeat: i = depth = ext_depth(inode); /* walk up to the tree and look for free index entry */ curp = path + depth; while (i > 0 && !EXT_HAS_FREE_INDEX(curp)) { i--; curp--; } /* we use already allocated block for index block, * so subsequent data blocks should be contiguous */ if (EXT_HAS_FREE_INDEX(curp)) { /* if we found index with free entry, then use that * entry: create all needed subtree and add new leaf */ err = ext4_ext_split(handle, inode, mb_flags, path, newext, i); if (err) goto out; /* refill path */ path = ext4_find_extent(inode, (ext4_lblk_t)le32_to_cpu(newext->ee_block), ppath, gb_flags); if (IS_ERR(path)) err = PTR_ERR(path); } else { /* tree is full, time to grow in depth */ err = ext4_ext_grow_indepth(handle, inode, mb_flags); if (err) goto out; /* refill path */ path = ext4_find_extent(inode, (ext4_lblk_t)le32_to_cpu(newext->ee_block), ppath, gb_flags); if (IS_ERR(path)) { err = PTR_ERR(path); goto out; } /* * only first (depth 0 -> 1) produces free space; * in all other cases we have to split the grown tree */ depth = ext_depth(inode); if (path[depth].p_hdr->eh_entries == path[depth].p_hdr->eh_max) { /* now we need to split */ goto repeat; } } out: return err; } /* * search the closest allocated block to the left for *logical * and returns it at @logical + it's physical address at @phys * if *logical is the smallest allocated block, the function * returns 0 at @phys * return value contains 0 (success) or error code */ static int ext4_ext_search_left(struct inode *inode, struct ext4_ext_path *path, ext4_lblk_t *logical, ext4_fsblk_t *phys) { struct ext4_extent_idx *ix; struct ext4_extent *ex; int depth, ee_len; if (unlikely(path == NULL)) { EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical); return -EFSCORRUPTED; } depth = path->p_depth; *phys = 0; if (depth == 0 && path->p_ext == NULL) return 0; /* usually extent in the path covers blocks smaller * then *logical, but it can be that extent is the * first one in the file */ ex = path[depth].p_ext; ee_len = ext4_ext_get_actual_len(ex); if (*logical < le32_to_cpu(ex->ee_block)) { if (unlikely(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex)) { EXT4_ERROR_INODE(inode, "EXT_FIRST_EXTENT != ex *logical %d ee_block %d!", *logical, le32_to_cpu(ex->ee_block)); return -EFSCORRUPTED; } while (--depth >= 0) { ix = path[depth].p_idx; if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) { EXT4_ERROR_INODE(inode, "ix (%d) != EXT_FIRST_INDEX (%d) (depth %d)!", ix != NULL ? le32_to_cpu(ix->ei_block) : 0, le32_to_cpu(EXT_FIRST_INDEX(path[depth].p_hdr)->ei_block), depth); return -EFSCORRUPTED; } } return 0; } if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) { EXT4_ERROR_INODE(inode, "logical %d < ee_block %d + ee_len %d!", *logical, le32_to_cpu(ex->ee_block), ee_len); return -EFSCORRUPTED; } *logical = le32_to_cpu(ex->ee_block) + ee_len - 1; *phys = ext4_ext_pblock(ex) + ee_len - 1; return 0; } /* * Search the closest allocated block to the right for *logical * and returns it at @logical + it's physical address at @phys. * If not exists, return 0 and @phys is set to 0. We will return * 1 which means we found an allocated block and ret_ex is valid. * Or return a (< 0) error code. */ static int ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path, ext4_lblk_t *logical, ext4_fsblk_t *phys, struct ext4_extent *ret_ex) { struct buffer_head *bh = NULL; struct ext4_extent_header *eh; struct ext4_extent_idx *ix; struct ext4_extent *ex; int depth; /* Note, NOT eh_depth; depth from top of tree */ int ee_len; if (unlikely(path == NULL)) { EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical); return -EFSCORRUPTED; } depth = path->p_depth; *phys = 0; if (depth == 0 && path->p_ext == NULL) return 0; /* usually extent in the path covers blocks smaller * then *logical, but it can be that extent is the * first one in the file */ ex = path[depth].p_ext; ee_len = ext4_ext_get_actual_len(ex); if (*logical < le32_to_cpu(ex->ee_block)) { if (unlikely(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex)) { EXT4_ERROR_INODE(inode, "first_extent(path[%d].p_hdr) != ex", depth); return -EFSCORRUPTED; } while (--depth >= 0) { ix = path[depth].p_idx; if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) { EXT4_ERROR_INODE(inode, "ix != EXT_FIRST_INDEX *logical %d!", *logical); return -EFSCORRUPTED; } } goto found_extent; } if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) { EXT4_ERROR_INODE(inode, "logical %d < ee_block %d + ee_len %d!", *logical, le32_to_cpu(ex->ee_block), ee_len); return -EFSCORRUPTED; } if (ex != EXT_LAST_EXTENT(path[depth].p_hdr)) { /* next allocated block in this leaf */ ex++; goto found_extent; } /* go up and search for index to the right */ while (--depth >= 0) { ix = path[depth].p_idx; if (ix != EXT_LAST_INDEX(path[depth].p_hdr)) goto got_index; } /* we've gone up to the root and found no index to the right */ return 0; got_index: /* we've found index to the right, let's * follow it and find the closest allocated * block to the right */ ix++; while (++depth < path->p_depth) { /* subtract from p_depth to get proper eh_depth */ bh = read_extent_tree_block(inode, ix, path->p_depth - depth, 0); if (IS_ERR(bh)) return PTR_ERR(bh); eh = ext_block_hdr(bh); ix = EXT_FIRST_INDEX(eh); put_bh(bh); } bh = read_extent_tree_block(inode, ix, path->p_depth - depth, 0); if (IS_ERR(bh)) return PTR_ERR(bh); eh = ext_block_hdr(bh); ex = EXT_FIRST_EXTENT(eh); found_extent: *logical = le32_to_cpu(ex->ee_block); *phys = ext4_ext_pblock(ex); if (ret_ex) *ret_ex = *ex; if (bh) put_bh(bh); return 1; } /* * ext4_ext_next_allocated_block: * returns allocated block in subsequent extent or EXT_MAX_BLOCKS. * NOTE: it considers block number from index entry as * allocated block. Thus, index entries have to be consistent * with leaves. */ ext4_lblk_t ext4_ext_next_allocated_block(struct ext4_ext_path *path) { int depth; BUG_ON(path == NULL); depth = path->p_depth; if (depth == 0 && path->p_ext == NULL) return EXT_MAX_BLOCKS; while (depth >= 0) { struct ext4_ext_path *p = &path[depth]; if (depth == path->p_depth) { /* leaf */ if (p->p_ext && p->p_ext != EXT_LAST_EXTENT(p->p_hdr)) return le32_to_cpu(p->p_ext[1].ee_block); } else { /* index */ if (p->p_idx != EXT_LAST_INDEX(p->p_hdr)) return le32_to_cpu(p->p_idx[1].ei_block); } depth--; } return EXT_MAX_BLOCKS; } /* * ext4_ext_next_leaf_block: * returns first allocated block from next leaf or EXT_MAX_BLOCKS */ static ext4_lblk_t ext4_ext_next_leaf_block(struct ext4_ext_path *path) { int depth; BUG_ON(path == NULL); depth = path->p_depth; /* zero-tree has no leaf blocks at all */ if (depth == 0) return EXT_MAX_BLOCKS; /* go to index block */ depth--; while (depth >= 0) { if (path[depth].p_idx != EXT_LAST_INDEX(path[depth].p_hdr)) return (ext4_lblk_t) le32_to_cpu(path[depth].p_idx[1].ei_block); depth--; } return EXT_MAX_BLOCKS; } /* * ext4_ext_correct_indexes: * if leaf gets modified and modified extent is first in the leaf, * then we have to correct all indexes above. * TODO: do we need to correct tree in all cases? */ static int ext4_ext_correct_indexes(handle_t *handle, struct inode *inode, struct ext4_ext_path *path) { struct ext4_extent_header *eh; int depth = ext_depth(inode); struct ext4_extent *ex; __le32 border; int k, err = 0; eh = path[depth].p_hdr; ex = path[depth].p_ext; if (unlikely(ex == NULL || eh == NULL)) { EXT4_ERROR_INODE(inode, "ex %p == NULL or eh %p == NULL", ex, eh); return -EFSCORRUPTED; } if (depth == 0) { /* there is no tree at all */ return 0; } if (ex != EXT_FIRST_EXTENT(eh)) { /* we correct tree if first leaf got modified only */ return 0; } /* * TODO: we need correction if border is smaller than current one */ k = depth - 1; border = path[depth].p_ext->ee_block; err = ext4_ext_get_access(handle, inode, path + k); if (err) return err; path[k].p_idx->ei_block = border; err = ext4_ext_dirty(handle, inode, path + k); if (err) return err; while (k--) { /* change all left-side indexes */ if (path[k+1].p_idx != EXT_FIRST_INDEX(path[k+1].p_hdr)) break; err = ext4_ext_get_access(handle, inode, path + k); if (err) break; path[k].p_idx->ei_block = border; err = ext4_ext_dirty(handle, inode, path + k); if (err) break; } return err; } static int ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1, struct ext4_extent *ex2) { unsigned short ext1_ee_len, ext2_ee_len; if (ext4_ext_is_unwritten(ex1) != ext4_ext_is_unwritten(ex2)) return 0; ext1_ee_len = ext4_ext_get_actual_len(ex1); ext2_ee_len = ext4_ext_get_actual_len(ex2); if (le32_to_cpu(ex1->ee_block) + ext1_ee_len != le32_to_cpu(ex2->ee_block)) return 0; if (ext1_ee_len + ext2_ee_len > EXT_INIT_MAX_LEN) return 0; if (ext4_ext_is_unwritten(ex1) && ext1_ee_len + ext2_ee_len > EXT_UNWRITTEN_MAX_LEN) return 0; #ifdef AGGRESSIVE_TEST if (ext1_ee_len >= 4) return 0; #endif if (ext4_ext_pblock(ex1) + ext1_ee_len == ext4_ext_pblock(ex2)) return 1; return 0; } /* * This function tries to merge the "ex" extent to the next extent in the tree. * It always tries to merge towards right. If you want to merge towards * left, pass "ex - 1" as argument instead of "ex". * Returns 0 if the extents (ex and ex+1) were _not_ merged and returns * 1 if they got merged. */ static int ext4_ext_try_to_merge_right(struct inode *inode, struct ext4_ext_path *path, struct ext4_extent *ex) { struct ext4_extent_header *eh; unsigned int depth, len; int merge_done = 0, unwritten; depth = ext_depth(inode); BUG_ON(path[depth].p_hdr == NULL); eh = path[depth].p_hdr; while (ex < EXT_LAST_EXTENT(eh)) { if (!ext4_can_extents_be_merged(inode, ex, ex + 1)) break; /* merge with next extent! */ unwritten = ext4_ext_is_unwritten(ex); ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex) + ext4_ext_get_actual_len(ex + 1)); if (unwritten) ext4_ext_mark_unwritten(ex); if (ex + 1 < EXT_LAST_EXTENT(eh)) { len = (EXT_LAST_EXTENT(eh) - ex - 1) * sizeof(struct ext4_extent); memmove(ex + 1, ex + 2, len); } le16_add_cpu(&eh->eh_entries, -1); merge_done = 1; WARN_ON(eh->eh_entries == 0); if (!eh->eh_entries) EXT4_ERROR_INODE(inode, "eh->eh_entries = 0!"); } return merge_done; } /* * This function does a very simple check to see if we can collapse * an extent tree with a single extent tree leaf block into the inode. */ static void ext4_ext_try_to_merge_up(handle_t *handle, struct inode *inode, struct ext4_ext_path *path) { size_t s; unsigned max_root = ext4_ext_space_root(inode, 0); ext4_fsblk_t blk; if ((path[0].p_depth != 1) || (le16_to_cpu(path[0].p_hdr->eh_entries) != 1) || (le16_to_cpu(path[1].p_hdr->eh_entries) > max_root)) return; /* * We need to modify the block allocation bitmap and the block * group descriptor to release the extent tree block. If we * can't get the journal credits, give up. */ if (ext4_journal_extend(handle, 2, ext4_free_metadata_revoke_credits(inode->i_sb, 1))) return; /* * Copy the extent data up to the inode */ blk = ext4_idx_pblock(path[0].p_idx); s = le16_to_cpu(path[1].p_hdr->eh_entries) * sizeof(struct ext4_extent_idx); s += sizeof(struct ext4_extent_header); path[1].p_maxdepth = path[0].p_maxdepth; memcpy(path[0].p_hdr, path[1].p_hdr, s); path[0].p_depth = 0; path[0].p_ext = EXT_FIRST_EXTENT(path[0].p_hdr) + (path[1].p_ext - EXT_FIRST_EXTENT(path[1].p_hdr)); path[0].p_hdr->eh_max = cpu_to_le16(max_root); brelse(path[1].p_bh); ext4_free_blocks(handle, inode, NULL, blk, 1, EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); } /* * This function tries to merge the @ex extent to neighbours in the tree, then * tries to collapse the extent tree into the inode. */ static void ext4_ext_try_to_merge(handle_t *handle, struct inode *inode, struct ext4_ext_path *path, struct ext4_extent *ex) { struct ext4_extent_header *eh; unsigned int depth; int merge_done = 0; depth = ext_depth(inode); BUG_ON(path[depth].p_hdr == NULL); eh = path[depth].p_hdr; if (ex > EXT_FIRST_EXTENT(eh)) merge_done = ext4_ext_try_to_merge_right(inode, path, ex - 1); if (!merge_done) (void) ext4_ext_try_to_merge_right(inode, path, ex); ext4_ext_try_to_merge_up(handle, inode, path); } /* * check if a portion of the "newext" extent overlaps with an * existing extent. * * If there is an overlap discovered, it updates the length of the newext * such that there will be no overlap, and then returns 1. * If there is no overlap found, it returns 0. */ static unsigned int ext4_ext_check_overlap(struct ext4_sb_info *sbi, struct inode *inode, struct ext4_extent *newext, struct ext4_ext_path *path) { ext4_lblk_t b1, b2; unsigned int depth, len1; unsigned int ret = 0; b1 = le32_to_cpu(newext->ee_block); len1 = ext4_ext_get_actual_len(newext); depth = ext_depth(inode); if (!path[depth].p_ext) goto out; b2 = EXT4_LBLK_CMASK(sbi, le32_to_cpu(path[depth].p_ext->ee_block)); /* * get the next allocated block if the extent in the path * is before the requested block(s) */ if (b2 < b1) { b2 = ext4_ext_next_allocated_block(path); if (b2 == EXT_MAX_BLOCKS) goto out; b2 = EXT4_LBLK_CMASK(sbi, b2); } /* check for wrap through zero on extent logical start block*/ if (b1 + len1 < b1) { len1 = EXT_MAX_BLOCKS - b1; newext->ee_len = cpu_to_le16(len1); ret = 1; } /* check for overlap */ if (b1 + len1 > b2) { newext->ee_len = cpu_to_le16(b2 - b1); ret = 1; } out: return ret; } /* * ext4_ext_insert_extent: * tries to merge requested extent into the existing extent or * inserts requested extent as new one into the tree, * creating new leaf in the no-space case. */ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, struct ext4_ext_path **ppath, struct ext4_extent *newext, int gb_flags) { struct ext4_ext_path *path = *ppath; struct ext4_extent_header *eh; struct ext4_extent *ex, *fex; struct ext4_extent *nearex; /* nearest extent */ struct ext4_ext_path *npath = NULL; int depth, len, err; ext4_lblk_t next; int mb_flags = 0, unwritten; if (gb_flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) mb_flags |= EXT4_MB_DELALLOC_RESERVED; if (unlikely(ext4_ext_get_actual_len(newext) == 0)) { EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0"); return -EFSCORRUPTED; } depth = ext_depth(inode); ex = path[depth].p_ext; eh = path[depth].p_hdr; if (unlikely(path[depth].p_hdr == NULL)) { EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth); return -EFSCORRUPTED; } /* try to insert block into found extent and return */ if (ex && !(gb_flags & EXT4_GET_BLOCKS_PRE_IO)) { /* * Try to see whether we should rather test the extent on * right from ex, or from the left of ex. This is because * ext4_find_extent() can return either extent on the * left, or on the right from the searched position. This * will make merging more effective. */ if (ex < EXT_LAST_EXTENT(eh) && (le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex) < le32_to_cpu(newext->ee_block))) { ex += 1; goto prepend; } else if ((ex > EXT_FIRST_EXTENT(eh)) && (le32_to_cpu(newext->ee_block) + ext4_ext_get_actual_len(newext) < le32_to_cpu(ex->ee_block))) ex -= 1; /* Try to append newex to the ex */ if (ext4_can_extents_be_merged(inode, ex, newext)) { ext_debug(inode, "append [%d]%d block to %u:[%d]%d" "(from %llu)\n", ext4_ext_is_unwritten(newext), ext4_ext_get_actual_len(newext), le32_to_cpu(ex->ee_block), ext4_ext_is_unwritten(ex), ext4_ext_get_actual_len(ex), ext4_ext_pblock(ex)); err = ext4_ext_get_access(handle, inode, path + depth); if (err) return err; unwritten = ext4_ext_is_unwritten(ex); ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex) + ext4_ext_get_actual_len(newext)); if (unwritten) ext4_ext_mark_unwritten(ex); nearex = ex; goto merge; } prepend: /* Try to prepend newex to the ex */ if (ext4_can_extents_be_merged(inode, newext, ex)) { ext_debug(inode, "prepend %u[%d]%d block to %u:[%d]%d" "(from %llu)\n", le32_to_cpu(newext->ee_block), ext4_ext_is_unwritten(newext), ext4_ext_get_actual_len(newext), le32_to_cpu(ex->ee_block), ext4_ext_is_unwritten(ex), ext4_ext_get_actual_len(ex), ext4_ext_pblock(ex)); err = ext4_ext_get_access(handle, inode, path + depth); if (err) return err; unwritten = ext4_ext_is_unwritten(ex); ex->ee_block = newext->ee_block; ext4_ext_store_pblock(ex, ext4_ext_pblock(newext)); ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex) + ext4_ext_get_actual_len(newext)); if (unwritten) ext4_ext_mark_unwritten(ex); nearex = ex; goto merge; } } depth = ext_depth(inode); eh = path[depth].p_hdr; if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max)) goto has_space; /* probably next leaf has space for us? */ fex = EXT_LAST_EXTENT(eh); next = EXT_MAX_BLOCKS; if (le32_to_cpu(newext->ee_block) > le32_to_cpu(fex->ee_block)) next = ext4_ext_next_leaf_block(path); if (next != EXT_MAX_BLOCKS) { ext_debug(inode, "next leaf block - %u\n", next); BUG_ON(npath != NULL); npath = ext4_find_extent(inode, next, NULL, gb_flags); if (IS_ERR(npath)) return PTR_ERR(npath); BUG_ON(npath->p_depth != path->p_depth); eh = npath[depth].p_hdr; if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max)) { ext_debug(inode, "next leaf isn't full(%d)\n", le16_to_cpu(eh->eh_entries)); path = npath; goto has_space; } ext_debug(inode, "next leaf has no free space(%d,%d)\n", le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max)); } /* * There is no free space in the found leaf. * We're gonna add a new leaf in the tree. */ if (gb_flags & EXT4_GET_BLOCKS_METADATA_NOFAIL) mb_flags |= EXT4_MB_USE_RESERVED; err = ext4_ext_create_new_leaf(handle, inode, mb_flags, gb_flags, ppath, newext); if (err) goto cleanup; depth = ext_depth(inode); eh = path[depth].p_hdr; has_space: nearex = path[depth].p_ext; err = ext4_ext_get_access(handle, inode, path + depth); if (err) goto cleanup; if (!nearex) { /* there is no extent in this leaf, create first one */ ext_debug(inode, "first extent in the leaf: %u:%llu:[%d]%d\n", le32_to_cpu(newext->ee_block), ext4_ext_pblock(newext), ext4_ext_is_unwritten(newext), ext4_ext_get_actual_len(newext)); nearex = EXT_FIRST_EXTENT(eh); } else { if (le32_to_cpu(newext->ee_block) > le32_to_cpu(nearex->ee_block)) { /* Insert after */ ext_debug(inode, "insert %u:%llu:[%d]%d before: " "nearest %p\n", le32_to_cpu(newext->ee_block), ext4_ext_pblock(newext), ext4_ext_is_unwritten(newext), ext4_ext_get_actual_len(newext), nearex); nearex++; } else { /* Insert before */ BUG_ON(newext->ee_block == nearex->ee_block); ext_debug(inode, "insert %u:%llu:[%d]%d after: " "nearest %p\n", le32_to_cpu(newext->ee_block), ext4_ext_pblock(newext), ext4_ext_is_unwritten(newext), ext4_ext_get_actual_len(newext), nearex); } len = EXT_LAST_EXTENT(eh) - nearex + 1; if (len > 0) { ext_debug(inode, "insert %u:%llu:[%d]%d: " "move %d extents from 0x%p to 0x%p\n", le32_to_cpu(newext->ee_block), ext4_ext_pblock(newext), ext4_ext_is_unwritten(newext), ext4_ext_get_actual_len(newext), len, nearex, nearex + 1); memmove(nearex + 1, nearex, len * sizeof(struct ext4_extent)); } } le16_add_cpu(&eh->eh_entries, 1); path[depth].p_ext = nearex; nearex->ee_block = newext->ee_block; ext4_ext_store_pblock(nearex, ext4_ext_pblock(newext)); nearex->ee_len = newext->ee_len; merge: /* try to merge extents */ if (!(gb_flags & EXT4_GET_BLOCKS_PRE_IO)) ext4_ext_try_to_merge(handle, inode, path, nearex); /* time to correct all indexes above */ err = ext4_ext_correct_indexes(handle, inode, path); if (err) goto cleanup; err = ext4_ext_dirty(handle, inode, path + path->p_depth); cleanup: ext4_free_ext_path(npath); return err; } static int ext4_fill_es_cache_info(struct inode *inode, ext4_lblk_t block, ext4_lblk_t num, struct fiemap_extent_info *fieinfo) { ext4_lblk_t next, end = block + num - 1; struct extent_status es; unsigned char blksize_bits = inode->i_sb->s_blocksize_bits; unsigned int flags; int err; while (block <= end) { next = 0; flags = 0; if (!ext4_es_lookup_extent(inode, block, &next, &es)) break; if (ext4_es_is_unwritten(&es)) flags |= FIEMAP_EXTENT_UNWRITTEN; if (ext4_es_is_delayed(&es)) flags |= (FIEMAP_EXTENT_DELALLOC | FIEMAP_EXTENT_UNKNOWN); if (ext4_es_is_hole(&es)) flags |= EXT4_FIEMAP_EXTENT_HOLE; if (next == 0) flags |= FIEMAP_EXTENT_LAST; if (flags & (FIEMAP_EXTENT_DELALLOC| EXT4_FIEMAP_EXTENT_HOLE)) es.es_pblk = 0; else es.es_pblk = ext4_es_pblock(&es); err = fiemap_fill_next_extent(fieinfo, (__u64)es.es_lblk << blksize_bits, (__u64)es.es_pblk << blksize_bits, (__u64)es.es_len << blksize_bits, flags); if (next == 0) break; block = next; if (err < 0) return err; if (err == 1) return 0; } return 0; } /* * ext4_ext_determine_hole - determine hole around given block * @inode: inode we lookup in * @path: path in extent tree to @lblk * @lblk: pointer to logical block around which we want to determine hole * * Determine hole length (and start if easily possible) around given logical * block. We don't try too hard to find the beginning of the hole but @path * actually points to extent before @lblk, we provide it. * * The function returns the length of a hole starting at @lblk. We update @lblk * to the beginning of the hole if we managed to find it. */ static ext4_lblk_t ext4_ext_determine_hole(struct inode *inode, struct ext4_ext_path *path, ext4_lblk_t *lblk) { int depth = ext_depth(inode); struct ext4_extent *ex; ext4_lblk_t len; ex = path[depth].p_ext; if (ex == NULL) { /* there is no extent yet, so gap is [0;-] */ *lblk = 0; len = EXT_MAX_BLOCKS; } else if (*lblk < le32_to_cpu(ex->ee_block)) { len = le32_to_cpu(ex->ee_block) - *lblk; } else if (*lblk >= le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex)) { ext4_lblk_t next; *lblk = le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex); next = ext4_ext_next_allocated_block(path); BUG_ON(next == *lblk); len = next - *lblk; } else { BUG(); } return len; } /* * ext4_ext_put_gap_in_cache: * calculate boundaries of the gap that the requested block fits into * and cache this gap */ static void ext4_ext_put_gap_in_cache(struct inode *inode, ext4_lblk_t hole_start, ext4_lblk_t hole_len) { struct extent_status es; ext4_es_find_extent_range(inode, &ext4_es_is_delayed, hole_start, hole_start + hole_len - 1, &es); if (es.es_len) { /* There's delayed extent containing lblock? */ if (es.es_lblk <= hole_start) return; hole_len = min(es.es_lblk - hole_start, hole_len); } ext_debug(inode, " -> %u:%u\n", hole_start, hole_len); ext4_es_insert_extent(inode, hole_start, hole_len, ~0, EXTENT_STATUS_HOLE); } /* * ext4_ext_rm_idx: * removes index from the index block. */ static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode, struct ext4_ext_path *path, int depth) { int err; ext4_fsblk_t leaf; /* free index block */ depth--; path = path + depth; leaf = ext4_idx_pblock(path->p_idx); if (unlikely(path->p_hdr->eh_entries == 0)) { EXT4_ERROR_INODE(inode, "path->p_hdr->eh_entries == 0"); return -EFSCORRUPTED; } err = ext4_ext_get_access(handle, inode, path); if (err) return err; if (path->p_idx != EXT_LAST_INDEX(path->p_hdr)) { int len = EXT_LAST_INDEX(path->p_hdr) - path->p_idx; len *= sizeof(struct ext4_extent_idx); memmove(path->p_idx, path->p_idx + 1, len); } le16_add_cpu(&path->p_hdr->eh_entries, -1); err = ext4_ext_dirty(handle, inode, path); if (err) return err; ext_debug(inode, "index is empty, remove it, free block %llu\n", leaf); trace_ext4_ext_rm_idx(inode, leaf); ext4_free_blocks(handle, inode, NULL, leaf, 1, EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); while (--depth >= 0) { if (path->p_idx != EXT_FIRST_INDEX(path->p_hdr)) break; path--; err = ext4_ext_get_access(handle, inode, path); if (err) break; path->p_idx->ei_block = (path+1)->p_idx->ei_block; err = ext4_ext_dirty(handle, inode, path); if (err) break; } return err; } /* * ext4_ext_calc_credits_for_single_extent: * This routine returns max. credits that needed to insert an extent * to the extent tree. * When pass the actual path, the caller should calculate credits * under i_data_sem. */ int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks, struct ext4_ext_path *path) { if (path) { int depth = ext_depth(inode); int ret = 0; /* probably there is space in leaf? */ if (le16_to_cpu(path[depth].p_hdr->eh_entries) < le16_to_cpu(path[depth].p_hdr->eh_max)) { /* * There are some space in the leaf tree, no * need to account for leaf block credit * * bitmaps and block group descriptor blocks * and other metadata blocks still need to be * accounted. */ /* 1 bitmap, 1 block group descriptor */ ret = 2 + EXT4_META_TRANS_BLOCKS(inode->i_sb); return ret; } } return ext4_chunk_trans_blocks(inode, nrblocks); } /* * How many index/leaf blocks need to change/allocate to add @extents extents? * * If we add a single extent, then in the worse case, each tree level * index/leaf need to be changed in case of the tree split. * * If more extents are inserted, they could cause the whole tree split more * than once, but this is really rare. */ int ext4_ext_index_trans_blocks(struct inode *inode, int extents) { int index; int depth; /* If we are converting the inline data, only one is needed here. */ if (ext4_has_inline_data(inode)) return 1; depth = ext_depth(inode); if (extents <= 1) index = depth * 2; else index = depth * 3; return index; } static inline int get_default_free_blocks_flags(struct inode *inode) { if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode) || ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE)) return EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET; else if (ext4_should_journal_data(inode)) return EXT4_FREE_BLOCKS_FORGET; return 0; } /* * ext4_rereserve_cluster - increment the reserved cluster count when * freeing a cluster with a pending reservation * * @inode - file containing the cluster * @lblk - logical block in cluster to be reserved * * Increments the reserved cluster count and adjusts quota in a bigalloc * file system when freeing a partial cluster containing at least one * delayed and unwritten block. A partial cluster meeting that * requirement will have a pending reservation. If so, the * RERESERVE_CLUSTER flag is used when calling ext4_free_blocks() to * defer reserved and allocated space accounting to a subsequent call * to this function. */ static void ext4_rereserve_cluster(struct inode *inode, ext4_lblk_t lblk) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_inode_info *ei = EXT4_I(inode); dquot_reclaim_block(inode, EXT4_C2B(sbi, 1)); spin_lock(&ei->i_block_reservation_lock); ei->i_reserved_data_blocks++; percpu_counter_add(&sbi->s_dirtyclusters_counter, 1); spin_unlock(&ei->i_block_reservation_lock); percpu_counter_add(&sbi->s_freeclusters_counter, 1); ext4_remove_pending(inode, lblk); } static int ext4_remove_blocks(handle_t *handle, struct inode *inode, struct ext4_extent *ex, struct partial_cluster *partial, ext4_lblk_t from, ext4_lblk_t to) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); unsigned short ee_len = ext4_ext_get_actual_len(ex); ext4_fsblk_t last_pblk, pblk; ext4_lblk_t num; int flags; /* only extent tail removal is allowed */ if (from < le32_to_cpu(ex->ee_block) || to != le32_to_cpu(ex->ee_block) + ee_len - 1) { ext4_error(sbi->s_sb, "strange request: removal(2) %u-%u from %u:%u", from, to, le32_to_cpu(ex->ee_block), ee_len); return 0; } #ifdef EXTENTS_STATS spin_lock(&sbi->s_ext_stats_lock); sbi->s_ext_blocks += ee_len; sbi->s_ext_extents++; if (ee_len < sbi->s_ext_min) sbi->s_ext_min = ee_len; if (ee_len > sbi->s_ext_max) sbi->s_ext_max = ee_len; if (ext_depth(inode) > sbi->s_depth_max) sbi->s_depth_max = ext_depth(inode); spin_unlock(&sbi->s_ext_stats_lock); #endif trace_ext4_remove_blocks(inode, ex, from, to, partial); /* * if we have a partial cluster, and it's different from the * cluster of the last block in the extent, we free it */ last_pblk = ext4_ext_pblock(ex) + ee_len - 1; if (partial->state != initial && partial->pclu != EXT4_B2C(sbi, last_pblk)) { if (partial->state == tofree) { flags = get_default_free_blocks_flags(inode); if (ext4_is_pending(inode, partial->lblk)) flags |= EXT4_FREE_BLOCKS_RERESERVE_CLUSTER; ext4_free_blocks(handle, inode, NULL, EXT4_C2B(sbi, partial->pclu), sbi->s_cluster_ratio, flags); if (flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER) ext4_rereserve_cluster(inode, partial->lblk); } partial->state = initial; } num = le32_to_cpu(ex->ee_block) + ee_len - from; pblk = ext4_ext_pblock(ex) + ee_len - num; /* * We free the partial cluster at the end of the extent (if any), * unless the cluster is used by another extent (partial_cluster * state is nofree). If a partial cluster exists here, it must be * shared with the last block in the extent. */ flags = get_default_free_blocks_flags(inode); /* partial, left end cluster aligned, right end unaligned */ if ((EXT4_LBLK_COFF(sbi, to) != sbi->s_cluster_ratio - 1) && (EXT4_LBLK_CMASK(sbi, to) >= from) && (partial->state != nofree)) { if (ext4_is_pending(inode, to)) flags |= EXT4_FREE_BLOCKS_RERESERVE_CLUSTER; ext4_free_blocks(handle, inode, NULL, EXT4_PBLK_CMASK(sbi, last_pblk), sbi->s_cluster_ratio, flags); if (flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER) ext4_rereserve_cluster(inode, to); partial->state = initial; flags = get_default_free_blocks_flags(inode); } flags |= EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER; /* * For bigalloc file systems, we never free a partial cluster * at the beginning of the extent. Instead, we check to see if we * need to free it on a subsequent call to ext4_remove_blocks, * or at the end of ext4_ext_rm_leaf or ext4_ext_remove_space. */ flags |= EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER; ext4_free_blocks(handle, inode, NULL, pblk, num, flags); /* reset the partial cluster if we've freed past it */ if (partial->state != initial && partial->pclu != EXT4_B2C(sbi, pblk)) partial->state = initial; /* * If we've freed the entire extent but the beginning is not left * cluster aligned and is not marked as ineligible for freeing we * record the partial cluster at the beginning of the extent. It * wasn't freed by the preceding ext4_free_blocks() call, and we * need to look farther to the left to determine if it's to be freed * (not shared with another extent). Else, reset the partial * cluster - we're either done freeing or the beginning of the * extent is left cluster aligned. */ if (EXT4_LBLK_COFF(sbi, from) && num == ee_len) { if (partial->state == initial) { partial->pclu = EXT4_B2C(sbi, pblk); partial->lblk = from; partial->state = tofree; } } else { partial->state = initial; } return 0; } /* * ext4_ext_rm_leaf() Removes the extents associated with the * blocks appearing between "start" and "end". Both "start" * and "end" must appear in the same extent or EIO is returned. * * @handle: The journal handle * @inode: The files inode * @path: The path to the leaf * @partial_cluster: The cluster which we'll have to free if all extents * has been released from it. However, if this value is * negative, it's a cluster just to the right of the * punched region and it must not be freed. * @start: The first block to remove * @end: The last block to remove */ static int ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, struct ext4_ext_path *path, struct partial_cluster *partial, ext4_lblk_t start, ext4_lblk_t end) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); int err = 0, correct_index = 0; int depth = ext_depth(inode), credits, revoke_credits; struct ext4_extent_header *eh; ext4_lblk_t a, b; unsigned num; ext4_lblk_t ex_ee_block; unsigned short ex_ee_len; unsigned unwritten = 0; struct ext4_extent *ex; ext4_fsblk_t pblk; /* the header must be checked already in ext4_ext_remove_space() */ ext_debug(inode, "truncate since %u in leaf to %u\n", start, end); if (!path[depth].p_hdr) path[depth].p_hdr = ext_block_hdr(path[depth].p_bh); eh = path[depth].p_hdr; if (unlikely(path[depth].p_hdr == NULL)) { EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth); return -EFSCORRUPTED; } /* find where to start removing */ ex = path[depth].p_ext; if (!ex) ex = EXT_LAST_EXTENT(eh); ex_ee_block = le32_to_cpu(ex->ee_block); ex_ee_len = ext4_ext_get_actual_len(ex); trace_ext4_ext_rm_leaf(inode, start, ex, partial); while (ex >= EXT_FIRST_EXTENT(eh) && ex_ee_block + ex_ee_len > start) { if (ext4_ext_is_unwritten(ex)) unwritten = 1; else unwritten = 0; ext_debug(inode, "remove ext %u:[%d]%d\n", ex_ee_block, unwritten, ex_ee_len); path[depth].p_ext = ex; a = max(ex_ee_block, start); b = min(ex_ee_block + ex_ee_len - 1, end); ext_debug(inode, " border %u:%u\n", a, b); /* If this extent is beyond the end of the hole, skip it */ if (end < ex_ee_block) { /* * We're going to skip this extent and move to another, * so note that its first cluster is in use to avoid * freeing it when removing blocks. Eventually, the * right edge of the truncated/punched region will * be just to the left. */ if (sbi->s_cluster_ratio > 1) { pblk = ext4_ext_pblock(ex); partial->pclu = EXT4_B2C(sbi, pblk); partial->state = nofree; } ex--; ex_ee_block = le32_to_cpu(ex->ee_block); ex_ee_len = ext4_ext_get_actual_len(ex); continue; } else if (b != ex_ee_block + ex_ee_len - 1) { EXT4_ERROR_INODE(inode, "can not handle truncate %u:%u " "on extent %u:%u", start, end, ex_ee_block, ex_ee_block + ex_ee_len - 1); err = -EFSCORRUPTED; goto out; } else if (a != ex_ee_block) { /* remove tail of the extent */ num = a - ex_ee_block; } else { /* remove whole extent: excellent! */ num = 0; } /* * 3 for leaf, sb, and inode plus 2 (bmap and group * descriptor) for each block group; assume two block * groups plus ex_ee_len/blocks_per_block_group for * the worst case */ credits = 7 + 2*(ex_ee_len/EXT4_BLOCKS_PER_GROUP(inode->i_sb)); if (ex == EXT_FIRST_EXTENT(eh)) { correct_index = 1; credits += (ext_depth(inode)) + 1; } credits += EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb); /* * We may end up freeing some index blocks and data from the * punched range. Note that partial clusters are accounted for * by ext4_free_data_revoke_credits(). */ revoke_credits = ext4_free_metadata_revoke_credits(inode->i_sb, ext_depth(inode)) + ext4_free_data_revoke_credits(inode, b - a + 1); err = ext4_datasem_ensure_credits(handle, inode, credits, credits, revoke_credits); if (err) { if (err > 0) err = -EAGAIN; goto out; } err = ext4_ext_get_access(handle, inode, path + depth); if (err) goto out; err = ext4_remove_blocks(handle, inode, ex, partial, a, b); if (err) goto out; if (num == 0) /* this extent is removed; mark slot entirely unused */ ext4_ext_store_pblock(ex, 0); ex->ee_len = cpu_to_le16(num); /* * Do not mark unwritten if all the blocks in the * extent have been removed. */ if (unwritten && num) ext4_ext_mark_unwritten(ex); /* * If the extent was completely released, * we need to remove it from the leaf */ if (num == 0) { if (end != EXT_MAX_BLOCKS - 1) { /* * For hole punching, we need to scoot all the * extents up when an extent is removed so that * we dont have blank extents in the middle */ memmove(ex, ex+1, (EXT_LAST_EXTENT(eh) - ex) * sizeof(struct ext4_extent)); /* Now get rid of the one at the end */ memset(EXT_LAST_EXTENT(eh), 0, sizeof(struct ext4_extent)); } le16_add_cpu(&eh->eh_entries, -1); } err = ext4_ext_dirty(handle, inode, path + depth); if (err) goto out; ext_debug(inode, "new extent: %u:%u:%llu\n", ex_ee_block, num, ext4_ext_pblock(ex)); ex--; ex_ee_block = le32_to_cpu(ex->ee_block); ex_ee_len = ext4_ext_get_actual_len(ex); } if (correct_index && eh->eh_entries) err = ext4_ext_correct_indexes(handle, inode, path); /* * If there's a partial cluster and at least one extent remains in * the leaf, free the partial cluster if it isn't shared with the * current extent. If it is shared with the current extent * we reset the partial cluster because we've reached the start of the * truncated/punched region and we're done removing blocks. */ if (partial->state == tofree && ex >= EXT_FIRST_EXTENT(eh)) { pblk = ext4_ext_pblock(ex) + ex_ee_len - 1; if (partial->pclu != EXT4_B2C(sbi, pblk)) { int flags = get_default_free_blocks_flags(inode); if (ext4_is_pending(inode, partial->lblk)) flags |= EXT4_FREE_BLOCKS_RERESERVE_CLUSTER; ext4_free_blocks(handle, inode, NULL, EXT4_C2B(sbi, partial->pclu), sbi->s_cluster_ratio, flags); if (flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER) ext4_rereserve_cluster(inode, partial->lblk); } partial->state = initial; } /* if this leaf is free, then we should * remove it from index block above */ if (err == 0 && eh->eh_entries == 0 && path[depth].p_bh != NULL) err = ext4_ext_rm_idx(handle, inode, path, depth); out: return err; } /* * ext4_ext_more_to_rm: * returns 1 if current index has to be freed (even partial) */ static int ext4_ext_more_to_rm(struct ext4_ext_path *path) { BUG_ON(path->p_idx == NULL); if (path->p_idx < EXT_FIRST_INDEX(path->p_hdr)) return 0; /* * if truncate on deeper level happened, it wasn't partial, * so we have to consider current index for truncation */ if (le16_to_cpu(path->p_hdr->eh_entries) == path->p_block) return 0; return 1; } int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start, ext4_lblk_t end) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); int depth = ext_depth(inode); struct ext4_ext_path *path = NULL; struct partial_cluster partial; handle_t *handle; int i = 0, err = 0; partial.pclu = 0; partial.lblk = 0; partial.state = initial; ext_debug(inode, "truncate since %u to %u\n", start, end); /* probably first extent we're gonna free will be last in block */ handle = ext4_journal_start_with_revoke(inode, EXT4_HT_TRUNCATE, depth + 1, ext4_free_metadata_revoke_credits(inode->i_sb, depth)); if (IS_ERR(handle)) return PTR_ERR(handle); again: trace_ext4_ext_remove_space(inode, start, end, depth); /* * Check if we are removing extents inside the extent tree. If that * is the case, we are going to punch a hole inside the extent tree * so we have to check whether we need to split the extent covering * the last block to remove so we can easily remove the part of it * in ext4_ext_rm_leaf(). */ if (end < EXT_MAX_BLOCKS - 1) { struct ext4_extent *ex; ext4_lblk_t ee_block, ex_end, lblk; ext4_fsblk_t pblk; /* find extent for or closest extent to this block */ path = ext4_find_extent(inode, end, NULL, EXT4_EX_NOCACHE | EXT4_EX_NOFAIL); if (IS_ERR(path)) { ext4_journal_stop(handle); return PTR_ERR(path); } depth = ext_depth(inode); /* Leaf not may not exist only if inode has no blocks at all */ ex = path[depth].p_ext; if (!ex) { if (depth) { EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth); err = -EFSCORRUPTED; } goto out; } ee_block = le32_to_cpu(ex->ee_block); ex_end = ee_block + ext4_ext_get_actual_len(ex) - 1; /* * See if the last block is inside the extent, if so split * the extent at 'end' block so we can easily remove the * tail of the first part of the split extent in * ext4_ext_rm_leaf(). */ if (end >= ee_block && end < ex_end) { /* * If we're going to split the extent, note that * the cluster containing the block after 'end' is * in use to avoid freeing it when removing blocks. */ if (sbi->s_cluster_ratio > 1) { pblk = ext4_ext_pblock(ex) + end - ee_block + 1; partial.pclu = EXT4_B2C(sbi, pblk); partial.state = nofree; } /* * Split the extent in two so that 'end' is the last * block in the first new extent. Also we should not * fail removing space due to ENOSPC so try to use * reserved block if that happens. */ err = ext4_force_split_extent_at(handle, inode, &path, end + 1, 1); if (err < 0) goto out; } else if (sbi->s_cluster_ratio > 1 && end >= ex_end && partial.state == initial) { /* * If we're punching, there's an extent to the right. * If the partial cluster hasn't been set, set it to * that extent's first cluster and its state to nofree * so it won't be freed should it contain blocks to be * removed. If it's already set (tofree/nofree), we're * retrying and keep the original partial cluster info * so a cluster marked tofree as a result of earlier * extent removal is not lost. */ lblk = ex_end + 1; err = ext4_ext_search_right(inode, path, &lblk, &pblk, NULL); if (err < 0) goto out; if (pblk) { partial.pclu = EXT4_B2C(sbi, pblk); partial.state = nofree; } } } /* * We start scanning from right side, freeing all the blocks * after i_size and walking into the tree depth-wise. */ depth = ext_depth(inode); if (path) { int k = i = depth; while (--k > 0) path[k].p_block = le16_to_cpu(path[k].p_hdr->eh_entries)+1; } else { path = kcalloc(depth + 1, sizeof(struct ext4_ext_path), GFP_NOFS | __GFP_NOFAIL); if (path == NULL) { ext4_journal_stop(handle); return -ENOMEM; } path[0].p_maxdepth = path[0].p_depth = depth; path[0].p_hdr = ext_inode_hdr(inode); i = 0; if (ext4_ext_check(inode, path[0].p_hdr, depth, 0)) { err = -EFSCORRUPTED; goto out; } } err = 0; while (i >= 0 && err == 0) { if (i == depth) { /* this is leaf block */ err = ext4_ext_rm_leaf(handle, inode, path, &partial, start, end); /* root level has p_bh == NULL, brelse() eats this */ brelse(path[i].p_bh); path[i].p_bh = NULL; i--; continue; } /* this is index block */ if (!path[i].p_hdr) { ext_debug(inode, "initialize header\n"); path[i].p_hdr = ext_block_hdr(path[i].p_bh); } if (!path[i].p_idx) { /* this level hasn't been touched yet */ path[i].p_idx = EXT_LAST_INDEX(path[i].p_hdr); path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries)+1; ext_debug(inode, "init index ptr: hdr 0x%p, num %d\n", path[i].p_hdr, le16_to_cpu(path[i].p_hdr->eh_entries)); } else { /* we were already here, see at next index */ path[i].p_idx--; } ext_debug(inode, "level %d - index, first 0x%p, cur 0x%p\n", i, EXT_FIRST_INDEX(path[i].p_hdr), path[i].p_idx); if (ext4_ext_more_to_rm(path + i)) { struct buffer_head *bh; /* go to the next level */ ext_debug(inode, "move to level %d (block %llu)\n", i + 1, ext4_idx_pblock(path[i].p_idx)); memset(path + i + 1, 0, sizeof(*path)); bh = read_extent_tree_block(inode, path[i].p_idx, depth - i - 1, EXT4_EX_NOCACHE); if (IS_ERR(bh)) { /* should we reset i_size? */ err = PTR_ERR(bh); break; } /* Yield here to deal with large extent trees. * Should be a no-op if we did IO above. */ cond_resched(); if (WARN_ON(i + 1 > depth)) { err = -EFSCORRUPTED; break; } path[i + 1].p_bh = bh; /* save actual number of indexes since this * number is changed at the next iteration */ path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries); i++; } else { /* we finished processing this index, go up */ if (path[i].p_hdr->eh_entries == 0 && i > 0) { /* index is empty, remove it; * handle must be already prepared by the * truncatei_leaf() */ err = ext4_ext_rm_idx(handle, inode, path, i); } /* root level has p_bh == NULL, brelse() eats this */ brelse(path[i].p_bh); path[i].p_bh = NULL; i--; ext_debug(inode, "return to level %d\n", i); } } trace_ext4_ext_remove_space_done(inode, start, end, depth, &partial, path->p_hdr->eh_entries); /* * if there's a partial cluster and we have removed the first extent * in the file, then we also free the partial cluster, if any */ if (partial.state == tofree && err == 0) { int flags = get_default_free_blocks_flags(inode); if (ext4_is_pending(inode, partial.lblk)) flags |= EXT4_FREE_BLOCKS_RERESERVE_CLUSTER; ext4_free_blocks(handle, inode, NULL, EXT4_C2B(sbi, partial.pclu), sbi->s_cluster_ratio, flags); if (flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER) ext4_rereserve_cluster(inode, partial.lblk); partial.state = initial; } /* TODO: flexible tree reduction should be here */ if (path->p_hdr->eh_entries == 0) { /* * truncate to zero freed all the tree, * so we need to correct eh_depth */ err = ext4_ext_get_access(handle, inode, path); if (err == 0) { ext_inode_hdr(inode)->eh_depth = 0; ext_inode_hdr(inode)->eh_max = cpu_to_le16(ext4_ext_space_root(inode, 0)); err = ext4_ext_dirty(handle, inode, path); } } out: ext4_free_ext_path(path); path = NULL; if (err == -EAGAIN) goto again; ext4_journal_stop(handle); return err; } /* * called at mount time */ void ext4_ext_init(struct super_block *sb) { /* * possible initialization would be here */ if (ext4_has_feature_extents(sb)) { #if defined(AGGRESSIVE_TEST) || defined(CHECK_BINSEARCH) || defined(EXTENTS_STATS) printk(KERN_INFO "EXT4-fs: file extents enabled" #ifdef AGGRESSIVE_TEST ", aggressive tests" #endif #ifdef CHECK_BINSEARCH ", check binsearch" #endif #ifdef EXTENTS_STATS ", stats" #endif "\n"); #endif #ifdef EXTENTS_STATS spin_lock_init(&EXT4_SB(sb)->s_ext_stats_lock); EXT4_SB(sb)->s_ext_min = 1 << 30; EXT4_SB(sb)->s_ext_max = 0; #endif } } /* * called at umount time */ void ext4_ext_release(struct super_block *sb) { if (!ext4_has_feature_extents(sb)) return; #ifdef EXTENTS_STATS if (EXT4_SB(sb)->s_ext_blocks && EXT4_SB(sb)->s_ext_extents) { struct ext4_sb_info *sbi = EXT4_SB(sb); printk(KERN_ERR "EXT4-fs: %lu blocks in %lu extents (%lu ave)\n", sbi->s_ext_blocks, sbi->s_ext_extents, sbi->s_ext_blocks / sbi->s_ext_extents); printk(KERN_ERR "EXT4-fs: extents: %lu min, %lu max, max depth %lu\n", sbi->s_ext_min, sbi->s_ext_max, sbi->s_depth_max); } #endif } static void ext4_zeroout_es(struct inode *inode, struct ext4_extent *ex) { ext4_lblk_t ee_block; ext4_fsblk_t ee_pblock; unsigned int ee_len; ee_block = le32_to_cpu(ex->ee_block); ee_len = ext4_ext_get_actual_len(ex); ee_pblock = ext4_ext_pblock(ex); if (ee_len == 0) return; ext4_es_insert_extent(inode, ee_block, ee_len, ee_pblock, EXTENT_STATUS_WRITTEN); } /* FIXME!! we need to try to merge to left or right after zero-out */ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex) { ext4_fsblk_t ee_pblock; unsigned int ee_len; ee_len = ext4_ext_get_actual_len(ex); ee_pblock = ext4_ext_pblock(ex); return ext4_issue_zeroout(inode, le32_to_cpu(ex->ee_block), ee_pblock, ee_len); } /* * ext4_split_extent_at() splits an extent at given block. * * @handle: the journal handle * @inode: the file inode * @path: the path to the extent * @split: the logical block where the extent is splitted. * @split_flags: indicates if the extent could be zeroout if split fails, and * the states(init or unwritten) of new extents. * @flags: flags used to insert new extent to extent tree. * * * Splits extent [a, b] into two extents [a, @split) and [@split, b], states * of which are determined by split_flag. * * There are two cases: * a> the extent are splitted into two extent. * b> split is not needed, and just mark the extent. * * return 0 on success. */ static int ext4_split_extent_at(handle_t *handle, struct inode *inode, struct ext4_ext_path **ppath, ext4_lblk_t split, int split_flag, int flags) { struct ext4_ext_path *path = *ppath; ext4_fsblk_t newblock; ext4_lblk_t ee_block; struct ext4_extent *ex, newex, orig_ex, zero_ex; struct ext4_extent *ex2 = NULL; unsigned int ee_len, depth; int err = 0; BUG_ON((split_flag & (EXT4_EXT_DATA_VALID1 | EXT4_EXT_DATA_VALID2)) == (EXT4_EXT_DATA_VALID1 | EXT4_EXT_DATA_VALID2)); ext_debug(inode, "logical block %llu\n", (unsigned long long)split); ext4_ext_show_leaf(inode, path); depth = ext_depth(inode); ex = path[depth].p_ext; ee_block = le32_to_cpu(ex->ee_block); ee_len = ext4_ext_get_actual_len(ex); newblock = split - ee_block + ext4_ext_pblock(ex); BUG_ON(split < ee_block || split >= (ee_block + ee_len)); BUG_ON(!ext4_ext_is_unwritten(ex) && split_flag & (EXT4_EXT_MAY_ZEROOUT | EXT4_EXT_MARK_UNWRIT1 | EXT4_EXT_MARK_UNWRIT2)); err = ext4_ext_get_access(handle, inode, path + depth); if (err) goto out; if (split == ee_block) { /* * case b: block @split is the block that the extent begins with * then we just change the state of the extent, and splitting * is not needed. */ if (split_flag & EXT4_EXT_MARK_UNWRIT2) ext4_ext_mark_unwritten(ex); else ext4_ext_mark_initialized(ex); if (!(flags & EXT4_GET_BLOCKS_PRE_IO)) ext4_ext_try_to_merge(handle, inode, path, ex); err = ext4_ext_dirty(handle, inode, path + path->p_depth); goto out; } /* case a */ memcpy(&orig_ex, ex, sizeof(orig_ex)); ex->ee_len = cpu_to_le16(split - ee_block); if (split_flag & EXT4_EXT_MARK_UNWRIT1) ext4_ext_mark_unwritten(ex); /* * path may lead to new leaf, not to original leaf any more * after ext4_ext_insert_extent() returns, */ err = ext4_ext_dirty(handle, inode, path + depth); if (err) goto fix_extent_len; ex2 = &newex; ex2->ee_block = cpu_to_le32(split); ex2->ee_len = cpu_to_le16(ee_len - (split - ee_block)); ext4_ext_store_pblock(ex2, newblock); if (split_flag & EXT4_EXT_MARK_UNWRIT2) ext4_ext_mark_unwritten(ex2); err = ext4_ext_insert_extent(handle, inode, ppath, &newex, flags); if (err != -ENOSPC && err != -EDQUOT && err != -ENOMEM) goto out; if (EXT4_EXT_MAY_ZEROOUT & split_flag) { if (split_flag & (EXT4_EXT_DATA_VALID1|EXT4_EXT_DATA_VALID2)) { if (split_flag & EXT4_EXT_DATA_VALID1) { err = ext4_ext_zeroout(inode, ex2); zero_ex.ee_block = ex2->ee_block; zero_ex.ee_len = cpu_to_le16( ext4_ext_get_actual_len(ex2)); ext4_ext_store_pblock(&zero_ex, ext4_ext_pblock(ex2)); } else { err = ext4_ext_zeroout(inode, ex); zero_ex.ee_block = ex->ee_block; zero_ex.ee_len = cpu_to_le16( ext4_ext_get_actual_len(ex)); ext4_ext_store_pblock(&zero_ex, ext4_ext_pblock(ex)); } } else { err = ext4_ext_zeroout(inode, &orig_ex); zero_ex.ee_block = orig_ex.ee_block; zero_ex.ee_len = cpu_to_le16( ext4_ext_get_actual_len(&orig_ex)); ext4_ext_store_pblock(&zero_ex, ext4_ext_pblock(&orig_ex)); } if (!err) { /* update the extent length and mark as initialized */ ex->ee_len = cpu_to_le16(ee_len); ext4_ext_try_to_merge(handle, inode, path, ex); err = ext4_ext_dirty(handle, inode, path + path->p_depth); if (!err) /* update extent status tree */ ext4_zeroout_es(inode, &zero_ex); /* If we failed at this point, we don't know in which * state the extent tree exactly is so don't try to fix * length of the original extent as it may do even more * damage. */ goto out; } } fix_extent_len: ex->ee_len = orig_ex.ee_len; /* * Ignore ext4_ext_dirty return value since we are already in error path * and err is a non-zero error code. */ ext4_ext_dirty(handle, inode, path + path->p_depth); return err; out: ext4_ext_show_leaf(inode, path); return err; } /* * ext4_split_extents() splits an extent and mark extent which is covered * by @map as split_flags indicates * * It may result in splitting the extent into multiple extents (up to three) * There are three possibilities: * a> There is no split required * b> Splits in two extents: Split is happening at either end of the extent * c> Splits in three extents: Somone is splitting in middle of the extent * */ static int ext4_split_extent(handle_t *handle, struct inode *inode, struct ext4_ext_path **ppath, struct ext4_map_blocks *map, int split_flag, int flags) { struct ext4_ext_path *path = *ppath; ext4_lblk_t ee_block; struct ext4_extent *ex; unsigned int ee_len, depth; int err = 0; int unwritten; int split_flag1, flags1; int allocated = map->m_len; depth = ext_depth(inode); ex = path[depth].p_ext; ee_block = le32_to_cpu(ex->ee_block); ee_len = ext4_ext_get_actual_len(ex); unwritten = ext4_ext_is_unwritten(ex); if (map->m_lblk + map->m_len < ee_block + ee_len) { split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT; flags1 = flags | EXT4_GET_BLOCKS_PRE_IO; if (unwritten) split_flag1 |= EXT4_EXT_MARK_UNWRIT1 | EXT4_EXT_MARK_UNWRIT2; if (split_flag & EXT4_EXT_DATA_VALID2) split_flag1 |= EXT4_EXT_DATA_VALID1; err = ext4_split_extent_at(handle, inode, ppath, map->m_lblk + map->m_len, split_flag1, flags1); if (err) goto out; } else { allocated = ee_len - (map->m_lblk - ee_block); } /* * Update path is required because previous ext4_split_extent_at() may * result in split of original leaf or extent zeroout. */ path = ext4_find_extent(inode, map->m_lblk, ppath, flags); if (IS_ERR(path)) return PTR_ERR(path); depth = ext_depth(inode); ex = path[depth].p_ext; if (!ex) { EXT4_ERROR_INODE(inode, "unexpected hole at %lu", (unsigned long) map->m_lblk); return -EFSCORRUPTED; } unwritten = ext4_ext_is_unwritten(ex); if (map->m_lblk >= ee_block) { split_flag1 = split_flag & EXT4_EXT_DATA_VALID2; if (unwritten) { split_flag1 |= EXT4_EXT_MARK_UNWRIT1; split_flag1 |= split_flag & (EXT4_EXT_MAY_ZEROOUT | EXT4_EXT_MARK_UNWRIT2); } err = ext4_split_extent_at(handle, inode, ppath, map->m_lblk, split_flag1, flags); if (err) goto out; } ext4_ext_show_leaf(inode, path); out: return err ? err : allocated; } /* * This function is called by ext4_ext_map_blocks() if someone tries to write * to an unwritten extent. It may result in splitting the unwritten * extent into multiple extents (up to three - one initialized and two * unwritten). * There are three possibilities: * a> There is no split required: Entire extent should be initialized * b> Splits in two extents: Write is happening at either end of the extent * c> Splits in three extents: Somone is writing in middle of the extent * * Pre-conditions: * - The extent pointed to by 'path' is unwritten. * - The extent pointed to by 'path' contains a superset * of the logical span [map->m_lblk, map->m_lblk + map->m_len). * * Post-conditions on success: * - the returned value is the number of blocks beyond map->l_lblk * that are allocated and initialized. * It is guaranteed to be >= map->m_len. */ static int ext4_ext_convert_to_initialized(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, struct ext4_ext_path **ppath, int flags) { struct ext4_ext_path *path = *ppath; struct ext4_sb_info *sbi; struct ext4_extent_header *eh; struct ext4_map_blocks split_map; struct ext4_extent zero_ex1, zero_ex2; struct ext4_extent *ex, *abut_ex; ext4_lblk_t ee_block, eof_block; unsigned int ee_len, depth, map_len = map->m_len; int allocated = 0, max_zeroout = 0; int err = 0; int split_flag = EXT4_EXT_DATA_VALID2; ext_debug(inode, "logical block %llu, max_blocks %u\n", (unsigned long long)map->m_lblk, map_len); sbi = EXT4_SB(inode->i_sb); eof_block = (EXT4_I(inode)->i_disksize + inode->i_sb->s_blocksize - 1) >> inode->i_sb->s_blocksize_bits; if (eof_block < map->m_lblk + map_len) eof_block = map->m_lblk + map_len; depth = ext_depth(inode); eh = path[depth].p_hdr; ex = path[depth].p_ext; ee_block = le32_to_cpu(ex->ee_block); ee_len = ext4_ext_get_actual_len(ex); zero_ex1.ee_len = 0; zero_ex2.ee_len = 0; trace_ext4_ext_convert_to_initialized_enter(inode, map, ex); /* Pre-conditions */ BUG_ON(!ext4_ext_is_unwritten(ex)); BUG_ON(!in_range(map->m_lblk, ee_block, ee_len)); /* * Attempt to transfer newly initialized blocks from the currently * unwritten extent to its neighbor. This is much cheaper * than an insertion followed by a merge as those involve costly * memmove() calls. Transferring to the left is the common case in * steady state for workloads doing fallocate(FALLOC_FL_KEEP_SIZE) * followed by append writes. * * Limitations of the current logic: * - L1: we do not deal with writes covering the whole extent. * This would require removing the extent if the transfer * is possible. * - L2: we only attempt to merge with an extent stored in the * same extent tree node. */ if ((map->m_lblk == ee_block) && /* See if we can merge left */ (map_len < ee_len) && /*L1*/ (ex > EXT_FIRST_EXTENT(eh))) { /*L2*/ ext4_lblk_t prev_lblk; ext4_fsblk_t prev_pblk, ee_pblk; unsigned int prev_len; abut_ex = ex - 1; prev_lblk = le32_to_cpu(abut_ex->ee_block); prev_len = ext4_ext_get_actual_len(abut_ex); prev_pblk = ext4_ext_pblock(abut_ex); ee_pblk = ext4_ext_pblock(ex); /* * A transfer of blocks from 'ex' to 'abut_ex' is allowed * upon those conditions: * - C1: abut_ex is initialized, * - C2: abut_ex is logically abutting ex, * - C3: abut_ex is physically abutting ex, * - C4: abut_ex can receive the additional blocks without * overflowing the (initialized) length limit. */ if ((!ext4_ext_is_unwritten(abut_ex)) && /*C1*/ ((prev_lblk + prev_len) == ee_block) && /*C2*/ ((prev_pblk + prev_len) == ee_pblk) && /*C3*/ (prev_len < (EXT_INIT_MAX_LEN - map_len))) { /*C4*/ err = ext4_ext_get_access(handle, inode, path + depth); if (err) goto out; trace_ext4_ext_convert_to_initialized_fastpath(inode, map, ex, abut_ex); /* Shift the start of ex by 'map_len' blocks */ ex->ee_block = cpu_to_le32(ee_block + map_len); ext4_ext_store_pblock(ex, ee_pblk + map_len); ex->ee_len = cpu_to_le16(ee_len - map_len); ext4_ext_mark_unwritten(ex); /* Restore the flag */ /* Extend abut_ex by 'map_len' blocks */ abut_ex->ee_len = cpu_to_le16(prev_len + map_len); /* Result: number of initialized blocks past m_lblk */ allocated = map_len; } } else if (((map->m_lblk + map_len) == (ee_block + ee_len)) && (map_len < ee_len) && /*L1*/ ex < EXT_LAST_EXTENT(eh)) { /*L2*/ /* See if we can merge right */ ext4_lblk_t next_lblk; ext4_fsblk_t next_pblk, ee_pblk; unsigned int next_len; abut_ex = ex + 1; next_lblk = le32_to_cpu(abut_ex->ee_block); next_len = ext4_ext_get_actual_len(abut_ex); next_pblk = ext4_ext_pblock(abut_ex); ee_pblk = ext4_ext_pblock(ex); /* * A transfer of blocks from 'ex' to 'abut_ex' is allowed * upon those conditions: * - C1: abut_ex is initialized, * - C2: abut_ex is logically abutting ex, * - C3: abut_ex is physically abutting ex, * - C4: abut_ex can receive the additional blocks without * overflowing the (initialized) length limit. */ if ((!ext4_ext_is_unwritten(abut_ex)) && /*C1*/ ((map->m_lblk + map_len) == next_lblk) && /*C2*/ ((ee_pblk + ee_len) == next_pblk) && /*C3*/ (next_len < (EXT_INIT_MAX_LEN - map_len))) { /*C4*/ err = ext4_ext_get_access(handle, inode, path + depth); if (err) goto out; trace_ext4_ext_convert_to_initialized_fastpath(inode, map, ex, abut_ex); /* Shift the start of abut_ex by 'map_len' blocks */ abut_ex->ee_block = cpu_to_le32(next_lblk - map_len); ext4_ext_store_pblock(abut_ex, next_pblk - map_len); ex->ee_len = cpu_to_le16(ee_len - map_len); ext4_ext_mark_unwritten(ex); /* Restore the flag */ /* Extend abut_ex by 'map_len' blocks */ abut_ex->ee_len = cpu_to_le16(next_len + map_len); /* Result: number of initialized blocks past m_lblk */ allocated = map_len; } } if (allocated) { /* Mark the block containing both extents as dirty */ err = ext4_ext_dirty(handle, inode, path + depth); /* Update path to point to the right extent */ path[depth].p_ext = abut_ex; goto out; } else allocated = ee_len - (map->m_lblk - ee_block); WARN_ON(map->m_lblk < ee_block); /* * It is safe to convert extent to initialized via explicit * zeroout only if extent is fully inside i_size or new_size. */ split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0; if (EXT4_EXT_MAY_ZEROOUT & split_flag) max_zeroout = sbi->s_extent_max_zeroout_kb >> (inode->i_sb->s_blocksize_bits - 10); /* * five cases: * 1. split the extent into three extents. * 2. split the extent into two extents, zeroout the head of the first * extent. * 3. split the extent into two extents, zeroout the tail of the second * extent. * 4. split the extent into two extents with out zeroout. * 5. no splitting needed, just possibly zeroout the head and / or the * tail of the extent. */ split_map.m_lblk = map->m_lblk; split_map.m_len = map->m_len; if (max_zeroout && (allocated > split_map.m_len)) { if (allocated <= max_zeroout) { /* case 3 or 5 */ zero_ex1.ee_block = cpu_to_le32(split_map.m_lblk + split_map.m_len); zero_ex1.ee_len = cpu_to_le16(allocated - split_map.m_len); ext4_ext_store_pblock(&zero_ex1, ext4_ext_pblock(ex) + split_map.m_lblk + split_map.m_len - ee_block); err = ext4_ext_zeroout(inode, &zero_ex1); if (err) goto fallback; split_map.m_len = allocated; } if (split_map.m_lblk - ee_block + split_map.m_len < max_zeroout) { /* case 2 or 5 */ if (split_map.m_lblk != ee_block) { zero_ex2.ee_block = ex->ee_block; zero_ex2.ee_len = cpu_to_le16(split_map.m_lblk - ee_block); ext4_ext_store_pblock(&zero_ex2, ext4_ext_pblock(ex)); err = ext4_ext_zeroout(inode, &zero_ex2); if (err) goto fallback; } split_map.m_len += split_map.m_lblk - ee_block; split_map.m_lblk = ee_block; allocated = map->m_len; } } fallback: err = ext4_split_extent(handle, inode, ppath, &split_map, split_flag, flags); if (err > 0) err = 0; out: /* If we have gotten a failure, don't zero out status tree */ if (!err) { ext4_zeroout_es(inode, &zero_ex1); ext4_zeroout_es(inode, &zero_ex2); } return err ? err : allocated; } /* * This function is called by ext4_ext_map_blocks() from * ext4_get_blocks_dio_write() when DIO to write * to an unwritten extent. * * Writing to an unwritten extent may result in splitting the unwritten * extent into multiple initialized/unwritten extents (up to three) * There are three possibilities: * a> There is no split required: Entire extent should be unwritten * b> Splits in two extents: Write is happening at either end of the extent * c> Splits in three extents: Somone is writing in middle of the extent * * This works the same way in the case of initialized -> unwritten conversion. * * One of more index blocks maybe needed if the extent tree grow after * the unwritten extent split. To prevent ENOSPC occur at the IO * complete, we need to split the unwritten extent before DIO submit * the IO. The unwritten extent called at this time will be split * into three unwritten extent(at most). After IO complete, the part * being filled will be convert to initialized by the end_io callback function * via ext4_convert_unwritten_extents(). * * Returns the size of unwritten extent to be written on success. */ static int ext4_split_convert_extents(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, struct ext4_ext_path **ppath, int flags) { struct ext4_ext_path *path = *ppath; ext4_lblk_t eof_block; ext4_lblk_t ee_block; struct ext4_extent *ex; unsigned int ee_len; int split_flag = 0, depth; ext_debug(inode, "logical block %llu, max_blocks %u\n", (unsigned long long)map->m_lblk, map->m_len); eof_block = (EXT4_I(inode)->i_disksize + inode->i_sb->s_blocksize - 1) >> inode->i_sb->s_blocksize_bits; if (eof_block < map->m_lblk + map->m_len) eof_block = map->m_lblk + map->m_len; /* * It is safe to convert extent to initialized via explicit * zeroout only if extent is fully inside i_size or new_size. */ depth = ext_depth(inode); ex = path[depth].p_ext; ee_block = le32_to_cpu(ex->ee_block); ee_len = ext4_ext_get_actual_len(ex); /* Convert to unwritten */ if (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN) { split_flag |= EXT4_EXT_DATA_VALID1; /* Convert to initialized */ } else if (flags & EXT4_GET_BLOCKS_CONVERT) { split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0; split_flag |= (EXT4_EXT_MARK_UNWRIT2 | EXT4_EXT_DATA_VALID2); } flags |= EXT4_GET_BLOCKS_PRE_IO; return ext4_split_extent(handle, inode, ppath, map, split_flag, flags); } static int ext4_convert_unwritten_extents_endio(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, struct ext4_ext_path **ppath) { struct ext4_ext_path *path = *ppath; struct ext4_extent *ex; ext4_lblk_t ee_block; unsigned int ee_len; int depth; int err = 0; depth = ext_depth(inode); ex = path[depth].p_ext; ee_block = le32_to_cpu(ex->ee_block); ee_len = ext4_ext_get_actual_len(ex); ext_debug(inode, "logical block %llu, max_blocks %u\n", (unsigned long long)ee_block, ee_len); /* If extent is larger than requested it is a clear sign that we still * have some extent state machine issues left. So extent_split is still * required. * TODO: Once all related issues will be fixed this situation should be * illegal. */ if (ee_block != map->m_lblk || ee_len > map->m_len) { #ifdef CONFIG_EXT4_DEBUG ext4_warning(inode->i_sb, "Inode (%ld) finished: extent logical block %llu," " len %u; IO logical block %llu, len %u", inode->i_ino, (unsigned long long)ee_block, ee_len, (unsigned long long)map->m_lblk, map->m_len); #endif err = ext4_split_convert_extents(handle, inode, map, ppath, EXT4_GET_BLOCKS_CONVERT); if (err < 0) return err; path = ext4_find_extent(inode, map->m_lblk, ppath, 0); if (IS_ERR(path)) return PTR_ERR(path); depth = ext_depth(inode); ex = path[depth].p_ext; } err = ext4_ext_get_access(handle, inode, path + depth); if (err) goto out; /* first mark the extent as initialized */ ext4_ext_mark_initialized(ex); /* note: ext4_ext_correct_indexes() isn't needed here because * borders are not changed */ ext4_ext_try_to_merge(handle, inode, path, ex); /* Mark modified extent as dirty */ err = ext4_ext_dirty(handle, inode, path + path->p_depth); out: ext4_ext_show_leaf(inode, path); return err; } static int convert_initialized_extent(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, struct ext4_ext_path **ppath, unsigned int *allocated) { struct ext4_ext_path *path = *ppath; struct ext4_extent *ex; ext4_lblk_t ee_block; unsigned int ee_len; int depth; int err = 0; /* * Make sure that the extent is no bigger than we support with * unwritten extent */ if (map->m_len > EXT_UNWRITTEN_MAX_LEN) map->m_len = EXT_UNWRITTEN_MAX_LEN / 2; depth = ext_depth(inode); ex = path[depth].p_ext; ee_block = le32_to_cpu(ex->ee_block); ee_len = ext4_ext_get_actual_len(ex); ext_debug(inode, "logical block %llu, max_blocks %u\n", (unsigned long long)ee_block, ee_len); if (ee_block != map->m_lblk || ee_len > map->m_len) { err = ext4_split_convert_extents(handle, inode, map, ppath, EXT4_GET_BLOCKS_CONVERT_UNWRITTEN); if (err < 0) return err; path = ext4_find_extent(inode, map->m_lblk, ppath, 0); if (IS_ERR(path)) return PTR_ERR(path); depth = ext_depth(inode); ex = path[depth].p_ext; if (!ex) { EXT4_ERROR_INODE(inode, "unexpected hole at %lu", (unsigned long) map->m_lblk); return -EFSCORRUPTED; } } err = ext4_ext_get_access(handle, inode, path + depth); if (err) return err; /* first mark the extent as unwritten */ ext4_ext_mark_unwritten(ex); /* note: ext4_ext_correct_indexes() isn't needed here because * borders are not changed */ ext4_ext_try_to_merge(handle, inode, path, ex); /* Mark modified extent as dirty */ err = ext4_ext_dirty(handle, inode, path + path->p_depth); if (err) return err; ext4_ext_show_leaf(inode, path); ext4_update_inode_fsync_trans(handle, inode, 1); map->m_flags |= EXT4_MAP_UNWRITTEN; if (*allocated > map->m_len) *allocated = map->m_len; map->m_len = *allocated; return 0; } static int ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, struct ext4_ext_path **ppath, int flags, unsigned int allocated, ext4_fsblk_t newblock) { struct ext4_ext_path __maybe_unused *path = *ppath; int ret = 0; int err = 0; ext_debug(inode, "logical block %llu, max_blocks %u, flags 0x%x, allocated %u\n", (unsigned long long)map->m_lblk, map->m_len, flags, allocated); ext4_ext_show_leaf(inode, path); /* * When writing into unwritten space, we should not fail to * allocate metadata blocks for the new extent block if needed. */ flags |= EXT4_GET_BLOCKS_METADATA_NOFAIL; trace_ext4_ext_handle_unwritten_extents(inode, map, flags, allocated, newblock); /* get_block() before submitting IO, split the extent */ if (flags & EXT4_GET_BLOCKS_PRE_IO) { ret = ext4_split_convert_extents(handle, inode, map, ppath, flags | EXT4_GET_BLOCKS_CONVERT); if (ret < 0) { err = ret; goto out2; } /* * shouldn't get a 0 return when splitting an extent unless * m_len is 0 (bug) or extent has been corrupted */ if (unlikely(ret == 0)) { EXT4_ERROR_INODE(inode, "unexpected ret == 0, m_len = %u", map->m_len); err = -EFSCORRUPTED; goto out2; } map->m_flags |= EXT4_MAP_UNWRITTEN; goto out; } /* IO end_io complete, convert the filled extent to written */ if (flags & EXT4_GET_BLOCKS_CONVERT) { err = ext4_convert_unwritten_extents_endio(handle, inode, map, ppath); if (err < 0) goto out2; ext4_update_inode_fsync_trans(handle, inode, 1); goto map_out; } /* buffered IO cases */ /* * repeat fallocate creation request * we already have an unwritten extent */ if (flags & EXT4_GET_BLOCKS_UNWRIT_EXT) { map->m_flags |= EXT4_MAP_UNWRITTEN; goto map_out; } /* buffered READ or buffered write_begin() lookup */ if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { /* * We have blocks reserved already. We * return allocated blocks so that delalloc * won't do block reservation for us. But * the buffer head will be unmapped so that * a read from the block returns 0s. */ map->m_flags |= EXT4_MAP_UNWRITTEN; goto out1; } /* * Default case when (flags & EXT4_GET_BLOCKS_CREATE) == 1. * For buffered writes, at writepage time, etc. Convert a * discovered unwritten extent to written. */ ret = ext4_ext_convert_to_initialized(handle, inode, map, ppath, flags); if (ret < 0) { err = ret; goto out2; } ext4_update_inode_fsync_trans(handle, inode, 1); /* * shouldn't get a 0 return when converting an unwritten extent * unless m_len is 0 (bug) or extent has been corrupted */ if (unlikely(ret == 0)) { EXT4_ERROR_INODE(inode, "unexpected ret == 0, m_len = %u", map->m_len); err = -EFSCORRUPTED; goto out2; } out: allocated = ret; map->m_flags |= EXT4_MAP_NEW; map_out: map->m_flags |= EXT4_MAP_MAPPED; out1: map->m_pblk = newblock; if (allocated > map->m_len) allocated = map->m_len; map->m_len = allocated; ext4_ext_show_leaf(inode, path); out2: return err ? err : allocated; } /* * get_implied_cluster_alloc - check to see if the requested * allocation (in the map structure) overlaps with a cluster already * allocated in an extent. * @sb The filesystem superblock structure * @map The requested lblk->pblk mapping * @ex The extent structure which might contain an implied * cluster allocation * * This function is called by ext4_ext_map_blocks() after we failed to * find blocks that were already in the inode's extent tree. Hence, * we know that the beginning of the requested region cannot overlap * the extent from the inode's extent tree. There are three cases we * want to catch. The first is this case: * * |--- cluster # N--| * |--- extent ---| |---- requested region ---| * |==========| * * The second case that we need to test for is this one: * * |--------- cluster # N ----------------| * |--- requested region --| |------- extent ----| * |=======================| * * The third case is when the requested region lies between two extents * within the same cluster: * |------------- cluster # N-------------| * |----- ex -----| |---- ex_right ----| * |------ requested region ------| * |================| * * In each of the above cases, we need to set the map->m_pblk and * map->m_len so it corresponds to the return the extent labelled as * "|====|" from cluster #N, since it is already in use for data in * cluster EXT4_B2C(sbi, map->m_lblk). We will then return 1 to * signal to ext4_ext_map_blocks() that map->m_pblk should be treated * as a new "allocated" block region. Otherwise, we will return 0 and * ext4_ext_map_blocks() will then allocate one or more new clusters * by calling ext4_mb_new_blocks(). */ static int get_implied_cluster_alloc(struct super_block *sb, struct ext4_map_blocks *map, struct ext4_extent *ex, struct ext4_ext_path *path) { struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_lblk_t c_offset = EXT4_LBLK_COFF(sbi, map->m_lblk); ext4_lblk_t ex_cluster_start, ex_cluster_end; ext4_lblk_t rr_cluster_start; ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block); ext4_fsblk_t ee_start = ext4_ext_pblock(ex); unsigned short ee_len = ext4_ext_get_actual_len(ex); /* The extent passed in that we are trying to match */ ex_cluster_start = EXT4_B2C(sbi, ee_block); ex_cluster_end = EXT4_B2C(sbi, ee_block + ee_len - 1); /* The requested region passed into ext4_map_blocks() */ rr_cluster_start = EXT4_B2C(sbi, map->m_lblk); if ((rr_cluster_start == ex_cluster_end) || (rr_cluster_start == ex_cluster_start)) { if (rr_cluster_start == ex_cluster_end) ee_start += ee_len - 1; map->m_pblk = EXT4_PBLK_CMASK(sbi, ee_start) + c_offset; map->m_len = min(map->m_len, (unsigned) sbi->s_cluster_ratio - c_offset); /* * Check for and handle this case: * * |--------- cluster # N-------------| * |------- extent ----| * |--- requested region ---| * |===========| */ if (map->m_lblk < ee_block) map->m_len = min(map->m_len, ee_block - map->m_lblk); /* * Check for the case where there is already another allocated * block to the right of 'ex' but before the end of the cluster. * * |------------- cluster # N-------------| * |----- ex -----| |---- ex_right ----| * |------ requested region ------| * |================| */ if (map->m_lblk > ee_block) { ext4_lblk_t next = ext4_ext_next_allocated_block(path); map->m_len = min(map->m_len, next - map->m_lblk); } trace_ext4_get_implied_cluster_alloc_exit(sb, map, 1); return 1; } trace_ext4_get_implied_cluster_alloc_exit(sb, map, 0); return 0; } /* * Block allocation/map/preallocation routine for extents based files * * * Need to be called with * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system block * (ie, create is zero). Otherwise down_write(&EXT4_I(inode)->i_data_sem) * * return > 0, number of blocks already mapped/allocated * if create == 0 and these are pre-allocated blocks * buffer head is unmapped * otherwise blocks are mapped * * return = 0, if plain look up failed (blocks have not been allocated) * buffer head is unmapped * * return < 0, error case. */ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, int flags) { struct ext4_ext_path *path = NULL; struct ext4_extent newex, *ex, ex2; struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); ext4_fsblk_t newblock = 0, pblk; int err = 0, depth, ret; unsigned int allocated = 0, offset = 0; unsigned int allocated_clusters = 0; struct ext4_allocation_request ar; ext4_lblk_t cluster_offset; ext_debug(inode, "blocks %u/%u requested\n", map->m_lblk, map->m_len); trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags); /* find extent for this block */ path = ext4_find_extent(inode, map->m_lblk, NULL, 0); if (IS_ERR(path)) { err = PTR_ERR(path); path = NULL; goto out; } depth = ext_depth(inode); /* * consistent leaf must not be empty; * this situation is possible, though, _during_ tree modification; * this is why assert can't be put in ext4_find_extent() */ if (unlikely(path[depth].p_ext == NULL && depth != 0)) { EXT4_ERROR_INODE(inode, "bad extent address " "lblock: %lu, depth: %d pblock %lld", (unsigned long) map->m_lblk, depth, path[depth].p_block); err = -EFSCORRUPTED; goto out; } ex = path[depth].p_ext; if (ex) { ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block); ext4_fsblk_t ee_start = ext4_ext_pblock(ex); unsigned short ee_len; /* * unwritten extents are treated as holes, except that * we split out initialized portions during a write. */ ee_len = ext4_ext_get_actual_len(ex); trace_ext4_ext_show_extent(inode, ee_block, ee_start, ee_len); /* if found extent covers block, simply return it */ if (in_range(map->m_lblk, ee_block, ee_len)) { newblock = map->m_lblk - ee_block + ee_start; /* number of remaining blocks in the extent */ allocated = ee_len - (map->m_lblk - ee_block); ext_debug(inode, "%u fit into %u:%d -> %llu\n", map->m_lblk, ee_block, ee_len, newblock); /* * If the extent is initialized check whether the * caller wants to convert it to unwritten. */ if ((!ext4_ext_is_unwritten(ex)) && (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) { err = convert_initialized_extent(handle, inode, map, &path, &allocated); goto out; } else if (!ext4_ext_is_unwritten(ex)) { map->m_flags |= EXT4_MAP_MAPPED; map->m_pblk = newblock; if (allocated > map->m_len) allocated = map->m_len; map->m_len = allocated; ext4_ext_show_leaf(inode, path); goto out; } ret = ext4_ext_handle_unwritten_extents( handle, inode, map, &path, flags, allocated, newblock); if (ret < 0) err = ret; else allocated = ret; goto out; } } /* * requested block isn't allocated yet; * we couldn't try to create block if create flag is zero */ if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { ext4_lblk_t hole_start, hole_len; hole_start = map->m_lblk; hole_len = ext4_ext_determine_hole(inode, path, &hole_start); /* * put just found gap into cache to speed up * subsequent requests */ ext4_ext_put_gap_in_cache(inode, hole_start, hole_len); /* Update hole_len to reflect hole size after map->m_lblk */ if (hole_start != map->m_lblk) hole_len -= map->m_lblk - hole_start; map->m_pblk = 0; map->m_len = min_t(unsigned int, map->m_len, hole_len); goto out; } /* * Okay, we need to do block allocation. */ newex.ee_block = cpu_to_le32(map->m_lblk); cluster_offset = EXT4_LBLK_COFF(sbi, map->m_lblk); /* * If we are doing bigalloc, check to see if the extent returned * by ext4_find_extent() implies a cluster we can use. */ if (cluster_offset && ex && get_implied_cluster_alloc(inode->i_sb, map, ex, path)) { ar.len = allocated = map->m_len; newblock = map->m_pblk; goto got_allocated_blocks; } /* find neighbour allocated blocks */ ar.lleft = map->m_lblk; err = ext4_ext_search_left(inode, path, &ar.lleft, &ar.pleft); if (err) goto out; ar.lright = map->m_lblk; err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright, &ex2); if (err < 0) goto out; /* Check if the extent after searching to the right implies a * cluster we can use. */ if ((sbi->s_cluster_ratio > 1) && err && get_implied_cluster_alloc(inode->i_sb, map, &ex2, path)) { ar.len = allocated = map->m_len; newblock = map->m_pblk; goto got_allocated_blocks; } /* * See if request is beyond maximum number of blocks we can have in * a single extent. For an initialized extent this limit is * EXT_INIT_MAX_LEN and for an unwritten extent this limit is * EXT_UNWRITTEN_MAX_LEN. */ if (map->m_len > EXT_INIT_MAX_LEN && !(flags & EXT4_GET_BLOCKS_UNWRIT_EXT)) map->m_len = EXT_INIT_MAX_LEN; else if (map->m_len > EXT_UNWRITTEN_MAX_LEN && (flags & EXT4_GET_BLOCKS_UNWRIT_EXT)) map->m_len = EXT_UNWRITTEN_MAX_LEN; /* Check if we can really insert (m_lblk)::(m_lblk + m_len) extent */ newex.ee_len = cpu_to_le16(map->m_len); err = ext4_ext_check_overlap(sbi, inode, &newex, path); if (err) allocated = ext4_ext_get_actual_len(&newex); else allocated = map->m_len; /* allocate new block */ ar.inode = inode; ar.goal = ext4_ext_find_goal(inode, path, map->m_lblk); ar.logical = map->m_lblk; /* * We calculate the offset from the beginning of the cluster * for the logical block number, since when we allocate a * physical cluster, the physical block should start at the * same offset from the beginning of the cluster. This is * needed so that future calls to get_implied_cluster_alloc() * work correctly. */ offset = EXT4_LBLK_COFF(sbi, map->m_lblk); ar.len = EXT4_NUM_B2C(sbi, offset+allocated); ar.goal -= offset; ar.logical -= offset; if (S_ISREG(inode->i_mode)) ar.flags = EXT4_MB_HINT_DATA; else /* disable in-core preallocation for non-regular files */ ar.flags = 0; if (flags & EXT4_GET_BLOCKS_NO_NORMALIZE) ar.flags |= EXT4_MB_HINT_NOPREALLOC; if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) ar.flags |= EXT4_MB_DELALLOC_RESERVED; if (flags & EXT4_GET_BLOCKS_METADATA_NOFAIL) ar.flags |= EXT4_MB_USE_RESERVED; newblock = ext4_mb_new_blocks(handle, &ar, &err); if (!newblock) goto out; allocated_clusters = ar.len; ar.len = EXT4_C2B(sbi, ar.len) - offset; ext_debug(inode, "allocate new block: goal %llu, found %llu/%u, requested %u\n", ar.goal, newblock, ar.len, allocated); if (ar.len > allocated) ar.len = allocated; got_allocated_blocks: /* try to insert new extent into found leaf and return */ pblk = newblock + offset; ext4_ext_store_pblock(&newex, pblk); newex.ee_len = cpu_to_le16(ar.len); /* Mark unwritten */ if (flags & EXT4_GET_BLOCKS_UNWRIT_EXT) { ext4_ext_mark_unwritten(&newex); map->m_flags |= EXT4_MAP_UNWRITTEN; } err = ext4_ext_insert_extent(handle, inode, &path, &newex, flags); if (err) { if (allocated_clusters) { int fb_flags = 0; /* * free data blocks we just allocated. * not a good idea to call discard here directly, * but otherwise we'd need to call it every free(). */ ext4_discard_preallocations(inode, 0); if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) fb_flags = EXT4_FREE_BLOCKS_NO_QUOT_UPDATE; ext4_free_blocks(handle, inode, NULL, newblock, EXT4_C2B(sbi, allocated_clusters), fb_flags); } goto out; } /* * Reduce the reserved cluster count to reflect successful deferred * allocation of delayed allocated clusters or direct allocation of * clusters discovered to be delayed allocated. Once allocated, a * cluster is not included in the reserved count. */ if (test_opt(inode->i_sb, DELALLOC) && allocated_clusters) { if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) { /* * When allocating delayed allocated clusters, simply * reduce the reserved cluster count and claim quota */ ext4_da_update_reserve_space(inode, allocated_clusters, 1); } else { ext4_lblk_t lblk, len; unsigned int n; /* * When allocating non-delayed allocated clusters * (from fallocate, filemap, DIO, or clusters * allocated when delalloc has been disabled by * ext4_nonda_switch), reduce the reserved cluster * count by the number of allocated clusters that * have previously been delayed allocated. Quota * has been claimed by ext4_mb_new_blocks() above, * so release the quota reservations made for any * previously delayed allocated clusters. */ lblk = EXT4_LBLK_CMASK(sbi, map->m_lblk); len = allocated_clusters << sbi->s_cluster_bits; n = ext4_es_delayed_clu(inode, lblk, len); if (n > 0) ext4_da_update_reserve_space(inode, (int) n, 0); } } /* * Cache the extent and update transaction to commit on fdatasync only * when it is _not_ an unwritten extent. */ if ((flags & EXT4_GET_BLOCKS_UNWRIT_EXT) == 0) ext4_update_inode_fsync_trans(handle, inode, 1); else ext4_update_inode_fsync_trans(handle, inode, 0); map->m_flags |= (EXT4_MAP_NEW | EXT4_MAP_MAPPED); map->m_pblk = pblk; map->m_len = ar.len; allocated = map->m_len; ext4_ext_show_leaf(inode, path); out: ext4_free_ext_path(path); trace_ext4_ext_map_blocks_exit(inode, flags, map, err ? err : allocated); return err ? err : allocated; } int ext4_ext_truncate(handle_t *handle, struct inode *inode) { struct super_block *sb = inode->i_sb; ext4_lblk_t last_block; int err = 0; /* * TODO: optimization is possible here. * Probably we need not scan at all, * because page truncation is enough. */ /* we have to know where to truncate from in crash case */ EXT4_I(inode)->i_disksize = inode->i_size; err = ext4_mark_inode_dirty(handle, inode); if (err) return err; last_block = (inode->i_size + sb->s_blocksize - 1) >> EXT4_BLOCK_SIZE_BITS(sb); ext4_es_remove_extent(inode, last_block, EXT_MAX_BLOCKS - last_block); retry_remove_space: err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1); if (err == -ENOMEM) { memalloc_retry_wait(GFP_ATOMIC); goto retry_remove_space; } return err; } static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset, ext4_lblk_t len, loff_t new_size, int flags) { struct inode *inode = file_inode(file); handle_t *handle; int ret = 0, ret2 = 0, ret3 = 0; int retries = 0; int depth = 0; struct ext4_map_blocks map; unsigned int credits; loff_t epos; BUG_ON(!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)); map.m_lblk = offset; map.m_len = len; /* * Don't normalize the request if it can fit in one extent so * that it doesn't get unnecessarily split into multiple * extents. */ if (len <= EXT_UNWRITTEN_MAX_LEN) flags |= EXT4_GET_BLOCKS_NO_NORMALIZE; /* * credits to insert 1 extent into extent tree */ credits = ext4_chunk_trans_blocks(inode, len); depth = ext_depth(inode); retry: while (len) { /* * Recalculate credits when extent tree depth changes. */ if (depth != ext_depth(inode)) { credits = ext4_chunk_trans_blocks(inode, len); depth = ext_depth(inode); } handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, credits); if (IS_ERR(handle)) { ret = PTR_ERR(handle); break; } ret = ext4_map_blocks(handle, inode, &map, flags); if (ret <= 0) { ext4_debug("inode #%lu: block %u: len %u: " "ext4_ext_map_blocks returned %d", inode->i_ino, map.m_lblk, map.m_len, ret); ext4_mark_inode_dirty(handle, inode); ext4_journal_stop(handle); break; } /* * allow a full retry cycle for any remaining allocations */ retries = 0; map.m_lblk += ret; map.m_len = len = len - ret; epos = (loff_t)map.m_lblk << inode->i_blkbits; inode_set_ctime_current(inode); if (new_size) { if (epos > new_size) epos = new_size; if (ext4_update_inode_size(inode, epos) & 0x1) inode_set_mtime_to_ts(inode, inode_get_ctime(inode)); } ret2 = ext4_mark_inode_dirty(handle, inode); ext4_update_inode_fsync_trans(handle, inode, 1); ret3 = ext4_journal_stop(handle); ret2 = ret3 ? ret3 : ret2; if (unlikely(ret2)) break; } if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) goto retry; return ret > 0 ? ret2 : ret; } static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len); static int ext4_insert_range(struct file *file, loff_t offset, loff_t len); static long ext4_zero_range(struct file *file, loff_t offset, loff_t len, int mode) { struct inode *inode = file_inode(file); struct address_space *mapping = file->f_mapping; handle_t *handle = NULL; unsigned int max_blocks; loff_t new_size = 0; int ret = 0; int flags; int credits; int partial_begin, partial_end; loff_t start, end; ext4_lblk_t lblk; unsigned int blkbits = inode->i_blkbits; trace_ext4_zero_range(inode, offset, len, mode); /* * Round up offset. This is not fallocate, we need to zero out * blocks, so convert interior block aligned part of the range to * unwritten and possibly manually zero out unaligned parts of the * range. Here, start and partial_begin are inclusive, end and * partial_end are exclusive. */ start = round_up(offset, 1 << blkbits); end = round_down((offset + len), 1 << blkbits); if (start < offset || end > offset + len) return -EINVAL; partial_begin = offset & ((1 << blkbits) - 1); partial_end = (offset + len) & ((1 << blkbits) - 1); lblk = start >> blkbits; max_blocks = (end >> blkbits); if (max_blocks < lblk) max_blocks = 0; else max_blocks -= lblk; inode_lock(inode); /* * Indirect files do not support unwritten extents */ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { ret = -EOPNOTSUPP; goto out_mutex; } if (!(mode & FALLOC_FL_KEEP_SIZE) && (offset + len > inode->i_size || offset + len > EXT4_I(inode)->i_disksize)) { new_size = offset + len; ret = inode_newsize_ok(inode, new_size); if (ret) goto out_mutex; } flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT; /* Wait all existing dio workers, newcomers will block on i_rwsem */ inode_dio_wait(inode); ret = file_modified(file); if (ret) goto out_mutex; /* Preallocate the range including the unaligned edges */ if (partial_begin || partial_end) { ret = ext4_alloc_file_blocks(file, round_down(offset, 1 << blkbits) >> blkbits, (round_up((offset + len), 1 << blkbits) - round_down(offset, 1 << blkbits)) >> blkbits, new_size, flags); if (ret) goto out_mutex; } /* Zero range excluding the unaligned edges */ if (max_blocks > 0) { flags |= (EXT4_GET_BLOCKS_CONVERT_UNWRITTEN | EXT4_EX_NOCACHE); /* * Prevent page faults from reinstantiating pages we have * released from page cache. */ filemap_invalidate_lock(mapping); ret = ext4_break_layouts(inode); if (ret) { filemap_invalidate_unlock(mapping); goto out_mutex; } ret = ext4_update_disksize_before_punch(inode, offset, len); if (ret) { filemap_invalidate_unlock(mapping); goto out_mutex; } /* * For journalled data we need to write (and checkpoint) pages * before discarding page cache to avoid inconsitent data on * disk in case of crash before zeroing trans is committed. */ if (ext4_should_journal_data(inode)) { ret = filemap_write_and_wait_range(mapping, start, end - 1); if (ret) { filemap_invalidate_unlock(mapping); goto out_mutex; } } /* Now release the pages and zero block aligned part of pages */ truncate_pagecache_range(inode, start, end - 1); inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size, flags); filemap_invalidate_unlock(mapping); if (ret) goto out_mutex; } if (!partial_begin && !partial_end) goto out_mutex; /* * In worst case we have to writeout two nonadjacent unwritten * blocks and update the inode */ credits = (2 * ext4_ext_index_trans_blocks(inode, 2)) + 1; if (ext4_should_journal_data(inode)) credits += 2; handle = ext4_journal_start(inode, EXT4_HT_MISC, credits); if (IS_ERR(handle)) { ret = PTR_ERR(handle); ext4_std_error(inode->i_sb, ret); goto out_mutex; } inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); if (new_size) ext4_update_inode_size(inode, new_size); ret = ext4_mark_inode_dirty(handle, inode); if (unlikely(ret)) goto out_handle; /* Zero out partial block at the edges of the range */ ret = ext4_zero_partial_blocks(handle, inode, offset, len); if (ret >= 0) ext4_update_inode_fsync_trans(handle, inode, 1); if (file->f_flags & O_SYNC) ext4_handle_sync(handle); out_handle: ext4_journal_stop(handle); out_mutex: inode_unlock(inode); return ret; } /* * preallocate space for a file. This implements ext4's fallocate file * operation, which gets called from sys_fallocate system call. * For block-mapped files, posix_fallocate should fall back to the method * of writing zeroes to the required new blocks (the same behavior which is * expected for file systems which do not support fallocate() system call). */ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) { struct inode *inode = file_inode(file); loff_t new_size = 0; unsigned int max_blocks; int ret = 0; int flags; ext4_lblk_t lblk; unsigned int blkbits = inode->i_blkbits; /* * Encrypted inodes can't handle collapse range or insert * range since we would need to re-encrypt blocks with a * different IV or XTS tweak (which are based on the logical * block number). */ if (IS_ENCRYPTED(inode) && (mode & (FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_INSERT_RANGE))) return -EOPNOTSUPP; /* Return error if mode is not supported */ if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE | FALLOC_FL_INSERT_RANGE)) return -EOPNOTSUPP; inode_lock(inode); ret = ext4_convert_inline_data(inode); inode_unlock(inode); if (ret) goto exit; if (mode & FALLOC_FL_PUNCH_HOLE) { ret = ext4_punch_hole(file, offset, len); goto exit; } if (mode & FALLOC_FL_COLLAPSE_RANGE) { ret = ext4_collapse_range(file, offset, len); goto exit; } if (mode & FALLOC_FL_INSERT_RANGE) { ret = ext4_insert_range(file, offset, len); goto exit; } if (mode & FALLOC_FL_ZERO_RANGE) { ret = ext4_zero_range(file, offset, len, mode); goto exit; } trace_ext4_fallocate_enter(inode, offset, len, mode); lblk = offset >> blkbits; max_blocks = EXT4_MAX_BLOCKS(len, offset, blkbits); flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT; inode_lock(inode); /* * We only support preallocation for extent-based files only */ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { ret = -EOPNOTSUPP; goto out; } if (!(mode & FALLOC_FL_KEEP_SIZE) && (offset + len > inode->i_size || offset + len > EXT4_I(inode)->i_disksize)) { new_size = offset + len; ret = inode_newsize_ok(inode, new_size); if (ret) goto out; } /* Wait all existing dio workers, newcomers will block on i_rwsem */ inode_dio_wait(inode); ret = file_modified(file); if (ret) goto out; ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size, flags); if (ret) goto out; if (file->f_flags & O_SYNC && EXT4_SB(inode->i_sb)->s_journal) { ret = ext4_fc_commit(EXT4_SB(inode->i_sb)->s_journal, EXT4_I(inode)->i_sync_tid); } out: inode_unlock(inode); trace_ext4_fallocate_exit(inode, offset, max_blocks, ret); exit: return ret; } /* * This function convert a range of blocks to written extents * The caller of this function will pass the start offset and the size. * all unwritten extents within this range will be converted to * written extents. * * This function is called from the direct IO end io call back * function, to convert the fallocated extents after IO is completed. * Returns 0 on success. */ int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode, loff_t offset, ssize_t len) { unsigned int max_blocks; int ret = 0, ret2 = 0, ret3 = 0; struct ext4_map_blocks map; unsigned int blkbits = inode->i_blkbits; unsigned int credits = 0; map.m_lblk = offset >> blkbits; max_blocks = EXT4_MAX_BLOCKS(len, offset, blkbits); if (!handle) { /* * credits to insert 1 extent into extent tree */ credits = ext4_chunk_trans_blocks(inode, max_blocks); } while (ret >= 0 && ret < max_blocks) { map.m_lblk += ret; map.m_len = (max_blocks -= ret); if (credits) { handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, credits); if (IS_ERR(handle)) { ret = PTR_ERR(handle); break; } } ret = ext4_map_blocks(handle, inode, &map, EXT4_GET_BLOCKS_IO_CONVERT_EXT); if (ret <= 0) ext4_warning(inode->i_sb, "inode #%lu: block %u: len %u: " "ext4_ext_map_blocks returned %d", inode->i_ino, map.m_lblk, map.m_len, ret); ret2 = ext4_mark_inode_dirty(handle, inode); if (credits) { ret3 = ext4_journal_stop(handle); if (unlikely(ret3)) ret2 = ret3; } if (ret <= 0 || ret2) break; } return ret > 0 ? ret2 : ret; } int ext4_convert_unwritten_io_end_vec(handle_t *handle, ext4_io_end_t *io_end) { int ret = 0, err = 0; struct ext4_io_end_vec *io_end_vec; /* * This is somewhat ugly but the idea is clear: When transaction is * reserved, everything goes into it. Otherwise we rather start several * smaller transactions for conversion of each extent separately. */ if (handle) { handle = ext4_journal_start_reserved(handle, EXT4_HT_EXT_CONVERT); if (IS_ERR(handle)) return PTR_ERR(handle); } list_for_each_entry(io_end_vec, &io_end->list_vec, list) { ret = ext4_convert_unwritten_extents(handle, io_end->inode, io_end_vec->offset, io_end_vec->size); if (ret) break; } if (handle) err = ext4_journal_stop(handle); return ret < 0 ? ret : err; } static int ext4_iomap_xattr_fiemap(struct inode *inode, struct iomap *iomap) { __u64 physical = 0; __u64 length = 0; int blockbits = inode->i_sb->s_blocksize_bits; int error = 0; u16 iomap_type; /* in-inode? */ if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) { struct ext4_iloc iloc; int offset; /* offset of xattr in inode */ error = ext4_get_inode_loc(inode, &iloc); if (error) return error; physical = (__u64)iloc.bh->b_blocknr << blockbits; offset = EXT4_GOOD_OLD_INODE_SIZE + EXT4_I(inode)->i_extra_isize; physical += offset; length = EXT4_SB(inode->i_sb)->s_inode_size - offset; brelse(iloc.bh); iomap_type = IOMAP_INLINE; } else if (EXT4_I(inode)->i_file_acl) { /* external block */ physical = (__u64)EXT4_I(inode)->i_file_acl << blockbits; length = inode->i_sb->s_blocksize; iomap_type = IOMAP_MAPPED; } else { /* no in-inode or external block for xattr, so return -ENOENT */ error = -ENOENT; goto out; } iomap->addr = physical; iomap->offset = 0; iomap->length = length; iomap->type = iomap_type; iomap->flags = 0; out: return error; } static int ext4_iomap_xattr_begin(struct inode *inode, loff_t offset, loff_t length, unsigned flags, struct iomap *iomap, struct iomap *srcmap) { int error; error = ext4_iomap_xattr_fiemap(inode, iomap); if (error == 0 && (offset >= iomap->length)) error = -ENOENT; return error; } static const struct iomap_ops ext4_iomap_xattr_ops = { .iomap_begin = ext4_iomap_xattr_begin, }; static int ext4_fiemap_check_ranges(struct inode *inode, u64 start, u64 *len) { u64 maxbytes; if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) maxbytes = inode->i_sb->s_maxbytes; else maxbytes = EXT4_SB(inode->i_sb)->s_bitmap_maxbytes; if (*len == 0) return -EINVAL; if (start > maxbytes) return -EFBIG; /* * Shrink request scope to what the fs can actually handle. */ if (*len > maxbytes || (maxbytes - *len) < start) *len = maxbytes - start; return 0; } int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len) { int error = 0; if (fieinfo->fi_flags & FIEMAP_FLAG_CACHE) { error = ext4_ext_precache(inode); if (error) return error; fieinfo->fi_flags &= ~FIEMAP_FLAG_CACHE; } /* * For bitmap files the maximum size limit could be smaller than * s_maxbytes, so check len here manually instead of just relying on the * generic check. */ error = ext4_fiemap_check_ranges(inode, start, &len); if (error) return error; if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) { fieinfo->fi_flags &= ~FIEMAP_FLAG_XATTR; return iomap_fiemap(inode, fieinfo, start, len, &ext4_iomap_xattr_ops); } return iomap_fiemap(inode, fieinfo, start, len, &ext4_iomap_report_ops); } int ext4_get_es_cache(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len) { ext4_lblk_t start_blk, len_blks; __u64 last_blk; int error = 0; if (ext4_has_inline_data(inode)) { int has_inline; down_read(&EXT4_I(inode)->xattr_sem); has_inline = ext4_has_inline_data(inode); up_read(&EXT4_I(inode)->xattr_sem); if (has_inline) return 0; } if (fieinfo->fi_flags & FIEMAP_FLAG_CACHE) { error = ext4_ext_precache(inode); if (error) return error; fieinfo->fi_flags &= ~FIEMAP_FLAG_CACHE; } error = fiemap_prep(inode, fieinfo, start, &len, 0); if (error) return error; error = ext4_fiemap_check_ranges(inode, start, &len); if (error) return error; start_blk = start >> inode->i_sb->s_blocksize_bits; last_blk = (start + len - 1) >> inode->i_sb->s_blocksize_bits; if (last_blk >= EXT_MAX_BLOCKS) last_blk = EXT_MAX_BLOCKS-1; len_blks = ((ext4_lblk_t) last_blk) - start_blk + 1; /* * Walk the extent tree gathering extent information * and pushing extents back to the user. */ return ext4_fill_es_cache_info(inode, start_blk, len_blks, fieinfo); } /* * ext4_ext_shift_path_extents: * Shift the extents of a path structure lying between path[depth].p_ext * and EXT_LAST_EXTENT(path[depth].p_hdr), by @shift blocks. @SHIFT tells * if it is right shift or left shift operation. */ static int ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift, struct inode *inode, handle_t *handle, enum SHIFT_DIRECTION SHIFT) { int depth, err = 0; struct ext4_extent *ex_start, *ex_last; bool update = false; int credits, restart_credits; depth = path->p_depth; while (depth >= 0) { if (depth == path->p_depth) { ex_start = path[depth].p_ext; if (!ex_start) return -EFSCORRUPTED; ex_last = EXT_LAST_EXTENT(path[depth].p_hdr); /* leaf + sb + inode */ credits = 3; if (ex_start == EXT_FIRST_EXTENT(path[depth].p_hdr)) { update = true; /* extent tree + sb + inode */ credits = depth + 2; } restart_credits = ext4_writepage_trans_blocks(inode); err = ext4_datasem_ensure_credits(handle, inode, credits, restart_credits, 0); if (err) { if (err > 0) err = -EAGAIN; goto out; } err = ext4_ext_get_access(handle, inode, path + depth); if (err) goto out; while (ex_start <= ex_last) { if (SHIFT == SHIFT_LEFT) { le32_add_cpu(&ex_start->ee_block, -shift); /* Try to merge to the left. */ if ((ex_start > EXT_FIRST_EXTENT(path[depth].p_hdr)) && ext4_ext_try_to_merge_right(inode, path, ex_start - 1)) ex_last--; else ex_start++; } else { le32_add_cpu(&ex_last->ee_block, shift); ext4_ext_try_to_merge_right(inode, path, ex_last); ex_last--; } } err = ext4_ext_dirty(handle, inode, path + depth); if (err) goto out; if (--depth < 0 || !update) break; } /* Update index too */ err = ext4_ext_get_access(handle, inode, path + depth); if (err) goto out; if (SHIFT == SHIFT_LEFT) le32_add_cpu(&path[depth].p_idx->ei_block, -shift); else le32_add_cpu(&path[depth].p_idx->ei_block, shift); err = ext4_ext_dirty(handle, inode, path + depth); if (err) goto out; /* we are done if current index is not a starting index */ if (path[depth].p_idx != EXT_FIRST_INDEX(path[depth].p_hdr)) break; depth--; } out: return err; } /* * ext4_ext_shift_extents: * All the extents which lies in the range from @start to the last allocated * block for the @inode are shifted either towards left or right (depending * upon @SHIFT) by @shift blocks. * On success, 0 is returned, error otherwise. */ static int ext4_ext_shift_extents(struct inode *inode, handle_t *handle, ext4_lblk_t start, ext4_lblk_t shift, enum SHIFT_DIRECTION SHIFT) { struct ext4_ext_path *path; int ret = 0, depth; struct ext4_extent *extent; ext4_lblk_t stop, *iterator, ex_start, ex_end; ext4_lblk_t tmp = EXT_MAX_BLOCKS; /* Let path point to the last extent */ path = ext4_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL, EXT4_EX_NOCACHE); if (IS_ERR(path)) return PTR_ERR(path); depth = path->p_depth; extent = path[depth].p_ext; if (!extent) goto out; stop = le32_to_cpu(extent->ee_block); /* * For left shifts, make sure the hole on the left is big enough to * accommodate the shift. For right shifts, make sure the last extent * won't be shifted beyond EXT_MAX_BLOCKS. */ if (SHIFT == SHIFT_LEFT) { path = ext4_find_extent(inode, start - 1, &path, EXT4_EX_NOCACHE); if (IS_ERR(path)) return PTR_ERR(path); depth = path->p_depth; extent = path[depth].p_ext; if (extent) { ex_start = le32_to_cpu(extent->ee_block); ex_end = le32_to_cpu(extent->ee_block) + ext4_ext_get_actual_len(extent); } else { ex_start = 0; ex_end = 0; } if ((start == ex_start && shift > ex_start) || (shift > start - ex_end)) { ret = -EINVAL; goto out; } } else { if (shift > EXT_MAX_BLOCKS - (stop + ext4_ext_get_actual_len(extent))) { ret = -EINVAL; goto out; } } /* * In case of left shift, iterator points to start and it is increased * till we reach stop. In case of right shift, iterator points to stop * and it is decreased till we reach start. */ again: ret = 0; if (SHIFT == SHIFT_LEFT) iterator = &start; else iterator = &stop; if (tmp != EXT_MAX_BLOCKS) *iterator = tmp; /* * Its safe to start updating extents. Start and stop are unsigned, so * in case of right shift if extent with 0 block is reached, iterator * becomes NULL to indicate the end of the loop. */ while (iterator && start <= stop) { path = ext4_find_extent(inode, *iterator, &path, EXT4_EX_NOCACHE); if (IS_ERR(path)) return PTR_ERR(path); depth = path->p_depth; extent = path[depth].p_ext; if (!extent) { EXT4_ERROR_INODE(inode, "unexpected hole at %lu", (unsigned long) *iterator); return -EFSCORRUPTED; } if (SHIFT == SHIFT_LEFT && *iterator > le32_to_cpu(extent->ee_block)) { /* Hole, move to the next extent */ if (extent < EXT_LAST_EXTENT(path[depth].p_hdr)) { path[depth].p_ext++; } else { *iterator = ext4_ext_next_allocated_block(path); continue; } } tmp = *iterator; if (SHIFT == SHIFT_LEFT) { extent = EXT_LAST_EXTENT(path[depth].p_hdr); *iterator = le32_to_cpu(extent->ee_block) + ext4_ext_get_actual_len(extent); } else { extent = EXT_FIRST_EXTENT(path[depth].p_hdr); if (le32_to_cpu(extent->ee_block) > start) *iterator = le32_to_cpu(extent->ee_block) - 1; else if (le32_to_cpu(extent->ee_block) == start) iterator = NULL; else { extent = EXT_LAST_EXTENT(path[depth].p_hdr); while (le32_to_cpu(extent->ee_block) >= start) extent--; if (extent == EXT_LAST_EXTENT(path[depth].p_hdr)) break; extent++; iterator = NULL; } path[depth].p_ext = extent; } ret = ext4_ext_shift_path_extents(path, shift, inode, handle, SHIFT); /* iterator can be NULL which means we should break */ if (ret == -EAGAIN) goto again; if (ret) break; } out: ext4_free_ext_path(path); return ret; } /* * ext4_collapse_range: * This implements the fallocate's collapse range functionality for ext4 * Returns: 0 and non-zero on error. */ static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len) { struct inode *inode = file_inode(file); struct super_block *sb = inode->i_sb; struct address_space *mapping = inode->i_mapping; ext4_lblk_t punch_start, punch_stop; handle_t *handle; unsigned int credits; loff_t new_size, ioffset; int ret; /* * We need to test this early because xfstests assumes that a * collapse range of (0, 1) will return EOPNOTSUPP if the file * system does not support collapse range. */ if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) return -EOPNOTSUPP; /* Collapse range works only on fs cluster size aligned regions. */ if (!IS_ALIGNED(offset | len, EXT4_CLUSTER_SIZE(sb))) return -EINVAL; trace_ext4_collapse_range(inode, offset, len); punch_start = offset >> EXT4_BLOCK_SIZE_BITS(sb); punch_stop = (offset + len) >> EXT4_BLOCK_SIZE_BITS(sb); inode_lock(inode); /* * There is no need to overlap collapse range with EOF, in which case * it is effectively a truncate operation */ if (offset + len >= inode->i_size) { ret = -EINVAL; goto out_mutex; } /* Currently just for extent based files */ if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { ret = -EOPNOTSUPP; goto out_mutex; } /* Wait for existing dio to complete */ inode_dio_wait(inode); ret = file_modified(file); if (ret) goto out_mutex; /* * Prevent page faults from reinstantiating pages we have released from * page cache. */ filemap_invalidate_lock(mapping); ret = ext4_break_layouts(inode); if (ret) goto out_mmap; /* * Need to round down offset to be aligned with page size boundary * for page size > block size. */ ioffset = round_down(offset, PAGE_SIZE); /* * Write tail of the last page before removed range since it will get * removed from the page cache below. */ ret = filemap_write_and_wait_range(mapping, ioffset, offset); if (ret) goto out_mmap; /* * Write data that will be shifted to preserve them when discarding * page cache below. We are also protected from pages becoming dirty * by i_rwsem and invalidate_lock. */ ret = filemap_write_and_wait_range(mapping, offset + len, LLONG_MAX); if (ret) goto out_mmap; truncate_pagecache(inode, ioffset); credits = ext4_writepage_trans_blocks(inode); handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); if (IS_ERR(handle)) { ret = PTR_ERR(handle); goto out_mmap; } ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_FALLOC_RANGE, handle); down_write(&EXT4_I(inode)->i_data_sem); ext4_discard_preallocations(inode, 0); ext4_es_remove_extent(inode, punch_start, EXT_MAX_BLOCKS - punch_start); ret = ext4_ext_remove_space(inode, punch_start, punch_stop - 1); if (ret) { up_write(&EXT4_I(inode)->i_data_sem); goto out_stop; } ext4_discard_preallocations(inode, 0); ret = ext4_ext_shift_extents(inode, handle, punch_stop, punch_stop - punch_start, SHIFT_LEFT); if (ret) { up_write(&EXT4_I(inode)->i_data_sem); goto out_stop; } new_size = inode->i_size - len; i_size_write(inode, new_size); EXT4_I(inode)->i_disksize = new_size; up_write(&EXT4_I(inode)->i_data_sem); if (IS_SYNC(inode)) ext4_handle_sync(handle); inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); ret = ext4_mark_inode_dirty(handle, inode); ext4_update_inode_fsync_trans(handle, inode, 1); out_stop: ext4_journal_stop(handle); out_mmap: filemap_invalidate_unlock(mapping); out_mutex: inode_unlock(inode); return ret; } /* * ext4_insert_range: * This function implements the FALLOC_FL_INSERT_RANGE flag of fallocate. * The data blocks starting from @offset to the EOF are shifted by @len * towards right to create a hole in the @inode. Inode size is increased * by len bytes. * Returns 0 on success, error otherwise. */ static int ext4_insert_range(struct file *file, loff_t offset, loff_t len) { struct inode *inode = file_inode(file); struct super_block *sb = inode->i_sb; struct address_space *mapping = inode->i_mapping; handle_t *handle; struct ext4_ext_path *path; struct ext4_extent *extent; ext4_lblk_t offset_lblk, len_lblk, ee_start_lblk = 0; unsigned int credits, ee_len; int ret = 0, depth, split_flag = 0; loff_t ioffset; /* * We need to test this early because xfstests assumes that an * insert range of (0, 1) will return EOPNOTSUPP if the file * system does not support insert range. */ if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) return -EOPNOTSUPP; /* Insert range works only on fs cluster size aligned regions. */ if (!IS_ALIGNED(offset | len, EXT4_CLUSTER_SIZE(sb))) return -EINVAL; trace_ext4_insert_range(inode, offset, len); offset_lblk = offset >> EXT4_BLOCK_SIZE_BITS(sb); len_lblk = len >> EXT4_BLOCK_SIZE_BITS(sb); inode_lock(inode); /* Currently just for extent based files */ if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { ret = -EOPNOTSUPP; goto out_mutex; } /* Check whether the maximum file size would be exceeded */ if (len > inode->i_sb->s_maxbytes - inode->i_size) { ret = -EFBIG; goto out_mutex; } /* Offset must be less than i_size */ if (offset >= inode->i_size) { ret = -EINVAL; goto out_mutex; } /* Wait for existing dio to complete */ inode_dio_wait(inode); ret = file_modified(file); if (ret) goto out_mutex; /* * Prevent page faults from reinstantiating pages we have released from * page cache. */ filemap_invalidate_lock(mapping); ret = ext4_break_layouts(inode); if (ret) goto out_mmap; /* * Need to round down to align start offset to page size boundary * for page size > block size. */ ioffset = round_down(offset, PAGE_SIZE); /* Write out all dirty pages */ ret = filemap_write_and_wait_range(inode->i_mapping, ioffset, LLONG_MAX); if (ret) goto out_mmap; truncate_pagecache(inode, ioffset); credits = ext4_writepage_trans_blocks(inode); handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); if (IS_ERR(handle)) { ret = PTR_ERR(handle); goto out_mmap; } ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_FALLOC_RANGE, handle); /* Expand file to avoid data loss if there is error while shifting */ inode->i_size += len; EXT4_I(inode)->i_disksize += len; inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); ret = ext4_mark_inode_dirty(handle, inode); if (ret) goto out_stop; down_write(&EXT4_I(inode)->i_data_sem); ext4_discard_preallocations(inode, 0); path = ext4_find_extent(inode, offset_lblk, NULL, 0); if (IS_ERR(path)) { up_write(&EXT4_I(inode)->i_data_sem); goto out_stop; } depth = ext_depth(inode); extent = path[depth].p_ext; if (extent) { ee_start_lblk = le32_to_cpu(extent->ee_block); ee_len = ext4_ext_get_actual_len(extent); /* * If offset_lblk is not the starting block of extent, split * the extent @offset_lblk */ if ((offset_lblk > ee_start_lblk) && (offset_lblk < (ee_start_lblk + ee_len))) { if (ext4_ext_is_unwritten(extent)) split_flag = EXT4_EXT_MARK_UNWRIT1 | EXT4_EXT_MARK_UNWRIT2; ret = ext4_split_extent_at(handle, inode, &path, offset_lblk, split_flag, EXT4_EX_NOCACHE | EXT4_GET_BLOCKS_PRE_IO | EXT4_GET_BLOCKS_METADATA_NOFAIL); } ext4_free_ext_path(path); if (ret < 0) { up_write(&EXT4_I(inode)->i_data_sem); goto out_stop; } } else { ext4_free_ext_path(path); } ext4_es_remove_extent(inode, offset_lblk, EXT_MAX_BLOCKS - offset_lblk); /* * if offset_lblk lies in a hole which is at start of file, use * ee_start_lblk to shift extents */ ret = ext4_ext_shift_extents(inode, handle, max(ee_start_lblk, offset_lblk), len_lblk, SHIFT_RIGHT); up_write(&EXT4_I(inode)->i_data_sem); if (IS_SYNC(inode)) ext4_handle_sync(handle); if (ret >= 0) ext4_update_inode_fsync_trans(handle, inode, 1); out_stop: ext4_journal_stop(handle); out_mmap: filemap_invalidate_unlock(mapping); out_mutex: inode_unlock(inode); return ret; } /** * ext4_swap_extents() - Swap extents between two inodes * @handle: handle for this transaction * @inode1: First inode * @inode2: Second inode * @lblk1: Start block for first inode * @lblk2: Start block for second inode * @count: Number of blocks to swap * @unwritten: Mark second inode's extents as unwritten after swap * @erp: Pointer to save error value * * This helper routine does exactly what is promise "swap extents". All other * stuff such as page-cache locking consistency, bh mapping consistency or * extent's data copying must be performed by caller. * Locking: * i_rwsem is held for both inodes * i_data_sem is locked for write for both inodes * Assumptions: * All pages from requested range are locked for both inodes */ int ext4_swap_extents(handle_t *handle, struct inode *inode1, struct inode *inode2, ext4_lblk_t lblk1, ext4_lblk_t lblk2, ext4_lblk_t count, int unwritten, int *erp) { struct ext4_ext_path *path1 = NULL; struct ext4_ext_path *path2 = NULL; int replaced_count = 0; BUG_ON(!rwsem_is_locked(&EXT4_I(inode1)->i_data_sem)); BUG_ON(!rwsem_is_locked(&EXT4_I(inode2)->i_data_sem)); BUG_ON(!inode_is_locked(inode1)); BUG_ON(!inode_is_locked(inode2)); ext4_es_remove_extent(inode1, lblk1, count); ext4_es_remove_extent(inode2, lblk2, count); while (count) { struct ext4_extent *ex1, *ex2, tmp_ex; ext4_lblk_t e1_blk, e2_blk; int e1_len, e2_len, len; int split = 0; path1 = ext4_find_extent(inode1, lblk1, NULL, EXT4_EX_NOCACHE); if (IS_ERR(path1)) { *erp = PTR_ERR(path1); path1 = NULL; finish: count = 0; goto repeat; } path2 = ext4_find_extent(inode2, lblk2, NULL, EXT4_EX_NOCACHE); if (IS_ERR(path2)) { *erp = PTR_ERR(path2); path2 = NULL; goto finish; } ex1 = path1[path1->p_depth].p_ext; ex2 = path2[path2->p_depth].p_ext; /* Do we have something to swap ? */ if (unlikely(!ex2 || !ex1)) goto finish; e1_blk = le32_to_cpu(ex1->ee_block); e2_blk = le32_to_cpu(ex2->ee_block); e1_len = ext4_ext_get_actual_len(ex1); e2_len = ext4_ext_get_actual_len(ex2); /* Hole handling */ if (!in_range(lblk1, e1_blk, e1_len) || !in_range(lblk2, e2_blk, e2_len)) { ext4_lblk_t next1, next2; /* if hole after extent, then go to next extent */ next1 = ext4_ext_next_allocated_block(path1); next2 = ext4_ext_next_allocated_block(path2); /* If hole before extent, then shift to that extent */ if (e1_blk > lblk1) next1 = e1_blk; if (e2_blk > lblk2) next2 = e2_blk; /* Do we have something to swap */ if (next1 == EXT_MAX_BLOCKS || next2 == EXT_MAX_BLOCKS) goto finish; /* Move to the rightest boundary */ len = next1 - lblk1; if (len < next2 - lblk2) len = next2 - lblk2; if (len > count) len = count; lblk1 += len; lblk2 += len; count -= len; goto repeat; } /* Prepare left boundary */ if (e1_blk < lblk1) { split = 1; *erp = ext4_force_split_extent_at(handle, inode1, &path1, lblk1, 0); if (unlikely(*erp)) goto finish; } if (e2_blk < lblk2) { split = 1; *erp = ext4_force_split_extent_at(handle, inode2, &path2, lblk2, 0); if (unlikely(*erp)) goto finish; } /* ext4_split_extent_at() may result in leaf extent split, * path must to be revalidated. */ if (split) goto repeat; /* Prepare right boundary */ len = count; if (len > e1_blk + e1_len - lblk1) len = e1_blk + e1_len - lblk1; if (len > e2_blk + e2_len - lblk2) len = e2_blk + e2_len - lblk2; if (len != e1_len) { split = 1; *erp = ext4_force_split_extent_at(handle, inode1, &path1, lblk1 + len, 0); if (unlikely(*erp)) goto finish; } if (len != e2_len) { split = 1; *erp = ext4_force_split_extent_at(handle, inode2, &path2, lblk2 + len, 0); if (*erp) goto finish; } /* ext4_split_extent_at() may result in leaf extent split, * path must to be revalidated. */ if (split) goto repeat; BUG_ON(e2_len != e1_len); *erp = ext4_ext_get_access(handle, inode1, path1 + path1->p_depth); if (unlikely(*erp)) goto finish; *erp = ext4_ext_get_access(handle, inode2, path2 + path2->p_depth); if (unlikely(*erp)) goto finish; /* Both extents are fully inside boundaries. Swap it now */ tmp_ex = *ex1; ext4_ext_store_pblock(ex1, ext4_ext_pblock(ex2)); ext4_ext_store_pblock(ex2, ext4_ext_pblock(&tmp_ex)); ex1->ee_len = cpu_to_le16(e2_len); ex2->ee_len = cpu_to_le16(e1_len); if (unwritten) ext4_ext_mark_unwritten(ex2); if (ext4_ext_is_unwritten(&tmp_ex)) ext4_ext_mark_unwritten(ex1); ext4_ext_try_to_merge(handle, inode2, path2, ex2); ext4_ext_try_to_merge(handle, inode1, path1, ex1); *erp = ext4_ext_dirty(handle, inode2, path2 + path2->p_depth); if (unlikely(*erp)) goto finish; *erp = ext4_ext_dirty(handle, inode1, path1 + path1->p_depth); /* * Looks scarry ah..? second inode already points to new blocks, * and it was successfully dirtied. But luckily error may happen * only due to journal error, so full transaction will be * aborted anyway. */ if (unlikely(*erp)) goto finish; lblk1 += len; lblk2 += len; replaced_count += len; count -= len; repeat: ext4_free_ext_path(path1); ext4_free_ext_path(path2); path1 = path2 = NULL; } return replaced_count; } /* * ext4_clu_mapped - determine whether any block in a logical cluster has * been mapped to a physical cluster * * @inode - file containing the logical cluster * @lclu - logical cluster of interest * * Returns 1 if any block in the logical cluster is mapped, signifying * that a physical cluster has been allocated for it. Otherwise, * returns 0. Can also return negative error codes. Derived from * ext4_ext_map_blocks(). */ int ext4_clu_mapped(struct inode *inode, ext4_lblk_t lclu) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_ext_path *path; int depth, mapped = 0, err = 0; struct ext4_extent *extent; ext4_lblk_t first_lblk, first_lclu, last_lclu; /* * if data can be stored inline, the logical cluster isn't * mapped - no physical clusters have been allocated, and the * file has no extents */ if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA) || ext4_has_inline_data(inode)) return 0; /* search for the extent closest to the first block in the cluster */ path = ext4_find_extent(inode, EXT4_C2B(sbi, lclu), NULL, 0); if (IS_ERR(path)) { err = PTR_ERR(path); path = NULL; goto out; } depth = ext_depth(inode); /* * A consistent leaf must not be empty. This situation is possible, * though, _during_ tree modification, and it's why an assert can't * be put in ext4_find_extent(). */ if (unlikely(path[depth].p_ext == NULL && depth != 0)) { EXT4_ERROR_INODE(inode, "bad extent address - lblock: %lu, depth: %d, pblock: %lld", (unsigned long) EXT4_C2B(sbi, lclu), depth, path[depth].p_block); err = -EFSCORRUPTED; goto out; } extent = path[depth].p_ext; /* can't be mapped if the extent tree is empty */ if (extent == NULL) goto out; first_lblk = le32_to_cpu(extent->ee_block); first_lclu = EXT4_B2C(sbi, first_lblk); /* * Three possible outcomes at this point - found extent spanning * the target cluster, to the left of the target cluster, or to the * right of the target cluster. The first two cases are handled here. * The last case indicates the target cluster is not mapped. */ if (lclu >= first_lclu) { last_lclu = EXT4_B2C(sbi, first_lblk + ext4_ext_get_actual_len(extent) - 1); if (lclu <= last_lclu) { mapped = 1; } else { first_lblk = ext4_ext_next_allocated_block(path); first_lclu = EXT4_B2C(sbi, first_lblk); if (lclu == first_lclu) mapped = 1; } } out: ext4_free_ext_path(path); return err ? err : mapped; } /* * Updates physical block address and unwritten status of extent * starting at lblk start and of len. If such an extent doesn't exist, * this function splits the extent tree appropriately to create an * extent like this. This function is called in the fast commit * replay path. Returns 0 on success and error on failure. */ int ext4_ext_replay_update_ex(struct inode *inode, ext4_lblk_t start, int len, int unwritten, ext4_fsblk_t pblk) { struct ext4_ext_path *path = NULL, *ppath; struct ext4_extent *ex; int ret; path = ext4_find_extent(inode, start, NULL, 0); if (IS_ERR(path)) return PTR_ERR(path); ex = path[path->p_depth].p_ext; if (!ex) { ret = -EFSCORRUPTED; goto out; } if (le32_to_cpu(ex->ee_block) != start || ext4_ext_get_actual_len(ex) != len) { /* We need to split this extent to match our extent first */ ppath = path; down_write(&EXT4_I(inode)->i_data_sem); ret = ext4_force_split_extent_at(NULL, inode, &ppath, start, 1); up_write(&EXT4_I(inode)->i_data_sem); if (ret) goto out; kfree(path); path = ext4_find_extent(inode, start, NULL, 0); if (IS_ERR(path)) return -1; ppath = path; ex = path[path->p_depth].p_ext; WARN_ON(le32_to_cpu(ex->ee_block) != start); if (ext4_ext_get_actual_len(ex) != len) { down_write(&EXT4_I(inode)->i_data_sem); ret = ext4_force_split_extent_at(NULL, inode, &ppath, start + len, 1); up_write(&EXT4_I(inode)->i_data_sem); if (ret) goto out; kfree(path); path = ext4_find_extent(inode, start, NULL, 0); if (IS_ERR(path)) return -EINVAL; ex = path[path->p_depth].p_ext; } } if (unwritten) ext4_ext_mark_unwritten(ex); else ext4_ext_mark_initialized(ex); ext4_ext_store_pblock(ex, pblk); down_write(&EXT4_I(inode)->i_data_sem); ret = ext4_ext_dirty(NULL, inode, &path[path->p_depth]); up_write(&EXT4_I(inode)->i_data_sem); out: ext4_free_ext_path(path); ext4_mark_inode_dirty(NULL, inode); return ret; } /* Try to shrink the extent tree */ void ext4_ext_replay_shrink_inode(struct inode *inode, ext4_lblk_t end) { struct ext4_ext_path *path = NULL; struct ext4_extent *ex; ext4_lblk_t old_cur, cur = 0; while (cur < end) { path = ext4_find_extent(inode, cur, NULL, 0); if (IS_ERR(path)) return; ex = path[path->p_depth].p_ext; if (!ex) { ext4_free_ext_path(path); ext4_mark_inode_dirty(NULL, inode); return; } old_cur = cur; cur = le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex); if (cur <= old_cur) cur = old_cur + 1; ext4_ext_try_to_merge(NULL, inode, path, ex); down_write(&EXT4_I(inode)->i_data_sem); ext4_ext_dirty(NULL, inode, &path[path->p_depth]); up_write(&EXT4_I(inode)->i_data_sem); ext4_mark_inode_dirty(NULL, inode); ext4_free_ext_path(path); } } /* Check if *cur is a hole and if it is, skip it */ static int skip_hole(struct inode *inode, ext4_lblk_t *cur) { int ret; struct ext4_map_blocks map; map.m_lblk = *cur; map.m_len = ((inode->i_size) >> inode->i_sb->s_blocksize_bits) - *cur; ret = ext4_map_blocks(NULL, inode, &map, 0); if (ret < 0) return ret; if (ret != 0) return 0; *cur = *cur + map.m_len; return 0; } /* Count number of blocks used by this inode and update i_blocks */ int ext4_ext_replay_set_iblocks(struct inode *inode) { struct ext4_ext_path *path = NULL, *path2 = NULL; struct ext4_extent *ex; ext4_lblk_t cur = 0, end; int numblks = 0, i, ret = 0; ext4_fsblk_t cmp1, cmp2; struct ext4_map_blocks map; /* Determin the size of the file first */ path = ext4_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL, EXT4_EX_NOCACHE); if (IS_ERR(path)) return PTR_ERR(path); ex = path[path->p_depth].p_ext; if (!ex) { ext4_free_ext_path(path); goto out; } end = le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex); ext4_free_ext_path(path); /* Count the number of data blocks */ cur = 0; while (cur < end) { map.m_lblk = cur; map.m_len = end - cur; ret = ext4_map_blocks(NULL, inode, &map, 0); if (ret < 0) break; if (ret > 0) numblks += ret; cur = cur + map.m_len; } /* * Count the number of extent tree blocks. We do it by looking up * two successive extents and determining the difference between * their paths. When path is different for 2 successive extents * we compare the blocks in the path at each level and increment * iblocks by total number of differences found. */ cur = 0; ret = skip_hole(inode, &cur); if (ret < 0) goto out; path = ext4_find_extent(inode, cur, NULL, 0); if (IS_ERR(path)) goto out; numblks += path->p_depth; ext4_free_ext_path(path); while (cur < end) { path = ext4_find_extent(inode, cur, NULL, 0); if (IS_ERR(path)) break; ex = path[path->p_depth].p_ext; if (!ex) { ext4_free_ext_path(path); return 0; } cur = max(cur + 1, le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex)); ret = skip_hole(inode, &cur); if (ret < 0) { ext4_free_ext_path(path); break; } path2 = ext4_find_extent(inode, cur, NULL, 0); if (IS_ERR(path2)) { ext4_free_ext_path(path); break; } for (i = 0; i <= max(path->p_depth, path2->p_depth); i++) { cmp1 = cmp2 = 0; if (i <= path->p_depth) cmp1 = path[i].p_bh ? path[i].p_bh->b_blocknr : 0; if (i <= path2->p_depth) cmp2 = path2[i].p_bh ? path2[i].p_bh->b_blocknr : 0; if (cmp1 != cmp2 && cmp2 != 0) numblks++; } ext4_free_ext_path(path); ext4_free_ext_path(path2); } out: inode->i_blocks = numblks << (inode->i_sb->s_blocksize_bits - 9); ext4_mark_inode_dirty(NULL, inode); return 0; } int ext4_ext_clear_bb(struct inode *inode) { struct ext4_ext_path *path = NULL; struct ext4_extent *ex; ext4_lblk_t cur = 0, end; int j, ret = 0; struct ext4_map_blocks map; if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA)) return 0; /* Determin the size of the file first */ path = ext4_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL, EXT4_EX_NOCACHE); if (IS_ERR(path)) return PTR_ERR(path); ex = path[path->p_depth].p_ext; if (!ex) { ext4_free_ext_path(path); return 0; } end = le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex); ext4_free_ext_path(path); cur = 0; while (cur < end) { map.m_lblk = cur; map.m_len = end - cur; ret = ext4_map_blocks(NULL, inode, &map, 0); if (ret < 0) break; if (ret > 0) { path = ext4_find_extent(inode, map.m_lblk, NULL, 0); if (!IS_ERR_OR_NULL(path)) { for (j = 0; j < path->p_depth; j++) { ext4_mb_mark_bb(inode->i_sb, path[j].p_block, 1, false); ext4_fc_record_regions(inode->i_sb, inode->i_ino, 0, path[j].p_block, 1, 1); } ext4_free_ext_path(path); } ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, false); ext4_fc_record_regions(inode->i_sb, inode->i_ino, map.m_lblk, map.m_pblk, map.m_len, 1); } cur = cur + map.m_len; } return 0; } |
| 631 628 631 628 44 14 14 14 14 14 14 15 2 2 1 1 1 1 1 1 14 15 15 15 14 11 11 38 11 11 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 | // SPDX-License-Identifier: GPL-2.0-only /* * kernel/freezer.c - Function to freeze a process * * Originally from kernel/power/process.c */ #include <linux/interrupt.h> #include <linux/suspend.h> #include <linux/export.h> #include <linux/syscalls.h> #include <linux/freezer.h> #include <linux/kthread.h> /* total number of freezing conditions in effect */ DEFINE_STATIC_KEY_FALSE(freezer_active); EXPORT_SYMBOL(freezer_active); /* * indicate whether PM freezing is in effect, protected by * system_transition_mutex */ bool pm_freezing; bool pm_nosig_freezing; /* protects freezing and frozen transitions */ static DEFINE_SPINLOCK(freezer_lock); /** * freezing_slow_path - slow path for testing whether a task needs to be frozen * @p: task to be tested * * This function is called by freezing() if freezer_active isn't zero * and tests whether @p needs to enter and stay in frozen state. Can be * called under any context. The freezers are responsible for ensuring the * target tasks see the updated state. */ bool freezing_slow_path(struct task_struct *p) { if (p->flags & (PF_NOFREEZE | PF_SUSPEND_TASK)) return false; if (test_tsk_thread_flag(p, TIF_MEMDIE)) return false; if (pm_nosig_freezing || cgroup_freezing(p)) return true; if (pm_freezing && !(p->flags & PF_KTHREAD)) return true; return false; } EXPORT_SYMBOL(freezing_slow_path); bool frozen(struct task_struct *p) { return READ_ONCE(p->__state) & TASK_FROZEN; } /* Refrigerator is place where frozen processes are stored :-). */ bool __refrigerator(bool check_kthr_stop) { unsigned int state = get_current_state(); bool was_frozen = false; pr_debug("%s entered refrigerator\n", current->comm); WARN_ON_ONCE(state && !(state & TASK_NORMAL)); for (;;) { bool freeze; raw_spin_lock_irq(¤t->pi_lock); set_current_state(TASK_FROZEN); /* unstale saved_state so that __thaw_task() will wake us up */ current->saved_state = TASK_RUNNING; raw_spin_unlock_irq(¤t->pi_lock); spin_lock_irq(&freezer_lock); freeze = freezing(current) && !(check_kthr_stop && kthread_should_stop()); spin_unlock_irq(&freezer_lock); if (!freeze) break; was_frozen = true; schedule(); } __set_current_state(TASK_RUNNING); pr_debug("%s left refrigerator\n", current->comm); return was_frozen; } EXPORT_SYMBOL(__refrigerator); static void fake_signal_wake_up(struct task_struct *p) { unsigned long flags; if (lock_task_sighand(p, &flags)) { signal_wake_up(p, 0); unlock_task_sighand(p, &flags); } } static int __set_task_frozen(struct task_struct *p, void *arg) { unsigned int state = READ_ONCE(p->__state); if (p->on_rq) return 0; if (p != current && task_curr(p)) return 0; if (!(state & (TASK_FREEZABLE | __TASK_STOPPED | __TASK_TRACED))) return 0; /* * Only TASK_NORMAL can be augmented with TASK_FREEZABLE, since they * can suffer spurious wakeups. */ if (state & TASK_FREEZABLE) WARN_ON_ONCE(!(state & TASK_NORMAL)); #ifdef CONFIG_LOCKDEP /* * It's dangerous to freeze with locks held; there be dragons there. */ if (!(state & __TASK_FREEZABLE_UNSAFE)) WARN_ON_ONCE(debug_locks && p->lockdep_depth); #endif p->saved_state = p->__state; WRITE_ONCE(p->__state, TASK_FROZEN); return TASK_FROZEN; } static bool __freeze_task(struct task_struct *p) { /* TASK_FREEZABLE|TASK_STOPPED|TASK_TRACED -> TASK_FROZEN */ return task_call_func(p, __set_task_frozen, NULL); } /** * freeze_task - send a freeze request to given task * @p: task to send the request to * * If @p is freezing, the freeze request is sent either by sending a fake * signal (if it's not a kernel thread) or waking it up (if it's a kernel * thread). * * RETURNS: * %false, if @p is not freezing or already frozen; %true, otherwise */ bool freeze_task(struct task_struct *p) { unsigned long flags; spin_lock_irqsave(&freezer_lock, flags); if (!freezing(p) || frozen(p) || __freeze_task(p)) { spin_unlock_irqrestore(&freezer_lock, flags); return false; } if (!(p->flags & PF_KTHREAD)) fake_signal_wake_up(p); else wake_up_state(p, TASK_NORMAL); spin_unlock_irqrestore(&freezer_lock, flags); return true; } /* * Restore the saved_state before the task entered freezer. For typical task * in the __refrigerator(), saved_state == TASK_RUNNING so nothing happens * here. For tasks which were TASK_NORMAL | TASK_FREEZABLE, their initial state * is restored unless they got an expected wakeup (see ttwu_state_match()). * Returns 1 if the task state was restored. */ static int __restore_freezer_state(struct task_struct *p, void *arg) { unsigned int state = p->saved_state; if (state != TASK_RUNNING) { WRITE_ONCE(p->__state, state); p->saved_state = TASK_RUNNING; return 1; } return 0; } void __thaw_task(struct task_struct *p) { unsigned long flags; spin_lock_irqsave(&freezer_lock, flags); if (WARN_ON_ONCE(freezing(p))) goto unlock; if (!frozen(p) || task_call_func(p, __restore_freezer_state, NULL)) goto unlock; wake_up_state(p, TASK_FROZEN); unlock: spin_unlock_irqrestore(&freezer_lock, flags); } /** * set_freezable - make %current freezable * * Mark %current freezable and enter refrigerator if necessary. */ bool set_freezable(void) { might_sleep(); /* * Modify flags while holding freezer_lock. This ensures the * freezer notices that we aren't frozen yet or the freezing * condition is visible to try_to_freeze() below. */ spin_lock_irq(&freezer_lock); current->flags &= ~PF_NOFREEZE; spin_unlock_irq(&freezer_lock); return try_to_freeze(); } EXPORT_SYMBOL(set_freezable); |
| 18771 18770 18768 12424 12423 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 | // SPDX-License-Identifier: GPL-2.0 /* * trace context switch * * Copyright (C) 2007 Steven Rostedt <srostedt@redhat.com> * */ #include <linux/module.h> #include <linux/kallsyms.h> #include <linux/uaccess.h> #include <linux/ftrace.h> #include <trace/events/sched.h> #include "trace.h" #define RECORD_CMDLINE 1 #define RECORD_TGID 2 static int sched_cmdline_ref; static int sched_tgid_ref; static DEFINE_MUTEX(sched_register_mutex); static void probe_sched_switch(void *ignore, bool preempt, struct task_struct *prev, struct task_struct *next, unsigned int prev_state) { int flags; flags = (RECORD_TGID * !!sched_tgid_ref) + (RECORD_CMDLINE * !!sched_cmdline_ref); if (!flags) return; tracing_record_taskinfo_sched_switch(prev, next, flags); } static void probe_sched_wakeup(void *ignore, struct task_struct *wakee) { int flags; flags = (RECORD_TGID * !!sched_tgid_ref) + (RECORD_CMDLINE * !!sched_cmdline_ref); if (!flags) return; tracing_record_taskinfo_sched_switch(current, wakee, flags); } static int tracing_sched_register(void) { int ret; ret = register_trace_sched_wakeup(probe_sched_wakeup, NULL); if (ret) { pr_info("wakeup trace: Couldn't activate tracepoint" " probe to kernel_sched_wakeup\n"); return ret; } ret = register_trace_sched_wakeup_new(probe_sched_wakeup, NULL); if (ret) { pr_info("wakeup trace: Couldn't activate tracepoint" " probe to kernel_sched_wakeup_new\n"); goto fail_deprobe; } ret = register_trace_sched_switch(probe_sched_switch, NULL); if (ret) { pr_info("sched trace: Couldn't activate tracepoint" " probe to kernel_sched_switch\n"); goto fail_deprobe_wake_new; } return ret; fail_deprobe_wake_new: unregister_trace_sched_wakeup_new(probe_sched_wakeup, NULL); fail_deprobe: unregister_trace_sched_wakeup(probe_sched_wakeup, NULL); return ret; } static void tracing_sched_unregister(void) { unregister_trace_sched_switch(probe_sched_switch, NULL); unregister_trace_sched_wakeup_new(probe_sched_wakeup, NULL); unregister_trace_sched_wakeup(probe_sched_wakeup, NULL); } static void tracing_start_sched_switch(int ops) { bool sched_register; mutex_lock(&sched_register_mutex); sched_register = (!sched_cmdline_ref && !sched_tgid_ref); switch (ops) { case RECORD_CMDLINE: sched_cmdline_ref++; break; case RECORD_TGID: sched_tgid_ref++; break; } if (sched_register && (sched_cmdline_ref || sched_tgid_ref)) tracing_sched_register(); mutex_unlock(&sched_register_mutex); } static void tracing_stop_sched_switch(int ops) { mutex_lock(&sched_register_mutex); switch (ops) { case RECORD_CMDLINE: sched_cmdline_ref--; break; case RECORD_TGID: sched_tgid_ref--; break; } if (!sched_cmdline_ref && !sched_tgid_ref) tracing_sched_unregister(); mutex_unlock(&sched_register_mutex); } void tracing_start_cmdline_record(void) { tracing_start_sched_switch(RECORD_CMDLINE); } void tracing_stop_cmdline_record(void) { tracing_stop_sched_switch(RECORD_CMDLINE); } void tracing_start_tgid_record(void) { tracing_start_sched_switch(RECORD_TGID); } void tracing_stop_tgid_record(void) { tracing_stop_sched_switch(RECORD_TGID); } |
| 25 25 93 79 70 94 70 95 129 129 6 129 129 66 79 79 70 19 79 115 46 11 17 33 46 143 93 10 93 93 27 93 62 93 112 79 1 80 80 79 46 143 80 97 97 14 12 66 64 66 35 115 115 115 115 98 115 115 115 115 25 3 5 25 28 14 28 28 143 115 25 25 25 25 25 24 24 143 123 123 123 123 2 123 123 35 150 27 129 129 129 16 16 16 16 16 145 145 16 16 145 129 129 129 145 12 3 9 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 | // SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2016 Facebook */ #include <linux/cpumask.h> #include <linux/spinlock.h> #include <linux/percpu.h> #include "bpf_lru_list.h" #define LOCAL_FREE_TARGET (128) #define LOCAL_NR_SCANS LOCAL_FREE_TARGET #define PERCPU_FREE_TARGET (4) #define PERCPU_NR_SCANS PERCPU_FREE_TARGET /* Helpers to get the local list index */ #define LOCAL_LIST_IDX(t) ((t) - BPF_LOCAL_LIST_T_OFFSET) #define LOCAL_FREE_LIST_IDX LOCAL_LIST_IDX(BPF_LRU_LOCAL_LIST_T_FREE) #define LOCAL_PENDING_LIST_IDX LOCAL_LIST_IDX(BPF_LRU_LOCAL_LIST_T_PENDING) #define IS_LOCAL_LIST_TYPE(t) ((t) >= BPF_LOCAL_LIST_T_OFFSET) static int get_next_cpu(int cpu) { cpu = cpumask_next(cpu, cpu_possible_mask); if (cpu >= nr_cpu_ids) cpu = cpumask_first(cpu_possible_mask); return cpu; } /* Local list helpers */ static struct list_head *local_free_list(struct bpf_lru_locallist *loc_l) { return &loc_l->lists[LOCAL_FREE_LIST_IDX]; } static struct list_head *local_pending_list(struct bpf_lru_locallist *loc_l) { return &loc_l->lists[LOCAL_PENDING_LIST_IDX]; } /* bpf_lru_node helpers */ static bool bpf_lru_node_is_ref(const struct bpf_lru_node *node) { return READ_ONCE(node->ref); } static void bpf_lru_node_clear_ref(struct bpf_lru_node *node) { WRITE_ONCE(node->ref, 0); } static void bpf_lru_list_count_inc(struct bpf_lru_list *l, enum bpf_lru_list_type type) { if (type < NR_BPF_LRU_LIST_COUNT) l->counts[type]++; } static void bpf_lru_list_count_dec(struct bpf_lru_list *l, enum bpf_lru_list_type type) { if (type < NR_BPF_LRU_LIST_COUNT) l->counts[type]--; } static void __bpf_lru_node_move_to_free(struct bpf_lru_list *l, struct bpf_lru_node *node, struct list_head *free_list, enum bpf_lru_list_type tgt_free_type) { if (WARN_ON_ONCE(IS_LOCAL_LIST_TYPE(node->type))) return; /* If the removing node is the next_inactive_rotation candidate, * move the next_inactive_rotation pointer also. */ if (&node->list == l->next_inactive_rotation) l->next_inactive_rotation = l->next_inactive_rotation->prev; bpf_lru_list_count_dec(l, node->type); node->type = tgt_free_type; list_move(&node->list, free_list); } /* Move nodes from local list to the LRU list */ static void __bpf_lru_node_move_in(struct bpf_lru_list *l, struct bpf_lru_node *node, enum bpf_lru_list_type tgt_type) { if (WARN_ON_ONCE(!IS_LOCAL_LIST_TYPE(node->type)) || WARN_ON_ONCE(IS_LOCAL_LIST_TYPE(tgt_type))) return; bpf_lru_list_count_inc(l, tgt_type); node->type = tgt_type; bpf_lru_node_clear_ref(node); list_move(&node->list, &l->lists[tgt_type]); } /* Move nodes between or within active and inactive list (like * active to inactive, inactive to active or tail of active back to * the head of active). */ static void __bpf_lru_node_move(struct bpf_lru_list *l, struct bpf_lru_node *node, enum bpf_lru_list_type tgt_type) { if (WARN_ON_ONCE(IS_LOCAL_LIST_TYPE(node->type)) || WARN_ON_ONCE(IS_LOCAL_LIST_TYPE(tgt_type))) return; if (node->type != tgt_type) { bpf_lru_list_count_dec(l, node->type); bpf_lru_list_count_inc(l, tgt_type); node->type = tgt_type; } bpf_lru_node_clear_ref(node); /* If the moving node is the next_inactive_rotation candidate, * move the next_inactive_rotation pointer also. */ if (&node->list == l->next_inactive_rotation) l->next_inactive_rotation = l->next_inactive_rotation->prev; list_move(&node->list, &l->lists[tgt_type]); } static bool bpf_lru_list_inactive_low(const struct bpf_lru_list *l) { return l->counts[BPF_LRU_LIST_T_INACTIVE] < l->counts[BPF_LRU_LIST_T_ACTIVE]; } /* Rotate the active list: * 1. Start from tail * 2. If the node has the ref bit set, it will be rotated * back to the head of active list with the ref bit cleared. * Give this node one more chance to survive in the active list. * 3. If the ref bit is not set, move it to the head of the * inactive list. * 4. It will at most scan nr_scans nodes */ static void __bpf_lru_list_rotate_active(struct bpf_lru *lru, struct bpf_lru_list *l) { struct list_head *active = &l->lists[BPF_LRU_LIST_T_ACTIVE]; struct bpf_lru_node *node, *tmp_node, *first_node; unsigned int i = 0; first_node = list_first_entry(active, struct bpf_lru_node, list); list_for_each_entry_safe_reverse(node, tmp_node, active, list) { if (bpf_lru_node_is_ref(node)) __bpf_lru_node_move(l, node, BPF_LRU_LIST_T_ACTIVE); else __bpf_lru_node_move(l, node, BPF_LRU_LIST_T_INACTIVE); if (++i == lru->nr_scans || node == first_node) break; } } /* Rotate the inactive list. It starts from the next_inactive_rotation * 1. If the node has ref bit set, it will be moved to the head * of active list with the ref bit cleared. * 2. If the node does not have ref bit set, it will leave it * at its current location (i.e. do nothing) so that it can * be considered during the next inactive_shrink. * 3. It will at most scan nr_scans nodes */ static void __bpf_lru_list_rotate_inactive(struct bpf_lru *lru, struct bpf_lru_list *l) { struct list_head *inactive = &l->lists[BPF_LRU_LIST_T_INACTIVE]; struct list_head *cur, *last, *next = inactive; struct bpf_lru_node *node; unsigned int i = 0; if (list_empty(inactive)) return; last = l->next_inactive_rotation->next; if (last == inactive) last = last->next; cur = l->next_inactive_rotation; while (i < lru->nr_scans) { if (cur == inactive) { cur = cur->prev; continue; } node = list_entry(cur, struct bpf_lru_node, list); next = cur->prev; if (bpf_lru_node_is_ref(node)) __bpf_lru_node_move(l, node, BPF_LRU_LIST_T_ACTIVE); if (cur == last) break; cur = next; i++; } l->next_inactive_rotation = next; } /* Shrink the inactive list. It starts from the tail of the * inactive list and only move the nodes without the ref bit * set to the designated free list. */ static unsigned int __bpf_lru_list_shrink_inactive(struct bpf_lru *lru, struct bpf_lru_list *l, unsigned int tgt_nshrink, struct list_head *free_list, enum bpf_lru_list_type tgt_free_type) { struct list_head *inactive = &l->lists[BPF_LRU_LIST_T_INACTIVE]; struct bpf_lru_node *node, *tmp_node; unsigned int nshrinked = 0; unsigned int i = 0; list_for_each_entry_safe_reverse(node, tmp_node, inactive, list) { if (bpf_lru_node_is_ref(node)) { __bpf_lru_node_move(l, node, BPF_LRU_LIST_T_ACTIVE); } else if (lru->del_from_htab(lru->del_arg, node)) { __bpf_lru_node_move_to_free(l, node, free_list, tgt_free_type); if (++nshrinked == tgt_nshrink) break; } if (++i == lru->nr_scans) break; } return nshrinked; } /* 1. Rotate the active list (if needed) * 2. Always rotate the inactive list */ static void __bpf_lru_list_rotate(struct bpf_lru *lru, struct bpf_lru_list *l) { if (bpf_lru_list_inactive_low(l)) __bpf_lru_list_rotate_active(lru, l); __bpf_lru_list_rotate_inactive(lru, l); } /* Calls __bpf_lru_list_shrink_inactive() to shrink some * ref-bit-cleared nodes and move them to the designated * free list. * * If it cannot get a free node after calling * __bpf_lru_list_shrink_inactive(). It will just remove * one node from either inactive or active list without * honoring the ref-bit. It prefers inactive list to active * list in this situation. */ static unsigned int __bpf_lru_list_shrink(struct bpf_lru *lru, struct bpf_lru_list *l, unsigned int tgt_nshrink, struct list_head *free_list, enum bpf_lru_list_type tgt_free_type) { struct bpf_lru_node *node, *tmp_node; struct list_head *force_shrink_list; unsigned int nshrinked; nshrinked = __bpf_lru_list_shrink_inactive(lru, l, tgt_nshrink, free_list, tgt_free_type); if (nshrinked) return nshrinked; /* Do a force shrink by ignoring the reference bit */ if (!list_empty(&l->lists[BPF_LRU_LIST_T_INACTIVE])) force_shrink_list = &l->lists[BPF_LRU_LIST_T_INACTIVE]; else force_shrink_list = &l->lists[BPF_LRU_LIST_T_ACTIVE]; list_for_each_entry_safe_reverse(node, tmp_node, force_shrink_list, list) { if (lru->del_from_htab(lru->del_arg, node)) { __bpf_lru_node_move_to_free(l, node, free_list, tgt_free_type); return 1; } } return 0; } /* Flush the nodes from the local pending list to the LRU list */ static void __local_list_flush(struct bpf_lru_list *l, struct bpf_lru_locallist *loc_l) { struct bpf_lru_node *node, *tmp_node; list_for_each_entry_safe_reverse(node, tmp_node, local_pending_list(loc_l), list) { if (bpf_lru_node_is_ref(node)) __bpf_lru_node_move_in(l, node, BPF_LRU_LIST_T_ACTIVE); else __bpf_lru_node_move_in(l, node, BPF_LRU_LIST_T_INACTIVE); } } static void bpf_lru_list_push_free(struct bpf_lru_list *l, struct bpf_lru_node *node) { unsigned long flags; if (WARN_ON_ONCE(IS_LOCAL_LIST_TYPE(node->type))) return; raw_spin_lock_irqsave(&l->lock, flags); __bpf_lru_node_move(l, node, BPF_LRU_LIST_T_FREE); raw_spin_unlock_irqrestore(&l->lock, flags); } static void bpf_lru_list_pop_free_to_local(struct bpf_lru *lru, struct bpf_lru_locallist *loc_l) { struct bpf_lru_list *l = &lru->common_lru.lru_list; struct bpf_lru_node *node, *tmp_node; unsigned int nfree = 0; raw_spin_lock(&l->lock); __local_list_flush(l, loc_l); __bpf_lru_list_rotate(lru, l); list_for_each_entry_safe(node, tmp_node, &l->lists[BPF_LRU_LIST_T_FREE], list) { __bpf_lru_node_move_to_free(l, node, local_free_list(loc_l), BPF_LRU_LOCAL_LIST_T_FREE); if (++nfree == LOCAL_FREE_TARGET) break; } if (nfree < LOCAL_FREE_TARGET) __bpf_lru_list_shrink(lru, l, LOCAL_FREE_TARGET - nfree, local_free_list(loc_l), BPF_LRU_LOCAL_LIST_T_FREE); raw_spin_unlock(&l->lock); } static void __local_list_add_pending(struct bpf_lru *lru, struct bpf_lru_locallist *loc_l, int cpu, struct bpf_lru_node *node, u32 hash) { *(u32 *)((void *)node + lru->hash_offset) = hash; node->cpu = cpu; node->type = BPF_LRU_LOCAL_LIST_T_PENDING; bpf_lru_node_clear_ref(node); list_add(&node->list, local_pending_list(loc_l)); } static struct bpf_lru_node * __local_list_pop_free(struct bpf_lru_locallist *loc_l) { struct bpf_lru_node *node; node = list_first_entry_or_null(local_free_list(loc_l), struct bpf_lru_node, list); if (node) list_del(&node->list); return node; } static struct bpf_lru_node * __local_list_pop_pending(struct bpf_lru *lru, struct bpf_lru_locallist *loc_l) { struct bpf_lru_node *node; bool force = false; ignore_ref: /* Get from the tail (i.e. older element) of the pending list. */ list_for_each_entry_reverse(node, local_pending_list(loc_l), list) { if ((!bpf_lru_node_is_ref(node) || force) && lru->del_from_htab(lru->del_arg, node)) { list_del(&node->list); return node; } } if (!force) { force = true; goto ignore_ref; } return NULL; } static struct bpf_lru_node *bpf_percpu_lru_pop_free(struct bpf_lru *lru, u32 hash) { struct list_head *free_list; struct bpf_lru_node *node = NULL; struct bpf_lru_list *l; unsigned long flags; int cpu = raw_smp_processor_id(); l = per_cpu_ptr(lru->percpu_lru, cpu); raw_spin_lock_irqsave(&l->lock, flags); __bpf_lru_list_rotate(lru, l); free_list = &l->lists[BPF_LRU_LIST_T_FREE]; if (list_empty(free_list)) __bpf_lru_list_shrink(lru, l, PERCPU_FREE_TARGET, free_list, BPF_LRU_LIST_T_FREE); if (!list_empty(free_list)) { node = list_first_entry(free_list, struct bpf_lru_node, list); *(u32 *)((void *)node + lru->hash_offset) = hash; bpf_lru_node_clear_ref(node); __bpf_lru_node_move(l, node, BPF_LRU_LIST_T_INACTIVE); } raw_spin_unlock_irqrestore(&l->lock, flags); return node; } static struct bpf_lru_node *bpf_common_lru_pop_free(struct bpf_lru *lru, u32 hash) { struct bpf_lru_locallist *loc_l, *steal_loc_l; struct bpf_common_lru *clru = &lru->common_lru; struct bpf_lru_node *node; int steal, first_steal; unsigned long flags; int cpu = raw_smp_processor_id(); loc_l = per_cpu_ptr(clru->local_list, cpu); raw_spin_lock_irqsave(&loc_l->lock, flags); node = __local_list_pop_free(loc_l); if (!node) { bpf_lru_list_pop_free_to_local(lru, loc_l); node = __local_list_pop_free(loc_l); } if (node) __local_list_add_pending(lru, loc_l, cpu, node, hash); raw_spin_unlock_irqrestore(&loc_l->lock, flags); if (node) return node; /* No free nodes found from the local free list and * the global LRU list. * * Steal from the local free/pending list of the * current CPU and remote CPU in RR. It starts * with the loc_l->next_steal CPU. */ first_steal = loc_l->next_steal; steal = first_steal; do { steal_loc_l = per_cpu_ptr(clru->local_list, steal); raw_spin_lock_irqsave(&steal_loc_l->lock, flags); node = __local_list_pop_free(steal_loc_l); if (!node) node = __local_list_pop_pending(lru, steal_loc_l); raw_spin_unlock_irqrestore(&steal_loc_l->lock, flags); steal = get_next_cpu(steal); } while (!node && steal != first_steal); loc_l->next_steal = steal; if (node) { raw_spin_lock_irqsave(&loc_l->lock, flags); __local_list_add_pending(lru, loc_l, cpu, node, hash); raw_spin_unlock_irqrestore(&loc_l->lock, flags); } return node; } struct bpf_lru_node *bpf_lru_pop_free(struct bpf_lru *lru, u32 hash) { if (lru->percpu) return bpf_percpu_lru_pop_free(lru, hash); else return bpf_common_lru_pop_free(lru, hash); } static void bpf_common_lru_push_free(struct bpf_lru *lru, struct bpf_lru_node *node) { u8 node_type = READ_ONCE(node->type); unsigned long flags; if (WARN_ON_ONCE(node_type == BPF_LRU_LIST_T_FREE) || WARN_ON_ONCE(node_type == BPF_LRU_LOCAL_LIST_T_FREE)) return; if (node_type == BPF_LRU_LOCAL_LIST_T_PENDING) { struct bpf_lru_locallist *loc_l; loc_l = per_cpu_ptr(lru->common_lru.local_list, node->cpu); raw_spin_lock_irqsave(&loc_l->lock, flags); if (unlikely(node->type != BPF_LRU_LOCAL_LIST_T_PENDING)) { raw_spin_unlock_irqrestore(&loc_l->lock, flags); goto check_lru_list; } node->type = BPF_LRU_LOCAL_LIST_T_FREE; bpf_lru_node_clear_ref(node); list_move(&node->list, local_free_list(loc_l)); raw_spin_unlock_irqrestore(&loc_l->lock, flags); return; } check_lru_list: bpf_lru_list_push_free(&lru->common_lru.lru_list, node); } static void bpf_percpu_lru_push_free(struct bpf_lru *lru, struct bpf_lru_node *node) { struct bpf_lru_list *l; unsigned long flags; l = per_cpu_ptr(lru->percpu_lru, node->cpu); raw_spin_lock_irqsave(&l->lock, flags); __bpf_lru_node_move(l, node, BPF_LRU_LIST_T_FREE); raw_spin_unlock_irqrestore(&l->lock, flags); } void bpf_lru_push_free(struct bpf_lru *lru, struct bpf_lru_node *node) { if (lru->percpu) bpf_percpu_lru_push_free(lru, node); else bpf_common_lru_push_free(lru, node); } static void bpf_common_lru_populate(struct bpf_lru *lru, void *buf, u32 node_offset, u32 elem_size, u32 nr_elems) { struct bpf_lru_list *l = &lru->common_lru.lru_list; u32 i; for (i = 0; i < nr_elems; i++) { struct bpf_lru_node *node; node = (struct bpf_lru_node *)(buf + node_offset); node->type = BPF_LRU_LIST_T_FREE; bpf_lru_node_clear_ref(node); list_add(&node->list, &l->lists[BPF_LRU_LIST_T_FREE]); buf += elem_size; } } static void bpf_percpu_lru_populate(struct bpf_lru *lru, void *buf, u32 node_offset, u32 elem_size, u32 nr_elems) { u32 i, pcpu_entries; int cpu; struct bpf_lru_list *l; pcpu_entries = nr_elems / num_possible_cpus(); i = 0; for_each_possible_cpu(cpu) { struct bpf_lru_node *node; l = per_cpu_ptr(lru->percpu_lru, cpu); again: node = (struct bpf_lru_node *)(buf + node_offset); node->cpu = cpu; node->type = BPF_LRU_LIST_T_FREE; bpf_lru_node_clear_ref(node); list_add(&node->list, &l->lists[BPF_LRU_LIST_T_FREE]); i++; buf += elem_size; if (i == nr_elems) break; if (i % pcpu_entries) goto again; } } void bpf_lru_populate(struct bpf_lru *lru, void *buf, u32 node_offset, u32 elem_size, u32 nr_elems) { if (lru->percpu) bpf_percpu_lru_populate(lru, buf, node_offset, elem_size, nr_elems); else bpf_common_lru_populate(lru, buf, node_offset, elem_size, nr_elems); } static void bpf_lru_locallist_init(struct bpf_lru_locallist *loc_l, int cpu) { int i; for (i = 0; i < NR_BPF_LRU_LOCAL_LIST_T; i++) INIT_LIST_HEAD(&loc_l->lists[i]); loc_l->next_steal = cpu; raw_spin_lock_init(&loc_l->lock); } static void bpf_lru_list_init(struct bpf_lru_list *l) { int i; for (i = 0; i < NR_BPF_LRU_LIST_T; i++) INIT_LIST_HEAD(&l->lists[i]); for (i = 0; i < NR_BPF_LRU_LIST_COUNT; i++) l->counts[i] = 0; l->next_inactive_rotation = &l->lists[BPF_LRU_LIST_T_INACTIVE]; raw_spin_lock_init(&l->lock); } int bpf_lru_init(struct bpf_lru *lru, bool percpu, u32 hash_offset, del_from_htab_func del_from_htab, void *del_arg) { int cpu; if (percpu) { lru->percpu_lru = alloc_percpu(struct bpf_lru_list); if (!lru->percpu_lru) return -ENOMEM; for_each_possible_cpu(cpu) { struct bpf_lru_list *l; l = per_cpu_ptr(lru->percpu_lru, cpu); bpf_lru_list_init(l); } lru->nr_scans = PERCPU_NR_SCANS; } else { struct bpf_common_lru *clru = &lru->common_lru; clru->local_list = alloc_percpu(struct bpf_lru_locallist); if (!clru->local_list) return -ENOMEM; for_each_possible_cpu(cpu) { struct bpf_lru_locallist *loc_l; loc_l = per_cpu_ptr(clru->local_list, cpu); bpf_lru_locallist_init(loc_l, cpu); } bpf_lru_list_init(&clru->lru_list); lru->nr_scans = LOCAL_NR_SCANS; } lru->percpu = percpu; lru->del_from_htab = del_from_htab; lru->del_arg = del_arg; lru->hash_offset = hash_offset; return 0; } void bpf_lru_destroy(struct bpf_lru *lru) { if (lru->percpu) free_percpu(lru->percpu_lru); else free_percpu(lru->common_lru.local_list); } |
| 1797 254 254 1861 1709 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 | // SPDX-License-Identifier: GPL-2.0-or-later /* * linux/drivers/net/netconsole.c * * Copyright (C) 2001 Ingo Molnar <mingo@redhat.com> * * This file contains the implementation of an IRQ-safe, crash-safe * kernel console implementation that outputs kernel messages to the * network. * * Modification history: * * 2001-09-17 started by Ingo Molnar. * 2003-08-11 2.6 port by Matt Mackall * simplified options * generic card hooks * works non-modular * 2003-09-07 rewritten with netpoll api */ /**************************************************************** * ****************************************************************/ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/mm.h> #include <linux/init.h> #include <linux/module.h> #include <linux/slab.h> #include <linux/console.h> #include <linux/moduleparam.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/netpoll.h> #include <linux/inet.h> #include <linux/configfs.h> #include <linux/etherdevice.h> #include <linux/utsname.h> MODULE_AUTHOR("Maintainer: Matt Mackall <mpm@selenic.com>"); MODULE_DESCRIPTION("Console driver for network interfaces"); MODULE_LICENSE("GPL"); #define MAX_PARAM_LENGTH 256 #define MAX_PRINT_CHUNK 1000 static char config[MAX_PARAM_LENGTH]; module_param_string(netconsole, config, MAX_PARAM_LENGTH, 0); MODULE_PARM_DESC(netconsole, " netconsole=[src-port]@[src-ip]/[dev],[tgt-port]@<tgt-ip>/[tgt-macaddr]"); static bool oops_only = false; module_param(oops_only, bool, 0600); MODULE_PARM_DESC(oops_only, "Only log oops messages"); #define NETCONSOLE_PARAM_TARGET_PREFIX "cmdline" #ifndef MODULE static int __init option_setup(char *opt) { strscpy(config, opt, MAX_PARAM_LENGTH); return 1; } __setup("netconsole=", option_setup); #endif /* MODULE */ /* Linked list of all configured targets */ static LIST_HEAD(target_list); /* This needs to be a spinlock because write_msg() cannot sleep */ static DEFINE_SPINLOCK(target_list_lock); /* * Console driver for extended netconsoles. Registered on the first use to * avoid unnecessarily enabling ext message formatting. */ static struct console netconsole_ext; /** * struct netconsole_target - Represents a configured netconsole target. * @list: Links this target into the target_list. * @item: Links us into the configfs subsystem hierarchy. * @enabled: On / off knob to enable / disable target. * Visible from userspace (read-write). * We maintain a strict 1:1 correspondence between this and * whether the corresponding netpoll is active or inactive. * Also, other parameters of a target may be modified at * runtime only when it is disabled (enabled == 0). * @extended: Denotes whether console is extended or not. * @release: Denotes whether kernel release version should be prepended * to the message. Depends on extended console. * @np: The netpoll structure for this target. * Contains the other userspace visible parameters: * dev_name (read-write) * local_port (read-write) * remote_port (read-write) * local_ip (read-write) * remote_ip (read-write) * local_mac (read-only) * remote_mac (read-write) */ struct netconsole_target { struct list_head list; #ifdef CONFIG_NETCONSOLE_DYNAMIC struct config_item item; #endif bool enabled; bool extended; bool release; struct netpoll np; }; #ifdef CONFIG_NETCONSOLE_DYNAMIC static struct configfs_subsystem netconsole_subsys; static DEFINE_MUTEX(dynamic_netconsole_mutex); static int __init dynamic_netconsole_init(void) { config_group_init(&netconsole_subsys.su_group); mutex_init(&netconsole_subsys.su_mutex); return configfs_register_subsystem(&netconsole_subsys); } static void __exit dynamic_netconsole_exit(void) { configfs_unregister_subsystem(&netconsole_subsys); } /* * Targets that were created by parsing the boot/module option string * do not exist in the configfs hierarchy (and have NULL names) and will * never go away, so make these a no-op for them. */ static void netconsole_target_get(struct netconsole_target *nt) { if (config_item_name(&nt->item)) config_item_get(&nt->item); } static void netconsole_target_put(struct netconsole_target *nt) { if (config_item_name(&nt->item)) config_item_put(&nt->item); } #else /* !CONFIG_NETCONSOLE_DYNAMIC */ static int __init dynamic_netconsole_init(void) { return 0; } static void __exit dynamic_netconsole_exit(void) { } /* * No danger of targets going away from under us when dynamic * reconfigurability is off. */ static void netconsole_target_get(struct netconsole_target *nt) { } static void netconsole_target_put(struct netconsole_target *nt) { } static void populate_configfs_item(struct netconsole_target *nt, int cmdline_count) { } #endif /* CONFIG_NETCONSOLE_DYNAMIC */ /* Allocate and initialize with defaults. * Note that these targets get their config_item fields zeroed-out. */ static struct netconsole_target *alloc_and_init(void) { struct netconsole_target *nt; nt = kzalloc(sizeof(*nt), GFP_KERNEL); if (!nt) return nt; if (IS_ENABLED(CONFIG_NETCONSOLE_EXTENDED_LOG)) nt->extended = true; if (IS_ENABLED(CONFIG_NETCONSOLE_PREPEND_RELEASE)) nt->release = true; nt->np.name = "netconsole"; strscpy(nt->np.dev_name, "eth0", IFNAMSIZ); nt->np.local_port = 6665; nt->np.remote_port = 6666; eth_broadcast_addr(nt->np.remote_mac); return nt; } #ifdef CONFIG_NETCONSOLE_DYNAMIC /* * Our subsystem hierarchy is: * * /sys/kernel/config/netconsole/ * | * <target>/ * | enabled * | release * | dev_name * | local_port * | remote_port * | local_ip * | remote_ip * | local_mac * | remote_mac * | * <target>/... */ static struct netconsole_target *to_target(struct config_item *item) { return item ? container_of(item, struct netconsole_target, item) : NULL; } /* * Attribute operations for netconsole_target. */ static ssize_t enabled_show(struct config_item *item, char *buf) { return sysfs_emit(buf, "%d\n", to_target(item)->enabled); } static ssize_t extended_show(struct config_item *item, char *buf) { return sysfs_emit(buf, "%d\n", to_target(item)->extended); } static ssize_t release_show(struct config_item *item, char *buf) { return sysfs_emit(buf, "%d\n", to_target(item)->release); } static ssize_t dev_name_show(struct config_item *item, char *buf) { return sysfs_emit(buf, "%s\n", to_target(item)->np.dev_name); } static ssize_t local_port_show(struct config_item *item, char *buf) { return sysfs_emit(buf, "%d\n", to_target(item)->np.local_port); } static ssize_t remote_port_show(struct config_item *item, char *buf) { return sysfs_emit(buf, "%d\n", to_target(item)->np.remote_port); } static ssize_t local_ip_show(struct config_item *item, char *buf) { struct netconsole_target *nt = to_target(item); if (nt->np.ipv6) return sysfs_emit(buf, "%pI6c\n", &nt->np.local_ip.in6); else return sysfs_emit(buf, "%pI4\n", &nt->np.local_ip); } static ssize_t remote_ip_show(struct config_item *item, char *buf) { struct netconsole_target *nt = to_target(item); if (nt->np.ipv6) return sysfs_emit(buf, "%pI6c\n", &nt->np.remote_ip.in6); else return sysfs_emit(buf, "%pI4\n", &nt->np.remote_ip); } static ssize_t local_mac_show(struct config_item *item, char *buf) { struct net_device *dev = to_target(item)->np.dev; static const u8 bcast[ETH_ALEN] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }; return sysfs_emit(buf, "%pM\n", dev ? dev->dev_addr : bcast); } static ssize_t remote_mac_show(struct config_item *item, char *buf) { return sysfs_emit(buf, "%pM\n", to_target(item)->np.remote_mac); } /* * This one is special -- targets created through the configfs interface * are not enabled (and the corresponding netpoll activated) by default. * The user is expected to set the desired parameters first (which * would enable him to dynamically add new netpoll targets for new * network interfaces as and when they come up). */ static ssize_t enabled_store(struct config_item *item, const char *buf, size_t count) { struct netconsole_target *nt = to_target(item); unsigned long flags; bool enabled; int err; mutex_lock(&dynamic_netconsole_mutex); err = kstrtobool(buf, &enabled); if (err) goto out_unlock; err = -EINVAL; if ((bool)enabled == nt->enabled) { pr_info("network logging has already %s\n", nt->enabled ? "started" : "stopped"); goto out_unlock; } if (enabled) { /* true */ if (nt->release && !nt->extended) { pr_err("Not enabling netconsole. Release feature requires extended log message"); goto out_unlock; } if (nt->extended && !console_is_registered(&netconsole_ext)) register_console(&netconsole_ext); /* * Skip netpoll_parse_options() -- all the attributes are * already configured via configfs. Just print them out. */ netpoll_print_options(&nt->np); err = netpoll_setup(&nt->np); if (err) goto out_unlock; pr_info("network logging started\n"); } else { /* false */ /* We need to disable the netconsole before cleaning it up * otherwise we might end up in write_msg() with * nt->np.dev == NULL and nt->enabled == true */ spin_lock_irqsave(&target_list_lock, flags); nt->enabled = false; spin_unlock_irqrestore(&target_list_lock, flags); netpoll_cleanup(&nt->np); } nt->enabled = enabled; mutex_unlock(&dynamic_netconsole_mutex); return strnlen(buf, count); out_unlock: mutex_unlock(&dynamic_netconsole_mutex); return err; } static ssize_t release_store(struct config_item *item, const char *buf, size_t count) { struct netconsole_target *nt = to_target(item); bool release; int err; mutex_lock(&dynamic_netconsole_mutex); if (nt->enabled) { pr_err("target (%s) is enabled, disable to update parameters\n", config_item_name(&nt->item)); err = -EINVAL; goto out_unlock; } err = kstrtobool(buf, &release); if (err) goto out_unlock; nt->release = release; mutex_unlock(&dynamic_netconsole_mutex); return strnlen(buf, count); out_unlock: mutex_unlock(&dynamic_netconsole_mutex); return err; } static ssize_t extended_store(struct config_item *item, const char *buf, size_t count) { struct netconsole_target *nt = to_target(item); bool extended; int err; mutex_lock(&dynamic_netconsole_mutex); if (nt->enabled) { pr_err("target (%s) is enabled, disable to update parameters\n", config_item_name(&nt->item)); err = -EINVAL; goto out_unlock; } err = kstrtobool(buf, &extended); if (err) goto out_unlock; nt->extended = extended; mutex_unlock(&dynamic_netconsole_mutex); return strnlen(buf, count); out_unlock: mutex_unlock(&dynamic_netconsole_mutex); return err; } static ssize_t dev_name_store(struct config_item *item, const char *buf, size_t count) { struct netconsole_target *nt = to_target(item); size_t len; mutex_lock(&dynamic_netconsole_mutex); if (nt->enabled) { pr_err("target (%s) is enabled, disable to update parameters\n", config_item_name(&nt->item)); mutex_unlock(&dynamic_netconsole_mutex); return -EINVAL; } strscpy(nt->np.dev_name, buf, IFNAMSIZ); /* Get rid of possible trailing newline from echo(1) */ len = strnlen(nt->np.dev_name, IFNAMSIZ); if (nt->np.dev_name[len - 1] == '\n') nt->np.dev_name[len - 1] = '\0'; mutex_unlock(&dynamic_netconsole_mutex); return strnlen(buf, count); } static ssize_t local_port_store(struct config_item *item, const char *buf, size_t count) { struct netconsole_target *nt = to_target(item); int rv = -EINVAL; mutex_lock(&dynamic_netconsole_mutex); if (nt->enabled) { pr_err("target (%s) is enabled, disable to update parameters\n", config_item_name(&nt->item)); goto out_unlock; } rv = kstrtou16(buf, 10, &nt->np.local_port); if (rv < 0) goto out_unlock; mutex_unlock(&dynamic_netconsole_mutex); return strnlen(buf, count); out_unlock: mutex_unlock(&dynamic_netconsole_mutex); return rv; } static ssize_t remote_port_store(struct config_item *item, const char *buf, size_t count) { struct netconsole_target *nt = to_target(item); int rv = -EINVAL; mutex_lock(&dynamic_netconsole_mutex); if (nt->enabled) { pr_err("target (%s) is enabled, disable to update parameters\n", config_item_name(&nt->item)); goto out_unlock; } rv = kstrtou16(buf, 10, &nt->np.remote_port); if (rv < 0) goto out_unlock; mutex_unlock(&dynamic_netconsole_mutex); return strnlen(buf, count); out_unlock: mutex_unlock(&dynamic_netconsole_mutex); return rv; } static ssize_t local_ip_store(struct config_item *item, const char *buf, size_t count) { struct netconsole_target *nt = to_target(item); mutex_lock(&dynamic_netconsole_mutex); if (nt->enabled) { pr_err("target (%s) is enabled, disable to update parameters\n", config_item_name(&nt->item)); goto out_unlock; } if (strnchr(buf, count, ':')) { const char *end; if (in6_pton(buf, count, nt->np.local_ip.in6.s6_addr, -1, &end) > 0) { if (*end && *end != '\n') { pr_err("invalid IPv6 address at: <%c>\n", *end); goto out_unlock; } nt->np.ipv6 = true; } else goto out_unlock; } else { if (!nt->np.ipv6) { nt->np.local_ip.ip = in_aton(buf); } else goto out_unlock; } mutex_unlock(&dynamic_netconsole_mutex); return strnlen(buf, count); out_unlock: mutex_unlock(&dynamic_netconsole_mutex); return -EINVAL; } static ssize_t remote_ip_store(struct config_item *item, const char *buf, size_t count) { struct netconsole_target *nt = to_target(item); mutex_lock(&dynamic_netconsole_mutex); if (nt->enabled) { pr_err("target (%s) is enabled, disable to update parameters\n", config_item_name(&nt->item)); goto out_unlock; } if (strnchr(buf, count, ':')) { const char *end; if (in6_pton(buf, count, nt->np.remote_ip.in6.s6_addr, -1, &end) > 0) { if (*end && *end != '\n') { pr_err("invalid IPv6 address at: <%c>\n", *end); goto out_unlock; } nt->np.ipv6 = true; } else goto out_unlock; } else { if (!nt->np.ipv6) { nt->np.remote_ip.ip = in_aton(buf); } else goto out_unlock; } mutex_unlock(&dynamic_netconsole_mutex); return strnlen(buf, count); out_unlock: mutex_unlock(&dynamic_netconsole_mutex); return -EINVAL; } static ssize_t remote_mac_store(struct config_item *item, const char *buf, size_t count) { struct netconsole_target *nt = to_target(item); u8 remote_mac[ETH_ALEN]; mutex_lock(&dynamic_netconsole_mutex); if (nt->enabled) { pr_err("target (%s) is enabled, disable to update parameters\n", config_item_name(&nt->item)); goto out_unlock; } if (!mac_pton(buf, remote_mac)) goto out_unlock; if (buf[3 * ETH_ALEN - 1] && buf[3 * ETH_ALEN - 1] != '\n') goto out_unlock; memcpy(nt->np.remote_mac, remote_mac, ETH_ALEN); mutex_unlock(&dynamic_netconsole_mutex); return strnlen(buf, count); out_unlock: mutex_unlock(&dynamic_netconsole_mutex); return -EINVAL; } CONFIGFS_ATTR(, enabled); CONFIGFS_ATTR(, extended); CONFIGFS_ATTR(, dev_name); CONFIGFS_ATTR(, local_port); CONFIGFS_ATTR(, remote_port); CONFIGFS_ATTR(, local_ip); CONFIGFS_ATTR(, remote_ip); CONFIGFS_ATTR_RO(, local_mac); CONFIGFS_ATTR(, remote_mac); CONFIGFS_ATTR(, release); static struct configfs_attribute *netconsole_target_attrs[] = { &attr_enabled, &attr_extended, &attr_release, &attr_dev_name, &attr_local_port, &attr_remote_port, &attr_local_ip, &attr_remote_ip, &attr_local_mac, &attr_remote_mac, NULL, }; /* * Item operations and type for netconsole_target. */ static void netconsole_target_release(struct config_item *item) { kfree(to_target(item)); } static struct configfs_item_operations netconsole_target_item_ops = { .release = netconsole_target_release, }; static const struct config_item_type netconsole_target_type = { .ct_attrs = netconsole_target_attrs, .ct_item_ops = &netconsole_target_item_ops, .ct_owner = THIS_MODULE, }; static struct netconsole_target *find_cmdline_target(const char *name) { struct netconsole_target *nt, *ret = NULL; unsigned long flags; spin_lock_irqsave(&target_list_lock, flags); list_for_each_entry(nt, &target_list, list) { if (!strcmp(nt->item.ci_name, name)) { ret = nt; break; } } spin_unlock_irqrestore(&target_list_lock, flags); return ret; } /* * Group operations and type for netconsole_subsys. */ static struct config_item *make_netconsole_target(struct config_group *group, const char *name) { struct netconsole_target *nt; unsigned long flags; /* Checking if a target by this name was created at boot time. If so, * attach a configfs entry to that target. This enables dynamic * control. */ if (!strncmp(name, NETCONSOLE_PARAM_TARGET_PREFIX, strlen(NETCONSOLE_PARAM_TARGET_PREFIX))) { nt = find_cmdline_target(name); if (nt) return &nt->item; } nt = alloc_and_init(); if (!nt) return ERR_PTR(-ENOMEM); /* Initialize the config_item member */ config_item_init_type_name(&nt->item, name, &netconsole_target_type); /* Adding, but it is disabled */ spin_lock_irqsave(&target_list_lock, flags); list_add(&nt->list, &target_list); spin_unlock_irqrestore(&target_list_lock, flags); return &nt->item; } static void drop_netconsole_target(struct config_group *group, struct config_item *item) { unsigned long flags; struct netconsole_target *nt = to_target(item); spin_lock_irqsave(&target_list_lock, flags); list_del(&nt->list); spin_unlock_irqrestore(&target_list_lock, flags); /* * The target may have never been enabled, or was manually disabled * before being removed so netpoll may have already been cleaned up. */ if (nt->enabled) netpoll_cleanup(&nt->np); config_item_put(&nt->item); } static struct configfs_group_operations netconsole_subsys_group_ops = { .make_item = make_netconsole_target, .drop_item = drop_netconsole_target, }; static const struct config_item_type netconsole_subsys_type = { .ct_group_ops = &netconsole_subsys_group_ops, .ct_owner = THIS_MODULE, }; /* The netconsole configfs subsystem */ static struct configfs_subsystem netconsole_subsys = { .su_group = { .cg_item = { .ci_namebuf = "netconsole", .ci_type = &netconsole_subsys_type, }, }, }; static void populate_configfs_item(struct netconsole_target *nt, int cmdline_count) { char target_name[16]; snprintf(target_name, sizeof(target_name), "%s%d", NETCONSOLE_PARAM_TARGET_PREFIX, cmdline_count); config_item_init_type_name(&nt->item, target_name, &netconsole_target_type); } #endif /* CONFIG_NETCONSOLE_DYNAMIC */ /* Handle network interface device notifications */ static int netconsole_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { unsigned long flags; struct netconsole_target *nt; struct net_device *dev = netdev_notifier_info_to_dev(ptr); bool stopped = false; if (!(event == NETDEV_CHANGENAME || event == NETDEV_UNREGISTER || event == NETDEV_RELEASE || event == NETDEV_JOIN)) goto done; spin_lock_irqsave(&target_list_lock, flags); restart: list_for_each_entry(nt, &target_list, list) { netconsole_target_get(nt); if (nt->np.dev == dev) { switch (event) { case NETDEV_CHANGENAME: strscpy(nt->np.dev_name, dev->name, IFNAMSIZ); break; case NETDEV_RELEASE: case NETDEV_JOIN: case NETDEV_UNREGISTER: /* rtnl_lock already held * we might sleep in __netpoll_cleanup() */ spin_unlock_irqrestore(&target_list_lock, flags); __netpoll_cleanup(&nt->np); spin_lock_irqsave(&target_list_lock, flags); netdev_put(nt->np.dev, &nt->np.dev_tracker); nt->np.dev = NULL; nt->enabled = false; stopped = true; netconsole_target_put(nt); goto restart; } } netconsole_target_put(nt); } spin_unlock_irqrestore(&target_list_lock, flags); if (stopped) { const char *msg = "had an event"; switch (event) { case NETDEV_UNREGISTER: msg = "unregistered"; break; case NETDEV_RELEASE: msg = "released slaves"; break; case NETDEV_JOIN: msg = "is joining a master device"; break; } pr_info("network logging stopped on interface %s as it %s\n", dev->name, msg); } done: return NOTIFY_DONE; } static struct notifier_block netconsole_netdev_notifier = { .notifier_call = netconsole_netdev_event, }; /** * send_ext_msg_udp - send extended log message to target * @nt: target to send message to * @msg: extended log message to send * @msg_len: length of message * * Transfer extended log @msg to @nt. If @msg is longer than * MAX_PRINT_CHUNK, it'll be split and transmitted in multiple chunks with * ncfrag header field added to identify them. */ static void send_ext_msg_udp(struct netconsole_target *nt, const char *msg, int msg_len) { static char buf[MAX_PRINT_CHUNK]; /* protected by target_list_lock */ const char *header, *body; int offset = 0; int header_len, body_len; const char *msg_ready = msg; const char *release; int release_len = 0; if (nt->release) { release = init_utsname()->release; release_len = strlen(release) + 1; } if (msg_len + release_len <= MAX_PRINT_CHUNK) { /* No fragmentation needed */ if (nt->release) { scnprintf(buf, MAX_PRINT_CHUNK, "%s,%s", release, msg); msg_len += release_len; msg_ready = buf; } netpoll_send_udp(&nt->np, msg_ready, msg_len); return; } /* need to insert extra header fields, detect header and body */ header = msg; body = memchr(msg, ';', msg_len); if (WARN_ON_ONCE(!body)) return; header_len = body - header; body_len = msg_len - header_len - 1; body++; /* * Transfer multiple chunks with the following extra header. * "ncfrag=<byte-offset>/<total-bytes>" */ if (nt->release) scnprintf(buf, MAX_PRINT_CHUNK, "%s,", release); memcpy(buf + release_len, header, header_len); header_len += release_len; while (offset < body_len) { int this_header = header_len; int this_chunk; this_header += scnprintf(buf + this_header, sizeof(buf) - this_header, ",ncfrag=%d/%d;", offset, body_len); this_chunk = min(body_len - offset, MAX_PRINT_CHUNK - this_header); if (WARN_ON_ONCE(this_chunk <= 0)) return; memcpy(buf + this_header, body + offset, this_chunk); netpoll_send_udp(&nt->np, buf, this_header + this_chunk); offset += this_chunk; } } static void write_ext_msg(struct console *con, const char *msg, unsigned int len) { struct netconsole_target *nt; unsigned long flags; if ((oops_only && !oops_in_progress) || list_empty(&target_list)) return; spin_lock_irqsave(&target_list_lock, flags); list_for_each_entry(nt, &target_list, list) if (nt->extended && nt->enabled && netif_running(nt->np.dev)) send_ext_msg_udp(nt, msg, len); spin_unlock_irqrestore(&target_list_lock, flags); } static void write_msg(struct console *con, const char *msg, unsigned int len) { int frag, left; unsigned long flags; struct netconsole_target *nt; const char *tmp; if (oops_only && !oops_in_progress) return; /* Avoid taking lock and disabling interrupts unnecessarily */ if (list_empty(&target_list)) return; spin_lock_irqsave(&target_list_lock, flags); list_for_each_entry(nt, &target_list, list) { if (!nt->extended && nt->enabled && netif_running(nt->np.dev)) { /* * We nest this inside the for-each-target loop above * so that we're able to get as much logging out to * at least one target if we die inside here, instead * of unnecessarily keeping all targets in lock-step. */ tmp = msg; for (left = len; left;) { frag = min(left, MAX_PRINT_CHUNK); netpoll_send_udp(&nt->np, tmp, frag); tmp += frag; left -= frag; } } } spin_unlock_irqrestore(&target_list_lock, flags); } /* Allocate new target (from boot/module param) and setup netpoll for it */ static struct netconsole_target *alloc_param_target(char *target_config, int cmdline_count) { struct netconsole_target *nt; int err; nt = alloc_and_init(); if (!nt) { err = -ENOMEM; goto fail; } if (*target_config == '+') { nt->extended = true; target_config++; } if (*target_config == 'r') { if (!nt->extended) { pr_err("Netconsole configuration error. Release feature requires extended log message"); err = -EINVAL; goto fail; } nt->release = true; target_config++; } /* Parse parameters and setup netpoll */ err = netpoll_parse_options(&nt->np, target_config); if (err) goto fail; err = netpoll_setup(&nt->np); if (err) goto fail; populate_configfs_item(nt, cmdline_count); nt->enabled = true; return nt; fail: kfree(nt); return ERR_PTR(err); } /* Cleanup netpoll for given target (from boot/module param) and free it */ static void free_param_target(struct netconsole_target *nt) { netpoll_cleanup(&nt->np); kfree(nt); } static struct console netconsole_ext = { .name = "netcon_ext", .flags = CON_ENABLED | CON_EXTENDED, .write = write_ext_msg, }; static struct console netconsole = { .name = "netcon", .flags = CON_ENABLED, .write = write_msg, }; static int __init init_netconsole(void) { int err; struct netconsole_target *nt, *tmp; unsigned int count = 0; bool extended = false; unsigned long flags; char *target_config; char *input = config; if (strnlen(input, MAX_PARAM_LENGTH)) { while ((target_config = strsep(&input, ";"))) { nt = alloc_param_target(target_config, count); if (IS_ERR(nt)) { err = PTR_ERR(nt); goto fail; } /* Dump existing printks when we register */ if (nt->extended) { extended = true; netconsole_ext.flags |= CON_PRINTBUFFER; } else { netconsole.flags |= CON_PRINTBUFFER; } spin_lock_irqsave(&target_list_lock, flags); list_add(&nt->list, &target_list); spin_unlock_irqrestore(&target_list_lock, flags); count++; } } err = register_netdevice_notifier(&netconsole_netdev_notifier); if (err) goto fail; err = dynamic_netconsole_init(); if (err) goto undonotifier; if (extended) register_console(&netconsole_ext); register_console(&netconsole); pr_info("network logging started\n"); return err; undonotifier: unregister_netdevice_notifier(&netconsole_netdev_notifier); fail: pr_err("cleaning up\n"); /* * Remove all targets and destroy them (only targets created * from the boot/module option exist here). Skipping the list * lock is safe here, and netpoll_cleanup() will sleep. */ list_for_each_entry_safe(nt, tmp, &target_list, list) { list_del(&nt->list); free_param_target(nt); } return err; } static void __exit cleanup_netconsole(void) { struct netconsole_target *nt, *tmp; if (console_is_registered(&netconsole_ext)) unregister_console(&netconsole_ext); unregister_console(&netconsole); dynamic_netconsole_exit(); unregister_netdevice_notifier(&netconsole_netdev_notifier); /* * Targets created via configfs pin references on our module * and would first be rmdir(2)'ed from userspace. We reach * here only when they are already destroyed, and only those * created from the boot/module option are left, so remove and * destroy them. Skipping the list lock is safe here, and * netpoll_cleanup() will sleep. */ list_for_each_entry_safe(nt, tmp, &target_list, list) { list_del(&nt->list); free_param_target(nt); } } /* * Use late_initcall to ensure netconsole is * initialized after network device driver if built-in. * * late_initcall() and module_init() are identical if built as module. */ late_initcall(init_netconsole); module_exit(cleanup_netconsole); |
| 250 121 113 8 312 312 311 26 248 5 7 65 11 10 460 461 16 2 2 16 461 16 350 454 455 455 455 2 331 332 456 272 272 272 268 221 220 207 272 353 658 649 747 3 129 129 94 2 2 2 2 1 92 5 12 658 747 594 590 749 489 324 495 495 494 324 153 153 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 | // SPDX-License-Identifier: GPL-2.0 /* * SUCS NET3: * * Generic datagram handling routines. These are generic for all * protocols. Possibly a generic IP version on top of these would * make sense. Not tonight however 8-). * This is used because UDP, RAW, PACKET, DDP, IPX, AX.25 and * NetROM layer all have identical poll code and mostly * identical recvmsg() code. So we share it here. The poll was * shared before but buried in udp.c so I moved it. * * Authors: Alan Cox <alan@lxorguk.ukuu.org.uk>. (datagram_poll() from old * udp.c code) * * Fixes: * Alan Cox : NULL return from skb_peek_copy() * understood * Alan Cox : Rewrote skb_read_datagram to avoid the * skb_peek_copy stuff. * Alan Cox : Added support for SOCK_SEQPACKET. * IPX can no longer use the SO_TYPE hack * but AX.25 now works right, and SPX is * feasible. * Alan Cox : Fixed write poll of non IP protocol * crash. * Florian La Roche: Changed for my new skbuff handling. * Darryl Miles : Fixed non-blocking SOCK_SEQPACKET. * Linus Torvalds : BSD semantic fixes. * Alan Cox : Datagram iovec handling * Darryl Miles : Fixed non-blocking SOCK_STREAM. * Alan Cox : POSIXisms * Pete Wyckoff : Unconnected accept() fix. * */ #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/uaccess.h> #include <linux/mm.h> #include <linux/interrupt.h> #include <linux/errno.h> #include <linux/sched.h> #include <linux/inet.h> #include <linux/netdevice.h> #include <linux/rtnetlink.h> #include <linux/poll.h> #include <linux/highmem.h> #include <linux/spinlock.h> #include <linux/slab.h> #include <linux/pagemap.h> #include <linux/iov_iter.h> #include <linux/indirect_call_wrapper.h> #include <net/protocol.h> #include <linux/skbuff.h> #include <net/checksum.h> #include <net/sock.h> #include <net/tcp_states.h> #include <trace/events/skb.h> #include <net/busy_poll.h> #include <crypto/hash.h> /* * Is a socket 'connection oriented' ? */ static inline int connection_based(struct sock *sk) { return sk->sk_type == SOCK_SEQPACKET || sk->sk_type == SOCK_STREAM; } static int receiver_wake_function(wait_queue_entry_t *wait, unsigned int mode, int sync, void *key) { /* * Avoid a wakeup if event not interesting for us */ if (key && !(key_to_poll(key) & (EPOLLIN | EPOLLERR))) return 0; return autoremove_wake_function(wait, mode, sync, key); } /* * Wait for the last received packet to be different from skb */ int __skb_wait_for_more_packets(struct sock *sk, struct sk_buff_head *queue, int *err, long *timeo_p, const struct sk_buff *skb) { int error; DEFINE_WAIT_FUNC(wait, receiver_wake_function); prepare_to_wait_exclusive(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); /* Socket errors? */ error = sock_error(sk); if (error) goto out_err; if (READ_ONCE(queue->prev) != skb) goto out; /* Socket shut down? */ if (sk->sk_shutdown & RCV_SHUTDOWN) goto out_noerr; /* Sequenced packets can come disconnected. * If so we report the problem */ error = -ENOTCONN; if (connection_based(sk) && !(sk->sk_state == TCP_ESTABLISHED || sk->sk_state == TCP_LISTEN)) goto out_err; /* handle signals */ if (signal_pending(current)) goto interrupted; error = 0; *timeo_p = schedule_timeout(*timeo_p); out: finish_wait(sk_sleep(sk), &wait); return error; interrupted: error = sock_intr_errno(*timeo_p); out_err: *err = error; goto out; out_noerr: *err = 0; error = 1; goto out; } EXPORT_SYMBOL(__skb_wait_for_more_packets); static struct sk_buff *skb_set_peeked(struct sk_buff *skb) { struct sk_buff *nskb; if (skb->peeked) return skb; /* We have to unshare an skb before modifying it. */ if (!skb_shared(skb)) goto done; nskb = skb_clone(skb, GFP_ATOMIC); if (!nskb) return ERR_PTR(-ENOMEM); skb->prev->next = nskb; skb->next->prev = nskb; nskb->prev = skb->prev; nskb->next = skb->next; consume_skb(skb); skb = nskb; done: skb->peeked = 1; return skb; } struct sk_buff *__skb_try_recv_from_queue(struct sock *sk, struct sk_buff_head *queue, unsigned int flags, int *off, int *err, struct sk_buff **last) { bool peek_at_off = false; struct sk_buff *skb; int _off = 0; if (unlikely(flags & MSG_PEEK && *off >= 0)) { peek_at_off = true; _off = *off; } *last = queue->prev; skb_queue_walk(queue, skb) { if (flags & MSG_PEEK) { if (peek_at_off && _off >= skb->len && (_off || skb->peeked)) { _off -= skb->len; continue; } if (!skb->len) { skb = skb_set_peeked(skb); if (IS_ERR(skb)) { *err = PTR_ERR(skb); return NULL; } } refcount_inc(&skb->users); } else { __skb_unlink(skb, queue); } *off = _off; return skb; } return NULL; } /** * __skb_try_recv_datagram - Receive a datagram skbuff * @sk: socket * @queue: socket queue from which to receive * @flags: MSG\_ flags * @off: an offset in bytes to peek skb from. Returns an offset * within an skb where data actually starts * @err: error code returned * @last: set to last peeked message to inform the wait function * what to look for when peeking * * Get a datagram skbuff, understands the peeking, nonblocking wakeups * and possible races. This replaces identical code in packet, raw and * udp, as well as the IPX AX.25 and Appletalk. It also finally fixes * the long standing peek and read race for datagram sockets. If you * alter this routine remember it must be re-entrant. * * This function will lock the socket if a skb is returned, so * the caller needs to unlock the socket in that case (usually by * calling skb_free_datagram). Returns NULL with @err set to * -EAGAIN if no data was available or to some other value if an * error was detected. * * * It does not lock socket since today. This function is * * free of race conditions. This measure should/can improve * * significantly datagram socket latencies at high loads, * * when data copying to user space takes lots of time. * * (BTW I've just killed the last cli() in IP/IPv6/core/netlink/packet * * 8) Great win.) * * --ANK (980729) * * The order of the tests when we find no data waiting are specified * quite explicitly by POSIX 1003.1g, don't change them without having * the standard around please. */ struct sk_buff *__skb_try_recv_datagram(struct sock *sk, struct sk_buff_head *queue, unsigned int flags, int *off, int *err, struct sk_buff **last) { struct sk_buff *skb; unsigned long cpu_flags; /* * Caller is allowed not to check sk->sk_err before skb_recv_datagram() */ int error = sock_error(sk); if (error) goto no_packet; do { /* Again only user level code calls this function, so nothing * interrupt level will suddenly eat the receive_queue. * * Look at current nfs client by the way... * However, this function was correct in any case. 8) */ spin_lock_irqsave(&queue->lock, cpu_flags); skb = __skb_try_recv_from_queue(sk, queue, flags, off, &error, last); spin_unlock_irqrestore(&queue->lock, cpu_flags); if (error) goto no_packet; if (skb) return skb; if (!sk_can_busy_loop(sk)) break; sk_busy_loop(sk, flags & MSG_DONTWAIT); } while (READ_ONCE(queue->prev) != *last); error = -EAGAIN; no_packet: *err = error; return NULL; } EXPORT_SYMBOL(__skb_try_recv_datagram); struct sk_buff *__skb_recv_datagram(struct sock *sk, struct sk_buff_head *sk_queue, unsigned int flags, int *off, int *err) { struct sk_buff *skb, *last; long timeo; timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); do { skb = __skb_try_recv_datagram(sk, sk_queue, flags, off, err, &last); if (skb) return skb; if (*err != -EAGAIN) break; } while (timeo && !__skb_wait_for_more_packets(sk, sk_queue, err, &timeo, last)); return NULL; } EXPORT_SYMBOL(__skb_recv_datagram); struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned int flags, int *err) { int off = 0; return __skb_recv_datagram(sk, &sk->sk_receive_queue, flags, &off, err); } EXPORT_SYMBOL(skb_recv_datagram); void skb_free_datagram(struct sock *sk, struct sk_buff *skb) { consume_skb(skb); } EXPORT_SYMBOL(skb_free_datagram); void __skb_free_datagram_locked(struct sock *sk, struct sk_buff *skb, int len) { bool slow; if (!skb_unref(skb)) { sk_peek_offset_bwd(sk, len); return; } slow = lock_sock_fast(sk); sk_peek_offset_bwd(sk, len); skb_orphan(skb); unlock_sock_fast(sk, slow); /* skb is now orphaned, can be freed outside of locked section */ __kfree_skb(skb); } EXPORT_SYMBOL(__skb_free_datagram_locked); int __sk_queue_drop_skb(struct sock *sk, struct sk_buff_head *sk_queue, struct sk_buff *skb, unsigned int flags, void (*destructor)(struct sock *sk, struct sk_buff *skb)) { int err = 0; if (flags & MSG_PEEK) { err = -ENOENT; spin_lock_bh(&sk_queue->lock); if (skb->next) { __skb_unlink(skb, sk_queue); refcount_dec(&skb->users); if (destructor) destructor(sk, skb); err = 0; } spin_unlock_bh(&sk_queue->lock); } atomic_inc(&sk->sk_drops); return err; } EXPORT_SYMBOL(__sk_queue_drop_skb); /** * skb_kill_datagram - Free a datagram skbuff forcibly * @sk: socket * @skb: datagram skbuff * @flags: MSG\_ flags * * This function frees a datagram skbuff that was received by * skb_recv_datagram. The flags argument must match the one * used for skb_recv_datagram. * * If the MSG_PEEK flag is set, and the packet is still on the * receive queue of the socket, it will be taken off the queue * before it is freed. * * This function currently only disables BH when acquiring the * sk_receive_queue lock. Therefore it must not be used in a * context where that lock is acquired in an IRQ context. * * It returns 0 if the packet was removed by us. */ int skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags) { int err = __sk_queue_drop_skb(sk, &sk->sk_receive_queue, skb, flags, NULL); kfree_skb(skb); return err; } EXPORT_SYMBOL(skb_kill_datagram); INDIRECT_CALLABLE_DECLARE(static size_t simple_copy_to_iter(const void *addr, size_t bytes, void *data __always_unused, struct iov_iter *i)); static int __skb_datagram_iter(const struct sk_buff *skb, int offset, struct iov_iter *to, int len, bool fault_short, size_t (*cb)(const void *, size_t, void *, struct iov_iter *), void *data) { int start = skb_headlen(skb); int i, copy = start - offset, start_off = offset, n; struct sk_buff *frag_iter; /* Copy header. */ if (copy > 0) { if (copy > len) copy = len; n = INDIRECT_CALL_1(cb, simple_copy_to_iter, skb->data + offset, copy, data, to); offset += n; if (n != copy) goto short_copy; if ((len -= copy) == 0) return 0; } /* Copy paged appendix. Hmm... why does this look so complicated? */ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { int end; const skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; WARN_ON(start > offset + len); end = start + skb_frag_size(frag); if ((copy = end - offset) > 0) { struct page *page = skb_frag_page(frag); u8 *vaddr = kmap(page); if (copy > len) copy = len; n = INDIRECT_CALL_1(cb, simple_copy_to_iter, vaddr + skb_frag_off(frag) + offset - start, copy, data, to); kunmap(page); offset += n; if (n != copy) goto short_copy; if (!(len -= copy)) return 0; } start = end; } skb_walk_frags(skb, frag_iter) { int end; WARN_ON(start > offset + len); end = start + frag_iter->len; if ((copy = end - offset) > 0) { if (copy > len) copy = len; if (__skb_datagram_iter(frag_iter, offset - start, to, copy, fault_short, cb, data)) goto fault; if ((len -= copy) == 0) return 0; offset += copy; } start = end; } if (!len) return 0; /* This is not really a user copy fault, but rather someone * gave us a bogus length on the skb. We should probably * print a warning here as it may indicate a kernel bug. */ fault: iov_iter_revert(to, offset - start_off); return -EFAULT; short_copy: if (fault_short || iov_iter_count(to)) goto fault; return 0; } static size_t hash_and_copy_to_iter(const void *addr, size_t bytes, void *hashp, struct iov_iter *i) { #ifdef CONFIG_CRYPTO_HASH struct ahash_request *hash = hashp; struct scatterlist sg; size_t copied; copied = copy_to_iter(addr, bytes, i); sg_init_one(&sg, addr, copied); ahash_request_set_crypt(hash, &sg, NULL, copied); crypto_ahash_update(hash); return copied; #else return 0; #endif } /** * skb_copy_and_hash_datagram_iter - Copy datagram to an iovec iterator * and update a hash. * @skb: buffer to copy * @offset: offset in the buffer to start copying from * @to: iovec iterator to copy to * @len: amount of data to copy from buffer to iovec * @hash: hash request to update */ int skb_copy_and_hash_datagram_iter(const struct sk_buff *skb, int offset, struct iov_iter *to, int len, struct ahash_request *hash) { return __skb_datagram_iter(skb, offset, to, len, true, hash_and_copy_to_iter, hash); } EXPORT_SYMBOL(skb_copy_and_hash_datagram_iter); static size_t simple_copy_to_iter(const void *addr, size_t bytes, void *data __always_unused, struct iov_iter *i) { return copy_to_iter(addr, bytes, i); } /** * skb_copy_datagram_iter - Copy a datagram to an iovec iterator. * @skb: buffer to copy * @offset: offset in the buffer to start copying from * @to: iovec iterator to copy to * @len: amount of data to copy from buffer to iovec */ int skb_copy_datagram_iter(const struct sk_buff *skb, int offset, struct iov_iter *to, int len) { trace_skb_copy_datagram_iovec(skb, len); return __skb_datagram_iter(skb, offset, to, len, false, simple_copy_to_iter, NULL); } EXPORT_SYMBOL(skb_copy_datagram_iter); /** * skb_copy_datagram_from_iter - Copy a datagram from an iov_iter. * @skb: buffer to copy * @offset: offset in the buffer to start copying to * @from: the copy source * @len: amount of data to copy to buffer from iovec * * Returns 0 or -EFAULT. */ int skb_copy_datagram_from_iter(struct sk_buff *skb, int offset, struct iov_iter *from, int len) { int start = skb_headlen(skb); int i, copy = start - offset; struct sk_buff *frag_iter; /* Copy header. */ if (copy > 0) { if (copy > len) copy = len; if (copy_from_iter(skb->data + offset, copy, from) != copy) goto fault; if ((len -= copy) == 0) return 0; offset += copy; } /* Copy paged appendix. Hmm... why does this look so complicated? */ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { int end; const skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; WARN_ON(start > offset + len); end = start + skb_frag_size(frag); if ((copy = end - offset) > 0) { size_t copied; if (copy > len) copy = len; copied = copy_page_from_iter(skb_frag_page(frag), skb_frag_off(frag) + offset - start, copy, from); if (copied != copy) goto fault; if (!(len -= copy)) return 0; offset += copy; } start = end; } skb_walk_frags(skb, frag_iter) { int end; WARN_ON(start > offset + len); end = start + frag_iter->len; if ((copy = end - offset) > 0) { if (copy > len) copy = len; if (skb_copy_datagram_from_iter(frag_iter, offset - start, from, copy)) goto fault; if ((len -= copy) == 0) return 0; offset += copy; } start = end; } if (!len) return 0; fault: return -EFAULT; } EXPORT_SYMBOL(skb_copy_datagram_from_iter); int __zerocopy_sg_from_iter(struct msghdr *msg, struct sock *sk, struct sk_buff *skb, struct iov_iter *from, size_t length) { int frag; if (msg && msg->msg_ubuf && msg->sg_from_iter) return msg->sg_from_iter(sk, skb, from, length); frag = skb_shinfo(skb)->nr_frags; while (length && iov_iter_count(from)) { struct page *head, *last_head = NULL; struct page *pages[MAX_SKB_FRAGS]; int refs, order, n = 0; size_t start; ssize_t copied; unsigned long truesize; if (frag == MAX_SKB_FRAGS) return -EMSGSIZE; copied = iov_iter_get_pages2(from, pages, length, MAX_SKB_FRAGS - frag, &start); if (copied < 0) return -EFAULT; length -= copied; truesize = PAGE_ALIGN(copied + start); skb->data_len += copied; skb->len += copied; skb->truesize += truesize; if (sk && sk->sk_type == SOCK_STREAM) { sk_wmem_queued_add(sk, truesize); if (!skb_zcopy_pure(skb)) sk_mem_charge(sk, truesize); } else { refcount_add(truesize, &skb->sk->sk_wmem_alloc); } head = compound_head(pages[n]); order = compound_order(head); for (refs = 0; copied != 0; start = 0) { int size = min_t(int, copied, PAGE_SIZE - start); if (pages[n] - head > (1UL << order) - 1) { head = compound_head(pages[n]); order = compound_order(head); } start += (pages[n] - head) << PAGE_SHIFT; copied -= size; n++; if (frag) { skb_frag_t *last = &skb_shinfo(skb)->frags[frag - 1]; if (head == skb_frag_page(last) && start == skb_frag_off(last) + skb_frag_size(last)) { skb_frag_size_add(last, size); /* We combined this page, we need to release * a reference. Since compound pages refcount * is shared among many pages, batch the refcount * adjustments to limit false sharing. */ last_head = head; refs++; continue; } } if (refs) { page_ref_sub(last_head, refs); refs = 0; } skb_fill_page_desc_noacc(skb, frag++, head, start, size); } if (refs) page_ref_sub(last_head, refs); } return 0; } EXPORT_SYMBOL(__zerocopy_sg_from_iter); /** * zerocopy_sg_from_iter - Build a zerocopy datagram from an iov_iter * @skb: buffer to copy * @from: the source to copy from * * The function will first copy up to headlen, and then pin the userspace * pages and build frags through them. * * Returns 0, -EFAULT or -EMSGSIZE. */ int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *from) { int copy = min_t(int, skb_headlen(skb), iov_iter_count(from)); /* copy up to skb headlen */ if (skb_copy_datagram_from_iter(skb, 0, from, copy)) return -EFAULT; return __zerocopy_sg_from_iter(NULL, NULL, skb, from, ~0U); } EXPORT_SYMBOL(zerocopy_sg_from_iter); static __always_inline size_t copy_to_user_iter_csum(void __user *iter_to, size_t progress, size_t len, void *from, void *priv2) { __wsum next, *csum = priv2; next = csum_and_copy_to_user(from + progress, iter_to, len); *csum = csum_block_add(*csum, next, progress); return next ? 0 : len; } static __always_inline size_t memcpy_to_iter_csum(void *iter_to, size_t progress, size_t len, void *from, void *priv2) { __wsum *csum = priv2; __wsum next = csum_partial_copy_nocheck(from, iter_to, len); *csum = csum_block_add(*csum, next, progress); return 0; } struct csum_state { __wsum csum; size_t off; }; static size_t csum_and_copy_to_iter(const void *addr, size_t bytes, void *_csstate, struct iov_iter *i) { struct csum_state *csstate = _csstate; __wsum sum; if (WARN_ON_ONCE(i->data_source)) return 0; if (unlikely(iov_iter_is_discard(i))) { // can't use csum_memcpy() for that one - data is not copied csstate->csum = csum_block_add(csstate->csum, csum_partial(addr, bytes, 0), csstate->off); csstate->off += bytes; return bytes; } sum = csum_shift(csstate->csum, csstate->off); bytes = iterate_and_advance2(i, bytes, (void *)addr, &sum, copy_to_user_iter_csum, memcpy_to_iter_csum); csstate->csum = csum_shift(sum, csstate->off); csstate->off += bytes; return bytes; } /** * skb_copy_and_csum_datagram - Copy datagram to an iovec iterator * and update a checksum. * @skb: buffer to copy * @offset: offset in the buffer to start copying from * @to: iovec iterator to copy to * @len: amount of data to copy from buffer to iovec * @csump: checksum pointer */ static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset, struct iov_iter *to, int len, __wsum *csump) { struct csum_state csdata = { .csum = *csump }; int ret; ret = __skb_datagram_iter(skb, offset, to, len, true, csum_and_copy_to_iter, &csdata); if (ret) return ret; *csump = csdata.csum; return 0; } /** * skb_copy_and_csum_datagram_msg - Copy and checksum skb to user iovec. * @skb: skbuff * @hlen: hardware length * @msg: destination * * Caller _must_ check that skb will fit to this iovec. * * Returns: 0 - success. * -EINVAL - checksum failure. * -EFAULT - fault during copy. */ int skb_copy_and_csum_datagram_msg(struct sk_buff *skb, int hlen, struct msghdr *msg) { __wsum csum; int chunk = skb->len - hlen; if (!chunk) return 0; if (msg_data_left(msg) < chunk) { if (__skb_checksum_complete(skb)) return -EINVAL; if (skb_copy_datagram_msg(skb, hlen, msg, chunk)) goto fault; } else { csum = csum_partial(skb->data, hlen, skb->csum); if (skb_copy_and_csum_datagram(skb, hlen, &msg->msg_iter, chunk, &csum)) goto fault; if (csum_fold(csum)) { iov_iter_revert(&msg->msg_iter, chunk); return -EINVAL; } if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && !skb->csum_complete_sw) netdev_rx_csum_fault(NULL, skb); } return 0; fault: return -EFAULT; } EXPORT_SYMBOL(skb_copy_and_csum_datagram_msg); /** * datagram_poll - generic datagram poll * @file: file struct * @sock: socket * @wait: poll table * * Datagram poll: Again totally generic. This also handles * sequenced packet sockets providing the socket receive queue * is only ever holding data ready to receive. * * Note: when you *don't* use this routine for this protocol, * and you use a different write policy from sock_writeable() * then please supply your own write_space callback. */ __poll_t datagram_poll(struct file *file, struct socket *sock, poll_table *wait) { struct sock *sk = sock->sk; __poll_t mask; u8 shutdown; sock_poll_wait(file, sock, wait); mask = 0; /* exceptional events? */ if (READ_ONCE(sk->sk_err) || !skb_queue_empty_lockless(&sk->sk_error_queue)) mask |= EPOLLERR | (sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0); shutdown = READ_ONCE(sk->sk_shutdown); if (shutdown & RCV_SHUTDOWN) mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM; if (shutdown == SHUTDOWN_MASK) mask |= EPOLLHUP; /* readable? */ if (!skb_queue_empty_lockless(&sk->sk_receive_queue)) mask |= EPOLLIN | EPOLLRDNORM; /* Connection-based need to check for termination and startup */ if (connection_based(sk)) { int state = READ_ONCE(sk->sk_state); if (state == TCP_CLOSE) mask |= EPOLLHUP; /* connection hasn't started yet? */ if (state == TCP_SYN_SENT) return mask; } /* writable? */ if (sock_writeable(sk)) mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND; else sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); return mask; } EXPORT_SYMBOL(datagram_poll); |
| 4417 4419 4418 4418 4418 4418 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 | // SPDX-License-Identifier: GPL-2.0-only /* * Copyright 2002 Andi Kleen, SuSE Labs. * Thanks to Ben LaHaise for precious feedback. */ #include <linux/highmem.h> #include <linux/memblock.h> #include <linux/sched.h> #include <linux/mm.h> #include <linux/interrupt.h> #include <linux/seq_file.h> #include <linux/proc_fs.h> #include <linux/debugfs.h> #include <linux/pfn.h> #include <linux/percpu.h> #include <linux/gfp.h> #include <linux/pci.h> #include <linux/vmalloc.h> #include <linux/libnvdimm.h> #include <linux/vmstat.h> #include <linux/kernel.h> #include <linux/cc_platform.h> #include <linux/set_memory.h> #include <linux/memregion.h> #include <asm/e820/api.h> #include <asm/processor.h> #include <asm/tlbflush.h> #include <asm/sections.h> #include <asm/setup.h> #include <linux/uaccess.h> #include <asm/pgalloc.h> #include <asm/proto.h> #include <asm/memtype.h> #include <asm/hyperv-tlfs.h> #include <asm/mshyperv.h> #include "../mm_internal.h" /* * The current flushing context - we pass it instead of 5 arguments: */ struct cpa_data { unsigned long *vaddr; pgd_t *pgd; pgprot_t mask_set; pgprot_t mask_clr; unsigned long numpages; unsigned long curpage; unsigned long pfn; unsigned int flags; unsigned int force_split : 1, force_static_prot : 1, force_flush_all : 1; struct page **pages; }; enum cpa_warn { CPA_CONFLICT, CPA_PROTECT, CPA_DETECT, }; static const int cpa_warn_level = CPA_PROTECT; /* * Serialize cpa() (for !DEBUG_PAGEALLOC which uses large identity mappings) * using cpa_lock. So that we don't allow any other cpu, with stale large tlb * entries change the page attribute in parallel to some other cpu * splitting a large page entry along with changing the attribute. */ static DEFINE_SPINLOCK(cpa_lock); #define CPA_FLUSHTLB 1 #define CPA_ARRAY 2 #define CPA_PAGES_ARRAY 4 #define CPA_NO_CHECK_ALIAS 8 /* Do not search for aliases */ static inline pgprot_t cachemode2pgprot(enum page_cache_mode pcm) { return __pgprot(cachemode2protval(pcm)); } #ifdef CONFIG_PROC_FS static unsigned long direct_pages_count[PG_LEVEL_NUM]; void update_page_count(int level, unsigned long pages) { /* Protect against CPA */ spin_lock(&pgd_lock); direct_pages_count[level] += pages; spin_unlock(&pgd_lock); } static void split_page_count(int level) { if (direct_pages_count[level] == 0) return; direct_pages_count[level]--; if (system_state == SYSTEM_RUNNING) { if (level == PG_LEVEL_2M) count_vm_event(DIRECT_MAP_LEVEL2_SPLIT); else if (level == PG_LEVEL_1G) count_vm_event(DIRECT_MAP_LEVEL3_SPLIT); } direct_pages_count[level - 1] += PTRS_PER_PTE; } void arch_report_meminfo(struct seq_file *m) { seq_printf(m, "DirectMap4k: %8lu kB\n", direct_pages_count[PG_LEVEL_4K] << 2); #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) seq_printf(m, "DirectMap2M: %8lu kB\n", direct_pages_count[PG_LEVEL_2M] << 11); #else seq_printf(m, "DirectMap4M: %8lu kB\n", direct_pages_count[PG_LEVEL_2M] << 12); #endif if (direct_gbpages) seq_printf(m, "DirectMap1G: %8lu kB\n", direct_pages_count[PG_LEVEL_1G] << 20); } #else static inline void split_page_count(int level) { } #endif #ifdef CONFIG_X86_CPA_STATISTICS static unsigned long cpa_1g_checked; static unsigned long cpa_1g_sameprot; static unsigned long cpa_1g_preserved; static unsigned long cpa_2m_checked; static unsigned long cpa_2m_sameprot; static unsigned long cpa_2m_preserved; static unsigned long cpa_4k_install; static inline void cpa_inc_1g_checked(void) { cpa_1g_checked++; } static inline void cpa_inc_2m_checked(void) { cpa_2m_checked++; } static inline void cpa_inc_4k_install(void) { data_race(cpa_4k_install++); } static inline void cpa_inc_lp_sameprot(int level) { if (level == PG_LEVEL_1G) cpa_1g_sameprot++; else cpa_2m_sameprot++; } static inline void cpa_inc_lp_preserved(int level) { if (level == PG_LEVEL_1G) cpa_1g_preserved++; else cpa_2m_preserved++; } static int cpastats_show(struct seq_file *m, void *p) { seq_printf(m, "1G pages checked: %16lu\n", cpa_1g_checked); seq_printf(m, "1G pages sameprot: %16lu\n", cpa_1g_sameprot); seq_printf(m, "1G pages preserved: %16lu\n", cpa_1g_preserved); seq_printf(m, "2M pages checked: %16lu\n", cpa_2m_checked); seq_printf(m, "2M pages sameprot: %16lu\n", cpa_2m_sameprot); seq_printf(m, "2M pages preserved: %16lu\n", cpa_2m_preserved); seq_printf(m, "4K pages set-checked: %16lu\n", cpa_4k_install); return 0; } static int cpastats_open(struct inode *inode, struct file *file) { return single_open(file, cpastats_show, NULL); } static const struct file_operations cpastats_fops = { .open = cpastats_open, .read = seq_read, .llseek = seq_lseek, .release = single_release, }; static int __init cpa_stats_init(void) { debugfs_create_file("cpa_stats", S_IRUSR, arch_debugfs_dir, NULL, &cpastats_fops); return 0; } late_initcall(cpa_stats_init); #else static inline void cpa_inc_1g_checked(void) { } static inline void cpa_inc_2m_checked(void) { } static inline void cpa_inc_4k_install(void) { } static inline void cpa_inc_lp_sameprot(int level) { } static inline void cpa_inc_lp_preserved(int level) { } #endif static inline int within(unsigned long addr, unsigned long start, unsigned long end) { return addr >= start && addr < end; } static inline int within_inclusive(unsigned long addr, unsigned long start, unsigned long end) { return addr >= start && addr <= end; } #ifdef CONFIG_X86_64 /* * The kernel image is mapped into two places in the virtual address space * (addresses without KASLR, of course): * * 1. The kernel direct map (0xffff880000000000) * 2. The "high kernel map" (0xffffffff81000000) * * We actually execute out of #2. If we get the address of a kernel symbol, it * points to #2, but almost all physical-to-virtual translations point to #1. * * This is so that we can have both a directmap of all physical memory *and* * take full advantage of the limited (s32) immediate addressing range (2G) * of x86_64. * * See Documentation/arch/x86/x86_64/mm.rst for more detail. */ static inline unsigned long highmap_start_pfn(void) { return __pa_symbol(_text) >> PAGE_SHIFT; } static inline unsigned long highmap_end_pfn(void) { /* Do not reference physical address outside the kernel. */ return __pa_symbol(roundup(_brk_end, PMD_SIZE) - 1) >> PAGE_SHIFT; } static bool __cpa_pfn_in_highmap(unsigned long pfn) { /* * Kernel text has an alias mapping at a high address, known * here as "highmap". */ return within_inclusive(pfn, highmap_start_pfn(), highmap_end_pfn()); } #else static bool __cpa_pfn_in_highmap(unsigned long pfn) { /* There is no highmap on 32-bit */ return false; } #endif /* * See set_mce_nospec(). * * Machine check recovery code needs to change cache mode of poisoned pages to * UC to avoid speculative access logging another error. But passing the * address of the 1:1 mapping to set_memory_uc() is a fine way to encourage a * speculative access. So we cheat and flip the top bit of the address. This * works fine for the code that updates the page tables. But at the end of the * process we need to flush the TLB and cache and the non-canonical address * causes a #GP fault when used by the INVLPG and CLFLUSH instructions. * * But in the common case we already have a canonical address. This code * will fix the top bit if needed and is a no-op otherwise. */ static inline unsigned long fix_addr(unsigned long addr) { #ifdef CONFIG_X86_64 return (long)(addr << 1) >> 1; #else return addr; #endif } static unsigned long __cpa_addr(struct cpa_data *cpa, unsigned long idx) { if (cpa->flags & CPA_PAGES_ARRAY) { struct page *page = cpa->pages[idx]; if (unlikely(PageHighMem(page))) return 0; return (unsigned long)page_address(page); } if (cpa->flags & CPA_ARRAY) return cpa->vaddr[idx]; return *cpa->vaddr + idx * PAGE_SIZE; } /* * Flushing functions */ static void clflush_cache_range_opt(void *vaddr, unsigned int size) { const unsigned long clflush_size = boot_cpu_data.x86_clflush_size; void *p = (void *)((unsigned long)vaddr & ~(clflush_size - 1)); void *vend = vaddr + size; if (p >= vend) return; for (; p < vend; p += clflush_size) clflushopt(p); } /** * clflush_cache_range - flush a cache range with clflush * @vaddr: virtual start address * @size: number of bytes to flush * * CLFLUSHOPT is an unordered instruction which needs fencing with MFENCE or * SFENCE to avoid ordering issues. */ void clflush_cache_range(void *vaddr, unsigned int size) { mb(); clflush_cache_range_opt(vaddr, size); mb(); } EXPORT_SYMBOL_GPL(clflush_cache_range); #ifdef CONFIG_ARCH_HAS_PMEM_API void arch_invalidate_pmem(void *addr, size_t size) { clflush_cache_range(addr, size); } EXPORT_SYMBOL_GPL(arch_invalidate_pmem); #endif #ifdef CONFIG_ARCH_HAS_CPU_CACHE_INVALIDATE_MEMREGION bool cpu_cache_has_invalidate_memregion(void) { return !cpu_feature_enabled(X86_FEATURE_HYPERVISOR); } EXPORT_SYMBOL_NS_GPL(cpu_cache_has_invalidate_memregion, DEVMEM); int cpu_cache_invalidate_memregion(int res_desc) { if (WARN_ON_ONCE(!cpu_cache_has_invalidate_memregion())) return -ENXIO; wbinvd_on_all_cpus(); return 0; } EXPORT_SYMBOL_NS_GPL(cpu_cache_invalidate_memregion, DEVMEM); #endif static void __cpa_flush_all(void *arg) { unsigned long cache = (unsigned long)arg; /* * Flush all to work around Errata in early athlons regarding * large page flushing. */ __flush_tlb_all(); if (cache && boot_cpu_data.x86 >= 4) wbinvd(); } static void cpa_flush_all(unsigned long cache) { BUG_ON(irqs_disabled() && !early_boot_irqs_disabled); on_each_cpu(__cpa_flush_all, (void *) cache, 1); } static void __cpa_flush_tlb(void *data) { struct cpa_data *cpa = data; unsigned int i; for (i = 0; i < cpa->numpages; i++) flush_tlb_one_kernel(fix_addr(__cpa_addr(cpa, i))); } static void cpa_flush(struct cpa_data *data, int cache) { struct cpa_data *cpa = data; unsigned int i; BUG_ON(irqs_disabled() && !early_boot_irqs_disabled); if (cache && !static_cpu_has(X86_FEATURE_CLFLUSH)) { cpa_flush_all(cache); return; } if (cpa->force_flush_all || cpa->numpages > tlb_single_page_flush_ceiling) flush_tlb_all(); else on_each_cpu(__cpa_flush_tlb, cpa, 1); if (!cache) return; mb(); for (i = 0; i < cpa->numpages; i++) { unsigned long addr = __cpa_addr(cpa, i); unsigned int level; pte_t *pte = lookup_address(addr, &level); /* * Only flush present addresses: */ if (pte && (pte_val(*pte) & _PAGE_PRESENT)) clflush_cache_range_opt((void *)fix_addr(addr), PAGE_SIZE); } mb(); } static bool overlaps(unsigned long r1_start, unsigned long r1_end, unsigned long r2_start, unsigned long r2_end) { return (r1_start <= r2_end && r1_end >= r2_start) || (r2_start <= r1_end && r2_end >= r1_start); } #ifdef CONFIG_PCI_BIOS /* * The BIOS area between 640k and 1Mb needs to be executable for PCI BIOS * based config access (CONFIG_PCI_GOBIOS) support. */ #define BIOS_PFN PFN_DOWN(BIOS_BEGIN) #define BIOS_PFN_END PFN_DOWN(BIOS_END - 1) static pgprotval_t protect_pci_bios(unsigned long spfn, unsigned long epfn) { if (pcibios_enabled && overlaps(spfn, epfn, BIOS_PFN, BIOS_PFN_END)) return _PAGE_NX; return 0; } #else static pgprotval_t protect_pci_bios(unsigned long spfn, unsigned long epfn) { return 0; } #endif /* * The .rodata section needs to be read-only. Using the pfn catches all * aliases. This also includes __ro_after_init, so do not enforce until * kernel_set_to_readonly is true. */ static pgprotval_t protect_rodata(unsigned long spfn, unsigned long epfn) { unsigned long epfn_ro, spfn_ro = PFN_DOWN(__pa_symbol(__start_rodata)); /* * Note: __end_rodata is at page aligned and not inclusive, so * subtract 1 to get the last enforced PFN in the rodata area. */ epfn_ro = PFN_DOWN(__pa_symbol(__end_rodata)) - 1; if (kernel_set_to_readonly && overlaps(spfn, epfn, spfn_ro, epfn_ro)) return _PAGE_RW; return 0; } /* * Protect kernel text against becoming non executable by forbidding * _PAGE_NX. This protects only the high kernel mapping (_text -> _etext) * out of which the kernel actually executes. Do not protect the low * mapping. * * This does not cover __inittext since that is gone after boot. */ static pgprotval_t protect_kernel_text(unsigned long start, unsigned long end) { unsigned long t_end = (unsigned long)_etext - 1; unsigned long t_start = (unsigned long)_text; if (overlaps(start, end, t_start, t_end)) return _PAGE_NX; return 0; } #if defined(CONFIG_X86_64) /* * Once the kernel maps the text as RO (kernel_set_to_readonly is set), * kernel text mappings for the large page aligned text, rodata sections * will be always read-only. For the kernel identity mappings covering the * holes caused by this alignment can be anything that user asks. * * This will preserve the large page mappings for kernel text/data at no * extra cost. */ static pgprotval_t protect_kernel_text_ro(unsigned long start, unsigned long end) { unsigned long t_end = (unsigned long)__end_rodata_hpage_align - 1; unsigned long t_start = (unsigned long)_text; unsigned int level; if (!kernel_set_to_readonly || !overlaps(start, end, t_start, t_end)) return 0; /* * Don't enforce the !RW mapping for the kernel text mapping, if * the current mapping is already using small page mapping. No * need to work hard to preserve large page mappings in this case. * * This also fixes the Linux Xen paravirt guest boot failure caused * by unexpected read-only mappings for kernel identity * mappings. In this paravirt guest case, the kernel text mapping * and the kernel identity mapping share the same page-table pages, * so the protections for kernel text and identity mappings have to * be the same. */ if (lookup_address(start, &level) && (level != PG_LEVEL_4K)) return _PAGE_RW; return 0; } #else static pgprotval_t protect_kernel_text_ro(unsigned long start, unsigned long end) { return 0; } #endif static inline bool conflicts(pgprot_t prot, pgprotval_t val) { return (pgprot_val(prot) & ~val) != pgprot_val(prot); } static inline void check_conflict(int warnlvl, pgprot_t prot, pgprotval_t val, unsigned long start, unsigned long end, unsigned long pfn, const char *txt) { static const char *lvltxt[] = { [CPA_CONFLICT] = "conflict", [CPA_PROTECT] = "protect", [CPA_DETECT] = "detect", }; if (warnlvl > cpa_warn_level || !conflicts(prot, val)) return; pr_warn("CPA %8s %10s: 0x%016lx - 0x%016lx PFN %lx req %016llx prevent %016llx\n", lvltxt[warnlvl], txt, start, end, pfn, (unsigned long long)pgprot_val(prot), (unsigned long long)val); } /* * Certain areas of memory on x86 require very specific protection flags, * for example the BIOS area or kernel text. Callers don't always get this * right (again, ioremap() on BIOS memory is not uncommon) so this function * checks and fixes these known static required protection bits. */ static inline pgprot_t static_protections(pgprot_t prot, unsigned long start, unsigned long pfn, unsigned long npg, unsigned long lpsize, int warnlvl) { pgprotval_t forbidden, res; unsigned long end; /* * There is no point in checking RW/NX conflicts when the requested * mapping is setting the page !PRESENT. */ if (!(pgprot_val(prot) & _PAGE_PRESENT)) return prot; /* Operate on the virtual address */ end = start + npg * PAGE_SIZE - 1; res = protect_kernel_text(start, end); check_conflict(warnlvl, prot, res, start, end, pfn, "Text NX"); forbidden = res; /* * Special case to preserve a large page. If the change spawns the * full large page mapping then there is no point to split it * up. Happens with ftrace and is going to be removed once ftrace * switched to text_poke(). */ if (lpsize != (npg * PAGE_SIZE) || (start & (lpsize - 1))) { res = protect_kernel_text_ro(start, end); check_conflict(warnlvl, prot, res, start, end, pfn, "Text RO"); forbidden |= res; } /* Check the PFN directly */ res = protect_pci_bios(pfn, pfn + npg - 1); check_conflict(warnlvl, prot, res, start, end, pfn, "PCIBIOS NX"); forbidden |= res; res = protect_rodata(pfn, pfn + npg - 1); check_conflict(warnlvl, prot, res, start, end, pfn, "Rodata RO"); forbidden |= res; return __pgprot(pgprot_val(prot) & ~forbidden); } /* * Validate strict W^X semantics. */ static inline pgprot_t verify_rwx(pgprot_t old, pgprot_t new, unsigned long start, unsigned long pfn, unsigned long npg) { unsigned long end; /* * 32-bit has some unfixable W+X issues, like EFI code * and writeable data being in the same page. Disable * detection and enforcement there. */ if (IS_ENABLED(CONFIG_X86_32)) return new; /* Only verify when NX is supported: */ if (!(__supported_pte_mask & _PAGE_NX)) return new; if (!((pgprot_val(old) ^ pgprot_val(new)) & (_PAGE_RW | _PAGE_NX))) return new; if ((pgprot_val(new) & (_PAGE_RW | _PAGE_NX)) != _PAGE_RW) return new; end = start + npg * PAGE_SIZE - 1; WARN_ONCE(1, "CPA detected W^X violation: %016llx -> %016llx range: 0x%016lx - 0x%016lx PFN %lx\n", (unsigned long long)pgprot_val(old), (unsigned long long)pgprot_val(new), start, end, pfn); /* * For now, allow all permission change attempts by returning the * attempted permissions. This can 'return old' to actively * refuse the permission change at a later time. */ return new; } /* * Lookup the page table entry for a virtual address in a specific pgd. * Return a pointer to the entry and the level of the mapping. */ pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address, unsigned int *level) { p4d_t *p4d; pud_t *pud; pmd_t *pmd; *level = PG_LEVEL_NONE; if (pgd_none(*pgd)) return NULL; p4d = p4d_offset(pgd, address); if (p4d_none(*p4d)) return NULL; *level = PG_LEVEL_512G; if (p4d_large(*p4d) || !p4d_present(*p4d)) return (pte_t *)p4d; pud = pud_offset(p4d, address); if (pud_none(*pud)) return NULL; *level = PG_LEVEL_1G; if (pud_large(*pud) || !pud_present(*pud)) return (pte_t *)pud; pmd = pmd_offset(pud, address); if (pmd_none(*pmd)) return NULL; *level = PG_LEVEL_2M; if (pmd_large(*pmd) || !pmd_present(*pmd)) return (pte_t *)pmd; *level = PG_LEVEL_4K; return pte_offset_kernel(pmd, address); } /* * Lookup the page table entry for a virtual address. Return a pointer * to the entry and the level of the mapping. * * Note: We return pud and pmd either when the entry is marked large * or when the present bit is not set. Otherwise we would return a * pointer to a nonexisting mapping. */ pte_t *lookup_address(unsigned long address, unsigned int *level) { return lookup_address_in_pgd(pgd_offset_k(address), address, level); } EXPORT_SYMBOL_GPL(lookup_address); static pte_t *_lookup_address_cpa(struct cpa_data *cpa, unsigned long address, unsigned int *level) { if (cpa->pgd) return lookup_address_in_pgd(cpa->pgd + pgd_index(address), address, level); return lookup_address(address, level); } /* * Lookup the PMD entry for a virtual address. Return a pointer to the entry * or NULL if not present. */ pmd_t *lookup_pmd_address(unsigned long address) { pgd_t *pgd; p4d_t *p4d; pud_t *pud; pgd = pgd_offset_k(address); if (pgd_none(*pgd)) return NULL; p4d = p4d_offset(pgd, address); if (p4d_none(*p4d) || p4d_large(*p4d) || !p4d_present(*p4d)) return NULL; pud = pud_offset(p4d, address); if (pud_none(*pud) || pud_large(*pud) || !pud_present(*pud)) return NULL; return pmd_offset(pud, address); } /* * This is necessary because __pa() does not work on some * kinds of memory, like vmalloc() or the alloc_remap() * areas on 32-bit NUMA systems. The percpu areas can * end up in this kind of memory, for instance. * * This could be optimized, but it is only intended to be * used at initialization time, and keeping it * unoptimized should increase the testing coverage for * the more obscure platforms. */ phys_addr_t slow_virt_to_phys(void *__virt_addr) { unsigned long virt_addr = (unsigned long)__virt_addr; phys_addr_t phys_addr; unsigned long offset; enum pg_level level; pte_t *pte; pte = lookup_address(virt_addr, &level); BUG_ON(!pte); /* * pXX_pfn() returns unsigned long, which must be cast to phys_addr_t * before being left-shifted PAGE_SHIFT bits -- this trick is to * make 32-PAE kernel work correctly. */ switch (level) { case PG_LEVEL_1G: phys_addr = (phys_addr_t)pud_pfn(*(pud_t *)pte) << PAGE_SHIFT; offset = virt_addr & ~PUD_MASK; break; case PG_LEVEL_2M: phys_addr = (phys_addr_t)pmd_pfn(*(pmd_t *)pte) << PAGE_SHIFT; offset = virt_addr & ~PMD_MASK; break; default: phys_addr = (phys_addr_t)pte_pfn(*pte) << PAGE_SHIFT; offset = virt_addr & ~PAGE_MASK; } return (phys_addr_t)(phys_addr | offset); } EXPORT_SYMBOL_GPL(slow_virt_to_phys); /* * Set the new pmd in all the pgds we know about: */ static void __set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte) { /* change init_mm */ set_pte_atomic(kpte, pte); #ifdef CONFIG_X86_32 if (!SHARED_KERNEL_PMD) { struct page *page; list_for_each_entry(page, &pgd_list, lru) { pgd_t *pgd; p4d_t *p4d; pud_t *pud; pmd_t *pmd; pgd = (pgd_t *)page_address(page) + pgd_index(address); p4d = p4d_offset(pgd, address); pud = pud_offset(p4d, address); pmd = pmd_offset(pud, address); set_pte_atomic((pte_t *)pmd, pte); } } #endif } static pgprot_t pgprot_clear_protnone_bits(pgprot_t prot) { /* * _PAGE_GLOBAL means "global page" for present PTEs. * But, it is also used to indicate _PAGE_PROTNONE * for non-present PTEs. * * This ensures that a _PAGE_GLOBAL PTE going from * present to non-present is not confused as * _PAGE_PROTNONE. */ if (!(pgprot_val(prot) & _PAGE_PRESENT)) pgprot_val(prot) &= ~_PAGE_GLOBAL; return prot; } static int __should_split_large_page(pte_t *kpte, unsigned long address, struct cpa_data *cpa) { unsigned long numpages, pmask, psize, lpaddr, pfn, old_pfn; pgprot_t old_prot, new_prot, req_prot, chk_prot; pte_t new_pte, *tmp; enum pg_level level; /* * Check for races, another CPU might have split this page * up already: */ tmp = _lookup_address_cpa(cpa, address, &level); if (tmp != kpte) return 1; switch (level) { case PG_LEVEL_2M: old_prot = pmd_pgprot(*(pmd_t *)kpte); old_pfn = pmd_pfn(*(pmd_t *)kpte); cpa_inc_2m_checked(); break; case PG_LEVEL_1G: old_prot = pud_pgprot(*(pud_t *)kpte); old_pfn = pud_pfn(*(pud_t *)kpte); cpa_inc_1g_checked(); break; default: return -EINVAL; } psize = page_level_size(level); pmask = page_level_mask(level); /* * Calculate the number of pages, which fit into this large * page starting at address: */ lpaddr = (address + psize) & pmask; numpages = (lpaddr - address) >> PAGE_SHIFT; if (numpages < cpa->numpages) cpa->numpages = numpages; /* * We are safe now. Check whether the new pgprot is the same: * Convert protection attributes to 4k-format, as cpa->mask* are set * up accordingly. */ /* Clear PSE (aka _PAGE_PAT) and move PAT bit to correct position */ req_prot = pgprot_large_2_4k(old_prot); pgprot_val(req_prot) &= ~pgprot_val(cpa->mask_clr); pgprot_val(req_prot) |= pgprot_val(cpa->mask_set); /* * req_prot is in format of 4k pages. It must be converted to large * page format: the caching mode includes the PAT bit located at * different bit positions in the two formats. */ req_prot = pgprot_4k_2_large(req_prot); req_prot = pgprot_clear_protnone_bits(req_prot); if (pgprot_val(req_prot) & _PAGE_PRESENT) pgprot_val(req_prot) |= _PAGE_PSE; /* * old_pfn points to the large page base pfn. So we need to add the * offset of the virtual address: */ pfn = old_pfn + ((address & (psize - 1)) >> PAGE_SHIFT); cpa->pfn = pfn; /* * Calculate the large page base address and the number of 4K pages * in the large page */ lpaddr = address & pmask; numpages = psize >> PAGE_SHIFT; /* * Sanity check that the existing mapping is correct versus the static * protections. static_protections() guards against !PRESENT, so no * extra conditional required here. */ chk_prot = static_protections(old_prot, lpaddr, old_pfn, numpages, psize, CPA_CONFLICT); if (WARN_ON_ONCE(pgprot_val(chk_prot) != pgprot_val(old_prot))) { /* * Split the large page and tell the split code to * enforce static protections. */ cpa->force_static_prot = 1; return 1; } /* * Optimization: If the requested pgprot is the same as the current * pgprot, then the large page can be preserved and no updates are * required independent of alignment and length of the requested * range. The above already established that the current pgprot is * correct, which in consequence makes the requested pgprot correct * as well if it is the same. The static protection scan below will * not come to a different conclusion. */ if (pgprot_val(req_prot) == pgprot_val(old_prot)) { cpa_inc_lp_sameprot(level); return 0; } /* * If the requested range does not cover the full page, split it up */ if (address != lpaddr || cpa->numpages != numpages) return 1; /* * Check whether the requested pgprot is conflicting with a static * protection requirement in the large page. */ new_prot = static_protections(req_prot, lpaddr, old_pfn, numpages, psize, CPA_DETECT); new_prot = verify_rwx(old_prot, new_prot, lpaddr, old_pfn, numpages); /* * If there is a conflict, split the large page. * * There used to be a 4k wise evaluation trying really hard to * preserve the large pages, but experimentation has shown, that this * does not help at all. There might be corner cases which would * preserve one large page occasionally, but it's really not worth the * extra code and cycles for the common case. */ if (pgprot_val(req_prot) != pgprot_val(new_prot)) return 1; /* All checks passed. Update the large page mapping. */ new_pte = pfn_pte(old_pfn, new_prot); __set_pmd_pte(kpte, address, new_pte); cpa->flags |= CPA_FLUSHTLB; cpa_inc_lp_preserved(level); return 0; } static int should_split_large_page(pte_t *kpte, unsigned long address, struct cpa_data *cpa) { int do_split; if (cpa->force_split) return 1; spin_lock(&pgd_lock); do_split = __should_split_large_page(kpte, address, cpa); spin_unlock(&pgd_lock); return do_split; } static void split_set_pte(struct cpa_data *cpa, pte_t *pte, unsigned long pfn, pgprot_t ref_prot, unsigned long address, unsigned long size) { unsigned int npg = PFN_DOWN(size); pgprot_t prot; /* * If should_split_large_page() discovered an inconsistent mapping, * remove the invalid protection in the split mapping. */ if (!cpa->force_static_prot) goto set; /* Hand in lpsize = 0 to enforce the protection mechanism */ prot = static_protections(ref_prot, address, pfn, npg, 0, CPA_PROTECT); if (pgprot_val(prot) == pgprot_val(ref_prot)) goto set; /* * If this is splitting a PMD, fix it up. PUD splits cannot be * fixed trivially as that would require to rescan the newly * installed PMD mappings after returning from split_large_page() * so an eventual further split can allocate the necessary PTE * pages. Warn for now and revisit it in case this actually * happens. */ if (size == PAGE_SIZE) ref_prot = prot; else pr_warn_once("CPA: Cannot fixup static protections for PUD split\n"); set: set_pte(pte, pfn_pte(pfn, ref_prot)); } static int __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address, struct page *base) { unsigned long lpaddr, lpinc, ref_pfn, pfn, pfninc = 1; pte_t *pbase = (pte_t *)page_address(base); unsigned int i, level; pgprot_t ref_prot; pte_t *tmp; spin_lock(&pgd_lock); /* * Check for races, another CPU might have split this page * up for us already: */ tmp = _lookup_address_cpa(cpa, address, &level); if (tmp != kpte) { spin_unlock(&pgd_lock); return 1; } paravirt_alloc_pte(&init_mm, page_to_pfn(base)); switch (level) { case PG_LEVEL_2M: ref_prot = pmd_pgprot(*(pmd_t *)kpte); /* * Clear PSE (aka _PAGE_PAT) and move * PAT bit to correct position. */ ref_prot = pgprot_large_2_4k(ref_prot); ref_pfn = pmd_pfn(*(pmd_t *)kpte); lpaddr = address & PMD_MASK; lpinc = PAGE_SIZE; break; case PG_LEVEL_1G: ref_prot = pud_pgprot(*(pud_t *)kpte); ref_pfn = pud_pfn(*(pud_t *)kpte); pfninc = PMD_SIZE >> PAGE_SHIFT; lpaddr = address & PUD_MASK; lpinc = PMD_SIZE; /* * Clear the PSE flags if the PRESENT flag is not set * otherwise pmd_present/pmd_huge will return true * even on a non present pmd. */ if (!(pgprot_val(ref_prot) & _PAGE_PRESENT)) pgprot_val(ref_prot) &= ~_PAGE_PSE; break; default: spin_unlock(&pgd_lock); return 1; } ref_prot = pgprot_clear_protnone_bits(ref_prot); /* * Get the target pfn from the original entry: */ pfn = ref_pfn; for (i = 0; i < PTRS_PER_PTE; i++, pfn += pfninc, lpaddr += lpinc) split_set_pte(cpa, pbase + i, pfn, ref_prot, lpaddr, lpinc); if (virt_addr_valid(address)) { unsigned long pfn = PFN_DOWN(__pa(address)); if (pfn_range_is_mapped(pfn, pfn + 1)) split_page_count(level); } /* * Install the new, split up pagetable. * * We use the standard kernel pagetable protections for the new * pagetable protections, the actual ptes set above control the * primary protection behavior: */ __set_pmd_pte(kpte, address, mk_pte(base, __pgprot(_KERNPG_TABLE))); /* * Do a global flush tlb after splitting the large page * and before we do the actual change page attribute in the PTE. * * Without this, we violate the TLB application note, that says: * "The TLBs may contain both ordinary and large-page * translations for a 4-KByte range of linear addresses. This * may occur if software modifies the paging structures so that * the page size used for the address range changes. If the two * translations differ with respect to page frame or attributes * (e.g., permissions), processor behavior is undefined and may * be implementation-specific." * * We do this global tlb flush inside the cpa_lock, so that we * don't allow any other cpu, with stale tlb entries change the * page attribute in parallel, that also falls into the * just split large page entry. */ flush_tlb_all(); spin_unlock(&pgd_lock); return 0; } static int split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address) { struct page *base; if (!debug_pagealloc_enabled()) spin_unlock(&cpa_lock); base = alloc_pages(GFP_KERNEL, 0); if (!debug_pagealloc_enabled()) spin_lock(&cpa_lock); if (!base) return -ENOMEM; if (__split_large_page(cpa, kpte, address, base)) __free_page(base); return 0; } static bool try_to_free_pte_page(pte_t *pte) { int i; for (i = 0; i < PTRS_PER_PTE; i++) if (!pte_none(pte[i])) return false; free_page((unsigned long)pte); return true; } static bool try_to_free_pmd_page(pmd_t *pmd) { int i; for (i = 0; i < PTRS_PER_PMD; i++) if (!pmd_none(pmd[i])) return false; free_page((unsigned long)pmd); return true; } static bool unmap_pte_range(pmd_t *pmd, unsigned long start, unsigned long end) { pte_t *pte = pte_offset_kernel(pmd, start); while (start < end) { set_pte(pte, __pte(0)); start += PAGE_SIZE; pte++; } if (try_to_free_pte_page((pte_t *)pmd_page_vaddr(*pmd))) { pmd_clear(pmd); return true; } return false; } static void __unmap_pmd_range(pud_t *pud, pmd_t *pmd, unsigned long start, unsigned long end) { if (unmap_pte_range(pmd, start, end)) if (try_to_free_pmd_page(pud_pgtable(*pud))) pud_clear(pud); } static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end) { pmd_t *pmd = pmd_offset(pud, start); /* * Not on a 2MB page boundary? */ if (start & (PMD_SIZE - 1)) { unsigned long next_page = (start + PMD_SIZE) & PMD_MASK; unsigned long pre_end = min_t(unsigned long, end, next_page); __unmap_pmd_range(pud, pmd, start, pre_end); start = pre_end; pmd++; } /* * Try to unmap in 2M chunks. */ while (end - start >= PMD_SIZE) { if (pmd_large(*pmd)) pmd_clear(pmd); else __unmap_pmd_range(pud, pmd, start, start + PMD_SIZE); start += PMD_SIZE; pmd++; } /* * 4K leftovers? */ if (start < end) return __unmap_pmd_range(pud, pmd, start, end); /* * Try again to free the PMD page if haven't succeeded above. */ if (!pud_none(*pud)) if (try_to_free_pmd_page(pud_pgtable(*pud))) pud_clear(pud); } static void unmap_pud_range(p4d_t *p4d, unsigned long start, unsigned long end) { pud_t *pud = pud_offset(p4d, start); /* * Not on a GB page boundary? */ if (start & (PUD_SIZE - 1)) { unsigned long next_page = (start + PUD_SIZE) & PUD_MASK; unsigned long pre_end = min_t(unsigned long, end, next_page); unmap_pmd_range(pud, start, pre_end); start = pre_end; pud++; } /* * Try to unmap in 1G chunks? */ while (end - start >= PUD_SIZE) { if (pud_large(*pud)) pud_clear(pud); else unmap_pmd_range(pud, start, start + PUD_SIZE); start += PUD_SIZE; pud++; } /* * 2M leftovers? */ if (start < end) unmap_pmd_range(pud, start, end); /* * No need to try to free the PUD page because we'll free it in * populate_pgd's error path */ } static int alloc_pte_page(pmd_t *pmd) { pte_t *pte = (pte_t *)get_zeroed_page(GFP_KERNEL); if (!pte) return -1; set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE)); return 0; } static int alloc_pmd_page(pud_t *pud) { pmd_t *pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL); if (!pmd) return -1; set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE)); return 0; } static void populate_pte(struct cpa_data *cpa, unsigned long start, unsigned long end, unsigned num_pages, pmd_t *pmd, pgprot_t pgprot) { pte_t *pte; pte = pte_offset_kernel(pmd, start); pgprot = pgprot_clear_protnone_bits(pgprot); while (num_pages-- && start < end) { set_pte(pte, pfn_pte(cpa->pfn, pgprot)); start += PAGE_SIZE; cpa->pfn++; pte++; } } static long populate_pmd(struct cpa_data *cpa, unsigned long start, unsigned long end, unsigned num_pages, pud_t *pud, pgprot_t pgprot) { long cur_pages = 0; pmd_t *pmd; pgprot_t pmd_pgprot; /* * Not on a 2M boundary? */ if (start & (PMD_SIZE - 1)) { unsigned long pre_end = start + (num_pages << PAGE_SHIFT); unsigned long next_page = (start + PMD_SIZE) & PMD_MASK; pre_end = min_t(unsigned long, pre_end, next_page); cur_pages = (pre_end - start) >> PAGE_SHIFT; cur_pages = min_t(unsigned int, num_pages, cur_pages); /* * Need a PTE page? */ pmd = pmd_offset(pud, start); if (pmd_none(*pmd)) if (alloc_pte_page(pmd)) return -1; populate_pte(cpa, start, pre_end, cur_pages, pmd, pgprot); start = pre_end; } /* * We mapped them all? */ if (num_pages == cur_pages) return cur_pages; pmd_pgprot = pgprot_4k_2_large(pgprot); while (end - start >= PMD_SIZE) { /* * We cannot use a 1G page so allocate a PMD page if needed. */ if (pud_none(*pud)) if (alloc_pmd_page(pud)) return -1; pmd = pmd_offset(pud, start); set_pmd(pmd, pmd_mkhuge(pfn_pmd(cpa->pfn, canon_pgprot(pmd_pgprot)))); start += PMD_SIZE; cpa->pfn += PMD_SIZE >> PAGE_SHIFT; cur_pages += PMD_SIZE >> PAGE_SHIFT; } /* * Map trailing 4K pages. */ if (start < end) { pmd = pmd_offset(pud, start); if (pmd_none(*pmd)) if (alloc_pte_page(pmd)) return -1; populate_pte(cpa, start, end, num_pages - cur_pages, pmd, pgprot); } return num_pages; } static int populate_pud(struct cpa_data *cpa, unsigned long start, p4d_t *p4d, pgprot_t pgprot) { pud_t *pud; unsigned long end; long cur_pages = 0; pgprot_t pud_pgprot; end = start + (cpa->numpages << PAGE_SHIFT); /* * Not on a Gb page boundary? => map everything up to it with * smaller pages. */ if (start & (PUD_SIZE - 1)) { unsigned long pre_end; unsigned long next_page = (start + PUD_SIZE) & PUD_MASK; pre_end = min_t(unsigned long, end, next_page); cur_pages = (pre_end - start) >> PAGE_SHIFT; cur_pages = min_t(int, (int)cpa->numpages, cur_pages); pud = pud_offset(p4d, start); /* * Need a PMD page? */ if (pud_none(*pud)) if (alloc_pmd_page(pud)) return -1; cur_pages = populate_pmd(cpa, start, pre_end, cur_pages, pud, pgprot); if (cur_pages < 0) return cur_pages; start = pre_end; } /* We mapped them all? */ if (cpa->numpages == cur_pages) return cur_pages; pud = pud_offset(p4d, start); pud_pgprot = pgprot_4k_2_large(pgprot); /* * Map everything starting from the Gb boundary, possibly with 1G pages */ while (boot_cpu_has(X86_FEATURE_GBPAGES) && end - start >= PUD_SIZE) { set_pud(pud, pud_mkhuge(pfn_pud(cpa->pfn, canon_pgprot(pud_pgprot)))); start += PUD_SIZE; cpa->pfn += PUD_SIZE >> PAGE_SHIFT; cur_pages += PUD_SIZE >> PAGE_SHIFT; pud++; } /* Map trailing leftover */ if (start < end) { long tmp; pud = pud_offset(p4d, start); if (pud_none(*pud)) if (alloc_pmd_page(pud)) return -1; tmp = populate_pmd(cpa, start, end, cpa->numpages - cur_pages, pud, pgprot); if (tmp < 0) return cur_pages; cur_pages += tmp; } return cur_pages; } /* * Restrictions for kernel page table do not necessarily apply when mapping in * an alternate PGD. */ static int populate_pgd(struct cpa_data *cpa, unsigned long addr) { pgprot_t pgprot = __pgprot(_KERNPG_TABLE); pud_t *pud = NULL; /* shut up gcc */ p4d_t *p4d; pgd_t *pgd_entry; long ret; pgd_entry = cpa->pgd + pgd_index(addr); if (pgd_none(*pgd_entry)) { p4d = (p4d_t *)get_zeroed_page(GFP_KERNEL); if (!p4d) return -1; set_pgd(pgd_entry, __pgd(__pa(p4d) | _KERNPG_TABLE)); } /* * Allocate a PUD page and hand it down for mapping. */ p4d = p4d_offset(pgd_entry, addr); if (p4d_none(*p4d)) { pud = (pud_t *)get_zeroed_page(GFP_KERNEL); if (!pud) return -1; set_p4d(p4d, __p4d(__pa(pud) | _KERNPG_TABLE)); } pgprot_val(pgprot) &= ~pgprot_val(cpa->mask_clr); pgprot_val(pgprot) |= pgprot_val(cpa->mask_set); ret = populate_pud(cpa, addr, p4d, pgprot); if (ret < 0) { /* * Leave the PUD page in place in case some other CPU or thread * already found it, but remove any useless entries we just * added to it. */ unmap_pud_range(p4d, addr, addr + (cpa->numpages << PAGE_SHIFT)); return ret; } cpa->numpages = ret; return 0; } static int __cpa_process_fault(struct cpa_data *cpa, unsigned long vaddr, int primary) { if (cpa->pgd) { /* * Right now, we only execute this code path when mapping * the EFI virtual memory map regions, no other users * provide a ->pgd value. This may change in the future. */ return populate_pgd(cpa, vaddr); } /* * Ignore all non primary paths. */ if (!primary) { cpa->numpages = 1; return 0; } /* * Ignore the NULL PTE for kernel identity mapping, as it is expected * to have holes. * Also set numpages to '1' indicating that we processed cpa req for * one virtual address page and its pfn. TBD: numpages can be set based * on the initial value and the level returned by lookup_address(). */ if (within(vaddr, PAGE_OFFSET, PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT))) { cpa->numpages = 1; cpa->pfn = __pa(vaddr) >> PAGE_SHIFT; return 0; } else if (__cpa_pfn_in_highmap(cpa->pfn)) { /* Faults in the highmap are OK, so do not warn: */ return -EFAULT; } else { WARN(1, KERN_WARNING "CPA: called for zero pte. " "vaddr = %lx cpa->vaddr = %lx\n", vaddr, *cpa->vaddr); return -EFAULT; } } static int __change_page_attr(struct cpa_data *cpa, int primary) { unsigned long address; int do_split, err; unsigned int level; pte_t *kpte, old_pte; address = __cpa_addr(cpa, cpa->curpage); repeat: kpte = _lookup_address_cpa(cpa, address, &level); if (!kpte) return __cpa_process_fault(cpa, address, primary); old_pte = *kpte; if (pte_none(old_pte)) return __cpa_process_fault(cpa, address, primary); if (level == PG_LEVEL_4K) { pte_t new_pte; pgprot_t old_prot = pte_pgprot(old_pte); pgprot_t new_prot = pte_pgprot(old_pte); unsigned long pfn = pte_pfn(old_pte); pgprot_val(new_prot) &= ~pgprot_val(cpa->mask_clr); pgprot_val(new_prot) |= pgprot_val(cpa->mask_set); cpa_inc_4k_install(); /* Hand in lpsize = 0 to enforce the protection mechanism */ new_prot = static_protections(new_prot, address, pfn, 1, 0, CPA_PROTECT); new_prot = verify_rwx(old_prot, new_prot, address, pfn, 1); new_prot = pgprot_clear_protnone_bits(new_prot); /* * We need to keep the pfn from the existing PTE, * after all we're only going to change its attributes * not the memory it points to */ new_pte = pfn_pte(pfn, new_prot); cpa->pfn = pfn; /* * Do we really change anything ? */ if (pte_val(old_pte) != pte_val(new_pte)) { set_pte_atomic(kpte, new_pte); cpa->flags |= CPA_FLUSHTLB; } cpa->numpages = 1; return 0; } /* * Check, whether we can keep the large page intact * and just change the pte: */ do_split = should_split_large_page(kpte, address, cpa); /* * When the range fits into the existing large page, * return. cp->numpages and cpa->tlbflush have been updated in * try_large_page: */ if (do_split <= 0) return do_split; /* * We have to split the large page: */ err = split_large_page(cpa, kpte, address); if (!err) goto repeat; return err; } static int __change_page_attr_set_clr(struct cpa_data *cpa, int primary); /* * Check the directmap and "high kernel map" 'aliases'. */ static int cpa_process_alias(struct cpa_data *cpa) { struct cpa_data alias_cpa; unsigned long laddr = (unsigned long)__va(cpa->pfn << PAGE_SHIFT); unsigned long vaddr; int ret; if (!pfn_range_is_mapped(cpa->pfn, cpa->pfn + 1)) return 0; /* * No need to redo, when the primary call touched the direct * mapping already: */ vaddr = __cpa_addr(cpa, cpa->curpage); if (!(within(vaddr, PAGE_OFFSET, PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT)))) { alias_cpa = *cpa; alias_cpa.vaddr = &laddr; alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY); alias_cpa.curpage = 0; /* Directmap always has NX set, do not modify. */ if (__supported_pte_mask & _PAGE_NX) { alias_cpa.mask_clr.pgprot &= ~_PAGE_NX; alias_cpa.mask_set.pgprot &= ~_PAGE_NX; } cpa->force_flush_all = 1; ret = __change_page_attr_set_clr(&alias_cpa, 0); if (ret) return ret; } #ifdef CONFIG_X86_64 /* * If the primary call didn't touch the high mapping already * and the physical address is inside the kernel map, we need * to touch the high mapped kernel as well: */ if (!within(vaddr, (unsigned long)_text, _brk_end) && __cpa_pfn_in_highmap(cpa->pfn)) { unsigned long temp_cpa_vaddr = (cpa->pfn << PAGE_SHIFT) + __START_KERNEL_map - phys_base; alias_cpa = *cpa; alias_cpa.vaddr = &temp_cpa_vaddr; alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY); alias_cpa.curpage = 0; /* * [_text, _brk_end) also covers data, do not modify NX except * in cases where the highmap is the primary target. */ if (__supported_pte_mask & _PAGE_NX) { alias_cpa.mask_clr.pgprot &= ~_PAGE_NX; alias_cpa.mask_set.pgprot &= ~_PAGE_NX; } cpa->force_flush_all = 1; /* * The high mapping range is imprecise, so ignore the * return value. */ __change_page_attr_set_clr(&alias_cpa, 0); } #endif return 0; } static int __change_page_attr_set_clr(struct cpa_data *cpa, int primary) { unsigned long numpages = cpa->numpages; unsigned long rempages = numpages; int ret = 0; /* * No changes, easy! */ if (!(pgprot_val(cpa->mask_set) | pgprot_val(cpa->mask_clr)) && !cpa->force_split) return ret; while (rempages) { /* * Store the remaining nr of pages for the large page * preservation check. */ cpa->numpages = rempages; /* for array changes, we can't use large page */ if (cpa->flags & (CPA_ARRAY | CPA_PAGES_ARRAY)) cpa->numpages = 1; if (!debug_pagealloc_enabled()) spin_lock(&cpa_lock); ret = __change_page_attr(cpa, primary); if (!debug_pagealloc_enabled()) spin_unlock(&cpa_lock); if (ret) goto out; if (primary && !(cpa->flags & CPA_NO_CHECK_ALIAS)) { ret = cpa_process_alias(cpa); if (ret) goto out; } /* * Adjust the number of pages with the result of the * CPA operation. Either a large page has been * preserved or a single page update happened. */ BUG_ON(cpa->numpages > rempages || !cpa->numpages); rempages -= cpa->numpages; cpa->curpage += cpa->numpages; } out: /* Restore the original numpages */ cpa->numpages = numpages; return ret; } static int change_page_attr_set_clr(unsigned long *addr, int numpages, pgprot_t mask_set, pgprot_t mask_clr, int force_split, int in_flag, struct page **pages) { struct cpa_data cpa; int ret, cache; memset(&cpa, 0, sizeof(cpa)); /* * Check, if we are requested to set a not supported * feature. Clearing non-supported features is OK. */ mask_set = canon_pgprot(mask_set); if (!pgprot_val(mask_set) && !pgprot_val(mask_clr) && !force_split) return 0; /* Ensure we are PAGE_SIZE aligned */ if (in_flag & CPA_ARRAY) { int i; for (i = 0; i < numpages; i++) { if (addr[i] & ~PAGE_MASK) { addr[i] &= PAGE_MASK; WARN_ON_ONCE(1); } } } else if (!(in_flag & CPA_PAGES_ARRAY)) { /* * in_flag of CPA_PAGES_ARRAY implies it is aligned. * No need to check in that case */ if (*addr & ~PAGE_MASK) { *addr &= PAGE_MASK; /* * People should not be passing in unaligned addresses: */ WARN_ON_ONCE(1); } } /* Must avoid aliasing mappings in the highmem code */ kmap_flush_unused(); vm_unmap_aliases(); cpa.vaddr = addr; cpa.pages = pages; cpa.numpages = numpages; cpa.mask_set = mask_set; cpa.mask_clr = mask_clr; cpa.flags = in_flag; cpa.curpage = 0; cpa.force_split = force_split; ret = __change_page_attr_set_clr(&cpa, 1); /* * Check whether we really changed something: */ if (!(cpa.flags & CPA_FLUSHTLB)) goto out; /* * No need to flush, when we did not set any of the caching * attributes: */ cache = !!pgprot2cachemode(mask_set); /* * On error; flush everything to be sure. */ if (ret) { cpa_flush_all(cache); goto out; } cpa_flush(&cpa, cache); out: return ret; } static inline int change_page_attr_set(unsigned long *addr, int numpages, pgprot_t mask, int array) { return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0), 0, (array ? CPA_ARRAY : 0), NULL); } static inline int change_page_attr_clear(unsigned long *addr, int numpages, pgprot_t mask, int array) { return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask, 0, (array ? CPA_ARRAY : 0), NULL); } static inline int cpa_set_pages_array(struct page **pages, int numpages, pgprot_t mask) { return change_page_attr_set_clr(NULL, numpages, mask, __pgprot(0), 0, CPA_PAGES_ARRAY, pages); } static inline int cpa_clear_pages_array(struct page **pages, int numpages, pgprot_t mask) { return change_page_attr_set_clr(NULL, numpages, __pgprot(0), mask, 0, CPA_PAGES_ARRAY, pages); } /* * __set_memory_prot is an internal helper for callers that have been passed * a pgprot_t value from upper layers and a reservation has already been taken. * If you want to set the pgprot to a specific page protocol, use the * set_memory_xx() functions. */ int __set_memory_prot(unsigned long addr, int numpages, pgprot_t prot) { return change_page_attr_set_clr(&addr, numpages, prot, __pgprot(~pgprot_val(prot)), 0, 0, NULL); } int _set_memory_uc(unsigned long addr, int numpages) { /* * for now UC MINUS. see comments in ioremap() * If you really need strong UC use ioremap_uc(), but note * that you cannot override IO areas with set_memory_*() as * these helpers cannot work with IO memory. */ return change_page_attr_set(&addr, numpages, cachemode2pgprot(_PAGE_CACHE_MODE_UC_MINUS), 0); } int set_memory_uc(unsigned long addr, int numpages) { int ret; /* * for now UC MINUS. see comments in ioremap() */ ret = memtype_reserve(__pa(addr), __pa(addr) + numpages * PAGE_SIZE, _PAGE_CACHE_MODE_UC_MINUS, NULL); if (ret) goto out_err; ret = _set_memory_uc(addr, numpages); if (ret) goto out_free; return 0; out_free: memtype_free(__pa(addr), __pa(addr) + numpages * PAGE_SIZE); out_err: return ret; } EXPORT_SYMBOL(set_memory_uc); int _set_memory_wc(unsigned long addr, int numpages) { int ret; ret = change_page_attr_set(&addr, numpages, cachemode2pgprot(_PAGE_CACHE_MODE_UC_MINUS), 0); if (!ret) { ret = change_page_attr_set_clr(&addr, numpages, cachemode2pgprot(_PAGE_CACHE_MODE_WC), __pgprot(_PAGE_CACHE_MASK), 0, 0, NULL); } return ret; } int set_memory_wc(unsigned long addr, int numpages) { int ret; ret = memtype_reserve(__pa(addr), __pa(addr) + numpages * PAGE_SIZE, _PAGE_CACHE_MODE_WC, NULL); if (ret) return ret; ret = _set_memory_wc(addr, numpages); if (ret) memtype_free(__pa(addr), __pa(addr) + numpages * PAGE_SIZE); return ret; } EXPORT_SYMBOL(set_memory_wc); int _set_memory_wt(unsigned long addr, int numpages) { return change_page_attr_set(&addr, numpages, cachemode2pgprot(_PAGE_CACHE_MODE_WT), 0); } int _set_memory_wb(unsigned long addr, int numpages) { /* WB cache mode is hard wired to all cache attribute bits being 0 */ return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_CACHE_MASK), 0); } int set_memory_wb(unsigned long addr, int numpages) { int ret; ret = _set_memory_wb(addr, numpages); if (ret) return ret; memtype_free(__pa(addr), __pa(addr) + numpages * PAGE_SIZE); return 0; } EXPORT_SYMBOL(set_memory_wb); /* Prevent speculative access to a page by marking it not-present */ #ifdef CONFIG_X86_64 int set_mce_nospec(unsigned long pfn) { unsigned long decoy_addr; int rc; /* SGX pages are not in the 1:1 map */ if (arch_is_platform_page(pfn << PAGE_SHIFT)) return 0; /* * We would like to just call: * set_memory_XX((unsigned long)pfn_to_kaddr(pfn), 1); * but doing that would radically increase the odds of a * speculative access to the poison page because we'd have * the virtual address of the kernel 1:1 mapping sitting * around in registers. * Instead we get tricky. We create a non-canonical address * that looks just like the one we want, but has bit 63 flipped. * This relies on set_memory_XX() properly sanitizing any __pa() * results with __PHYSICAL_MASK or PTE_PFN_MASK. */ decoy_addr = (pfn << PAGE_SHIFT) + (PAGE_OFFSET ^ BIT(63)); rc = set_memory_np(decoy_addr, 1); if (rc) pr_warn("Could not invalidate pfn=0x%lx from 1:1 map\n", pfn); return rc; } static int set_memory_p(unsigned long *addr, int numpages) { return change_page_attr_set(addr, numpages, __pgprot(_PAGE_PRESENT), 0); } /* Restore full speculative operation to the pfn. */ int clear_mce_nospec(unsigned long pfn) { unsigned long addr = (unsigned long) pfn_to_kaddr(pfn); return set_memory_p(&addr, 1); } EXPORT_SYMBOL_GPL(clear_mce_nospec); #endif /* CONFIG_X86_64 */ int set_memory_x(unsigned long addr, int numpages) { if (!(__supported_pte_mask & _PAGE_NX)) return 0; return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_NX), 0); } int set_memory_nx(unsigned long addr, int numpages) { if (!(__supported_pte_mask & _PAGE_NX)) return 0; return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_NX), 0); } int set_memory_ro(unsigned long addr, int numpages) { return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_RW | _PAGE_DIRTY), 0); } int set_memory_rox(unsigned long addr, int numpages) { pgprot_t clr = __pgprot(_PAGE_RW | _PAGE_DIRTY); if (__supported_pte_mask & _PAGE_NX) clr.pgprot |= _PAGE_NX; return change_page_attr_clear(&addr, numpages, clr, 0); } int set_memory_rw(unsigned long addr, int numpages) { return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_RW), 0); } int set_memory_np(unsigned long addr, int numpages) { return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_PRESENT), 0); } int set_memory_np_noalias(unsigned long addr, int numpages) { return change_page_attr_set_clr(&addr, numpages, __pgprot(0), __pgprot(_PAGE_PRESENT), 0, CPA_NO_CHECK_ALIAS, NULL); } int set_memory_4k(unsigned long addr, int numpages) { return change_page_attr_set_clr(&addr, numpages, __pgprot(0), __pgprot(0), 1, 0, NULL); } int set_memory_nonglobal(unsigned long addr, int numpages) { return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_GLOBAL), 0); } int set_memory_global(unsigned long addr, int numpages) { return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_GLOBAL), 0); } /* * __set_memory_enc_pgtable() is used for the hypervisors that get * informed about "encryption" status via page tables. */ static int __set_memory_enc_pgtable(unsigned long addr, int numpages, bool enc) { pgprot_t empty = __pgprot(0); struct cpa_data cpa; int ret; /* Should not be working on unaligned addresses */ if (WARN_ONCE(addr & ~PAGE_MASK, "misaligned address: %#lx\n", addr)) addr &= PAGE_MASK; memset(&cpa, 0, sizeof(cpa)); cpa.vaddr = &addr; cpa.numpages = numpages; cpa.mask_set = enc ? pgprot_encrypted(empty) : pgprot_decrypted(empty); cpa.mask_clr = enc ? pgprot_decrypted(empty) : pgprot_encrypted(empty); cpa.pgd = init_mm.pgd; /* Must avoid aliasing mappings in the highmem code */ kmap_flush_unused(); vm_unmap_aliases(); /* Flush the caches as needed before changing the encryption attribute. */ if (x86_platform.guest.enc_tlb_flush_required(enc)) cpa_flush(&cpa, x86_platform.guest.enc_cache_flush_required()); /* Notify hypervisor that we are about to set/clr encryption attribute. */ if (!x86_platform.guest.enc_status_change_prepare(addr, numpages, enc)) return -EIO; ret = __change_page_attr_set_clr(&cpa, 1); /* * After changing the encryption attribute, we need to flush TLBs again * in case any speculative TLB caching occurred (but no need to flush * caches again). We could just use cpa_flush_all(), but in case TLB * flushing gets optimized in the cpa_flush() path use the same logic * as above. */ cpa_flush(&cpa, 0); /* Notify hypervisor that we have successfully set/clr encryption attribute. */ if (!ret) { if (!x86_platform.guest.enc_status_change_finish(addr, numpages, enc)) ret = -EIO; } return ret; } static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc) { if (cc_platform_has(CC_ATTR_MEM_ENCRYPT)) return __set_memory_enc_pgtable(addr, numpages, enc); return 0; } int set_memory_encrypted(unsigned long addr, int numpages) { return __set_memory_enc_dec(addr, numpages, true); } EXPORT_SYMBOL_GPL(set_memory_encrypted); int set_memory_decrypted(unsigned long addr, int numpages) { return __set_memory_enc_dec(addr, numpages, false); } EXPORT_SYMBOL_GPL(set_memory_decrypted); int set_pages_uc(struct page *page, int numpages) { unsigned long addr = (unsigned long)page_address(page); return set_memory_uc(addr, numpages); } EXPORT_SYMBOL(set_pages_uc); static int _set_pages_array(struct page **pages, int numpages, enum page_cache_mode new_type) { unsigned long start; unsigned long end; enum page_cache_mode set_type; int i; int free_idx; int ret; for (i = 0; i < numpages; i++) { if (PageHighMem(pages[i])) continue; start = page_to_pfn(pages[i]) << PAGE_SHIFT; end = start + PAGE_SIZE; if (memtype_reserve(start, end, new_type, NULL)) goto err_out; } /* If WC, set to UC- first and then WC */ set_type = (new_type == _PAGE_CACHE_MODE_WC) ? _PAGE_CACHE_MODE_UC_MINUS : new_type; ret = cpa_set_pages_array(pages, numpages, cachemode2pgprot(set_type)); if (!ret && new_type == _PAGE_CACHE_MODE_WC) ret = change_page_attr_set_clr(NULL, numpages, cachemode2pgprot( _PAGE_CACHE_MODE_WC), __pgprot(_PAGE_CACHE_MASK), 0, CPA_PAGES_ARRAY, pages); if (ret) goto err_out; return 0; /* Success */ err_out: free_idx = i; for (i = 0; i < free_idx; i++) { if (PageHighMem(pages[i])) continue; start = page_to_pfn(pages[i]) << PAGE_SHIFT; end = start + PAGE_SIZE; memtype_free(start, end); } return -EINVAL; } int set_pages_array_uc(struct page **pages, int numpages) { return _set_pages_array(pages, numpages, _PAGE_CACHE_MODE_UC_MINUS); } EXPORT_SYMBOL(set_pages_array_uc); int set_pages_array_wc(struct page **pages, int numpages) { return _set_pages_array(pages, numpages, _PAGE_CACHE_MODE_WC); } EXPORT_SYMBOL(set_pages_array_wc); int set_pages_wb(struct page *page, int numpages) { unsigned long addr = (unsigned long)page_address(page); return set_memory_wb(addr, numpages); } EXPORT_SYMBOL(set_pages_wb); int set_pages_array_wb(struct page **pages, int numpages) { int retval; unsigned long start; unsigned long end; int i; /* WB cache mode is hard wired to all cache attribute bits being 0 */ retval = cpa_clear_pages_array(pages, numpages, __pgprot(_PAGE_CACHE_MASK)); if (retval) return retval; for (i = 0; i < numpages; i++) { if (PageHighMem(pages[i])) continue; start = page_to_pfn(pages[i]) << PAGE_SHIFT; end = start + PAGE_SIZE; memtype_free(start, end); } return 0; } EXPORT_SYMBOL(set_pages_array_wb); int set_pages_ro(struct page *page, int numpages) { unsigned long addr = (unsigned long)page_address(page); return set_memory_ro(addr, numpages); } int set_pages_rw(struct page *page, int numpages) { unsigned long addr = (unsigned long)page_address(page); return set_memory_rw(addr, numpages); } static int __set_pages_p(struct page *page, int numpages) { unsigned long tempaddr = (unsigned long) page_address(page); struct cpa_data cpa = { .vaddr = &tempaddr, .pgd = NULL, .numpages = numpages, .mask_set = __pgprot(_PAGE_PRESENT | _PAGE_RW), .mask_clr = __pgprot(0), .flags = CPA_NO_CHECK_ALIAS }; /* * No alias checking needed for setting present flag. otherwise, * we may need to break large pages for 64-bit kernel text * mappings (this adds to complexity if we want to do this from * atomic context especially). Let's keep it simple! */ return __change_page_attr_set_clr(&cpa, 1); } static int __set_pages_np(struct page *page, int numpages) { unsigned long tempaddr = (unsigned long) page_address(page); struct cpa_data cpa = { .vaddr = &tempaddr, .pgd = NULL, .numpages = numpages, .mask_set = __pgprot(0), .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW), .flags = CPA_NO_CHECK_ALIAS }; /* * No alias checking needed for setting not present flag. otherwise, * we may need to break large pages for 64-bit kernel text * mappings (this adds to complexity if we want to do this from * atomic context especially). Let's keep it simple! */ return __change_page_attr_set_clr(&cpa, 1); } int set_direct_map_invalid_noflush(struct page *page) { return __set_pages_np(page, 1); } int set_direct_map_default_noflush(struct page *page) { return __set_pages_p(page, 1); } #ifdef CONFIG_DEBUG_PAGEALLOC void __kernel_map_pages(struct page *page, int numpages, int enable) { if (PageHighMem(page)) return; if (!enable) { debug_check_no_locks_freed(page_address(page), numpages * PAGE_SIZE); } /* * The return value is ignored as the calls cannot fail. * Large pages for identity mappings are not used at boot time * and hence no memory allocations during large page split. */ if (enable) __set_pages_p(page, numpages); else __set_pages_np(page, numpages); /* * We should perform an IPI and flush all tlbs, * but that can deadlock->flush only current cpu. * Preemption needs to be disabled around __flush_tlb_all() due to * CR3 reload in __native_flush_tlb(). */ preempt_disable(); __flush_tlb_all(); preempt_enable(); arch_flush_lazy_mmu_mode(); } #endif /* CONFIG_DEBUG_PAGEALLOC */ bool kernel_page_present(struct page *page) { unsigned int level; pte_t *pte; if (PageHighMem(page)) return false; pte = lookup_address((unsigned long)page_address(page), &level); return (pte_val(*pte) & _PAGE_PRESENT); } int __init kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address, unsigned numpages, unsigned long page_flags) { int retval = -EINVAL; struct cpa_data cpa = { .vaddr = &address, .pfn = pfn, .pgd = pgd, .numpages = numpages, .mask_set = __pgprot(0), .mask_clr = __pgprot(~page_flags & (_PAGE_NX|_PAGE_RW)), .flags = CPA_NO_CHECK_ALIAS, }; WARN_ONCE(num_online_cpus() > 1, "Don't call after initializing SMP"); if (!(__supported_pte_mask & _PAGE_NX)) goto out; if (!(page_flags & _PAGE_ENC)) cpa.mask_clr = pgprot_encrypted(cpa.mask_clr); cpa.mask_set = __pgprot(_PAGE_PRESENT | page_flags); retval = __change_page_attr_set_clr(&cpa, 1); __flush_tlb_all(); out: return retval; } /* * __flush_tlb_all() flushes mappings only on current CPU and hence this * function shouldn't be used in an SMP environment. Presently, it's used only * during boot (way before smp_init()) by EFI subsystem and hence is ok. */ int __init kernel_unmap_pages_in_pgd(pgd_t *pgd, unsigned long address, unsigned long numpages) { int retval; /* * The typical sequence for unmapping is to find a pte through * lookup_address_in_pgd() (ideally, it should never return NULL because * the address is already mapped) and change its protections. As pfn is * the *target* of a mapping, it's not useful while unmapping. */ struct cpa_data cpa = { .vaddr = &address, .pfn = 0, .pgd = pgd, .numpages = numpages, .mask_set = __pgprot(0), .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW), .flags = CPA_NO_CHECK_ALIAS, }; WARN_ONCE(num_online_cpus() > 1, "Don't call after initializing SMP"); retval = __change_page_attr_set_clr(&cpa, 1); __flush_tlb_all(); return retval; } /* * The testcases use internal knowledge of the implementation that shouldn't * be exposed to the rest of the kernel. Include these directly here. */ #ifdef CONFIG_CPA_DEBUG #include "cpa-test.c" #endif |
| 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 | // SPDX-License-Identifier: GPL-2.0-or-later /* * connector.c * * 2004+ Copyright (c) Evgeniy Polyakov <zbr@ioremap.net> * All rights reserved. */ #include <linux/compiler.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/list.h> #include <linux/skbuff.h> #include <net/netlink.h> #include <linux/moduleparam.h> #include <linux/connector.h> #include <linux/slab.h> #include <linux/mutex.h> #include <linux/proc_fs.h> #include <linux/spinlock.h> #include <net/sock.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("Evgeniy Polyakov <zbr@ioremap.net>"); MODULE_DESCRIPTION("Generic userspace <-> kernelspace connector."); MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_CONNECTOR); static struct cn_dev cdev; static int cn_already_initialized; /* * Sends mult (multiple) cn_msg at a time. * * msg->seq and msg->ack are used to determine message genealogy. * When someone sends message it puts there locally unique sequence * and random acknowledge numbers. Sequence number may be copied into * nlmsghdr->nlmsg_seq too. * * Sequence number is incremented with each message to be sent. * * If we expect a reply to our message then the sequence number in * received message MUST be the same as in original message, and * acknowledge number MUST be the same + 1. * * If we receive a message and its sequence number is not equal to the * one we are expecting then it is a new message. * * If we receive a message and its sequence number is the same as one * we are expecting but it's acknowledgement number is not equal to * the acknowledgement number in the original message + 1, then it is * a new message. * * If msg->len != len, then additional cn_msg messages are expected following * the first msg. * * The message is sent to, the portid if given, the group if given, both if * both, or if both are zero then the group is looked up and sent there. */ int cn_netlink_send_mult(struct cn_msg *msg, u16 len, u32 portid, u32 __group, gfp_t gfp_mask, netlink_filter_fn filter, void *filter_data) { struct cn_callback_entry *__cbq; unsigned int size; struct sk_buff *skb; struct nlmsghdr *nlh; struct cn_msg *data; struct cn_dev *dev = &cdev; u32 group = 0; int found = 0; if (portid || __group) { group = __group; } else { spin_lock_bh(&dev->cbdev->queue_lock); list_for_each_entry(__cbq, &dev->cbdev->queue_list, callback_entry) { if (cn_cb_equal(&__cbq->id.id, &msg->id)) { found = 1; group = __cbq->group; break; } } spin_unlock_bh(&dev->cbdev->queue_lock); if (!found) return -ENODEV; } if (!portid && !netlink_has_listeners(dev->nls, group)) return -ESRCH; size = sizeof(*msg) + len; skb = nlmsg_new(size, gfp_mask); if (!skb) return -ENOMEM; nlh = nlmsg_put(skb, 0, msg->seq, NLMSG_DONE, size, 0); if (!nlh) { kfree_skb(skb); return -EMSGSIZE; } data = nlmsg_data(nlh); memcpy(data, msg, size); NETLINK_CB(skb).dst_group = group; if (group) return netlink_broadcast_filtered(dev->nls, skb, portid, group, gfp_mask, filter, (void *)filter_data); return netlink_unicast(dev->nls, skb, portid, !gfpflags_allow_blocking(gfp_mask)); } EXPORT_SYMBOL_GPL(cn_netlink_send_mult); /* same as cn_netlink_send_mult except msg->len is used for len */ int cn_netlink_send(struct cn_msg *msg, u32 portid, u32 __group, gfp_t gfp_mask) { return cn_netlink_send_mult(msg, msg->len, portid, __group, gfp_mask, NULL, NULL); } EXPORT_SYMBOL_GPL(cn_netlink_send); /* * Callback helper - queues work and setup destructor for given data. */ static int cn_call_callback(struct sk_buff *skb) { struct nlmsghdr *nlh; struct cn_callback_entry *i, *cbq = NULL; struct cn_dev *dev = &cdev; struct cn_msg *msg = nlmsg_data(nlmsg_hdr(skb)); struct netlink_skb_parms *nsp = &NETLINK_CB(skb); int err = -ENODEV; /* verify msg->len is within skb */ nlh = nlmsg_hdr(skb); if (nlh->nlmsg_len < NLMSG_HDRLEN + sizeof(struct cn_msg) + msg->len) return -EINVAL; spin_lock_bh(&dev->cbdev->queue_lock); list_for_each_entry(i, &dev->cbdev->queue_list, callback_entry) { if (cn_cb_equal(&i->id.id, &msg->id)) { refcount_inc(&i->refcnt); cbq = i; break; } } spin_unlock_bh(&dev->cbdev->queue_lock); if (cbq != NULL) { cbq->callback(msg, nsp); kfree_skb(skb); cn_queue_release_callback(cbq); err = 0; } return err; } /* * Allow non-root access for NETLINK_CONNECTOR family having CN_IDX_PROC * multicast group. */ static int cn_bind(struct net *net, int group) { unsigned long groups = (unsigned long) group; if (ns_capable(net->user_ns, CAP_NET_ADMIN)) return 0; if (test_bit(CN_IDX_PROC - 1, &groups)) return 0; return -EPERM; } static void cn_release(struct sock *sk, unsigned long *groups) { if (groups && test_bit(CN_IDX_PROC - 1, groups)) { kfree(sk->sk_user_data); sk->sk_user_data = NULL; } } /* * Main netlink receiving function. * * It checks skb, netlink header and msg sizes, and calls callback helper. */ static void cn_rx_skb(struct sk_buff *skb) { struct nlmsghdr *nlh; int len, err; if (skb->len >= NLMSG_HDRLEN) { nlh = nlmsg_hdr(skb); len = nlmsg_len(nlh); if (len < (int)sizeof(struct cn_msg) || skb->len < nlh->nlmsg_len || len > CONNECTOR_MAX_MSG_SIZE) return; err = cn_call_callback(skb_get(skb)); if (err < 0) kfree_skb(skb); } } /* * Callback add routing - adds callback with given ID and name. * If there is registered callback with the same ID it will not be added. * * May sleep. */ int cn_add_callback(const struct cb_id *id, const char *name, void (*callback)(struct cn_msg *, struct netlink_skb_parms *)) { struct cn_dev *dev = &cdev; if (!cn_already_initialized) return -EAGAIN; return cn_queue_add_callback(dev->cbdev, name, id, callback); } EXPORT_SYMBOL_GPL(cn_add_callback); /* * Callback remove routing - removes callback * with given ID. * If there is no registered callback with given * ID nothing happens. * * May sleep while waiting for reference counter to become zero. */ void cn_del_callback(const struct cb_id *id) { struct cn_dev *dev = &cdev; cn_queue_del_callback(dev->cbdev, id); } EXPORT_SYMBOL_GPL(cn_del_callback); static int __maybe_unused cn_proc_show(struct seq_file *m, void *v) { struct cn_queue_dev *dev = cdev.cbdev; struct cn_callback_entry *cbq; seq_printf(m, "Name ID\n"); spin_lock_bh(&dev->queue_lock); list_for_each_entry(cbq, &dev->queue_list, callback_entry) { seq_printf(m, "%-15s %u:%u\n", cbq->id.name, cbq->id.id.idx, cbq->id.id.val); } spin_unlock_bh(&dev->queue_lock); return 0; } static int cn_init(void) { struct cn_dev *dev = &cdev; struct netlink_kernel_cfg cfg = { .groups = CN_NETLINK_USERS + 0xf, .input = cn_rx_skb, .flags = NL_CFG_F_NONROOT_RECV, .bind = cn_bind, .release = cn_release, }; dev->nls = netlink_kernel_create(&init_net, NETLINK_CONNECTOR, &cfg); if (!dev->nls) return -EIO; dev->cbdev = cn_queue_alloc_dev("cqueue", dev->nls); if (!dev->cbdev) { netlink_kernel_release(dev->nls); return -EINVAL; } cn_already_initialized = 1; proc_create_single("connector", S_IRUGO, init_net.proc_net, cn_proc_show); return 0; } static void cn_fini(void) { struct cn_dev *dev = &cdev; cn_already_initialized = 0; remove_proc_entry("connector", init_net.proc_net); cn_queue_free_dev(dev->cbdev); netlink_kernel_release(dev->nls); } subsys_initcall(cn_init); module_exit(cn_fini); |
| 1711 1711 1708 1709 1710 1705 1710 1709 1711 1708 1711 1711 1709 1710 1704 1707 1710 1711 1709 1708 1710 1705 1710 1711 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 | // SPDX-License-Identifier: GPL-2.0+ /* * Base port operations for 8250/16550-type serial ports * * Based on drivers/char/serial.c, by Linus Torvalds, Theodore Ts'o. * Split from 8250_core.c, Copyright (C) 2001 Russell King. * * A note about mapbase / membase * * mapbase is the physical address of the IO port. * membase is an 'ioremapped' cookie. */ #include <linux/module.h> #include <linux/moduleparam.h> #include <linux/ioport.h> #include <linux/init.h> #include <linux/irq.h> #include <linux/console.h> #include <linux/gpio/consumer.h> #include <linux/sysrq.h> #include <linux/delay.h> #include <linux/platform_device.h> #include <linux/tty.h> #include <linux/ratelimit.h> #include <linux/tty_flip.h> #include <linux/serial.h> #include <linux/serial_8250.h> #include <linux/nmi.h> #include <linux/mutex.h> #include <linux/slab.h> #include <linux/uaccess.h> #include <linux/pm_runtime.h> #include <linux/ktime.h> #include <asm/io.h> #include <asm/irq.h> #include "8250.h" /* Nuvoton NPCM timeout register */ #define UART_NPCM_TOR 7 #define UART_NPCM_TOIE BIT(7) /* Timeout Interrupt Enable */ /* * Debugging. */ #if 0 #define DEBUG_AUTOCONF(fmt...) printk(fmt) #else #define DEBUG_AUTOCONF(fmt...) do { } while (0) #endif /* * Here we define the default xmit fifo size used for each type of UART. */ static const struct serial8250_config uart_config[] = { [PORT_UNKNOWN] = { .name = "unknown", .fifo_size = 1, .tx_loadsz = 1, }, [PORT_8250] = { .name = "8250", .fifo_size = 1, .tx_loadsz = 1, }, [PORT_16450] = { .name = "16450", .fifo_size = 1, .tx_loadsz = 1, }, [PORT_16550] = { .name = "16550", .fifo_size = 1, .tx_loadsz = 1, }, [PORT_16550A] = { .name = "16550A", .fifo_size = 16, .tx_loadsz = 16, .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10, .rxtrig_bytes = {1, 4, 8, 14}, .flags = UART_CAP_FIFO, }, [PORT_CIRRUS] = { .name = "Cirrus", .fifo_size = 1, .tx_loadsz = 1, }, [PORT_16650] = { .name = "ST16650", .fifo_size = 1, .tx_loadsz = 1, .flags = UART_CAP_FIFO | UART_CAP_EFR | UART_CAP_SLEEP, }, [PORT_16650V2] = { .name = "ST16650V2", .fifo_size = 32, .tx_loadsz = 16, .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_01 | UART_FCR_T_TRIG_00, .rxtrig_bytes = {8, 16, 24, 28}, .flags = UART_CAP_FIFO | UART_CAP_EFR | UART_CAP_SLEEP, }, [PORT_16750] = { .name = "TI16750", .fifo_size = 64, .tx_loadsz = 64, .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10 | UART_FCR7_64BYTE, .rxtrig_bytes = {1, 16, 32, 56}, .flags = UART_CAP_FIFO | UART_CAP_SLEEP | UART_CAP_AFE, }, [PORT_STARTECH] = { .name = "Startech", .fifo_size = 1, .tx_loadsz = 1, }, [PORT_16C950] = { .name = "16C950/954", .fifo_size = 128, .tx_loadsz = 128, .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_01, .rxtrig_bytes = {16, 32, 112, 120}, /* UART_CAP_EFR breaks billionon CF bluetooth card. */ .flags = UART_CAP_FIFO | UART_CAP_SLEEP, }, [PORT_16654] = { .name = "ST16654", .fifo_size = 64, .tx_loadsz = 32, .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_01 | UART_FCR_T_TRIG_10, .rxtrig_bytes = {8, 16, 56, 60}, .flags = UART_CAP_FIFO | UART_CAP_EFR | UART_CAP_SLEEP, }, [PORT_16850] = { .name = "XR16850", .fifo_size = 128, .tx_loadsz = 128, .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10, .flags = UART_CAP_FIFO | UART_CAP_EFR | UART_CAP_SLEEP, }, [PORT_RSA] = { .name = "RSA", .fifo_size = 2048, .tx_loadsz = 2048, .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_11, .flags = UART_CAP_FIFO, }, [PORT_NS16550A] = { .name = "NS16550A", .fifo_size = 16, .tx_loadsz = 16, .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10, .flags = UART_CAP_FIFO | UART_NATSEMI, }, [PORT_XSCALE] = { .name = "XScale", .fifo_size = 32, .tx_loadsz = 32, .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10, .flags = UART_CAP_FIFO | UART_CAP_UUE | UART_CAP_RTOIE, }, [PORT_OCTEON] = { .name = "OCTEON", .fifo_size = 64, .tx_loadsz = 64, .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10, .flags = UART_CAP_FIFO, }, [PORT_U6_16550A] = { .name = "U6_16550A", .fifo_size = 64, .tx_loadsz = 64, .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10, .flags = UART_CAP_FIFO | UART_CAP_AFE, }, [PORT_TEGRA] = { .name = "Tegra", .fifo_size = 32, .tx_loadsz = 8, .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_01 | UART_FCR_T_TRIG_01, .rxtrig_bytes = {1, 4, 8, 14}, .flags = UART_CAP_FIFO | UART_CAP_RTOIE, }, [PORT_XR17D15X] = { .name = "XR17D15X", .fifo_size = 64, .tx_loadsz = 64, .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10, .flags = UART_CAP_FIFO | UART_CAP_AFE | UART_CAP_EFR | UART_CAP_SLEEP, }, [PORT_XR17V35X] = { .name = "XR17V35X", .fifo_size = 256, .tx_loadsz = 256, .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_11 | UART_FCR_T_TRIG_11, .flags = UART_CAP_FIFO | UART_CAP_AFE | UART_CAP_EFR | UART_CAP_SLEEP, }, [PORT_LPC3220] = { .name = "LPC3220", .fifo_size = 64, .tx_loadsz = 32, .fcr = UART_FCR_DMA_SELECT | UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_00 | UART_FCR_T_TRIG_00, .flags = UART_CAP_FIFO, }, [PORT_BRCM_TRUMANAGE] = { .name = "TruManage", .fifo_size = 1, .tx_loadsz = 1024, .flags = UART_CAP_HFIFO, }, [PORT_8250_CIR] = { .name = "CIR port" }, [PORT_ALTR_16550_F32] = { .name = "Altera 16550 FIFO32", .fifo_size = 32, .tx_loadsz = 32, .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10, .rxtrig_bytes = {1, 8, 16, 30}, .flags = UART_CAP_FIFO | UART_CAP_AFE, }, [PORT_ALTR_16550_F64] = { .name = "Altera 16550 FIFO64", .fifo_size = 64, .tx_loadsz = 64, .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10, .rxtrig_bytes = {1, 16, 32, 62}, .flags = UART_CAP_FIFO | UART_CAP_AFE, }, [PORT_ALTR_16550_F128] = { .name = "Altera 16550 FIFO128", .fifo_size = 128, .tx_loadsz = 128, .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10, .rxtrig_bytes = {1, 32, 64, 126}, .flags = UART_CAP_FIFO | UART_CAP_AFE, }, /* * tx_loadsz is set to 63-bytes instead of 64-bytes to implement * workaround of errata A-008006 which states that tx_loadsz should * be configured less than Maximum supported fifo bytes. */ [PORT_16550A_FSL64] = { .name = "16550A_FSL64", .fifo_size = 64, .tx_loadsz = 63, .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10 | UART_FCR7_64BYTE, .flags = UART_CAP_FIFO | UART_CAP_NOTEMT, }, [PORT_RT2880] = { .name = "Palmchip BK-3103", .fifo_size = 16, .tx_loadsz = 16, .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10, .rxtrig_bytes = {1, 4, 8, 14}, .flags = UART_CAP_FIFO, }, [PORT_DA830] = { .name = "TI DA8xx/66AK2x", .fifo_size = 16, .tx_loadsz = 16, .fcr = UART_FCR_DMA_SELECT | UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10, .rxtrig_bytes = {1, 4, 8, 14}, .flags = UART_CAP_FIFO | UART_CAP_AFE, }, [PORT_MTK_BTIF] = { .name = "MediaTek BTIF", .fifo_size = 16, .tx_loadsz = 16, .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_CLEAR_RCVR | UART_FCR_CLEAR_XMIT, .flags = UART_CAP_FIFO, }, [PORT_NPCM] = { .name = "Nuvoton 16550", .fifo_size = 16, .tx_loadsz = 16, .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10 | UART_FCR_CLEAR_RCVR | UART_FCR_CLEAR_XMIT, .rxtrig_bytes = {1, 4, 8, 14}, .flags = UART_CAP_FIFO, }, [PORT_SUNIX] = { .name = "Sunix", .fifo_size = 128, .tx_loadsz = 128, .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10, .rxtrig_bytes = {1, 32, 64, 112}, .flags = UART_CAP_FIFO | UART_CAP_SLEEP, }, [PORT_ASPEED_VUART] = { .name = "ASPEED VUART", .fifo_size = 16, .tx_loadsz = 16, .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_00, .rxtrig_bytes = {1, 4, 8, 14}, .flags = UART_CAP_FIFO, }, [PORT_MCHP16550A] = { .name = "MCHP16550A", .fifo_size = 256, .tx_loadsz = 256, .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_01, .rxtrig_bytes = {2, 66, 130, 194}, .flags = UART_CAP_FIFO, }, [PORT_BCM7271] = { .name = "Broadcom BCM7271 UART", .fifo_size = 32, .tx_loadsz = 32, .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_01, .rxtrig_bytes = {1, 8, 16, 30}, .flags = UART_CAP_FIFO | UART_CAP_AFE, }, }; /* Uart divisor latch read */ static u32 default_serial_dl_read(struct uart_8250_port *up) { /* Assign these in pieces to truncate any bits above 7. */ unsigned char dll = serial_in(up, UART_DLL); unsigned char dlm = serial_in(up, UART_DLM); return dll | dlm << 8; } /* Uart divisor latch write */ static void default_serial_dl_write(struct uart_8250_port *up, u32 value) { serial_out(up, UART_DLL, value & 0xff); serial_out(up, UART_DLM, value >> 8 & 0xff); } static unsigned int hub6_serial_in(struct uart_port *p, int offset) { offset = offset << p->regshift; outb(p->hub6 - 1 + offset, p->iobase); return inb(p->iobase + 1); } static void hub6_serial_out(struct uart_port *p, int offset, int value) { offset = offset << p->regshift; outb(p->hub6 - 1 + offset, p->iobase); outb(value, p->iobase + 1); } static unsigned int mem_serial_in(struct uart_port *p, int offset) { offset = offset << p->regshift; return readb(p->membase + offset); } static void mem_serial_out(struct uart_port *p, int offset, int value) { offset = offset << p->regshift; writeb(value, p->membase + offset); } static void mem16_serial_out(struct uart_port *p, int offset, int value) { offset = offset << p->regshift; writew(value, p->membase + offset); } static unsigned int mem16_serial_in(struct uart_port *p, int offset) { offset = offset << p->regshift; return readw(p->membase + offset); } static void mem32_serial_out(struct uart_port *p, int offset, int value) { offset = offset << p->regshift; writel(value, p->membase + offset); } static unsigned int mem32_serial_in(struct uart_port *p, int offset) { offset = offset << p->regshift; return readl(p->membase + offset); } static void mem32be_serial_out(struct uart_port *p, int offset, int value) { offset = offset << p->regshift; iowrite32be(value, p->membase + offset); } static unsigned int mem32be_serial_in(struct uart_port *p, int offset) { offset = offset << p->regshift; return ioread32be(p->membase + offset); } static unsigned int io_serial_in(struct uart_port *p, int offset) { offset = offset << p->regshift; return inb(p->iobase + offset); } static void io_serial_out(struct uart_port *p, int offset, int value) { offset = offset << p->regshift; outb(value, p->iobase + offset); } static int serial8250_default_handle_irq(struct uart_port *port); static void set_io_from_upio(struct uart_port *p) { struct uart_8250_port *up = up_to_u8250p(p); up->dl_read = default_serial_dl_read; up->dl_write = default_serial_dl_write; switch (p->iotype) { case UPIO_HUB6: p->serial_in = hub6_serial_in; p->serial_out = hub6_serial_out; break; case UPIO_MEM: p->serial_in = mem_serial_in; p->serial_out = mem_serial_out; break; case UPIO_MEM16: p->serial_in = mem16_serial_in; p->serial_out = mem16_serial_out; break; case UPIO_MEM32: p->serial_in = mem32_serial_in; p->serial_out = mem32_serial_out; break; case UPIO_MEM32BE: p->serial_in = mem32be_serial_in; p->serial_out = mem32be_serial_out; break; default: p->serial_in = io_serial_in; p->serial_out = io_serial_out; break; } /* Remember loaded iotype */ up->cur_iotype = p->iotype; p->handle_irq = serial8250_default_handle_irq; } static void serial_port_out_sync(struct uart_port *p, int offset, int value) { switch (p->iotype) { case UPIO_MEM: case UPIO_MEM16: case UPIO_MEM32: case UPIO_MEM32BE: case UPIO_AU: p->serial_out(p, offset, value); p->serial_in(p, UART_LCR); /* safe, no side-effects */ break; default: p->serial_out(p, offset, value); } } /* * FIFO support. */ static void serial8250_clear_fifos(struct uart_8250_port *p) { if (p->capabilities & UART_CAP_FIFO) { serial_out(p, UART_FCR, UART_FCR_ENABLE_FIFO); serial_out(p, UART_FCR, UART_FCR_ENABLE_FIFO | UART_FCR_CLEAR_RCVR | UART_FCR_CLEAR_XMIT); serial_out(p, UART_FCR, 0); } } static enum hrtimer_restart serial8250_em485_handle_start_tx(struct hrtimer *t); static enum hrtimer_restart serial8250_em485_handle_stop_tx(struct hrtimer *t); void serial8250_clear_and_reinit_fifos(struct uart_8250_port *p) { serial8250_clear_fifos(p); serial_out(p, UART_FCR, p->fcr); } EXPORT_SYMBOL_GPL(serial8250_clear_and_reinit_fifos); void serial8250_rpm_get(struct uart_8250_port *p) { if (!(p->capabilities & UART_CAP_RPM)) return; pm_runtime_get_sync(p->port.dev); } EXPORT_SYMBOL_GPL(serial8250_rpm_get); void serial8250_rpm_put(struct uart_8250_port *p) { if (!(p->capabilities & UART_CAP_RPM)) return; pm_runtime_mark_last_busy(p->port.dev); pm_runtime_put_autosuspend(p->port.dev); } EXPORT_SYMBOL_GPL(serial8250_rpm_put); /** * serial8250_em485_init() - put uart_8250_port into rs485 emulating * @p: uart_8250_port port instance * * The function is used to start rs485 software emulating on the * &struct uart_8250_port* @p. Namely, RTS is switched before/after * transmission. The function is idempotent, so it is safe to call it * multiple times. * * The caller MUST enable interrupt on empty shift register before * calling serial8250_em485_init(). This interrupt is not a part of * 8250 standard, but implementation defined. * * The function is supposed to be called from .rs485_config callback * or from any other callback protected with p->port.lock spinlock. * * See also serial8250_em485_destroy() * * Return 0 - success, -errno - otherwise */ static int serial8250_em485_init(struct uart_8250_port *p) { /* Port locked to synchronize UART_IER access against the console. */ lockdep_assert_held_once(&p->port.lock); if (p->em485) goto deassert_rts; p->em485 = kmalloc(sizeof(struct uart_8250_em485), GFP_ATOMIC); if (!p->em485) return -ENOMEM; hrtimer_init(&p->em485->stop_tx_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); hrtimer_init(&p->em485->start_tx_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); p->em485->stop_tx_timer.function = &serial8250_em485_handle_stop_tx; p->em485->start_tx_timer.function = &serial8250_em485_handle_start_tx; p->em485->port = p; p->em485->active_timer = NULL; p->em485->tx_stopped = true; deassert_rts: if (p->em485->tx_stopped) p->rs485_stop_tx(p); return 0; } /** * serial8250_em485_destroy() - put uart_8250_port into normal state * @p: uart_8250_port port instance * * The function is used to stop rs485 software emulating on the * &struct uart_8250_port* @p. The function is idempotent, so it is safe to * call it multiple times. * * The function is supposed to be called from .rs485_config callback * or from any other callback protected with p->port.lock spinlock. * * See also serial8250_em485_init() */ void serial8250_em485_destroy(struct uart_8250_port *p) { if (!p->em485) return; hrtimer_cancel(&p->em485->start_tx_timer); hrtimer_cancel(&p->em485->stop_tx_timer); kfree(p->em485); p->em485 = NULL; } EXPORT_SYMBOL_GPL(serial8250_em485_destroy); struct serial_rs485 serial8250_em485_supported = { .flags = SER_RS485_ENABLED | SER_RS485_RTS_ON_SEND | SER_RS485_RTS_AFTER_SEND | SER_RS485_TERMINATE_BUS | SER_RS485_RX_DURING_TX, .delay_rts_before_send = 1, .delay_rts_after_send = 1, }; EXPORT_SYMBOL_GPL(serial8250_em485_supported); /** * serial8250_em485_config() - generic ->rs485_config() callback * @port: uart port * @termios: termios structure * @rs485: rs485 settings * * Generic callback usable by 8250 uart drivers to activate rs485 settings * if the uart is incapable of driving RTS as a Transmit Enable signal in * hardware, relying on software emulation instead. */ int serial8250_em485_config(struct uart_port *port, struct ktermios *termios, struct serial_rs485 *rs485) { struct uart_8250_port *up = up_to_u8250p(port); /* pick sane settings if the user hasn't */ if (!!(rs485->flags & SER_RS485_RTS_ON_SEND) == !!(rs485->flags & SER_RS485_RTS_AFTER_SEND)) { rs485->flags |= SER_RS485_RTS_ON_SEND; rs485->flags &= ~SER_RS485_RTS_AFTER_SEND; } /* * Both serial8250_em485_init() and serial8250_em485_destroy() * are idempotent. */ if (rs485->flags & SER_RS485_ENABLED) return serial8250_em485_init(up); serial8250_em485_destroy(up); return 0; } EXPORT_SYMBOL_GPL(serial8250_em485_config); /* * These two wrappers ensure that enable_runtime_pm_tx() can be called more than * once and disable_runtime_pm_tx() will still disable RPM because the fifo is * empty and the HW can idle again. */ void serial8250_rpm_get_tx(struct uart_8250_port *p) { unsigned char rpm_active; if (!(p->capabilities & UART_CAP_RPM)) return; rpm_active = xchg(&p->rpm_tx_active, 1); if (rpm_active) return; pm_runtime_get_sync(p->port.dev); } EXPORT_SYMBOL_GPL(serial8250_rpm_get_tx); void serial8250_rpm_put_tx(struct uart_8250_port *p) { unsigned char rpm_active; if (!(p->capabilities & UART_CAP_RPM)) return; rpm_active = xchg(&p->rpm_tx_active, 0); if (!rpm_active) return; pm_runtime_mark_last_busy(p->port.dev); pm_runtime_put_autosuspend(p->port.dev); } EXPORT_SYMBOL_GPL(serial8250_rpm_put_tx); /* * IER sleep support. UARTs which have EFRs need the "extended * capability" bit enabled. Note that on XR16C850s, we need to * reset LCR to write to IER. */ static void serial8250_set_sleep(struct uart_8250_port *p, int sleep) { unsigned char lcr = 0, efr = 0; serial8250_rpm_get(p); if (p->capabilities & UART_CAP_SLEEP) { /* Synchronize UART_IER access against the console. */ uart_port_lock_irq(&p->port); if (p->capabilities & UART_CAP_EFR) { lcr = serial_in(p, UART_LCR); efr = serial_in(p, UART_EFR); serial_out(p, UART_LCR, UART_LCR_CONF_MODE_B); serial_out(p, UART_EFR, UART_EFR_ECB); serial_out(p, UART_LCR, 0); } serial_out(p, UART_IER, sleep ? UART_IERX_SLEEP : 0); if (p->capabilities & UART_CAP_EFR) { serial_out(p, UART_LCR, UART_LCR_CONF_MODE_B); serial_out(p, UART_EFR, efr); serial_out(p, UART_LCR, lcr); } uart_port_unlock_irq(&p->port); } serial8250_rpm_put(p); } static void serial8250_clear_IER(struct uart_8250_port *up) { if (up->capabilities & UART_CAP_UUE) serial_out(up, UART_IER, UART_IER_UUE); else serial_out(up, UART_IER, 0); } #ifdef CONFIG_SERIAL_8250_RSA /* * Attempts to turn on the RSA FIFO. Returns zero on failure. * We set the port uart clock rate if we succeed. */ static int __enable_rsa(struct uart_8250_port *up) { unsigned char mode; int result; mode = serial_in(up, UART_RSA_MSR); result = mode & UART_RSA_MSR_FIFO; if (!result) { serial_out(up, UART_RSA_MSR, mode | UART_RSA_MSR_FIFO); mode = serial_in(up, UART_RSA_MSR); result = mode & UART_RSA_MSR_FIFO; } if (result) up->port.uartclk = SERIAL_RSA_BAUD_BASE * 16; return result; } static void enable_rsa(struct uart_8250_port *up) { if (up->port.type == PORT_RSA) { if (up->port.uartclk != SERIAL_RSA_BAUD_BASE * 16) { uart_port_lock_irq(&up->port); __enable_rsa(up); uart_port_unlock_irq(&up->port); } if (up->port.uartclk == SERIAL_RSA_BAUD_BASE * 16) serial_out(up, UART_RSA_FRR, 0); } } /* * Attempts to turn off the RSA FIFO. Returns zero on failure. * It is unknown why interrupts were disabled in here. However, * the caller is expected to preserve this behaviour by grabbing * the spinlock before calling this function. */ static void disable_rsa(struct uart_8250_port *up) { unsigned char mode; int result; if (up->port.type == PORT_RSA && up->port.uartclk == SERIAL_RSA_BAUD_BASE * 16) { uart_port_lock_irq(&up->port); mode = serial_in(up, UART_RSA_MSR); result = !(mode & UART_RSA_MSR_FIFO); if (!result) { serial_out(up, UART_RSA_MSR, mode & ~UART_RSA_MSR_FIFO); mode = serial_in(up, UART_RSA_MSR); result = !(mode & UART_RSA_MSR_FIFO); } if (result) up->port.uartclk = SERIAL_RSA_BAUD_BASE_LO * 16; uart_port_unlock_irq(&up->port); } } #endif /* CONFIG_SERIAL_8250_RSA */ /* * This is a quickie test to see how big the FIFO is. * It doesn't work at all the time, more's the pity. */ static int size_fifo(struct uart_8250_port *up) { unsigned char old_fcr, old_mcr, old_lcr; u32 old_dl; int count; old_lcr = serial_in(up, UART_LCR); serial_out(up, UART_LCR, 0); old_fcr = serial_in(up, UART_FCR); old_mcr = serial8250_in_MCR(up); serial_out(up, UART_FCR, UART_FCR_ENABLE_FIFO | UART_FCR_CLEAR_RCVR | UART_FCR_CLEAR_XMIT); serial8250_out_MCR(up, UART_MCR_LOOP); serial_out(up, UART_LCR, UART_LCR_CONF_MODE_A); old_dl = serial_dl_read(up); serial_dl_write(up, 0x0001); serial_out(up, UART_LCR, UART_LCR_WLEN8); for (count = 0; count < 256; count++) serial_out(up, UART_TX, count); mdelay(20);/* FIXME - schedule_timeout */ for (count = 0; (serial_in(up, UART_LSR) & UART_LSR_DR) && (count < 256); count++) serial_in(up, UART_RX); serial_out(up, UART_FCR, old_fcr); serial8250_out_MCR(up, old_mcr); serial_out(up, UART_LCR, UART_LCR_CONF_MODE_A); serial_dl_write(up, old_dl); serial_out(up, UART_LCR, old_lcr); return count; } /* * Read UART ID using the divisor method - set DLL and DLM to zero * and the revision will be in DLL and device type in DLM. We * preserve the device state across this. */ static unsigned int autoconfig_read_divisor_id(struct uart_8250_port *p) { unsigned char old_lcr; unsigned int id, old_dl; old_lcr = serial_in(p, UART_LCR); serial_out(p, UART_LCR, UART_LCR_CONF_MODE_A); old_dl = serial_dl_read(p); serial_dl_write(p, 0); id = serial_dl_read(p); serial_dl_write(p, old_dl); serial_out(p, UART_LCR, old_lcr); return id; } /* * This is a helper routine to autodetect StarTech/Exar/Oxsemi UART's. * When this function is called we know it is at least a StarTech * 16650 V2, but it might be one of several StarTech UARTs, or one of * its clones. (We treat the broken original StarTech 16650 V1 as a * 16550, and why not? Startech doesn't seem to even acknowledge its * existence.) * * What evil have men's minds wrought... */ static void autoconfig_has_efr(struct uart_8250_port *up) { unsigned int id1, id2, id3, rev; /* * Everything with an EFR has SLEEP */ up->capabilities |= UART_CAP_EFR | UART_CAP_SLEEP; /* * First we check to see if it's an Oxford Semiconductor UART. * * If we have to do this here because some non-National * Semiconductor clone chips lock up if you try writing to the * LSR register (which serial_icr_read does) */ /* * Check for Oxford Semiconductor 16C950. * * EFR [4] must be set else this test fails. * * This shouldn't be necessary, but Mike Hudson (Exoray@isys.ca) * claims that it's needed for 952 dual UART's (which are not * recommended for new designs). */ up->acr = 0; serial_out(up, UART_LCR, UART_LCR_CONF_MODE_B); serial_out(up, UART_EFR, UART_EFR_ECB); serial_out(up, UART_LCR, 0x00); id1 = serial_icr_read(up, UART_ID1); id2 = serial_icr_read(up, UART_ID2); id3 = serial_icr_read(up, UART_ID3); rev = serial_icr_read(up, UART_REV); DEBUG_AUTOCONF("950id=%02x:%02x:%02x:%02x ", id1, id2, id3, rev); if (id1 == 0x16 && id2 == 0xC9 && (id3 == 0x50 || id3 == 0x52 || id3 == 0x54)) { up->port.type = PORT_16C950; /* * Enable work around for the Oxford Semiconductor 952 rev B * chip which causes it to seriously miscalculate baud rates * when DLL is 0. */ if (id3 == 0x52 && rev == 0x01) up->bugs |= UART_BUG_QUOT; return; } /* * We check for a XR16C850 by setting DLL and DLM to 0, and then * reading back DLL and DLM. The chip type depends on the DLM * value read back: * 0x10 - XR16C850 and the DLL contains the chip revision. * 0x12 - XR16C2850. * 0x14 - XR16C854. */ id1 = autoconfig_read_divisor_id(up); DEBUG_AUTOCONF("850id=%04x ", id1); id2 = id1 >> 8; if (id2 == 0x10 || id2 == 0x12 || id2 == 0x14) { up->port.type = PORT_16850; return; } /* * It wasn't an XR16C850. * * We distinguish between the '654 and the '650 by counting * how many bytes are in the FIFO. I'm using this for now, * since that's the technique that was sent to me in the * serial driver update, but I'm not convinced this works. * I've had problems doing this in the past. -TYT */ if (size_fifo(up) == 64) up->port.type = PORT_16654; else up->port.type = PORT_16650V2; } /* * We detected a chip without a FIFO. Only two fall into * this category - the original 8250 and the 16450. The * 16450 has a scratch register (accessible with LCR=0) */ static void autoconfig_8250(struct uart_8250_port *up) { unsigned char scratch, status1, status2; up->port.type = PORT_8250; scratch = serial_in(up, UART_SCR); serial_out(up, UART_SCR, 0xa5); status1 = serial_in(up, UART_SCR); serial_out(up, UART_SCR, 0x5a); status2 = serial_in(up, UART_SCR); serial_out(up, UART_SCR, scratch); if (status1 == 0xa5 && status2 == 0x5a) up->port.type = PORT_16450; } static int broken_efr(struct uart_8250_port *up) { /* * Exar ST16C2550 "A2" devices incorrectly detect as * having an EFR, and report an ID of 0x0201. See * http://linux.derkeiler.com/Mailing-Lists/Kernel/2004-11/4812.html */ if (autoconfig_read_divisor_id(up) == 0x0201 && size_fifo(up) == 16) return 1; return 0; } /* * We know that the chip has FIFOs. Does it have an EFR? The * EFR is located in the same register position as the IIR and * we know the top two bits of the IIR are currently set. The * EFR should contain zero. Try to read the EFR. */ static void autoconfig_16550a(struct uart_8250_port *up) { unsigned char status1, status2; unsigned int iersave; /* Port locked to synchronize UART_IER access against the console. */ lockdep_assert_held_once(&up->port.lock); up->port.type = PORT_16550A; up->capabilities |= UART_CAP_FIFO; if (!IS_ENABLED(CONFIG_SERIAL_8250_16550A_VARIANTS) && !(up->port.flags & UPF_FULL_PROBE)) return; /* * Check for presence of the EFR when DLAB is set. * Only ST16C650V1 UARTs pass this test. */ serial_out(up, UART_LCR, UART_LCR_CONF_MODE_A); if (serial_in(up, UART_EFR) == 0) { serial_out(up, UART_EFR, 0xA8); if (serial_in(up, UART_EFR) != 0) { DEBUG_AUTOCONF("EFRv1 "); up->port.type = PORT_16650; up->capabilities |= UART_CAP_EFR | UART_CAP_SLEEP; } else { serial_out(up, UART_LCR, 0); serial_out(up, UART_FCR, UART_FCR_ENABLE_FIFO | UART_FCR7_64BYTE); status1 = serial_in(up, UART_IIR) & UART_IIR_FIFO_ENABLED_16750; serial_out(up, UART_FCR, 0); serial_out(up, UART_LCR, 0); if (status1 == UART_IIR_FIFO_ENABLED_16750) up->port.type = PORT_16550A_FSL64; else DEBUG_AUTOCONF("Motorola 8xxx DUART "); } serial_out(up, UART_EFR, 0); return; } /* * Maybe it requires 0xbf to be written to the LCR. * (other ST16C650V2 UARTs, TI16C752A, etc) */ serial_out(up, UART_LCR, UART_LCR_CONF_MODE_B); if (serial_in(up, UART_EFR) == 0 && !broken_efr(up)) { DEBUG_AUTOCONF("EFRv2 "); autoconfig_has_efr(up); return; } /* * Check for a National Semiconductor SuperIO chip. * Attempt to switch to bank 2, read the value of the LOOP bit * from EXCR1. Switch back to bank 0, change it in MCR. Then * switch back to bank 2, read it from EXCR1 again and check * it's changed. If so, set baud_base in EXCR2 to 921600. -- dwmw2 */ serial_out(up, UART_LCR, 0); status1 = serial8250_in_MCR(up); serial_out(up, UART_LCR, 0xE0); status2 = serial_in(up, 0x02); /* EXCR1 */ if (!((status2 ^ status1) & UART_MCR_LOOP)) { serial_out(up, UART_LCR, 0); serial8250_out_MCR(up, status1 ^ UART_MCR_LOOP); serial_out(up, UART_LCR, 0xE0); status2 = serial_in(up, 0x02); /* EXCR1 */ serial_out(up, UART_LCR, 0); serial8250_out_MCR(up, status1); if ((status2 ^ status1) & UART_MCR_LOOP) { unsigned short quot; serial_out(up, UART_LCR, 0xE0); quot = serial_dl_read(up); quot <<= 3; if (ns16550a_goto_highspeed(up)) serial_dl_write(up, quot); serial_out(up, UART_LCR, 0); up->port.uartclk = 921600*16; up->port.type = PORT_NS16550A; up->capabilities |= UART_NATSEMI; return; } } /* * No EFR. Try to detect a TI16750, which only sets bit 5 of * the IIR when 64 byte FIFO mode is enabled when DLAB is set. * Try setting it with and without DLAB set. Cheap clones * set bit 5 without DLAB set. */ serial_out(up, UART_LCR, 0); serial_out(up, UART_FCR, UART_FCR_ENABLE_FIFO | UART_FCR7_64BYTE); status1 = serial_in(up, UART_IIR) & UART_IIR_FIFO_ENABLED_16750; serial_out(up, UART_FCR, UART_FCR_ENABLE_FIFO); serial_out(up, UART_LCR, UART_LCR_CONF_MODE_A); serial_out(up, UART_FCR, UART_FCR_ENABLE_FIFO | UART_FCR7_64BYTE); status2 = serial_in(up, UART_IIR) & UART_IIR_FIFO_ENABLED_16750; serial_out(up, UART_FCR, UART_FCR_ENABLE_FIFO); serial_out(up, UART_LCR, 0); DEBUG_AUTOCONF("iir1=%d iir2=%d ", status1, status2); if (status1 == UART_IIR_FIFO_ENABLED_16550A && status2 == UART_IIR_FIFO_ENABLED_16750) { up->port.type = PORT_16750; up->capabilities |= UART_CAP_AFE | UART_CAP_SLEEP; return; } /* * Try writing and reading the UART_IER_UUE bit (b6). * If it works, this is probably one of the Xscale platform's * internal UARTs. * We're going to explicitly set the UUE bit to 0 before * trying to write and read a 1 just to make sure it's not * already a 1 and maybe locked there before we even start. */ iersave = serial_in(up, UART_IER); serial_out(up, UART_IER, iersave & ~UART_IER_UUE); if (!(serial_in(up, UART_IER) & UART_IER_UUE)) { /* * OK it's in a known zero state, try writing and reading * without disturbing the current state of the other bits. */ serial_out(up, UART_IER, iersave | UART_IER_UUE); if (serial_in(up, UART_IER) & UART_IER_UUE) { /* * It's an Xscale. * We'll leave the UART_IER_UUE bit set to 1 (enabled). */ DEBUG_AUTOCONF("Xscale "); up->port.type = PORT_XSCALE; up->capabilities |= UART_CAP_UUE | UART_CAP_RTOIE; return; } } else { /* * If we got here we couldn't force the IER_UUE bit to 0. * Log it and continue. */ DEBUG_AUTOCONF("Couldn't force IER_UUE to 0 "); } serial_out(up, UART_IER, iersave); /* * We distinguish between 16550A and U6 16550A by counting * how many bytes are in the FIFO. */ if (up->port.type == PORT_16550A && size_fifo(up) == 64) { up->port.type = PORT_U6_16550A; up->capabilities |= UART_CAP_AFE; } } /* * This routine is called by rs_init() to initialize a specific serial * port. It determines what type of UART chip this serial port is * using: 8250, 16450, 16550, 16550A. The important question is * whether or not this UART is a 16550A or not, since this will * determine whether or not we can use its FIFO features or not. */ static void autoconfig(struct uart_8250_port *up) { unsigned char status1, scratch, scratch2, scratch3; unsigned char save_lcr, save_mcr; struct uart_port *port = &up->port; unsigned long flags; unsigned int old_capabilities; if (!port->iobase && !port->mapbase && !port->membase) return; DEBUG_AUTOCONF("%s: autoconf (0x%04lx, 0x%p): ", port->name, port->iobase, port->membase); /* * We really do need global IRQs disabled here - we're going to * be frobbing the chips IRQ enable register to see if it exists. * * Synchronize UART_IER access against the console. */ uart_port_lock_irqsave(port, &flags); up->capabilities = 0; up->bugs = 0; if (!(port->flags & UPF_BUGGY_UART)) { /* * Do a simple existence test first; if we fail this, * there's no point trying anything else. * * 0x80 is used as a nonsense port to prevent against * false positives due to ISA bus float. The * assumption is that 0x80 is a non-existent port; * which should be safe since include/asm/io.h also * makes this assumption. * * Note: this is safe as long as MCR bit 4 is clear * and the device is in "PC" mode. */ scratch = serial_in(up, UART_IER); serial_out(up, UART_IER, 0); #ifdef __i386__ outb(0xff, 0x080); #endif /* * Mask out IER[7:4] bits for test as some UARTs (e.g. TL * 16C754B) allow only to modify them if an EFR bit is set. */ scratch2 = serial_in(up, UART_IER) & UART_IER_ALL_INTR; serial_out(up, UART_IER, UART_IER_ALL_INTR); #ifdef __i386__ outb(0, 0x080); #endif scratch3 = serial_in(up, UART_IER) & UART_IER_ALL_INTR; serial_out(up, UART_IER, scratch); if (scratch2 != 0 || scratch3 != UART_IER_ALL_INTR) { /* * We failed; there's nothing here */ uart_port_unlock_irqrestore(port, flags); DEBUG_AUTOCONF("IER test failed (%02x, %02x) ", scratch2, scratch3); goto out; } } save_mcr = serial8250_in_MCR(up); save_lcr = serial_in(up, UART_LCR); /* * Check to see if a UART is really there. Certain broken * internal modems based on the Rockwell chipset fail this * test, because they apparently don't implement the loopback * test mode. So this test is skipped on the COM 1 through * COM 4 ports. This *should* be safe, since no board * manufacturer would be stupid enough to design a board * that conflicts with COM 1-4 --- we hope! */ if (!(port->flags & UPF_SKIP_TEST)) { serial8250_out_MCR(up, UART_MCR_LOOP | UART_MCR_OUT2 | UART_MCR_RTS); status1 = serial_in(up, UART_MSR) & UART_MSR_STATUS_BITS; serial8250_out_MCR(up, save_mcr); if (status1 != (UART_MSR_DCD | UART_MSR_CTS)) { uart_port_unlock_irqrestore(port, flags); DEBUG_AUTOCONF("LOOP test failed (%02x) ", status1); goto out; } } /* * We're pretty sure there's a port here. Lets find out what * type of port it is. The IIR top two bits allows us to find * out if it's 8250 or 16450, 16550, 16550A or later. This * determines what we test for next. * * We also initialise the EFR (if any) to zero for later. The * EFR occupies the same register location as the FCR and IIR. */ serial_out(up, UART_LCR, UART_LCR_CONF_MODE_B); serial_out(up, UART_EFR, 0); serial_out(up, UART_LCR, 0); serial_out(up, UART_FCR, UART_FCR_ENABLE_FIFO); switch (serial_in(up, UART_IIR) & UART_IIR_FIFO_ENABLED) { case UART_IIR_FIFO_ENABLED_8250: autoconfig_8250(up); break; case UART_IIR_FIFO_ENABLED_16550: port->type = PORT_16550; break; case UART_IIR_FIFO_ENABLED_16550A: autoconfig_16550a(up); break; default: port->type = PORT_UNKNOWN; break; } #ifdef CONFIG_SERIAL_8250_RSA /* * Only probe for RSA ports if we got the region. */ if (port->type == PORT_16550A && up->probe & UART_PROBE_RSA && __enable_rsa(up)) port->type = PORT_RSA; #endif serial_out(up, UART_LCR, save_lcr); port->fifosize = uart_config[up->port.type].fifo_size; old_capabilities = up->capabilities; up->capabilities = uart_config[port->type].flags; up->tx_loadsz = uart_config[port->type].tx_loadsz; if (port->type == PORT_UNKNOWN) goto out_unlock; /* * Reset the UART. */ #ifdef CONFIG_SERIAL_8250_RSA if (port->type == PORT_RSA) serial_out(up, UART_RSA_FRR, 0); #endif serial8250_out_MCR(up, save_mcr); serial8250_clear_fifos(up); serial_in(up, UART_RX); serial8250_clear_IER(up); out_unlock: uart_port_unlock_irqrestore(port, flags); /* * Check if the device is a Fintek F81216A */ if (port->type == PORT_16550A && port->iotype == UPIO_PORT) fintek_8250_probe(up); if (up->capabilities != old_capabilities) { dev_warn(port->dev, "detected caps %08x should be %08x\n", old_capabilities, up->capabilities); } out: DEBUG_AUTOCONF("iir=%d ", scratch); DEBUG_AUTOCONF("type=%s\n", uart_config[port->type].name); } static void autoconfig_irq(struct uart_8250_port *up) { struct uart_port *port = &up->port; unsigned char save_mcr, save_ier; unsigned char save_ICP = 0; unsigned int ICP = 0; unsigned long irqs; int irq; if (port->flags & UPF_FOURPORT) { ICP = (port->iobase & 0xfe0) | 0x1f; save_ICP = inb_p(ICP); outb_p(0x80, ICP); inb_p(ICP); } if (uart_console(port)) console_lock(); /* forget possible initially masked and pending IRQ */ probe_irq_off(probe_irq_on()); save_mcr = serial8250_in_MCR(up); /* Synchronize UART_IER access against the console. */ uart_port_lock_irq(port); save_ier = serial_in(up, UART_IER); uart_port_unlock_irq(port); serial8250_out_MCR(up, UART_MCR_OUT1 | UART_MCR_OUT2); irqs = probe_irq_on(); serial8250_out_MCR(up, 0); udelay(10); if (port->flags & UPF_FOURPORT) { serial8250_out_MCR(up, UART_MCR_DTR | UART_MCR_RTS); } else { serial8250_out_MCR(up, UART_MCR_DTR | UART_MCR_RTS | UART_MCR_OUT2); } /* Synchronize UART_IER access against the console. */ uart_port_lock_irq(port); serial_out(up, UART_IER, UART_IER_ALL_INTR); uart_port_unlock_irq(port); serial_in(up, UART_LSR); serial_in(up, UART_RX); serial_in(up, UART_IIR); serial_in(up, UART_MSR); serial_out(up, UART_TX, 0xFF); udelay(20); irq = probe_irq_off(irqs); serial8250_out_MCR(up, save_mcr); /* Synchronize UART_IER access against the console. */ uart_port_lock_irq(port); serial_out(up, UART_IER, save_ier); uart_port_unlock_irq(port); if (port->flags & UPF_FOURPORT) outb_p(save_ICP, ICP); if (uart_console(port)) console_unlock(); port->irq = (irq > 0) ? irq : 0; } static void serial8250_stop_rx(struct uart_port *port) { struct uart_8250_port *up = up_to_u8250p(port); /* Port locked to synchronize UART_IER access against the console. */ lockdep_assert_held_once(&port->lock); serial8250_rpm_get(up); up->ier &= ~(UART_IER_RLSI | UART_IER_RDI); up->port.read_status_mask &= ~UART_LSR_DR; serial_port_out(port, UART_IER, up->ier); serial8250_rpm_put(up); } /** * serial8250_em485_stop_tx() - generic ->rs485_stop_tx() callback * @p: uart 8250 port * * Generic callback usable by 8250 uart drivers to stop rs485 transmission. */ void serial8250_em485_stop_tx(struct uart_8250_port *p) { unsigned char mcr = serial8250_in_MCR(p); /* Port locked to synchronize UART_IER access against the console. */ lockdep_assert_held_once(&p->port.lock); if (p->port.rs485.flags & SER_RS485_RTS_AFTER_SEND) mcr |= UART_MCR_RTS; else mcr &= ~UART_MCR_RTS; serial8250_out_MCR(p, mcr); /* * Empty the RX FIFO, we are not interested in anything * received during the half-duplex transmission. * Enable previously disabled RX interrupts. */ if (!(p->port.rs485.flags & SER_RS485_RX_DURING_TX)) { serial8250_clear_and_reinit_fifos(p); p->ier |= UART_IER_RLSI | UART_IER_RDI; serial_port_out(&p->port, UART_IER, p->ier); } } EXPORT_SYMBOL_GPL(serial8250_em485_stop_tx); static enum hrtimer_restart serial8250_em485_handle_stop_tx(struct hrtimer *t) { struct uart_8250_em485 *em485 = container_of(t, struct uart_8250_em485, stop_tx_timer); struct uart_8250_port *p = em485->port; unsigned long flags; serial8250_rpm_get(p); uart_port_lock_irqsave(&p->port, &flags); if (em485->active_timer == &em485->stop_tx_timer) { p->rs485_stop_tx(p); em485->active_timer = NULL; em485->tx_stopped = true; } uart_port_unlock_irqrestore(&p->port, flags); serial8250_rpm_put(p); return HRTIMER_NORESTART; } static void start_hrtimer_ms(struct hrtimer *hrt, unsigned long msec) { hrtimer_start(hrt, ms_to_ktime(msec), HRTIMER_MODE_REL); } static void __stop_tx_rs485(struct uart_8250_port *p, u64 stop_delay) { struct uart_8250_em485 *em485 = p->em485; /* Port locked to synchronize UART_IER access against the console. */ lockdep_assert_held_once(&p->port.lock); stop_delay += (u64)p->port.rs485.delay_rts_after_send * NSEC_PER_MSEC; /* * rs485_stop_tx() is going to set RTS according to config * AND flush RX FIFO if required. */ if (stop_delay > 0) { em485->active_timer = &em485->stop_tx_timer; hrtimer_start(&em485->stop_tx_timer, ns_to_ktime(stop_delay), HRTIMER_MODE_REL); } else { p->rs485_stop_tx(p); em485->active_timer = NULL; em485->tx_stopped = true; } } static inline void __stop_tx(struct uart_8250_port *p) { struct uart_8250_em485 *em485 = p->em485; if (em485) { u16 lsr = serial_lsr_in(p); u64 stop_delay = 0; if (!(lsr & UART_LSR_THRE)) return; /* * To provide required timing and allow FIFO transfer, * __stop_tx_rs485() must be called only when both FIFO and * shift register are empty. The device driver should either * enable interrupt on TEMT or set UART_CAP_NOTEMT that will * enlarge stop_tx_timer by the tx time of one frame to cover * for emptying of the shift register. */ if (!(lsr & UART_LSR_TEMT)) { if (!(p->capabilities & UART_CAP_NOTEMT)) return; /* * RTS might get deasserted too early with the normal * frame timing formula. It seems to suggest THRE might * get asserted already during tx of the stop bit * rather than after it is fully sent. * Roughly estimate 1 extra bit here with / 7. */ stop_delay = p->port.frame_time + DIV_ROUND_UP(p->port.frame_time, 7); } __stop_tx_rs485(p, stop_delay); } if (serial8250_clear_THRI(p)) serial8250_rpm_put_tx(p); } static void serial8250_stop_tx(struct uart_port *port) { struct uart_8250_port *up = up_to_u8250p(port); serial8250_rpm_get(up); __stop_tx(up); /* * We really want to stop the transmitter from sending. */ if (port->type == PORT_16C950) { up->acr |= UART_ACR_TXDIS; serial_icr_write(up, UART_ACR, up->acr); } serial8250_rpm_put(up); } static inline void __start_tx(struct uart_port *port) { struct uart_8250_port *up = up_to_u8250p(port); if (up->dma && !up->dma->tx_dma(up)) return; if (serial8250_set_THRI(up)) { if (up->bugs & UART_BUG_TXEN) { u16 lsr = serial_lsr_in(up); if (lsr & UART_LSR_THRE) serial8250_tx_chars(up); } } /* * Re-enable the transmitter if we disabled it. */ if (port->type == PORT_16C950 && up->acr & UART_ACR_TXDIS) { up->acr &= ~UART_ACR_TXDIS; serial_icr_write(up, UART_ACR, up->acr); } } /** * serial8250_em485_start_tx() - generic ->rs485_start_tx() callback * @up: uart 8250 port * * Generic callback usable by 8250 uart drivers to start rs485 transmission. * Assumes that setting the RTS bit in the MCR register means RTS is high. * (Some chips use inverse semantics.) Further assumes that reception is * stoppable by disabling the UART_IER_RDI interrupt. (Some chips set the * UART_LSR_DR bit even when UART_IER_RDI is disabled, foiling this approach.) */ void serial8250_em485_start_tx(struct uart_8250_port *up) { unsigned char mcr = serial8250_in_MCR(up); if (!(up->port.rs485.flags & SER_RS485_RX_DURING_TX)) serial8250_stop_rx(&up->port); if (up->port.rs485.flags & SER_RS485_RTS_ON_SEND) mcr |= UART_MCR_RTS; else mcr &= ~UART_MCR_RTS; serial8250_out_MCR(up, mcr); } EXPORT_SYMBOL_GPL(serial8250_em485_start_tx); /* Returns false, if start_tx_timer was setup to defer TX start */ static bool start_tx_rs485(struct uart_port *port) { struct uart_8250_port *up = up_to_u8250p(port); struct uart_8250_em485 *em485 = up->em485; /* * While serial8250_em485_handle_stop_tx() is a noop if * em485->active_timer != &em485->stop_tx_timer, it might happen that * the timer is still armed and triggers only after the current bunch of * chars is send and em485->active_timer == &em485->stop_tx_timer again. * So cancel the timer. There is still a theoretical race condition if * the timer is already running and only comes around to check for * em485->active_timer when &em485->stop_tx_timer is armed again. */ if (em485->active_timer == &em485->stop_tx_timer) hrtimer_try_to_cancel(&em485->stop_tx_timer); em485->active_timer = NULL; if (em485->tx_stopped) { em485->tx_stopped = false; up->rs485_start_tx(up); if (up->port.rs485.delay_rts_before_send > 0) { em485->active_timer = &em485->start_tx_timer; start_hrtimer_ms(&em485->start_tx_timer, up->port.rs485.delay_rts_before_send); return false; } } return true; } static enum hrtimer_restart serial8250_em485_handle_start_tx(struct hrtimer *t) { struct uart_8250_em485 *em485 = container_of(t, struct uart_8250_em485, start_tx_timer); struct uart_8250_port *p = em485->port; unsigned long flags; uart_port_lock_irqsave(&p->port, &flags); if (em485->active_timer == &em485->start_tx_timer) { __start_tx(&p->port); em485->active_timer = NULL; } uart_port_unlock_irqrestore(&p->port, flags); return HRTIMER_NORESTART; } static void serial8250_start_tx(struct uart_port *port) { struct uart_8250_port *up = up_to_u8250p(port); struct uart_8250_em485 *em485 = up->em485; /* Port locked to synchronize UART_IER access against the console. */ lockdep_assert_held_once(&port->lock); if (!port->x_char && uart_circ_empty(&port->state->xmit)) return; serial8250_rpm_get_tx(up); if (em485) { if ((em485->active_timer == &em485->start_tx_timer) || !start_tx_rs485(port)) return; } __start_tx(port); } static void serial8250_throttle(struct uart_port *port) { port->throttle(port); } static void serial8250_unthrottle(struct uart_port *port) { port->unthrottle(port); } static void serial8250_disable_ms(struct uart_port *port) { struct uart_8250_port *up = up_to_u8250p(port); /* Port locked to synchronize UART_IER access against the console. */ lockdep_assert_held_once(&port->lock); /* no MSR capabilities */ if (up->bugs & UART_BUG_NOMSR) return; mctrl_gpio_disable_ms(up->gpios); up->ier &= ~UART_IER_MSI; serial_port_out(port, UART_IER, up->ier); } static void serial8250_enable_ms(struct uart_port *port) { struct uart_8250_port *up = up_to_u8250p(port); /* Port locked to synchronize UART_IER access against the console. */ lockdep_assert_held_once(&port->lock); /* no MSR capabilities */ if (up->bugs & UART_BUG_NOMSR) return; mctrl_gpio_enable_ms(up->gpios); up->ier |= UART_IER_MSI; serial8250_rpm_get(up); serial_port_out(port, UART_IER, up->ier); serial8250_rpm_put(up); } void serial8250_read_char(struct uart_8250_port *up, u16 lsr) { struct uart_port *port = &up->port; u8 ch, flag = TTY_NORMAL; if (likely(lsr & UART_LSR_DR)) ch = serial_in(up, UART_RX); else /* * Intel 82571 has a Serial Over Lan device that will * set UART_LSR_BI without setting UART_LSR_DR when * it receives a break. To avoid reading from the * receive buffer without UART_LSR_DR bit set, we * just force the read character to be 0 */ ch = 0; port->icount.rx++; lsr |= up->lsr_saved_flags; up->lsr_saved_flags = 0; if (unlikely(lsr & UART_LSR_BRK_ERROR_BITS)) { if (lsr & UART_LSR_BI) { lsr &= ~(UART_LSR_FE | UART_LSR_PE); port->icount.brk++; /* * We do the SysRQ and SAK checking * here because otherwise the break * may get masked by ignore_status_mask * or read_status_mask. */ if (uart_handle_break(port)) return; } else if (lsr & UART_LSR_PE) port->icount.parity++; else if (lsr & UART_LSR_FE) port->icount.frame++; if (lsr & UART_LSR_OE) port->icount.overrun++; /* * Mask off conditions which should be ignored. */ lsr &= port->read_status_mask; if (lsr & UART_LSR_BI) { dev_dbg(port->dev, "handling break\n"); flag = TTY_BREAK; } else if (lsr & UART_LSR_PE) flag = TTY_PARITY; else if (lsr & UART_LSR_FE) flag = TTY_FRAME; } if (uart_prepare_sysrq_char(port, ch)) return; uart_insert_char(port, lsr, UART_LSR_OE, ch, flag); } EXPORT_SYMBOL_GPL(serial8250_read_char); /* * serial8250_rx_chars - Read characters. The first LSR value must be passed in. * * Returns LSR bits. The caller should rely only on non-Rx related LSR bits * (such as THRE) because the LSR value might come from an already consumed * character. */ u16 serial8250_rx_chars(struct uart_8250_port *up, u16 lsr) { struct uart_port *port = &up->port; int max_count = 256; do { serial8250_read_char(up, lsr); if (--max_count == 0) break; lsr = serial_in(up, UART_LSR); } while (lsr & (UART_LSR_DR | UART_LSR_BI)); tty_flip_buffer_push(&port->state->port); return lsr; } EXPORT_SYMBOL_GPL(serial8250_rx_chars); void serial8250_tx_chars(struct uart_8250_port *up) { struct uart_port *port = &up->port; struct circ_buf *xmit = &port->state->xmit; int count; if (port->x_char) { uart_xchar_out(port, UART_TX); return; } if (uart_tx_stopped(port)) { serial8250_stop_tx(port); return; } if (uart_circ_empty(xmit)) { __stop_tx(up); return; } count = up->tx_loadsz; do { serial_out(up, UART_TX, xmit->buf[xmit->tail]); if (up->bugs & UART_BUG_TXRACE) { /* * The Aspeed BMC virtual UARTs have a bug where data * may get stuck in the BMC's Tx FIFO from bursts of * writes on the APB interface. * * Delay back-to-back writes by a read cycle to avoid * stalling the VUART. Read a register that won't have * side-effects and discard the result. */ serial_in(up, UART_SCR); } uart_xmit_advance(port, 1); if (uart_circ_empty(xmit)) break; if ((up->capabilities & UART_CAP_HFIFO) && !uart_lsr_tx_empty(serial_in(up, UART_LSR))) break; /* The BCM2835 MINI UART THRE bit is really a not-full bit. */ if ((up->capabilities & UART_CAP_MINI) && !(serial_in(up, UART_LSR) & UART_LSR_THRE)) break; } while (--count > 0); if (uart_circ_chars_pending(xmit) < WAKEUP_CHARS) uart_write_wakeup(port); /* * With RPM enabled, we have to wait until the FIFO is empty before the * HW can go idle. So we get here once again with empty FIFO and disable * the interrupt and RPM in __stop_tx() */ if (uart_circ_empty(xmit) && !(up->capabilities & UART_CAP_RPM)) __stop_tx(up); } EXPORT_SYMBOL_GPL(serial8250_tx_chars); /* Caller holds uart port lock */ unsigned int serial8250_modem_status(struct uart_8250_port *up) { struct uart_port *port = &up->port; unsigned int status = serial_in(up, UART_MSR); status |= up->msr_saved_flags; up->msr_saved_flags = 0; if (status & UART_MSR_ANY_DELTA && up->ier & UART_IER_MSI && port->state != NULL) { if (status & UART_MSR_TERI) port->icount.rng++; if (status & UART_MSR_DDSR) port->icount.dsr++; if (status & UART_MSR_DDCD) uart_handle_dcd_change(port, status & UART_MSR_DCD); if (status & UART_MSR_DCTS) uart_handle_cts_change(port, status & UART_MSR_CTS); wake_up_interruptible(&port->state->port.delta_msr_wait); } return status; } EXPORT_SYMBOL_GPL(serial8250_modem_status); static bool handle_rx_dma(struct uart_8250_port *up, unsigned int iir) { switch (iir & 0x3f) { case UART_IIR_THRI: /* * Postpone DMA or not decision to IIR_RDI or IIR_RX_TIMEOUT * because it's impossible to do an informed decision about * that with IIR_THRI. * * This also fixes one known DMA Rx corruption issue where * DR is asserted but DMA Rx only gets a corrupted zero byte * (too early DR?). */ return false; case UART_IIR_RDI: if (!up->dma->rx_running) break; fallthrough; case UART_IIR_RLSI: case UART_IIR_RX_TIMEOUT: serial8250_rx_dma_flush(up); return true; } return up->dma->rx_dma(up); } /* * This handles the interrupt from one port. */ int serial8250_handle_irq(struct uart_port *port, unsigned int iir) { struct uart_8250_port *up = up_to_u8250p(port); struct tty_port *tport = &port->state->port; bool skip_rx = false; unsigned long flags; u16 status; if (iir & UART_IIR_NO_INT) return 0; uart_port_lock_irqsave(port, &flags); status = serial_lsr_in(up); /* * If port is stopped and there are no error conditions in the * FIFO, then don't drain the FIFO, as this may lead to TTY buffer * overflow. Not servicing, RX FIFO would trigger auto HW flow * control when FIFO occupancy reaches preset threshold, thus * halting RX. This only works when auto HW flow control is * available. */ if (!(status & (UART_LSR_FIFOE | UART_LSR_BRK_ERROR_BITS)) && (port->status & (UPSTAT_AUTOCTS | UPSTAT_AUTORTS)) && !(port->read_status_mask & UART_LSR_DR)) skip_rx = true; if (status & (UART_LSR_DR | UART_LSR_BI) && !skip_rx) { struct irq_data *d; d = irq_get_irq_data(port->irq); if (d && irqd_is_wakeup_set(d)) pm_wakeup_event(tport->tty->dev, 0); if (!up->dma || handle_rx_dma(up, iir)) status = serial8250_rx_chars(up, status); } serial8250_modem_status(up); if ((status & UART_LSR_THRE) && (up->ier & UART_IER_THRI)) { if (!up->dma || up->dma->tx_err) serial8250_tx_chars(up); else if (!up->dma->tx_running) __stop_tx(up); } uart_unlock_and_check_sysrq_irqrestore(port, flags); return 1; } EXPORT_SYMBOL_GPL(serial8250_handle_irq); static int serial8250_default_handle_irq(struct uart_port *port) { struct uart_8250_port *up = up_to_u8250p(port); unsigned int iir; int ret; serial8250_rpm_get(up); iir = serial_port_in(port, UART_IIR); ret = serial8250_handle_irq(port, iir); serial8250_rpm_put(up); return ret; } /* * Newer 16550 compatible parts such as the SC16C650 & Altera 16550 Soft IP * have a programmable TX threshold that triggers the THRE interrupt in * the IIR register. In this case, the THRE interrupt indicates the FIFO * has space available. Load it up with tx_loadsz bytes. */ static int serial8250_tx_threshold_handle_irq(struct uart_port *port) { unsigned long flags; unsigned int iir = serial_port_in(port, UART_IIR); /* TX Threshold IRQ triggered so load up FIFO */ if ((iir & UART_IIR_ID) == UART_IIR_THRI) { struct uart_8250_port *up = up_to_u8250p(port); uart_port_lock_irqsave(port, &flags); serial8250_tx_chars(up); uart_port_unlock_irqrestore(port, flags); } iir = serial_port_in(port, UART_IIR); return serial8250_handle_irq(port, iir); } static unsigned int serial8250_tx_empty(struct uart_port *port) { struct uart_8250_port *up = up_to_u8250p(port); unsigned int result = 0; unsigned long flags; serial8250_rpm_get(up); uart_port_lock_irqsave(port, &flags); if (!serial8250_tx_dma_running(up) && uart_lsr_tx_empty(serial_lsr_in(up))) result = TIOCSER_TEMT; uart_port_unlock_irqrestore(port, flags); serial8250_rpm_put(up); return result; } unsigned int serial8250_do_get_mctrl(struct uart_port *port) { struct uart_8250_port *up = up_to_u8250p(port); unsigned int status; unsigned int val; serial8250_rpm_get(up); status = serial8250_modem_status(up); serial8250_rpm_put(up); val = serial8250_MSR_to_TIOCM(status); if (up->gpios) return mctrl_gpio_get(up->gpios, &val); return val; } EXPORT_SYMBOL_GPL(serial8250_do_get_mctrl); static unsigned int serial8250_get_mctrl(struct uart_port *port) { if (port->get_mctrl) return port->get_mctrl(port); return serial8250_do_get_mctrl(port); } void serial8250_do_set_mctrl(struct uart_port *port, unsigned int mctrl) { struct uart_8250_port *up = up_to_u8250p(port); unsigned char mcr; mcr = serial8250_TIOCM_to_MCR(mctrl); mcr |= up->mcr; serial8250_out_MCR(up, mcr); } EXPORT_SYMBOL_GPL(serial8250_do_set_mctrl); static void serial8250_set_mctrl(struct uart_port *port, unsigned int mctrl) { if (port->rs485.flags & SER_RS485_ENABLED) return; if (port->set_mctrl) port->set_mctrl(port, mctrl); else serial8250_do_set_mctrl(port, mctrl); } static void serial8250_break_ctl(struct uart_port *port, int break_state) { struct uart_8250_port *up = up_to_u8250p(port); unsigned long flags; serial8250_rpm_get(up); uart_port_lock_irqsave(port, &flags); if (break_state == -1) up->lcr |= UART_LCR_SBC; else up->lcr &= ~UART_LCR_SBC; serial_port_out(port, UART_LCR, up->lcr); uart_port_unlock_irqrestore(port, flags); serial8250_rpm_put(up); } static void wait_for_lsr(struct uart_8250_port *up, int bits) { unsigned int status, tmout = 10000; /* Wait up to 10ms for the character(s) to be sent. */ for (;;) { status = serial_lsr_in(up); if ((status & bits) == bits) break; if (--tmout == 0) break; udelay(1); touch_nmi_watchdog(); } } /* * Wait for transmitter & holding register to empty */ static void wait_for_xmitr(struct uart_8250_port *up, int bits) { unsigned int tmout; wait_for_lsr(up, bits); /* Wait up to 1s for flow control if necessary */ if (up->port.flags & UPF_CONS_FLOW) { for (tmout = 1000000; tmout; tmout--) { unsigned int msr = serial_in(up, UART_MSR); up->msr_saved_flags |= msr & MSR_SAVE_FLAGS; if (msr & UART_MSR_CTS) break; udelay(1); touch_nmi_watchdog(); } } } #ifdef CONFIG_CONSOLE_POLL /* * Console polling routines for writing and reading from the uart while * in an interrupt or debug context. */ static int serial8250_get_poll_char(struct uart_port *port) { struct uart_8250_port *up = up_to_u8250p(port); int status; u16 lsr; serial8250_rpm_get(up); lsr = serial_port_in(port, UART_LSR); if (!(lsr & UART_LSR_DR)) { status = NO_POLL_CHAR; goto out; } status = serial_port_in(port, UART_RX); out: serial8250_rpm_put(up); return status; } static void serial8250_put_poll_char(struct uart_port *port, unsigned char c) { unsigned int ier; struct uart_8250_port *up = up_to_u8250p(port); /* * Normally the port is locked to synchronize UART_IER access * against the console. However, this function is only used by * KDB/KGDB, where it may not be possible to acquire the port * lock because all other CPUs are quiesced. The quiescence * should allow safe lockless usage here. */ serial8250_rpm_get(up); /* * First save the IER then disable the interrupts */ ier = serial_port_in(port, UART_IER); serial8250_clear_IER(up); wait_for_xmitr(up, UART_LSR_BOTH_EMPTY); /* * Send the character out. */ serial_port_out(port, UART_TX, c); /* * Finally, wait for transmitter to become empty * and restore the IER */ wait_for_xmitr(up, UART_LSR_BOTH_EMPTY); serial_port_out(port, UART_IER, ier); serial8250_rpm_put(up); } #endif /* CONFIG_CONSOLE_POLL */ int serial8250_do_startup(struct uart_port *port) { struct uart_8250_port *up = up_to_u8250p(port); unsigned long flags; unsigned char iir; int retval; u16 lsr; if (!port->fifosize) port->fifosize = uart_config[port->type].fifo_size; if (!up->tx_loadsz) up->tx_loadsz = uart_config[port->type].tx_loadsz; if (!up->capabilities) up->capabilities = uart_config[port->type].flags; up->mcr = 0; if (port->iotype != up->cur_iotype) set_io_from_upio(port); serial8250_rpm_get(up); if (port->type == PORT_16C950) { /* * Wake up and initialize UART * * Synchronize UART_IER access against the console. */ uart_port_lock_irqsave(port, &flags); up->acr = 0; serial_port_out(port, UART_LCR, UART_LCR_CONF_MODE_B); serial_port_out(port, UART_EFR, UART_EFR_ECB); serial_port_out(port, UART_IER, 0); serial_port_out(port, UART_LCR, 0); serial_icr_write(up, UART_CSR, 0); /* Reset the UART */ serial_port_out(port, UART_LCR, UART_LCR_CONF_MODE_B); serial_port_out(port, UART_EFR, UART_EFR_ECB); serial_port_out(port, UART_LCR, 0); uart_port_unlock_irqrestore(port, flags); } if (port->type == PORT_DA830) { /* * Reset the port * * Synchronize UART_IER access against the console. */ uart_port_lock_irqsave(port, &flags); serial_port_out(port, UART_IER, 0); serial_port_out(port, UART_DA830_PWREMU_MGMT, 0); uart_port_unlock_irqrestore(port, flags); mdelay(10); /* Enable Tx, Rx and free run mode */ serial_port_out(port, UART_DA830_PWREMU_MGMT, UART_DA830_PWREMU_MGMT_UTRST | UART_DA830_PWREMU_MGMT_URRST | UART_DA830_PWREMU_MGMT_FREE); } if (port->type == PORT_NPCM) { /* * Nuvoton calls the scratch register 'UART_TOR' (timeout * register). Enable it, and set TIOC (timeout interrupt * comparator) to be 0x20 for correct operation. */ serial_port_out(port, UART_NPCM_TOR, UART_NPCM_TOIE | 0x20); } #ifdef CONFIG_SERIAL_8250_RSA /* * If this is an RSA port, see if we can kick it up to the * higher speed clock. */ enable_rsa(up); #endif /* * Clear the FIFO buffers and disable them. * (they will be reenabled in set_termios()) */ serial8250_clear_fifos(up); /* * Clear the interrupt registers. */ serial_port_in(port, UART_LSR); serial_port_in(port, UART_RX); serial_port_in(port, UART_IIR); serial_port_in(port, UART_MSR); /* * At this point, there's no way the LSR could still be 0xff; * if it is, then bail out, because there's likely no UART * here. */ if (!(port->flags & UPF_BUGGY_UART) && (serial_port_in(port, UART_LSR) == 0xff)) { dev_info_ratelimited(port->dev, "LSR safety check engaged!\n"); retval = -ENODEV; goto out; } /* * For a XR16C850, we need to set the trigger levels */ if (port->type == PORT_16850) { unsigned char fctr; serial_out(up, UART_LCR, UART_LCR_CONF_MODE_B); fctr = serial_in(up, UART_FCTR) & ~(UART_FCTR_RX|UART_FCTR_TX); serial_port_out(port, UART_FCTR, fctr | UART_FCTR_TRGD | UART_FCTR_RX); serial_port_out(port, UART_TRG, UART_TRG_96); serial_port_out(port, UART_FCTR, fctr | UART_FCTR_TRGD | UART_FCTR_TX); serial_port_out(port, UART_TRG, UART_TRG_96); serial_port_out(port, UART_LCR, 0); } /* * For the Altera 16550 variants, set TX threshold trigger level. */ if (((port->type == PORT_ALTR_16550_F32) || (port->type == PORT_ALTR_16550_F64) || (port->type == PORT_ALTR_16550_F128)) && (port->fifosize > 1)) { /* Bounds checking of TX threshold (valid 0 to fifosize-2) */ if ((up->tx_loadsz < 2) || (up->tx_loadsz > port->fifosize)) { dev_err(port->dev, "TX FIFO Threshold errors, skipping\n"); } else { serial_port_out(port, UART_ALTR_AFR, UART_ALTR_EN_TXFIFO_LW); serial_port_out(port, UART_ALTR_TX_LOW, port->fifosize - up->tx_loadsz); port->handle_irq = serial8250_tx_threshold_handle_irq; } } /* Check if we need to have shared IRQs */ if (port->irq && (up->port.flags & UPF_SHARE_IRQ)) up->port.irqflags |= IRQF_SHARED; retval = up->ops->setup_irq(up); if (retval) goto out; if (port->irq && !(up->port.flags & UPF_NO_THRE_TEST)) { unsigned char iir1; if (port->irqflags & IRQF_SHARED) disable_irq_nosync(port->irq); /* * Test for UARTs that do not reassert THRE when the * transmitter is idle and the interrupt has already * been cleared. Real 16550s should always reassert * this interrupt whenever the transmitter is idle and * the interrupt is enabled. Delays are necessary to * allow register changes to become visible. * * Synchronize UART_IER access against the console. */ uart_port_lock_irqsave(port, &flags); wait_for_xmitr(up, UART_LSR_THRE); serial_port_out_sync(port, UART_IER, UART_IER_THRI); udelay(1); /* allow THRE to set */ iir1 = serial_port_in(port, UART_IIR); serial_port_out(port, UART_IER, 0); serial_port_out_sync(port, UART_IER, UART_IER_THRI); udelay(1); /* allow a working UART time to re-assert THRE */ iir = serial_port_in(port, UART_IIR); serial_port_out(port, UART_IER, 0); uart_port_unlock_irqrestore(port, flags); if (port->irqflags & IRQF_SHARED) enable_irq(port->irq); /* * If the interrupt is not reasserted, or we otherwise * don't trust the iir, setup a timer to kick the UART * on a regular basis. */ if ((!(iir1 & UART_IIR_NO_INT) && (iir & UART_IIR_NO_INT)) || up->port.flags & UPF_BUG_THRE) { up->bugs |= UART_BUG_THRE; } } up->ops->setup_timer(up); /* * Now, initialize the UART */ serial_port_out(port, UART_LCR, UART_LCR_WLEN8); uart_port_lock_irqsave(port, &flags); if (up->port.flags & UPF_FOURPORT) { if (!up->port.irq) up->port.mctrl |= TIOCM_OUT1; } else /* * Most PC uarts need OUT2 raised to enable interrupts. */ if (port->irq) up->port.mctrl |= TIOCM_OUT2; serial8250_set_mctrl(port, port->mctrl); /* * Serial over Lan (SoL) hack: * Intel 8257x Gigabit ethernet chips have a 16550 emulation, to be * used for Serial Over Lan. Those chips take a longer time than a * normal serial device to signalize that a transmission data was * queued. Due to that, the above test generally fails. One solution * would be to delay the reading of iir. However, this is not * reliable, since the timeout is variable. So, let's just don't * test if we receive TX irq. This way, we'll never enable * UART_BUG_TXEN. */ if (up->port.quirks & UPQ_NO_TXEN_TEST) goto dont_test_tx_en; /* * Do a quick test to see if we receive an interrupt when we enable * the TX irq. */ serial_port_out(port, UART_IER, UART_IER_THRI); lsr = serial_port_in(port, UART_LSR); iir = serial_port_in(port, UART_IIR); serial_port_out(port, UART_IER, 0); if (lsr & UART_LSR_TEMT && iir & UART_IIR_NO_INT) { if (!(up->bugs & UART_BUG_TXEN)) { up->bugs |= UART_BUG_TXEN; dev_dbg(port->dev, "enabling bad tx status workarounds\n"); } } else { up->bugs &= ~UART_BUG_TXEN; } dont_test_tx_en: uart_port_unlock_irqrestore(port, flags); /* * Clear the interrupt registers again for luck, and clear the * saved flags to avoid getting false values from polling * routines or the previous session. */ serial_port_in(port, UART_LSR); serial_port_in(port, UART_RX); serial_port_in(port, UART_IIR); serial_port_in(port, UART_MSR); up->lsr_saved_flags = 0; up->msr_saved_flags = 0; /* * Request DMA channels for both RX and TX. */ if (up->dma) { const char *msg = NULL; if (uart_console(port)) msg = "forbid DMA for kernel console"; else if (serial8250_request_dma(up)) msg = "failed to request DMA"; if (msg) { dev_warn_ratelimited(port->dev, "%s\n", msg); up->dma = NULL; } } /* * Set the IER shadow for rx interrupts but defer actual interrupt * enable until after the FIFOs are enabled; otherwise, an already- * active sender can swamp the interrupt handler with "too much work". */ up->ier = UART_IER_RLSI | UART_IER_RDI; if (port->flags & UPF_FOURPORT) { unsigned int icp; /* * Enable interrupts on the AST Fourport board */ icp = (port->iobase & 0xfe0) | 0x01f; outb_p(0x80, icp); inb_p(icp); } retval = 0; out: serial8250_rpm_put(up); return retval; } EXPORT_SYMBOL_GPL(serial8250_do_startup); static int serial8250_startup(struct uart_port *port) { if (port->startup) return port->startup(port); return serial8250_do_startup(port); } void serial8250_do_shutdown(struct uart_port *port) { struct uart_8250_port *up = up_to_u8250p(port); unsigned long flags; serial8250_rpm_get(up); /* * Disable interrupts from this port * * Synchronize UART_IER access against the console. */ uart_port_lock_irqsave(port, &flags); up->ier = 0; serial_port_out(port, UART_IER, 0); uart_port_unlock_irqrestore(port, flags); synchronize_irq(port->irq); if (up->dma) serial8250_release_dma(up); uart_port_lock_irqsave(port, &flags); if (port->flags & UPF_FOURPORT) { /* reset interrupts on the AST Fourport board */ inb((port->iobase & 0xfe0) | 0x1f); port->mctrl |= TIOCM_OUT1; } else port->mctrl &= ~TIOCM_OUT2; serial8250_set_mctrl(port, port->mctrl); uart_port_unlock_irqrestore(port, flags); /* * Disable break condition and FIFOs */ serial_port_out(port, UART_LCR, serial_port_in(port, UART_LCR) & ~UART_LCR_SBC); serial8250_clear_fifos(up); #ifdef CONFIG_SERIAL_8250_RSA /* * Reset the RSA board back to 115kbps compat mode. */ disable_rsa(up); #endif /* * Read data port to reset things, and then unlink from * the IRQ chain. */ serial_port_in(port, UART_RX); serial8250_rpm_put(up); up->ops->release_irq(up); } EXPORT_SYMBOL_GPL(serial8250_do_shutdown); static void serial8250_shutdown(struct uart_port *port) { if (port->shutdown) port->shutdown(port); else serial8250_do_shutdown(port); } /* Nuvoton NPCM UARTs have a custom divisor calculation */ static unsigned int npcm_get_divisor(struct uart_8250_port *up, unsigned int baud) { struct uart_port *port = &up->port; return DIV_ROUND_CLOSEST(port->uartclk, 16 * baud + 2) - 2; } static unsigned int serial8250_do_get_divisor(struct uart_port *port, unsigned int baud, unsigned int *frac) { upf_t magic_multiplier = port->flags & UPF_MAGIC_MULTIPLIER; struct uart_8250_port *up = up_to_u8250p(port); unsigned int quot; /* * Handle magic divisors for baud rates above baud_base on SMSC * Super I/O chips. We clamp custom rates from clk/6 and clk/12 * up to clk/4 (0x8001) and clk/8 (0x8002) respectively. These * magic divisors actually reprogram the baud rate generator's * reference clock derived from chips's 14.318MHz clock input. * * Documentation claims that with these magic divisors the base * frequencies of 7.3728MHz and 3.6864MHz are used respectively * for the extra baud rates of 460800bps and 230400bps rather * than the usual base frequency of 1.8462MHz. However empirical * evidence contradicts that. * * Instead bit 7 of the DLM register (bit 15 of the divisor) is * effectively used as a clock prescaler selection bit for the * base frequency of 7.3728MHz, always used. If set to 0, then * the base frequency is divided by 4 for use by the Baud Rate * Generator, for the usual arrangement where the value of 1 of * the divisor produces the baud rate of 115200bps. Conversely, * if set to 1 and high-speed operation has been enabled with the * Serial Port Mode Register in the Device Configuration Space, * then the base frequency is supplied directly to the Baud Rate * Generator, so for the divisor values of 0x8001, 0x8002, 0x8003, * 0x8004, etc. the respective baud rates produced are 460800bps, * 230400bps, 153600bps, 115200bps, etc. * * In all cases only low 15 bits of the divisor are used to divide * the baud base and therefore 32767 is the maximum divisor value * possible, even though documentation says that the programmable * Baud Rate Generator is capable of dividing the internal PLL * clock by any divisor from 1 to 65535. */ if (magic_multiplier && baud >= port->uartclk / 6) quot = 0x8001; else if (magic_multiplier && baud >= port->uartclk / 12) quot = 0x8002; else if (up->port.type == PORT_NPCM) quot = npcm_get_divisor(up, baud); else quot = uart_get_divisor(port, baud); /* * Oxford Semi 952 rev B workaround */ if (up->bugs & UART_BUG_QUOT && (quot & 0xff) == 0) quot++; return quot; } static unsigned int serial8250_get_divisor(struct uart_port *port, unsigned int baud, unsigned int *frac) { if (port->get_divisor) return port->get_divisor(port, baud, frac); return serial8250_do_get_divisor(port, baud, frac); } static unsigned char serial8250_compute_lcr(struct uart_8250_port *up, tcflag_t c_cflag) { unsigned char cval; cval = UART_LCR_WLEN(tty_get_char_size(c_cflag)); if (c_cflag & CSTOPB) cval |= UART_LCR_STOP; if (c_cflag & PARENB) cval |= UART_LCR_PARITY; if (!(c_cflag & PARODD)) cval |= UART_LCR_EPAR; if (c_cflag & CMSPAR) cval |= UART_LCR_SPAR; return cval; } void serial8250_do_set_divisor(struct uart_port *port, unsigned int baud, unsigned int quot, unsigned int quot_frac) { struct uart_8250_port *up = up_to_u8250p(port); /* Workaround to enable 115200 baud on OMAP1510 internal ports */ if (is_omap1510_8250(up)) { if (baud == 115200) { quot = 1; serial_port_out(port, UART_OMAP_OSC_12M_SEL, 1); } else serial_port_out(port, UART_OMAP_OSC_12M_SEL, 0); } /* * For NatSemi, switch to bank 2 not bank 1, to avoid resetting EXCR2, * otherwise just set DLAB */ if (up->capabilities & UART_NATSEMI) serial_port_out(port, UART_LCR, 0xe0); else serial_port_out(port, UART_LCR, up->lcr | UART_LCR_DLAB); serial_dl_write(up, quot); } EXPORT_SYMBOL_GPL(serial8250_do_set_divisor); static void serial8250_set_divisor(struct uart_port *port, unsigned int baud, unsigned int quot, unsigned int quot_frac) { if (port->set_divisor) port->set_divisor(port, baud, quot, quot_frac); else serial8250_do_set_divisor(port, baud, quot, quot_frac); } static unsigned int serial8250_get_baud_rate(struct uart_port *port, struct ktermios *termios, const struct ktermios *old) { unsigned int tolerance = port->uartclk / 100; unsigned int min; unsigned int max; /* * Handle magic divisors for baud rates above baud_base on SMSC * Super I/O chips. Enable custom rates of clk/4 and clk/8, but * disable divisor values beyond 32767, which are unavailable. */ if (port->flags & UPF_MAGIC_MULTIPLIER) { min = port->uartclk / 16 / UART_DIV_MAX >> 1; max = (port->uartclk + tolerance) / 4; } else { min = port->uartclk / 16 / UART_DIV_MAX; max = (port->uartclk + tolerance) / 16; } /* * Ask the core to calculate the divisor for us. * Allow 1% tolerance at the upper limit so uart clks marginally * slower than nominal still match standard baud rates without * causing transmission errors. */ return uart_get_baud_rate(port, termios, old, min, max); } /* * Note in order to avoid the tty port mutex deadlock don't use the next method * within the uart port callbacks. Primarily it's supposed to be utilized to * handle a sudden reference clock rate change. */ void serial8250_update_uartclk(struct uart_port *port, unsigned int uartclk) { struct uart_8250_port *up = up_to_u8250p(port); struct tty_port *tport = &port->state->port; unsigned int baud, quot, frac = 0; struct ktermios *termios; struct tty_struct *tty; unsigned long flags; tty = tty_port_tty_get(tport); if (!tty) { mutex_lock(&tport->mutex); port->uartclk = uartclk; mutex_unlock(&tport->mutex); return; } down_write(&tty->termios_rwsem); mutex_lock(&tport->mutex); if (port->uartclk == uartclk) goto out_unlock; port->uartclk = uartclk; if (!tty_port_initialized(tport)) goto out_unlock; termios = &tty->termios; baud = serial8250_get_baud_rate(port, termios, NULL); quot = serial8250_get_divisor(port, baud, &frac); serial8250_rpm_get(up); uart_port_lock_irqsave(port, &flags); uart_update_timeout(port, termios->c_cflag, baud); serial8250_set_divisor(port, baud, quot, frac); serial_port_out(port, UART_LCR, up->lcr); uart_port_unlock_irqrestore(port, flags); serial8250_rpm_put(up); out_unlock: mutex_unlock(&tport->mutex); up_write(&tty->termios_rwsem); tty_kref_put(tty); } EXPORT_SYMBOL_GPL(serial8250_update_uartclk); void serial8250_do_set_termios(struct uart_port *port, struct ktermios *termios, const struct ktermios *old) { struct uart_8250_port *up = up_to_u8250p(port); unsigned char cval; unsigned long flags; unsigned int baud, quot, frac = 0; if (up->capabilities & UART_CAP_MINI) { termios->c_cflag &= ~(CSTOPB | PARENB | PARODD | CMSPAR); if ((termios->c_cflag & CSIZE) == CS5 || (termios->c_cflag & CSIZE) == CS6) termios->c_cflag = (termios->c_cflag & ~CSIZE) | CS7; } cval = serial8250_compute_lcr(up, termios->c_cflag); baud = serial8250_get_baud_rate(port, termios, old); quot = serial8250_get_divisor(port, baud, &frac); /* * Ok, we're now changing the port state. Do it with * interrupts disabled. * * Synchronize UART_IER access against the console. */ serial8250_rpm_get(up); uart_port_lock_irqsave(port, &flags); up->lcr = cval; /* Save computed LCR */ if (up->capabilities & UART_CAP_FIFO && port->fifosize > 1) { if (baud < 2400 && !up->dma) { up->fcr &= ~UART_FCR_TRIGGER_MASK; up->fcr |= UART_FCR_TRIGGER_1; } } /* * MCR-based auto flow control. When AFE is enabled, RTS will be * deasserted when the receive FIFO contains more characters than * the trigger, or the MCR RTS bit is cleared. */ if (up->capabilities & UART_CAP_AFE) { up->mcr &= ~UART_MCR_AFE; if (termios->c_cflag & CRTSCTS) up->mcr |= UART_MCR_AFE; } /* * Update the per-port timeout. */ uart_update_timeout(port, termios->c_cflag, baud); port->read_status_mask = UART_LSR_OE | UART_LSR_THRE | UART_LSR_DR; if (termios->c_iflag & INPCK) port->read_status_mask |= UART_LSR_FE | UART_LSR_PE; if (termios->c_iflag & (IGNBRK | BRKINT | PARMRK)) port->read_status_mask |= UART_LSR_BI; /* * Characters to ignore */ port->ignore_status_mask = 0; if (termios->c_iflag & IGNPAR) port->ignore_status_mask |= UART_LSR_PE | UART_LSR_FE; if (termios->c_iflag & IGNBRK) { port->ignore_status_mask |= UART_LSR_BI; /* * If we're ignoring parity and break indicators, * ignore overruns too (for real raw support). */ if (termios->c_iflag & IGNPAR) port->ignore_status_mask |= UART_LSR_OE; } /* * ignore all characters if CREAD is not set */ if ((termios->c_cflag & CREAD) == 0) port->ignore_status_mask |= UART_LSR_DR; /* * CTS flow control flag and modem status interrupts */ up->ier &= ~UART_IER_MSI; if (!(up->bugs & UART_BUG_NOMSR) && UART_ENABLE_MS(&up->port, termios->c_cflag)) up->ier |= UART_IER_MSI; if (up->capabilities & UART_CAP_UUE) up->ier |= UART_IER_UUE; if (up->capabilities & UART_CAP_RTOIE) up->ier |= UART_IER_RTOIE; serial_port_out(port, UART_IER, up->ier); if (up->capabilities & UART_CAP_EFR) { unsigned char efr = 0; /* * TI16C752/Startech hardware flow control. FIXME: * - TI16C752 requires control thresholds to be set. * - UART_MCR_RTS is ineffective if auto-RTS mode is enabled. */ if (termios->c_cflag & CRTSCTS) efr |= UART_EFR_CTS; serial_port_out(port, UART_LCR, UART_LCR_CONF_MODE_B); if (port->flags & UPF_EXAR_EFR) serial_port_out(port, UART_XR_EFR, efr); else serial_port_out(port, UART_EFR, efr); } serial8250_set_divisor(port, baud, quot, frac); /* * LCR DLAB must be set to enable 64-byte FIFO mode. If the FCR * is written without DLAB set, this mode will be disabled. */ if (port->type == PORT_16750) serial_port_out(port, UART_FCR, up->fcr); serial_port_out(port, UART_LCR, up->lcr); /* reset DLAB */ if (port->type != PORT_16750) { /* emulated UARTs (Lucent Venus 167x) need two steps */ if (up->fcr & UART_FCR_ENABLE_FIFO) serial_port_out(port, UART_FCR, UART_FCR_ENABLE_FIFO); serial_port_out(port, UART_FCR, up->fcr); /* set fcr */ } serial8250_set_mctrl(port, port->mctrl); uart_port_unlock_irqrestore(port, flags); serial8250_rpm_put(up); /* Don't rewrite B0 */ if (tty_termios_baud_rate(termios)) tty_termios_encode_baud_rate(termios, baud, baud); } EXPORT_SYMBOL(serial8250_do_set_termios); static void serial8250_set_termios(struct uart_port *port, struct ktermios *termios, const struct ktermios *old) { if (port->set_termios) port->set_termios(port, termios, old); else serial8250_do_set_termios(port, termios, old); } void serial8250_do_set_ldisc(struct uart_port *port, struct ktermios *termios) { if (termios->c_line == N_PPS) { port->flags |= UPF_HARDPPS_CD; uart_port_lock_irq(port); serial8250_enable_ms(port); uart_port_unlock_irq(port); } else { port->flags &= ~UPF_HARDPPS_CD; if (!UART_ENABLE_MS(port, termios->c_cflag)) { uart_port_lock_irq(port); serial8250_disable_ms(port); uart_port_unlock_irq(port); } } } EXPORT_SYMBOL_GPL(serial8250_do_set_ldisc); static void serial8250_set_ldisc(struct uart_port *port, struct ktermios *termios) { if (port->set_ldisc) port->set_ldisc(port, termios); else serial8250_do_set_ldisc(port, termios); } void serial8250_do_pm(struct uart_port *port, unsigned int state, unsigned int oldstate) { struct uart_8250_port *p = up_to_u8250p(port); serial8250_set_sleep(p, state != 0); } EXPORT_SYMBOL(serial8250_do_pm); static void serial8250_pm(struct uart_port *port, unsigned int state, unsigned int oldstate) { if (port->pm) port->pm(port, state, oldstate); else serial8250_do_pm(port, state, oldstate); } static unsigned int serial8250_port_size(struct uart_8250_port *pt) { if (pt->port.mapsize) return pt->port.mapsize; if (is_omap1_8250(pt)) return 0x16 << pt->port.regshift; return 8 << pt->port.regshift; } /* * Resource handling. */ static int serial8250_request_std_resource(struct uart_8250_port *up) { unsigned int size = serial8250_port_size(up); struct uart_port *port = &up->port; int ret = 0; switch (port->iotype) { case UPIO_AU: case UPIO_TSI: case UPIO_MEM32: case UPIO_MEM32BE: case UPIO_MEM16: case UPIO_MEM: if (!port->mapbase) { ret = -EINVAL; break; } if (!request_mem_region(port->mapbase, size, "serial")) { ret = -EBUSY; break; } if (port->flags & UPF_IOREMAP) { port->membase = ioremap(port->mapbase, size); if (!port->membase) { release_mem_region(port->mapbase, size); ret = -ENOMEM; } } break; case UPIO_HUB6: case UPIO_PORT: if (!request_region(port->iobase, size, "serial")) ret = -EBUSY; break; } return ret; } static void serial8250_release_std_resource(struct uart_8250_port *up) { unsigned int size = serial8250_port_size(up); struct uart_port *port = &up->port; switch (port->iotype) { case UPIO_AU: case UPIO_TSI: case UPIO_MEM32: case UPIO_MEM32BE: case UPIO_MEM16: case UPIO_MEM: if (!port->mapbase) break; if (port->flags & UPF_IOREMAP) { iounmap(port->membase); port->membase = NULL; } release_mem_region(port->mapbase, size); break; case UPIO_HUB6: case UPIO_PORT: release_region(port->iobase, size); break; } } static void serial8250_release_port(struct uart_port *port) { struct uart_8250_port *up = up_to_u8250p(port); serial8250_release_std_resource(up); } static int serial8250_request_port(struct uart_port *port) { struct uart_8250_port *up = up_to_u8250p(port); return serial8250_request_std_resource(up); } static int fcr_get_rxtrig_bytes(struct uart_8250_port *up) { const struct serial8250_config *conf_type = &uart_config[up->port.type]; unsigned char bytes; bytes = conf_type->rxtrig_bytes[UART_FCR_R_TRIG_BITS(up->fcr)]; return bytes ? bytes : -EOPNOTSUPP; } static int bytes_to_fcr_rxtrig(struct uart_8250_port *up, unsigned char bytes) { const struct serial8250_config *conf_type = &uart_config[up->port.type]; int i; if (!conf_type->rxtrig_bytes[UART_FCR_R_TRIG_BITS(UART_FCR_R_TRIG_00)]) return -EOPNOTSUPP; for (i = 1; i < UART_FCR_R_TRIG_MAX_STATE; i++) { if (bytes < conf_type->rxtrig_bytes[i]) /* Use the nearest lower value */ return (--i) << UART_FCR_R_TRIG_SHIFT; } return UART_FCR_R_TRIG_11; } static int do_get_rxtrig(struct tty_port *port) { struct uart_state *state = container_of(port, struct uart_state, port); struct uart_port *uport = state->uart_port; struct uart_8250_port *up = up_to_u8250p(uport); if (!(up->capabilities & UART_CAP_FIFO) || uport->fifosize <= 1) return -EINVAL; return fcr_get_rxtrig_bytes(up); } static int do_serial8250_get_rxtrig(struct tty_port *port) { int rxtrig_bytes; mutex_lock(&port->mutex); rxtrig_bytes = do_get_rxtrig(port); mutex_unlock(&port->mutex); return rxtrig_bytes; } static ssize_t rx_trig_bytes_show(struct device *dev, struct device_attribute *attr, char *buf) { struct tty_port *port = dev_get_drvdata(dev); int rxtrig_bytes; rxtrig_bytes = do_serial8250_get_rxtrig(port); if (rxtrig_bytes < 0) return rxtrig_bytes; return sysfs_emit(buf, "%d\n", rxtrig_bytes); } static int do_set_rxtrig(struct tty_port *port, unsigned char bytes) { struct uart_state *state = container_of(port, struct uart_state, port); struct uart_port *uport = state->uart_port; struct uart_8250_port *up = up_to_u8250p(uport); int rxtrig; if (!(up->capabilities & UART_CAP_FIFO) || uport->fifosize <= 1) return -EINVAL; rxtrig = bytes_to_fcr_rxtrig(up, bytes); if (rxtrig < 0) return rxtrig; serial8250_clear_fifos(up); up->fcr &= ~UART_FCR_TRIGGER_MASK; up->fcr |= (unsigned char)rxtrig; serial_out(up, UART_FCR, up->fcr); return 0; } static int do_serial8250_set_rxtrig(struct tty_port *port, unsigned char bytes) { int ret; mutex_lock(&port->mutex); ret = do_set_rxtrig(port, bytes); mutex_unlock(&port->mutex); return ret; } static ssize_t rx_trig_bytes_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct tty_port *port = dev_get_drvdata(dev); unsigned char bytes; int ret; if (!count) return -EINVAL; ret = kstrtou8(buf, 10, &bytes); if (ret < 0) return ret; ret = do_serial8250_set_rxtrig(port, bytes); if (ret < 0) return ret; return count; } static DEVICE_ATTR_RW(rx_trig_bytes); static struct attribute *serial8250_dev_attrs[] = { &dev_attr_rx_trig_bytes.attr, NULL }; static struct attribute_group serial8250_dev_attr_group = { .attrs = serial8250_dev_attrs, }; static void register_dev_spec_attr_grp(struct uart_8250_port *up) { const struct serial8250_config *conf_type = &uart_config[up->port.type]; if (conf_type->rxtrig_bytes[0]) up->port.attr_group = &serial8250_dev_attr_group; } static void serial8250_config_port(struct uart_port *port, int flags) { struct uart_8250_port *up = up_to_u8250p(port); int ret; /* * Find the region that we can probe for. This in turn * tells us whether we can probe for the type of port. */ ret = serial8250_request_std_resource(up); if (ret < 0) return; if (port->iotype != up->cur_iotype) set_io_from_upio(port); if (flags & UART_CONFIG_TYPE) autoconfig(up); /* HW bugs may trigger IRQ while IIR == NO_INT */ if (port->type == PORT_TEGRA) up->bugs |= UART_BUG_NOMSR; if (port->type != PORT_UNKNOWN && flags & UART_CONFIG_IRQ) autoconfig_irq(up); if (port->type == PORT_UNKNOWN) serial8250_release_std_resource(up); register_dev_spec_attr_grp(up); up->fcr = uart_config[up->port.type].fcr; } static int serial8250_verify_port(struct uart_port *port, struct serial_struct *ser) { if (ser->irq >= nr_irqs || ser->irq < 0 || ser->baud_base < 9600 || ser->type < PORT_UNKNOWN || ser->type >= ARRAY_SIZE(uart_config) || ser->type == PORT_CIRRUS || ser->type == PORT_STARTECH) return -EINVAL; return 0; } static const char *serial8250_type(struct uart_port *port) { int type = port->type; if (type >= ARRAY_SIZE(uart_config)) type = 0; return uart_config[type].name; } static const struct uart_ops serial8250_pops = { .tx_empty = serial8250_tx_empty, .set_mctrl = serial8250_set_mctrl, .get_mctrl = serial8250_get_mctrl, .stop_tx = serial8250_stop_tx, .start_tx = serial8250_start_tx, .throttle = serial8250_throttle, .unthrottle = serial8250_unthrottle, .stop_rx = serial8250_stop_rx, .enable_ms = serial8250_enable_ms, .break_ctl = serial8250_break_ctl, .startup = serial8250_startup, .shutdown = serial8250_shutdown, .set_termios = serial8250_set_termios, .set_ldisc = serial8250_set_ldisc, .pm = serial8250_pm, .type = serial8250_type, .release_port = serial8250_release_port, .request_port = serial8250_request_port, .config_port = serial8250_config_port, .verify_port = serial8250_verify_port, #ifdef CONFIG_CONSOLE_POLL .poll_get_char = serial8250_get_poll_char, .poll_put_char = serial8250_put_poll_char, #endif }; void serial8250_init_port(struct uart_8250_port *up) { struct uart_port *port = &up->port; spin_lock_init(&port->lock); port->ctrl_id = 0; port->pm = NULL; port->ops = &serial8250_pops; port->has_sysrq = IS_ENABLED(CONFIG_SERIAL_8250_CONSOLE); up->cur_iotype = 0xFF; } EXPORT_SYMBOL_GPL(serial8250_init_port); void serial8250_set_defaults(struct uart_8250_port *up) { struct uart_port *port = &up->port; if (up->port.flags & UPF_FIXED_TYPE) { unsigned int type = up->port.type; if (!up->port.fifosize) up->port.fifosize = uart_config[type].fifo_size; if (!up->tx_loadsz) up->tx_loadsz = uart_config[type].tx_loadsz; if (!up->capabilities) up->capabilities = uart_config[type].flags; } set_io_from_upio(port); /* default dma handlers */ if (up->dma) { if (!up->dma->tx_dma) up->dma->tx_dma = serial8250_tx_dma; if (!up->dma->rx_dma) up->dma->rx_dma = serial8250_rx_dma; } } EXPORT_SYMBOL_GPL(serial8250_set_defaults); #ifdef CONFIG_SERIAL_8250_CONSOLE static void serial8250_console_putchar(struct uart_port *port, unsigned char ch) { struct uart_8250_port *up = up_to_u8250p(port); wait_for_xmitr(up, UART_LSR_THRE); serial_port_out(port, UART_TX, ch); } /* * Restore serial console when h/w power-off detected */ static void serial8250_console_restore(struct uart_8250_port *up) { struct uart_port *port = &up->port; struct ktermios termios; unsigned int baud, quot, frac = 0; termios.c_cflag = port->cons->cflag; termios.c_ispeed = port->cons->ispeed; termios.c_ospeed = port->cons->ospeed; if (port->state->port.tty && termios.c_cflag == 0) { termios.c_cflag = port->state->port.tty->termios.c_cflag; termios.c_ispeed = port->state->port.tty->termios.c_ispeed; termios.c_ospeed = port->state->port.tty->termios.c_ospeed; } baud = serial8250_get_baud_rate(port, &termios, NULL); quot = serial8250_get_divisor(port, baud, &frac); serial8250_set_divisor(port, baud, quot, frac); serial_port_out(port, UART_LCR, up->lcr); serial8250_out_MCR(up, up->mcr | UART_MCR_DTR | UART_MCR_RTS); } /* * Print a string to the serial port using the device FIFO * * It sends fifosize bytes and then waits for the fifo * to get empty. */ static void serial8250_console_fifo_write(struct uart_8250_port *up, const char *s, unsigned int count) { int i; const char *end = s + count; unsigned int fifosize = up->tx_loadsz; bool cr_sent = false; while (s != end) { wait_for_lsr(up, UART_LSR_THRE); for (i = 0; i < fifosize && s != end; ++i) { if (*s == '\n' && !cr_sent) { serial_out(up, UART_TX, '\r'); cr_sent = true; } else { serial_out(up, UART_TX, *s++); cr_sent = false; } } } } /* * Print a string to the serial port trying not to disturb * any possible real use of the port... * * The console_lock must be held when we get here. * * Doing runtime PM is really a bad idea for the kernel console. * Thus, we assume the function is called when device is powered up. */ void serial8250_console_write(struct uart_8250_port *up, const char *s, unsigned int count) { struct uart_8250_em485 *em485 = up->em485; struct uart_port *port = &up->port; unsigned long flags; unsigned int ier, use_fifo; int locked = 1; touch_nmi_watchdog(); if (oops_in_progress) locked = uart_port_trylock_irqsave(port, &flags); else uart_port_lock_irqsave(port, &flags); /* * First save the IER then disable the interrupts */ ier = serial_port_in(port, UART_IER); serial8250_clear_IER(up); /* check scratch reg to see if port powered off during system sleep */ if (up->canary && (up->canary != serial_port_in(port, UART_SCR))) { serial8250_console_restore(up); up->canary = 0; } if (em485) { if (em485->tx_stopped) up->rs485_start_tx(up); mdelay(port->rs485.delay_rts_before_send); } use_fifo = (up->capabilities & UART_CAP_FIFO) && /* * BCM283x requires to check the fifo * after each byte. */ !(up->capabilities & UART_CAP_MINI) && /* * tx_loadsz contains the transmit fifo size */ up->tx_loadsz > 1 && (up->fcr & UART_FCR_ENABLE_FIFO) && port->state && test_bit(TTY_PORT_INITIALIZED, &port->state->port.iflags) && /* * After we put a data in the fifo, the controller will send * it regardless of the CTS state. Therefore, only use fifo * if we don't use control flow. */ !(up->port.flags & UPF_CONS_FLOW); if (likely(use_fifo)) serial8250_console_fifo_write(up, s, count); else uart_console_write(port, s, count, serial8250_console_putchar); /* * Finally, wait for transmitter to become empty * and restore the IER */ wait_for_xmitr(up, UART_LSR_BOTH_EMPTY); if (em485) { mdelay(port->rs485.delay_rts_after_send); if (em485->tx_stopped) up->rs485_stop_tx(up); } serial_port_out(port, UART_IER, ier); /* * The receive handling will happen properly because the * receive ready bit will still be set; it is not cleared * on read. However, modem control will not, we must * call it if we have saved something in the saved flags * while processing with interrupts off. */ if (up->msr_saved_flags) serial8250_modem_status(up); if (locked) uart_port_unlock_irqrestore(port, flags); } static unsigned int probe_baud(struct uart_port *port) { unsigned char lcr, dll, dlm; unsigned int quot; lcr = serial_port_in(port, UART_LCR); serial_port_out(port, UART_LCR, lcr | UART_LCR_DLAB); dll = serial_port_in(port, UART_DLL); dlm = serial_port_in(port, UART_DLM); serial_port_out(port, UART_LCR, lcr); quot = (dlm << 8) | dll; return (port->uartclk / 16) / quot; } int serial8250_console_setup(struct uart_port *port, char *options, bool probe) { int baud = 9600; int bits = 8; int parity = 'n'; int flow = 'n'; int ret; if (!port->iobase && !port->membase) return -ENODEV; if (options) uart_parse_options(options, &baud, &parity, &bits, &flow); else if (probe) baud = probe_baud(port); ret = uart_set_options(port, port->cons, baud, parity, bits, flow); if (ret) return ret; if (port->dev) pm_runtime_get_sync(port->dev); return 0; } int serial8250_console_exit(struct uart_port *port) { if (port->dev) pm_runtime_put_sync(port->dev); return 0; } #endif /* CONFIG_SERIAL_8250_CONSOLE */ MODULE_LICENSE("GPL"); |
| 17 17 17 29 29 17 29 29 8 29 29 1796 29 28 29 29 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 | // SPDX-License-Identifier: GPL-2.0 /* * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de> * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar * Copyright(C) 2006-2007 Timesys Corp., Thomas Gleixner * * NOHZ implementation for low and high resolution timers * * Started by: Thomas Gleixner and Ingo Molnar */ #include <linux/cpu.h> #include <linux/err.h> #include <linux/hrtimer.h> #include <linux/interrupt.h> #include <linux/kernel_stat.h> #include <linux/percpu.h> #include <linux/nmi.h> #include <linux/profile.h> #include <linux/sched/signal.h> #include <linux/sched/clock.h> #include <linux/sched/stat.h> #include <linux/sched/nohz.h> #include <linux/sched/loadavg.h> #include <linux/module.h> #include <linux/irq_work.h> #include <linux/posix-timers.h> #include <linux/context_tracking.h> #include <linux/mm.h> #include <asm/irq_regs.h> #include "tick-internal.h" #include <trace/events/timer.h> /* * Per-CPU nohz control structure */ static DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched); struct tick_sched *tick_get_tick_sched(int cpu) { return &per_cpu(tick_cpu_sched, cpu); } #if defined(CONFIG_NO_HZ_COMMON) || defined(CONFIG_HIGH_RES_TIMERS) /* * The time when the last jiffy update happened. Write access must hold * jiffies_lock and jiffies_seq. tick_nohz_next_event() needs to get a * consistent view of jiffies and last_jiffies_update. */ static ktime_t last_jiffies_update; /* * Must be called with interrupts disabled ! */ static void tick_do_update_jiffies64(ktime_t now) { unsigned long ticks = 1; ktime_t delta, nextp; /* * 64-bit can do a quick check without holding the jiffies lock and * without looking at the sequence count. The smp_load_acquire() * pairs with the update done later in this function. * * 32-bit cannot do that because the store of 'tick_next_period' * consists of two 32-bit stores, and the first store could be * moved by the CPU to a random point in the future. */ if (IS_ENABLED(CONFIG_64BIT)) { if (ktime_before(now, smp_load_acquire(&tick_next_period))) return; } else { unsigned int seq; /* * Avoid contention on 'jiffies_lock' and protect the quick * check with the sequence count. */ do { seq = read_seqcount_begin(&jiffies_seq); nextp = tick_next_period; } while (read_seqcount_retry(&jiffies_seq, seq)); if (ktime_before(now, nextp)) return; } /* Quick check failed, i.e. update is required. */ raw_spin_lock(&jiffies_lock); /* * Re-evaluate with the lock held. Another CPU might have done the * update already. */ if (ktime_before(now, tick_next_period)) { raw_spin_unlock(&jiffies_lock); return; } write_seqcount_begin(&jiffies_seq); delta = ktime_sub(now, tick_next_period); if (unlikely(delta >= TICK_NSEC)) { /* Slow path for long idle sleep times */ s64 incr = TICK_NSEC; ticks += ktime_divns(delta, incr); last_jiffies_update = ktime_add_ns(last_jiffies_update, incr * ticks); } else { last_jiffies_update = ktime_add_ns(last_jiffies_update, TICK_NSEC); } /* Advance jiffies to complete the 'jiffies_seq' protected job */ jiffies_64 += ticks; /* Keep the tick_next_period variable up to date */ nextp = ktime_add_ns(last_jiffies_update, TICK_NSEC); if (IS_ENABLED(CONFIG_64BIT)) { /* * Pairs with smp_load_acquire() in the lockless quick * check above, and ensures that the update to 'jiffies_64' is * not reordered vs. the store to 'tick_next_period', neither * by the compiler nor by the CPU. */ smp_store_release(&tick_next_period, nextp); } else { /* * A plain store is good enough on 32-bit, as the quick check * above is protected by the sequence count. */ tick_next_period = nextp; } /* * Release the sequence count. calc_global_load() below is not * protected by it, but 'jiffies_lock' needs to be held to prevent * concurrent invocations. */ write_seqcount_end(&jiffies_seq); calc_global_load(); raw_spin_unlock(&jiffies_lock); update_wall_time(); } /* * Initialize and return retrieve the jiffies update. */ static ktime_t tick_init_jiffy_update(void) { ktime_t period; raw_spin_lock(&jiffies_lock); write_seqcount_begin(&jiffies_seq); /* Have we started the jiffies update yet ? */ if (last_jiffies_update == 0) { u32 rem; /* * Ensure that the tick is aligned to a multiple of * TICK_NSEC. */ div_u64_rem(tick_next_period, TICK_NSEC, &rem); if (rem) tick_next_period += TICK_NSEC - rem; last_jiffies_update = tick_next_period; } period = last_jiffies_update; write_seqcount_end(&jiffies_seq); raw_spin_unlock(&jiffies_lock); return period; } #define MAX_STALLED_JIFFIES 5 static void tick_sched_do_timer(struct tick_sched *ts, ktime_t now) { int cpu = smp_processor_id(); #ifdef CONFIG_NO_HZ_COMMON /* * Check if the do_timer duty was dropped. We don't care about * concurrency: This happens only when the CPU in charge went * into a long sleep. If two CPUs happen to assign themselves to * this duty, then the jiffies update is still serialized by * 'jiffies_lock'. * * If nohz_full is enabled, this should not happen because the * 'tick_do_timer_cpu' CPU never relinquishes. */ if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE)) { #ifdef CONFIG_NO_HZ_FULL WARN_ON_ONCE(tick_nohz_full_running); #endif tick_do_timer_cpu = cpu; } #endif /* Check if jiffies need an update */ if (tick_do_timer_cpu == cpu) tick_do_update_jiffies64(now); /* * If the jiffies update stalled for too long (timekeeper in stop_machine() * or VMEXIT'ed for several msecs), force an update. */ if (ts->last_tick_jiffies != jiffies) { ts->stalled_jiffies = 0; ts->last_tick_jiffies = READ_ONCE(jiffies); } else { if (++ts->stalled_jiffies == MAX_STALLED_JIFFIES) { tick_do_update_jiffies64(now); ts->stalled_jiffies = 0; ts->last_tick_jiffies = READ_ONCE(jiffies); } } if (ts->inidle) ts->got_idle_tick = 1; } static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs) { #ifdef CONFIG_NO_HZ_COMMON /* * When we are idle and the tick is stopped, we have to touch * the watchdog as we might not schedule for a really long * time. This happens on completely idle SMP systems while * waiting on the login prompt. We also increment the "start of * idle" jiffy stamp so the idle accounting adjustment we do * when we go busy again does not account too many ticks. */ if (ts->tick_stopped) { touch_softlockup_watchdog_sched(); if (is_idle_task(current)) ts->idle_jiffies++; /* * In case the current tick fired too early past its expected * expiration, make sure we don't bypass the next clock reprogramming * to the same deadline. */ ts->next_tick = 0; } #endif update_process_times(user_mode(regs)); profile_tick(CPU_PROFILING); } #endif #ifdef CONFIG_NO_HZ_FULL cpumask_var_t tick_nohz_full_mask; EXPORT_SYMBOL_GPL(tick_nohz_full_mask); bool tick_nohz_full_running; EXPORT_SYMBOL_GPL(tick_nohz_full_running); static atomic_t tick_dep_mask; static bool check_tick_dependency(atomic_t *dep) { int val = atomic_read(dep); if (val & TICK_DEP_MASK_POSIX_TIMER) { trace_tick_stop(0, TICK_DEP_MASK_POSIX_TIMER); return true; } if (val & TICK_DEP_MASK_PERF_EVENTS) { trace_tick_stop(0, TICK_DEP_MASK_PERF_EVENTS); return true; } if (val & TICK_DEP_MASK_SCHED) { trace_tick_stop(0, TICK_DEP_MASK_SCHED); return true; } if (val & TICK_DEP_MASK_CLOCK_UNSTABLE) { trace_tick_stop(0, TICK_DEP_MASK_CLOCK_UNSTABLE); return true; } if (val & TICK_DEP_MASK_RCU) { trace_tick_stop(0, TICK_DEP_MASK_RCU); return true; } if (val & TICK_DEP_MASK_RCU_EXP) { trace_tick_stop(0, TICK_DEP_MASK_RCU_EXP); return true; } return false; } static bool can_stop_full_tick(int cpu, struct tick_sched *ts) { lockdep_assert_irqs_disabled(); if (unlikely(!cpu_online(cpu))) return false; if (check_tick_dependency(&tick_dep_mask)) return false; if (check_tick_dependency(&ts->tick_dep_mask)) return false; if (check_tick_dependency(¤t->tick_dep_mask)) return false; if (check_tick_dependency(¤t->signal->tick_dep_mask)) return false; return true; } static void nohz_full_kick_func(struct irq_work *work) { /* Empty, the tick restart happens on tick_nohz_irq_exit() */ } static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = IRQ_WORK_INIT_HARD(nohz_full_kick_func); /* * Kick this CPU if it's full dynticks in order to force it to * re-evaluate its dependency on the tick and restart it if necessary. * This kick, unlike tick_nohz_full_kick_cpu() and tick_nohz_full_kick_all(), * is NMI safe. */ static void tick_nohz_full_kick(void) { if (!tick_nohz_full_cpu(smp_processor_id())) return; irq_work_queue(this_cpu_ptr(&nohz_full_kick_work)); } /* * Kick the CPU if it's full dynticks in order to force it to * re-evaluate its dependency on the tick and restart it if necessary. */ void tick_nohz_full_kick_cpu(int cpu) { if (!tick_nohz_full_cpu(cpu)) return; irq_work_queue_on(&per_cpu(nohz_full_kick_work, cpu), cpu); } static void tick_nohz_kick_task(struct task_struct *tsk) { int cpu; /* * If the task is not running, run_posix_cpu_timers() * has nothing to elapse, and an IPI can then be optimized out. * * activate_task() STORE p->tick_dep_mask * STORE p->on_rq * __schedule() (switch to task 'p') smp_mb() (atomic_fetch_or()) * LOCK rq->lock LOAD p->on_rq * smp_mb__after_spin_lock() * tick_nohz_task_switch() * LOAD p->tick_dep_mask */ if (!sched_task_on_rq(tsk)) return; /* * If the task concurrently migrates to another CPU, * we guarantee it sees the new tick dependency upon * schedule. * * set_task_cpu(p, cpu); * STORE p->cpu = @cpu * __schedule() (switch to task 'p') * LOCK rq->lock * smp_mb__after_spin_lock() STORE p->tick_dep_mask * tick_nohz_task_switch() smp_mb() (atomic_fetch_or()) * LOAD p->tick_dep_mask LOAD p->cpu */ cpu = task_cpu(tsk); preempt_disable(); if (cpu_online(cpu)) tick_nohz_full_kick_cpu(cpu); preempt_enable(); } /* * Kick all full dynticks CPUs in order to force these to re-evaluate * their dependency on the tick and restart it if necessary. */ static void tick_nohz_full_kick_all(void) { int cpu; if (!tick_nohz_full_running) return; preempt_disable(); for_each_cpu_and(cpu, tick_nohz_full_mask, cpu_online_mask) tick_nohz_full_kick_cpu(cpu); preempt_enable(); } static void tick_nohz_dep_set_all(atomic_t *dep, enum tick_dep_bits bit) { int prev; prev = atomic_fetch_or(BIT(bit), dep); if (!prev) tick_nohz_full_kick_all(); } /* * Set a global tick dependency. Used by perf events that rely on freq and * unstable clocks. */ void tick_nohz_dep_set(enum tick_dep_bits bit) { tick_nohz_dep_set_all(&tick_dep_mask, bit); } void tick_nohz_dep_clear(enum tick_dep_bits bit) { atomic_andnot(BIT(bit), &tick_dep_mask); } /* * Set per-CPU tick dependency. Used by scheduler and perf events in order to * manage event-throttling. */ void tick_nohz_dep_set_cpu(int cpu, enum tick_dep_bits bit) { int prev; struct tick_sched *ts; ts = per_cpu_ptr(&tick_cpu_sched, cpu); prev = atomic_fetch_or(BIT(bit), &ts->tick_dep_mask); if (!prev) { preempt_disable(); /* Perf needs local kick that is NMI safe */ if (cpu == smp_processor_id()) { tick_nohz_full_kick(); } else { /* Remote IRQ work not NMI-safe */ if (!WARN_ON_ONCE(in_nmi())) tick_nohz_full_kick_cpu(cpu); } preempt_enable(); } } EXPORT_SYMBOL_GPL(tick_nohz_dep_set_cpu); void tick_nohz_dep_clear_cpu(int cpu, enum tick_dep_bits bit) { struct tick_sched *ts = per_cpu_ptr(&tick_cpu_sched, cpu); atomic_andnot(BIT(bit), &ts->tick_dep_mask); } EXPORT_SYMBOL_GPL(tick_nohz_dep_clear_cpu); /* * Set a per-task tick dependency. RCU needs this. Also posix CPU timers * in order to elapse per task timers. */ void tick_nohz_dep_set_task(struct task_struct *tsk, enum tick_dep_bits bit) { if (!atomic_fetch_or(BIT(bit), &tsk->tick_dep_mask)) tick_nohz_kick_task(tsk); } EXPORT_SYMBOL_GPL(tick_nohz_dep_set_task); void tick_nohz_dep_clear_task(struct task_struct *tsk, enum tick_dep_bits bit) { atomic_andnot(BIT(bit), &tsk->tick_dep_mask); } EXPORT_SYMBOL_GPL(tick_nohz_dep_clear_task); /* * Set a per-taskgroup tick dependency. Posix CPU timers need this in order to elapse * per process timers. */ void tick_nohz_dep_set_signal(struct task_struct *tsk, enum tick_dep_bits bit) { int prev; struct signal_struct *sig = tsk->signal; prev = atomic_fetch_or(BIT(bit), &sig->tick_dep_mask); if (!prev) { struct task_struct *t; lockdep_assert_held(&tsk->sighand->siglock); __for_each_thread(sig, t) tick_nohz_kick_task(t); } } void tick_nohz_dep_clear_signal(struct signal_struct *sig, enum tick_dep_bits bit) { atomic_andnot(BIT(bit), &sig->tick_dep_mask); } /* * Re-evaluate the need for the tick as we switch the current task. * It might need the tick due to per task/process properties: * perf events, posix CPU timers, ... */ void __tick_nohz_task_switch(void) { struct tick_sched *ts; if (!tick_nohz_full_cpu(smp_processor_id())) return; ts = this_cpu_ptr(&tick_cpu_sched); if (ts->tick_stopped) { if (atomic_read(¤t->tick_dep_mask) || atomic_read(¤t->signal->tick_dep_mask)) tick_nohz_full_kick(); } } /* Get the boot-time nohz CPU list from the kernel parameters. */ void __init tick_nohz_full_setup(cpumask_var_t cpumask) { alloc_bootmem_cpumask_var(&tick_nohz_full_mask); cpumask_copy(tick_nohz_full_mask, cpumask); tick_nohz_full_running = true; } bool tick_nohz_cpu_hotpluggable(unsigned int cpu) { /* * The 'tick_do_timer_cpu' CPU handles housekeeping duty (unbound * timers, workqueues, timekeeping, ...) on behalf of full dynticks * CPUs. It must remain online when nohz full is enabled. */ if (tick_nohz_full_running && tick_do_timer_cpu == cpu) return false; return true; } static int tick_nohz_cpu_down(unsigned int cpu) { return tick_nohz_cpu_hotpluggable(cpu) ? 0 : -EBUSY; } void __init tick_nohz_init(void) { int cpu, ret; if (!tick_nohz_full_running) return; /* * Full dynticks uses IRQ work to drive the tick rescheduling on safe * locking contexts. But then we need IRQ work to raise its own * interrupts to avoid circular dependency on the tick. */ if (!arch_irq_work_has_interrupt()) { pr_warn("NO_HZ: Can't run full dynticks because arch doesn't support IRQ work self-IPIs\n"); cpumask_clear(tick_nohz_full_mask); tick_nohz_full_running = false; return; } if (IS_ENABLED(CONFIG_PM_SLEEP_SMP) && !IS_ENABLED(CONFIG_PM_SLEEP_SMP_NONZERO_CPU)) { cpu = smp_processor_id(); if (cpumask_test_cpu(cpu, tick_nohz_full_mask)) { pr_warn("NO_HZ: Clearing %d from nohz_full range " "for timekeeping\n", cpu); cpumask_clear_cpu(cpu, tick_nohz_full_mask); } } for_each_cpu(cpu, tick_nohz_full_mask) ct_cpu_track_user(cpu); ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "kernel/nohz:predown", NULL, tick_nohz_cpu_down); WARN_ON(ret < 0); pr_info("NO_HZ: Full dynticks CPUs: %*pbl.\n", cpumask_pr_args(tick_nohz_full_mask)); } #endif /* * NOHZ - aka dynamic tick functionality */ #ifdef CONFIG_NO_HZ_COMMON /* * NO HZ enabled ? */ bool tick_nohz_enabled __read_mostly = true; unsigned long tick_nohz_active __read_mostly; /* * Enable / Disable tickless mode */ static int __init setup_tick_nohz(char *str) { return (kstrtobool(str, &tick_nohz_enabled) == 0); } __setup("nohz=", setup_tick_nohz); bool tick_nohz_tick_stopped(void) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); return ts->tick_stopped; } bool tick_nohz_tick_stopped_cpu(int cpu) { struct tick_sched *ts = per_cpu_ptr(&tick_cpu_sched, cpu); return ts->tick_stopped; } /** * tick_nohz_update_jiffies - update jiffies when idle was interrupted * * Called from interrupt entry when the CPU was idle * * In case the sched_tick was stopped on this CPU, we have to check if jiffies * must be updated. Otherwise an interrupt handler could use a stale jiffy * value. We do this unconditionally on any CPU, as we don't know whether the * CPU, which has the update task assigned, is in a long sleep. */ static void tick_nohz_update_jiffies(ktime_t now) { unsigned long flags; __this_cpu_write(tick_cpu_sched.idle_waketime, now); local_irq_save(flags); tick_do_update_jiffies64(now); local_irq_restore(flags); touch_softlockup_watchdog_sched(); } static void tick_nohz_stop_idle(struct tick_sched *ts, ktime_t now) { ktime_t delta; if (WARN_ON_ONCE(!ts->idle_active)) return; delta = ktime_sub(now, ts->idle_entrytime); write_seqcount_begin(&ts->idle_sleeptime_seq); if (nr_iowait_cpu(smp_processor_id()) > 0) ts->iowait_sleeptime = ktime_add(ts->iowait_sleeptime, delta); else ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta); ts->idle_entrytime = now; ts->idle_active = 0; write_seqcount_end(&ts->idle_sleeptime_seq); sched_clock_idle_wakeup_event(); } static void tick_nohz_start_idle(struct tick_sched *ts) { write_seqcount_begin(&ts->idle_sleeptime_seq); ts->idle_entrytime = ktime_get(); ts->idle_active = 1; write_seqcount_end(&ts->idle_sleeptime_seq); sched_clock_idle_sleep_event(); } static u64 get_cpu_sleep_time_us(struct tick_sched *ts, ktime_t *sleeptime, bool compute_delta, u64 *last_update_time) { ktime_t now, idle; unsigned int seq; if (!tick_nohz_active) return -1; now = ktime_get(); if (last_update_time) *last_update_time = ktime_to_us(now); do { seq = read_seqcount_begin(&ts->idle_sleeptime_seq); if (ts->idle_active && compute_delta) { ktime_t delta = ktime_sub(now, ts->idle_entrytime); idle = ktime_add(*sleeptime, delta); } else { idle = *sleeptime; } } while (read_seqcount_retry(&ts->idle_sleeptime_seq, seq)); return ktime_to_us(idle); } /** * get_cpu_idle_time_us - get the total idle time of a CPU * @cpu: CPU number to query * @last_update_time: variable to store update time in. Do not update * counters if NULL. * * Return the cumulative idle time (since boot) for a given * CPU, in microseconds. Note that this is partially broken due to * the counter of iowait tasks that can be remotely updated without * any synchronization. Therefore it is possible to observe backward * values within two consecutive reads. * * This time is measured via accounting rather than sampling, * and is as accurate as ktime_get() is. * * This function returns -1 if NOHZ is not enabled. */ u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time) { struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); return get_cpu_sleep_time_us(ts, &ts->idle_sleeptime, !nr_iowait_cpu(cpu), last_update_time); } EXPORT_SYMBOL_GPL(get_cpu_idle_time_us); /** * get_cpu_iowait_time_us - get the total iowait time of a CPU * @cpu: CPU number to query * @last_update_time: variable to store update time in. Do not update * counters if NULL. * * Return the cumulative iowait time (since boot) for a given * CPU, in microseconds. Note this is partially broken due to * the counter of iowait tasks that can be remotely updated without * any synchronization. Therefore it is possible to observe backward * values within two consecutive reads. * * This time is measured via accounting rather than sampling, * and is as accurate as ktime_get() is. * * This function returns -1 if NOHZ is not enabled. */ u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time) { struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); return get_cpu_sleep_time_us(ts, &ts->iowait_sleeptime, nr_iowait_cpu(cpu), last_update_time); } EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us); static void tick_nohz_restart(struct tick_sched *ts, ktime_t now) { hrtimer_cancel(&ts->sched_timer); hrtimer_set_expires(&ts->sched_timer, ts->last_tick); /* Forward the time to expire in the future */ hrtimer_forward(&ts->sched_timer, now, TICK_NSEC); if (ts->nohz_mode == NOHZ_MODE_HIGHRES) { hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED_HARD); } else { tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1); } /* * Reset to make sure the next tick stop doesn't get fooled by past * cached clock deadline. */ ts->next_tick = 0; } static inline bool local_timer_softirq_pending(void) { return local_softirq_pending() & BIT(TIMER_SOFTIRQ); } static ktime_t tick_nohz_next_event(struct tick_sched *ts, int cpu) { u64 basemono, next_tick, delta, expires; unsigned long basejiff; unsigned int seq; /* Read jiffies and the time when jiffies were updated last */ do { seq = read_seqcount_begin(&jiffies_seq); basemono = last_jiffies_update; basejiff = jiffies; } while (read_seqcount_retry(&jiffies_seq, seq)); ts->last_jiffies = basejiff; ts->timer_expires_base = basemono; /* * Keep the periodic tick, when RCU, architecture or irq_work * requests it. * Aside of that, check whether the local timer softirq is * pending. If so, its a bad idea to call get_next_timer_interrupt(), * because there is an already expired timer, so it will request * immediate expiry, which rearms the hardware timer with a * minimal delta, which brings us back to this place * immediately. Lather, rinse and repeat... */ if (rcu_needs_cpu() || arch_needs_cpu() || irq_work_needs_cpu() || local_timer_softirq_pending()) { next_tick = basemono + TICK_NSEC; } else { /* * Get the next pending timer. If high resolution * timers are enabled this only takes the timer wheel * timers into account. If high resolution timers are * disabled this also looks at the next expiring * hrtimer. */ next_tick = get_next_timer_interrupt(basejiff, basemono); ts->next_timer = next_tick; } /* Make sure next_tick is never before basemono! */ if (WARN_ON_ONCE(basemono > next_tick)) next_tick = basemono; /* * If the tick is due in the next period, keep it ticking or * force prod the timer. */ delta = next_tick - basemono; if (delta <= (u64)TICK_NSEC) { /* * Tell the timer code that the base is not idle, i.e. undo * the effect of get_next_timer_interrupt(): */ timer_clear_idle(); /* * We've not stopped the tick yet, and there's a timer in the * next period, so no point in stopping it either, bail. */ if (!ts->tick_stopped) { ts->timer_expires = 0; goto out; } } /* * If this CPU is the one which had the do_timer() duty last, we limit * the sleep time to the timekeeping 'max_deferment' value. * Otherwise we can sleep as long as we want. */ delta = timekeeping_max_deferment(); if (cpu != tick_do_timer_cpu && (tick_do_timer_cpu != TICK_DO_TIMER_NONE || !ts->do_timer_last)) delta = KTIME_MAX; /* Calculate the next expiry time */ if (delta < (KTIME_MAX - basemono)) expires = basemono + delta; else expires = KTIME_MAX; ts->timer_expires = min_t(u64, expires, next_tick); out: return ts->timer_expires; } static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu) { struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); u64 basemono = ts->timer_expires_base; u64 expires = ts->timer_expires; /* Make sure we won't be trying to stop it twice in a row. */ ts->timer_expires_base = 0; /* * If this CPU is the one which updates jiffies, then give up * the assignment and let it be taken by the CPU which runs * the tick timer next, which might be this CPU as well. If we * don't drop this here, the jiffies might be stale and * do_timer() never gets invoked. Keep track of the fact that it * was the one which had the do_timer() duty last. */ if (cpu == tick_do_timer_cpu) { tick_do_timer_cpu = TICK_DO_TIMER_NONE; ts->do_timer_last = 1; } else if (tick_do_timer_cpu != TICK_DO_TIMER_NONE) { ts->do_timer_last = 0; } /* Skip reprogram of event if it's not changed */ if (ts->tick_stopped && (expires == ts->next_tick)) { /* Sanity check: make sure clockevent is actually programmed */ if (expires == KTIME_MAX || ts->next_tick == hrtimer_get_expires(&ts->sched_timer)) return; WARN_ON_ONCE(1); printk_once("basemono: %llu ts->next_tick: %llu dev->next_event: %llu timer->active: %d timer->expires: %llu\n", basemono, ts->next_tick, dev->next_event, hrtimer_active(&ts->sched_timer), hrtimer_get_expires(&ts->sched_timer)); } /* * tick_nohz_stop_tick() can be called several times before * tick_nohz_restart_sched_tick() is called. This happens when * interrupts arrive which do not cause a reschedule. In the first * call we save the current tick time, so we can restart the * scheduler tick in tick_nohz_restart_sched_tick(). */ if (!ts->tick_stopped) { calc_load_nohz_start(); quiet_vmstat(); ts->last_tick = hrtimer_get_expires(&ts->sched_timer); ts->tick_stopped = 1; trace_tick_stop(1, TICK_DEP_MASK_NONE); } ts->next_tick = expires; /* * If the expiration time == KTIME_MAX, then we simply stop * the tick timer. */ if (unlikely(expires == KTIME_MAX)) { if (ts->nohz_mode == NOHZ_MODE_HIGHRES) hrtimer_cancel(&ts->sched_timer); else tick_program_event(KTIME_MAX, 1); return; } if (ts->nohz_mode == NOHZ_MODE_HIGHRES) { hrtimer_start(&ts->sched_timer, expires, HRTIMER_MODE_ABS_PINNED_HARD); } else { hrtimer_set_expires(&ts->sched_timer, expires); tick_program_event(expires, 1); } } static void tick_nohz_retain_tick(struct tick_sched *ts) { ts->timer_expires_base = 0; } #ifdef CONFIG_NO_HZ_FULL static void tick_nohz_stop_sched_tick(struct tick_sched *ts, int cpu) { if (tick_nohz_next_event(ts, cpu)) tick_nohz_stop_tick(ts, cpu); else tick_nohz_retain_tick(ts); } #endif /* CONFIG_NO_HZ_FULL */ static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now) { /* Update jiffies first */ tick_do_update_jiffies64(now); /* * Clear the timer idle flag, so we avoid IPIs on remote queueing and * the clock forward checks in the enqueue path: */ timer_clear_idle(); calc_load_nohz_stop(); touch_softlockup_watchdog_sched(); /* Cancel the scheduled timer and restore the tick: */ ts->tick_stopped = 0; tick_nohz_restart(ts, now); } static void __tick_nohz_full_update_tick(struct tick_sched *ts, ktime_t now) { #ifdef CONFIG_NO_HZ_FULL int cpu = smp_processor_id(); if (can_stop_full_tick(cpu, ts)) tick_nohz_stop_sched_tick(ts, cpu); else if (ts->tick_stopped) tick_nohz_restart_sched_tick(ts, now); #endif } static void tick_nohz_full_update_tick(struct tick_sched *ts) { if (!tick_nohz_full_cpu(smp_processor_id())) return; if (!ts->tick_stopped && ts->nohz_mode == NOHZ_MODE_INACTIVE) return; __tick_nohz_full_update_tick(ts, ktime_get()); } /* * A pending softirq outside an IRQ (or softirq disabled section) context * should be waiting for ksoftirqd to handle it. Therefore we shouldn't * reach this code due to the need_resched() early check in can_stop_idle_tick(). * * However if we are between CPUHP_AP_SMPBOOT_THREADS and CPU_TEARDOWN_CPU on the * cpu_down() process, softirqs can still be raised while ksoftirqd is parked, * triggering the code below, since wakep_softirqd() is ignored. * */ static bool report_idle_softirq(void) { static int ratelimit; unsigned int pending = local_softirq_pending(); if (likely(!pending)) return false; /* Some softirqs claim to be safe against hotplug and ksoftirqd parking */ if (!cpu_active(smp_processor_id())) { pending &= ~SOFTIRQ_HOTPLUG_SAFE_MASK; if (!pending) return false; } if (ratelimit >= 10) return false; /* On RT, softirq handling may be waiting on some lock */ if (local_bh_blocked()) return false; pr_warn("NOHZ tick-stop error: local softirq work is pending, handler #%02x!!!\n", pending); ratelimit++; return true; } static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) { /* * If this CPU is offline and it is the one which updates * jiffies, then give up the assignment and let it be taken by * the CPU which runs the tick timer next. If we don't drop * this here, the jiffies might be stale and do_timer() never * gets invoked. */ if (unlikely(!cpu_online(cpu))) { if (cpu == tick_do_timer_cpu) tick_do_timer_cpu = TICK_DO_TIMER_NONE; /* * Make sure the CPU doesn't get fooled by obsolete tick * deadline if it comes back online later. */ ts->next_tick = 0; return false; } if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) return false; if (need_resched()) return false; if (unlikely(report_idle_softirq())) return false; if (tick_nohz_full_enabled()) { /* * Keep the tick alive to guarantee timekeeping progression * if there are full dynticks CPUs around */ if (tick_do_timer_cpu == cpu) return false; /* Should not happen for nohz-full */ if (WARN_ON_ONCE(tick_do_timer_cpu == TICK_DO_TIMER_NONE)) return false; } return true; } /** * tick_nohz_idle_stop_tick - stop the idle tick from the idle task * * When the next event is more than a tick into the future, stop the idle tick */ void tick_nohz_idle_stop_tick(void) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); int cpu = smp_processor_id(); ktime_t expires; /* * If tick_nohz_get_sleep_length() ran tick_nohz_next_event(), the * tick timer expiration time is known already. */ if (ts->timer_expires_base) expires = ts->timer_expires; else if (can_stop_idle_tick(cpu, ts)) expires = tick_nohz_next_event(ts, cpu); else return; ts->idle_calls++; if (expires > 0LL) { int was_stopped = ts->tick_stopped; tick_nohz_stop_tick(ts, cpu); ts->idle_sleeps++; ts->idle_expires = expires; if (!was_stopped && ts->tick_stopped) { ts->idle_jiffies = ts->last_jiffies; nohz_balance_enter_idle(cpu); } } else { tick_nohz_retain_tick(ts); } } void tick_nohz_idle_retain_tick(void) { tick_nohz_retain_tick(this_cpu_ptr(&tick_cpu_sched)); /* * Undo the effect of get_next_timer_interrupt() called from * tick_nohz_next_event(). */ timer_clear_idle(); } /** * tick_nohz_idle_enter - prepare for entering idle on the current CPU * * Called when we start the idle loop. */ void tick_nohz_idle_enter(void) { struct tick_sched *ts; lockdep_assert_irqs_enabled(); local_irq_disable(); ts = this_cpu_ptr(&tick_cpu_sched); WARN_ON_ONCE(ts->timer_expires_base); ts->inidle = 1; tick_nohz_start_idle(ts); local_irq_enable(); } /** * tick_nohz_irq_exit - Notify the tick about IRQ exit * * A timer may have been added/modified/deleted either by the current IRQ, * or by another place using this IRQ as a notification. This IRQ may have * also updated the RCU callback list. These events may require a * re-evaluation of the next tick. Depending on the context: * * 1) If the CPU is idle and no resched is pending, just proceed with idle * time accounting. The next tick will be re-evaluated on the next idle * loop iteration. * * 2) If the CPU is nohz_full: * * 2.1) If there is any tick dependency, restart the tick if stopped. * * 2.2) If there is no tick dependency, (re-)evaluate the next tick and * stop/update it accordingly. */ void tick_nohz_irq_exit(void) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); if (ts->inidle) tick_nohz_start_idle(ts); else tick_nohz_full_update_tick(ts); } /** * tick_nohz_idle_got_tick - Check whether or not the tick handler has run */ bool tick_nohz_idle_got_tick(void) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); if (ts->got_idle_tick) { ts->got_idle_tick = 0; return true; } return false; } /** * tick_nohz_get_next_hrtimer - return the next expiration time for the hrtimer * or the tick, whichever expires first. Note that, if the tick has been * stopped, it returns the next hrtimer. * * Called from power state control code with interrupts disabled */ ktime_t tick_nohz_get_next_hrtimer(void) { return __this_cpu_read(tick_cpu_device.evtdev)->next_event; } /** * tick_nohz_get_sleep_length - return the expected length of the current sleep * @delta_next: duration until the next event if the tick cannot be stopped * * Called from power state control code with interrupts disabled. * * The return value of this function and/or the value returned by it through the * @delta_next pointer can be negative which must be taken into account by its * callers. */ ktime_t tick_nohz_get_sleep_length(ktime_t *delta_next) { struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); int cpu = smp_processor_id(); /* * The idle entry time is expected to be a sufficient approximation of * the current time at this point. */ ktime_t now = ts->idle_entrytime; ktime_t next_event; WARN_ON_ONCE(!ts->inidle); *delta_next = ktime_sub(dev->next_event, now); if (!can_stop_idle_tick(cpu, ts)) return *delta_next; next_event = tick_nohz_next_event(ts, cpu); if (!next_event) return *delta_next; /* * If the next highres timer to expire is earlier than 'next_event', the * idle governor needs to know that. */ next_event = min_t(u64, next_event, hrtimer_next_event_without(&ts->sched_timer)); return ktime_sub(next_event, now); } /** * tick_nohz_get_idle_calls_cpu - return the current idle calls counter value * for a particular CPU. * * Called from the schedutil frequency scaling governor in scheduler context. */ unsigned long tick_nohz_get_idle_calls_cpu(int cpu) { struct tick_sched *ts = tick_get_tick_sched(cpu); return ts->idle_calls; } /** * tick_nohz_get_idle_calls - return the current idle calls counter value * * Called from the schedutil frequency scaling governor in scheduler context. */ unsigned long tick_nohz_get_idle_calls(void) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); return ts->idle_calls; } static void tick_nohz_account_idle_time(struct tick_sched *ts, ktime_t now) { unsigned long ticks; ts->idle_exittime = now; if (vtime_accounting_enabled_this_cpu()) return; /* * We stopped the tick in idle. update_process_times() would miss the * time we slept, as it does only a 1 tick accounting. * Enforce that this is accounted to idle ! */ ticks = jiffies - ts->idle_jiffies; /* * We might be one off. Do not randomly account a huge number of ticks! */ if (ticks && ticks < LONG_MAX) account_idle_ticks(ticks); } void tick_nohz_idle_restart_tick(void) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); if (ts->tick_stopped) { ktime_t now = ktime_get(); tick_nohz_restart_sched_tick(ts, now); tick_nohz_account_idle_time(ts, now); } } static void tick_nohz_idle_update_tick(struct tick_sched *ts, ktime_t now) { if (tick_nohz_full_cpu(smp_processor_id())) __tick_nohz_full_update_tick(ts, now); else tick_nohz_restart_sched_tick(ts, now); tick_nohz_account_idle_time(ts, now); } /** * tick_nohz_idle_exit - Update the tick upon idle task exit * * When the idle task exits, update the tick depending on the * following situations: * * 1) If the CPU is not in nohz_full mode (most cases), then * restart the tick. * * 2) If the CPU is in nohz_full mode (corner case): * 2.1) If the tick can be kept stopped (no tick dependencies) * then re-evaluate the next tick and try to keep it stopped * as long as possible. * 2.2) If the tick has dependencies, restart the tick. * */ void tick_nohz_idle_exit(void) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); bool idle_active, tick_stopped; ktime_t now; local_irq_disable(); WARN_ON_ONCE(!ts->inidle); WARN_ON_ONCE(ts->timer_expires_base); ts->inidle = 0; idle_active = ts->idle_active; tick_stopped = ts->tick_stopped; if (idle_active || tick_stopped) now = ktime_get(); if (idle_active) tick_nohz_stop_idle(ts, now); if (tick_stopped) tick_nohz_idle_update_tick(ts, now); local_irq_enable(); } /* * In low-resolution mode, the tick handler must be implemented directly * at the clockevent level. hrtimer can't be used instead, because its * infrastructure actually relies on the tick itself as a backend in * low-resolution mode (see hrtimer_run_queues()). * * This low-resolution handler still makes use of some hrtimer APIs meanwhile * for convenience with expiration calculation and forwarding. */ static void tick_nohz_lowres_handler(struct clock_event_device *dev) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); struct pt_regs *regs = get_irq_regs(); ktime_t now = ktime_get(); dev->next_event = KTIME_MAX; tick_sched_do_timer(ts, now); tick_sched_handle(ts, regs); /* * In dynticks mode, tick reprogram is deferred: * - to the idle task if in dynticks-idle * - to IRQ exit if in full-dynticks. */ if (likely(!ts->tick_stopped)) { hrtimer_forward(&ts->sched_timer, now, TICK_NSEC); tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1); } } static inline void tick_nohz_activate(struct tick_sched *ts, int mode) { if (!tick_nohz_enabled) return; ts->nohz_mode = mode; /* One update is enough */ if (!test_and_set_bit(0, &tick_nohz_active)) timers_update_nohz(); } /** * tick_nohz_switch_to_nohz - switch to NOHZ mode */ static void tick_nohz_switch_to_nohz(void) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); ktime_t next; if (!tick_nohz_enabled) return; if (tick_switch_to_oneshot(tick_nohz_lowres_handler)) return; /* * Recycle the hrtimer in 'ts', so we can share the * hrtimer_forward_now() function with the highres code. */ hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD); /* Get the next period */ next = tick_init_jiffy_update(); hrtimer_set_expires(&ts->sched_timer, next); hrtimer_forward_now(&ts->sched_timer, TICK_NSEC); tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1); tick_nohz_activate(ts, NOHZ_MODE_LOWRES); } static inline void tick_nohz_irq_enter(void) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); ktime_t now; if (!ts->idle_active && !ts->tick_stopped) return; now = ktime_get(); if (ts->idle_active) tick_nohz_stop_idle(ts, now); /* * If all CPUs are idle we may need to update a stale jiffies value. * Note nohz_full is a special case: a timekeeper is guaranteed to stay * alive but it might be busy looping with interrupts disabled in some * rare case (typically stop machine). So we must make sure we have a * last resort. */ if (ts->tick_stopped) tick_nohz_update_jiffies(now); } #else static inline void tick_nohz_switch_to_nohz(void) { } static inline void tick_nohz_irq_enter(void) { } static inline void tick_nohz_activate(struct tick_sched *ts, int mode) { } #endif /* CONFIG_NO_HZ_COMMON */ /* * Called from irq_enter() to notify about the possible interruption of idle() */ void tick_irq_enter(void) { tick_check_oneshot_broadcast_this_cpu(); tick_nohz_irq_enter(); } /* * High resolution timer specific code */ #ifdef CONFIG_HIGH_RES_TIMERS /* * We rearm the timer until we get disabled by the idle code. * Called with interrupts disabled. */ static enum hrtimer_restart tick_nohz_highres_handler(struct hrtimer *timer) { struct tick_sched *ts = container_of(timer, struct tick_sched, sched_timer); struct pt_regs *regs = get_irq_regs(); ktime_t now = ktime_get(); tick_sched_do_timer(ts, now); /* * Do not call when we are not in IRQ context and have * no valid 'regs' pointer */ if (regs) tick_sched_handle(ts, regs); else ts->next_tick = 0; /* * In dynticks mode, tick reprogram is deferred: * - to the idle task if in dynticks-idle * - to IRQ exit if in full-dynticks. */ if (unlikely(ts->tick_stopped)) return HRTIMER_NORESTART; hrtimer_forward(timer, now, TICK_NSEC); return HRTIMER_RESTART; } static int sched_skew_tick; static int __init skew_tick(char *str) { get_option(&str, &sched_skew_tick); return 0; } early_param("skew_tick", skew_tick); /** * tick_setup_sched_timer - setup the tick emulation timer */ void tick_setup_sched_timer(void) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); ktime_t now = ktime_get(); /* Emulate tick processing via per-CPU hrtimers: */ hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD); ts->sched_timer.function = tick_nohz_highres_handler; /* Get the next period (per-CPU) */ hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update()); /* Offset the tick to avert 'jiffies_lock' contention. */ if (sched_skew_tick) { u64 offset = TICK_NSEC >> 1; do_div(offset, num_possible_cpus()); offset *= smp_processor_id(); hrtimer_add_expires_ns(&ts->sched_timer, offset); } hrtimer_forward(&ts->sched_timer, now, TICK_NSEC); hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED_HARD); tick_nohz_activate(ts, NOHZ_MODE_HIGHRES); } #endif /* HIGH_RES_TIMERS */ #if defined CONFIG_NO_HZ_COMMON || defined CONFIG_HIGH_RES_TIMERS void tick_cancel_sched_timer(int cpu) { struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); # ifdef CONFIG_HIGH_RES_TIMERS if (ts->sched_timer.base) hrtimer_cancel(&ts->sched_timer); # endif memset(ts, 0, sizeof(*ts)); } #endif /* * Async notification about clocksource changes */ void tick_clock_notify(void) { int cpu; for_each_possible_cpu(cpu) set_bit(0, &per_cpu(tick_cpu_sched, cpu).check_clocks); } /* * Async notification about clock event changes */ void tick_oneshot_notify(void) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); set_bit(0, &ts->check_clocks); } /* * Check if a change happened, which makes oneshot possible. * * Called cyclically from the hrtimer softirq (driven by the timer * softirq). 'allow_nohz' signals that we can switch into low-res NOHZ * mode, because high resolution timers are disabled (either compile * or runtime). Called with interrupts disabled. */ int tick_check_oneshot_change(int allow_nohz) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); if (!test_and_clear_bit(0, &ts->check_clocks)) return 0; if (ts->nohz_mode != NOHZ_MODE_INACTIVE) return 0; if (!timekeeping_valid_for_hres() || !tick_is_oneshot_available()) return 0; if (!allow_nohz) return 1; tick_nohz_switch_to_nohz(); return 0; } |
| 4232 4232 4233 4233 4228 4234 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 | // SPDX-License-Identifier: GPL-2.0-only /* * AppArmor security module * * This file contains AppArmor mediation of files * * Copyright (C) 1998-2008 Novell/SUSE * Copyright 2009-2010 Canonical Ltd. */ #include <linux/tty.h> #include <linux/fdtable.h> #include <linux/file.h> #include <linux/fs.h> #include <linux/mount.h> #include "include/apparmor.h" #include "include/audit.h" #include "include/cred.h" #include "include/file.h" #include "include/match.h" #include "include/net.h" #include "include/path.h" #include "include/policy.h" #include "include/label.h" static u32 map_mask_to_chr_mask(u32 mask) { u32 m = mask & PERMS_CHRS_MASK; if (mask & AA_MAY_GETATTR) m |= MAY_READ; if (mask & (AA_MAY_SETATTR | AA_MAY_CHMOD | AA_MAY_CHOWN)) m |= MAY_WRITE; return m; } /** * file_audit_cb - call back for file specific audit fields * @ab: audit_buffer (NOT NULL) * @va: audit struct to audit values of (NOT NULL) */ static void file_audit_cb(struct audit_buffer *ab, void *va) { struct common_audit_data *sa = va; struct apparmor_audit_data *ad = aad(sa); kuid_t fsuid = ad->subj_cred ? ad->subj_cred->fsuid : current_fsuid(); char str[10]; if (ad->request & AA_AUDIT_FILE_MASK) { aa_perm_mask_to_str(str, sizeof(str), aa_file_perm_chrs, map_mask_to_chr_mask(ad->request)); audit_log_format(ab, " requested_mask=\"%s\"", str); } if (ad->denied & AA_AUDIT_FILE_MASK) { aa_perm_mask_to_str(str, sizeof(str), aa_file_perm_chrs, map_mask_to_chr_mask(ad->denied)); audit_log_format(ab, " denied_mask=\"%s\"", str); } if (ad->request & AA_AUDIT_FILE_MASK) { audit_log_format(ab, " fsuid=%d", from_kuid(&init_user_ns, fsuid)); audit_log_format(ab, " ouid=%d", from_kuid(&init_user_ns, ad->fs.ouid)); } if (ad->peer) { audit_log_format(ab, " target="); aa_label_xaudit(ab, labels_ns(ad->subj_label), ad->peer, FLAG_VIEW_SUBNS, GFP_KERNEL); } else if (ad->fs.target) { audit_log_format(ab, " target="); audit_log_untrustedstring(ab, ad->fs.target); } } /** * aa_audit_file - handle the auditing of file operations * @subj_cred: cred of the subject * @profile: the profile being enforced (NOT NULL) * @perms: the permissions computed for the request (NOT NULL) * @op: operation being mediated * @request: permissions requested * @name: name of object being mediated (MAYBE NULL) * @target: name of target (MAYBE NULL) * @tlabel: target label (MAY BE NULL) * @ouid: object uid * @info: extra information message (MAYBE NULL) * @error: 0 if operation allowed else failure error code * * Returns: %0 or error on failure */ int aa_audit_file(const struct cred *subj_cred, struct aa_profile *profile, struct aa_perms *perms, const char *op, u32 request, const char *name, const char *target, struct aa_label *tlabel, kuid_t ouid, const char *info, int error) { int type = AUDIT_APPARMOR_AUTO; DEFINE_AUDIT_DATA(ad, LSM_AUDIT_DATA_TASK, AA_CLASS_FILE, op); ad.subj_cred = subj_cred; ad.request = request; ad.name = name; ad.fs.target = target; ad.peer = tlabel; ad.fs.ouid = ouid; ad.info = info; ad.error = error; ad.common.u.tsk = NULL; if (likely(!ad.error)) { u32 mask = perms->audit; if (unlikely(AUDIT_MODE(profile) == AUDIT_ALL)) mask = 0xffff; /* mask off perms that are not being force audited */ ad.request &= mask; if (likely(!ad.request)) return 0; type = AUDIT_APPARMOR_AUDIT; } else { /* only report permissions that were denied */ ad.request = ad.request & ~perms->allow; AA_BUG(!ad.request); if (ad.request & perms->kill) type = AUDIT_APPARMOR_KILL; /* quiet known rejects, assumes quiet and kill do not overlap */ if ((ad.request & perms->quiet) && AUDIT_MODE(profile) != AUDIT_NOQUIET && AUDIT_MODE(profile) != AUDIT_ALL) ad.request &= ~perms->quiet; if (!ad.request) return ad.error; } ad.denied = ad.request & ~perms->allow; return aa_audit(type, profile, &ad, file_audit_cb); } /** * is_deleted - test if a file has been completely unlinked * @dentry: dentry of file to test for deletion (NOT NULL) * * Returns: true if deleted else false */ static inline bool is_deleted(struct dentry *dentry) { if (d_unlinked(dentry) && d_backing_inode(dentry)->i_nlink == 0) return true; return false; } static int path_name(const char *op, const struct cred *subj_cred, struct aa_label *label, const struct path *path, int flags, char *buffer, const char **name, struct path_cond *cond, u32 request) { struct aa_profile *profile; const char *info = NULL; int error; error = aa_path_name(path, flags, buffer, name, &info, labels_profile(label)->disconnected); if (error) { fn_for_each_confined(label, profile, aa_audit_file(subj_cred, profile, &nullperms, op, request, *name, NULL, NULL, cond->uid, info, error)); return error; } return 0; } struct aa_perms default_perms = {}; /** * aa_lookup_fperms - convert dfa compressed perms to internal perms * @file_rules: the aa_policydb to lookup perms for (NOT NULL) * @state: state in dfa * @cond: conditions to consider (NOT NULL) * * TODO: convert from dfa + state to permission entry * * Returns: a pointer to a file permission set */ struct aa_perms *aa_lookup_fperms(struct aa_policydb *file_rules, aa_state_t state, struct path_cond *cond) { unsigned int index = ACCEPT_TABLE(file_rules->dfa)[state]; if (!(file_rules->perms)) return &default_perms; if (uid_eq(current_fsuid(), cond->uid)) return &(file_rules->perms[index]); return &(file_rules->perms[index + 1]); } /** * aa_str_perms - find permission that match @name * @file_rules: the aa_policydb to match against (NOT NULL) * @start: state to start matching in * @name: string to match against dfa (NOT NULL) * @cond: conditions to consider for permission set computation (NOT NULL) * @perms: Returns - the permissions found when matching @name * * Returns: the final state in @dfa when beginning @start and walking @name */ aa_state_t aa_str_perms(struct aa_policydb *file_rules, aa_state_t start, const char *name, struct path_cond *cond, struct aa_perms *perms) { aa_state_t state; state = aa_dfa_match(file_rules->dfa, start, name); *perms = *(aa_lookup_fperms(file_rules, state, cond)); return state; } static int __aa_path_perm(const char *op, const struct cred *subj_cred, struct aa_profile *profile, const char *name, u32 request, struct path_cond *cond, int flags, struct aa_perms *perms) { struct aa_ruleset *rules = list_first_entry(&profile->rules, typeof(*rules), list); int e = 0; if (profile_unconfined(profile)) return 0; aa_str_perms(rules->file, rules->file->start[AA_CLASS_FILE], name, cond, perms); if (request & ~perms->allow) e = -EACCES; return aa_audit_file(subj_cred, profile, perms, op, request, name, NULL, NULL, cond->uid, NULL, e); } static int profile_path_perm(const char *op, const struct cred *subj_cred, struct aa_profile *profile, const struct path *path, char *buffer, u32 request, struct path_cond *cond, int flags, struct aa_perms *perms) { const char *name; int error; if (profile_unconfined(profile)) return 0; error = path_name(op, subj_cred, &profile->label, path, flags | profile->path_flags, buffer, &name, cond, request); if (error) return error; return __aa_path_perm(op, subj_cred, profile, name, request, cond, flags, perms); } /** * aa_path_perm - do permissions check & audit for @path * @op: operation being checked * @subj_cred: subject cred * @label: profile being enforced (NOT NULL) * @path: path to check permissions of (NOT NULL) * @flags: any additional path flags beyond what the profile specifies * @request: requested permissions * @cond: conditional info for this request (NOT NULL) * * Returns: %0 else error if access denied or other error */ int aa_path_perm(const char *op, const struct cred *subj_cred, struct aa_label *label, const struct path *path, int flags, u32 request, struct path_cond *cond) { struct aa_perms perms = {}; struct aa_profile *profile; char *buffer = NULL; int error; flags |= PATH_DELEGATE_DELETED | (S_ISDIR(cond->mode) ? PATH_IS_DIR : 0); buffer = aa_get_buffer(false); if (!buffer) return -ENOMEM; error = fn_for_each_confined(label, profile, profile_path_perm(op, subj_cred, profile, path, buffer, request, cond, flags, &perms)); aa_put_buffer(buffer); return error; } /** * xindex_is_subset - helper for aa_path_link * @link: link permission set * @target: target permission set * * test target x permissions are equal OR a subset of link x permissions * this is done as part of the subset test, where a hardlink must have * a subset of permissions that the target has. * * Returns: true if subset else false */ static inline bool xindex_is_subset(u32 link, u32 target) { if (((link & ~AA_X_UNSAFE) != (target & ~AA_X_UNSAFE)) || ((link & AA_X_UNSAFE) && !(target & AA_X_UNSAFE))) return false; return true; } static int profile_path_link(const struct cred *subj_cred, struct aa_profile *profile, const struct path *link, char *buffer, const struct path *target, char *buffer2, struct path_cond *cond) { struct aa_ruleset *rules = list_first_entry(&profile->rules, typeof(*rules), list); const char *lname, *tname = NULL; struct aa_perms lperms = {}, perms; const char *info = NULL; u32 request = AA_MAY_LINK; aa_state_t state; int error; error = path_name(OP_LINK, subj_cred, &profile->label, link, profile->path_flags, buffer, &lname, cond, AA_MAY_LINK); if (error) goto audit; /* buffer2 freed below, tname is pointer in buffer2 */ error = path_name(OP_LINK, subj_cred, &profile->label, target, profile->path_flags, buffer2, &tname, cond, AA_MAY_LINK); if (error) goto audit; error = -EACCES; /* aa_str_perms - handles the case of the dfa being NULL */ state = aa_str_perms(rules->file, rules->file->start[AA_CLASS_FILE], lname, cond, &lperms); if (!(lperms.allow & AA_MAY_LINK)) goto audit; /* test to see if target can be paired with link */ state = aa_dfa_null_transition(rules->file->dfa, state); aa_str_perms(rules->file, state, tname, cond, &perms); /* force audit/quiet masks for link are stored in the second entry * in the link pair. */ lperms.audit = perms.audit; lperms.quiet = perms.quiet; lperms.kill = perms.kill; if (!(perms.allow & AA_MAY_LINK)) { info = "target restricted"; lperms = perms; goto audit; } /* done if link subset test is not required */ if (!(perms.allow & AA_LINK_SUBSET)) goto done_tests; /* Do link perm subset test requiring allowed permission on link are * a subset of the allowed permissions on target. */ aa_str_perms(rules->file, rules->file->start[AA_CLASS_FILE], tname, cond, &perms); /* AA_MAY_LINK is not considered in the subset test */ request = lperms.allow & ~AA_MAY_LINK; lperms.allow &= perms.allow | AA_MAY_LINK; request |= AA_AUDIT_FILE_MASK & (lperms.allow & ~perms.allow); if (request & ~lperms.allow) { goto audit; } else if ((lperms.allow & MAY_EXEC) && !xindex_is_subset(lperms.xindex, perms.xindex)) { lperms.allow &= ~MAY_EXEC; request |= MAY_EXEC; info = "link not subset of target"; goto audit; } done_tests: error = 0; audit: return aa_audit_file(subj_cred, profile, &lperms, OP_LINK, request, lname, tname, NULL, cond->uid, info, error); } /** * aa_path_link - Handle hard link permission check * @subj_cred: subject cred * @label: the label being enforced (NOT NULL) * @old_dentry: the target dentry (NOT NULL) * @new_dir: directory the new link will be created in (NOT NULL) * @new_dentry: the link being created (NOT NULL) * * Handle the permission test for a link & target pair. Permission * is encoded as a pair where the link permission is determined * first, and if allowed, the target is tested. The target test * is done from the point of the link match (not start of DFA) * making the target permission dependent on the link permission match. * * The subset test if required forces that permissions granted * on link are a subset of the permission granted to target. * * Returns: %0 if allowed else error */ int aa_path_link(const struct cred *subj_cred, struct aa_label *label, struct dentry *old_dentry, const struct path *new_dir, struct dentry *new_dentry) { struct path link = { .mnt = new_dir->mnt, .dentry = new_dentry }; struct path target = { .mnt = new_dir->mnt, .dentry = old_dentry }; struct path_cond cond = { d_backing_inode(old_dentry)->i_uid, d_backing_inode(old_dentry)->i_mode }; char *buffer = NULL, *buffer2 = NULL; struct aa_profile *profile; int error; /* buffer freed below, lname is pointer in buffer */ buffer = aa_get_buffer(false); buffer2 = aa_get_buffer(false); error = -ENOMEM; if (!buffer || !buffer2) goto out; error = fn_for_each_confined(label, profile, profile_path_link(subj_cred, profile, &link, buffer, &target, buffer2, &cond)); out: aa_put_buffer(buffer); aa_put_buffer(buffer2); return error; } static void update_file_ctx(struct aa_file_ctx *fctx, struct aa_label *label, u32 request) { struct aa_label *l, *old; /* update caching of label on file_ctx */ spin_lock(&fctx->lock); old = rcu_dereference_protected(fctx->label, lockdep_is_held(&fctx->lock)); l = aa_label_merge(old, label, GFP_ATOMIC); if (l) { if (l != old) { rcu_assign_pointer(fctx->label, l); aa_put_label(old); } else aa_put_label(l); fctx->allow |= request; } spin_unlock(&fctx->lock); } static int __file_path_perm(const char *op, const struct cred *subj_cred, struct aa_label *label, struct aa_label *flabel, struct file *file, u32 request, u32 denied, bool in_atomic) { struct aa_profile *profile; struct aa_perms perms = {}; vfsuid_t vfsuid = i_uid_into_vfsuid(file_mnt_idmap(file), file_inode(file)); struct path_cond cond = { .uid = vfsuid_into_kuid(vfsuid), .mode = file_inode(file)->i_mode }; char *buffer; int flags, error; /* revalidation due to label out of date. No revocation at this time */ if (!denied && aa_label_is_subset(flabel, label)) /* TODO: check for revocation on stale profiles */ return 0; flags = PATH_DELEGATE_DELETED | (S_ISDIR(cond.mode) ? PATH_IS_DIR : 0); buffer = aa_get_buffer(in_atomic); if (!buffer) return -ENOMEM; /* check every profile in task label not in current cache */ error = fn_for_each_not_in_set(flabel, label, profile, profile_path_perm(op, subj_cred, profile, &file->f_path, buffer, request, &cond, flags, &perms)); if (denied && !error) { /* * check every profile in file label that was not tested * in the initial check above. * * TODO: cache full perms so this only happens because of * conditionals * TODO: don't audit here */ if (label == flabel) error = fn_for_each(label, profile, profile_path_perm(op, subj_cred, profile, &file->f_path, buffer, request, &cond, flags, &perms)); else error = fn_for_each_not_in_set(label, flabel, profile, profile_path_perm(op, subj_cred, profile, &file->f_path, buffer, request, &cond, flags, &perms)); } if (!error) update_file_ctx(file_ctx(file), label, request); aa_put_buffer(buffer); return error; } static int __file_sock_perm(const char *op, const struct cred *subj_cred, struct aa_label *label, struct aa_label *flabel, struct file *file, u32 request, u32 denied) { struct socket *sock = (struct socket *) file->private_data; int error; AA_BUG(!sock); /* revalidation due to label out of date. No revocation at this time */ if (!denied && aa_label_is_subset(flabel, label)) return 0; /* TODO: improve to skip profiles cached in flabel */ error = aa_sock_file_perm(subj_cred, label, op, request, sock); if (denied) { /* TODO: improve to skip profiles checked above */ /* check every profile in file label to is cached */ last_error(error, aa_sock_file_perm(subj_cred, flabel, op, request, sock)); } if (!error) update_file_ctx(file_ctx(file), label, request); return error; } /** * aa_file_perm - do permission revalidation check & audit for @file * @op: operation being checked * @subj_cred: subject cred * @label: label being enforced (NOT NULL) * @file: file to revalidate access permissions on (NOT NULL) * @request: requested permissions * @in_atomic: whether allocations need to be done in atomic context * * Returns: %0 if access allowed else error */ int aa_file_perm(const char *op, const struct cred *subj_cred, struct aa_label *label, struct file *file, u32 request, bool in_atomic) { struct aa_file_ctx *fctx; struct aa_label *flabel; u32 denied; int error = 0; AA_BUG(!label); AA_BUG(!file); fctx = file_ctx(file); rcu_read_lock(); flabel = rcu_dereference(fctx->label); AA_BUG(!flabel); /* revalidate access, if task is unconfined, or the cached cred * doesn't match or if the request is for more permissions than * was granted. * * Note: the test for !unconfined(flabel) is to handle file * delegation from unconfined tasks */ denied = request & ~fctx->allow; if (unconfined(label) || unconfined(flabel) || (!denied && aa_label_is_subset(flabel, label))) { rcu_read_unlock(); goto done; } flabel = aa_get_newest_label(flabel); rcu_read_unlock(); /* TODO: label cross check */ if (file->f_path.mnt && path_mediated_fs(file->f_path.dentry)) error = __file_path_perm(op, subj_cred, label, flabel, file, request, denied, in_atomic); else if (S_ISSOCK(file_inode(file)->i_mode)) error = __file_sock_perm(op, subj_cred, label, flabel, file, request, denied); aa_put_label(flabel); done: return error; } static void revalidate_tty(const struct cred *subj_cred, struct aa_label *label) { struct tty_struct *tty; int drop_tty = 0; tty = get_current_tty(); if (!tty) return; spin_lock(&tty->files_lock); if (!list_empty(&tty->tty_files)) { struct tty_file_private *file_priv; struct file *file; /* TODO: Revalidate access to controlling tty. */ file_priv = list_first_entry(&tty->tty_files, struct tty_file_private, list); file = file_priv->file; if (aa_file_perm(OP_INHERIT, subj_cred, label, file, MAY_READ | MAY_WRITE, IN_ATOMIC)) drop_tty = 1; } spin_unlock(&tty->files_lock); tty_kref_put(tty); if (drop_tty) no_tty(); } struct cred_label { const struct cred *cred; struct aa_label *label; }; static int match_file(const void *p, struct file *file, unsigned int fd) { struct cred_label *cl = (struct cred_label *)p; if (aa_file_perm(OP_INHERIT, cl->cred, cl->label, file, aa_map_file_to_perms(file), IN_ATOMIC)) return fd + 1; return 0; } /* based on selinux's flush_unauthorized_files */ void aa_inherit_files(const struct cred *cred, struct files_struct *files) { struct aa_label *label = aa_get_newest_cred_label(cred); struct cred_label cl = { .cred = cred, .label = label, }; struct file *devnull = NULL; unsigned int n; revalidate_tty(cred, label); /* Revalidate access to inherited open files. */ n = iterate_fd(files, 0, match_file, &cl); if (!n) /* none found? */ goto out; devnull = dentry_open(&aa_null, O_RDWR, cred); if (IS_ERR(devnull)) devnull = NULL; /* replace all the matching ones with this */ do { replace_fd(n - 1, devnull, 0); } while ((n = iterate_fd(files, n, match_file, &cl)) != 0); if (devnull) fput(devnull); out: aa_put_label(label); } |
| 12 51 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_NETLINK_H #define __LINUX_NETLINK_H #include <linux/capability.h> #include <linux/skbuff.h> #include <linux/export.h> #include <net/scm.h> #include <uapi/linux/netlink.h> struct net; void do_trace_netlink_extack(const char *msg); static inline struct nlmsghdr *nlmsg_hdr(const struct sk_buff *skb) { return (struct nlmsghdr *)skb->data; } enum netlink_skb_flags { NETLINK_SKB_DST = 0x8, /* Dst set in sendto or sendmsg */ }; struct netlink_skb_parms { struct scm_creds creds; /* Skb credentials */ __u32 portid; __u32 dst_group; __u32 flags; struct sock *sk; bool nsid_is_set; int nsid; }; #define NETLINK_CB(skb) (*(struct netlink_skb_parms*)&((skb)->cb)) #define NETLINK_CREDS(skb) (&NETLINK_CB((skb)).creds) void netlink_table_grab(void); void netlink_table_ungrab(void); #define NL_CFG_F_NONROOT_RECV (1 << 0) #define NL_CFG_F_NONROOT_SEND (1 << 1) /* optional Netlink kernel configuration parameters */ struct netlink_kernel_cfg { unsigned int groups; unsigned int flags; void (*input)(struct sk_buff *skb); struct mutex *cb_mutex; int (*bind)(struct net *net, int group); void (*unbind)(struct net *net, int group); void (*release) (struct sock *sk, unsigned long *groups); }; struct sock *__netlink_kernel_create(struct net *net, int unit, struct module *module, struct netlink_kernel_cfg *cfg); static inline struct sock * netlink_kernel_create(struct net *net, int unit, struct netlink_kernel_cfg *cfg) { return __netlink_kernel_create(net, unit, THIS_MODULE, cfg); } /* this can be increased when necessary - don't expose to userland */ #define NETLINK_MAX_COOKIE_LEN 20 #define NETLINK_MAX_FMTMSG_LEN 80 /** * struct netlink_ext_ack - netlink extended ACK report struct * @_msg: message string to report - don't access directly, use * %NL_SET_ERR_MSG * @bad_attr: attribute with error * @policy: policy for a bad attribute * @miss_type: attribute type which was missing * @miss_nest: nest missing an attribute (%NULL if missing top level attr) * @cookie: cookie data to return to userspace (for success) * @cookie_len: actual cookie data length * @_msg_buf: output buffer for formatted message strings - don't access * directly, use %NL_SET_ERR_MSG_FMT */ struct netlink_ext_ack { const char *_msg; const struct nlattr *bad_attr; const struct nla_policy *policy; const struct nlattr *miss_nest; u16 miss_type; u8 cookie[NETLINK_MAX_COOKIE_LEN]; u8 cookie_len; char _msg_buf[NETLINK_MAX_FMTMSG_LEN]; }; /* Always use this macro, this allows later putting the * message into a separate section or such for things * like translation or listing all possible messages. * If string formatting is needed use NL_SET_ERR_MSG_FMT. */ #define NL_SET_ERR_MSG(extack, msg) do { \ static const char __msg[] = msg; \ struct netlink_ext_ack *__extack = (extack); \ \ do_trace_netlink_extack(__msg); \ \ if (__extack) \ __extack->_msg = __msg; \ } while (0) /* We splice fmt with %s at each end even in the snprintf so that both calls * can use the same string constant, avoiding its duplication in .ro */ #define NL_SET_ERR_MSG_FMT(extack, fmt, args...) do { \ struct netlink_ext_ack *__extack = (extack); \ \ if (!__extack) \ break; \ if (snprintf(__extack->_msg_buf, NETLINK_MAX_FMTMSG_LEN, \ "%s" fmt "%s", "", ##args, "") >= \ NETLINK_MAX_FMTMSG_LEN) \ net_warn_ratelimited("%s" fmt "%s", "truncated extack: ", \ ##args, "\n"); \ \ do_trace_netlink_extack(__extack->_msg_buf); \ \ __extack->_msg = __extack->_msg_buf; \ } while (0) #define NL_SET_ERR_MSG_MOD(extack, msg) \ NL_SET_ERR_MSG((extack), KBUILD_MODNAME ": " msg) #define NL_SET_ERR_MSG_FMT_MOD(extack, fmt, args...) \ NL_SET_ERR_MSG_FMT((extack), KBUILD_MODNAME ": " fmt, ##args) #define NL_SET_ERR_MSG_WEAK(extack, msg) do { \ if ((extack) && !(extack)->_msg) \ NL_SET_ERR_MSG((extack), msg); \ } while (0) #define NL_SET_ERR_MSG_WEAK_MOD(extack, msg) do { \ if ((extack) && !(extack)->_msg) \ NL_SET_ERR_MSG_MOD((extack), msg); \ } while (0) #define NL_SET_BAD_ATTR_POLICY(extack, attr, pol) do { \ if ((extack)) { \ (extack)->bad_attr = (attr); \ (extack)->policy = (pol); \ } \ } while (0) #define NL_SET_BAD_ATTR(extack, attr) NL_SET_BAD_ATTR_POLICY(extack, attr, NULL) #define NL_SET_ERR_MSG_ATTR_POL(extack, attr, pol, msg) do { \ static const char __msg[] = msg; \ struct netlink_ext_ack *__extack = (extack); \ \ do_trace_netlink_extack(__msg); \ \ if (__extack) { \ __extack->_msg = __msg; \ __extack->bad_attr = (attr); \ __extack->policy = (pol); \ } \ } while (0) #define NL_SET_ERR_MSG_ATTR_POL_FMT(extack, attr, pol, fmt, args...) do { \ struct netlink_ext_ack *__extack = (extack); \ \ if (!__extack) \ break; \ \ if (snprintf(__extack->_msg_buf, NETLINK_MAX_FMTMSG_LEN, \ "%s" fmt "%s", "", ##args, "") >= \ NETLINK_MAX_FMTMSG_LEN) \ net_warn_ratelimited("%s" fmt "%s", "truncated extack: ", \ ##args, "\n"); \ \ do_trace_netlink_extack(__extack->_msg_buf); \ \ __extack->_msg = __extack->_msg_buf; \ __extack->bad_attr = (attr); \ __extack->policy = (pol); \ } while (0) #define NL_SET_ERR_MSG_ATTR(extack, attr, msg) \ NL_SET_ERR_MSG_ATTR_POL(extack, attr, NULL, msg) #define NL_SET_ERR_MSG_ATTR_FMT(extack, attr, msg, args...) \ NL_SET_ERR_MSG_ATTR_POL_FMT(extack, attr, NULL, msg, ##args) #define NL_SET_ERR_ATTR_MISS(extack, nest, type) do { \ struct netlink_ext_ack *__extack = (extack); \ \ if (__extack) { \ __extack->miss_nest = (nest); \ __extack->miss_type = (type); \ } \ } while (0) #define NL_REQ_ATTR_CHECK(extack, nest, tb, type) ({ \ struct nlattr **__tb = (tb); \ u32 __attr = (type); \ int __retval; \ \ __retval = !__tb[__attr]; \ if (__retval) \ NL_SET_ERR_ATTR_MISS((extack), (nest), __attr); \ __retval; \ }) static inline void nl_set_extack_cookie_u64(struct netlink_ext_ack *extack, u64 cookie) { if (!extack) return; memcpy(extack->cookie, &cookie, sizeof(cookie)); extack->cookie_len = sizeof(cookie); } void netlink_kernel_release(struct sock *sk); int __netlink_change_ngroups(struct sock *sk, unsigned int groups); int netlink_change_ngroups(struct sock *sk, unsigned int groups); void __netlink_clear_multicast_users(struct sock *sk, unsigned int group); void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err, const struct netlink_ext_ack *extack); int netlink_has_listeners(struct sock *sk, unsigned int group); bool netlink_strict_get_check(struct sk_buff *skb); int netlink_unicast(struct sock *ssk, struct sk_buff *skb, __u32 portid, int nonblock); int netlink_broadcast(struct sock *ssk, struct sk_buff *skb, __u32 portid, __u32 group, gfp_t allocation); typedef int (*netlink_filter_fn)(struct sock *dsk, struct sk_buff *skb, void *data); int netlink_broadcast_filtered(struct sock *ssk, struct sk_buff *skb, __u32 portid, __u32 group, gfp_t allocation, netlink_filter_fn filter, void *filter_data); int netlink_set_err(struct sock *ssk, __u32 portid, __u32 group, int code); int netlink_register_notifier(struct notifier_block *nb); int netlink_unregister_notifier(struct notifier_block *nb); /* finegrained unicast helpers: */ struct sock *netlink_getsockbyfilp(struct file *filp); int netlink_attachskb(struct sock *sk, struct sk_buff *skb, long *timeo, struct sock *ssk); void netlink_detachskb(struct sock *sk, struct sk_buff *skb); int netlink_sendskb(struct sock *sk, struct sk_buff *skb); static inline struct sk_buff * netlink_skb_clone(struct sk_buff *skb, gfp_t gfp_mask) { struct sk_buff *nskb; nskb = skb_clone(skb, gfp_mask); if (!nskb) return NULL; /* This is a large skb, set destructor callback to release head */ if (is_vmalloc_addr(skb->head)) nskb->destructor = skb->destructor; return nskb; } /* * skb should fit one page. This choice is good for headerless malloc. * But we should limit to 8K so that userspace does not have to * use enormous buffer sizes on recvmsg() calls just to avoid * MSG_TRUNC when PAGE_SIZE is very large. */ #if PAGE_SIZE < 8192UL #define NLMSG_GOODSIZE SKB_WITH_OVERHEAD(PAGE_SIZE) #else #define NLMSG_GOODSIZE SKB_WITH_OVERHEAD(8192UL) #endif #define NLMSG_DEFAULT_SIZE (NLMSG_GOODSIZE - NLMSG_HDRLEN) struct netlink_callback { struct sk_buff *skb; const struct nlmsghdr *nlh; int (*dump)(struct sk_buff * skb, struct netlink_callback *cb); int (*done)(struct netlink_callback *cb); void *data; /* the module that dump function belong to */ struct module *module; struct netlink_ext_ack *extack; u16 family; u16 answer_flags; u32 min_dump_alloc; unsigned int prev_seq, seq; bool strict_check; union { u8 ctx[48]; /* args is deprecated. Cast a struct over ctx instead * for proper type safety. */ long args[6]; }; }; #define NL_ASSERT_DUMP_CTX_FITS(type_name) \ BUILD_BUG_ON(sizeof(type_name) > \ sizeof_field(struct netlink_callback, ctx)) struct netlink_notify { struct net *net; u32 portid; int protocol; }; struct nlmsghdr * __nlmsg_put(struct sk_buff *skb, u32 portid, u32 seq, int type, int len, int flags); struct netlink_dump_control { int (*start)(struct netlink_callback *); int (*dump)(struct sk_buff *skb, struct netlink_callback *); int (*done)(struct netlink_callback *); struct netlink_ext_ack *extack; void *data; struct module *module; u32 min_dump_alloc; }; int __netlink_dump_start(struct sock *ssk, struct sk_buff *skb, const struct nlmsghdr *nlh, struct netlink_dump_control *control); static inline int netlink_dump_start(struct sock *ssk, struct sk_buff *skb, const struct nlmsghdr *nlh, struct netlink_dump_control *control) { if (!control->module) control->module = THIS_MODULE; return __netlink_dump_start(ssk, skb, nlh, control); } struct netlink_tap { struct net_device *dev; struct module *module; struct list_head list; }; int netlink_add_tap(struct netlink_tap *nt); int netlink_remove_tap(struct netlink_tap *nt); bool __netlink_ns_capable(const struct netlink_skb_parms *nsp, struct user_namespace *ns, int cap); bool netlink_ns_capable(const struct sk_buff *skb, struct user_namespace *ns, int cap); bool netlink_capable(const struct sk_buff *skb, int cap); bool netlink_net_capable(const struct sk_buff *skb, int cap); struct sk_buff *netlink_alloc_large_skb(unsigned int size, int broadcast); #endif /* __LINUX_NETLINK_H */ |
| 13 8 8 8 8 1 114 2 28 26 9 8 541 8 132 21 21 13 309 308 123 4 4 11 11 11 132 132 131 107 16 132 19 19 132 132 5 534 534 3 3 3 534 289 267 534 534 534 546 544 534 15 13 520 536 132 132 132 9 1 1 1 9 22 22 8 8 8 8 8 8 534 534 5 341 12 70 284 289 281 281 280 309 11 2 66 66 88 49 49 173 156 155 28 127 150 8 8 150 147 7 148 146 144 145 22 22 5 118 4 4 4 4 114 114 45 114 114 114 114 114 14 173 18 1 17 17 16 16 16 1 16 16 18 223 188 213 22 19 210 7 204 204 204 188 188 188 188 188 188 188 188 188 204 16 1 10 146 87 87 87 11 87 139 257 1 188 184 184 184 97 110 110 100 72 5 1 98 98 98 86 6 98 3 97 87 45 87 10 97 98 180 53 139 7 183 63 2 63 63 63 161 15 14 7 6 6 7 7 6 4 162 13 6 4 217 218 207 208 208 162 1 147 97 63 52 2 1 159 4 159 159 159 1 159 159 150 150 3 137 10 3 5 136 139 149 163 163 2 218 151 216 8 6 1 1 8 1 1 745 6 1 6 2 1 2 742 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 | // SPDX-License-Identifier: GPL-2.0-or-later /* * NET4: Implementation of BSD Unix domain sockets. * * Authors: Alan Cox, <alan@lxorguk.ukuu.org.uk> * * Fixes: * Linus Torvalds : Assorted bug cures. * Niibe Yutaka : async I/O support. * Carsten Paeth : PF_UNIX check, address fixes. * Alan Cox : Limit size of allocated blocks. * Alan Cox : Fixed the stupid socketpair bug. * Alan Cox : BSD compatibility fine tuning. * Alan Cox : Fixed a bug in connect when interrupted. * Alan Cox : Sorted out a proper draft version of * file descriptor passing hacked up from * Mike Shaver's work. * Marty Leisner : Fixes to fd passing * Nick Nevin : recvmsg bugfix. * Alan Cox : Started proper garbage collector * Heiko EiBfeldt : Missing verify_area check * Alan Cox : Started POSIXisms * Andreas Schwab : Replace inode by dentry for proper * reference counting * Kirk Petersen : Made this a module * Christoph Rohland : Elegant non-blocking accept/connect algorithm. * Lots of bug fixes. * Alexey Kuznetosv : Repaired (I hope) bugs introduces * by above two patches. * Andrea Arcangeli : If possible we block in connect(2) * if the max backlog of the listen socket * is been reached. This won't break * old apps and it will avoid huge amount * of socks hashed (this for unix_gc() * performances reasons). * Security fix that limits the max * number of socks to 2*max_files and * the number of skb queueable in the * dgram receiver. * Artur Skawina : Hash function optimizations * Alexey Kuznetsov : Full scale SMP. Lot of bugs are introduced 8) * Malcolm Beattie : Set peercred for socketpair * Michal Ostrowski : Module initialization cleanup. * Arnaldo C. Melo : Remove MOD_{INC,DEC}_USE_COUNT, * the core infrastructure is doing that * for all net proto families now (2.5.69+) * * Known differences from reference BSD that was tested: * * [TO FIX] * ECONNREFUSED is not returned from one end of a connected() socket to the * other the moment one end closes. * fstat() doesn't return st_dev=0, and give the blksize as high water mark * and a fake inode identifier (nor the BSD first socket fstat twice bug). * [NOT TO FIX] * accept() returns a path name even if the connecting socket has closed * in the meantime (BSD loses the path and gives up). * accept() returns 0 length path for an unbound connector. BSD returns 16 * and a null first byte in the path (but not for gethost/peername - BSD bug ??) * socketpair(...SOCK_RAW..) doesn't panic the kernel. * BSD af_unix apparently has connect forgetting to block properly. * (need to check this with the POSIX spec in detail) * * Differences from 2.0.0-11-... (ANK) * Bug fixes and improvements. * - client shutdown killed server socket. * - removed all useless cli/sti pairs. * * Semantic changes/extensions. * - generic control message passing. * - SCM_CREDENTIALS control message. * - "Abstract" (not FS based) socket bindings. * Abstract names are sequences of bytes (not zero terminated) * started by 0, so that this name space does not intersect * with BSD names. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/kernel.h> #include <linux/signal.h> #include <linux/sched/signal.h> #include <linux/errno.h> #include <linux/string.h> #include <linux/stat.h> #include <linux/dcache.h> #include <linux/namei.h> #include <linux/socket.h> #include <linux/un.h> #include <linux/fcntl.h> #include <linux/filter.h> #include <linux/termios.h> #include <linux/sockios.h> #include <linux/net.h> #include <linux/in.h> #include <linux/fs.h> #include <linux/slab.h> #include <linux/uaccess.h> #include <linux/skbuff.h> #include <linux/netdevice.h> #include <net/net_namespace.h> #include <net/sock.h> #include <net/tcp_states.h> #include <net/af_unix.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <net/scm.h> #include <linux/init.h> #include <linux/poll.h> #include <linux/rtnetlink.h> #include <linux/mount.h> #include <net/checksum.h> #include <linux/security.h> #include <linux/splice.h> #include <linux/freezer.h> #include <linux/file.h> #include <linux/btf_ids.h> #include <linux/bpf-cgroup.h> #include "scm.h" static atomic_long_t unix_nr_socks; static struct hlist_head bsd_socket_buckets[UNIX_HASH_SIZE / 2]; static spinlock_t bsd_socket_locks[UNIX_HASH_SIZE / 2]; /* SMP locking strategy: * hash table is protected with spinlock. * each socket state is protected by separate spinlock. */ static unsigned int unix_unbound_hash(struct sock *sk) { unsigned long hash = (unsigned long)sk; hash ^= hash >> 16; hash ^= hash >> 8; hash ^= sk->sk_type; return hash & UNIX_HASH_MOD; } static unsigned int unix_bsd_hash(struct inode *i) { return i->i_ino & UNIX_HASH_MOD; } static unsigned int unix_abstract_hash(struct sockaddr_un *sunaddr, int addr_len, int type) { __wsum csum = csum_partial(sunaddr, addr_len, 0); unsigned int hash; hash = (__force unsigned int)csum_fold(csum); hash ^= hash >> 8; hash ^= type; return UNIX_HASH_MOD + 1 + (hash & UNIX_HASH_MOD); } static void unix_table_double_lock(struct net *net, unsigned int hash1, unsigned int hash2) { if (hash1 == hash2) { spin_lock(&net->unx.table.locks[hash1]); return; } if (hash1 > hash2) swap(hash1, hash2); spin_lock(&net->unx.table.locks[hash1]); spin_lock_nested(&net->unx.table.locks[hash2], SINGLE_DEPTH_NESTING); } static void unix_table_double_unlock(struct net *net, unsigned int hash1, unsigned int hash2) { if (hash1 == hash2) { spin_unlock(&net->unx.table.locks[hash1]); return; } spin_unlock(&net->unx.table.locks[hash1]); spin_unlock(&net->unx.table.locks[hash2]); } #ifdef CONFIG_SECURITY_NETWORK static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb) { UNIXCB(skb).secid = scm->secid; } static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb) { scm->secid = UNIXCB(skb).secid; } static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb) { return (scm->secid == UNIXCB(skb).secid); } #else static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb) { } static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb) { } static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb) { return true; } #endif /* CONFIG_SECURITY_NETWORK */ static inline int unix_our_peer(struct sock *sk, struct sock *osk) { return unix_peer(osk) == sk; } static inline int unix_may_send(struct sock *sk, struct sock *osk) { return unix_peer(osk) == NULL || unix_our_peer(sk, osk); } static inline int unix_recvq_full(const struct sock *sk) { return skb_queue_len(&sk->sk_receive_queue) > sk->sk_max_ack_backlog; } static inline int unix_recvq_full_lockless(const struct sock *sk) { return skb_queue_len_lockless(&sk->sk_receive_queue) > READ_ONCE(sk->sk_max_ack_backlog); } struct sock *unix_peer_get(struct sock *s) { struct sock *peer; unix_state_lock(s); peer = unix_peer(s); if (peer) sock_hold(peer); unix_state_unlock(s); return peer; } EXPORT_SYMBOL_GPL(unix_peer_get); static struct unix_address *unix_create_addr(struct sockaddr_un *sunaddr, int addr_len) { struct unix_address *addr; addr = kmalloc(sizeof(*addr) + addr_len, GFP_KERNEL); if (!addr) return NULL; refcount_set(&addr->refcnt, 1); addr->len = addr_len; memcpy(addr->name, sunaddr, addr_len); return addr; } static inline void unix_release_addr(struct unix_address *addr) { if (refcount_dec_and_test(&addr->refcnt)) kfree(addr); } /* * Check unix socket name: * - should be not zero length. * - if started by not zero, should be NULL terminated (FS object) * - if started by zero, it is abstract name. */ static int unix_validate_addr(struct sockaddr_un *sunaddr, int addr_len) { if (addr_len <= offsetof(struct sockaddr_un, sun_path) || addr_len > sizeof(*sunaddr)) return -EINVAL; if (sunaddr->sun_family != AF_UNIX) return -EINVAL; return 0; } static int unix_mkname_bsd(struct sockaddr_un *sunaddr, int addr_len) { struct sockaddr_storage *addr = (struct sockaddr_storage *)sunaddr; short offset = offsetof(struct sockaddr_storage, __data); BUILD_BUG_ON(offset != offsetof(struct sockaddr_un, sun_path)); /* This may look like an off by one error but it is a bit more * subtle. 108 is the longest valid AF_UNIX path for a binding. * sun_path[108] doesn't as such exist. However in kernel space * we are guaranteed that it is a valid memory location in our * kernel address buffer because syscall functions always pass * a pointer of struct sockaddr_storage which has a bigger buffer * than 108. Also, we must terminate sun_path for strlen() in * getname_kernel(). */ addr->__data[addr_len - offset] = 0; /* Don't pass sunaddr->sun_path to strlen(). Otherwise, 108 will * cause panic if CONFIG_FORTIFY_SOURCE=y. Let __fortify_strlen() * know the actual buffer. */ return strlen(addr->__data) + offset + 1; } static void __unix_remove_socket(struct sock *sk) { sk_del_node_init(sk); } static void __unix_insert_socket(struct net *net, struct sock *sk) { DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk)); sk_add_node(sk, &net->unx.table.buckets[sk->sk_hash]); } static void __unix_set_addr_hash(struct net *net, struct sock *sk, struct unix_address *addr, unsigned int hash) { __unix_remove_socket(sk); smp_store_release(&unix_sk(sk)->addr, addr); sk->sk_hash = hash; __unix_insert_socket(net, sk); } static void unix_remove_socket(struct net *net, struct sock *sk) { spin_lock(&net->unx.table.locks[sk->sk_hash]); __unix_remove_socket(sk); spin_unlock(&net->unx.table.locks[sk->sk_hash]); } static void unix_insert_unbound_socket(struct net *net, struct sock *sk) { spin_lock(&net->unx.table.locks[sk->sk_hash]); __unix_insert_socket(net, sk); spin_unlock(&net->unx.table.locks[sk->sk_hash]); } static void unix_insert_bsd_socket(struct sock *sk) { spin_lock(&bsd_socket_locks[sk->sk_hash]); sk_add_bind_node(sk, &bsd_socket_buckets[sk->sk_hash]); spin_unlock(&bsd_socket_locks[sk->sk_hash]); } static void unix_remove_bsd_socket(struct sock *sk) { if (!hlist_unhashed(&sk->sk_bind_node)) { spin_lock(&bsd_socket_locks[sk->sk_hash]); __sk_del_bind_node(sk); spin_unlock(&bsd_socket_locks[sk->sk_hash]); sk_node_init(&sk->sk_bind_node); } } static struct sock *__unix_find_socket_byname(struct net *net, struct sockaddr_un *sunname, int len, unsigned int hash) { struct sock *s; sk_for_each(s, &net->unx.table.buckets[hash]) { struct unix_sock *u = unix_sk(s); if (u->addr->len == len && !memcmp(u->addr->name, sunname, len)) return s; } return NULL; } static inline struct sock *unix_find_socket_byname(struct net *net, struct sockaddr_un *sunname, int len, unsigned int hash) { struct sock *s; spin_lock(&net->unx.table.locks[hash]); s = __unix_find_socket_byname(net, sunname, len, hash); if (s) sock_hold(s); spin_unlock(&net->unx.table.locks[hash]); return s; } static struct sock *unix_find_socket_byinode(struct inode *i) { unsigned int hash = unix_bsd_hash(i); struct sock *s; spin_lock(&bsd_socket_locks[hash]); sk_for_each_bound(s, &bsd_socket_buckets[hash]) { struct dentry *dentry = unix_sk(s)->path.dentry; if (dentry && d_backing_inode(dentry) == i) { sock_hold(s); spin_unlock(&bsd_socket_locks[hash]); return s; } } spin_unlock(&bsd_socket_locks[hash]); return NULL; } /* Support code for asymmetrically connected dgram sockets * * If a datagram socket is connected to a socket not itself connected * to the first socket (eg, /dev/log), clients may only enqueue more * messages if the present receive queue of the server socket is not * "too large". This means there's a second writeability condition * poll and sendmsg need to test. The dgram recv code will do a wake * up on the peer_wait wait queue of a socket upon reception of a * datagram which needs to be propagated to sleeping would-be writers * since these might not have sent anything so far. This can't be * accomplished via poll_wait because the lifetime of the server * socket might be less than that of its clients if these break their * association with it or if the server socket is closed while clients * are still connected to it and there's no way to inform "a polling * implementation" that it should let go of a certain wait queue * * In order to propagate a wake up, a wait_queue_entry_t of the client * socket is enqueued on the peer_wait queue of the server socket * whose wake function does a wake_up on the ordinary client socket * wait queue. This connection is established whenever a write (or * poll for write) hit the flow control condition and broken when the * association to the server socket is dissolved or after a wake up * was relayed. */ static int unix_dgram_peer_wake_relay(wait_queue_entry_t *q, unsigned mode, int flags, void *key) { struct unix_sock *u; wait_queue_head_t *u_sleep; u = container_of(q, struct unix_sock, peer_wake); __remove_wait_queue(&unix_sk(u->peer_wake.private)->peer_wait, q); u->peer_wake.private = NULL; /* relaying can only happen while the wq still exists */ u_sleep = sk_sleep(&u->sk); if (u_sleep) wake_up_interruptible_poll(u_sleep, key_to_poll(key)); return 0; } static int unix_dgram_peer_wake_connect(struct sock *sk, struct sock *other) { struct unix_sock *u, *u_other; int rc; u = unix_sk(sk); u_other = unix_sk(other); rc = 0; spin_lock(&u_other->peer_wait.lock); if (!u->peer_wake.private) { u->peer_wake.private = other; __add_wait_queue(&u_other->peer_wait, &u->peer_wake); rc = 1; } spin_unlock(&u_other->peer_wait.lock); return rc; } static void unix_dgram_peer_wake_disconnect(struct sock *sk, struct sock *other) { struct unix_sock *u, *u_other; u = unix_sk(sk); u_other = unix_sk(other); spin_lock(&u_other->peer_wait.lock); if (u->peer_wake.private == other) { __remove_wait_queue(&u_other->peer_wait, &u->peer_wake); u->peer_wake.private = NULL; } spin_unlock(&u_other->peer_wait.lock); } static void unix_dgram_peer_wake_disconnect_wakeup(struct sock *sk, struct sock *other) { unix_dgram_peer_wake_disconnect(sk, other); wake_up_interruptible_poll(sk_sleep(sk), EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND); } /* preconditions: * - unix_peer(sk) == other * - association is stable */ static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other) { int connected; connected = unix_dgram_peer_wake_connect(sk, other); /* If other is SOCK_DEAD, we want to make sure we signal * POLLOUT, such that a subsequent write() can get a * -ECONNREFUSED. Otherwise, if we haven't queued any skbs * to other and its full, we will hang waiting for POLLOUT. */ if (unix_recvq_full_lockless(other) && !sock_flag(other, SOCK_DEAD)) return 1; if (connected) unix_dgram_peer_wake_disconnect(sk, other); return 0; } static int unix_writable(const struct sock *sk) { return sk->sk_state != TCP_LISTEN && (refcount_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf; } static void unix_write_space(struct sock *sk) { struct socket_wq *wq; rcu_read_lock(); if (unix_writable(sk)) { wq = rcu_dereference(sk->sk_wq); if (skwq_has_sleeper(wq)) wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND); sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); } rcu_read_unlock(); } /* When dgram socket disconnects (or changes its peer), we clear its receive * queue of packets arrived from previous peer. First, it allows to do * flow control based only on wmem_alloc; second, sk connected to peer * may receive messages only from that peer. */ static void unix_dgram_disconnected(struct sock *sk, struct sock *other) { if (!skb_queue_empty(&sk->sk_receive_queue)) { skb_queue_purge(&sk->sk_receive_queue); wake_up_interruptible_all(&unix_sk(sk)->peer_wait); /* If one link of bidirectional dgram pipe is disconnected, * we signal error. Messages are lost. Do not make this, * when peer was not connected to us. */ if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) { WRITE_ONCE(other->sk_err, ECONNRESET); sk_error_report(other); } } other->sk_state = TCP_CLOSE; } static void unix_sock_destructor(struct sock *sk) { struct unix_sock *u = unix_sk(sk); skb_queue_purge(&sk->sk_receive_queue); DEBUG_NET_WARN_ON_ONCE(refcount_read(&sk->sk_wmem_alloc)); DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk)); DEBUG_NET_WARN_ON_ONCE(sk->sk_socket); if (!sock_flag(sk, SOCK_DEAD)) { pr_info("Attempt to release alive unix socket: %p\n", sk); return; } if (u->addr) unix_release_addr(u->addr); atomic_long_dec(&unix_nr_socks); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); #ifdef UNIX_REFCNT_DEBUG pr_debug("UNIX %p is destroyed, %ld are still alive.\n", sk, atomic_long_read(&unix_nr_socks)); #endif } static void unix_release_sock(struct sock *sk, int embrion) { struct unix_sock *u = unix_sk(sk); struct sock *skpair; struct sk_buff *skb; struct path path; int state; unix_remove_socket(sock_net(sk), sk); unix_remove_bsd_socket(sk); /* Clear state */ unix_state_lock(sk); sock_orphan(sk); WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK); path = u->path; u->path.dentry = NULL; u->path.mnt = NULL; state = sk->sk_state; sk->sk_state = TCP_CLOSE; skpair = unix_peer(sk); unix_peer(sk) = NULL; unix_state_unlock(sk); #if IS_ENABLED(CONFIG_AF_UNIX_OOB) if (u->oob_skb) { kfree_skb(u->oob_skb); u->oob_skb = NULL; } #endif wake_up_interruptible_all(&u->peer_wait); if (skpair != NULL) { if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) { unix_state_lock(skpair); /* No more writes */ WRITE_ONCE(skpair->sk_shutdown, SHUTDOWN_MASK); if (!skb_queue_empty(&sk->sk_receive_queue) || embrion) WRITE_ONCE(skpair->sk_err, ECONNRESET); unix_state_unlock(skpair); skpair->sk_state_change(skpair); sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP); } unix_dgram_peer_wake_disconnect(sk, skpair); sock_put(skpair); /* It may now die */ } /* Try to flush out this socket. Throw out buffers at least */ while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) { if (state == TCP_LISTEN) unix_release_sock(skb->sk, 1); /* passed fds are erased in the kfree_skb hook */ UNIXCB(skb).consumed = skb->len; kfree_skb(skb); } if (path.dentry) path_put(&path); sock_put(sk); /* ---- Socket is dead now and most probably destroyed ---- */ /* * Fixme: BSD difference: In BSD all sockets connected to us get * ECONNRESET and we die on the spot. In Linux we behave * like files and pipes do and wait for the last * dereference. * * Can't we simply set sock->err? * * What the above comment does talk about? --ANK(980817) */ if (READ_ONCE(unix_tot_inflight)) unix_gc(); /* Garbage collect fds */ } static void init_peercred(struct sock *sk) { const struct cred *old_cred; struct pid *old_pid; spin_lock(&sk->sk_peer_lock); old_pid = sk->sk_peer_pid; old_cred = sk->sk_peer_cred; sk->sk_peer_pid = get_pid(task_tgid(current)); sk->sk_peer_cred = get_current_cred(); spin_unlock(&sk->sk_peer_lock); put_pid(old_pid); put_cred(old_cred); } static void copy_peercred(struct sock *sk, struct sock *peersk) { const struct cred *old_cred; struct pid *old_pid; if (sk < peersk) { spin_lock(&sk->sk_peer_lock); spin_lock_nested(&peersk->sk_peer_lock, SINGLE_DEPTH_NESTING); } else { spin_lock(&peersk->sk_peer_lock); spin_lock_nested(&sk->sk_peer_lock, SINGLE_DEPTH_NESTING); } old_pid = sk->sk_peer_pid; old_cred = sk->sk_peer_cred; sk->sk_peer_pid = get_pid(peersk->sk_peer_pid); sk->sk_peer_cred = get_cred(peersk->sk_peer_cred); spin_unlock(&sk->sk_peer_lock); spin_unlock(&peersk->sk_peer_lock); put_pid(old_pid); put_cred(old_cred); } static int unix_listen(struct socket *sock, int backlog) { int err; struct sock *sk = sock->sk; struct unix_sock *u = unix_sk(sk); err = -EOPNOTSUPP; if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET) goto out; /* Only stream/seqpacket sockets accept */ err = -EINVAL; if (!u->addr) goto out; /* No listens on an unbound socket */ unix_state_lock(sk); if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN) goto out_unlock; if (backlog > sk->sk_max_ack_backlog) wake_up_interruptible_all(&u->peer_wait); sk->sk_max_ack_backlog = backlog; sk->sk_state = TCP_LISTEN; /* set credentials so connect can copy them */ init_peercred(sk); err = 0; out_unlock: unix_state_unlock(sk); out: return err; } static int unix_release(struct socket *); static int unix_bind(struct socket *, struct sockaddr *, int); static int unix_stream_connect(struct socket *, struct sockaddr *, int addr_len, int flags); static int unix_socketpair(struct socket *, struct socket *); static int unix_accept(struct socket *, struct socket *, int, bool); static int unix_getname(struct socket *, struct sockaddr *, int); static __poll_t unix_poll(struct file *, struct socket *, poll_table *); static __poll_t unix_dgram_poll(struct file *, struct socket *, poll_table *); static int unix_ioctl(struct socket *, unsigned int, unsigned long); #ifdef CONFIG_COMPAT static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg); #endif static int unix_shutdown(struct socket *, int); static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t); static int unix_stream_recvmsg(struct socket *, struct msghdr *, size_t, int); static ssize_t unix_stream_splice_read(struct socket *, loff_t *ppos, struct pipe_inode_info *, size_t size, unsigned int flags); static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t); static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int); static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor); static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor); static int unix_dgram_connect(struct socket *, struct sockaddr *, int, int); static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t); static int unix_seqpacket_recvmsg(struct socket *, struct msghdr *, size_t, int); static int unix_set_peek_off(struct sock *sk, int val) { struct unix_sock *u = unix_sk(sk); if (mutex_lock_interruptible(&u->iolock)) return -EINTR; WRITE_ONCE(sk->sk_peek_off, val); mutex_unlock(&u->iolock); return 0; } #ifdef CONFIG_PROC_FS static int unix_count_nr_fds(struct sock *sk) { struct sk_buff *skb; struct unix_sock *u; int nr_fds = 0; spin_lock(&sk->sk_receive_queue.lock); skb = skb_peek(&sk->sk_receive_queue); while (skb) { u = unix_sk(skb->sk); nr_fds += atomic_read(&u->scm_stat.nr_fds); skb = skb_peek_next(skb, &sk->sk_receive_queue); } spin_unlock(&sk->sk_receive_queue.lock); return nr_fds; } static void unix_show_fdinfo(struct seq_file *m, struct socket *sock) { struct sock *sk = sock->sk; unsigned char s_state; struct unix_sock *u; int nr_fds = 0; if (sk) { s_state = READ_ONCE(sk->sk_state); u = unix_sk(sk); /* SOCK_STREAM and SOCK_SEQPACKET sockets never change their * sk_state after switching to TCP_ESTABLISHED or TCP_LISTEN. * SOCK_DGRAM is ordinary. So, no lock is needed. */ if (sock->type == SOCK_DGRAM || s_state == TCP_ESTABLISHED) nr_fds = atomic_read(&u->scm_stat.nr_fds); else if (s_state == TCP_LISTEN) nr_fds = unix_count_nr_fds(sk); seq_printf(m, "scm_fds: %u\n", nr_fds); } } #else #define unix_show_fdinfo NULL #endif static const struct proto_ops unix_stream_ops = { .family = PF_UNIX, .owner = THIS_MODULE, .release = unix_release, .bind = unix_bind, .connect = unix_stream_connect, .socketpair = unix_socketpair, .accept = unix_accept, .getname = unix_getname, .poll = unix_poll, .ioctl = unix_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = unix_compat_ioctl, #endif .listen = unix_listen, .shutdown = unix_shutdown, .sendmsg = unix_stream_sendmsg, .recvmsg = unix_stream_recvmsg, .read_skb = unix_stream_read_skb, .mmap = sock_no_mmap, .splice_read = unix_stream_splice_read, .set_peek_off = unix_set_peek_off, .show_fdinfo = unix_show_fdinfo, }; static const struct proto_ops unix_dgram_ops = { .family = PF_UNIX, .owner = THIS_MODULE, .release = unix_release, .bind = unix_bind, .connect = unix_dgram_connect, .socketpair = unix_socketpair, .accept = sock_no_accept, .getname = unix_getname, .poll = unix_dgram_poll, .ioctl = unix_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = unix_compat_ioctl, #endif .listen = sock_no_listen, .shutdown = unix_shutdown, .sendmsg = unix_dgram_sendmsg, .read_skb = unix_read_skb, .recvmsg = unix_dgram_recvmsg, .mmap = sock_no_mmap, .set_peek_off = unix_set_peek_off, .show_fdinfo = unix_show_fdinfo, }; static const struct proto_ops unix_seqpacket_ops = { .family = PF_UNIX, .owner = THIS_MODULE, .release = unix_release, .bind = unix_bind, .connect = unix_stream_connect, .socketpair = unix_socketpair, .accept = unix_accept, .getname = unix_getname, .poll = unix_dgram_poll, .ioctl = unix_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = unix_compat_ioctl, #endif .listen = unix_listen, .shutdown = unix_shutdown, .sendmsg = unix_seqpacket_sendmsg, .recvmsg = unix_seqpacket_recvmsg, .mmap = sock_no_mmap, .set_peek_off = unix_set_peek_off, .show_fdinfo = unix_show_fdinfo, }; static void unix_close(struct sock *sk, long timeout) { /* Nothing to do here, unix socket does not need a ->close(). * This is merely for sockmap. */ } static void unix_unhash(struct sock *sk) { /* Nothing to do here, unix socket does not need a ->unhash(). * This is merely for sockmap. */ } static bool unix_bpf_bypass_getsockopt(int level, int optname) { if (level == SOL_SOCKET) { switch (optname) { case SO_PEERPIDFD: return true; default: return false; } } return false; } struct proto unix_dgram_proto = { .name = "UNIX", .owner = THIS_MODULE, .obj_size = sizeof(struct unix_sock), .close = unix_close, .bpf_bypass_getsockopt = unix_bpf_bypass_getsockopt, #ifdef CONFIG_BPF_SYSCALL .psock_update_sk_prot = unix_dgram_bpf_update_proto, #endif }; struct proto unix_stream_proto = { .name = "UNIX-STREAM", .owner = THIS_MODULE, .obj_size = sizeof(struct unix_sock), .close = unix_close, .unhash = unix_unhash, .bpf_bypass_getsockopt = unix_bpf_bypass_getsockopt, #ifdef CONFIG_BPF_SYSCALL .psock_update_sk_prot = unix_stream_bpf_update_proto, #endif }; static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, int type) { struct unix_sock *u; struct sock *sk; int err; atomic_long_inc(&unix_nr_socks); if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files()) { err = -ENFILE; goto err; } if (type == SOCK_STREAM) sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_stream_proto, kern); else /*dgram and seqpacket */ sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_dgram_proto, kern); if (!sk) { err = -ENOMEM; goto err; } sock_init_data(sock, sk); sk->sk_hash = unix_unbound_hash(sk); sk->sk_allocation = GFP_KERNEL_ACCOUNT; sk->sk_write_space = unix_write_space; sk->sk_max_ack_backlog = net->unx.sysctl_max_dgram_qlen; sk->sk_destruct = unix_sock_destructor; u = unix_sk(sk); u->path.dentry = NULL; u->path.mnt = NULL; spin_lock_init(&u->lock); atomic_long_set(&u->inflight, 0); INIT_LIST_HEAD(&u->link); mutex_init(&u->iolock); /* single task reading lock */ mutex_init(&u->bindlock); /* single task binding lock */ init_waitqueue_head(&u->peer_wait); init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay); memset(&u->scm_stat, 0, sizeof(struct scm_stat)); unix_insert_unbound_socket(net, sk); sock_prot_inuse_add(net, sk->sk_prot, 1); return sk; err: atomic_long_dec(&unix_nr_socks); return ERR_PTR(err); } static int unix_create(struct net *net, struct socket *sock, int protocol, int kern) { struct sock *sk; if (protocol && protocol != PF_UNIX) return -EPROTONOSUPPORT; sock->state = SS_UNCONNECTED; switch (sock->type) { case SOCK_STREAM: sock->ops = &unix_stream_ops; break; /* * Believe it or not BSD has AF_UNIX, SOCK_RAW though * nothing uses it. */ case SOCK_RAW: sock->type = SOCK_DGRAM; fallthrough; case SOCK_DGRAM: sock->ops = &unix_dgram_ops; break; case SOCK_SEQPACKET: sock->ops = &unix_seqpacket_ops; break; default: return -ESOCKTNOSUPPORT; } sk = unix_create1(net, sock, kern, sock->type); if (IS_ERR(sk)) return PTR_ERR(sk); return 0; } static int unix_release(struct socket *sock) { struct sock *sk = sock->sk; if (!sk) return 0; sk->sk_prot->close(sk, 0); unix_release_sock(sk, 0); sock->sk = NULL; return 0; } static struct sock *unix_find_bsd(struct sockaddr_un *sunaddr, int addr_len, int type) { struct inode *inode; struct path path; struct sock *sk; int err; unix_mkname_bsd(sunaddr, addr_len); err = kern_path(sunaddr->sun_path, LOOKUP_FOLLOW, &path); if (err) goto fail; err = path_permission(&path, MAY_WRITE); if (err) goto path_put; err = -ECONNREFUSED; inode = d_backing_inode(path.dentry); if (!S_ISSOCK(inode->i_mode)) goto path_put; sk = unix_find_socket_byinode(inode); if (!sk) goto path_put; err = -EPROTOTYPE; if (sk->sk_type == type) touch_atime(&path); else goto sock_put; path_put(&path); return sk; sock_put: sock_put(sk); path_put: path_put(&path); fail: return ERR_PTR(err); } static struct sock *unix_find_abstract(struct net *net, struct sockaddr_un *sunaddr, int addr_len, int type) { unsigned int hash = unix_abstract_hash(sunaddr, addr_len, type); struct dentry *dentry; struct sock *sk; sk = unix_find_socket_byname(net, sunaddr, addr_len, hash); if (!sk) return ERR_PTR(-ECONNREFUSED); dentry = unix_sk(sk)->path.dentry; if (dentry) touch_atime(&unix_sk(sk)->path); return sk; } static struct sock *unix_find_other(struct net *net, struct sockaddr_un *sunaddr, int addr_len, int type) { struct sock *sk; if (sunaddr->sun_path[0]) sk = unix_find_bsd(sunaddr, addr_len, type); else sk = unix_find_abstract(net, sunaddr, addr_len, type); return sk; } static int unix_autobind(struct sock *sk) { unsigned int new_hash, old_hash = sk->sk_hash; struct unix_sock *u = unix_sk(sk); struct net *net = sock_net(sk); struct unix_address *addr; u32 lastnum, ordernum; int err; err = mutex_lock_interruptible(&u->bindlock); if (err) return err; if (u->addr) goto out; err = -ENOMEM; addr = kzalloc(sizeof(*addr) + offsetof(struct sockaddr_un, sun_path) + 16, GFP_KERNEL); if (!addr) goto out; addr->len = offsetof(struct sockaddr_un, sun_path) + 6; addr->name->sun_family = AF_UNIX; refcount_set(&addr->refcnt, 1); ordernum = get_random_u32(); lastnum = ordernum & 0xFFFFF; retry: ordernum = (ordernum + 1) & 0xFFFFF; sprintf(addr->name->sun_path + 1, "%05x", ordernum); new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type); unix_table_double_lock(net, old_hash, new_hash); if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash)) { unix_table_double_unlock(net, old_hash, new_hash); /* __unix_find_socket_byname() may take long time if many names * are already in use. */ cond_resched(); if (ordernum == lastnum) { /* Give up if all names seems to be in use. */ err = -ENOSPC; unix_release_addr(addr); goto out; } goto retry; } __unix_set_addr_hash(net, sk, addr, new_hash); unix_table_double_unlock(net, old_hash, new_hash); err = 0; out: mutex_unlock(&u->bindlock); return err; } static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr, int addr_len) { umode_t mode = S_IFSOCK | (SOCK_INODE(sk->sk_socket)->i_mode & ~current_umask()); unsigned int new_hash, old_hash = sk->sk_hash; struct unix_sock *u = unix_sk(sk); struct net *net = sock_net(sk); struct mnt_idmap *idmap; struct unix_address *addr; struct dentry *dentry; struct path parent; int err; addr_len = unix_mkname_bsd(sunaddr, addr_len); addr = unix_create_addr(sunaddr, addr_len); if (!addr) return -ENOMEM; /* * Get the parent directory, calculate the hash for last * component. */ dentry = kern_path_create(AT_FDCWD, addr->name->sun_path, &parent, 0); if (IS_ERR(dentry)) { err = PTR_ERR(dentry); goto out; } /* * All right, let's create it. */ idmap = mnt_idmap(parent.mnt); err = security_path_mknod(&parent, dentry, mode, 0); if (!err) err = vfs_mknod(idmap, d_inode(parent.dentry), dentry, mode, 0); if (err) goto out_path; err = mutex_lock_interruptible(&u->bindlock); if (err) goto out_unlink; if (u->addr) goto out_unlock; new_hash = unix_bsd_hash(d_backing_inode(dentry)); unix_table_double_lock(net, old_hash, new_hash); u->path.mnt = mntget(parent.mnt); u->path.dentry = dget(dentry); __unix_set_addr_hash(net, sk, addr, new_hash); unix_table_double_unlock(net, old_hash, new_hash); unix_insert_bsd_socket(sk); mutex_unlock(&u->bindlock); done_path_create(&parent, dentry); return 0; out_unlock: mutex_unlock(&u->bindlock); err = -EINVAL; out_unlink: /* failed after successful mknod? unlink what we'd created... */ vfs_unlink(idmap, d_inode(parent.dentry), dentry, NULL); out_path: done_path_create(&parent, dentry); out: unix_release_addr(addr); return err == -EEXIST ? -EADDRINUSE : err; } static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr, int addr_len) { unsigned int new_hash, old_hash = sk->sk_hash; struct unix_sock *u = unix_sk(sk); struct net *net = sock_net(sk); struct unix_address *addr; int err; addr = unix_create_addr(sunaddr, addr_len); if (!addr) return -ENOMEM; err = mutex_lock_interruptible(&u->bindlock); if (err) goto out; if (u->addr) { err = -EINVAL; goto out_mutex; } new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type); unix_table_double_lock(net, old_hash, new_hash); if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash)) goto out_spin; __unix_set_addr_hash(net, sk, addr, new_hash); unix_table_double_unlock(net, old_hash, new_hash); mutex_unlock(&u->bindlock); return 0; out_spin: unix_table_double_unlock(net, old_hash, new_hash); err = -EADDRINUSE; out_mutex: mutex_unlock(&u->bindlock); out: unix_release_addr(addr); return err; } static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) { struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr; struct sock *sk = sock->sk; int err; if (addr_len == offsetof(struct sockaddr_un, sun_path) && sunaddr->sun_family == AF_UNIX) return unix_autobind(sk); err = unix_validate_addr(sunaddr, addr_len); if (err) return err; if (sunaddr->sun_path[0]) err = unix_bind_bsd(sk, sunaddr, addr_len); else err = unix_bind_abstract(sk, sunaddr, addr_len); return err; } static void unix_state_double_lock(struct sock *sk1, struct sock *sk2) { if (unlikely(sk1 == sk2) || !sk2) { unix_state_lock(sk1); return; } if (sk1 < sk2) { unix_state_lock(sk1); unix_state_lock_nested(sk2); } else { unix_state_lock(sk2); unix_state_lock_nested(sk1); } } static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2) { if (unlikely(sk1 == sk2) || !sk2) { unix_state_unlock(sk1); return; } unix_state_unlock(sk1); unix_state_unlock(sk2); } static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr, int alen, int flags) { struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr; struct sock *sk = sock->sk; struct sock *other; int err; err = -EINVAL; if (alen < offsetofend(struct sockaddr, sa_family)) goto out; if (addr->sa_family != AF_UNSPEC) { err = unix_validate_addr(sunaddr, alen); if (err) goto out; err = BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, addr, &alen); if (err) goto out; if ((test_bit(SOCK_PASSCRED, &sock->flags) || test_bit(SOCK_PASSPIDFD, &sock->flags)) && !unix_sk(sk)->addr) { err = unix_autobind(sk); if (err) goto out; } restart: other = unix_find_other(sock_net(sk), sunaddr, alen, sock->type); if (IS_ERR(other)) { err = PTR_ERR(other); goto out; } unix_state_double_lock(sk, other); /* Apparently VFS overslept socket death. Retry. */ if (sock_flag(other, SOCK_DEAD)) { unix_state_double_unlock(sk, other); sock_put(other); goto restart; } err = -EPERM; if (!unix_may_send(sk, other)) goto out_unlock; err = security_unix_may_send(sk->sk_socket, other->sk_socket); if (err) goto out_unlock; sk->sk_state = other->sk_state = TCP_ESTABLISHED; } else { /* * 1003.1g breaking connected state with AF_UNSPEC */ other = NULL; unix_state_double_lock(sk, other); } /* * If it was connected, reconnect. */ if (unix_peer(sk)) { struct sock *old_peer = unix_peer(sk); unix_peer(sk) = other; if (!other) sk->sk_state = TCP_CLOSE; unix_dgram_peer_wake_disconnect_wakeup(sk, old_peer); unix_state_double_unlock(sk, other); if (other != old_peer) unix_dgram_disconnected(sk, old_peer); sock_put(old_peer); } else { unix_peer(sk) = other; unix_state_double_unlock(sk, other); } return 0; out_unlock: unix_state_double_unlock(sk, other); sock_put(other); out: return err; } static long unix_wait_for_peer(struct sock *other, long timeo) __releases(&unix_sk(other)->lock) { struct unix_sock *u = unix_sk(other); int sched; DEFINE_WAIT(wait); prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE); sched = !sock_flag(other, SOCK_DEAD) && !(other->sk_shutdown & RCV_SHUTDOWN) && unix_recvq_full_lockless(other); unix_state_unlock(other); if (sched) timeo = schedule_timeout(timeo); finish_wait(&u->peer_wait, &wait); return timeo; } static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr, int addr_len, int flags) { struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr; struct sock *sk = sock->sk, *newsk = NULL, *other = NULL; struct unix_sock *u = unix_sk(sk), *newu, *otheru; struct net *net = sock_net(sk); struct sk_buff *skb = NULL; long timeo; int err; int st; err = unix_validate_addr(sunaddr, addr_len); if (err) goto out; err = BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, uaddr, &addr_len); if (err) goto out; if ((test_bit(SOCK_PASSCRED, &sock->flags) || test_bit(SOCK_PASSPIDFD, &sock->flags)) && !u->addr) { err = unix_autobind(sk); if (err) goto out; } timeo = sock_sndtimeo(sk, flags & O_NONBLOCK); /* First of all allocate resources. If we will make it after state is locked, we will have to recheck all again in any case. */ /* create new sock for complete connection */ newsk = unix_create1(net, NULL, 0, sock->type); if (IS_ERR(newsk)) { err = PTR_ERR(newsk); newsk = NULL; goto out; } err = -ENOMEM; /* Allocate skb for sending to listening sock */ skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL); if (skb == NULL) goto out; restart: /* Find listening sock. */ other = unix_find_other(net, sunaddr, addr_len, sk->sk_type); if (IS_ERR(other)) { err = PTR_ERR(other); other = NULL; goto out; } /* Latch state of peer */ unix_state_lock(other); /* Apparently VFS overslept socket death. Retry. */ if (sock_flag(other, SOCK_DEAD)) { unix_state_unlock(other); sock_put(other); goto restart; } err = -ECONNREFUSED; if (other->sk_state != TCP_LISTEN) goto out_unlock; if (other->sk_shutdown & RCV_SHUTDOWN) goto out_unlock; if (unix_recvq_full(other)) { err = -EAGAIN; if (!timeo) goto out_unlock; timeo = unix_wait_for_peer(other, timeo); err = sock_intr_errno(timeo); if (signal_pending(current)) goto out; sock_put(other); goto restart; } /* Latch our state. It is tricky place. We need to grab our state lock and cannot drop lock on peer. It is dangerous because deadlock is possible. Connect to self case and simultaneous attempt to connect are eliminated by checking socket state. other is TCP_LISTEN, if sk is TCP_LISTEN we check this before attempt to grab lock. Well, and we have to recheck the state after socket locked. */ st = sk->sk_state; switch (st) { case TCP_CLOSE: /* This is ok... continue with connect */ break; case TCP_ESTABLISHED: /* Socket is already connected */ err = -EISCONN; goto out_unlock; default: err = -EINVAL; goto out_unlock; } unix_state_lock_nested(sk); if (sk->sk_state != st) { unix_state_unlock(sk); unix_state_unlock(other); sock_put(other); goto restart; } err = security_unix_stream_connect(sk, other, newsk); if (err) { unix_state_unlock(sk); goto out_unlock; } /* The way is open! Fastly set all the necessary fields... */ sock_hold(sk); unix_peer(newsk) = sk; newsk->sk_state = TCP_ESTABLISHED; newsk->sk_type = sk->sk_type; init_peercred(newsk); newu = unix_sk(newsk); RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq); otheru = unix_sk(other); /* copy address information from listening to new sock * * The contents of *(otheru->addr) and otheru->path * are seen fully set up here, since we have found * otheru in hash under its lock. Insertion into the * hash chain we'd found it in had been done in an * earlier critical area protected by the chain's lock, * the same one where we'd set *(otheru->addr) contents, * as well as otheru->path and otheru->addr itself. * * Using smp_store_release() here to set newu->addr * is enough to make those stores, as well as stores * to newu->path visible to anyone who gets newu->addr * by smp_load_acquire(). IOW, the same warranties * as for unix_sock instances bound in unix_bind() or * in unix_autobind(). */ if (otheru->path.dentry) { path_get(&otheru->path); newu->path = otheru->path; } refcount_inc(&otheru->addr->refcnt); smp_store_release(&newu->addr, otheru->addr); /* Set credentials */ copy_peercred(sk, other); sock->state = SS_CONNECTED; sk->sk_state = TCP_ESTABLISHED; sock_hold(newsk); smp_mb__after_atomic(); /* sock_hold() does an atomic_inc() */ unix_peer(sk) = newsk; unix_state_unlock(sk); /* take ten and send info to listening sock */ spin_lock(&other->sk_receive_queue.lock); __skb_queue_tail(&other->sk_receive_queue, skb); spin_unlock(&other->sk_receive_queue.lock); unix_state_unlock(other); other->sk_data_ready(other); sock_put(other); return 0; out_unlock: if (other) unix_state_unlock(other); out: kfree_skb(skb); if (newsk) unix_release_sock(newsk, 0); if (other) sock_put(other); return err; } static int unix_socketpair(struct socket *socka, struct socket *sockb) { struct sock *ska = socka->sk, *skb = sockb->sk; /* Join our sockets back to back */ sock_hold(ska); sock_hold(skb); unix_peer(ska) = skb; unix_peer(skb) = ska; init_peercred(ska); init_peercred(skb); ska->sk_state = TCP_ESTABLISHED; skb->sk_state = TCP_ESTABLISHED; socka->state = SS_CONNECTED; sockb->state = SS_CONNECTED; return 0; } static void unix_sock_inherit_flags(const struct socket *old, struct socket *new) { if (test_bit(SOCK_PASSCRED, &old->flags)) set_bit(SOCK_PASSCRED, &new->flags); if (test_bit(SOCK_PASSPIDFD, &old->flags)) set_bit(SOCK_PASSPIDFD, &new->flags); if (test_bit(SOCK_PASSSEC, &old->flags)) set_bit(SOCK_PASSSEC, &new->flags); } static int unix_accept(struct socket *sock, struct socket *newsock, int flags, bool kern) { struct sock *sk = sock->sk; struct sock *tsk; struct sk_buff *skb; int err; err = -EOPNOTSUPP; if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET) goto out; err = -EINVAL; if (sk->sk_state != TCP_LISTEN) goto out; /* If socket state is TCP_LISTEN it cannot change (for now...), * so that no locks are necessary. */ skb = skb_recv_datagram(sk, (flags & O_NONBLOCK) ? MSG_DONTWAIT : 0, &err); if (!skb) { /* This means receive shutdown. */ if (err == 0) err = -EINVAL; goto out; } tsk = skb->sk; skb_free_datagram(sk, skb); wake_up_interruptible(&unix_sk(sk)->peer_wait); /* attach accepted sock to socket */ unix_state_lock(tsk); newsock->state = SS_CONNECTED; unix_sock_inherit_flags(sock, newsock); sock_graft(tsk, newsock); unix_state_unlock(tsk); return 0; out: return err; } static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int peer) { struct sock *sk = sock->sk; struct unix_address *addr; DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr); int err = 0; if (peer) { sk = unix_peer_get(sk); err = -ENOTCONN; if (!sk) goto out; err = 0; } else { sock_hold(sk); } addr = smp_load_acquire(&unix_sk(sk)->addr); if (!addr) { sunaddr->sun_family = AF_UNIX; sunaddr->sun_path[0] = 0; err = offsetof(struct sockaddr_un, sun_path); } else { err = addr->len; memcpy(sunaddr, addr->name, addr->len); if (peer) BPF_CGROUP_RUN_SA_PROG(sk, uaddr, &err, CGROUP_UNIX_GETPEERNAME); else BPF_CGROUP_RUN_SA_PROG(sk, uaddr, &err, CGROUP_UNIX_GETSOCKNAME); } sock_put(sk); out: return err; } static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb) { scm->fp = scm_fp_dup(UNIXCB(skb).fp); /* * Garbage collection of unix sockets starts by selecting a set of * candidate sockets which have reference only from being in flight * (total_refs == inflight_refs). This condition is checked once during * the candidate collection phase, and candidates are marked as such, so * that non-candidates can later be ignored. While inflight_refs is * protected by unix_gc_lock, total_refs (file count) is not, hence this * is an instantaneous decision. * * Once a candidate, however, the socket must not be reinstalled into a * file descriptor while the garbage collection is in progress. * * If the above conditions are met, then the directed graph of * candidates (*) does not change while unix_gc_lock is held. * * Any operations that changes the file count through file descriptors * (dup, close, sendmsg) does not change the graph since candidates are * not installed in fds. * * Dequeing a candidate via recvmsg would install it into an fd, but * that takes unix_gc_lock to decrement the inflight count, so it's * serialized with garbage collection. * * MSG_PEEK is special in that it does not change the inflight count, * yet does install the socket into an fd. The following lock/unlock * pair is to ensure serialization with garbage collection. It must be * done between incrementing the file count and installing the file into * an fd. * * If garbage collection starts after the barrier provided by the * lock/unlock, then it will see the elevated refcount and not mark this * as a candidate. If a garbage collection is already in progress * before the file count was incremented, then the lock/unlock pair will * ensure that garbage collection is finished before progressing to * installing the fd. * * (*) A -> B where B is on the queue of A or B is on the queue of C * which is on the queue of listening socket A. */ spin_lock(&unix_gc_lock); spin_unlock(&unix_gc_lock); } static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds) { int err = 0; UNIXCB(skb).pid = get_pid(scm->pid); UNIXCB(skb).uid = scm->creds.uid; UNIXCB(skb).gid = scm->creds.gid; UNIXCB(skb).fp = NULL; unix_get_secdata(scm, skb); if (scm->fp && send_fds) err = unix_attach_fds(scm, skb); skb->destructor = unix_destruct_scm; return err; } static bool unix_passcred_enabled(const struct socket *sock, const struct sock *other) { return test_bit(SOCK_PASSCRED, &sock->flags) || test_bit(SOCK_PASSPIDFD, &sock->flags) || !other->sk_socket || test_bit(SOCK_PASSCRED, &other->sk_socket->flags) || test_bit(SOCK_PASSPIDFD, &other->sk_socket->flags); } /* * Some apps rely on write() giving SCM_CREDENTIALS * We include credentials if source or destination socket * asserted SOCK_PASSCRED. */ static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock, const struct sock *other) { if (UNIXCB(skb).pid) return; if (unix_passcred_enabled(sock, other)) { UNIXCB(skb).pid = get_pid(task_tgid(current)); current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid); } } static bool unix_skb_scm_eq(struct sk_buff *skb, struct scm_cookie *scm) { return UNIXCB(skb).pid == scm->pid && uid_eq(UNIXCB(skb).uid, scm->creds.uid) && gid_eq(UNIXCB(skb).gid, scm->creds.gid) && unix_secdata_eq(scm, skb); } static void scm_stat_add(struct sock *sk, struct sk_buff *skb) { struct scm_fp_list *fp = UNIXCB(skb).fp; struct unix_sock *u = unix_sk(sk); if (unlikely(fp && fp->count)) atomic_add(fp->count, &u->scm_stat.nr_fds); } static void scm_stat_del(struct sock *sk, struct sk_buff *skb) { struct scm_fp_list *fp = UNIXCB(skb).fp; struct unix_sock *u = unix_sk(sk); if (unlikely(fp && fp->count)) atomic_sub(fp->count, &u->scm_stat.nr_fds); } /* * Send AF_UNIX data. */ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name); struct sock *sk = sock->sk, *other = NULL; struct unix_sock *u = unix_sk(sk); struct scm_cookie scm; struct sk_buff *skb; int data_len = 0; int sk_locked; long timeo; int err; wait_for_unix_gc(); err = scm_send(sock, msg, &scm, false); if (err < 0) return err; err = -EOPNOTSUPP; if (msg->msg_flags&MSG_OOB) goto out; if (msg->msg_namelen) { err = unix_validate_addr(sunaddr, msg->msg_namelen); if (err) goto out; err = BPF_CGROUP_RUN_PROG_UNIX_SENDMSG_LOCK(sk, msg->msg_name, &msg->msg_namelen, NULL); if (err) goto out; } else { sunaddr = NULL; err = -ENOTCONN; other = unix_peer_get(sk); if (!other) goto out; } if ((test_bit(SOCK_PASSCRED, &sock->flags) || test_bit(SOCK_PASSPIDFD, &sock->flags)) && !u->addr) { err = unix_autobind(sk); if (err) goto out; } err = -EMSGSIZE; if (len > sk->sk_sndbuf - 32) goto out; if (len > SKB_MAX_ALLOC) { data_len = min_t(size_t, len - SKB_MAX_ALLOC, MAX_SKB_FRAGS * PAGE_SIZE); data_len = PAGE_ALIGN(data_len); BUILD_BUG_ON(SKB_MAX_ALLOC < PAGE_SIZE); } skb = sock_alloc_send_pskb(sk, len - data_len, data_len, msg->msg_flags & MSG_DONTWAIT, &err, PAGE_ALLOC_COSTLY_ORDER); if (skb == NULL) goto out; err = unix_scm_to_skb(&scm, skb, true); if (err < 0) goto out_free; skb_put(skb, len - data_len); skb->data_len = data_len; skb->len = len; err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, len); if (err) goto out_free; timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); restart: if (!other) { err = -ECONNRESET; if (sunaddr == NULL) goto out_free; other = unix_find_other(sock_net(sk), sunaddr, msg->msg_namelen, sk->sk_type); if (IS_ERR(other)) { err = PTR_ERR(other); other = NULL; goto out_free; } } if (sk_filter(other, skb) < 0) { /* Toss the packet but do not return any error to the sender */ err = len; goto out_free; } sk_locked = 0; unix_state_lock(other); restart_locked: err = -EPERM; if (!unix_may_send(sk, other)) goto out_unlock; if (unlikely(sock_flag(other, SOCK_DEAD))) { /* * Check with 1003.1g - what should * datagram error */ unix_state_unlock(other); sock_put(other); if (!sk_locked) unix_state_lock(sk); err = 0; if (sk->sk_type == SOCK_SEQPACKET) { /* We are here only when racing with unix_release_sock() * is clearing @other. Never change state to TCP_CLOSE * unlike SOCK_DGRAM wants. */ unix_state_unlock(sk); err = -EPIPE; } else if (unix_peer(sk) == other) { unix_peer(sk) = NULL; unix_dgram_peer_wake_disconnect_wakeup(sk, other); sk->sk_state = TCP_CLOSE; unix_state_unlock(sk); unix_dgram_disconnected(sk, other); sock_put(other); err = -ECONNREFUSED; } else { unix_state_unlock(sk); } other = NULL; if (err) goto out_free; goto restart; } err = -EPIPE; if (other->sk_shutdown & RCV_SHUTDOWN) goto out_unlock; if (sk->sk_type != SOCK_SEQPACKET) { err = security_unix_may_send(sk->sk_socket, other->sk_socket); if (err) goto out_unlock; } /* other == sk && unix_peer(other) != sk if * - unix_peer(sk) == NULL, destination address bound to sk * - unix_peer(sk) == sk by time of get but disconnected before lock */ if (other != sk && unlikely(unix_peer(other) != sk && unix_recvq_full_lockless(other))) { if (timeo) { timeo = unix_wait_for_peer(other, timeo); err = sock_intr_errno(timeo); if (signal_pending(current)) goto out_free; goto restart; } if (!sk_locked) { unix_state_unlock(other); unix_state_double_lock(sk, other); } if (unix_peer(sk) != other || unix_dgram_peer_wake_me(sk, other)) { err = -EAGAIN; sk_locked = 1; goto out_unlock; } if (!sk_locked) { sk_locked = 1; goto restart_locked; } } if (unlikely(sk_locked)) unix_state_unlock(sk); if (sock_flag(other, SOCK_RCVTSTAMP)) __net_timestamp(skb); maybe_add_creds(skb, sock, other); scm_stat_add(other, skb); skb_queue_tail(&other->sk_receive_queue, skb); unix_state_unlock(other); other->sk_data_ready(other); sock_put(other); scm_destroy(&scm); return len; out_unlock: if (sk_locked) unix_state_unlock(sk); unix_state_unlock(other); out_free: kfree_skb(skb); out: if (other) sock_put(other); scm_destroy(&scm); return err; } /* We use paged skbs for stream sockets, and limit occupancy to 32768 * bytes, and a minimum of a full page. */ #define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768)) #if IS_ENABLED(CONFIG_AF_UNIX_OOB) static int queue_oob(struct socket *sock, struct msghdr *msg, struct sock *other, struct scm_cookie *scm, bool fds_sent) { struct unix_sock *ousk = unix_sk(other); struct sk_buff *skb; int err = 0; skb = sock_alloc_send_skb(sock->sk, 1, msg->msg_flags & MSG_DONTWAIT, &err); if (!skb) return err; err = unix_scm_to_skb(scm, skb, !fds_sent); if (err < 0) { kfree_skb(skb); return err; } skb_put(skb, 1); err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, 1); if (err) { kfree_skb(skb); return err; } unix_state_lock(other); if (sock_flag(other, SOCK_DEAD) || (other->sk_shutdown & RCV_SHUTDOWN)) { unix_state_unlock(other); kfree_skb(skb); return -EPIPE; } maybe_add_creds(skb, sock, other); skb_get(skb); if (ousk->oob_skb) consume_skb(ousk->oob_skb); WRITE_ONCE(ousk->oob_skb, skb); scm_stat_add(other, skb); skb_queue_tail(&other->sk_receive_queue, skb); sk_send_sigurg(other); unix_state_unlock(other); other->sk_data_ready(other); return err; } #endif static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { struct sock *sk = sock->sk; struct sock *other = NULL; int err, size; struct sk_buff *skb; int sent = 0; struct scm_cookie scm; bool fds_sent = false; int data_len; wait_for_unix_gc(); err = scm_send(sock, msg, &scm, false); if (err < 0) return err; err = -EOPNOTSUPP; if (msg->msg_flags & MSG_OOB) { #if IS_ENABLED(CONFIG_AF_UNIX_OOB) if (len) len--; else #endif goto out_err; } if (msg->msg_namelen) { err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP; goto out_err; } else { err = -ENOTCONN; other = unix_peer(sk); if (!other) goto out_err; } if (sk->sk_shutdown & SEND_SHUTDOWN) goto pipe_err; while (sent < len) { size = len - sent; if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) { skb = sock_alloc_send_pskb(sk, 0, 0, msg->msg_flags & MSG_DONTWAIT, &err, 0); } else { /* Keep two messages in the pipe so it schedules better */ size = min_t(int, size, (sk->sk_sndbuf >> 1) - 64); /* allow fallback to order-0 allocations */ size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ); data_len = max_t(int, 0, size - SKB_MAX_HEAD(0)); data_len = min_t(size_t, size, PAGE_ALIGN(data_len)); skb = sock_alloc_send_pskb(sk, size - data_len, data_len, msg->msg_flags & MSG_DONTWAIT, &err, get_order(UNIX_SKB_FRAGS_SZ)); } if (!skb) goto out_err; /* Only send the fds in the first buffer */ err = unix_scm_to_skb(&scm, skb, !fds_sent); if (err < 0) { kfree_skb(skb); goto out_err; } fds_sent = true; if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) { err = skb_splice_from_iter(skb, &msg->msg_iter, size, sk->sk_allocation); if (err < 0) { kfree_skb(skb); goto out_err; } size = err; refcount_add(size, &sk->sk_wmem_alloc); } else { skb_put(skb, size - data_len); skb->data_len = data_len; skb->len = size; err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size); if (err) { kfree_skb(skb); goto out_err; } } unix_state_lock(other); if (sock_flag(other, SOCK_DEAD) || (other->sk_shutdown & RCV_SHUTDOWN)) goto pipe_err_free; maybe_add_creds(skb, sock, other); scm_stat_add(other, skb); skb_queue_tail(&other->sk_receive_queue, skb); unix_state_unlock(other); other->sk_data_ready(other); sent += size; } #if IS_ENABLED(CONFIG_AF_UNIX_OOB) if (msg->msg_flags & MSG_OOB) { err = queue_oob(sock, msg, other, &scm, fds_sent); if (err) goto out_err; sent++; } #endif scm_destroy(&scm); return sent; pipe_err_free: unix_state_unlock(other); kfree_skb(skb); pipe_err: if (sent == 0 && !(msg->msg_flags&MSG_NOSIGNAL)) send_sig(SIGPIPE, current, 0); err = -EPIPE; out_err: scm_destroy(&scm); return sent ? : err; } static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { int err; struct sock *sk = sock->sk; err = sock_error(sk); if (err) return err; if (sk->sk_state != TCP_ESTABLISHED) return -ENOTCONN; if (msg->msg_namelen) msg->msg_namelen = 0; return unix_dgram_sendmsg(sock, msg, len); } static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int flags) { struct sock *sk = sock->sk; if (sk->sk_state != TCP_ESTABLISHED) return -ENOTCONN; return unix_dgram_recvmsg(sock, msg, size, flags); } static void unix_copy_addr(struct msghdr *msg, struct sock *sk) { struct unix_address *addr = smp_load_acquire(&unix_sk(sk)->addr); if (addr) { msg->msg_namelen = addr->len; memcpy(msg->msg_name, addr->name, addr->len); } } int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size, int flags) { struct scm_cookie scm; struct socket *sock = sk->sk_socket; struct unix_sock *u = unix_sk(sk); struct sk_buff *skb, *last; long timeo; int skip; int err; err = -EOPNOTSUPP; if (flags&MSG_OOB) goto out; timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); do { mutex_lock(&u->iolock); skip = sk_peek_offset(sk, flags); skb = __skb_try_recv_datagram(sk, &sk->sk_receive_queue, flags, &skip, &err, &last); if (skb) { if (!(flags & MSG_PEEK)) scm_stat_del(sk, skb); break; } mutex_unlock(&u->iolock); if (err != -EAGAIN) break; } while (timeo && !__skb_wait_for_more_packets(sk, &sk->sk_receive_queue, &err, &timeo, last)); if (!skb) { /* implies iolock unlocked */ unix_state_lock(sk); /* Signal EOF on disconnected non-blocking SEQPACKET socket. */ if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN && (sk->sk_shutdown & RCV_SHUTDOWN)) err = 0; unix_state_unlock(sk); goto out; } if (wq_has_sleeper(&u->peer_wait)) wake_up_interruptible_sync_poll(&u->peer_wait, EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND); if (msg->msg_name) { unix_copy_addr(msg, skb->sk); BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk, msg->msg_name, &msg->msg_namelen); } if (size > skb->len - skip) size = skb->len - skip; else if (size < skb->len - skip) msg->msg_flags |= MSG_TRUNC; err = skb_copy_datagram_msg(skb, skip, msg, size); if (err) goto out_free; if (sock_flag(sk, SOCK_RCVTSTAMP)) __sock_recv_timestamp(msg, sk, skb); memset(&scm, 0, sizeof(scm)); scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid); unix_set_secdata(&scm, skb); if (!(flags & MSG_PEEK)) { if (UNIXCB(skb).fp) unix_detach_fds(&scm, skb); sk_peek_offset_bwd(sk, skb->len); } else { /* It is questionable: on PEEK we could: - do not return fds - good, but too simple 8) - return fds, and do not return them on read (old strategy, apparently wrong) - clone fds (I chose it for now, it is the most universal solution) POSIX 1003.1g does not actually define this clearly at all. POSIX 1003.1g doesn't define a lot of things clearly however! */ sk_peek_offset_fwd(sk, size); if (UNIXCB(skb).fp) unix_peek_fds(&scm, skb); } err = (flags & MSG_TRUNC) ? skb->len - skip : size; scm_recv_unix(sock, msg, &scm, flags); out_free: skb_free_datagram(sk, skb); mutex_unlock(&u->iolock); out: return err; } static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int flags) { struct sock *sk = sock->sk; #ifdef CONFIG_BPF_SYSCALL const struct proto *prot = READ_ONCE(sk->sk_prot); if (prot != &unix_dgram_proto) return prot->recvmsg(sk, msg, size, flags, NULL); #endif return __unix_dgram_recvmsg(sk, msg, size, flags); } static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor) { struct unix_sock *u = unix_sk(sk); struct sk_buff *skb; int err; mutex_lock(&u->iolock); skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err); mutex_unlock(&u->iolock); if (!skb) return err; return recv_actor(sk, skb); } /* * Sleep until more data has arrived. But check for races.. */ static long unix_stream_data_wait(struct sock *sk, long timeo, struct sk_buff *last, unsigned int last_len, bool freezable) { unsigned int state = TASK_INTERRUPTIBLE | freezable * TASK_FREEZABLE; struct sk_buff *tail; DEFINE_WAIT(wait); unix_state_lock(sk); for (;;) { prepare_to_wait(sk_sleep(sk), &wait, state); tail = skb_peek_tail(&sk->sk_receive_queue); if (tail != last || (tail && tail->len != last_len) || sk->sk_err || (sk->sk_shutdown & RCV_SHUTDOWN) || signal_pending(current) || !timeo) break; sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); unix_state_unlock(sk); timeo = schedule_timeout(timeo); unix_state_lock(sk); if (sock_flag(sk, SOCK_DEAD)) break; sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); } finish_wait(sk_sleep(sk), &wait); unix_state_unlock(sk); return timeo; } static unsigned int unix_skb_len(const struct sk_buff *skb) { return skb->len - UNIXCB(skb).consumed; } struct unix_stream_read_state { int (*recv_actor)(struct sk_buff *, int, int, struct unix_stream_read_state *); struct socket *socket; struct msghdr *msg; struct pipe_inode_info *pipe; size_t size; int flags; unsigned int splice_flags; }; #if IS_ENABLED(CONFIG_AF_UNIX_OOB) static int unix_stream_recv_urg(struct unix_stream_read_state *state) { struct socket *sock = state->socket; struct sock *sk = sock->sk; struct unix_sock *u = unix_sk(sk); int chunk = 1; struct sk_buff *oob_skb; mutex_lock(&u->iolock); unix_state_lock(sk); if (sock_flag(sk, SOCK_URGINLINE) || !u->oob_skb) { unix_state_unlock(sk); mutex_unlock(&u->iolock); return -EINVAL; } oob_skb = u->oob_skb; if (!(state->flags & MSG_PEEK)) WRITE_ONCE(u->oob_skb, NULL); else skb_get(oob_skb); unix_state_unlock(sk); chunk = state->recv_actor(oob_skb, 0, chunk, state); if (!(state->flags & MSG_PEEK)) UNIXCB(oob_skb).consumed += 1; consume_skb(oob_skb); mutex_unlock(&u->iolock); if (chunk < 0) return -EFAULT; state->msg->msg_flags |= MSG_OOB; return 1; } static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk, int flags, int copied) { struct unix_sock *u = unix_sk(sk); if (!unix_skb_len(skb) && !(flags & MSG_PEEK)) { skb_unlink(skb, &sk->sk_receive_queue); consume_skb(skb); skb = NULL; } else { if (skb == u->oob_skb) { if (copied) { skb = NULL; } else if (sock_flag(sk, SOCK_URGINLINE)) { if (!(flags & MSG_PEEK)) { WRITE_ONCE(u->oob_skb, NULL); consume_skb(skb); } } else if (!(flags & MSG_PEEK)) { skb_unlink(skb, &sk->sk_receive_queue); consume_skb(skb); skb = skb_peek(&sk->sk_receive_queue); } } } return skb; } #endif static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor) { if (unlikely(sk->sk_state != TCP_ESTABLISHED)) return -ENOTCONN; return unix_read_skb(sk, recv_actor); } static int unix_stream_read_generic(struct unix_stream_read_state *state, bool freezable) { struct scm_cookie scm; struct socket *sock = state->socket; struct sock *sk = sock->sk; struct unix_sock *u = unix_sk(sk); int copied = 0; int flags = state->flags; int noblock = flags & MSG_DONTWAIT; bool check_creds = false; int target; int err = 0; long timeo; int skip; size_t size = state->size; unsigned int last_len; if (unlikely(sk->sk_state != TCP_ESTABLISHED)) { err = -EINVAL; goto out; } if (unlikely(flags & MSG_OOB)) { err = -EOPNOTSUPP; #if IS_ENABLED(CONFIG_AF_UNIX_OOB) err = unix_stream_recv_urg(state); #endif goto out; } target = sock_rcvlowat(sk, flags & MSG_WAITALL, size); timeo = sock_rcvtimeo(sk, noblock); memset(&scm, 0, sizeof(scm)); /* Lock the socket to prevent queue disordering * while sleeps in memcpy_tomsg */ mutex_lock(&u->iolock); skip = max(sk_peek_offset(sk, flags), 0); do { int chunk; bool drop_skb; struct sk_buff *skb, *last; redo: unix_state_lock(sk); if (sock_flag(sk, SOCK_DEAD)) { err = -ECONNRESET; goto unlock; } last = skb = skb_peek(&sk->sk_receive_queue); last_len = last ? last->len : 0; #if IS_ENABLED(CONFIG_AF_UNIX_OOB) if (skb) { skb = manage_oob(skb, sk, flags, copied); if (!skb) { unix_state_unlock(sk); if (copied) break; goto redo; } } #endif again: if (skb == NULL) { if (copied >= target) goto unlock; /* * POSIX 1003.1g mandates this order. */ err = sock_error(sk); if (err) goto unlock; if (sk->sk_shutdown & RCV_SHUTDOWN) goto unlock; unix_state_unlock(sk); if (!timeo) { err = -EAGAIN; break; } mutex_unlock(&u->iolock); timeo = unix_stream_data_wait(sk, timeo, last, last_len, freezable); if (signal_pending(current)) { err = sock_intr_errno(timeo); scm_destroy(&scm); goto out; } mutex_lock(&u->iolock); goto redo; unlock: unix_state_unlock(sk); break; } while (skip >= unix_skb_len(skb)) { skip -= unix_skb_len(skb); last = skb; last_len = skb->len; skb = skb_peek_next(skb, &sk->sk_receive_queue); if (!skb) goto again; } unix_state_unlock(sk); if (check_creds) { /* Never glue messages from different writers */ if (!unix_skb_scm_eq(skb, &scm)) break; } else if (test_bit(SOCK_PASSCRED, &sock->flags) || test_bit(SOCK_PASSPIDFD, &sock->flags)) { /* Copy credentials */ scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid); unix_set_secdata(&scm, skb); check_creds = true; } /* Copy address just once */ if (state->msg && state->msg->msg_name) { DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, state->msg->msg_name); unix_copy_addr(state->msg, skb->sk); BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk, state->msg->msg_name, &state->msg->msg_namelen); sunaddr = NULL; } chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size); skb_get(skb); chunk = state->recv_actor(skb, skip, chunk, state); drop_skb = !unix_skb_len(skb); /* skb is only safe to use if !drop_skb */ consume_skb(skb); if (chunk < 0) { if (copied == 0) copied = -EFAULT; break; } copied += chunk; size -= chunk; if (drop_skb) { /* the skb was touched by a concurrent reader; * we should not expect anything from this skb * anymore and assume it invalid - we can be * sure it was dropped from the socket queue * * let's report a short read */ err = 0; break; } /* Mark read part of skb as used */ if (!(flags & MSG_PEEK)) { UNIXCB(skb).consumed += chunk; sk_peek_offset_bwd(sk, chunk); if (UNIXCB(skb).fp) { scm_stat_del(sk, skb); unix_detach_fds(&scm, skb); } if (unix_skb_len(skb)) break; skb_unlink(skb, &sk->sk_receive_queue); consume_skb(skb); if (scm.fp) break; } else { /* It is questionable, see note in unix_dgram_recvmsg. */ if (UNIXCB(skb).fp) unix_peek_fds(&scm, skb); sk_peek_offset_fwd(sk, chunk); if (UNIXCB(skb).fp) break; skip = 0; last = skb; last_len = skb->len; unix_state_lock(sk); skb = skb_peek_next(skb, &sk->sk_receive_queue); if (skb) goto again; unix_state_unlock(sk); break; } } while (size); mutex_unlock(&u->iolock); if (state->msg) scm_recv_unix(sock, state->msg, &scm, flags); else scm_destroy(&scm); out: return copied ? : err; } static int unix_stream_read_actor(struct sk_buff *skb, int skip, int chunk, struct unix_stream_read_state *state) { int ret; ret = skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip, state->msg, chunk); return ret ?: chunk; } int __unix_stream_recvmsg(struct sock *sk, struct msghdr *msg, size_t size, int flags) { struct unix_stream_read_state state = { .recv_actor = unix_stream_read_actor, .socket = sk->sk_socket, .msg = msg, .size = size, .flags = flags }; return unix_stream_read_generic(&state, true); } static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int flags) { struct unix_stream_read_state state = { .recv_actor = unix_stream_read_actor, .socket = sock, .msg = msg, .size = size, .flags = flags }; #ifdef CONFIG_BPF_SYSCALL struct sock *sk = sock->sk; const struct proto *prot = READ_ONCE(sk->sk_prot); if (prot != &unix_stream_proto) return prot->recvmsg(sk, msg, size, flags, NULL); #endif return unix_stream_read_generic(&state, true); } static int unix_stream_splice_actor(struct sk_buff *skb, int skip, int chunk, struct unix_stream_read_state *state) { return skb_splice_bits(skb, state->socket->sk, UNIXCB(skb).consumed + skip, state->pipe, chunk, state->splice_flags); } static ssize_t unix_stream_splice_read(struct socket *sock, loff_t *ppos, struct pipe_inode_info *pipe, size_t size, unsigned int flags) { struct unix_stream_read_state state = { .recv_actor = unix_stream_splice_actor, .socket = sock, .pipe = pipe, .size = size, .splice_flags = flags, }; if (unlikely(*ppos)) return -ESPIPE; if (sock->file->f_flags & O_NONBLOCK || flags & SPLICE_F_NONBLOCK) state.flags = MSG_DONTWAIT; return unix_stream_read_generic(&state, false); } static int unix_shutdown(struct socket *sock, int mode) { struct sock *sk = sock->sk; struct sock *other; if (mode < SHUT_RD || mode > SHUT_RDWR) return -EINVAL; /* This maps: * SHUT_RD (0) -> RCV_SHUTDOWN (1) * SHUT_WR (1) -> SEND_SHUTDOWN (2) * SHUT_RDWR (2) -> SHUTDOWN_MASK (3) */ ++mode; unix_state_lock(sk); WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | mode); other = unix_peer(sk); if (other) sock_hold(other); unix_state_unlock(sk); sk->sk_state_change(sk); if (other && (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) { int peer_mode = 0; const struct proto *prot = READ_ONCE(other->sk_prot); if (prot->unhash) prot->unhash(other); if (mode&RCV_SHUTDOWN) peer_mode |= SEND_SHUTDOWN; if (mode&SEND_SHUTDOWN) peer_mode |= RCV_SHUTDOWN; unix_state_lock(other); WRITE_ONCE(other->sk_shutdown, other->sk_shutdown | peer_mode); unix_state_unlock(other); other->sk_state_change(other); if (peer_mode == SHUTDOWN_MASK) sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP); else if (peer_mode & RCV_SHUTDOWN) sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN); } if (other) sock_put(other); return 0; } long unix_inq_len(struct sock *sk) { struct sk_buff *skb; long amount = 0; if (sk->sk_state == TCP_LISTEN) return -EINVAL; spin_lock(&sk->sk_receive_queue.lock); if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) { skb_queue_walk(&sk->sk_receive_queue, skb) amount += unix_skb_len(skb); } else { skb = skb_peek(&sk->sk_receive_queue); if (skb) amount = skb->len; } spin_unlock(&sk->sk_receive_queue.lock); return amount; } EXPORT_SYMBOL_GPL(unix_inq_len); long unix_outq_len(struct sock *sk) { return sk_wmem_alloc_get(sk); } EXPORT_SYMBOL_GPL(unix_outq_len); static int unix_open_file(struct sock *sk) { struct path path; struct file *f; int fd; if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) return -EPERM; if (!smp_load_acquire(&unix_sk(sk)->addr)) return -ENOENT; path = unix_sk(sk)->path; if (!path.dentry) return -ENOENT; path_get(&path); fd = get_unused_fd_flags(O_CLOEXEC); if (fd < 0) goto out; f = dentry_open(&path, O_PATH, current_cred()); if (IS_ERR(f)) { put_unused_fd(fd); fd = PTR_ERR(f); goto out; } fd_install(fd, f); out: path_put(&path); return fd; } static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { struct sock *sk = sock->sk; long amount = 0; int err; switch (cmd) { case SIOCOUTQ: amount = unix_outq_len(sk); err = put_user(amount, (int __user *)arg); break; case SIOCINQ: amount = unix_inq_len(sk); if (amount < 0) err = amount; else err = put_user(amount, (int __user *)arg); break; case SIOCUNIXFILE: err = unix_open_file(sk); break; #if IS_ENABLED(CONFIG_AF_UNIX_OOB) case SIOCATMARK: { struct sk_buff *skb; int answ = 0; skb = skb_peek(&sk->sk_receive_queue); if (skb && skb == READ_ONCE(unix_sk(sk)->oob_skb)) answ = 1; err = put_user(answ, (int __user *)arg); } break; #endif default: err = -ENOIOCTLCMD; break; } return err; } #ifdef CONFIG_COMPAT static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { return unix_ioctl(sock, cmd, (unsigned long)compat_ptr(arg)); } #endif static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wait) { struct sock *sk = sock->sk; __poll_t mask; u8 shutdown; sock_poll_wait(file, sock, wait); mask = 0; shutdown = READ_ONCE(sk->sk_shutdown); /* exceptional events? */ if (READ_ONCE(sk->sk_err)) mask |= EPOLLERR; if (shutdown == SHUTDOWN_MASK) mask |= EPOLLHUP; if (shutdown & RCV_SHUTDOWN) mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM; /* readable? */ if (!skb_queue_empty_lockless(&sk->sk_receive_queue)) mask |= EPOLLIN | EPOLLRDNORM; if (sk_is_readable(sk)) mask |= EPOLLIN | EPOLLRDNORM; #if IS_ENABLED(CONFIG_AF_UNIX_OOB) if (READ_ONCE(unix_sk(sk)->oob_skb)) mask |= EPOLLPRI; #endif /* Connection-based need to check for termination and startup */ if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) && sk->sk_state == TCP_CLOSE) mask |= EPOLLHUP; /* * we set writable also when the other side has shut down the * connection. This prevents stuck sockets. */ if (unix_writable(sk)) mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND; return mask; } static __poll_t unix_dgram_poll(struct file *file, struct socket *sock, poll_table *wait) { struct sock *sk = sock->sk, *other; unsigned int writable; __poll_t mask; u8 shutdown; sock_poll_wait(file, sock, wait); mask = 0; shutdown = READ_ONCE(sk->sk_shutdown); /* exceptional events? */ if (READ_ONCE(sk->sk_err) || !skb_queue_empty_lockless(&sk->sk_error_queue)) mask |= EPOLLERR | (sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0); if (shutdown & RCV_SHUTDOWN) mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM; if (shutdown == SHUTDOWN_MASK) mask |= EPOLLHUP; /* readable? */ if (!skb_queue_empty_lockless(&sk->sk_receive_queue)) mask |= EPOLLIN | EPOLLRDNORM; if (sk_is_readable(sk)) mask |= EPOLLIN | EPOLLRDNORM; /* Connection-based need to check for termination and startup */ if (sk->sk_type == SOCK_SEQPACKET) { if (sk->sk_state == TCP_CLOSE) mask |= EPOLLHUP; /* connection hasn't started yet? */ if (sk->sk_state == TCP_SYN_SENT) return mask; } /* No write status requested, avoid expensive OUT tests. */ if (!(poll_requested_events(wait) & (EPOLLWRBAND|EPOLLWRNORM|EPOLLOUT))) return mask; writable = unix_writable(sk); if (writable) { unix_state_lock(sk); other = unix_peer(sk); if (other && unix_peer(other) != sk && unix_recvq_full_lockless(other) && unix_dgram_peer_wake_me(sk, other)) writable = 0; unix_state_unlock(sk); } if (writable) mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND; else sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); return mask; } #ifdef CONFIG_PROC_FS #define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1) #define get_bucket(x) ((x) >> BUCKET_SPACE) #define get_offset(x) ((x) & ((1UL << BUCKET_SPACE) - 1)) #define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o)) static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos) { unsigned long offset = get_offset(*pos); unsigned long bucket = get_bucket(*pos); unsigned long count = 0; struct sock *sk; for (sk = sk_head(&seq_file_net(seq)->unx.table.buckets[bucket]); sk; sk = sk_next(sk)) { if (++count == offset) break; } return sk; } static struct sock *unix_get_first(struct seq_file *seq, loff_t *pos) { unsigned long bucket = get_bucket(*pos); struct net *net = seq_file_net(seq); struct sock *sk; while (bucket < UNIX_HASH_SIZE) { spin_lock(&net->unx.table.locks[bucket]); sk = unix_from_bucket(seq, pos); if (sk) return sk; spin_unlock(&net->unx.table.locks[bucket]); *pos = set_bucket_offset(++bucket, 1); } return NULL; } static struct sock *unix_get_next(struct seq_file *seq, struct sock *sk, loff_t *pos) { unsigned long bucket = get_bucket(*pos); sk = sk_next(sk); if (sk) return sk; spin_unlock(&seq_file_net(seq)->unx.table.locks[bucket]); *pos = set_bucket_offset(++bucket, 1); return unix_get_first(seq, pos); } static void *unix_seq_start(struct seq_file *seq, loff_t *pos) { if (!*pos) return SEQ_START_TOKEN; return unix_get_first(seq, pos); } static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos) { ++*pos; if (v == SEQ_START_TOKEN) return unix_get_first(seq, pos); return unix_get_next(seq, v, pos); } static void unix_seq_stop(struct seq_file *seq, void *v) { struct sock *sk = v; if (sk) spin_unlock(&seq_file_net(seq)->unx.table.locks[sk->sk_hash]); } static int unix_seq_show(struct seq_file *seq, void *v) { if (v == SEQ_START_TOKEN) seq_puts(seq, "Num RefCount Protocol Flags Type St " "Inode Path\n"); else { struct sock *s = v; struct unix_sock *u = unix_sk(s); unix_state_lock(s); seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu", s, refcount_read(&s->sk_refcnt), 0, s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0, s->sk_type, s->sk_socket ? (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) : (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING), sock_i_ino(s)); if (u->addr) { // under a hash table lock here int i, len; seq_putc(seq, ' '); i = 0; len = u->addr->len - offsetof(struct sockaddr_un, sun_path); if (u->addr->name->sun_path[0]) { len--; } else { seq_putc(seq, '@'); i++; } for ( ; i < len; i++) seq_putc(seq, u->addr->name->sun_path[i] ?: '@'); } unix_state_unlock(s); seq_putc(seq, '\n'); } return 0; } static const struct seq_operations unix_seq_ops = { .start = unix_seq_start, .next = unix_seq_next, .stop = unix_seq_stop, .show = unix_seq_show, }; #ifdef CONFIG_BPF_SYSCALL struct bpf_unix_iter_state { struct seq_net_private p; unsigned int cur_sk; unsigned int end_sk; unsigned int max_sk; struct sock **batch; bool st_bucket_done; }; struct bpf_iter__unix { __bpf_md_ptr(struct bpf_iter_meta *, meta); __bpf_md_ptr(struct unix_sock *, unix_sk); uid_t uid __aligned(8); }; static int unix_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta, struct unix_sock *unix_sk, uid_t uid) { struct bpf_iter__unix ctx; meta->seq_num--; /* skip SEQ_START_TOKEN */ ctx.meta = meta; ctx.unix_sk = unix_sk; ctx.uid = uid; return bpf_iter_run_prog(prog, &ctx); } static int bpf_iter_unix_hold_batch(struct seq_file *seq, struct sock *start_sk) { struct bpf_unix_iter_state *iter = seq->private; unsigned int expected = 1; struct sock *sk; sock_hold(start_sk); iter->batch[iter->end_sk++] = start_sk; for (sk = sk_next(start_sk); sk; sk = sk_next(sk)) { if (iter->end_sk < iter->max_sk) { sock_hold(sk); iter->batch[iter->end_sk++] = sk; } expected++; } spin_unlock(&seq_file_net(seq)->unx.table.locks[start_sk->sk_hash]); return expected; } static void bpf_iter_unix_put_batch(struct bpf_unix_iter_state *iter) { while (iter->cur_sk < iter->end_sk) sock_put(iter->batch[iter->cur_sk++]); } static int bpf_iter_unix_realloc_batch(struct bpf_unix_iter_state *iter, unsigned int new_batch_sz) { struct sock **new_batch; new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz, GFP_USER | __GFP_NOWARN); if (!new_batch) return -ENOMEM; bpf_iter_unix_put_batch(iter); kvfree(iter->batch); iter->batch = new_batch; iter->max_sk = new_batch_sz; return 0; } static struct sock *bpf_iter_unix_batch(struct seq_file *seq, loff_t *pos) { struct bpf_unix_iter_state *iter = seq->private; unsigned int expected; bool resized = false; struct sock *sk; if (iter->st_bucket_done) *pos = set_bucket_offset(get_bucket(*pos) + 1, 1); again: /* Get a new batch */ iter->cur_sk = 0; iter->end_sk = 0; sk = unix_get_first(seq, pos); if (!sk) return NULL; /* Done */ expected = bpf_iter_unix_hold_batch(seq, sk); if (iter->end_sk == expected) { iter->st_bucket_done = true; return sk; } if (!resized && !bpf_iter_unix_realloc_batch(iter, expected * 3 / 2)) { resized = true; goto again; } return sk; } static void *bpf_iter_unix_seq_start(struct seq_file *seq, loff_t *pos) { if (!*pos) return SEQ_START_TOKEN; /* bpf iter does not support lseek, so it always * continue from where it was stop()-ped. */ return bpf_iter_unix_batch(seq, pos); } static void *bpf_iter_unix_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct bpf_unix_iter_state *iter = seq->private; struct sock *sk; /* Whenever seq_next() is called, the iter->cur_sk is * done with seq_show(), so advance to the next sk in * the batch. */ if (iter->cur_sk < iter->end_sk) sock_put(iter->batch[iter->cur_sk++]); ++*pos; if (iter->cur_sk < iter->end_sk) sk = iter->batch[iter->cur_sk]; else sk = bpf_iter_unix_batch(seq, pos); return sk; } static int bpf_iter_unix_seq_show(struct seq_file *seq, void *v) { struct bpf_iter_meta meta; struct bpf_prog *prog; struct sock *sk = v; uid_t uid; bool slow; int ret; if (v == SEQ_START_TOKEN) return 0; slow = lock_sock_fast(sk); if (unlikely(sk_unhashed(sk))) { ret = SEQ_SKIP; goto unlock; } uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk)); meta.seq = seq; prog = bpf_iter_get_info(&meta, false); ret = unix_prog_seq_show(prog, &meta, v, uid); unlock: unlock_sock_fast(sk, slow); return ret; } static void bpf_iter_unix_seq_stop(struct seq_file *seq, void *v) { struct bpf_unix_iter_state *iter = seq->private; struct bpf_iter_meta meta; struct bpf_prog *prog; if (!v) { meta.seq = seq; prog = bpf_iter_get_info(&meta, true); if (prog) (void)unix_prog_seq_show(prog, &meta, v, 0); } if (iter->cur_sk < iter->end_sk) bpf_iter_unix_put_batch(iter); } static const struct seq_operations bpf_iter_unix_seq_ops = { .start = bpf_iter_unix_seq_start, .next = bpf_iter_unix_seq_next, .stop = bpf_iter_unix_seq_stop, .show = bpf_iter_unix_seq_show, }; #endif #endif static const struct net_proto_family unix_family_ops = { .family = PF_UNIX, .create = unix_create, .owner = THIS_MODULE, }; static int __net_init unix_net_init(struct net *net) { int i; net->unx.sysctl_max_dgram_qlen = 10; if (unix_sysctl_register(net)) goto out; #ifdef CONFIG_PROC_FS if (!proc_create_net("unix", 0, net->proc_net, &unix_seq_ops, sizeof(struct seq_net_private))) goto err_sysctl; #endif net->unx.table.locks = kvmalloc_array(UNIX_HASH_SIZE, sizeof(spinlock_t), GFP_KERNEL); if (!net->unx.table.locks) goto err_proc; net->unx.table.buckets = kvmalloc_array(UNIX_HASH_SIZE, sizeof(struct hlist_head), GFP_KERNEL); if (!net->unx.table.buckets) goto free_locks; for (i = 0; i < UNIX_HASH_SIZE; i++) { spin_lock_init(&net->unx.table.locks[i]); INIT_HLIST_HEAD(&net->unx.table.buckets[i]); } return 0; free_locks: kvfree(net->unx.table.locks); err_proc: #ifdef CONFIG_PROC_FS remove_proc_entry("unix", net->proc_net); err_sysctl: #endif unix_sysctl_unregister(net); out: return -ENOMEM; } static void __net_exit unix_net_exit(struct net *net) { kvfree(net->unx.table.buckets); kvfree(net->unx.table.locks); unix_sysctl_unregister(net); remove_proc_entry("unix", net->proc_net); } static struct pernet_operations unix_net_ops = { .init = unix_net_init, .exit = unix_net_exit, }; #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) DEFINE_BPF_ITER_FUNC(unix, struct bpf_iter_meta *meta, struct unix_sock *unix_sk, uid_t uid) #define INIT_BATCH_SZ 16 static int bpf_iter_init_unix(void *priv_data, struct bpf_iter_aux_info *aux) { struct bpf_unix_iter_state *iter = priv_data; int err; err = bpf_iter_init_seq_net(priv_data, aux); if (err) return err; err = bpf_iter_unix_realloc_batch(iter, INIT_BATCH_SZ); if (err) { bpf_iter_fini_seq_net(priv_data); return err; } return 0; } static void bpf_iter_fini_unix(void *priv_data) { struct bpf_unix_iter_state *iter = priv_data; bpf_iter_fini_seq_net(priv_data); kvfree(iter->batch); } static const struct bpf_iter_seq_info unix_seq_info = { .seq_ops = &bpf_iter_unix_seq_ops, .init_seq_private = bpf_iter_init_unix, .fini_seq_private = bpf_iter_fini_unix, .seq_priv_size = sizeof(struct bpf_unix_iter_state), }; static const struct bpf_func_proto * bpf_iter_unix_get_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_setsockopt: return &bpf_sk_setsockopt_proto; case BPF_FUNC_getsockopt: return &bpf_sk_getsockopt_proto; default: return NULL; } } static struct bpf_iter_reg unix_reg_info = { .target = "unix", .ctx_arg_info_size = 1, .ctx_arg_info = { { offsetof(struct bpf_iter__unix, unix_sk), PTR_TO_BTF_ID_OR_NULL }, }, .get_func_proto = bpf_iter_unix_get_func_proto, .seq_info = &unix_seq_info, }; static void __init bpf_iter_register(void) { unix_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_UNIX]; if (bpf_iter_reg_target(&unix_reg_info)) pr_warn("Warning: could not register bpf iterator unix\n"); } #endif static int __init af_unix_init(void) { int i, rc = -1; BUILD_BUG_ON(sizeof(struct unix_skb_parms) > sizeof_field(struct sk_buff, cb)); for (i = 0; i < UNIX_HASH_SIZE / 2; i++) { spin_lock_init(&bsd_socket_locks[i]); INIT_HLIST_HEAD(&bsd_socket_buckets[i]); } rc = proto_register(&unix_dgram_proto, 1); if (rc != 0) { pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__); goto out; } rc = proto_register(&unix_stream_proto, 1); if (rc != 0) { pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__); proto_unregister(&unix_dgram_proto); goto out; } sock_register(&unix_family_ops); register_pernet_subsys(&unix_net_ops); unix_bpf_build_proto(); #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) bpf_iter_register(); #endif out: return rc; } /* Later than subsys_initcall() because we depend on stuff initialised there */ fs_initcall(af_unix_init); |
| 175 175 175 175 127 127 39 79 93 79 209 186 184 186 209 207 207 207 207 208 23 1 7 1 245 207 207 207 207 1873 29 29 2 2 2 2 6 6 162 162 162 162 162 162 61 176 162 411 411 173 162 61 326 161 165 165 4 4 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 | // SPDX-License-Identifier: GPL-2.0-or-later /* Copyright (c) 2014 Mahesh Bandewar <maheshb@google.com> */ #include <linux/ethtool.h> #include "ipvlan.h" static int ipvlan_set_port_mode(struct ipvl_port *port, u16 nval, struct netlink_ext_ack *extack) { struct ipvl_dev *ipvlan; unsigned int flags; int err; ASSERT_RTNL(); if (port->mode != nval) { list_for_each_entry(ipvlan, &port->ipvlans, pnode) { flags = ipvlan->dev->flags; if (nval == IPVLAN_MODE_L3 || nval == IPVLAN_MODE_L3S) { err = dev_change_flags(ipvlan->dev, flags | IFF_NOARP, extack); } else { err = dev_change_flags(ipvlan->dev, flags & ~IFF_NOARP, extack); } if (unlikely(err)) goto fail; } if (nval == IPVLAN_MODE_L3S) { /* New mode is L3S */ err = ipvlan_l3s_register(port); if (err) goto fail; } else if (port->mode == IPVLAN_MODE_L3S) { /* Old mode was L3S */ ipvlan_l3s_unregister(port); } port->mode = nval; } return 0; fail: /* Undo the flags changes that have been done so far. */ list_for_each_entry_continue_reverse(ipvlan, &port->ipvlans, pnode) { flags = ipvlan->dev->flags; if (port->mode == IPVLAN_MODE_L3 || port->mode == IPVLAN_MODE_L3S) dev_change_flags(ipvlan->dev, flags | IFF_NOARP, NULL); else dev_change_flags(ipvlan->dev, flags & ~IFF_NOARP, NULL); } return err; } static int ipvlan_port_create(struct net_device *dev) { struct ipvl_port *port; int err, idx; port = kzalloc(sizeof(struct ipvl_port), GFP_KERNEL); if (!port) return -ENOMEM; write_pnet(&port->pnet, dev_net(dev)); port->dev = dev; port->mode = IPVLAN_MODE_L3; INIT_LIST_HEAD(&port->ipvlans); for (idx = 0; idx < IPVLAN_HASH_SIZE; idx++) INIT_HLIST_HEAD(&port->hlhead[idx]); skb_queue_head_init(&port->backlog); INIT_WORK(&port->wq, ipvlan_process_multicast); ida_init(&port->ida); port->dev_id_start = 1; err = netdev_rx_handler_register(dev, ipvlan_handle_frame, port); if (err) goto err; netdev_hold(dev, &port->dev_tracker, GFP_KERNEL); return 0; err: kfree(port); return err; } static void ipvlan_port_destroy(struct net_device *dev) { struct ipvl_port *port = ipvlan_port_get_rtnl(dev); struct sk_buff *skb; netdev_put(dev, &port->dev_tracker); if (port->mode == IPVLAN_MODE_L3S) ipvlan_l3s_unregister(port); netdev_rx_handler_unregister(dev); cancel_work_sync(&port->wq); while ((skb = __skb_dequeue(&port->backlog)) != NULL) { dev_put(skb->dev); kfree_skb(skb); } ida_destroy(&port->ida); kfree(port); } #define IPVLAN_ALWAYS_ON_OFLOADS \ (NETIF_F_SG | NETIF_F_HW_CSUM | \ NETIF_F_GSO_ROBUST | NETIF_F_GSO_SOFTWARE | NETIF_F_GSO_ENCAP_ALL) #define IPVLAN_ALWAYS_ON \ (IPVLAN_ALWAYS_ON_OFLOADS | NETIF_F_LLTX | NETIF_F_VLAN_CHALLENGED) #define IPVLAN_FEATURES \ (NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST | \ NETIF_F_GSO | NETIF_F_ALL_TSO | NETIF_F_GSO_ROBUST | \ NETIF_F_GRO | NETIF_F_RXCSUM | \ NETIF_F_HW_VLAN_CTAG_FILTER | NETIF_F_HW_VLAN_STAG_FILTER) /* NETIF_F_GSO_ENCAP_ALL NETIF_F_GSO_SOFTWARE Newly added */ #define IPVLAN_STATE_MASK \ ((1<<__LINK_STATE_NOCARRIER) | (1<<__LINK_STATE_DORMANT)) static int ipvlan_init(struct net_device *dev) { struct ipvl_dev *ipvlan = netdev_priv(dev); struct net_device *phy_dev = ipvlan->phy_dev; struct ipvl_port *port; int err; dev->state = (dev->state & ~IPVLAN_STATE_MASK) | (phy_dev->state & IPVLAN_STATE_MASK); dev->features = phy_dev->features & IPVLAN_FEATURES; dev->features |= IPVLAN_ALWAYS_ON; dev->vlan_features = phy_dev->vlan_features & IPVLAN_FEATURES; dev->vlan_features |= IPVLAN_ALWAYS_ON_OFLOADS; dev->hw_enc_features |= dev->features; netif_inherit_tso_max(dev, phy_dev); dev->hard_header_len = phy_dev->hard_header_len; netdev_lockdep_set_classes(dev); ipvlan->pcpu_stats = netdev_alloc_pcpu_stats(struct ipvl_pcpu_stats); if (!ipvlan->pcpu_stats) return -ENOMEM; if (!netif_is_ipvlan_port(phy_dev)) { err = ipvlan_port_create(phy_dev); if (err < 0) { free_percpu(ipvlan->pcpu_stats); return err; } } port = ipvlan_port_get_rtnl(phy_dev); port->count += 1; return 0; } static void ipvlan_uninit(struct net_device *dev) { struct ipvl_dev *ipvlan = netdev_priv(dev); struct net_device *phy_dev = ipvlan->phy_dev; struct ipvl_port *port; free_percpu(ipvlan->pcpu_stats); port = ipvlan_port_get_rtnl(phy_dev); port->count -= 1; if (!port->count) ipvlan_port_destroy(port->dev); } static int ipvlan_open(struct net_device *dev) { struct ipvl_dev *ipvlan = netdev_priv(dev); struct ipvl_addr *addr; if (ipvlan->port->mode == IPVLAN_MODE_L3 || ipvlan->port->mode == IPVLAN_MODE_L3S) dev->flags |= IFF_NOARP; else dev->flags &= ~IFF_NOARP; rcu_read_lock(); list_for_each_entry_rcu(addr, &ipvlan->addrs, anode) ipvlan_ht_addr_add(ipvlan, addr); rcu_read_unlock(); return 0; } static int ipvlan_stop(struct net_device *dev) { struct ipvl_dev *ipvlan = netdev_priv(dev); struct net_device *phy_dev = ipvlan->phy_dev; struct ipvl_addr *addr; dev_uc_unsync(phy_dev, dev); dev_mc_unsync(phy_dev, dev); rcu_read_lock(); list_for_each_entry_rcu(addr, &ipvlan->addrs, anode) ipvlan_ht_addr_del(addr); rcu_read_unlock(); return 0; } static netdev_tx_t ipvlan_start_xmit(struct sk_buff *skb, struct net_device *dev) { const struct ipvl_dev *ipvlan = netdev_priv(dev); int skblen = skb->len; int ret; ret = ipvlan_queue_xmit(skb, dev); if (likely(ret == NET_XMIT_SUCCESS || ret == NET_XMIT_CN)) { struct ipvl_pcpu_stats *pcptr; pcptr = this_cpu_ptr(ipvlan->pcpu_stats); u64_stats_update_begin(&pcptr->syncp); u64_stats_inc(&pcptr->tx_pkts); u64_stats_add(&pcptr->tx_bytes, skblen); u64_stats_update_end(&pcptr->syncp); } else { this_cpu_inc(ipvlan->pcpu_stats->tx_drps); } return ret; } static netdev_features_t ipvlan_fix_features(struct net_device *dev, netdev_features_t features) { struct ipvl_dev *ipvlan = netdev_priv(dev); features |= NETIF_F_ALL_FOR_ALL; features &= (ipvlan->sfeatures | ~IPVLAN_FEATURES); features = netdev_increment_features(ipvlan->phy_dev->features, features, features); features |= IPVLAN_ALWAYS_ON; features &= (IPVLAN_FEATURES | IPVLAN_ALWAYS_ON); return features; } static void ipvlan_change_rx_flags(struct net_device *dev, int change) { struct ipvl_dev *ipvlan = netdev_priv(dev); struct net_device *phy_dev = ipvlan->phy_dev; if (change & IFF_ALLMULTI) dev_set_allmulti(phy_dev, dev->flags & IFF_ALLMULTI? 1 : -1); } static void ipvlan_set_multicast_mac_filter(struct net_device *dev) { struct ipvl_dev *ipvlan = netdev_priv(dev); if (dev->flags & (IFF_PROMISC | IFF_ALLMULTI)) { bitmap_fill(ipvlan->mac_filters, IPVLAN_MAC_FILTER_SIZE); } else { struct netdev_hw_addr *ha; DECLARE_BITMAP(mc_filters, IPVLAN_MAC_FILTER_SIZE); bitmap_zero(mc_filters, IPVLAN_MAC_FILTER_SIZE); netdev_for_each_mc_addr(ha, dev) __set_bit(ipvlan_mac_hash(ha->addr), mc_filters); /* Turn-on broadcast bit irrespective of address family, * since broadcast is deferred to a work-queue, hence no * impact on fast-path processing. */ __set_bit(ipvlan_mac_hash(dev->broadcast), mc_filters); bitmap_copy(ipvlan->mac_filters, mc_filters, IPVLAN_MAC_FILTER_SIZE); } dev_uc_sync(ipvlan->phy_dev, dev); dev_mc_sync(ipvlan->phy_dev, dev); } static void ipvlan_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *s) { struct ipvl_dev *ipvlan = netdev_priv(dev); if (ipvlan->pcpu_stats) { struct ipvl_pcpu_stats *pcptr; u64 rx_pkts, rx_bytes, rx_mcast, tx_pkts, tx_bytes; u32 rx_errs = 0, tx_drps = 0; u32 strt; int idx; for_each_possible_cpu(idx) { pcptr = per_cpu_ptr(ipvlan->pcpu_stats, idx); do { strt = u64_stats_fetch_begin(&pcptr->syncp); rx_pkts = u64_stats_read(&pcptr->rx_pkts); rx_bytes = u64_stats_read(&pcptr->rx_bytes); rx_mcast = u64_stats_read(&pcptr->rx_mcast); tx_pkts = u64_stats_read(&pcptr->tx_pkts); tx_bytes = u64_stats_read(&pcptr->tx_bytes); } while (u64_stats_fetch_retry(&pcptr->syncp, strt)); s->rx_packets += rx_pkts; s->rx_bytes += rx_bytes; s->multicast += rx_mcast; s->tx_packets += tx_pkts; s->tx_bytes += tx_bytes; /* u32 values are updated without syncp protection. */ rx_errs += READ_ONCE(pcptr->rx_errs); tx_drps += READ_ONCE(pcptr->tx_drps); } s->rx_errors = rx_errs; s->rx_dropped = rx_errs; s->tx_dropped = tx_drps; } s->tx_errors = DEV_STATS_READ(dev, tx_errors); } static int ipvlan_vlan_rx_add_vid(struct net_device *dev, __be16 proto, u16 vid) { struct ipvl_dev *ipvlan = netdev_priv(dev); struct net_device *phy_dev = ipvlan->phy_dev; return vlan_vid_add(phy_dev, proto, vid); } static int ipvlan_vlan_rx_kill_vid(struct net_device *dev, __be16 proto, u16 vid) { struct ipvl_dev *ipvlan = netdev_priv(dev); struct net_device *phy_dev = ipvlan->phy_dev; vlan_vid_del(phy_dev, proto, vid); return 0; } static int ipvlan_get_iflink(const struct net_device *dev) { struct ipvl_dev *ipvlan = netdev_priv(dev); return ipvlan->phy_dev->ifindex; } static const struct net_device_ops ipvlan_netdev_ops = { .ndo_init = ipvlan_init, .ndo_uninit = ipvlan_uninit, .ndo_open = ipvlan_open, .ndo_stop = ipvlan_stop, .ndo_start_xmit = ipvlan_start_xmit, .ndo_fix_features = ipvlan_fix_features, .ndo_change_rx_flags = ipvlan_change_rx_flags, .ndo_set_rx_mode = ipvlan_set_multicast_mac_filter, .ndo_get_stats64 = ipvlan_get_stats64, .ndo_vlan_rx_add_vid = ipvlan_vlan_rx_add_vid, .ndo_vlan_rx_kill_vid = ipvlan_vlan_rx_kill_vid, .ndo_get_iflink = ipvlan_get_iflink, }; static int ipvlan_hard_header(struct sk_buff *skb, struct net_device *dev, unsigned short type, const void *daddr, const void *saddr, unsigned len) { const struct ipvl_dev *ipvlan = netdev_priv(dev); struct net_device *phy_dev = ipvlan->phy_dev; /* TODO Probably use a different field than dev_addr so that the * mac-address on the virtual device is portable and can be carried * while the packets use the mac-addr on the physical device. */ return dev_hard_header(skb, phy_dev, type, daddr, saddr ? : phy_dev->dev_addr, len); } static const struct header_ops ipvlan_header_ops = { .create = ipvlan_hard_header, .parse = eth_header_parse, .cache = eth_header_cache, .cache_update = eth_header_cache_update, .parse_protocol = eth_header_parse_protocol, }; static void ipvlan_adjust_mtu(struct ipvl_dev *ipvlan, struct net_device *dev) { ipvlan->dev->mtu = dev->mtu; } static bool netif_is_ipvlan(const struct net_device *dev) { /* both ipvlan and ipvtap devices use the same netdev_ops */ return dev->netdev_ops == &ipvlan_netdev_ops; } static int ipvlan_ethtool_get_link_ksettings(struct net_device *dev, struct ethtool_link_ksettings *cmd) { const struct ipvl_dev *ipvlan = netdev_priv(dev); return __ethtool_get_link_ksettings(ipvlan->phy_dev, cmd); } static void ipvlan_ethtool_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *drvinfo) { strscpy(drvinfo->driver, IPVLAN_DRV, sizeof(drvinfo->driver)); strscpy(drvinfo->version, IPV_DRV_VER, sizeof(drvinfo->version)); } static u32 ipvlan_ethtool_get_msglevel(struct net_device *dev) { const struct ipvl_dev *ipvlan = netdev_priv(dev); return ipvlan->msg_enable; } static void ipvlan_ethtool_set_msglevel(struct net_device *dev, u32 value) { struct ipvl_dev *ipvlan = netdev_priv(dev); ipvlan->msg_enable = value; } static const struct ethtool_ops ipvlan_ethtool_ops = { .get_link = ethtool_op_get_link, .get_link_ksettings = ipvlan_ethtool_get_link_ksettings, .get_drvinfo = ipvlan_ethtool_get_drvinfo, .get_msglevel = ipvlan_ethtool_get_msglevel, .set_msglevel = ipvlan_ethtool_set_msglevel, }; static int ipvlan_nl_changelink(struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct ipvl_dev *ipvlan = netdev_priv(dev); struct ipvl_port *port = ipvlan_port_get_rtnl(ipvlan->phy_dev); int err = 0; if (!data) return 0; if (!ns_capable(dev_net(ipvlan->phy_dev)->user_ns, CAP_NET_ADMIN)) return -EPERM; if (data[IFLA_IPVLAN_MODE]) { u16 nmode = nla_get_u16(data[IFLA_IPVLAN_MODE]); err = ipvlan_set_port_mode(port, nmode, extack); } if (!err && data[IFLA_IPVLAN_FLAGS]) { u16 flags = nla_get_u16(data[IFLA_IPVLAN_FLAGS]); if (flags & IPVLAN_F_PRIVATE) ipvlan_mark_private(port); else ipvlan_clear_private(port); if (flags & IPVLAN_F_VEPA) ipvlan_mark_vepa(port); else ipvlan_clear_vepa(port); } return err; } static size_t ipvlan_nl_getsize(const struct net_device *dev) { return (0 + nla_total_size(2) /* IFLA_IPVLAN_MODE */ + nla_total_size(2) /* IFLA_IPVLAN_FLAGS */ ); } static int ipvlan_nl_validate(struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { if (!data) return 0; if (data[IFLA_IPVLAN_MODE]) { u16 mode = nla_get_u16(data[IFLA_IPVLAN_MODE]); if (mode >= IPVLAN_MODE_MAX) return -EINVAL; } if (data[IFLA_IPVLAN_FLAGS]) { u16 flags = nla_get_u16(data[IFLA_IPVLAN_FLAGS]); /* Only two bits are used at this moment. */ if (flags & ~(IPVLAN_F_PRIVATE | IPVLAN_F_VEPA)) return -EINVAL; /* Also both flags can't be active at the same time. */ if ((flags & (IPVLAN_F_PRIVATE | IPVLAN_F_VEPA)) == (IPVLAN_F_PRIVATE | IPVLAN_F_VEPA)) return -EINVAL; } return 0; } static int ipvlan_nl_fillinfo(struct sk_buff *skb, const struct net_device *dev) { struct ipvl_dev *ipvlan = netdev_priv(dev); struct ipvl_port *port = ipvlan_port_get_rtnl(ipvlan->phy_dev); int ret = -EINVAL; if (!port) goto err; ret = -EMSGSIZE; if (nla_put_u16(skb, IFLA_IPVLAN_MODE, port->mode)) goto err; if (nla_put_u16(skb, IFLA_IPVLAN_FLAGS, port->flags)) goto err; return 0; err: return ret; } int ipvlan_link_new(struct net *src_net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct ipvl_dev *ipvlan = netdev_priv(dev); struct ipvl_port *port; struct net_device *phy_dev; int err; u16 mode = IPVLAN_MODE_L3; if (!tb[IFLA_LINK]) return -EINVAL; phy_dev = __dev_get_by_index(src_net, nla_get_u32(tb[IFLA_LINK])); if (!phy_dev) return -ENODEV; if (netif_is_ipvlan(phy_dev)) { struct ipvl_dev *tmp = netdev_priv(phy_dev); phy_dev = tmp->phy_dev; if (!ns_capable(dev_net(phy_dev)->user_ns, CAP_NET_ADMIN)) return -EPERM; } else if (!netif_is_ipvlan_port(phy_dev)) { /* Exit early if the underlying link is invalid or busy */ if (phy_dev->type != ARPHRD_ETHER || phy_dev->flags & IFF_LOOPBACK) { netdev_err(phy_dev, "Master is either lo or non-ether device\n"); return -EINVAL; } if (netdev_is_rx_handler_busy(phy_dev)) { netdev_err(phy_dev, "Device is already in use.\n"); return -EBUSY; } } ipvlan->phy_dev = phy_dev; ipvlan->dev = dev; ipvlan->sfeatures = IPVLAN_FEATURES; if (!tb[IFLA_MTU]) ipvlan_adjust_mtu(ipvlan, phy_dev); INIT_LIST_HEAD(&ipvlan->addrs); spin_lock_init(&ipvlan->addrs_lock); /* TODO Probably put random address here to be presented to the * world but keep using the physical-dev address for the outgoing * packets. */ eth_hw_addr_set(dev, phy_dev->dev_addr); dev->priv_flags |= IFF_NO_RX_HANDLER; err = register_netdevice(dev); if (err < 0) return err; /* ipvlan_init() would have created the port, if required */ port = ipvlan_port_get_rtnl(phy_dev); ipvlan->port = port; /* If the port-id base is at the MAX value, then wrap it around and * begin from 0x1 again. This may be due to a busy system where lots * of slaves are getting created and deleted. */ if (port->dev_id_start == 0xFFFE) port->dev_id_start = 0x1; /* Since L2 address is shared among all IPvlan slaves including * master, use unique 16 bit dev-ids to differentiate among them. * Assign IDs between 0x1 and 0xFFFE (used by the master) to each * slave link [see addrconf_ifid_eui48()]. */ err = ida_alloc_range(&port->ida, port->dev_id_start, 0xFFFD, GFP_KERNEL); if (err < 0) err = ida_alloc_range(&port->ida, 0x1, port->dev_id_start - 1, GFP_KERNEL); if (err < 0) goto unregister_netdev; dev->dev_id = err; /* Increment id-base to the next slot for the future assignment */ port->dev_id_start = err + 1; err = netdev_upper_dev_link(phy_dev, dev, extack); if (err) goto remove_ida; /* Flags are per port and latest update overrides. User has * to be consistent in setting it just like the mode attribute. */ if (data && data[IFLA_IPVLAN_FLAGS]) port->flags = nla_get_u16(data[IFLA_IPVLAN_FLAGS]); if (data && data[IFLA_IPVLAN_MODE]) mode = nla_get_u16(data[IFLA_IPVLAN_MODE]); err = ipvlan_set_port_mode(port, mode, extack); if (err) goto unlink_netdev; list_add_tail_rcu(&ipvlan->pnode, &port->ipvlans); netif_stacked_transfer_operstate(phy_dev, dev); return 0; unlink_netdev: netdev_upper_dev_unlink(phy_dev, dev); remove_ida: ida_free(&port->ida, dev->dev_id); unregister_netdev: unregister_netdevice(dev); return err; } EXPORT_SYMBOL_GPL(ipvlan_link_new); void ipvlan_link_delete(struct net_device *dev, struct list_head *head) { struct ipvl_dev *ipvlan = netdev_priv(dev); struct ipvl_addr *addr, *next; spin_lock_bh(&ipvlan->addrs_lock); list_for_each_entry_safe(addr, next, &ipvlan->addrs, anode) { ipvlan_ht_addr_del(addr); list_del_rcu(&addr->anode); kfree_rcu(addr, rcu); } spin_unlock_bh(&ipvlan->addrs_lock); ida_free(&ipvlan->port->ida, dev->dev_id); list_del_rcu(&ipvlan->pnode); unregister_netdevice_queue(dev, head); netdev_upper_dev_unlink(ipvlan->phy_dev, dev); } EXPORT_SYMBOL_GPL(ipvlan_link_delete); void ipvlan_link_setup(struct net_device *dev) { ether_setup(dev); dev->max_mtu = ETH_MAX_MTU; dev->priv_flags &= ~(IFF_XMIT_DST_RELEASE | IFF_TX_SKB_SHARING); dev->priv_flags |= IFF_UNICAST_FLT | IFF_NO_QUEUE; dev->netdev_ops = &ipvlan_netdev_ops; dev->needs_free_netdev = true; dev->header_ops = &ipvlan_header_ops; dev->ethtool_ops = &ipvlan_ethtool_ops; } EXPORT_SYMBOL_GPL(ipvlan_link_setup); static const struct nla_policy ipvlan_nl_policy[IFLA_IPVLAN_MAX + 1] = { [IFLA_IPVLAN_MODE] = { .type = NLA_U16 }, [IFLA_IPVLAN_FLAGS] = { .type = NLA_U16 }, }; static struct net *ipvlan_get_link_net(const struct net_device *dev) { struct ipvl_dev *ipvlan = netdev_priv(dev); return dev_net(ipvlan->phy_dev); } static struct rtnl_link_ops ipvlan_link_ops = { .kind = "ipvlan", .priv_size = sizeof(struct ipvl_dev), .setup = ipvlan_link_setup, .newlink = ipvlan_link_new, .dellink = ipvlan_link_delete, .get_link_net = ipvlan_get_link_net, }; int ipvlan_link_register(struct rtnl_link_ops *ops) { ops->get_size = ipvlan_nl_getsize; ops->policy = ipvlan_nl_policy; ops->validate = ipvlan_nl_validate; ops->fill_info = ipvlan_nl_fillinfo; ops->changelink = ipvlan_nl_changelink; ops->maxtype = IFLA_IPVLAN_MAX; return rtnl_link_register(ops); } EXPORT_SYMBOL_GPL(ipvlan_link_register); static int ipvlan_device_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct netlink_ext_ack *extack = netdev_notifier_info_to_extack(ptr); struct netdev_notifier_pre_changeaddr_info *prechaddr_info; struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct ipvl_dev *ipvlan, *next; struct ipvl_port *port; LIST_HEAD(lst_kill); int err; if (!netif_is_ipvlan_port(dev)) return NOTIFY_DONE; port = ipvlan_port_get_rtnl(dev); switch (event) { case NETDEV_UP: case NETDEV_CHANGE: list_for_each_entry(ipvlan, &port->ipvlans, pnode) netif_stacked_transfer_operstate(ipvlan->phy_dev, ipvlan->dev); break; case NETDEV_REGISTER: { struct net *oldnet, *newnet = dev_net(dev); oldnet = read_pnet(&port->pnet); if (net_eq(newnet, oldnet)) break; write_pnet(&port->pnet, newnet); if (port->mode == IPVLAN_MODE_L3S) ipvlan_migrate_l3s_hook(oldnet, newnet); break; } case NETDEV_UNREGISTER: if (dev->reg_state != NETREG_UNREGISTERING) break; list_for_each_entry_safe(ipvlan, next, &port->ipvlans, pnode) ipvlan->dev->rtnl_link_ops->dellink(ipvlan->dev, &lst_kill); unregister_netdevice_many(&lst_kill); break; case NETDEV_FEAT_CHANGE: list_for_each_entry(ipvlan, &port->ipvlans, pnode) { netif_inherit_tso_max(ipvlan->dev, dev); netdev_update_features(ipvlan->dev); } break; case NETDEV_CHANGEMTU: list_for_each_entry(ipvlan, &port->ipvlans, pnode) ipvlan_adjust_mtu(ipvlan, dev); break; case NETDEV_PRE_CHANGEADDR: prechaddr_info = ptr; list_for_each_entry(ipvlan, &port->ipvlans, pnode) { err = dev_pre_changeaddr_notify(ipvlan->dev, prechaddr_info->dev_addr, extack); if (err) return notifier_from_errno(err); } break; case NETDEV_CHANGEADDR: list_for_each_entry(ipvlan, &port->ipvlans, pnode) { eth_hw_addr_set(ipvlan->dev, dev->dev_addr); call_netdevice_notifiers(NETDEV_CHANGEADDR, ipvlan->dev); } break; case NETDEV_PRE_TYPE_CHANGE: /* Forbid underlying device to change its type. */ return NOTIFY_BAD; } return NOTIFY_DONE; } /* the caller must held the addrs lock */ static int ipvlan_add_addr(struct ipvl_dev *ipvlan, void *iaddr, bool is_v6) { struct ipvl_addr *addr; addr = kzalloc(sizeof(struct ipvl_addr), GFP_ATOMIC); if (!addr) return -ENOMEM; addr->master = ipvlan; if (!is_v6) { memcpy(&addr->ip4addr, iaddr, sizeof(struct in_addr)); addr->atype = IPVL_IPV4; #if IS_ENABLED(CONFIG_IPV6) } else { memcpy(&addr->ip6addr, iaddr, sizeof(struct in6_addr)); addr->atype = IPVL_IPV6; #endif } list_add_tail_rcu(&addr->anode, &ipvlan->addrs); /* If the interface is not up, the address will be added to the hash * list by ipvlan_open. */ if (netif_running(ipvlan->dev)) ipvlan_ht_addr_add(ipvlan, addr); return 0; } static void ipvlan_del_addr(struct ipvl_dev *ipvlan, void *iaddr, bool is_v6) { struct ipvl_addr *addr; spin_lock_bh(&ipvlan->addrs_lock); addr = ipvlan_find_addr(ipvlan, iaddr, is_v6); if (!addr) { spin_unlock_bh(&ipvlan->addrs_lock); return; } ipvlan_ht_addr_del(addr); list_del_rcu(&addr->anode); spin_unlock_bh(&ipvlan->addrs_lock); kfree_rcu(addr, rcu); } static bool ipvlan_is_valid_dev(const struct net_device *dev) { struct ipvl_dev *ipvlan = netdev_priv(dev); if (!netif_is_ipvlan(dev)) return false; if (!ipvlan || !ipvlan->port) return false; return true; } #if IS_ENABLED(CONFIG_IPV6) static int ipvlan_add_addr6(struct ipvl_dev *ipvlan, struct in6_addr *ip6_addr) { int ret = -EINVAL; spin_lock_bh(&ipvlan->addrs_lock); if (ipvlan_addr_busy(ipvlan->port, ip6_addr, true)) netif_err(ipvlan, ifup, ipvlan->dev, "Failed to add IPv6=%pI6c addr for %s intf\n", ip6_addr, ipvlan->dev->name); else ret = ipvlan_add_addr(ipvlan, ip6_addr, true); spin_unlock_bh(&ipvlan->addrs_lock); return ret; } static void ipvlan_del_addr6(struct ipvl_dev *ipvlan, struct in6_addr *ip6_addr) { return ipvlan_del_addr(ipvlan, ip6_addr, true); } static int ipvlan_addr6_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct inet6_ifaddr *if6 = (struct inet6_ifaddr *)ptr; struct net_device *dev = (struct net_device *)if6->idev->dev; struct ipvl_dev *ipvlan = netdev_priv(dev); if (!ipvlan_is_valid_dev(dev)) return NOTIFY_DONE; switch (event) { case NETDEV_UP: if (ipvlan_add_addr6(ipvlan, &if6->addr)) return NOTIFY_BAD; break; case NETDEV_DOWN: ipvlan_del_addr6(ipvlan, &if6->addr); break; } return NOTIFY_OK; } static int ipvlan_addr6_validator_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct in6_validator_info *i6vi = (struct in6_validator_info *)ptr; struct net_device *dev = (struct net_device *)i6vi->i6vi_dev->dev; struct ipvl_dev *ipvlan = netdev_priv(dev); if (!ipvlan_is_valid_dev(dev)) return NOTIFY_DONE; switch (event) { case NETDEV_UP: if (ipvlan_addr_busy(ipvlan->port, &i6vi->i6vi_addr, true)) { NL_SET_ERR_MSG(i6vi->extack, "Address already assigned to an ipvlan device"); return notifier_from_errno(-EADDRINUSE); } break; } return NOTIFY_OK; } #endif static int ipvlan_add_addr4(struct ipvl_dev *ipvlan, struct in_addr *ip4_addr) { int ret = -EINVAL; spin_lock_bh(&ipvlan->addrs_lock); if (ipvlan_addr_busy(ipvlan->port, ip4_addr, false)) netif_err(ipvlan, ifup, ipvlan->dev, "Failed to add IPv4=%pI4 on %s intf.\n", ip4_addr, ipvlan->dev->name); else ret = ipvlan_add_addr(ipvlan, ip4_addr, false); spin_unlock_bh(&ipvlan->addrs_lock); return ret; } static void ipvlan_del_addr4(struct ipvl_dev *ipvlan, struct in_addr *ip4_addr) { return ipvlan_del_addr(ipvlan, ip4_addr, false); } static int ipvlan_addr4_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct in_ifaddr *if4 = (struct in_ifaddr *)ptr; struct net_device *dev = (struct net_device *)if4->ifa_dev->dev; struct ipvl_dev *ipvlan = netdev_priv(dev); struct in_addr ip4_addr; if (!ipvlan_is_valid_dev(dev)) return NOTIFY_DONE; switch (event) { case NETDEV_UP: ip4_addr.s_addr = if4->ifa_address; if (ipvlan_add_addr4(ipvlan, &ip4_addr)) return NOTIFY_BAD; break; case NETDEV_DOWN: ip4_addr.s_addr = if4->ifa_address; ipvlan_del_addr4(ipvlan, &ip4_addr); break; } return NOTIFY_OK; } static int ipvlan_addr4_validator_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct in_validator_info *ivi = (struct in_validator_info *)ptr; struct net_device *dev = (struct net_device *)ivi->ivi_dev->dev; struct ipvl_dev *ipvlan = netdev_priv(dev); if (!ipvlan_is_valid_dev(dev)) return NOTIFY_DONE; switch (event) { case NETDEV_UP: if (ipvlan_addr_busy(ipvlan->port, &ivi->ivi_addr, false)) { NL_SET_ERR_MSG(ivi->extack, "Address already assigned to an ipvlan device"); return notifier_from_errno(-EADDRINUSE); } break; } return NOTIFY_OK; } static struct notifier_block ipvlan_addr4_notifier_block __read_mostly = { .notifier_call = ipvlan_addr4_event, }; static struct notifier_block ipvlan_addr4_vtor_notifier_block __read_mostly = { .notifier_call = ipvlan_addr4_validator_event, }; static struct notifier_block ipvlan_notifier_block __read_mostly = { .notifier_call = ipvlan_device_event, }; #if IS_ENABLED(CONFIG_IPV6) static struct notifier_block ipvlan_addr6_notifier_block __read_mostly = { .notifier_call = ipvlan_addr6_event, }; static struct notifier_block ipvlan_addr6_vtor_notifier_block __read_mostly = { .notifier_call = ipvlan_addr6_validator_event, }; #endif static int __init ipvlan_init_module(void) { int err; ipvlan_init_secret(); register_netdevice_notifier(&ipvlan_notifier_block); #if IS_ENABLED(CONFIG_IPV6) register_inet6addr_notifier(&ipvlan_addr6_notifier_block); register_inet6addr_validator_notifier( &ipvlan_addr6_vtor_notifier_block); #endif register_inetaddr_notifier(&ipvlan_addr4_notifier_block); register_inetaddr_validator_notifier(&ipvlan_addr4_vtor_notifier_block); err = ipvlan_l3s_init(); if (err < 0) goto error; err = ipvlan_link_register(&ipvlan_link_ops); if (err < 0) { ipvlan_l3s_cleanup(); goto error; } return 0; error: unregister_inetaddr_notifier(&ipvlan_addr4_notifier_block); unregister_inetaddr_validator_notifier( &ipvlan_addr4_vtor_notifier_block); #if IS_ENABLED(CONFIG_IPV6) unregister_inet6addr_notifier(&ipvlan_addr6_notifier_block); unregister_inet6addr_validator_notifier( &ipvlan_addr6_vtor_notifier_block); #endif unregister_netdevice_notifier(&ipvlan_notifier_block); return err; } static void __exit ipvlan_cleanup_module(void) { rtnl_link_unregister(&ipvlan_link_ops); ipvlan_l3s_cleanup(); unregister_netdevice_notifier(&ipvlan_notifier_block); unregister_inetaddr_notifier(&ipvlan_addr4_notifier_block); unregister_inetaddr_validator_notifier( &ipvlan_addr4_vtor_notifier_block); #if IS_ENABLED(CONFIG_IPV6) unregister_inet6addr_notifier(&ipvlan_addr6_notifier_block); unregister_inet6addr_validator_notifier( &ipvlan_addr6_vtor_notifier_block); #endif } module_init(ipvlan_init_module); module_exit(ipvlan_cleanup_module); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Mahesh Bandewar <maheshb@google.com>"); MODULE_DESCRIPTION("Driver for L3 (IPv6/IPv4) based VLANs"); MODULE_ALIAS_RTNL_LINK("ipvlan"); |
| 3957 1763 1762 9 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_TIME64_H #define _LINUX_TIME64_H #include <linux/math64.h> #include <vdso/time64.h> typedef __s64 time64_t; typedef __u64 timeu64_t; #include <uapi/linux/time.h> struct timespec64 { time64_t tv_sec; /* seconds */ long tv_nsec; /* nanoseconds */ }; struct itimerspec64 { struct timespec64 it_interval; struct timespec64 it_value; }; /* Parameters used to convert the timespec values: */ #define PSEC_PER_NSEC 1000L /* Located here for timespec[64]_valid_strict */ #define TIME64_MAX ((s64)~((u64)1 << 63)) #define TIME64_MIN (-TIME64_MAX - 1) #define KTIME_MAX ((s64)~((u64)1 << 63)) #define KTIME_MIN (-KTIME_MAX - 1) #define KTIME_SEC_MAX (KTIME_MAX / NSEC_PER_SEC) #define KTIME_SEC_MIN (KTIME_MIN / NSEC_PER_SEC) /* * Limits for settimeofday(): * * To prevent setting the time close to the wraparound point time setting * is limited so a reasonable uptime can be accomodated. Uptime of 30 years * should be really sufficient, which means the cutoff is 2232. At that * point the cutoff is just a small part of the larger problem. */ #define TIME_UPTIME_SEC_MAX (30LL * 365 * 24 *3600) #define TIME_SETTOD_SEC_MAX (KTIME_SEC_MAX - TIME_UPTIME_SEC_MAX) static inline int timespec64_equal(const struct timespec64 *a, const struct timespec64 *b) { return (a->tv_sec == b->tv_sec) && (a->tv_nsec == b->tv_nsec); } /* * lhs < rhs: return <0 * lhs == rhs: return 0 * lhs > rhs: return >0 */ static inline int timespec64_compare(const struct timespec64 *lhs, const struct timespec64 *rhs) { if (lhs->tv_sec < rhs->tv_sec) return -1; if (lhs->tv_sec > rhs->tv_sec) return 1; return lhs->tv_nsec - rhs->tv_nsec; } extern void set_normalized_timespec64(struct timespec64 *ts, time64_t sec, s64 nsec); static inline struct timespec64 timespec64_add(struct timespec64 lhs, struct timespec64 rhs) { struct timespec64 ts_delta; set_normalized_timespec64(&ts_delta, lhs.tv_sec + rhs.tv_sec, lhs.tv_nsec + rhs.tv_nsec); return ts_delta; } /* * sub = lhs - rhs, in normalized form */ static inline struct timespec64 timespec64_sub(struct timespec64 lhs, struct timespec64 rhs) { struct timespec64 ts_delta; set_normalized_timespec64(&ts_delta, lhs.tv_sec - rhs.tv_sec, lhs.tv_nsec - rhs.tv_nsec); return ts_delta; } /* * Returns true if the timespec64 is norm, false if denorm: */ static inline bool timespec64_valid(const struct timespec64 *ts) { /* Dates before 1970 are bogus */ if (ts->tv_sec < 0) return false; /* Can't have more nanoseconds then a second */ if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC) return false; return true; } static inline bool timespec64_valid_strict(const struct timespec64 *ts) { if (!timespec64_valid(ts)) return false; /* Disallow values that could overflow ktime_t */ if ((unsigned long long)ts->tv_sec >= KTIME_SEC_MAX) return false; return true; } static inline bool timespec64_valid_settod(const struct timespec64 *ts) { if (!timespec64_valid(ts)) return false; /* Disallow values which cause overflow issues vs. CLOCK_REALTIME */ if ((unsigned long long)ts->tv_sec >= TIME_SETTOD_SEC_MAX) return false; return true; } /** * timespec64_to_ns - Convert timespec64 to nanoseconds * @ts: pointer to the timespec64 variable to be converted * * Returns the scalar nanosecond representation of the timespec64 * parameter. */ static inline s64 timespec64_to_ns(const struct timespec64 *ts) { /* Prevent multiplication overflow / underflow */ if (ts->tv_sec >= KTIME_SEC_MAX) return KTIME_MAX; if (ts->tv_sec <= KTIME_SEC_MIN) return KTIME_MIN; return ((s64) ts->tv_sec * NSEC_PER_SEC) + ts->tv_nsec; } /** * ns_to_timespec64 - Convert nanoseconds to timespec64 * @nsec: the nanoseconds value to be converted * * Returns the timespec64 representation of the nsec parameter. */ extern struct timespec64 ns_to_timespec64(s64 nsec); /** * timespec64_add_ns - Adds nanoseconds to a timespec64 * @a: pointer to timespec64 to be incremented * @ns: unsigned nanoseconds value to be added * * This must always be inlined because its used from the x86-64 vdso, * which cannot call other kernel functions. */ static __always_inline void timespec64_add_ns(struct timespec64 *a, u64 ns) { a->tv_sec += __iter_div_u64_rem(a->tv_nsec + ns, NSEC_PER_SEC, &ns); a->tv_nsec = ns; } /* * timespec64_add_safe assumes both values are positive and checks for * overflow. It will return TIME64_MAX in case of overflow. */ extern struct timespec64 timespec64_add_safe(const struct timespec64 lhs, const struct timespec64 rhs); #endif /* _LINUX_TIME64_H */ |
| 108 108 108 108 108 108 108 108 108 108 108 108 108 108 107 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 | // SPDX-License-Identifier: GPL-2.0 /* * drivers/base/dd.c - The core device/driver interactions. * * This file contains the (sometimes tricky) code that controls the * interactions between devices and drivers, which primarily includes * driver binding and unbinding. * * All of this code used to exist in drivers/base/bus.c, but was * relocated to here in the name of compartmentalization (since it wasn't * strictly code just for the 'struct bus_type'. * * Copyright (c) 2002-5 Patrick Mochel * Copyright (c) 2002-3 Open Source Development Labs * Copyright (c) 2007-2009 Greg Kroah-Hartman <gregkh@suse.de> * Copyright (c) 2007-2009 Novell Inc. */ #include <linux/debugfs.h> #include <linux/device.h> #include <linux/delay.h> #include <linux/dma-map-ops.h> #include <linux/init.h> #include <linux/module.h> #include <linux/kthread.h> #include <linux/wait.h> #include <linux/async.h> #include <linux/pm_runtime.h> #include <linux/pinctrl/devinfo.h> #include <linux/slab.h> #include "base.h" #include "power/power.h" /* * Deferred Probe infrastructure. * * Sometimes driver probe order matters, but the kernel doesn't always have * dependency information which means some drivers will get probed before a * resource it depends on is available. For example, an SDHCI driver may * first need a GPIO line from an i2c GPIO controller before it can be * initialized. If a required resource is not available yet, a driver can * request probing to be deferred by returning -EPROBE_DEFER from its probe hook * * Deferred probe maintains two lists of devices, a pending list and an active * list. A driver returning -EPROBE_DEFER causes the device to be added to the * pending list. A successful driver probe will trigger moving all devices * from the pending to the active list so that the workqueue will eventually * retry them. * * The deferred_probe_mutex must be held any time the deferred_probe_*_list * of the (struct device*)->p->deferred_probe pointers are manipulated */ static DEFINE_MUTEX(deferred_probe_mutex); static LIST_HEAD(deferred_probe_pending_list); static LIST_HEAD(deferred_probe_active_list); static atomic_t deferred_trigger_count = ATOMIC_INIT(0); static bool initcalls_done; /* Save the async probe drivers' name from kernel cmdline */ #define ASYNC_DRV_NAMES_MAX_LEN 256 static char async_probe_drv_names[ASYNC_DRV_NAMES_MAX_LEN]; static bool async_probe_default; /* * In some cases, like suspend to RAM or hibernation, It might be reasonable * to prohibit probing of devices as it could be unsafe. * Once defer_all_probes is true all drivers probes will be forcibly deferred. */ static bool defer_all_probes; static void __device_set_deferred_probe_reason(const struct device *dev, char *reason) { kfree(dev->p->deferred_probe_reason); dev->p->deferred_probe_reason = reason; } /* * deferred_probe_work_func() - Retry probing devices in the active list. */ static void deferred_probe_work_func(struct work_struct *work) { struct device *dev; struct device_private *private; /* * This block processes every device in the deferred 'active' list. * Each device is removed from the active list and passed to * bus_probe_device() to re-attempt the probe. The loop continues * until every device in the active list is removed and retried. * * Note: Once the device is removed from the list and the mutex is * released, it is possible for the device get freed by another thread * and cause a illegal pointer dereference. This code uses * get/put_device() to ensure the device structure cannot disappear * from under our feet. */ mutex_lock(&deferred_probe_mutex); while (!list_empty(&deferred_probe_active_list)) { private = list_first_entry(&deferred_probe_active_list, typeof(*dev->p), deferred_probe); dev = private->device; list_del_init(&private->deferred_probe); get_device(dev); __device_set_deferred_probe_reason(dev, NULL); /* * Drop the mutex while probing each device; the probe path may * manipulate the deferred list */ mutex_unlock(&deferred_probe_mutex); /* * Force the device to the end of the dpm_list since * the PM code assumes that the order we add things to * the list is a good order for suspend but deferred * probe makes that very unsafe. */ device_pm_move_to_tail(dev); dev_dbg(dev, "Retrying from deferred list\n"); bus_probe_device(dev); mutex_lock(&deferred_probe_mutex); put_device(dev); } mutex_unlock(&deferred_probe_mutex); } static DECLARE_WORK(deferred_probe_work, deferred_probe_work_func); void driver_deferred_probe_add(struct device *dev) { if (!dev->can_match) return; mutex_lock(&deferred_probe_mutex); if (list_empty(&dev->p->deferred_probe)) { dev_dbg(dev, "Added to deferred list\n"); list_add_tail(&dev->p->deferred_probe, &deferred_probe_pending_list); } mutex_unlock(&deferred_probe_mutex); } void driver_deferred_probe_del(struct device *dev) { mutex_lock(&deferred_probe_mutex); if (!list_empty(&dev->p->deferred_probe)) { dev_dbg(dev, "Removed from deferred list\n"); list_del_init(&dev->p->deferred_probe); __device_set_deferred_probe_reason(dev, NULL); } mutex_unlock(&deferred_probe_mutex); } static bool driver_deferred_probe_enable; /** * driver_deferred_probe_trigger() - Kick off re-probing deferred devices * * This functions moves all devices from the pending list to the active * list and schedules the deferred probe workqueue to process them. It * should be called anytime a driver is successfully bound to a device. * * Note, there is a race condition in multi-threaded probe. In the case where * more than one device is probing at the same time, it is possible for one * probe to complete successfully while another is about to defer. If the second * depends on the first, then it will get put on the pending list after the * trigger event has already occurred and will be stuck there. * * The atomic 'deferred_trigger_count' is used to determine if a successful * trigger has occurred in the midst of probing a driver. If the trigger count * changes in the midst of a probe, then deferred processing should be triggered * again. */ void driver_deferred_probe_trigger(void) { if (!driver_deferred_probe_enable) return; /* * A successful probe means that all the devices in the pending list * should be triggered to be reprobed. Move all the deferred devices * into the active list so they can be retried by the workqueue */ mutex_lock(&deferred_probe_mutex); atomic_inc(&deferred_trigger_count); list_splice_tail_init(&deferred_probe_pending_list, &deferred_probe_active_list); mutex_unlock(&deferred_probe_mutex); /* * Kick the re-probe thread. It may already be scheduled, but it is * safe to kick it again. */ queue_work(system_unbound_wq, &deferred_probe_work); } /** * device_block_probing() - Block/defer device's probes * * It will disable probing of devices and defer their probes instead. */ void device_block_probing(void) { defer_all_probes = true; /* sync with probes to avoid races. */ wait_for_device_probe(); } /** * device_unblock_probing() - Unblock/enable device's probes * * It will restore normal behavior and trigger re-probing of deferred * devices. */ void device_unblock_probing(void) { defer_all_probes = false; driver_deferred_probe_trigger(); } /** * device_set_deferred_probe_reason() - Set defer probe reason message for device * @dev: the pointer to the struct device * @vaf: the pointer to va_format structure with message */ void device_set_deferred_probe_reason(const struct device *dev, struct va_format *vaf) { const char *drv = dev_driver_string(dev); char *reason; mutex_lock(&deferred_probe_mutex); reason = kasprintf(GFP_KERNEL, "%s: %pV", drv, vaf); __device_set_deferred_probe_reason(dev, reason); mutex_unlock(&deferred_probe_mutex); } /* * deferred_devs_show() - Show the devices in the deferred probe pending list. */ static int deferred_devs_show(struct seq_file *s, void *data) { struct device_private *curr; mutex_lock(&deferred_probe_mutex); list_for_each_entry(curr, &deferred_probe_pending_list, deferred_probe) seq_printf(s, "%s\t%s", dev_name(curr->device), curr->device->p->deferred_probe_reason ?: "\n"); mutex_unlock(&deferred_probe_mutex); return 0; } DEFINE_SHOW_ATTRIBUTE(deferred_devs); #ifdef CONFIG_MODULES static int driver_deferred_probe_timeout = 10; #else static int driver_deferred_probe_timeout; #endif static int __init deferred_probe_timeout_setup(char *str) { int timeout; if (!kstrtoint(str, 10, &timeout)) driver_deferred_probe_timeout = timeout; return 1; } __setup("deferred_probe_timeout=", deferred_probe_timeout_setup); /** * driver_deferred_probe_check_state() - Check deferred probe state * @dev: device to check * * Return: * * -ENODEV if initcalls have completed and modules are disabled. * * -ETIMEDOUT if the deferred probe timeout was set and has expired * and modules are enabled. * * -EPROBE_DEFER in other cases. * * Drivers or subsystems can opt-in to calling this function instead of directly * returning -EPROBE_DEFER. */ int driver_deferred_probe_check_state(struct device *dev) { if (!IS_ENABLED(CONFIG_MODULES) && initcalls_done) { dev_warn(dev, "ignoring dependency for device, assuming no driver\n"); return -ENODEV; } if (!driver_deferred_probe_timeout && initcalls_done) { dev_warn(dev, "deferred probe timeout, ignoring dependency\n"); return -ETIMEDOUT; } return -EPROBE_DEFER; } EXPORT_SYMBOL_GPL(driver_deferred_probe_check_state); static void deferred_probe_timeout_work_func(struct work_struct *work) { struct device_private *p; fw_devlink_drivers_done(); driver_deferred_probe_timeout = 0; driver_deferred_probe_trigger(); flush_work(&deferred_probe_work); mutex_lock(&deferred_probe_mutex); list_for_each_entry(p, &deferred_probe_pending_list, deferred_probe) dev_info(p->device, "deferred probe pending\n"); mutex_unlock(&deferred_probe_mutex); fw_devlink_probing_done(); } static DECLARE_DELAYED_WORK(deferred_probe_timeout_work, deferred_probe_timeout_work_func); void deferred_probe_extend_timeout(void) { /* * If the work hasn't been queued yet or if the work expired, don't * start a new one. */ if (cancel_delayed_work(&deferred_probe_timeout_work)) { schedule_delayed_work(&deferred_probe_timeout_work, driver_deferred_probe_timeout * HZ); pr_debug("Extended deferred probe timeout by %d secs\n", driver_deferred_probe_timeout); } } /** * deferred_probe_initcall() - Enable probing of deferred devices * * We don't want to get in the way when the bulk of drivers are getting probed. * Instead, this initcall makes sure that deferred probing is delayed until * late_initcall time. */ static int deferred_probe_initcall(void) { debugfs_create_file("devices_deferred", 0444, NULL, NULL, &deferred_devs_fops); driver_deferred_probe_enable = true; driver_deferred_probe_trigger(); /* Sort as many dependencies as possible before exiting initcalls */ flush_work(&deferred_probe_work); initcalls_done = true; if (!IS_ENABLED(CONFIG_MODULES)) fw_devlink_drivers_done(); /* * Trigger deferred probe again, this time we won't defer anything * that is optional */ driver_deferred_probe_trigger(); flush_work(&deferred_probe_work); if (driver_deferred_probe_timeout > 0) { schedule_delayed_work(&deferred_probe_timeout_work, driver_deferred_probe_timeout * HZ); } if (!IS_ENABLED(CONFIG_MODULES)) fw_devlink_probing_done(); return 0; } late_initcall(deferred_probe_initcall); static void __exit deferred_probe_exit(void) { debugfs_lookup_and_remove("devices_deferred", NULL); } __exitcall(deferred_probe_exit); /** * device_is_bound() - Check if device is bound to a driver * @dev: device to check * * Returns true if passed device has already finished probing successfully * against a driver. * * This function must be called with the device lock held. */ bool device_is_bound(struct device *dev) { return dev->p && klist_node_attached(&dev->p->knode_driver); } static void driver_bound(struct device *dev) { if (device_is_bound(dev)) { pr_warn("%s: device %s already bound\n", __func__, kobject_name(&dev->kobj)); return; } pr_debug("driver: '%s': %s: bound to device '%s'\n", dev->driver->name, __func__, dev_name(dev)); klist_add_tail(&dev->p->knode_driver, &dev->driver->p->klist_devices); device_links_driver_bound(dev); device_pm_check_callbacks(dev); /* * Make sure the device is no longer in one of the deferred lists and * kick off retrying all pending devices */ driver_deferred_probe_del(dev); driver_deferred_probe_trigger(); bus_notify(dev, BUS_NOTIFY_BOUND_DRIVER); kobject_uevent(&dev->kobj, KOBJ_BIND); } static ssize_t coredump_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { device_lock(dev); dev->driver->coredump(dev); device_unlock(dev); return count; } static DEVICE_ATTR_WO(coredump); static int driver_sysfs_add(struct device *dev) { int ret; bus_notify(dev, BUS_NOTIFY_BIND_DRIVER); ret = sysfs_create_link(&dev->driver->p->kobj, &dev->kobj, kobject_name(&dev->kobj)); if (ret) goto fail; ret = sysfs_create_link(&dev->kobj, &dev->driver->p->kobj, "driver"); if (ret) goto rm_dev; if (!IS_ENABLED(CONFIG_DEV_COREDUMP) || !dev->driver->coredump) return 0; ret = device_create_file(dev, &dev_attr_coredump); if (!ret) return 0; sysfs_remove_link(&dev->kobj, "driver"); rm_dev: sysfs_remove_link(&dev->driver->p->kobj, kobject_name(&dev->kobj)); fail: return ret; } static void driver_sysfs_remove(struct device *dev) { struct device_driver *drv = dev->driver; if (drv) { if (drv->coredump) device_remove_file(dev, &dev_attr_coredump); sysfs_remove_link(&drv->p->kobj, kobject_name(&dev->kobj)); sysfs_remove_link(&dev->kobj, "driver"); } } /** * device_bind_driver - bind a driver to one device. * @dev: device. * * Allow manual attachment of a driver to a device. * Caller must have already set @dev->driver. * * Note that this does not modify the bus reference count. * Please verify that is accounted for before calling this. * (It is ok to call with no other effort from a driver's probe() method.) * * This function must be called with the device lock held. * * Callers should prefer to use device_driver_attach() instead. */ int device_bind_driver(struct device *dev) { int ret; ret = driver_sysfs_add(dev); if (!ret) { device_links_force_bind(dev); driver_bound(dev); } else bus_notify(dev, BUS_NOTIFY_DRIVER_NOT_BOUND); return ret; } EXPORT_SYMBOL_GPL(device_bind_driver); static atomic_t probe_count = ATOMIC_INIT(0); static DECLARE_WAIT_QUEUE_HEAD(probe_waitqueue); static ssize_t state_synced_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { int ret = 0; if (strcmp("1", buf)) return -EINVAL; device_lock(dev); if (!dev->state_synced) { dev->state_synced = true; dev_sync_state(dev); } else { ret = -EINVAL; } device_unlock(dev); return ret ? ret : count; } static ssize_t state_synced_show(struct device *dev, struct device_attribute *attr, char *buf) { bool val; device_lock(dev); val = dev->state_synced; device_unlock(dev); return sysfs_emit(buf, "%u\n", val); } static DEVICE_ATTR_RW(state_synced); static void device_unbind_cleanup(struct device *dev) { devres_release_all(dev); arch_teardown_dma_ops(dev); kfree(dev->dma_range_map); dev->dma_range_map = NULL; dev->driver = NULL; dev_set_drvdata(dev, NULL); if (dev->pm_domain && dev->pm_domain->dismiss) dev->pm_domain->dismiss(dev); pm_runtime_reinit(dev); dev_pm_set_driver_flags(dev, 0); } static void device_remove(struct device *dev) { device_remove_file(dev, &dev_attr_state_synced); device_remove_groups(dev, dev->driver->dev_groups); if (dev->bus && dev->bus->remove) dev->bus->remove(dev); else if (dev->driver->remove) dev->driver->remove(dev); } static int call_driver_probe(struct device *dev, struct device_driver *drv) { int ret = 0; if (dev->bus->probe) ret = dev->bus->probe(dev); else if (drv->probe) ret = drv->probe(dev); switch (ret) { case 0: break; case -EPROBE_DEFER: /* Driver requested deferred probing */ dev_dbg(dev, "Driver %s requests probe deferral\n", drv->name); break; case -ENODEV: case -ENXIO: pr_debug("%s: probe of %s rejects match %d\n", drv->name, dev_name(dev), ret); break; default: /* driver matched but the probe failed */ pr_warn("%s: probe of %s failed with error %d\n", drv->name, dev_name(dev), ret); break; } return ret; } static int really_probe(struct device *dev, struct device_driver *drv) { bool test_remove = IS_ENABLED(CONFIG_DEBUG_TEST_DRIVER_REMOVE) && !drv->suppress_bind_attrs; int ret, link_ret; if (defer_all_probes) { /* * Value of defer_all_probes can be set only by * device_block_probing() which, in turn, will call * wait_for_device_probe() right after that to avoid any races. */ dev_dbg(dev, "Driver %s force probe deferral\n", drv->name); return -EPROBE_DEFER; } link_ret = device_links_check_suppliers(dev); if (link_ret == -EPROBE_DEFER) return link_ret; pr_debug("bus: '%s': %s: probing driver %s with device %s\n", drv->bus->name, __func__, drv->name, dev_name(dev)); if (!list_empty(&dev->devres_head)) { dev_crit(dev, "Resources present before probing\n"); ret = -EBUSY; goto done; } re_probe: dev->driver = drv; /* If using pinctrl, bind pins now before probing */ ret = pinctrl_bind_pins(dev); if (ret) goto pinctrl_bind_failed; if (dev->bus->dma_configure) { ret = dev->bus->dma_configure(dev); if (ret) goto pinctrl_bind_failed; } ret = driver_sysfs_add(dev); if (ret) { pr_err("%s: driver_sysfs_add(%s) failed\n", __func__, dev_name(dev)); goto sysfs_failed; } if (dev->pm_domain && dev->pm_domain->activate) { ret = dev->pm_domain->activate(dev); if (ret) goto probe_failed; } ret = call_driver_probe(dev, drv); if (ret) { /* * If fw_devlink_best_effort is active (denoted by -EAGAIN), the * device might actually probe properly once some of its missing * suppliers have probed. So, treat this as if the driver * returned -EPROBE_DEFER. */ if (link_ret == -EAGAIN) ret = -EPROBE_DEFER; /* * Return probe errors as positive values so that the callers * can distinguish them from other errors. */ ret = -ret; goto probe_failed; } ret = device_add_groups(dev, drv->dev_groups); if (ret) { dev_err(dev, "device_add_groups() failed\n"); goto dev_groups_failed; } if (dev_has_sync_state(dev)) { ret = device_create_file(dev, &dev_attr_state_synced); if (ret) { dev_err(dev, "state_synced sysfs add failed\n"); goto dev_sysfs_state_synced_failed; } } if (test_remove) { test_remove = false; device_remove(dev); driver_sysfs_remove(dev); if (dev->bus && dev->bus->dma_cleanup) dev->bus->dma_cleanup(dev); device_unbind_cleanup(dev); goto re_probe; } pinctrl_init_done(dev); if (dev->pm_domain && dev->pm_domain->sync) dev->pm_domain->sync(dev); driver_bound(dev); pr_debug("bus: '%s': %s: bound device %s to driver %s\n", drv->bus->name, __func__, dev_name(dev), drv->name); goto done; dev_sysfs_state_synced_failed: dev_groups_failed: device_remove(dev); probe_failed: driver_sysfs_remove(dev); sysfs_failed: bus_notify(dev, BUS_NOTIFY_DRIVER_NOT_BOUND); if (dev->bus && dev->bus->dma_cleanup) dev->bus->dma_cleanup(dev); pinctrl_bind_failed: device_links_no_driver(dev); device_unbind_cleanup(dev); done: return ret; } /* * For initcall_debug, show the driver probe time. */ static int really_probe_debug(struct device *dev, struct device_driver *drv) { ktime_t calltime, rettime; int ret; calltime = ktime_get(); ret = really_probe(dev, drv); rettime = ktime_get(); /* * Don't change this to pr_debug() because that requires * CONFIG_DYNAMIC_DEBUG and we want a simple 'initcall_debug' on the * kernel commandline to print this all the time at the debug level. */ printk(KERN_DEBUG "probe of %s returned %d after %lld usecs\n", dev_name(dev), ret, ktime_us_delta(rettime, calltime)); return ret; } /** * driver_probe_done * Determine if the probe sequence is finished or not. * * Should somehow figure out how to use a semaphore, not an atomic variable... */ bool __init driver_probe_done(void) { int local_probe_count = atomic_read(&probe_count); pr_debug("%s: probe_count = %d\n", __func__, local_probe_count); return !local_probe_count; } /** * wait_for_device_probe * Wait for device probing to be completed. */ void wait_for_device_probe(void) { /* wait for the deferred probe workqueue to finish */ flush_work(&deferred_probe_work); /* wait for the known devices to complete their probing */ wait_event(probe_waitqueue, atomic_read(&probe_count) == 0); async_synchronize_full(); } EXPORT_SYMBOL_GPL(wait_for_device_probe); static int __driver_probe_device(struct device_driver *drv, struct device *dev) { int ret = 0; if (dev->p->dead || !device_is_registered(dev)) return -ENODEV; if (dev->driver) return -EBUSY; dev->can_match = true; pr_debug("bus: '%s': %s: matched device %s with driver %s\n", drv->bus->name, __func__, dev_name(dev), drv->name); pm_runtime_get_suppliers(dev); if (dev->parent) pm_runtime_get_sync(dev->parent); pm_runtime_barrier(dev); if (initcall_debug) ret = really_probe_debug(dev, drv); else ret = really_probe(dev, drv); pm_request_idle(dev); if (dev->parent) pm_runtime_put(dev->parent); pm_runtime_put_suppliers(dev); return ret; } /** * driver_probe_device - attempt to bind device & driver together * @drv: driver to bind a device to * @dev: device to try to bind to the driver * * This function returns -ENODEV if the device is not registered, -EBUSY if it * already has a driver, 0 if the device is bound successfully and a positive * (inverted) error code for failures from the ->probe method. * * This function must be called with @dev lock held. When called for a * USB interface, @dev->parent lock must be held as well. * * If the device has a parent, runtime-resume the parent before driver probing. */ static int driver_probe_device(struct device_driver *drv, struct device *dev) { int trigger_count = atomic_read(&deferred_trigger_count); int ret; atomic_inc(&probe_count); ret = __driver_probe_device(drv, dev); if (ret == -EPROBE_DEFER || ret == EPROBE_DEFER) { driver_deferred_probe_add(dev); /* * Did a trigger occur while probing? Need to re-trigger if yes */ if (trigger_count != atomic_read(&deferred_trigger_count) && !defer_all_probes) driver_deferred_probe_trigger(); } atomic_dec(&probe_count); wake_up_all(&probe_waitqueue); return ret; } static inline bool cmdline_requested_async_probing(const char *drv_name) { bool async_drv; async_drv = parse_option_str(async_probe_drv_names, drv_name); return (async_probe_default != async_drv); } /* The option format is "driver_async_probe=drv_name1,drv_name2,..." */ static int __init save_async_options(char *buf) { if (strlen(buf) >= ASYNC_DRV_NAMES_MAX_LEN) pr_warn("Too long list of driver names for 'driver_async_probe'!\n"); strscpy(async_probe_drv_names, buf, ASYNC_DRV_NAMES_MAX_LEN); async_probe_default = parse_option_str(async_probe_drv_names, "*"); return 1; } __setup("driver_async_probe=", save_async_options); static bool driver_allows_async_probing(struct device_driver *drv) { switch (drv->probe_type) { case PROBE_PREFER_ASYNCHRONOUS: return true; case PROBE_FORCE_SYNCHRONOUS: return false; default: if (cmdline_requested_async_probing(drv->name)) return true; if (module_requested_async_probing(drv->owner)) return true; return false; } } struct device_attach_data { struct device *dev; /* * Indicates whether we are considering asynchronous probing or * not. Only initial binding after device or driver registration * (including deferral processing) may be done asynchronously, the * rest is always synchronous, as we expect it is being done by * request from userspace. */ bool check_async; /* * Indicates if we are binding synchronous or asynchronous drivers. * When asynchronous probing is enabled we'll execute 2 passes * over drivers: first pass doing synchronous probing and second * doing asynchronous probing (if synchronous did not succeed - * most likely because there was no driver requiring synchronous * probing - and we found asynchronous driver during first pass). * The 2 passes are done because we can't shoot asynchronous * probe for given device and driver from bus_for_each_drv() since * driver pointer is not guaranteed to stay valid once * bus_for_each_drv() iterates to the next driver on the bus. */ bool want_async; /* * We'll set have_async to 'true' if, while scanning for matching * driver, we'll encounter one that requests asynchronous probing. */ bool have_async; }; static int __device_attach_driver(struct device_driver *drv, void *_data) { struct device_attach_data *data = _data; struct device *dev = data->dev; bool async_allowed; int ret; ret = driver_match_device(drv, dev); if (ret == 0) { /* no match */ return 0; } else if (ret == -EPROBE_DEFER) { dev_dbg(dev, "Device match requests probe deferral\n"); dev->can_match = true; driver_deferred_probe_add(dev); /* * Device can't match with a driver right now, so don't attempt * to match or bind with other drivers on the bus. */ return ret; } else if (ret < 0) { dev_dbg(dev, "Bus failed to match device: %d\n", ret); return ret; } /* ret > 0 means positive match */ async_allowed = driver_allows_async_probing(drv); if (async_allowed) data->have_async = true; if (data->check_async && async_allowed != data->want_async) return 0; /* * Ignore errors returned by ->probe so that the next driver can try * its luck. */ ret = driver_probe_device(drv, dev); if (ret < 0) return ret; return ret == 0; } static void __device_attach_async_helper(void *_dev, async_cookie_t cookie) { struct device *dev = _dev; struct device_attach_data data = { .dev = dev, .check_async = true, .want_async = true, }; device_lock(dev); /* * Check if device has already been removed or claimed. This may * happen with driver loading, device discovery/registration, * and deferred probe processing happens all at once with * multiple threads. */ if (dev->p->dead || dev->driver) goto out_unlock; if (dev->parent) pm_runtime_get_sync(dev->parent); bus_for_each_drv(dev->bus, NULL, &data, __device_attach_driver); dev_dbg(dev, "async probe completed\n"); pm_request_idle(dev); if (dev->parent) pm_runtime_put(dev->parent); out_unlock: device_unlock(dev); put_device(dev); } static int __device_attach(struct device *dev, bool allow_async) { int ret = 0; bool async = false; device_lock(dev); if (dev->p->dead) { goto out_unlock; } else if (dev->driver) { if (device_is_bound(dev)) { ret = 1; goto out_unlock; } ret = device_bind_driver(dev); if (ret == 0) ret = 1; else { dev->driver = NULL; ret = 0; } } else { struct device_attach_data data = { .dev = dev, .check_async = allow_async, .want_async = false, }; if (dev->parent) pm_runtime_get_sync(dev->parent); ret = bus_for_each_drv(dev->bus, NULL, &data, __device_attach_driver); if (!ret && allow_async && data.have_async) { /* * If we could not find appropriate driver * synchronously and we are allowed to do * async probes and there are drivers that * want to probe asynchronously, we'll * try them. */ dev_dbg(dev, "scheduling asynchronous probe\n"); get_device(dev); async = true; } else { pm_request_idle(dev); } if (dev->parent) pm_runtime_put(dev->parent); } out_unlock: device_unlock(dev); if (async) async_schedule_dev(__device_attach_async_helper, dev); return ret; } /** * device_attach - try to attach device to a driver. * @dev: device. * * Walk the list of drivers that the bus has and call * driver_probe_device() for each pair. If a compatible * pair is found, break out and return. * * Returns 1 if the device was bound to a driver; * 0 if no matching driver was found; * -ENODEV if the device is not registered. * * When called for a USB interface, @dev->parent lock must be held. */ int device_attach(struct device *dev) { return __device_attach(dev, false); } EXPORT_SYMBOL_GPL(device_attach); void device_initial_probe(struct device *dev) { __device_attach(dev, true); } /* * __device_driver_lock - acquire locks needed to manipulate dev->drv * @dev: Device we will update driver info for * @parent: Parent device. Needed if the bus requires parent lock * * This function will take the required locks for manipulating dev->drv. * Normally this will just be the @dev lock, but when called for a USB * interface, @parent lock will be held as well. */ static void __device_driver_lock(struct device *dev, struct device *parent) { if (parent && dev->bus->need_parent_lock) device_lock(parent); device_lock(dev); } /* * __device_driver_unlock - release locks needed to manipulate dev->drv * @dev: Device we will update driver info for * @parent: Parent device. Needed if the bus requires parent lock * * This function will release the required locks for manipulating dev->drv. * Normally this will just be the @dev lock, but when called for a * USB interface, @parent lock will be released as well. */ static void __device_driver_unlock(struct device *dev, struct device *parent) { device_unlock(dev); if (parent && dev->bus->need_parent_lock) device_unlock(parent); } /** * device_driver_attach - attach a specific driver to a specific device * @drv: Driver to attach * @dev: Device to attach it to * * Manually attach driver to a device. Will acquire both @dev lock and * @dev->parent lock if needed. Returns 0 on success, -ERR on failure. */ int device_driver_attach(struct device_driver *drv, struct device *dev) { int ret; __device_driver_lock(dev, dev->parent); ret = __driver_probe_device(drv, dev); __device_driver_unlock(dev, dev->parent); /* also return probe errors as normal negative errnos */ if (ret > 0) ret = -ret; if (ret == -EPROBE_DEFER) return -EAGAIN; return ret; } EXPORT_SYMBOL_GPL(device_driver_attach); static void __driver_attach_async_helper(void *_dev, async_cookie_t cookie) { struct device *dev = _dev; struct device_driver *drv; int ret; __device_driver_lock(dev, dev->parent); drv = dev->p->async_driver; dev->p->async_driver = NULL; ret = driver_probe_device(drv, dev); __device_driver_unlock(dev, dev->parent); dev_dbg(dev, "driver %s async attach completed: %d\n", drv->name, ret); put_device(dev); } static int __driver_attach(struct device *dev, void *data) { struct device_driver *drv = data; bool async = false; int ret; /* * Lock device and try to bind to it. We drop the error * here and always return 0, because we need to keep trying * to bind to devices and some drivers will return an error * simply if it didn't support the device. * * driver_probe_device() will spit a warning if there * is an error. */ ret = driver_match_device(drv, dev); if (ret == 0) { /* no match */ return 0; } else if (ret == -EPROBE_DEFER) { dev_dbg(dev, "Device match requests probe deferral\n"); dev->can_match = true; driver_deferred_probe_add(dev); /* * Driver could not match with device, but may match with * another device on the bus. */ return 0; } else if (ret < 0) { dev_dbg(dev, "Bus failed to match device: %d\n", ret); /* * Driver could not match with device, but may match with * another device on the bus. */ return 0; } /* ret > 0 means positive match */ if (driver_allows_async_probing(drv)) { /* * Instead of probing the device synchronously we will * probe it asynchronously to allow for more parallelism. * * We only take the device lock here in order to guarantee * that the dev->driver and async_driver fields are protected */ dev_dbg(dev, "probing driver %s asynchronously\n", drv->name); device_lock(dev); if (!dev->driver && !dev->p->async_driver) { get_device(dev); dev->p->async_driver = drv; async = true; } device_unlock(dev); if (async) async_schedule_dev(__driver_attach_async_helper, dev); return 0; } __device_driver_lock(dev, dev->parent); driver_probe_device(drv, dev); __device_driver_unlock(dev, dev->parent); return 0; } /** * driver_attach - try to bind driver to devices. * @drv: driver. * * Walk the list of devices that the bus has on it and try to * match the driver with each one. If driver_probe_device() * returns 0 and the @dev->driver is set, we've found a * compatible pair. */ int driver_attach(struct device_driver *drv) { return bus_for_each_dev(drv->bus, NULL, drv, __driver_attach); } EXPORT_SYMBOL_GPL(driver_attach); /* * __device_release_driver() must be called with @dev lock held. * When called for a USB interface, @dev->parent lock must be held as well. */ static void __device_release_driver(struct device *dev, struct device *parent) { struct device_driver *drv; drv = dev->driver; if (drv) { pm_runtime_get_sync(dev); while (device_links_busy(dev)) { __device_driver_unlock(dev, parent); device_links_unbind_consumers(dev); __device_driver_lock(dev, parent); /* * A concurrent invocation of the same function might * have released the driver successfully while this one * was waiting, so check for that. */ if (dev->driver != drv) { pm_runtime_put(dev); return; } } driver_sysfs_remove(dev); bus_notify(dev, BUS_NOTIFY_UNBIND_DRIVER); pm_runtime_put_sync(dev); device_remove(dev); if (dev->bus && dev->bus->dma_cleanup) dev->bus->dma_cleanup(dev); device_unbind_cleanup(dev); device_links_driver_cleanup(dev); klist_remove(&dev->p->knode_driver); device_pm_check_callbacks(dev); bus_notify(dev, BUS_NOTIFY_UNBOUND_DRIVER); kobject_uevent(&dev->kobj, KOBJ_UNBIND); } } void device_release_driver_internal(struct device *dev, struct device_driver *drv, struct device *parent) { __device_driver_lock(dev, parent); if (!drv || drv == dev->driver) __device_release_driver(dev, parent); __device_driver_unlock(dev, parent); } /** * device_release_driver - manually detach device from driver. * @dev: device. * * Manually detach device from driver. * When called for a USB interface, @dev->parent lock must be held. * * If this function is to be called with @dev->parent lock held, ensure that * the device's consumers are unbound in advance or that their locks can be * acquired under the @dev->parent lock. */ void device_release_driver(struct device *dev) { /* * If anyone calls device_release_driver() recursively from * within their ->remove callback for the same device, they * will deadlock right here. */ device_release_driver_internal(dev, NULL, NULL); } EXPORT_SYMBOL_GPL(device_release_driver); /** * device_driver_detach - detach driver from a specific device * @dev: device to detach driver from * * Detach driver from device. Will acquire both @dev lock and @dev->parent * lock if needed. */ void device_driver_detach(struct device *dev) { device_release_driver_internal(dev, NULL, dev->parent); } /** * driver_detach - detach driver from all devices it controls. * @drv: driver. */ void driver_detach(struct device_driver *drv) { struct device_private *dev_prv; struct device *dev; if (driver_allows_async_probing(drv)) async_synchronize_full(); for (;;) { spin_lock(&drv->p->klist_devices.k_lock); if (list_empty(&drv->p->klist_devices.k_list)) { spin_unlock(&drv->p->klist_devices.k_lock); break; } dev_prv = list_last_entry(&drv->p->klist_devices.k_list, struct device_private, knode_driver.n_node); dev = dev_prv->device; get_device(dev); spin_unlock(&drv->p->klist_devices.k_lock); device_release_driver_internal(dev, drv, dev->parent); put_device(dev); } } |
| 1872 167 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 | /* SPDX-License-Identifier: GPL-2.0 */ #include <linux/init.h> #include <linux/module.h> #include <linux/netfilter.h> #include <net/flow_offload.h> #include <net/netfilter/nf_tables.h> #include <net/netfilter/nf_tables_offload.h> #include <net/pkt_cls.h> static struct nft_flow_rule *nft_flow_rule_alloc(int num_actions) { struct nft_flow_rule *flow; flow = kzalloc(sizeof(struct nft_flow_rule), GFP_KERNEL); if (!flow) return NULL; flow->rule = flow_rule_alloc(num_actions); if (!flow->rule) { kfree(flow); return NULL; } flow->rule->match.dissector = &flow->match.dissector; flow->rule->match.mask = &flow->match.mask; flow->rule->match.key = &flow->match.key; return flow; } void nft_flow_rule_set_addr_type(struct nft_flow_rule *flow, enum flow_dissector_key_id addr_type) { struct nft_flow_match *match = &flow->match; struct nft_flow_key *mask = &match->mask; struct nft_flow_key *key = &match->key; if (match->dissector.used_keys & BIT_ULL(FLOW_DISSECTOR_KEY_CONTROL)) return; key->control.addr_type = addr_type; mask->control.addr_type = 0xffff; match->dissector.used_keys |= BIT_ULL(FLOW_DISSECTOR_KEY_CONTROL); match->dissector.offset[FLOW_DISSECTOR_KEY_CONTROL] = offsetof(struct nft_flow_key, control); } struct nft_offload_ethertype { __be16 value; __be16 mask; }; static void nft_flow_rule_transfer_vlan(struct nft_offload_ctx *ctx, struct nft_flow_rule *flow) { struct nft_flow_match *match = &flow->match; struct nft_offload_ethertype ethertype = { .value = match->key.basic.n_proto, .mask = match->mask.basic.n_proto, }; if (match->dissector.used_keys & BIT_ULL(FLOW_DISSECTOR_KEY_VLAN) && (match->key.vlan.vlan_tpid == htons(ETH_P_8021Q) || match->key.vlan.vlan_tpid == htons(ETH_P_8021AD))) { match->key.basic.n_proto = match->key.cvlan.vlan_tpid; match->mask.basic.n_proto = match->mask.cvlan.vlan_tpid; match->key.cvlan.vlan_tpid = match->key.vlan.vlan_tpid; match->mask.cvlan.vlan_tpid = match->mask.vlan.vlan_tpid; match->key.vlan.vlan_tpid = ethertype.value; match->mask.vlan.vlan_tpid = ethertype.mask; match->dissector.offset[FLOW_DISSECTOR_KEY_CVLAN] = offsetof(struct nft_flow_key, cvlan); match->dissector.used_keys |= BIT_ULL(FLOW_DISSECTOR_KEY_CVLAN); } else if (match->dissector.used_keys & BIT_ULL(FLOW_DISSECTOR_KEY_BASIC) && (match->key.basic.n_proto == htons(ETH_P_8021Q) || match->key.basic.n_proto == htons(ETH_P_8021AD))) { match->key.basic.n_proto = match->key.vlan.vlan_tpid; match->mask.basic.n_proto = match->mask.vlan.vlan_tpid; match->key.vlan.vlan_tpid = ethertype.value; match->mask.vlan.vlan_tpid = ethertype.mask; match->dissector.offset[FLOW_DISSECTOR_KEY_VLAN] = offsetof(struct nft_flow_key, vlan); match->dissector.used_keys |= BIT_ULL(FLOW_DISSECTOR_KEY_VLAN); } } struct nft_flow_rule *nft_flow_rule_create(struct net *net, const struct nft_rule *rule) { struct nft_offload_ctx *ctx; struct nft_flow_rule *flow; int num_actions = 0, err; struct nft_expr *expr; expr = nft_expr_first(rule); while (nft_expr_more(rule, expr)) { if (expr->ops->offload_action && expr->ops->offload_action(expr)) num_actions++; expr = nft_expr_next(expr); } if (num_actions == 0) return ERR_PTR(-EOPNOTSUPP); flow = nft_flow_rule_alloc(num_actions); if (!flow) return ERR_PTR(-ENOMEM); expr = nft_expr_first(rule); ctx = kzalloc(sizeof(struct nft_offload_ctx), GFP_KERNEL); if (!ctx) { err = -ENOMEM; goto err_out; } ctx->net = net; ctx->dep.type = NFT_OFFLOAD_DEP_UNSPEC; while (nft_expr_more(rule, expr)) { if (!expr->ops->offload) { err = -EOPNOTSUPP; goto err_out; } err = expr->ops->offload(ctx, flow, expr); if (err < 0) goto err_out; expr = nft_expr_next(expr); } nft_flow_rule_transfer_vlan(ctx, flow); flow->proto = ctx->dep.l3num; kfree(ctx); return flow; err_out: kfree(ctx); nft_flow_rule_destroy(flow); return ERR_PTR(err); } void nft_flow_rule_destroy(struct nft_flow_rule *flow) { struct flow_action_entry *entry; int i; flow_action_for_each(i, entry, &flow->rule->action) { switch (entry->id) { case FLOW_ACTION_REDIRECT: case FLOW_ACTION_MIRRED: dev_put(entry->dev); break; default: break; } } kfree(flow->rule); kfree(flow); } void nft_offload_set_dependency(struct nft_offload_ctx *ctx, enum nft_offload_dep_type type) { ctx->dep.type = type; } void nft_offload_update_dependency(struct nft_offload_ctx *ctx, const void *data, u32 len) { switch (ctx->dep.type) { case NFT_OFFLOAD_DEP_NETWORK: WARN_ON(len != sizeof(__u16)); memcpy(&ctx->dep.l3num, data, sizeof(__u16)); break; case NFT_OFFLOAD_DEP_TRANSPORT: WARN_ON(len != sizeof(__u8)); memcpy(&ctx->dep.protonum, data, sizeof(__u8)); break; default: break; } ctx->dep.type = NFT_OFFLOAD_DEP_UNSPEC; } static void nft_flow_offload_common_init(struct flow_cls_common_offload *common, __be16 proto, int priority, struct netlink_ext_ack *extack) { common->protocol = proto; common->prio = priority; common->extack = extack; } static int nft_setup_cb_call(enum tc_setup_type type, void *type_data, struct list_head *cb_list) { struct flow_block_cb *block_cb; int err; list_for_each_entry(block_cb, cb_list, list) { err = block_cb->cb(type, type_data, block_cb->cb_priv); if (err < 0) return err; } return 0; } static int nft_chain_offload_priority(const struct nft_base_chain *basechain) { if (basechain->ops.priority <= 0 || basechain->ops.priority > USHRT_MAX) return -1; return 0; } bool nft_chain_offload_support(const struct nft_base_chain *basechain) { struct net_device *dev; struct nft_hook *hook; if (nft_chain_offload_priority(basechain) < 0) return false; list_for_each_entry(hook, &basechain->hook_list, list) { if (hook->ops.pf != NFPROTO_NETDEV || hook->ops.hooknum != NF_NETDEV_INGRESS) return false; dev = hook->ops.dev; if (!dev->netdev_ops->ndo_setup_tc && !flow_indr_dev_exists()) return false; } return true; } static void nft_flow_cls_offload_setup(struct flow_cls_offload *cls_flow, const struct nft_base_chain *basechain, const struct nft_rule *rule, const struct nft_flow_rule *flow, struct netlink_ext_ack *extack, enum flow_cls_command command) { __be16 proto = ETH_P_ALL; memset(cls_flow, 0, sizeof(*cls_flow)); if (flow) proto = flow->proto; nft_flow_offload_common_init(&cls_flow->common, proto, basechain->ops.priority, extack); cls_flow->command = command; cls_flow->cookie = (unsigned long) rule; if (flow) cls_flow->rule = flow->rule; } static int nft_flow_offload_cmd(const struct nft_chain *chain, const struct nft_rule *rule, struct nft_flow_rule *flow, enum flow_cls_command command, struct flow_cls_offload *cls_flow) { struct netlink_ext_ack extack = {}; struct nft_base_chain *basechain; if (!nft_is_base_chain(chain)) return -EOPNOTSUPP; basechain = nft_base_chain(chain); nft_flow_cls_offload_setup(cls_flow, basechain, rule, flow, &extack, command); return nft_setup_cb_call(TC_SETUP_CLSFLOWER, cls_flow, &basechain->flow_block.cb_list); } static int nft_flow_offload_rule(const struct nft_chain *chain, struct nft_rule *rule, struct nft_flow_rule *flow, enum flow_cls_command command) { struct flow_cls_offload cls_flow; return nft_flow_offload_cmd(chain, rule, flow, command, &cls_flow); } int nft_flow_rule_stats(const struct nft_chain *chain, const struct nft_rule *rule) { struct flow_cls_offload cls_flow = {}; struct nft_expr *expr, *next; int err; err = nft_flow_offload_cmd(chain, rule, NULL, FLOW_CLS_STATS, &cls_flow); if (err < 0) return err; nft_rule_for_each_expr(expr, next, rule) { if (expr->ops->offload_stats) expr->ops->offload_stats(expr, &cls_flow.stats); } return 0; } static int nft_flow_offload_bind(struct flow_block_offload *bo, struct nft_base_chain *basechain) { list_splice(&bo->cb_list, &basechain->flow_block.cb_list); return 0; } static int nft_flow_offload_unbind(struct flow_block_offload *bo, struct nft_base_chain *basechain) { struct flow_block_cb *block_cb, *next; struct flow_cls_offload cls_flow; struct netlink_ext_ack extack; struct nft_chain *chain; struct nft_rule *rule; chain = &basechain->chain; list_for_each_entry(rule, &chain->rules, list) { memset(&extack, 0, sizeof(extack)); nft_flow_cls_offload_setup(&cls_flow, basechain, rule, NULL, &extack, FLOW_CLS_DESTROY); nft_setup_cb_call(TC_SETUP_CLSFLOWER, &cls_flow, &bo->cb_list); } list_for_each_entry_safe(block_cb, next, &bo->cb_list, list) { list_del(&block_cb->list); flow_block_cb_free(block_cb); } return 0; } static int nft_block_setup(struct nft_base_chain *basechain, struct flow_block_offload *bo, enum flow_block_command cmd) { int err; switch (cmd) { case FLOW_BLOCK_BIND: err = nft_flow_offload_bind(bo, basechain); break; case FLOW_BLOCK_UNBIND: err = nft_flow_offload_unbind(bo, basechain); break; default: WARN_ON_ONCE(1); err = -EOPNOTSUPP; } return err; } static void nft_flow_block_offload_init(struct flow_block_offload *bo, struct net *net, enum flow_block_command cmd, struct nft_base_chain *basechain, struct netlink_ext_ack *extack) { memset(bo, 0, sizeof(*bo)); bo->net = net; bo->block = &basechain->flow_block; bo->command = cmd; bo->binder_type = FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS; bo->extack = extack; bo->cb_list_head = &basechain->flow_block.cb_list; INIT_LIST_HEAD(&bo->cb_list); } static int nft_block_offload_cmd(struct nft_base_chain *chain, struct net_device *dev, enum flow_block_command cmd) { struct netlink_ext_ack extack = {}; struct flow_block_offload bo; int err; nft_flow_block_offload_init(&bo, dev_net(dev), cmd, chain, &extack); err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_BLOCK, &bo); if (err < 0) return err; return nft_block_setup(chain, &bo, cmd); } static void nft_indr_block_cleanup(struct flow_block_cb *block_cb) { struct nft_base_chain *basechain = block_cb->indr.data; struct net_device *dev = block_cb->indr.dev; struct netlink_ext_ack extack = {}; struct nftables_pernet *nft_net; struct net *net = dev_net(dev); struct flow_block_offload bo; nft_flow_block_offload_init(&bo, dev_net(dev), FLOW_BLOCK_UNBIND, basechain, &extack); nft_net = nft_pernet(net); mutex_lock(&nft_net->commit_mutex); list_del(&block_cb->driver_list); list_move(&block_cb->list, &bo.cb_list); nft_flow_offload_unbind(&bo, basechain); mutex_unlock(&nft_net->commit_mutex); } static int nft_indr_block_offload_cmd(struct nft_base_chain *basechain, struct net_device *dev, enum flow_block_command cmd) { struct netlink_ext_ack extack = {}; struct flow_block_offload bo; int err; nft_flow_block_offload_init(&bo, dev_net(dev), cmd, basechain, &extack); err = flow_indr_dev_setup_offload(dev, NULL, TC_SETUP_BLOCK, basechain, &bo, nft_indr_block_cleanup); if (err < 0) return err; if (list_empty(&bo.cb_list)) return -EOPNOTSUPP; return nft_block_setup(basechain, &bo, cmd); } static int nft_chain_offload_cmd(struct nft_base_chain *basechain, struct net_device *dev, enum flow_block_command cmd) { int err; if (dev->netdev_ops->ndo_setup_tc) err = nft_block_offload_cmd(basechain, dev, cmd); else err = nft_indr_block_offload_cmd(basechain, dev, cmd); return err; } static int nft_flow_block_chain(struct nft_base_chain *basechain, const struct net_device *this_dev, enum flow_block_command cmd) { struct net_device *dev; struct nft_hook *hook; int err, i = 0; list_for_each_entry(hook, &basechain->hook_list, list) { dev = hook->ops.dev; if (this_dev && this_dev != dev) continue; err = nft_chain_offload_cmd(basechain, dev, cmd); if (err < 0 && cmd == FLOW_BLOCK_BIND) { if (!this_dev) goto err_flow_block; return err; } i++; } return 0; err_flow_block: list_for_each_entry(hook, &basechain->hook_list, list) { if (i-- <= 0) break; dev = hook->ops.dev; nft_chain_offload_cmd(basechain, dev, FLOW_BLOCK_UNBIND); } return err; } static int nft_flow_offload_chain(struct nft_chain *chain, u8 *ppolicy, enum flow_block_command cmd) { struct nft_base_chain *basechain; u8 policy; if (!nft_is_base_chain(chain)) return -EOPNOTSUPP; basechain = nft_base_chain(chain); policy = ppolicy ? *ppolicy : basechain->policy; /* Only default policy to accept is supported for now. */ if (cmd == FLOW_BLOCK_BIND && policy == NF_DROP) return -EOPNOTSUPP; return nft_flow_block_chain(basechain, NULL, cmd); } static void nft_flow_rule_offload_abort(struct net *net, struct nft_trans *trans) { struct nftables_pernet *nft_net = nft_pernet(net); int err = 0; list_for_each_entry_continue_reverse(trans, &nft_net->commit_list, list) { if (trans->ctx.family != NFPROTO_NETDEV) continue; switch (trans->msg_type) { case NFT_MSG_NEWCHAIN: if (!(trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD) || nft_trans_chain_update(trans)) continue; err = nft_flow_offload_chain(trans->ctx.chain, NULL, FLOW_BLOCK_UNBIND); break; case NFT_MSG_DELCHAIN: if (!(trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD)) continue; err = nft_flow_offload_chain(trans->ctx.chain, NULL, FLOW_BLOCK_BIND); break; case NFT_MSG_NEWRULE: if (!(trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD)) continue; err = nft_flow_offload_rule(trans->ctx.chain, nft_trans_rule(trans), NULL, FLOW_CLS_DESTROY); break; case NFT_MSG_DELRULE: if (!(trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD)) continue; err = nft_flow_offload_rule(trans->ctx.chain, nft_trans_rule(trans), nft_trans_flow_rule(trans), FLOW_CLS_REPLACE); break; } if (WARN_ON_ONCE(err)) break; } } int nft_flow_rule_offload_commit(struct net *net) { struct nftables_pernet *nft_net = nft_pernet(net); struct nft_trans *trans; int err = 0; u8 policy; list_for_each_entry(trans, &nft_net->commit_list, list) { if (trans->ctx.family != NFPROTO_NETDEV) continue; switch (trans->msg_type) { case NFT_MSG_NEWCHAIN: if (!(trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD) || nft_trans_chain_update(trans)) continue; policy = nft_trans_chain_policy(trans); err = nft_flow_offload_chain(trans->ctx.chain, &policy, FLOW_BLOCK_BIND); break; case NFT_MSG_DELCHAIN: if (!(trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD)) continue; policy = nft_trans_chain_policy(trans); err = nft_flow_offload_chain(trans->ctx.chain, &policy, FLOW_BLOCK_UNBIND); break; case NFT_MSG_NEWRULE: if (!(trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD)) continue; if (trans->ctx.flags & NLM_F_REPLACE || !(trans->ctx.flags & NLM_F_APPEND)) { err = -EOPNOTSUPP; break; } err = nft_flow_offload_rule(trans->ctx.chain, nft_trans_rule(trans), nft_trans_flow_rule(trans), FLOW_CLS_REPLACE); break; case NFT_MSG_DELRULE: if (!(trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD)) continue; err = nft_flow_offload_rule(trans->ctx.chain, nft_trans_rule(trans), NULL, FLOW_CLS_DESTROY); break; } if (err) { nft_flow_rule_offload_abort(net, trans); break; } } return err; } static struct nft_chain *__nft_offload_get_chain(const struct nftables_pernet *nft_net, struct net_device *dev) { struct nft_base_chain *basechain; struct nft_hook *hook, *found; const struct nft_table *table; struct nft_chain *chain; list_for_each_entry(table, &nft_net->tables, list) { if (table->family != NFPROTO_NETDEV) continue; list_for_each_entry(chain, &table->chains, list) { if (!nft_is_base_chain(chain) || !(chain->flags & NFT_CHAIN_HW_OFFLOAD)) continue; found = NULL; basechain = nft_base_chain(chain); list_for_each_entry(hook, &basechain->hook_list, list) { if (hook->ops.dev != dev) continue; found = hook; break; } if (!found) continue; return chain; } } return NULL; } static int nft_offload_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct nftables_pernet *nft_net; struct net *net = dev_net(dev); struct nft_chain *chain; if (event != NETDEV_UNREGISTER) return NOTIFY_DONE; nft_net = nft_pernet(net); mutex_lock(&nft_net->commit_mutex); chain = __nft_offload_get_chain(nft_net, dev); if (chain) nft_flow_block_chain(nft_base_chain(chain), dev, FLOW_BLOCK_UNBIND); mutex_unlock(&nft_net->commit_mutex); return NOTIFY_DONE; } static struct notifier_block nft_offload_netdev_notifier = { .notifier_call = nft_offload_netdev_event, }; int nft_offload_init(void) { return register_netdevice_notifier(&nft_offload_netdev_notifier); } void nft_offload_exit(void) { unregister_netdevice_notifier(&nft_offload_netdev_notifier); } |
| 6 6 1 391 391 391 1872 3 3 3 1 1 1 7 7 7 5 5 5 5 8 1 1 2 1 1 1 8 2 2 2 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 | // SPDX-License-Identifier: GPL-2.0-or-later /** -*- linux-c -*- *********************************************************** * Linux PPP over Ethernet (PPPoX/PPPoE) Sockets * * PPPoX --- Generic PPP encapsulation socket family * PPPoE --- PPP over Ethernet (RFC 2516) * * Version: 0.7.0 * * 070228 : Fix to allow multiple sessions with same remote MAC and same * session id by including the local device ifindex in the * tuple identifying a session. This also ensures packets can't * be injected into a session from interfaces other than the one * specified by userspace. Florian Zumbiehl <florz@florz.de> * (Oh, BTW, this one is YYMMDD, in case you were wondering ...) * 220102 : Fix module use count on failure in pppoe_create, pppox_sk -acme * 030700 : Fixed connect logic to allow for disconnect. * 270700 : Fixed potential SMP problems; we must protect against * simultaneous invocation of ppp_input * and ppp_unregister_channel. * 040800 : Respect reference count mechanisms on net-devices. * 200800 : fix kfree(skb) in pppoe_rcv (acme) * Module reference count is decremented in the right spot now, * guards against sock_put not actually freeing the sk * in pppoe_release. * 051000 : Initialization cleanup. * 111100 : Fix recvmsg. * 050101 : Fix PADT processing. * 140501 : Use pppoe_rcv_core to handle all backlog. (Alexey) * 170701 : Do not lock_sock with rwlock held. (DaveM) * Ignore discovery frames if user has socket * locked. (DaveM) * Ignore return value of dev_queue_xmit in __pppoe_xmit * or else we may kfree an SKB twice. (DaveM) * 190701 : When doing copies of skb's in __pppoe_xmit, always delete * the original skb that was passed in on success, never on * failure. Delete the copy of the skb on failure to avoid * a memory leak. * 081001 : Misc. cleanup (licence string, non-blocking, prevent * reference of device on close). * 121301 : New ppp channels interface; cannot unregister a channel * from interrupts. Thus, we mark the socket as a ZOMBIE * and do the unregistration later. * 081002 : seq_file support for proc stuff -acme * 111602 : Merge all 2.4 fixes into 2.5/2.6 tree. Label 2.5/2.6 * as version 0.7. Spacing cleanup. * Author: Michal Ostrowski <mostrows@speakeasy.net> * Contributors: * Arnaldo Carvalho de Melo <acme@conectiva.com.br> * David S. Miller (davem@redhat.com) * * License: */ #include <linux/string.h> #include <linux/module.h> #include <linux/kernel.h> #include <linux/slab.h> #include <linux/errno.h> #include <linux/netdevice.h> #include <linux/net.h> #include <linux/inetdevice.h> #include <linux/etherdevice.h> #include <linux/skbuff.h> #include <linux/init.h> #include <linux/if_ether.h> #include <linux/if_pppox.h> #include <linux/ppp_channel.h> #include <linux/ppp_defs.h> #include <linux/ppp-ioctl.h> #include <linux/notifier.h> #include <linux/file.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/nsproxy.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <net/sock.h> #include <linux/uaccess.h> #define PPPOE_HASH_BITS CONFIG_PPPOE_HASH_BITS #define PPPOE_HASH_SIZE (1 << PPPOE_HASH_BITS) #define PPPOE_HASH_MASK (PPPOE_HASH_SIZE - 1) static int __pppoe_xmit(struct sock *sk, struct sk_buff *skb); static const struct proto_ops pppoe_ops; static const struct ppp_channel_ops pppoe_chan_ops; /* per-net private data for this module */ static unsigned int pppoe_net_id __read_mostly; struct pppoe_net { /* * we could use _single_ hash table for all * nets by injecting net id into the hash but * it would increase hash chains and add * a few additional math comparisons messy * as well, moreover in case of SMP less locking * controversy here */ struct pppox_sock *hash_table[PPPOE_HASH_SIZE]; rwlock_t hash_lock; }; /* * PPPoE could be in the following stages: * 1) Discovery stage (to obtain remote MAC and Session ID) * 2) Session stage (MAC and SID are known) * * Ethernet frames have a special tag for this but * we use simpler approach based on session id */ static inline bool stage_session(__be16 sid) { return sid != 0; } static inline struct pppoe_net *pppoe_pernet(struct net *net) { return net_generic(net, pppoe_net_id); } static inline int cmp_2_addr(struct pppoe_addr *a, struct pppoe_addr *b) { return a->sid == b->sid && ether_addr_equal(a->remote, b->remote); } static inline int cmp_addr(struct pppoe_addr *a, __be16 sid, char *addr) { return a->sid == sid && ether_addr_equal(a->remote, addr); } #if 8 % PPPOE_HASH_BITS #error 8 must be a multiple of PPPOE_HASH_BITS #endif static int hash_item(__be16 sid, unsigned char *addr) { unsigned char hash = 0; unsigned int i; for (i = 0; i < ETH_ALEN; i++) hash ^= addr[i]; for (i = 0; i < sizeof(sid_t) * 8; i += 8) hash ^= (__force __u32)sid >> i; for (i = 8; (i >>= 1) >= PPPOE_HASH_BITS;) hash ^= hash >> i; return hash & PPPOE_HASH_MASK; } /********************************************************************** * * Set/get/delete/rehash items (internal versions) * **********************************************************************/ static struct pppox_sock *__get_item(struct pppoe_net *pn, __be16 sid, unsigned char *addr, int ifindex) { int hash = hash_item(sid, addr); struct pppox_sock *ret; ret = pn->hash_table[hash]; while (ret) { if (cmp_addr(&ret->pppoe_pa, sid, addr) && ret->pppoe_ifindex == ifindex) return ret; ret = ret->next; } return NULL; } static int __set_item(struct pppoe_net *pn, struct pppox_sock *po) { int hash = hash_item(po->pppoe_pa.sid, po->pppoe_pa.remote); struct pppox_sock *ret; ret = pn->hash_table[hash]; while (ret) { if (cmp_2_addr(&ret->pppoe_pa, &po->pppoe_pa) && ret->pppoe_ifindex == po->pppoe_ifindex) return -EALREADY; ret = ret->next; } po->next = pn->hash_table[hash]; pn->hash_table[hash] = po; return 0; } static void __delete_item(struct pppoe_net *pn, __be16 sid, char *addr, int ifindex) { int hash = hash_item(sid, addr); struct pppox_sock *ret, **src; ret = pn->hash_table[hash]; src = &pn->hash_table[hash]; while (ret) { if (cmp_addr(&ret->pppoe_pa, sid, addr) && ret->pppoe_ifindex == ifindex) { *src = ret->next; break; } src = &ret->next; ret = ret->next; } } /********************************************************************** * * Set/get/delete/rehash items * **********************************************************************/ static inline struct pppox_sock *get_item(struct pppoe_net *pn, __be16 sid, unsigned char *addr, int ifindex) { struct pppox_sock *po; read_lock_bh(&pn->hash_lock); po = __get_item(pn, sid, addr, ifindex); if (po) sock_hold(sk_pppox(po)); read_unlock_bh(&pn->hash_lock); return po; } static inline struct pppox_sock *get_item_by_addr(struct net *net, struct sockaddr_pppox *sp) { struct net_device *dev; struct pppoe_net *pn; struct pppox_sock *pppox_sock = NULL; int ifindex; rcu_read_lock(); dev = dev_get_by_name_rcu(net, sp->sa_addr.pppoe.dev); if (dev) { ifindex = dev->ifindex; pn = pppoe_pernet(net); pppox_sock = get_item(pn, sp->sa_addr.pppoe.sid, sp->sa_addr.pppoe.remote, ifindex); } rcu_read_unlock(); return pppox_sock; } static inline void delete_item(struct pppoe_net *pn, __be16 sid, char *addr, int ifindex) { write_lock_bh(&pn->hash_lock); __delete_item(pn, sid, addr, ifindex); write_unlock_bh(&pn->hash_lock); } /*************************************************************************** * * Handler for device events. * Certain device events require that sockets be unconnected. * **************************************************************************/ static void pppoe_flush_dev(struct net_device *dev) { struct pppoe_net *pn; int i; pn = pppoe_pernet(dev_net(dev)); write_lock_bh(&pn->hash_lock); for (i = 0; i < PPPOE_HASH_SIZE; i++) { struct pppox_sock *po = pn->hash_table[i]; struct sock *sk; while (po) { while (po && po->pppoe_dev != dev) { po = po->next; } if (!po) break; sk = sk_pppox(po); /* We always grab the socket lock, followed by the * hash_lock, in that order. Since we should hold the * sock lock while doing any unbinding, we need to * release the lock we're holding. Hold a reference to * the sock so it doesn't disappear as we're jumping * between locks. */ sock_hold(sk); write_unlock_bh(&pn->hash_lock); lock_sock(sk); if (po->pppoe_dev == dev && sk->sk_state & (PPPOX_CONNECTED | PPPOX_BOUND)) { pppox_unbind_sock(sk); sk->sk_state_change(sk); po->pppoe_dev = NULL; dev_put(dev); } release_sock(sk); sock_put(sk); /* Restart the process from the start of the current * hash chain. We dropped locks so the world may have * change from underneath us. */ BUG_ON(pppoe_pernet(dev_net(dev)) == NULL); write_lock_bh(&pn->hash_lock); po = pn->hash_table[i]; } } write_unlock_bh(&pn->hash_lock); } static int pppoe_device_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); /* Only look at sockets that are using this specific device. */ switch (event) { case NETDEV_CHANGEADDR: case NETDEV_CHANGEMTU: /* A change in mtu or address is a bad thing, requiring * LCP re-negotiation. */ case NETDEV_GOING_DOWN: case NETDEV_DOWN: /* Find every socket on this device and kill it. */ pppoe_flush_dev(dev); break; default: break; } return NOTIFY_DONE; } static struct notifier_block pppoe_notifier = { .notifier_call = pppoe_device_event, }; /************************************************************************ * * Do the real work of receiving a PPPoE Session frame. * ***********************************************************************/ static int pppoe_rcv_core(struct sock *sk, struct sk_buff *skb) { struct pppox_sock *po = pppox_sk(sk); struct pppox_sock *relay_po; /* Backlog receive. Semantics of backlog rcv preclude any code from * executing in lock_sock()/release_sock() bounds; meaning sk->sk_state * can't change. */ if (skb->pkt_type == PACKET_OTHERHOST) goto abort_kfree; if (sk->sk_state & PPPOX_BOUND) { ppp_input(&po->chan, skb); } else if (sk->sk_state & PPPOX_RELAY) { relay_po = get_item_by_addr(sock_net(sk), &po->pppoe_relay); if (relay_po == NULL) goto abort_kfree; if ((sk_pppox(relay_po)->sk_state & PPPOX_CONNECTED) == 0) goto abort_put; if (!__pppoe_xmit(sk_pppox(relay_po), skb)) goto abort_put; sock_put(sk_pppox(relay_po)); } else { if (sock_queue_rcv_skb(sk, skb)) goto abort_kfree; } return NET_RX_SUCCESS; abort_put: sock_put(sk_pppox(relay_po)); abort_kfree: kfree_skb(skb); return NET_RX_DROP; } /************************************************************************ * * Receive wrapper called in BH context. * ***********************************************************************/ static int pppoe_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { struct pppoe_hdr *ph; struct pppox_sock *po; struct pppoe_net *pn; int len; skb = skb_share_check(skb, GFP_ATOMIC); if (!skb) goto out; if (skb_mac_header_len(skb) < ETH_HLEN) goto drop; if (!pskb_may_pull(skb, sizeof(struct pppoe_hdr))) goto drop; ph = pppoe_hdr(skb); len = ntohs(ph->length); skb_pull_rcsum(skb, sizeof(*ph)); if (skb->len < len) goto drop; if (pskb_trim_rcsum(skb, len)) goto drop; ph = pppoe_hdr(skb); pn = pppoe_pernet(dev_net(dev)); /* Note that get_item does a sock_hold(), so sk_pppox(po) * is known to be safe. */ po = get_item(pn, ph->sid, eth_hdr(skb)->h_source, dev->ifindex); if (!po) goto drop; return sk_receive_skb(sk_pppox(po), skb, 0); drop: kfree_skb(skb); out: return NET_RX_DROP; } static void pppoe_unbind_sock_work(struct work_struct *work) { struct pppox_sock *po = container_of(work, struct pppox_sock, proto.pppoe.padt_work); struct sock *sk = sk_pppox(po); lock_sock(sk); if (po->pppoe_dev) { dev_put(po->pppoe_dev); po->pppoe_dev = NULL; } pppox_unbind_sock(sk); release_sock(sk); sock_put(sk); } /************************************************************************ * * Receive a PPPoE Discovery frame. * This is solely for detection of PADT frames * ***********************************************************************/ static int pppoe_disc_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { struct pppoe_hdr *ph; struct pppox_sock *po; struct pppoe_net *pn; skb = skb_share_check(skb, GFP_ATOMIC); if (!skb) goto out; if (skb->pkt_type != PACKET_HOST) goto abort; if (!pskb_may_pull(skb, sizeof(struct pppoe_hdr))) goto abort; ph = pppoe_hdr(skb); if (ph->code != PADT_CODE) goto abort; pn = pppoe_pernet(dev_net(dev)); po = get_item(pn, ph->sid, eth_hdr(skb)->h_source, dev->ifindex); if (po) if (!schedule_work(&po->proto.pppoe.padt_work)) sock_put(sk_pppox(po)); abort: kfree_skb(skb); out: return NET_RX_SUCCESS; /* Lies... :-) */ } static struct packet_type pppoes_ptype __read_mostly = { .type = cpu_to_be16(ETH_P_PPP_SES), .func = pppoe_rcv, }; static struct packet_type pppoed_ptype __read_mostly = { .type = cpu_to_be16(ETH_P_PPP_DISC), .func = pppoe_disc_rcv, }; static struct proto pppoe_sk_proto __read_mostly = { .name = "PPPOE", .owner = THIS_MODULE, .obj_size = sizeof(struct pppox_sock), }; /*********************************************************************** * * Initialize a new struct sock. * **********************************************************************/ static int pppoe_create(struct net *net, struct socket *sock, int kern) { struct sock *sk; sk = sk_alloc(net, PF_PPPOX, GFP_KERNEL, &pppoe_sk_proto, kern); if (!sk) return -ENOMEM; sock_init_data(sock, sk); sock->state = SS_UNCONNECTED; sock->ops = &pppoe_ops; sk->sk_backlog_rcv = pppoe_rcv_core; sk->sk_state = PPPOX_NONE; sk->sk_type = SOCK_STREAM; sk->sk_family = PF_PPPOX; sk->sk_protocol = PX_PROTO_OE; INIT_WORK(&pppox_sk(sk)->proto.pppoe.padt_work, pppoe_unbind_sock_work); return 0; } static int pppoe_release(struct socket *sock) { struct sock *sk = sock->sk; struct pppox_sock *po; struct pppoe_net *pn; struct net *net = NULL; if (!sk) return 0; lock_sock(sk); if (sock_flag(sk, SOCK_DEAD)) { release_sock(sk); return -EBADF; } po = pppox_sk(sk); if (po->pppoe_dev) { dev_put(po->pppoe_dev); po->pppoe_dev = NULL; } pppox_unbind_sock(sk); /* Signal the death of the socket. */ sk->sk_state = PPPOX_DEAD; net = sock_net(sk); pn = pppoe_pernet(net); /* * protect "po" from concurrent updates * on pppoe_flush_dev */ delete_item(pn, po->pppoe_pa.sid, po->pppoe_pa.remote, po->pppoe_ifindex); sock_orphan(sk); sock->sk = NULL; skb_queue_purge(&sk->sk_receive_queue); release_sock(sk); sock_put(sk); return 0; } static int pppoe_connect(struct socket *sock, struct sockaddr *uservaddr, int sockaddr_len, int flags) { struct sock *sk = sock->sk; struct sockaddr_pppox *sp = (struct sockaddr_pppox *)uservaddr; struct pppox_sock *po = pppox_sk(sk); struct net_device *dev = NULL; struct pppoe_net *pn; struct net *net = NULL; int error; lock_sock(sk); error = -EINVAL; if (sockaddr_len != sizeof(struct sockaddr_pppox)) goto end; if (sp->sa_protocol != PX_PROTO_OE) goto end; /* Check for already bound sockets */ error = -EBUSY; if ((sk->sk_state & PPPOX_CONNECTED) && stage_session(sp->sa_addr.pppoe.sid)) goto end; /* Check for already disconnected sockets, on attempts to disconnect */ error = -EALREADY; if ((sk->sk_state & PPPOX_DEAD) && !stage_session(sp->sa_addr.pppoe.sid)) goto end; error = 0; /* Delete the old binding */ if (stage_session(po->pppoe_pa.sid)) { pppox_unbind_sock(sk); pn = pppoe_pernet(sock_net(sk)); delete_item(pn, po->pppoe_pa.sid, po->pppoe_pa.remote, po->pppoe_ifindex); if (po->pppoe_dev) { dev_put(po->pppoe_dev); po->pppoe_dev = NULL; } po->pppoe_ifindex = 0; memset(&po->pppoe_pa, 0, sizeof(po->pppoe_pa)); memset(&po->pppoe_relay, 0, sizeof(po->pppoe_relay)); memset(&po->chan, 0, sizeof(po->chan)); po->next = NULL; po->num = 0; sk->sk_state = PPPOX_NONE; } /* Re-bind in session stage only */ if (stage_session(sp->sa_addr.pppoe.sid)) { error = -ENODEV; net = sock_net(sk); dev = dev_get_by_name(net, sp->sa_addr.pppoe.dev); if (!dev) goto err_put; po->pppoe_dev = dev; po->pppoe_ifindex = dev->ifindex; pn = pppoe_pernet(net); if (!(dev->flags & IFF_UP)) { goto err_put; } memcpy(&po->pppoe_pa, &sp->sa_addr.pppoe, sizeof(struct pppoe_addr)); write_lock_bh(&pn->hash_lock); error = __set_item(pn, po); write_unlock_bh(&pn->hash_lock); if (error < 0) goto err_put; po->chan.hdrlen = (sizeof(struct pppoe_hdr) + dev->hard_header_len); po->chan.mtu = dev->mtu - sizeof(struct pppoe_hdr) - 2; po->chan.private = sk; po->chan.ops = &pppoe_chan_ops; error = ppp_register_net_channel(dev_net(dev), &po->chan); if (error) { delete_item(pn, po->pppoe_pa.sid, po->pppoe_pa.remote, po->pppoe_ifindex); goto err_put; } sk->sk_state = PPPOX_CONNECTED; } po->num = sp->sa_addr.pppoe.sid; end: release_sock(sk); return error; err_put: if (po->pppoe_dev) { dev_put(po->pppoe_dev); po->pppoe_dev = NULL; } goto end; } static int pppoe_getname(struct socket *sock, struct sockaddr *uaddr, int peer) { int len = sizeof(struct sockaddr_pppox); struct sockaddr_pppox sp; sp.sa_family = AF_PPPOX; sp.sa_protocol = PX_PROTO_OE; memcpy(&sp.sa_addr.pppoe, &pppox_sk(sock->sk)->pppoe_pa, sizeof(struct pppoe_addr)); memcpy(uaddr, &sp, len); return len; } static int pppoe_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { struct sock *sk = sock->sk; struct pppox_sock *po = pppox_sk(sk); int val; int err; switch (cmd) { case PPPIOCGMRU: err = -ENXIO; if (!(sk->sk_state & PPPOX_CONNECTED)) break; err = -EFAULT; if (put_user(po->pppoe_dev->mtu - sizeof(struct pppoe_hdr) - PPP_HDRLEN, (int __user *)arg)) break; err = 0; break; case PPPIOCSMRU: err = -ENXIO; if (!(sk->sk_state & PPPOX_CONNECTED)) break; err = -EFAULT; if (get_user(val, (int __user *)arg)) break; if (val < (po->pppoe_dev->mtu - sizeof(struct pppoe_hdr) - PPP_HDRLEN)) err = 0; else err = -EINVAL; break; case PPPIOCSFLAGS: err = -EFAULT; if (get_user(val, (int __user *)arg)) break; err = 0; break; case PPPOEIOCSFWD: { struct pppox_sock *relay_po; err = -EBUSY; if (sk->sk_state & (PPPOX_BOUND | PPPOX_DEAD)) break; err = -ENOTCONN; if (!(sk->sk_state & PPPOX_CONNECTED)) break; /* PPPoE address from the user specifies an outbound PPPoE address which frames are forwarded to */ err = -EFAULT; if (copy_from_user(&po->pppoe_relay, (void __user *)arg, sizeof(struct sockaddr_pppox))) break; err = -EINVAL; if (po->pppoe_relay.sa_family != AF_PPPOX || po->pppoe_relay.sa_protocol != PX_PROTO_OE) break; /* Check that the socket referenced by the address actually exists. */ relay_po = get_item_by_addr(sock_net(sk), &po->pppoe_relay); if (!relay_po) break; sock_put(sk_pppox(relay_po)); sk->sk_state |= PPPOX_RELAY; err = 0; break; } case PPPOEIOCDFWD: err = -EALREADY; if (!(sk->sk_state & PPPOX_RELAY)) break; sk->sk_state &= ~PPPOX_RELAY; err = 0; break; default: err = -ENOTTY; } return err; } static int pppoe_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len) { struct sk_buff *skb; struct sock *sk = sock->sk; struct pppox_sock *po = pppox_sk(sk); int error; struct pppoe_hdr hdr; struct pppoe_hdr *ph; struct net_device *dev; char *start; int hlen; lock_sock(sk); if (sock_flag(sk, SOCK_DEAD) || !(sk->sk_state & PPPOX_CONNECTED)) { error = -ENOTCONN; goto end; } hdr.ver = 1; hdr.type = 1; hdr.code = 0; hdr.sid = po->num; dev = po->pppoe_dev; error = -EMSGSIZE; if (total_len > (dev->mtu + dev->hard_header_len)) goto end; hlen = LL_RESERVED_SPACE(dev); skb = sock_wmalloc(sk, hlen + sizeof(*ph) + total_len + dev->needed_tailroom, 0, GFP_KERNEL); if (!skb) { error = -ENOMEM; goto end; } /* Reserve space for headers. */ skb_reserve(skb, hlen); skb_reset_network_header(skb); skb->dev = dev; skb->priority = READ_ONCE(sk->sk_priority); skb->protocol = cpu_to_be16(ETH_P_PPP_SES); ph = skb_put(skb, total_len + sizeof(struct pppoe_hdr)); start = (char *)&ph->tag[0]; error = memcpy_from_msg(start, m, total_len); if (error < 0) { kfree_skb(skb); goto end; } error = total_len; dev_hard_header(skb, dev, ETH_P_PPP_SES, po->pppoe_pa.remote, NULL, total_len); memcpy(ph, &hdr, sizeof(struct pppoe_hdr)); ph->length = htons(total_len); dev_queue_xmit(skb); end: release_sock(sk); return error; } /************************************************************************ * * xmit function for internal use. * ***********************************************************************/ static int __pppoe_xmit(struct sock *sk, struct sk_buff *skb) { struct pppox_sock *po = pppox_sk(sk); struct net_device *dev = po->pppoe_dev; struct pppoe_hdr *ph; int data_len = skb->len; /* The higher-level PPP code (ppp_unregister_channel()) ensures the PPP * xmit operations conclude prior to an unregistration call. Thus * sk->sk_state cannot change, so we don't need to do lock_sock(). * But, we also can't do a lock_sock since that introduces a potential * deadlock as we'd reverse the lock ordering used when calling * ppp_unregister_channel(). */ if (sock_flag(sk, SOCK_DEAD) || !(sk->sk_state & PPPOX_CONNECTED)) goto abort; if (!dev) goto abort; /* Copy the data if there is no space for the header or if it's * read-only. */ if (skb_cow_head(skb, LL_RESERVED_SPACE(dev) + sizeof(*ph))) goto abort; __skb_push(skb, sizeof(*ph)); skb_reset_network_header(skb); ph = pppoe_hdr(skb); ph->ver = 1; ph->type = 1; ph->code = 0; ph->sid = po->num; ph->length = htons(data_len); skb->protocol = cpu_to_be16(ETH_P_PPP_SES); skb->dev = dev; dev_hard_header(skb, dev, ETH_P_PPP_SES, po->pppoe_pa.remote, NULL, data_len); dev_queue_xmit(skb); return 1; abort: kfree_skb(skb); return 1; } /************************************************************************ * * xmit function called by generic PPP driver * sends PPP frame over PPPoE socket * ***********************************************************************/ static int pppoe_xmit(struct ppp_channel *chan, struct sk_buff *skb) { struct sock *sk = chan->private; return __pppoe_xmit(sk, skb); } static int pppoe_fill_forward_path(struct net_device_path_ctx *ctx, struct net_device_path *path, const struct ppp_channel *chan) { struct sock *sk = chan->private; struct pppox_sock *po = pppox_sk(sk); struct net_device *dev = po->pppoe_dev; if (sock_flag(sk, SOCK_DEAD) || !(sk->sk_state & PPPOX_CONNECTED) || !dev) return -1; path->type = DEV_PATH_PPPOE; path->encap.proto = htons(ETH_P_PPP_SES); path->encap.id = be16_to_cpu(po->num); memcpy(path->encap.h_dest, po->pppoe_pa.remote, ETH_ALEN); memcpy(ctx->daddr, po->pppoe_pa.remote, ETH_ALEN); path->dev = ctx->dev; ctx->dev = dev; return 0; } static const struct ppp_channel_ops pppoe_chan_ops = { .start_xmit = pppoe_xmit, .fill_forward_path = pppoe_fill_forward_path, }; static int pppoe_recvmsg(struct socket *sock, struct msghdr *m, size_t total_len, int flags) { struct sock *sk = sock->sk; struct sk_buff *skb; int error = 0; if (sk->sk_state & PPPOX_BOUND) { error = -EIO; goto end; } skb = skb_recv_datagram(sk, flags, &error); if (error < 0) goto end; if (skb) { total_len = min_t(size_t, total_len, skb->len); error = skb_copy_datagram_msg(skb, 0, m, total_len); if (error == 0) { consume_skb(skb); return total_len; } } kfree_skb(skb); end: return error; } #ifdef CONFIG_PROC_FS static int pppoe_seq_show(struct seq_file *seq, void *v) { struct pppox_sock *po; char *dev_name; if (v == SEQ_START_TOKEN) { seq_puts(seq, "Id Address Device\n"); goto out; } po = v; dev_name = po->pppoe_pa.dev; seq_printf(seq, "%08X %pM %8s\n", po->pppoe_pa.sid, po->pppoe_pa.remote, dev_name); out: return 0; } static inline struct pppox_sock *pppoe_get_idx(struct pppoe_net *pn, loff_t pos) { struct pppox_sock *po; int i; for (i = 0; i < PPPOE_HASH_SIZE; i++) { po = pn->hash_table[i]; while (po) { if (!pos--) goto out; po = po->next; } } out: return po; } static void *pppoe_seq_start(struct seq_file *seq, loff_t *pos) __acquires(pn->hash_lock) { struct pppoe_net *pn = pppoe_pernet(seq_file_net(seq)); loff_t l = *pos; read_lock_bh(&pn->hash_lock); return l ? pppoe_get_idx(pn, --l) : SEQ_START_TOKEN; } static void *pppoe_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct pppoe_net *pn = pppoe_pernet(seq_file_net(seq)); struct pppox_sock *po; ++*pos; if (v == SEQ_START_TOKEN) { po = pppoe_get_idx(pn, 0); goto out; } po = v; if (po->next) po = po->next; else { int hash = hash_item(po->pppoe_pa.sid, po->pppoe_pa.remote); po = NULL; while (++hash < PPPOE_HASH_SIZE) { po = pn->hash_table[hash]; if (po) break; } } out: return po; } static void pppoe_seq_stop(struct seq_file *seq, void *v) __releases(pn->hash_lock) { struct pppoe_net *pn = pppoe_pernet(seq_file_net(seq)); read_unlock_bh(&pn->hash_lock); } static const struct seq_operations pppoe_seq_ops = { .start = pppoe_seq_start, .next = pppoe_seq_next, .stop = pppoe_seq_stop, .show = pppoe_seq_show, }; #endif /* CONFIG_PROC_FS */ static const struct proto_ops pppoe_ops = { .family = AF_PPPOX, .owner = THIS_MODULE, .release = pppoe_release, .bind = sock_no_bind, .connect = pppoe_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = pppoe_getname, .poll = datagram_poll, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .sendmsg = pppoe_sendmsg, .recvmsg = pppoe_recvmsg, .mmap = sock_no_mmap, .ioctl = pppox_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = pppox_compat_ioctl, #endif }; static const struct pppox_proto pppoe_proto = { .create = pppoe_create, .ioctl = pppoe_ioctl, .owner = THIS_MODULE, }; static __net_init int pppoe_init_net(struct net *net) { struct pppoe_net *pn = pppoe_pernet(net); struct proc_dir_entry *pde; rwlock_init(&pn->hash_lock); pde = proc_create_net("pppoe", 0444, net->proc_net, &pppoe_seq_ops, sizeof(struct seq_net_private)); #ifdef CONFIG_PROC_FS if (!pde) return -ENOMEM; #endif return 0; } static __net_exit void pppoe_exit_net(struct net *net) { remove_proc_entry("pppoe", net->proc_net); } static struct pernet_operations pppoe_net_ops = { .init = pppoe_init_net, .exit = pppoe_exit_net, .id = &pppoe_net_id, .size = sizeof(struct pppoe_net), }; static int __init pppoe_init(void) { int err; err = register_pernet_device(&pppoe_net_ops); if (err) goto out; err = proto_register(&pppoe_sk_proto, 0); if (err) goto out_unregister_net_ops; err = register_pppox_proto(PX_PROTO_OE, &pppoe_proto); if (err) goto out_unregister_pppoe_proto; dev_add_pack(&pppoes_ptype); dev_add_pack(&pppoed_ptype); register_netdevice_notifier(&pppoe_notifier); return 0; out_unregister_pppoe_proto: proto_unregister(&pppoe_sk_proto); out_unregister_net_ops: unregister_pernet_device(&pppoe_net_ops); out: return err; } static void __exit pppoe_exit(void) { unregister_netdevice_notifier(&pppoe_notifier); dev_remove_pack(&pppoed_ptype); dev_remove_pack(&pppoes_ptype); unregister_pppox_proto(PX_PROTO_OE); proto_unregister(&pppoe_sk_proto); unregister_pernet_device(&pppoe_net_ops); } module_init(pppoe_init); module_exit(pppoe_exit); MODULE_AUTHOR("Michal Ostrowski <mostrows@speakeasy.net>"); MODULE_DESCRIPTION("PPP over Ethernet driver"); MODULE_LICENSE("GPL"); MODULE_ALIAS_NET_PF_PROTO(PF_PPPOX, PX_PROTO_OE); |
| 2225 1148 1148 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_MM_TYPES_H #define _LINUX_MM_TYPES_H #include <linux/mm_types_task.h> #include <linux/auxvec.h> #include <linux/kref.h> #include <linux/list.h> #include <linux/spinlock.h> #include <linux/rbtree.h> #include <linux/maple_tree.h> #include <linux/rwsem.h> #include <linux/completion.h> #include <linux/cpumask.h> #include <linux/uprobes.h> #include <linux/rcupdate.h> #include <linux/page-flags-layout.h> #include <linux/workqueue.h> #include <linux/seqlock.h> #include <linux/percpu_counter.h> #include <asm/mmu.h> #ifndef AT_VECTOR_SIZE_ARCH #define AT_VECTOR_SIZE_ARCH 0 #endif #define AT_VECTOR_SIZE (2*(AT_VECTOR_SIZE_ARCH + AT_VECTOR_SIZE_BASE + 1)) #define INIT_PASID 0 struct address_space; struct mem_cgroup; /* * Each physical page in the system has a struct page associated with * it to keep track of whatever it is we are using the page for at the * moment. Note that we have no way to track which tasks are using * a page, though if it is a pagecache page, rmap structures can tell us * who is mapping it. * * If you allocate the page using alloc_pages(), you can use some of the * space in struct page for your own purposes. The five words in the main * union are available, except for bit 0 of the first word which must be * kept clear. Many users use this word to store a pointer to an object * which is guaranteed to be aligned. If you use the same storage as * page->mapping, you must restore it to NULL before freeing the page. * * If your page will not be mapped to userspace, you can also use the four * bytes in the mapcount union, but you must call page_mapcount_reset() * before freeing it. * * If you want to use the refcount field, it must be used in such a way * that other CPUs temporarily incrementing and then decrementing the * refcount does not cause problems. On receiving the page from * alloc_pages(), the refcount will be positive. * * If you allocate pages of order > 0, you can use some of the fields * in each subpage, but you may need to restore some of their values * afterwards. * * SLUB uses cmpxchg_double() to atomically update its freelist and counters. * That requires that freelist & counters in struct slab be adjacent and * double-word aligned. Because struct slab currently just reinterprets the * bits of struct page, we align all struct pages to double-word boundaries, * and ensure that 'freelist' is aligned within struct slab. */ #ifdef CONFIG_HAVE_ALIGNED_STRUCT_PAGE #define _struct_page_alignment __aligned(2 * sizeof(unsigned long)) #else #define _struct_page_alignment __aligned(sizeof(unsigned long)) #endif struct page { unsigned long flags; /* Atomic flags, some possibly * updated asynchronously */ /* * Five words (20/40 bytes) are available in this union. * WARNING: bit 0 of the first word is used for PageTail(). That * means the other users of this union MUST NOT use the bit to * avoid collision and false-positive PageTail(). */ union { struct { /* Page cache and anonymous pages */ /** * @lru: Pageout list, eg. active_list protected by * lruvec->lru_lock. Sometimes used as a generic list * by the page owner. */ union { struct list_head lru; /* Or, for the Unevictable "LRU list" slot */ struct { /* Always even, to negate PageTail */ void *__filler; /* Count page's or folio's mlocks */ unsigned int mlock_count; }; /* Or, free page */ struct list_head buddy_list; struct list_head pcp_list; }; /* See page-flags.h for PAGE_MAPPING_FLAGS */ struct address_space *mapping; union { pgoff_t index; /* Our offset within mapping. */ unsigned long share; /* share count for fsdax */ }; /** * @private: Mapping-private opaque data. * Usually used for buffer_heads if PagePrivate. * Used for swp_entry_t if PageSwapCache. * Indicates order in the buddy system if PageBuddy. */ unsigned long private; }; struct { /* page_pool used by netstack */ /** * @pp_magic: magic value to avoid recycling non * page_pool allocated pages. */ unsigned long pp_magic; struct page_pool *pp; unsigned long _pp_mapping_pad; unsigned long dma_addr; atomic_long_t pp_ref_count; }; struct { /* Tail pages of compound page */ unsigned long compound_head; /* Bit zero is set */ }; struct { /* ZONE_DEVICE pages */ /** @pgmap: Points to the hosting device page map. */ struct dev_pagemap *pgmap; void *zone_device_data; /* * ZONE_DEVICE private pages are counted as being * mapped so the next 3 words hold the mapping, index, * and private fields from the source anonymous or * page cache page while the page is migrated to device * private memory. * ZONE_DEVICE MEMORY_DEVICE_FS_DAX pages also * use the mapping, index, and private fields when * pmem backed DAX files are mapped. */ }; /** @rcu_head: You can use this to free a page by RCU. */ struct rcu_head rcu_head; }; union { /* This union is 4 bytes in size. */ /* * If the page can be mapped to userspace, encodes the number * of times this page is referenced by a page table. */ atomic_t _mapcount; /* * If the page is neither PageSlab nor mappable to userspace, * the value stored here may help determine what this page * is used for. See page-flags.h for a list of page types * which are currently stored here. */ unsigned int page_type; }; /* Usage count. *DO NOT USE DIRECTLY*. See page_ref.h */ atomic_t _refcount; #ifdef CONFIG_MEMCG unsigned long memcg_data; #endif /* * On machines where all RAM is mapped into kernel address space, * we can simply calculate the virtual address. On machines with * highmem some memory is mapped into kernel virtual memory * dynamically, so we need a place to store that address. * Note that this field could be 16 bits on x86 ... ;) * * Architectures with slow multiplication can define * WANT_PAGE_VIRTUAL in asm/page.h */ #if defined(WANT_PAGE_VIRTUAL) void *virtual; /* Kernel virtual address (NULL if not kmapped, ie. highmem) */ #endif /* WANT_PAGE_VIRTUAL */ #ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS int _last_cpupid; #endif #ifdef CONFIG_KMSAN /* * KMSAN metadata for this page: * - shadow page: every bit indicates whether the corresponding * bit of the original page is initialized (0) or not (1); * - origin page: every 4 bytes contain an id of the stack trace * where the uninitialized value was created. */ struct page *kmsan_shadow; struct page *kmsan_origin; #endif } _struct_page_alignment; /* * struct encoded_page - a nonexistent type marking this pointer * * An 'encoded_page' pointer is a pointer to a regular 'struct page', but * with the low bits of the pointer indicating extra context-dependent * information. Not super-common, but happens in mmu_gather and mlock * handling, and this acts as a type system check on that use. * * We only really have two guaranteed bits in general, although you could * play with 'struct page' alignment (see CONFIG_HAVE_ALIGNED_STRUCT_PAGE) * for more. * * Use the supplied helper functions to endcode/decode the pointer and bits. */ struct encoded_page; #define ENCODE_PAGE_BITS 3ul static __always_inline struct encoded_page *encode_page(struct page *page, unsigned long flags) { BUILD_BUG_ON(flags > ENCODE_PAGE_BITS); return (struct encoded_page *)(flags | (unsigned long)page); } static inline unsigned long encoded_page_flags(struct encoded_page *page) { return ENCODE_PAGE_BITS & (unsigned long)page; } static inline struct page *encoded_page_ptr(struct encoded_page *page) { return (struct page *)(~ENCODE_PAGE_BITS & (unsigned long)page); } /* * A swap entry has to fit into a "unsigned long", as the entry is hidden * in the "index" field of the swapper address space. */ typedef struct { unsigned long val; } swp_entry_t; /** * struct folio - Represents a contiguous set of bytes. * @flags: Identical to the page flags. * @lru: Least Recently Used list; tracks how recently this folio was used. * @mlock_count: Number of times this folio has been pinned by mlock(). * @mapping: The file this page belongs to, or refers to the anon_vma for * anonymous memory. * @index: Offset within the file, in units of pages. For anonymous memory, * this is the index from the beginning of the mmap. * @private: Filesystem per-folio data (see folio_attach_private()). * @swap: Used for swp_entry_t if folio_test_swapcache(). * @_mapcount: Do not access this member directly. Use folio_mapcount() to * find out how many times this folio is mapped by userspace. * @_refcount: Do not access this member directly. Use folio_ref_count() * to find how many references there are to this folio. * @memcg_data: Memory Control Group data. * @virtual: Virtual address in the kernel direct map. * @_last_cpupid: IDs of last CPU and last process that accessed the folio. * @_entire_mapcount: Do not use directly, call folio_entire_mapcount(). * @_nr_pages_mapped: Do not use directly, call folio_mapcount(). * @_pincount: Do not use directly, call folio_maybe_dma_pinned(). * @_folio_nr_pages: Do not use directly, call folio_nr_pages(). * @_hugetlb_subpool: Do not use directly, use accessor in hugetlb.h. * @_hugetlb_cgroup: Do not use directly, use accessor in hugetlb_cgroup.h. * @_hugetlb_cgroup_rsvd: Do not use directly, use accessor in hugetlb_cgroup.h. * @_hugetlb_hwpoison: Do not use directly, call raw_hwp_list_head(). * @_deferred_list: Folios to be split under memory pressure. * * A folio is a physically, virtually and logically contiguous set * of bytes. It is a power-of-two in size, and it is aligned to that * same power-of-two. It is at least as large as %PAGE_SIZE. If it is * in the page cache, it is at a file offset which is a multiple of that * power-of-two. It may be mapped into userspace at an address which is * at an arbitrary page offset, but its kernel virtual address is aligned * to its size. */ struct folio { /* private: don't document the anon union */ union { struct { /* public: */ unsigned long flags; union { struct list_head lru; /* private: avoid cluttering the output */ struct { void *__filler; /* public: */ unsigned int mlock_count; /* private: */ }; /* public: */ }; struct address_space *mapping; pgoff_t index; union { void *private; swp_entry_t swap; }; atomic_t _mapcount; atomic_t _refcount; #ifdef CONFIG_MEMCG unsigned long memcg_data; #endif #if defined(WANT_PAGE_VIRTUAL) void *virtual; #endif #ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS int _last_cpupid; #endif /* private: the union with struct page is transitional */ }; struct page page; }; union { struct { unsigned long _flags_1; unsigned long _head_1; unsigned long _folio_avail; /* public: */ atomic_t _entire_mapcount; atomic_t _nr_pages_mapped; atomic_t _pincount; #ifdef CONFIG_64BIT unsigned int _folio_nr_pages; #endif /* private: the union with struct page is transitional */ }; struct page __page_1; }; union { struct { unsigned long _flags_2; unsigned long _head_2; /* public: */ void *_hugetlb_subpool; void *_hugetlb_cgroup; void *_hugetlb_cgroup_rsvd; void *_hugetlb_hwpoison; /* private: the union with struct page is transitional */ }; struct { unsigned long _flags_2a; unsigned long _head_2a; /* public: */ struct list_head _deferred_list; /* private: the union with struct page is transitional */ }; struct page __page_2; }; }; #define FOLIO_MATCH(pg, fl) \ static_assert(offsetof(struct page, pg) == offsetof(struct folio, fl)) FOLIO_MATCH(flags, flags); FOLIO_MATCH(lru, lru); FOLIO_MATCH(mapping, mapping); FOLIO_MATCH(compound_head, lru); FOLIO_MATCH(index, index); FOLIO_MATCH(private, private); FOLIO_MATCH(_mapcount, _mapcount); FOLIO_MATCH(_refcount, _refcount); #ifdef CONFIG_MEMCG FOLIO_MATCH(memcg_data, memcg_data); #endif #if defined(WANT_PAGE_VIRTUAL) FOLIO_MATCH(virtual, virtual); #endif #ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS FOLIO_MATCH(_last_cpupid, _last_cpupid); #endif #undef FOLIO_MATCH #define FOLIO_MATCH(pg, fl) \ static_assert(offsetof(struct folio, fl) == \ offsetof(struct page, pg) + sizeof(struct page)) FOLIO_MATCH(flags, _flags_1); FOLIO_MATCH(compound_head, _head_1); #undef FOLIO_MATCH #define FOLIO_MATCH(pg, fl) \ static_assert(offsetof(struct folio, fl) == \ offsetof(struct page, pg) + 2 * sizeof(struct page)) FOLIO_MATCH(flags, _flags_2); FOLIO_MATCH(compound_head, _head_2); FOLIO_MATCH(flags, _flags_2a); FOLIO_MATCH(compound_head, _head_2a); #undef FOLIO_MATCH /** * struct ptdesc - Memory descriptor for page tables. * @__page_flags: Same as page flags. Unused for page tables. * @pt_rcu_head: For freeing page table pages. * @pt_list: List of used page tables. Used for s390 and x86. * @_pt_pad_1: Padding that aliases with page's compound head. * @pmd_huge_pte: Protected by ptdesc->ptl, used for THPs. * @__page_mapping: Aliases with page->mapping. Unused for page tables. * @pt_mm: Used for x86 pgds. * @pt_frag_refcount: For fragmented page table tracking. Powerpc only. * @_pt_pad_2: Padding to ensure proper alignment. * @ptl: Lock for the page table. * @__page_type: Same as page->page_type. Unused for page tables. * @__page_refcount: Same as page refcount. * @pt_memcg_data: Memcg data. Tracked for page tables here. * * This struct overlays struct page for now. Do not modify without a good * understanding of the issues. */ struct ptdesc { unsigned long __page_flags; union { struct rcu_head pt_rcu_head; struct list_head pt_list; struct { unsigned long _pt_pad_1; pgtable_t pmd_huge_pte; }; }; unsigned long __page_mapping; union { struct mm_struct *pt_mm; atomic_t pt_frag_refcount; }; union { unsigned long _pt_pad_2; #if ALLOC_SPLIT_PTLOCKS spinlock_t *ptl; #else spinlock_t ptl; #endif }; unsigned int __page_type; atomic_t __page_refcount; #ifdef CONFIG_MEMCG unsigned long pt_memcg_data; #endif }; #define TABLE_MATCH(pg, pt) \ static_assert(offsetof(struct page, pg) == offsetof(struct ptdesc, pt)) TABLE_MATCH(flags, __page_flags); TABLE_MATCH(compound_head, pt_list); TABLE_MATCH(compound_head, _pt_pad_1); TABLE_MATCH(mapping, __page_mapping); TABLE_MATCH(rcu_head, pt_rcu_head); TABLE_MATCH(page_type, __page_type); TABLE_MATCH(_refcount, __page_refcount); #ifdef CONFIG_MEMCG TABLE_MATCH(memcg_data, pt_memcg_data); #endif #undef TABLE_MATCH static_assert(sizeof(struct ptdesc) <= sizeof(struct page)); #define ptdesc_page(pt) (_Generic((pt), \ const struct ptdesc *: (const struct page *)(pt), \ struct ptdesc *: (struct page *)(pt))) #define ptdesc_folio(pt) (_Generic((pt), \ const struct ptdesc *: (const struct folio *)(pt), \ struct ptdesc *: (struct folio *)(pt))) #define page_ptdesc(p) (_Generic((p), \ const struct page *: (const struct ptdesc *)(p), \ struct page *: (struct ptdesc *)(p))) /* * Used for sizing the vmemmap region on some architectures */ #define STRUCT_PAGE_MAX_SHIFT (order_base_2(sizeof(struct page))) #define PAGE_FRAG_CACHE_MAX_SIZE __ALIGN_MASK(32768, ~PAGE_MASK) #define PAGE_FRAG_CACHE_MAX_ORDER get_order(PAGE_FRAG_CACHE_MAX_SIZE) /* * page_private can be used on tail pages. However, PagePrivate is only * checked by the VM on the head page. So page_private on the tail pages * should be used for data that's ancillary to the head page (eg attaching * buffer heads to tail pages after attaching buffer heads to the head page) */ #define page_private(page) ((page)->private) static inline void set_page_private(struct page *page, unsigned long private) { page->private = private; } static inline void *folio_get_private(struct folio *folio) { return folio->private; } struct page_frag_cache { void * va; #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) __u16 offset; __u16 size; #else __u32 offset; #endif /* we maintain a pagecount bias, so that we dont dirty cache line * containing page->_refcount every time we allocate a fragment. */ unsigned int pagecnt_bias; bool pfmemalloc; }; typedef unsigned long vm_flags_t; /* * A region containing a mapping of a non-memory backed file under NOMMU * conditions. These are held in a global tree and are pinned by the VMAs that * map parts of them. */ struct vm_region { struct rb_node vm_rb; /* link in global region tree */ vm_flags_t vm_flags; /* VMA vm_flags */ unsigned long vm_start; /* start address of region */ unsigned long vm_end; /* region initialised to here */ unsigned long vm_top; /* region allocated to here */ unsigned long vm_pgoff; /* the offset in vm_file corresponding to vm_start */ struct file *vm_file; /* the backing file or NULL */ int vm_usage; /* region usage count (access under nommu_region_sem) */ bool vm_icache_flushed : 1; /* true if the icache has been flushed for * this region */ }; #ifdef CONFIG_USERFAULTFD #define NULL_VM_UFFD_CTX ((struct vm_userfaultfd_ctx) { NULL, }) struct vm_userfaultfd_ctx { struct userfaultfd_ctx *ctx; }; #else /* CONFIG_USERFAULTFD */ #define NULL_VM_UFFD_CTX ((struct vm_userfaultfd_ctx) {}) struct vm_userfaultfd_ctx {}; #endif /* CONFIG_USERFAULTFD */ struct anon_vma_name { struct kref kref; /* The name needs to be at the end because it is dynamically sized. */ char name[]; }; #ifdef CONFIG_ANON_VMA_NAME /* * mmap_lock should be read-locked when calling anon_vma_name(). Caller should * either keep holding the lock while using the returned pointer or it should * raise anon_vma_name refcount before releasing the lock. */ struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma); struct anon_vma_name *anon_vma_name_alloc(const char *name); void anon_vma_name_free(struct kref *kref); #else /* CONFIG_ANON_VMA_NAME */ static inline struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma) { return NULL; } static inline struct anon_vma_name *anon_vma_name_alloc(const char *name) { return NULL; } #endif struct vma_lock { struct rw_semaphore lock; }; struct vma_numab_state { /* * Initialised as time in 'jiffies' after which VMA * should be scanned. Delays first scan of new VMA by at * least sysctl_numa_balancing_scan_delay: */ unsigned long next_scan; /* * Time in jiffies when pids_active[] is reset to * detect phase change behaviour: */ unsigned long pids_active_reset; /* * Approximate tracking of PIDs that trapped a NUMA hinting * fault. May produce false positives due to hash collisions. * * [0] Previous PID tracking * [1] Current PID tracking * * Window moves after next_pid_reset has expired approximately * every VMA_PID_RESET_PERIOD jiffies: */ unsigned long pids_active[2]; /* MM scan sequence ID when scan first started after VMA creation */ int start_scan_seq; /* * MM scan sequence ID when the VMA was last completely scanned. * A VMA is not eligible for scanning if prev_scan_seq == numa_scan_seq */ int prev_scan_seq; }; /* * This struct describes a virtual memory area. There is one of these * per VM-area/task. A VM area is any part of the process virtual memory * space that has a special rule for the page-fault handlers (ie a shared * library, the executable area etc). */ struct vm_area_struct { /* The first cache line has the info for VMA tree walking. */ union { struct { /* VMA covers [vm_start; vm_end) addresses within mm */ unsigned long vm_start; unsigned long vm_end; }; #ifdef CONFIG_PER_VMA_LOCK struct rcu_head vm_rcu; /* Used for deferred freeing. */ #endif }; struct mm_struct *vm_mm; /* The address space we belong to. */ pgprot_t vm_page_prot; /* Access permissions of this VMA. */ /* * Flags, see mm.h. * To modify use vm_flags_{init|reset|set|clear|mod} functions. */ union { const vm_flags_t vm_flags; vm_flags_t __private __vm_flags; }; #ifdef CONFIG_PER_VMA_LOCK /* * Can only be written (using WRITE_ONCE()) while holding both: * - mmap_lock (in write mode) * - vm_lock->lock (in write mode) * Can be read reliably while holding one of: * - mmap_lock (in read or write mode) * - vm_lock->lock (in read or write mode) * Can be read unreliably (using READ_ONCE()) for pessimistic bailout * while holding nothing (except RCU to keep the VMA struct allocated). * * This sequence counter is explicitly allowed to overflow; sequence * counter reuse can only lead to occasional unnecessary use of the * slowpath. */ int vm_lock_seq; struct vma_lock *vm_lock; /* Flag to indicate areas detached from the mm->mm_mt tree */ bool detached; #endif /* * For areas with an address space and backing store, * linkage into the address_space->i_mmap interval tree. * */ struct { struct rb_node rb; unsigned long rb_subtree_last; } shared; /* * A file's MAP_PRIVATE vma can be in both i_mmap tree and anon_vma * list, after a COW of one of the file pages. A MAP_SHARED vma * can only be in the i_mmap tree. An anonymous MAP_PRIVATE, stack * or brk vma (with NULL file) can only be in an anon_vma list. */ struct list_head anon_vma_chain; /* Serialized by mmap_lock & * page_table_lock */ struct anon_vma *anon_vma; /* Serialized by page_table_lock */ /* Function pointers to deal with this struct. */ const struct vm_operations_struct *vm_ops; /* Information about our backing store: */ unsigned long vm_pgoff; /* Offset (within vm_file) in PAGE_SIZE units */ struct file * vm_file; /* File we map to (can be NULL). */ void * vm_private_data; /* was vm_pte (shared mem) */ #ifdef CONFIG_ANON_VMA_NAME /* * For private and shared anonymous mappings, a pointer to a null * terminated string containing the name given to the vma, or NULL if * unnamed. Serialized by mmap_lock. Use anon_vma_name to access. */ struct anon_vma_name *anon_name; #endif #ifdef CONFIG_SWAP atomic_long_t swap_readahead_info; #endif #ifndef CONFIG_MMU struct vm_region *vm_region; /* NOMMU mapping region */ #endif #ifdef CONFIG_NUMA struct mempolicy *vm_policy; /* NUMA policy for the VMA */ #endif #ifdef CONFIG_NUMA_BALANCING struct vma_numab_state *numab_state; /* NUMA Balancing state */ #endif struct vm_userfaultfd_ctx vm_userfaultfd_ctx; } __randomize_layout; #ifdef CONFIG_NUMA #define vma_policy(vma) ((vma)->vm_policy) #else #define vma_policy(vma) NULL #endif #ifdef CONFIG_SCHED_MM_CID struct mm_cid { u64 time; int cid; }; #endif struct kioctx_table; struct mm_struct { struct { /* * Fields which are often written to are placed in a separate * cache line. */ struct { /** * @mm_count: The number of references to &struct * mm_struct (@mm_users count as 1). * * Use mmgrab()/mmdrop() to modify. When this drops to * 0, the &struct mm_struct is freed. */ atomic_t mm_count; } ____cacheline_aligned_in_smp; struct maple_tree mm_mt; #ifdef CONFIG_MMU unsigned long (*get_unmapped_area) (struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags); #endif unsigned long mmap_base; /* base of mmap area */ unsigned long mmap_legacy_base; /* base of mmap area in bottom-up allocations */ #ifdef CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES /* Base addresses for compatible mmap() */ unsigned long mmap_compat_base; unsigned long mmap_compat_legacy_base; #endif unsigned long task_size; /* size of task vm space */ pgd_t * pgd; #ifdef CONFIG_MEMBARRIER /** * @membarrier_state: Flags controlling membarrier behavior. * * This field is close to @pgd to hopefully fit in the same * cache-line, which needs to be touched by switch_mm(). */ atomic_t membarrier_state; #endif /** * @mm_users: The number of users including userspace. * * Use mmget()/mmget_not_zero()/mmput() to modify. When this * drops to 0 (i.e. when the task exits and there are no other * temporary reference holders), we also release a reference on * @mm_count (which may then free the &struct mm_struct if * @mm_count also drops to 0). */ atomic_t mm_users; #ifdef CONFIG_SCHED_MM_CID /** * @pcpu_cid: Per-cpu current cid. * * Keep track of the currently allocated mm_cid for each cpu. * The per-cpu mm_cid values are serialized by their respective * runqueue locks. */ struct mm_cid __percpu *pcpu_cid; /* * @mm_cid_next_scan: Next mm_cid scan (in jiffies). * * When the next mm_cid scan is due (in jiffies). */ unsigned long mm_cid_next_scan; #endif #ifdef CONFIG_MMU atomic_long_t pgtables_bytes; /* size of all page tables */ #endif int map_count; /* number of VMAs */ spinlock_t page_table_lock; /* Protects page tables and some * counters */ /* * With some kernel config, the current mmap_lock's offset * inside 'mm_struct' is at 0x120, which is very optimal, as * its two hot fields 'count' and 'owner' sit in 2 different * cachelines, and when mmap_lock is highly contended, both * of the 2 fields will be accessed frequently, current layout * will help to reduce cache bouncing. * * So please be careful with adding new fields before * mmap_lock, which can easily push the 2 fields into one * cacheline. */ struct rw_semaphore mmap_lock; struct list_head mmlist; /* List of maybe swapped mm's. These * are globally strung together off * init_mm.mmlist, and are protected * by mmlist_lock */ #ifdef CONFIG_PER_VMA_LOCK /* * This field has lock-like semantics, meaning it is sometimes * accessed with ACQUIRE/RELEASE semantics. * Roughly speaking, incrementing the sequence number is * equivalent to releasing locks on VMAs; reading the sequence * number can be part of taking a read lock on a VMA. * * Can be modified under write mmap_lock using RELEASE * semantics. * Can be read with no other protection when holding write * mmap_lock. * Can be read with ACQUIRE semantics if not holding write * mmap_lock. */ int mm_lock_seq; #endif unsigned long hiwater_rss; /* High-watermark of RSS usage */ unsigned long hiwater_vm; /* High-water virtual memory usage */ unsigned long total_vm; /* Total pages mapped */ unsigned long locked_vm; /* Pages that have PG_mlocked set */ atomic64_t pinned_vm; /* Refcount permanently increased */ unsigned long data_vm; /* VM_WRITE & ~VM_SHARED & ~VM_STACK */ unsigned long exec_vm; /* VM_EXEC & ~VM_WRITE & ~VM_STACK */ unsigned long stack_vm; /* VM_STACK */ unsigned long def_flags; /** * @write_protect_seq: Locked when any thread is write * protecting pages mapped by this mm to enforce a later COW, * for instance during page table copying for fork(). */ seqcount_t write_protect_seq; spinlock_t arg_lock; /* protect the below fields */ unsigned long start_code, end_code, start_data, end_data; unsigned long start_brk, brk, start_stack; unsigned long arg_start, arg_end, env_start, env_end; unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */ struct percpu_counter rss_stat[NR_MM_COUNTERS]; struct linux_binfmt *binfmt; /* Architecture-specific MM context */ mm_context_t context; unsigned long flags; /* Must use atomic bitops to access */ #ifdef CONFIG_AIO spinlock_t ioctx_lock; struct kioctx_table __rcu *ioctx_table; #endif #ifdef CONFIG_MEMCG /* * "owner" points to a task that is regarded as the canonical * user/owner of this mm. All of the following must be true in * order for it to be changed: * * current == mm->owner * current->mm != mm * new_owner->mm == mm * new_owner->alloc_lock is held */ struct task_struct __rcu *owner; #endif struct user_namespace *user_ns; /* store ref to file /proc/<pid>/exe symlink points to */ struct file __rcu *exe_file; #ifdef CONFIG_MMU_NOTIFIER struct mmu_notifier_subscriptions *notifier_subscriptions; #endif #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS pgtable_t pmd_huge_pte; /* protected by page_table_lock */ #endif #ifdef CONFIG_NUMA_BALANCING /* * numa_next_scan is the next time that PTEs will be remapped * PROT_NONE to trigger NUMA hinting faults; such faults gather * statistics and migrate pages to new nodes if necessary. */ unsigned long numa_next_scan; /* Restart point for scanning and remapping PTEs. */ unsigned long numa_scan_offset; /* numa_scan_seq prevents two threads remapping PTEs. */ int numa_scan_seq; #endif /* * An operation with batched TLB flushing is going on. Anything * that can move process memory needs to flush the TLB when * moving a PROT_NONE mapped page. */ atomic_t tlb_flush_pending; #ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH /* See flush_tlb_batched_pending() */ atomic_t tlb_flush_batched; #endif struct uprobes_state uprobes_state; #ifdef CONFIG_PREEMPT_RT struct rcu_head delayed_drop; #endif #ifdef CONFIG_HUGETLB_PAGE atomic_long_t hugetlb_usage; #endif struct work_struct async_put_work; #ifdef CONFIG_IOMMU_SVA u32 pasid; #endif #ifdef CONFIG_KSM /* * Represent how many pages of this process are involved in KSM * merging (not including ksm_zero_pages). */ unsigned long ksm_merging_pages; /* * Represent how many pages are checked for ksm merging * including merged and not merged. */ unsigned long ksm_rmap_items; /* * Represent how many empty pages are merged with kernel zero * pages when enabling KSM use_zero_pages. */ unsigned long ksm_zero_pages; #endif /* CONFIG_KSM */ #ifdef CONFIG_LRU_GEN_WALKS_MMU struct { /* this mm_struct is on lru_gen_mm_list */ struct list_head list; /* * Set when switching to this mm_struct, as a hint of * whether it has been used since the last time per-node * page table walkers cleared the corresponding bits. */ unsigned long bitmap; #ifdef CONFIG_MEMCG /* points to the memcg of "owner" above */ struct mem_cgroup *memcg; #endif } lru_gen; #endif /* CONFIG_LRU_GEN_WALKS_MMU */ } __randomize_layout; /* * The mm_cpumask needs to be at the end of mm_struct, because it * is dynamically sized based on nr_cpu_ids. */ unsigned long cpu_bitmap[]; }; #define MM_MT_FLAGS (MT_FLAGS_ALLOC_RANGE | MT_FLAGS_LOCK_EXTERN | \ MT_FLAGS_USE_RCU) extern struct mm_struct init_mm; /* Pointer magic because the dynamic array size confuses some compilers. */ static inline void mm_init_cpumask(struct mm_struct *mm) { unsigned long cpu_bitmap = (unsigned long)mm; cpu_bitmap += offsetof(struct mm_struct, cpu_bitmap); cpumask_clear((struct cpumask *)cpu_bitmap); } /* Future-safe accessor for struct mm_struct's cpu_vm_mask. */ static inline cpumask_t *mm_cpumask(struct mm_struct *mm) { return (struct cpumask *)&mm->cpu_bitmap; } #ifdef CONFIG_LRU_GEN struct lru_gen_mm_list { /* mm_struct list for page table walkers */ struct list_head fifo; /* protects the list above */ spinlock_t lock; }; #endif /* CONFIG_LRU_GEN */ #ifdef CONFIG_LRU_GEN_WALKS_MMU void lru_gen_add_mm(struct mm_struct *mm); void lru_gen_del_mm(struct mm_struct *mm); void lru_gen_migrate_mm(struct mm_struct *mm); static inline void lru_gen_init_mm(struct mm_struct *mm) { INIT_LIST_HEAD(&mm->lru_gen.list); mm->lru_gen.bitmap = 0; #ifdef CONFIG_MEMCG mm->lru_gen.memcg = NULL; #endif } static inline void lru_gen_use_mm(struct mm_struct *mm) { /* * When the bitmap is set, page reclaim knows this mm_struct has been * used since the last time it cleared the bitmap. So it might be worth * walking the page tables of this mm_struct to clear the accessed bit. */ WRITE_ONCE(mm->lru_gen.bitmap, -1); } #else /* !CONFIG_LRU_GEN_WALKS_MMU */ static inline void lru_gen_add_mm(struct mm_struct *mm) { } static inline void lru_gen_del_mm(struct mm_struct *mm) { } static inline void lru_gen_migrate_mm(struct mm_struct *mm) { } static inline void lru_gen_init_mm(struct mm_struct *mm) { } static inline void lru_gen_use_mm(struct mm_struct *mm) { } #endif /* CONFIG_LRU_GEN_WALKS_MMU */ struct vma_iterator { struct ma_state mas; }; #define VMA_ITERATOR(name, __mm, __addr) \ struct vma_iterator name = { \ .mas = { \ .tree = &(__mm)->mm_mt, \ .index = __addr, \ .node = NULL, \ .status = ma_start, \ }, \ } static inline void vma_iter_init(struct vma_iterator *vmi, struct mm_struct *mm, unsigned long addr) { mas_init(&vmi->mas, &mm->mm_mt, addr); } #ifdef CONFIG_SCHED_MM_CID enum mm_cid_state { MM_CID_UNSET = -1U, /* Unset state has lazy_put flag set. */ MM_CID_LAZY_PUT = (1U << 31), }; static inline bool mm_cid_is_unset(int cid) { return cid == MM_CID_UNSET; } static inline bool mm_cid_is_lazy_put(int cid) { return !mm_cid_is_unset(cid) && (cid & MM_CID_LAZY_PUT); } static inline bool mm_cid_is_valid(int cid) { return !(cid & MM_CID_LAZY_PUT); } static inline int mm_cid_set_lazy_put(int cid) { return cid | MM_CID_LAZY_PUT; } static inline int mm_cid_clear_lazy_put(int cid) { return cid & ~MM_CID_LAZY_PUT; } /* Accessor for struct mm_struct's cidmask. */ static inline cpumask_t *mm_cidmask(struct mm_struct *mm) { unsigned long cid_bitmap = (unsigned long)mm; cid_bitmap += offsetof(struct mm_struct, cpu_bitmap); /* Skip cpu_bitmap */ cid_bitmap += cpumask_size(); return (struct cpumask *)cid_bitmap; } static inline void mm_init_cid(struct mm_struct *mm) { int i; for_each_possible_cpu(i) { struct mm_cid *pcpu_cid = per_cpu_ptr(mm->pcpu_cid, i); pcpu_cid->cid = MM_CID_UNSET; pcpu_cid->time = 0; } cpumask_clear(mm_cidmask(mm)); } static inline int mm_alloc_cid(struct mm_struct *mm) { mm->pcpu_cid = alloc_percpu(struct mm_cid); if (!mm->pcpu_cid) return -ENOMEM; mm_init_cid(mm); return 0; } static inline void mm_destroy_cid(struct mm_struct *mm) { free_percpu(mm->pcpu_cid); mm->pcpu_cid = NULL; } static inline unsigned int mm_cid_size(void) { return cpumask_size(); } #else /* CONFIG_SCHED_MM_CID */ static inline void mm_init_cid(struct mm_struct *mm) { } static inline int mm_alloc_cid(struct mm_struct *mm) { return 0; } static inline void mm_destroy_cid(struct mm_struct *mm) { } static inline unsigned int mm_cid_size(void) { return 0; } #endif /* CONFIG_SCHED_MM_CID */ struct mmu_gather; extern void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm); extern void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm); extern void tlb_finish_mmu(struct mmu_gather *tlb); struct vm_fault; /** * typedef vm_fault_t - Return type for page fault handlers. * * Page fault handlers return a bitmask of %VM_FAULT values. */ typedef __bitwise unsigned int vm_fault_t; /** * enum vm_fault_reason - Page fault handlers return a bitmask of * these values to tell the core VM what happened when handling the * fault. Used to decide whether a process gets delivered SIGBUS or * just gets major/minor fault counters bumped up. * * @VM_FAULT_OOM: Out Of Memory * @VM_FAULT_SIGBUS: Bad access * @VM_FAULT_MAJOR: Page read from storage * @VM_FAULT_HWPOISON: Hit poisoned small page * @VM_FAULT_HWPOISON_LARGE: Hit poisoned large page. Index encoded * in upper bits * @VM_FAULT_SIGSEGV: segmentation fault * @VM_FAULT_NOPAGE: ->fault installed the pte, not return page * @VM_FAULT_LOCKED: ->fault locked the returned page * @VM_FAULT_RETRY: ->fault blocked, must retry * @VM_FAULT_FALLBACK: huge page fault failed, fall back to small * @VM_FAULT_DONE_COW: ->fault has fully handled COW * @VM_FAULT_NEEDDSYNC: ->fault did not modify page tables and needs * fsync() to complete (for synchronous page faults * in DAX) * @VM_FAULT_COMPLETED: ->fault completed, meanwhile mmap lock released * @VM_FAULT_HINDEX_MASK: mask HINDEX value * */ enum vm_fault_reason { VM_FAULT_OOM = (__force vm_fault_t)0x000001, VM_FAULT_SIGBUS = (__force vm_fault_t)0x000002, VM_FAULT_MAJOR = (__force vm_fault_t)0x000004, VM_FAULT_HWPOISON = (__force vm_fault_t)0x000010, VM_FAULT_HWPOISON_LARGE = (__force vm_fault_t)0x000020, VM_FAULT_SIGSEGV = (__force vm_fault_t)0x000040, VM_FAULT_NOPAGE = (__force vm_fault_t)0x000100, VM_FAULT_LOCKED = (__force vm_fault_t)0x000200, VM_FAULT_RETRY = (__force vm_fault_t)0x000400, VM_FAULT_FALLBACK = (__force vm_fault_t)0x000800, VM_FAULT_DONE_COW = (__force vm_fault_t)0x001000, VM_FAULT_NEEDDSYNC = (__force vm_fault_t)0x002000, VM_FAULT_COMPLETED = (__force vm_fault_t)0x004000, VM_FAULT_HINDEX_MASK = (__force vm_fault_t)0x0f0000, }; /* Encode hstate index for a hwpoisoned large page */ #define VM_FAULT_SET_HINDEX(x) ((__force vm_fault_t)((x) << 16)) #define VM_FAULT_GET_HINDEX(x) (((__force unsigned int)(x) >> 16) & 0xf) #define VM_FAULT_ERROR (VM_FAULT_OOM | VM_FAULT_SIGBUS | \ VM_FAULT_SIGSEGV | VM_FAULT_HWPOISON | \ VM_FAULT_HWPOISON_LARGE | VM_FAULT_FALLBACK) #define VM_FAULT_RESULT_TRACE \ { VM_FAULT_OOM, "OOM" }, \ { VM_FAULT_SIGBUS, "SIGBUS" }, \ { VM_FAULT_MAJOR, "MAJOR" }, \ { VM_FAULT_HWPOISON, "HWPOISON" }, \ { VM_FAULT_HWPOISON_LARGE, "HWPOISON_LARGE" }, \ { VM_FAULT_SIGSEGV, "SIGSEGV" }, \ { VM_FAULT_NOPAGE, "NOPAGE" }, \ { VM_FAULT_LOCKED, "LOCKED" }, \ { VM_FAULT_RETRY, "RETRY" }, \ { VM_FAULT_FALLBACK, "FALLBACK" }, \ { VM_FAULT_DONE_COW, "DONE_COW" }, \ { VM_FAULT_NEEDDSYNC, "NEEDDSYNC" }, \ { VM_FAULT_COMPLETED, "COMPLETED" } struct vm_special_mapping { const char *name; /* The name, e.g. "[vdso]". */ /* * If .fault is not provided, this points to a * NULL-terminated array of pages that back the special mapping. * * This must not be NULL unless .fault is provided. */ struct page **pages; /* * If non-NULL, then this is called to resolve page faults * on the special mapping. If used, .pages is not checked. */ vm_fault_t (*fault)(const struct vm_special_mapping *sm, struct vm_area_struct *vma, struct vm_fault *vmf); int (*mremap)(const struct vm_special_mapping *sm, struct vm_area_struct *new_vma); }; enum tlb_flush_reason { TLB_FLUSH_ON_TASK_SWITCH, TLB_REMOTE_SHOOTDOWN, TLB_LOCAL_SHOOTDOWN, TLB_LOCAL_MM_SHOOTDOWN, TLB_REMOTE_SEND_IPI, NR_TLB_FLUSH_REASONS, }; /** * enum fault_flag - Fault flag definitions. * @FAULT_FLAG_WRITE: Fault was a write fault. * @FAULT_FLAG_MKWRITE: Fault was mkwrite of existing PTE. * @FAULT_FLAG_ALLOW_RETRY: Allow to retry the fault if blocked. * @FAULT_FLAG_RETRY_NOWAIT: Don't drop mmap_lock and wait when retrying. * @FAULT_FLAG_KILLABLE: The fault task is in SIGKILL killable region. * @FAULT_FLAG_TRIED: The fault has been tried once. * @FAULT_FLAG_USER: The fault originated in userspace. * @FAULT_FLAG_REMOTE: The fault is not for current task/mm. * @FAULT_FLAG_INSTRUCTION: The fault was during an instruction fetch. * @FAULT_FLAG_INTERRUPTIBLE: The fault can be interrupted by non-fatal signals. * @FAULT_FLAG_UNSHARE: The fault is an unsharing request to break COW in a * COW mapping, making sure that an exclusive anon page is * mapped after the fault. * @FAULT_FLAG_ORIG_PTE_VALID: whether the fault has vmf->orig_pte cached. * We should only access orig_pte if this flag set. * @FAULT_FLAG_VMA_LOCK: The fault is handled under VMA lock. * * About @FAULT_FLAG_ALLOW_RETRY and @FAULT_FLAG_TRIED: we can specify * whether we would allow page faults to retry by specifying these two * fault flags correctly. Currently there can be three legal combinations: * * (a) ALLOW_RETRY and !TRIED: this means the page fault allows retry, and * this is the first try * * (b) ALLOW_RETRY and TRIED: this means the page fault allows retry, and * we've already tried at least once * * (c) !ALLOW_RETRY and !TRIED: this means the page fault does not allow retry * * The unlisted combination (!ALLOW_RETRY && TRIED) is illegal and should never * be used. Note that page faults can be allowed to retry for multiple times, * in which case we'll have an initial fault with flags (a) then later on * continuous faults with flags (b). We should always try to detect pending * signals before a retry to make sure the continuous page faults can still be * interrupted if necessary. * * The combination FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE is illegal. * FAULT_FLAG_UNSHARE is ignored and treated like an ordinary read fault when * applied to mappings that are not COW mappings. */ enum fault_flag { FAULT_FLAG_WRITE = 1 << 0, FAULT_FLAG_MKWRITE = 1 << 1, FAULT_FLAG_ALLOW_RETRY = 1 << 2, FAULT_FLAG_RETRY_NOWAIT = 1 << 3, FAULT_FLAG_KILLABLE = 1 << 4, FAULT_FLAG_TRIED = 1 << 5, FAULT_FLAG_USER = 1 << 6, FAULT_FLAG_REMOTE = 1 << 7, FAULT_FLAG_INSTRUCTION = 1 << 8, FAULT_FLAG_INTERRUPTIBLE = 1 << 9, FAULT_FLAG_UNSHARE = 1 << 10, FAULT_FLAG_ORIG_PTE_VALID = 1 << 11, FAULT_FLAG_VMA_LOCK = 1 << 12, }; typedef unsigned int __bitwise zap_flags_t; /* * FOLL_PIN and FOLL_LONGTERM may be used in various combinations with each * other. Here is what they mean, and how to use them: * * * FIXME: For pages which are part of a filesystem, mappings are subject to the * lifetime enforced by the filesystem and we need guarantees that longterm * users like RDMA and V4L2 only establish mappings which coordinate usage with * the filesystem. Ideas for this coordination include revoking the longterm * pin, delaying writeback, bounce buffer page writeback, etc. As FS DAX was * added after the problem with filesystems was found FS DAX VMAs are * specifically failed. Filesystem pages are still subject to bugs and use of * FOLL_LONGTERM should be avoided on those pages. * * In the CMA case: long term pins in a CMA region would unnecessarily fragment * that region. And so, CMA attempts to migrate the page before pinning, when * FOLL_LONGTERM is specified. * * FOLL_PIN indicates that a special kind of tracking (not just page->_refcount, * but an additional pin counting system) will be invoked. This is intended for * anything that gets a page reference and then touches page data (for example, * Direct IO). This lets the filesystem know that some non-file-system entity is * potentially changing the pages' data. In contrast to FOLL_GET (whose pages * are released via put_page()), FOLL_PIN pages must be released, ultimately, by * a call to unpin_user_page(). * * FOLL_PIN is similar to FOLL_GET: both of these pin pages. They use different * and separate refcounting mechanisms, however, and that means that each has * its own acquire and release mechanisms: * * FOLL_GET: get_user_pages*() to acquire, and put_page() to release. * * FOLL_PIN: pin_user_pages*() to acquire, and unpin_user_pages to release. * * FOLL_PIN and FOLL_GET are mutually exclusive for a given function call. * (The underlying pages may experience both FOLL_GET-based and FOLL_PIN-based * calls applied to them, and that's perfectly OK. This is a constraint on the * callers, not on the pages.) * * FOLL_PIN should be set internally by the pin_user_pages*() APIs, never * directly by the caller. That's in order to help avoid mismatches when * releasing pages: get_user_pages*() pages must be released via put_page(), * while pin_user_pages*() pages must be released via unpin_user_page(). * * Please see Documentation/core-api/pin_user_pages.rst for more information. */ enum { /* check pte is writable */ FOLL_WRITE = 1 << 0, /* do get_page on page */ FOLL_GET = 1 << 1, /* give error on hole if it would be zero */ FOLL_DUMP = 1 << 2, /* get_user_pages read/write w/o permission */ FOLL_FORCE = 1 << 3, /* * if a disk transfer is needed, start the IO and return without waiting * upon it */ FOLL_NOWAIT = 1 << 4, /* do not fault in pages */ FOLL_NOFAULT = 1 << 5, /* check page is hwpoisoned */ FOLL_HWPOISON = 1 << 6, /* don't do file mappings */ FOLL_ANON = 1 << 7, /* * FOLL_LONGTERM indicates that the page will be held for an indefinite * time period _often_ under userspace control. This is in contrast to * iov_iter_get_pages(), whose usages are transient. */ FOLL_LONGTERM = 1 << 8, /* split huge pmd before returning */ FOLL_SPLIT_PMD = 1 << 9, /* allow returning PCI P2PDMA pages */ FOLL_PCI_P2PDMA = 1 << 10, /* allow interrupts from generic signals */ FOLL_INTERRUPTIBLE = 1 << 11, /* * Always honor (trigger) NUMA hinting faults. * * FOLL_WRITE implicitly honors NUMA hinting faults because a * PROT_NONE-mapped page is not writable (exceptions with FOLL_FORCE * apply). get_user_pages_fast_only() always implicitly honors NUMA * hinting faults. */ FOLL_HONOR_NUMA_FAULT = 1 << 12, /* See also internal only FOLL flags in mm/internal.h */ }; #endif /* _LINUX_MM_TYPES_H */ |
| 1861 57 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 | /* SPDX-License-Identifier: GPL-2.0 */ /* Copyright 2011-2014 Autronica Fire and Security AS * * 2011-2014 Arvid Brodin, arvid.brodin@alten.se * * include file for HSR and PRP. */ #ifndef __HSR_SLAVE_H #define __HSR_SLAVE_H #include <linux/skbuff.h> #include <linux/netdevice.h> #include <linux/rtnetlink.h> #include "hsr_main.h" int hsr_add_port(struct hsr_priv *hsr, struct net_device *dev, enum hsr_port_type pt, struct netlink_ext_ack *extack); void hsr_del_port(struct hsr_port *port); bool hsr_port_exists(const struct net_device *dev); static inline struct hsr_port *hsr_port_get_rtnl(const struct net_device *dev) { ASSERT_RTNL(); return hsr_port_exists(dev) ? rtnl_dereference(dev->rx_handler_data) : NULL; } static inline struct hsr_port *hsr_port_get_rcu(const struct net_device *dev) { return hsr_port_exists(dev) ? rcu_dereference(dev->rx_handler_data) : NULL; } bool hsr_invalid_dan_ingress_frame(__be16 protocol); #endif /* __HSR_SLAVE_H */ |
| 4417 2343 4318 4419 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 | /* SPDX-License-Identifier: GPL-2.0 */ /* * x86 KFENCE support. * * Copyright (C) 2020, Google LLC. */ #ifndef _ASM_X86_KFENCE_H #define _ASM_X86_KFENCE_H #ifndef MODULE #include <linux/bug.h> #include <linux/kfence.h> #include <asm/pgalloc.h> #include <asm/pgtable.h> #include <asm/set_memory.h> #include <asm/tlbflush.h> /* Force 4K pages for __kfence_pool. */ static inline bool arch_kfence_init_pool(void) { unsigned long addr; for (addr = (unsigned long)__kfence_pool; is_kfence_address((void *)addr); addr += PAGE_SIZE) { unsigned int level; if (!lookup_address(addr, &level)) return false; if (level != PG_LEVEL_4K) set_memory_4k(addr, 1); } return true; } /* Protect the given page and flush TLB. */ static inline bool kfence_protect_page(unsigned long addr, bool protect) { unsigned int level; pte_t *pte = lookup_address(addr, &level); if (WARN_ON(!pte || level != PG_LEVEL_4K)) return false; /* * We need to avoid IPIs, as we may get KFENCE allocations or faults * with interrupts disabled. Therefore, the below is best-effort, and * does not flush TLBs on all CPUs. We can tolerate some inaccuracy; * lazy fault handling takes care of faults after the page is PRESENT. */ if (protect) set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_PRESENT)); else set_pte(pte, __pte(pte_val(*pte) | _PAGE_PRESENT)); /* * Flush this CPU's TLB, assuming whoever did the allocation/free is * likely to continue running on this CPU. */ preempt_disable(); flush_tlb_one_kernel(addr); preempt_enable(); return true; } #endif /* !MODULE */ #endif /* _ASM_X86_KFENCE_H */ |
| 11 117 8 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 | /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM xdp #if !defined(_TRACE_XDP_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_XDP_H #include <linux/netdevice.h> #include <linux/filter.h> #include <linux/tracepoint.h> #include <linux/bpf.h> #include <net/xdp.h> #define __XDP_ACT_MAP(FN) \ FN(ABORTED) \ FN(DROP) \ FN(PASS) \ FN(TX) \ FN(REDIRECT) #define __XDP_ACT_TP_FN(x) \ TRACE_DEFINE_ENUM(XDP_##x); #define __XDP_ACT_SYM_FN(x) \ { XDP_##x, #x }, #define __XDP_ACT_SYM_TAB \ __XDP_ACT_MAP(__XDP_ACT_SYM_FN) { -1, NULL } __XDP_ACT_MAP(__XDP_ACT_TP_FN) TRACE_EVENT(xdp_exception, TP_PROTO(const struct net_device *dev, const struct bpf_prog *xdp, u32 act), TP_ARGS(dev, xdp, act), TP_STRUCT__entry( __field(int, prog_id) __field(u32, act) __field(int, ifindex) ), TP_fast_assign( __entry->prog_id = xdp->aux->id; __entry->act = act; __entry->ifindex = dev->ifindex; ), TP_printk("prog_id=%d action=%s ifindex=%d", __entry->prog_id, __print_symbolic(__entry->act, __XDP_ACT_SYM_TAB), __entry->ifindex) ); TRACE_EVENT(xdp_bulk_tx, TP_PROTO(const struct net_device *dev, int sent, int drops, int err), TP_ARGS(dev, sent, drops, err), TP_STRUCT__entry( __field(int, ifindex) __field(u32, act) __field(int, drops) __field(int, sent) __field(int, err) ), TP_fast_assign( __entry->ifindex = dev->ifindex; __entry->act = XDP_TX; __entry->drops = drops; __entry->sent = sent; __entry->err = err; ), TP_printk("ifindex=%d action=%s sent=%d drops=%d err=%d", __entry->ifindex, __print_symbolic(__entry->act, __XDP_ACT_SYM_TAB), __entry->sent, __entry->drops, __entry->err) ); #ifndef __DEVMAP_OBJ_TYPE #define __DEVMAP_OBJ_TYPE struct _bpf_dtab_netdev { struct net_device *dev; }; #endif /* __DEVMAP_OBJ_TYPE */ DECLARE_EVENT_CLASS(xdp_redirect_template, TP_PROTO(const struct net_device *dev, const struct bpf_prog *xdp, const void *tgt, int err, enum bpf_map_type map_type, u32 map_id, u32 index), TP_ARGS(dev, xdp, tgt, err, map_type, map_id, index), TP_STRUCT__entry( __field(int, prog_id) __field(u32, act) __field(int, ifindex) __field(int, err) __field(int, to_ifindex) __field(u32, map_id) __field(int, map_index) ), TP_fast_assign( u32 ifindex = 0, map_index = index; if (map_type == BPF_MAP_TYPE_DEVMAP || map_type == BPF_MAP_TYPE_DEVMAP_HASH) { /* Just leave to_ifindex to 0 if do broadcast redirect, * as tgt will be NULL. */ if (tgt) ifindex = ((struct _bpf_dtab_netdev *)tgt)->dev->ifindex; } else if (map_type == BPF_MAP_TYPE_UNSPEC && map_id == INT_MAX) { ifindex = index; map_index = 0; } __entry->prog_id = xdp->aux->id; __entry->act = XDP_REDIRECT; __entry->ifindex = dev->ifindex; __entry->err = err; __entry->to_ifindex = ifindex; __entry->map_id = map_id; __entry->map_index = map_index; ), TP_printk("prog_id=%d action=%s ifindex=%d to_ifindex=%d err=%d" " map_id=%d map_index=%d", __entry->prog_id, __print_symbolic(__entry->act, __XDP_ACT_SYM_TAB), __entry->ifindex, __entry->to_ifindex, __entry->err, __entry->map_id, __entry->map_index) ); DEFINE_EVENT(xdp_redirect_template, xdp_redirect, TP_PROTO(const struct net_device *dev, const struct bpf_prog *xdp, const void *tgt, int err, enum bpf_map_type map_type, u32 map_id, u32 index), TP_ARGS(dev, xdp, tgt, err, map_type, map_id, index) ); DEFINE_EVENT(xdp_redirect_template, xdp_redirect_err, TP_PROTO(const struct net_device *dev, const struct bpf_prog *xdp, const void *tgt, int err, enum bpf_map_type map_type, u32 map_id, u32 index), TP_ARGS(dev, xdp, tgt, err, map_type, map_id, index) ); #define _trace_xdp_redirect(dev, xdp, to) \ trace_xdp_redirect(dev, xdp, NULL, 0, BPF_MAP_TYPE_UNSPEC, INT_MAX, to) #define _trace_xdp_redirect_err(dev, xdp, to, err) \ trace_xdp_redirect_err(dev, xdp, NULL, err, BPF_MAP_TYPE_UNSPEC, INT_MAX, to) #define _trace_xdp_redirect_map(dev, xdp, to, map_type, map_id, index) \ trace_xdp_redirect(dev, xdp, to, 0, map_type, map_id, index) #define _trace_xdp_redirect_map_err(dev, xdp, to, map_type, map_id, index, err) \ trace_xdp_redirect_err(dev, xdp, to, err, map_type, map_id, index) /* not used anymore, but kept around so as not to break old programs */ DEFINE_EVENT(xdp_redirect_template, xdp_redirect_map, TP_PROTO(const struct net_device *dev, const struct bpf_prog *xdp, const void *tgt, int err, enum bpf_map_type map_type, u32 map_id, u32 index), TP_ARGS(dev, xdp, tgt, err, map_type, map_id, index) ); DEFINE_EVENT(xdp_redirect_template, xdp_redirect_map_err, TP_PROTO(const struct net_device *dev, const struct bpf_prog *xdp, const void *tgt, int err, enum bpf_map_type map_type, u32 map_id, u32 index), TP_ARGS(dev, xdp, tgt, err, map_type, map_id, index) ); TRACE_EVENT(xdp_cpumap_kthread, TP_PROTO(int map_id, unsigned int processed, unsigned int drops, int sched, struct xdp_cpumap_stats *xdp_stats), TP_ARGS(map_id, processed, drops, sched, xdp_stats), TP_STRUCT__entry( __field(int, map_id) __field(u32, act) __field(int, cpu) __field(unsigned int, drops) __field(unsigned int, processed) __field(int, sched) __field(unsigned int, xdp_pass) __field(unsigned int, xdp_drop) __field(unsigned int, xdp_redirect) ), TP_fast_assign( __entry->map_id = map_id; __entry->act = XDP_REDIRECT; __entry->cpu = smp_processor_id(); __entry->drops = drops; __entry->processed = processed; __entry->sched = sched; __entry->xdp_pass = xdp_stats->pass; __entry->xdp_drop = xdp_stats->drop; __entry->xdp_redirect = xdp_stats->redirect; ), TP_printk("kthread" " cpu=%d map_id=%d action=%s" " processed=%u drops=%u" " sched=%d" " xdp_pass=%u xdp_drop=%u xdp_redirect=%u", __entry->cpu, __entry->map_id, __print_symbolic(__entry->act, __XDP_ACT_SYM_TAB), __entry->processed, __entry->drops, __entry->sched, __entry->xdp_pass, __entry->xdp_drop, __entry->xdp_redirect) ); TRACE_EVENT(xdp_cpumap_enqueue, TP_PROTO(int map_id, unsigned int processed, unsigned int drops, int to_cpu), TP_ARGS(map_id, processed, drops, to_cpu), TP_STRUCT__entry( __field(int, map_id) __field(u32, act) __field(int, cpu) __field(unsigned int, drops) __field(unsigned int, processed) __field(int, to_cpu) ), TP_fast_assign( __entry->map_id = map_id; __entry->act = XDP_REDIRECT; __entry->cpu = smp_processor_id(); __entry->drops = drops; __entry->processed = processed; __entry->to_cpu = to_cpu; ), TP_printk("enqueue" " cpu=%d map_id=%d action=%s" " processed=%u drops=%u" " to_cpu=%d", __entry->cpu, __entry->map_id, __print_symbolic(__entry->act, __XDP_ACT_SYM_TAB), __entry->processed, __entry->drops, __entry->to_cpu) ); TRACE_EVENT(xdp_devmap_xmit, TP_PROTO(const struct net_device *from_dev, const struct net_device *to_dev, int sent, int drops, int err), TP_ARGS(from_dev, to_dev, sent, drops, err), TP_STRUCT__entry( __field(int, from_ifindex) __field(u32, act) __field(int, to_ifindex) __field(int, drops) __field(int, sent) __field(int, err) ), TP_fast_assign( __entry->from_ifindex = from_dev->ifindex; __entry->act = XDP_REDIRECT; __entry->to_ifindex = to_dev->ifindex; __entry->drops = drops; __entry->sent = sent; __entry->err = err; ), TP_printk("ndo_xdp_xmit" " from_ifindex=%d to_ifindex=%d action=%s" " sent=%d drops=%d" " err=%d", __entry->from_ifindex, __entry->to_ifindex, __print_symbolic(__entry->act, __XDP_ACT_SYM_TAB), __entry->sent, __entry->drops, __entry->err) ); /* Expect users already include <net/xdp.h>, but not xdp_priv.h */ #include <net/xdp_priv.h> #define __MEM_TYPE_MAP(FN) \ FN(PAGE_SHARED) \ FN(PAGE_ORDER0) \ FN(PAGE_POOL) \ FN(XSK_BUFF_POOL) #define __MEM_TYPE_TP_FN(x) \ TRACE_DEFINE_ENUM(MEM_TYPE_##x); #define __MEM_TYPE_SYM_FN(x) \ { MEM_TYPE_##x, #x }, #define __MEM_TYPE_SYM_TAB \ __MEM_TYPE_MAP(__MEM_TYPE_SYM_FN) { -1, 0 } __MEM_TYPE_MAP(__MEM_TYPE_TP_FN) TRACE_EVENT(mem_disconnect, TP_PROTO(const struct xdp_mem_allocator *xa), TP_ARGS(xa), TP_STRUCT__entry( __field(const struct xdp_mem_allocator *, xa) __field(u32, mem_id) __field(u32, mem_type) __field(const void *, allocator) ), TP_fast_assign( __entry->xa = xa; __entry->mem_id = xa->mem.id; __entry->mem_type = xa->mem.type; __entry->allocator = xa->allocator; ), TP_printk("mem_id=%d mem_type=%s allocator=%p", __entry->mem_id, __print_symbolic(__entry->mem_type, __MEM_TYPE_SYM_TAB), __entry->allocator ) ); TRACE_EVENT(mem_connect, TP_PROTO(const struct xdp_mem_allocator *xa, const struct xdp_rxq_info *rxq), TP_ARGS(xa, rxq), TP_STRUCT__entry( __field(const struct xdp_mem_allocator *, xa) __field(u32, mem_id) __field(u32, mem_type) __field(const void *, allocator) __field(const struct xdp_rxq_info *, rxq) __field(int, ifindex) ), TP_fast_assign( __entry->xa = xa; __entry->mem_id = xa->mem.id; __entry->mem_type = xa->mem.type; __entry->allocator = xa->allocator; __entry->rxq = rxq; __entry->ifindex = rxq->dev->ifindex; ), TP_printk("mem_id=%d mem_type=%s allocator=%p" " ifindex=%d", __entry->mem_id, __print_symbolic(__entry->mem_type, __MEM_TYPE_SYM_TAB), __entry->allocator, __entry->ifindex ) ); TRACE_EVENT(mem_return_failed, TP_PROTO(const struct xdp_mem_info *mem, const struct page *page), TP_ARGS(mem, page), TP_STRUCT__entry( __field(const struct page *, page) __field(u32, mem_id) __field(u32, mem_type) ), TP_fast_assign( __entry->page = page; __entry->mem_id = mem->id; __entry->mem_type = mem->type; ), TP_printk("mem_id=%d mem_type=%s page=%p", __entry->mem_id, __print_symbolic(__entry->mem_type, __MEM_TYPE_SYM_TAB), __entry->page ) ); TRACE_EVENT(bpf_xdp_link_attach_failed, TP_PROTO(const char *msg), TP_ARGS(msg), TP_STRUCT__entry( __string(msg, msg) ), TP_fast_assign( __assign_str(msg, msg); ), TP_printk("errmsg=%s", __get_str(msg)) ); #endif /* _TRACE_XDP_H */ #include <trace/define_trace.h> |
| 59 37 59 36 45 4 106 43 106 44 106 6 106 48 106 6 71 312 46 312 313 312 313 200 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_GRE_H #define __LINUX_GRE_H #include <linux/skbuff.h> #include <net/ip_tunnels.h> struct gre_base_hdr { __be16 flags; __be16 protocol; } __packed; struct gre_full_hdr { struct gre_base_hdr fixed_header; __be16 csum; __be16 reserved1; __be32 key; __be32 seq; } __packed; #define GRE_HEADER_SECTION 4 #define GREPROTO_CISCO 0 #define GREPROTO_PPTP 1 #define GREPROTO_MAX 2 #define GRE_IP_PROTO_MAX 2 struct gre_protocol { int (*handler)(struct sk_buff *skb); void (*err_handler)(struct sk_buff *skb, u32 info); }; int gre_add_protocol(const struct gre_protocol *proto, u8 version); int gre_del_protocol(const struct gre_protocol *proto, u8 version); struct net_device *gretap_fb_dev_create(struct net *net, const char *name, u8 name_assign_type); int gre_parse_header(struct sk_buff *skb, struct tnl_ptk_info *tpi, bool *csum_err, __be16 proto, int nhs); static inline bool netif_is_gretap(const struct net_device *dev) { return dev->rtnl_link_ops && !strcmp(dev->rtnl_link_ops->kind, "gretap"); } static inline bool netif_is_ip6gretap(const struct net_device *dev) { return dev->rtnl_link_ops && !strcmp(dev->rtnl_link_ops->kind, "ip6gretap"); } static inline int gre_calc_hlen(__be16 o_flags) { int addend = 4; if (o_flags & TUNNEL_CSUM) addend += 4; if (o_flags & TUNNEL_KEY) addend += 4; if (o_flags & TUNNEL_SEQ) addend += 4; return addend; } static inline __be16 gre_flags_to_tnl_flags(__be16 flags) { __be16 tflags = 0; if (flags & GRE_CSUM) tflags |= TUNNEL_CSUM; if (flags & GRE_ROUTING) tflags |= TUNNEL_ROUTING; if (flags & GRE_KEY) tflags |= TUNNEL_KEY; if (flags & GRE_SEQ) tflags |= TUNNEL_SEQ; if (flags & GRE_STRICT) tflags |= TUNNEL_STRICT; if (flags & GRE_REC) tflags |= TUNNEL_REC; if (flags & GRE_VERSION) tflags |= TUNNEL_VERSION; return tflags; } static inline __be16 gre_tnl_flags_to_gre_flags(__be16 tflags) { __be16 flags = 0; if (tflags & TUNNEL_CSUM) flags |= GRE_CSUM; if (tflags & TUNNEL_ROUTING) flags |= GRE_ROUTING; if (tflags & TUNNEL_KEY) flags |= GRE_KEY; if (tflags & TUNNEL_SEQ) flags |= GRE_SEQ; if (tflags & TUNNEL_STRICT) flags |= GRE_STRICT; if (tflags & TUNNEL_REC) flags |= GRE_REC; if (tflags & TUNNEL_VERSION) flags |= GRE_VERSION; return flags; } static inline void gre_build_header(struct sk_buff *skb, int hdr_len, __be16 flags, __be16 proto, __be32 key, __be32 seq) { struct gre_base_hdr *greh; skb_push(skb, hdr_len); skb_set_inner_protocol(skb, proto); skb_reset_transport_header(skb); greh = (struct gre_base_hdr *)skb->data; greh->flags = gre_tnl_flags_to_gre_flags(flags); greh->protocol = proto; if (flags & (TUNNEL_KEY | TUNNEL_CSUM | TUNNEL_SEQ)) { __be32 *ptr = (__be32 *)(((u8 *)greh) + hdr_len - 4); if (flags & TUNNEL_SEQ) { *ptr = seq; ptr--; } if (flags & TUNNEL_KEY) { *ptr = key; ptr--; } if (flags & TUNNEL_CSUM && !(skb_shinfo(skb)->gso_type & (SKB_GSO_GRE | SKB_GSO_GRE_CSUM))) { *ptr = 0; if (skb->ip_summed == CHECKSUM_PARTIAL) { *(__sum16 *)ptr = csum_fold(lco_csum(skb)); } else { skb->ip_summed = CHECKSUM_PARTIAL; skb->csum_start = skb_transport_header(skb) - skb->head; skb->csum_offset = sizeof(*greh); } } } } #endif |
| 146 146 146 146 148 148 148 148 5 8 12 8 8 8 5 4 8 3 5 5 5 4 3 146 8 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 | // SPDX-License-Identifier: GPL-2.0-or-later /* Client connection-specific management code. * * Copyright (C) 2016, 2020 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) * * Client connections need to be cached for a little while after they've made a * call so as to handle retransmitted DATA packets in case the server didn't * receive the final ACK or terminating ABORT we sent it. * * There are flags of relevance to the cache: * * (2) DONT_REUSE - The connection should be discarded as soon as possible and * should not be reused. This is set when an exclusive connection is used * or a call ID counter overflows. * * The caching state may only be changed if the cache lock is held. * * There are two idle client connection expiry durations. If the total number * of connections is below the reap threshold, we use the normal duration; if * it's above, we use the fast duration. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/slab.h> #include <linux/idr.h> #include <linux/timer.h> #include <linux/sched/signal.h> #include "ar-internal.h" __read_mostly unsigned int rxrpc_reap_client_connections = 900; __read_mostly unsigned long rxrpc_conn_idle_client_expiry = 2 * 60 * HZ; __read_mostly unsigned long rxrpc_conn_idle_client_fast_expiry = 2 * HZ; static void rxrpc_activate_bundle(struct rxrpc_bundle *bundle) { atomic_inc(&bundle->active); } /* * Release a connection ID for a client connection. */ static void rxrpc_put_client_connection_id(struct rxrpc_local *local, struct rxrpc_connection *conn) { idr_remove(&local->conn_ids, conn->proto.cid >> RXRPC_CIDSHIFT); } /* * Destroy the client connection ID tree. */ static void rxrpc_destroy_client_conn_ids(struct rxrpc_local *local) { struct rxrpc_connection *conn; int id; if (!idr_is_empty(&local->conn_ids)) { idr_for_each_entry(&local->conn_ids, conn, id) { pr_err("AF_RXRPC: Leaked client conn %p {%d}\n", conn, refcount_read(&conn->ref)); } BUG(); } idr_destroy(&local->conn_ids); } /* * Allocate a connection bundle. */ static struct rxrpc_bundle *rxrpc_alloc_bundle(struct rxrpc_call *call, gfp_t gfp) { static atomic_t rxrpc_bundle_id; struct rxrpc_bundle *bundle; bundle = kzalloc(sizeof(*bundle), gfp); if (bundle) { bundle->local = call->local; bundle->peer = rxrpc_get_peer(call->peer, rxrpc_peer_get_bundle); bundle->key = key_get(call->key); bundle->security = call->security; bundle->exclusive = test_bit(RXRPC_CALL_EXCLUSIVE, &call->flags); bundle->upgrade = test_bit(RXRPC_CALL_UPGRADE, &call->flags); bundle->service_id = call->dest_srx.srx_service; bundle->security_level = call->security_level; bundle->debug_id = atomic_inc_return(&rxrpc_bundle_id); refcount_set(&bundle->ref, 1); atomic_set(&bundle->active, 1); INIT_LIST_HEAD(&bundle->waiting_calls); trace_rxrpc_bundle(bundle->debug_id, 1, rxrpc_bundle_new); write_lock(&bundle->local->rxnet->conn_lock); list_add_tail(&bundle->proc_link, &bundle->local->rxnet->bundle_proc_list); write_unlock(&bundle->local->rxnet->conn_lock); } return bundle; } struct rxrpc_bundle *rxrpc_get_bundle(struct rxrpc_bundle *bundle, enum rxrpc_bundle_trace why) { int r; __refcount_inc(&bundle->ref, &r); trace_rxrpc_bundle(bundle->debug_id, r + 1, why); return bundle; } static void rxrpc_free_bundle(struct rxrpc_bundle *bundle) { trace_rxrpc_bundle(bundle->debug_id, refcount_read(&bundle->ref), rxrpc_bundle_free); write_lock(&bundle->local->rxnet->conn_lock); list_del(&bundle->proc_link); write_unlock(&bundle->local->rxnet->conn_lock); rxrpc_put_peer(bundle->peer, rxrpc_peer_put_bundle); key_put(bundle->key); kfree(bundle); } void rxrpc_put_bundle(struct rxrpc_bundle *bundle, enum rxrpc_bundle_trace why) { unsigned int id; bool dead; int r; if (bundle) { id = bundle->debug_id; dead = __refcount_dec_and_test(&bundle->ref, &r); trace_rxrpc_bundle(id, r - 1, why); if (dead) rxrpc_free_bundle(bundle); } } /* * Get rid of outstanding client connection preallocations when a local * endpoint is destroyed. */ void rxrpc_purge_client_connections(struct rxrpc_local *local) { rxrpc_destroy_client_conn_ids(local); } /* * Allocate a client connection. */ static struct rxrpc_connection * rxrpc_alloc_client_connection(struct rxrpc_bundle *bundle) { struct rxrpc_connection *conn; struct rxrpc_local *local = bundle->local; struct rxrpc_net *rxnet = local->rxnet; int id; _enter(""); conn = rxrpc_alloc_connection(rxnet, GFP_ATOMIC | __GFP_NOWARN); if (!conn) return ERR_PTR(-ENOMEM); id = idr_alloc_cyclic(&local->conn_ids, conn, 1, 0x40000000, GFP_ATOMIC | __GFP_NOWARN); if (id < 0) { kfree(conn); return ERR_PTR(id); } refcount_set(&conn->ref, 1); conn->proto.cid = id << RXRPC_CIDSHIFT; conn->proto.epoch = local->rxnet->epoch; conn->out_clientflag = RXRPC_CLIENT_INITIATED; conn->bundle = rxrpc_get_bundle(bundle, rxrpc_bundle_get_client_conn); conn->local = rxrpc_get_local(bundle->local, rxrpc_local_get_client_conn); conn->peer = rxrpc_get_peer(bundle->peer, rxrpc_peer_get_client_conn); conn->key = key_get(bundle->key); conn->security = bundle->security; conn->exclusive = bundle->exclusive; conn->upgrade = bundle->upgrade; conn->orig_service_id = bundle->service_id; conn->security_level = bundle->security_level; conn->state = RXRPC_CONN_CLIENT_UNSECURED; conn->service_id = conn->orig_service_id; if (conn->security == &rxrpc_no_security) conn->state = RXRPC_CONN_CLIENT; atomic_inc(&rxnet->nr_conns); write_lock(&rxnet->conn_lock); list_add_tail(&conn->proc_link, &rxnet->conn_proc_list); write_unlock(&rxnet->conn_lock); rxrpc_see_connection(conn, rxrpc_conn_new_client); atomic_inc(&rxnet->nr_client_conns); trace_rxrpc_client(conn, -1, rxrpc_client_alloc); return conn; } /* * Determine if a connection may be reused. */ static bool rxrpc_may_reuse_conn(struct rxrpc_connection *conn) { struct rxrpc_net *rxnet; int id_cursor, id, distance, limit; if (!conn) goto dont_reuse; rxnet = conn->rxnet; if (test_bit(RXRPC_CONN_DONT_REUSE, &conn->flags)) goto dont_reuse; if ((conn->state != RXRPC_CONN_CLIENT_UNSECURED && conn->state != RXRPC_CONN_CLIENT) || conn->proto.epoch != rxnet->epoch) goto mark_dont_reuse; /* The IDR tree gets very expensive on memory if the connection IDs are * widely scattered throughout the number space, so we shall want to * kill off connections that, say, have an ID more than about four * times the maximum number of client conns away from the current * allocation point to try and keep the IDs concentrated. */ id_cursor = idr_get_cursor(&conn->local->conn_ids); id = conn->proto.cid >> RXRPC_CIDSHIFT; distance = id - id_cursor; if (distance < 0) distance = -distance; limit = max_t(unsigned long, atomic_read(&rxnet->nr_conns) * 4, 1024); if (distance > limit) goto mark_dont_reuse; return true; mark_dont_reuse: set_bit(RXRPC_CONN_DONT_REUSE, &conn->flags); dont_reuse: return false; } /* * Look up the conn bundle that matches the connection parameters, adding it if * it doesn't yet exist. */ int rxrpc_look_up_bundle(struct rxrpc_call *call, gfp_t gfp) { struct rxrpc_bundle *bundle, *candidate; struct rxrpc_local *local = call->local; struct rb_node *p, **pp, *parent; long diff; bool upgrade = test_bit(RXRPC_CALL_UPGRADE, &call->flags); _enter("{%px,%x,%u,%u}", call->peer, key_serial(call->key), call->security_level, upgrade); if (test_bit(RXRPC_CALL_EXCLUSIVE, &call->flags)) { call->bundle = rxrpc_alloc_bundle(call, gfp); return call->bundle ? 0 : -ENOMEM; } /* First, see if the bundle is already there. */ _debug("search 1"); spin_lock(&local->client_bundles_lock); p = local->client_bundles.rb_node; while (p) { bundle = rb_entry(p, struct rxrpc_bundle, local_node); #define cmp(X, Y) ((long)(X) - (long)(Y)) diff = (cmp(bundle->peer, call->peer) ?: cmp(bundle->key, call->key) ?: cmp(bundle->security_level, call->security_level) ?: cmp(bundle->upgrade, upgrade)); #undef cmp if (diff < 0) p = p->rb_left; else if (diff > 0) p = p->rb_right; else goto found_bundle; } spin_unlock(&local->client_bundles_lock); _debug("not found"); /* It wasn't. We need to add one. */ candidate = rxrpc_alloc_bundle(call, gfp); if (!candidate) return -ENOMEM; _debug("search 2"); spin_lock(&local->client_bundles_lock); pp = &local->client_bundles.rb_node; parent = NULL; while (*pp) { parent = *pp; bundle = rb_entry(parent, struct rxrpc_bundle, local_node); #define cmp(X, Y) ((long)(X) - (long)(Y)) diff = (cmp(bundle->peer, call->peer) ?: cmp(bundle->key, call->key) ?: cmp(bundle->security_level, call->security_level) ?: cmp(bundle->upgrade, upgrade)); #undef cmp if (diff < 0) pp = &(*pp)->rb_left; else if (diff > 0) pp = &(*pp)->rb_right; else goto found_bundle_free; } _debug("new bundle"); rb_link_node(&candidate->local_node, parent, pp); rb_insert_color(&candidate->local_node, &local->client_bundles); call->bundle = rxrpc_get_bundle(candidate, rxrpc_bundle_get_client_call); spin_unlock(&local->client_bundles_lock); _leave(" = B=%u [new]", call->bundle->debug_id); return 0; found_bundle_free: rxrpc_free_bundle(candidate); found_bundle: call->bundle = rxrpc_get_bundle(bundle, rxrpc_bundle_get_client_call); rxrpc_activate_bundle(bundle); spin_unlock(&local->client_bundles_lock); _leave(" = B=%u [found]", call->bundle->debug_id); return 0; } /* * Allocate a new connection and add it into a bundle. */ static bool rxrpc_add_conn_to_bundle(struct rxrpc_bundle *bundle, unsigned int slot) { struct rxrpc_connection *conn, *old; unsigned int shift = slot * RXRPC_MAXCALLS; unsigned int i; old = bundle->conns[slot]; if (old) { bundle->conns[slot] = NULL; bundle->conn_ids[slot] = 0; trace_rxrpc_client(old, -1, rxrpc_client_replace); rxrpc_put_connection(old, rxrpc_conn_put_noreuse); } conn = rxrpc_alloc_client_connection(bundle); if (IS_ERR(conn)) { bundle->alloc_error = PTR_ERR(conn); return false; } rxrpc_activate_bundle(bundle); conn->bundle_shift = shift; bundle->conns[slot] = conn; bundle->conn_ids[slot] = conn->debug_id; for (i = 0; i < RXRPC_MAXCALLS; i++) set_bit(shift + i, &bundle->avail_chans); return true; } /* * Add a connection to a bundle if there are no usable connections or we have * connections waiting for extra capacity. */ static bool rxrpc_bundle_has_space(struct rxrpc_bundle *bundle) { int slot = -1, i, usable; _enter(""); bundle->alloc_error = 0; /* See if there are any usable connections. */ usable = 0; for (i = 0; i < ARRAY_SIZE(bundle->conns); i++) { if (rxrpc_may_reuse_conn(bundle->conns[i])) usable++; else if (slot == -1) slot = i; } if (!usable && bundle->upgrade) bundle->try_upgrade = true; if (!usable) goto alloc_conn; if (!bundle->avail_chans && !bundle->try_upgrade && usable < ARRAY_SIZE(bundle->conns)) goto alloc_conn; _leave(""); return usable; alloc_conn: return slot >= 0 ? rxrpc_add_conn_to_bundle(bundle, slot) : false; } /* * Assign a channel to the call at the front of the queue and wake the call up. * We don't increment the callNumber counter until this number has been exposed * to the world. */ static void rxrpc_activate_one_channel(struct rxrpc_connection *conn, unsigned int channel) { struct rxrpc_channel *chan = &conn->channels[channel]; struct rxrpc_bundle *bundle = conn->bundle; struct rxrpc_call *call = list_entry(bundle->waiting_calls.next, struct rxrpc_call, wait_link); u32 call_id = chan->call_counter + 1; _enter("C=%x,%u", conn->debug_id, channel); list_del_init(&call->wait_link); trace_rxrpc_client(conn, channel, rxrpc_client_chan_activate); /* Cancel the final ACK on the previous call if it hasn't been sent yet * as the DATA packet will implicitly ACK it. */ clear_bit(RXRPC_CONN_FINAL_ACK_0 + channel, &conn->flags); clear_bit(conn->bundle_shift + channel, &bundle->avail_chans); rxrpc_see_call(call, rxrpc_call_see_activate_client); call->conn = rxrpc_get_connection(conn, rxrpc_conn_get_activate_call); call->cid = conn->proto.cid | channel; call->call_id = call_id; call->dest_srx.srx_service = conn->service_id; call->cong_ssthresh = call->peer->cong_ssthresh; if (call->cong_cwnd >= call->cong_ssthresh) call->cong_mode = RXRPC_CALL_CONGEST_AVOIDANCE; else call->cong_mode = RXRPC_CALL_SLOW_START; chan->call_id = call_id; chan->call_debug_id = call->debug_id; chan->call = call; rxrpc_see_call(call, rxrpc_call_see_connected); trace_rxrpc_connect_call(call); call->tx_last_sent = ktime_get_real(); rxrpc_start_call_timer(call); rxrpc_set_call_state(call, RXRPC_CALL_CLIENT_SEND_REQUEST); wake_up(&call->waitq); } /* * Remove a connection from the idle list if it's on it. */ static void rxrpc_unidle_conn(struct rxrpc_connection *conn) { if (!list_empty(&conn->cache_link)) { list_del_init(&conn->cache_link); rxrpc_put_connection(conn, rxrpc_conn_put_unidle); } } /* * Assign channels and callNumbers to waiting calls. */ static void rxrpc_activate_channels(struct rxrpc_bundle *bundle) { struct rxrpc_connection *conn; unsigned long avail, mask; unsigned int channel, slot; trace_rxrpc_client(NULL, -1, rxrpc_client_activate_chans); if (bundle->try_upgrade) mask = 1; else mask = ULONG_MAX; while (!list_empty(&bundle->waiting_calls)) { avail = bundle->avail_chans & mask; if (!avail) break; channel = __ffs(avail); clear_bit(channel, &bundle->avail_chans); slot = channel / RXRPC_MAXCALLS; conn = bundle->conns[slot]; if (!conn) break; if (bundle->try_upgrade) set_bit(RXRPC_CONN_PROBING_FOR_UPGRADE, &conn->flags); rxrpc_unidle_conn(conn); channel &= (RXRPC_MAXCALLS - 1); conn->act_chans |= 1 << channel; rxrpc_activate_one_channel(conn, channel); } } /* * Connect waiting channels (called from the I/O thread). */ void rxrpc_connect_client_calls(struct rxrpc_local *local) { struct rxrpc_call *call; while ((call = list_first_entry_or_null(&local->new_client_calls, struct rxrpc_call, wait_link)) ) { struct rxrpc_bundle *bundle = call->bundle; spin_lock(&local->client_call_lock); list_move_tail(&call->wait_link, &bundle->waiting_calls); spin_unlock(&local->client_call_lock); if (rxrpc_bundle_has_space(bundle)) rxrpc_activate_channels(bundle); } } /* * Note that a call, and thus a connection, is about to be exposed to the * world. */ void rxrpc_expose_client_call(struct rxrpc_call *call) { unsigned int channel = call->cid & RXRPC_CHANNELMASK; struct rxrpc_connection *conn = call->conn; struct rxrpc_channel *chan = &conn->channels[channel]; if (!test_and_set_bit(RXRPC_CALL_EXPOSED, &call->flags)) { /* Mark the call ID as being used. If the callNumber counter * exceeds ~2 billion, we kill the connection after its * outstanding calls have finished so that the counter doesn't * wrap. */ chan->call_counter++; if (chan->call_counter >= INT_MAX) set_bit(RXRPC_CONN_DONT_REUSE, &conn->flags); trace_rxrpc_client(conn, channel, rxrpc_client_exposed); spin_lock(&call->peer->lock); hlist_add_head(&call->error_link, &call->peer->error_targets); spin_unlock(&call->peer->lock); } } /* * Set the reap timer. */ static void rxrpc_set_client_reap_timer(struct rxrpc_local *local) { if (!local->kill_all_client_conns) { unsigned long now = jiffies; unsigned long reap_at = now + rxrpc_conn_idle_client_expiry; if (local->rxnet->live) timer_reduce(&local->client_conn_reap_timer, reap_at); } } /* * Disconnect a client call. */ void rxrpc_disconnect_client_call(struct rxrpc_bundle *bundle, struct rxrpc_call *call) { struct rxrpc_connection *conn; struct rxrpc_channel *chan = NULL; struct rxrpc_local *local = bundle->local; unsigned int channel; bool may_reuse; u32 cid; _enter("c=%x", call->debug_id); /* Calls that have never actually been assigned a channel can simply be * discarded. */ conn = call->conn; if (!conn) { _debug("call is waiting"); ASSERTCMP(call->call_id, ==, 0); ASSERT(!test_bit(RXRPC_CALL_EXPOSED, &call->flags)); list_del_init(&call->wait_link); return; } cid = call->cid; channel = cid & RXRPC_CHANNELMASK; chan = &conn->channels[channel]; trace_rxrpc_client(conn, channel, rxrpc_client_chan_disconnect); if (WARN_ON(chan->call != call)) return; may_reuse = rxrpc_may_reuse_conn(conn); /* If a client call was exposed to the world, we save the result for * retransmission. * * We use a barrier here so that the call number and abort code can be * read without needing to take a lock. * * TODO: Make the incoming packet handler check this and handle * terminal retransmission without requiring access to the call. */ if (test_bit(RXRPC_CALL_EXPOSED, &call->flags)) { _debug("exposed %u,%u", call->call_id, call->abort_code); __rxrpc_disconnect_call(conn, call); if (test_and_clear_bit(RXRPC_CONN_PROBING_FOR_UPGRADE, &conn->flags)) { trace_rxrpc_client(conn, channel, rxrpc_client_to_active); bundle->try_upgrade = false; if (may_reuse) rxrpc_activate_channels(bundle); } } /* See if we can pass the channel directly to another call. */ if (may_reuse && !list_empty(&bundle->waiting_calls)) { trace_rxrpc_client(conn, channel, rxrpc_client_chan_pass); rxrpc_activate_one_channel(conn, channel); return; } /* Schedule the final ACK to be transmitted in a short while so that it * can be skipped if we find a follow-on call. The first DATA packet * of the follow on call will implicitly ACK this call. */ if (call->completion == RXRPC_CALL_SUCCEEDED && test_bit(RXRPC_CALL_EXPOSED, &call->flags)) { unsigned long final_ack_at = jiffies + 2; WRITE_ONCE(chan->final_ack_at, final_ack_at); smp_wmb(); /* vs rxrpc_process_delayed_final_acks() */ set_bit(RXRPC_CONN_FINAL_ACK_0 + channel, &conn->flags); rxrpc_reduce_conn_timer(conn, final_ack_at); } /* Deactivate the channel. */ chan->call = NULL; set_bit(conn->bundle_shift + channel, &conn->bundle->avail_chans); conn->act_chans &= ~(1 << channel); /* If no channels remain active, then put the connection on the idle * list for a short while. Give it a ref to stop it going away if it * becomes unbundled. */ if (!conn->act_chans) { trace_rxrpc_client(conn, channel, rxrpc_client_to_idle); conn->idle_timestamp = jiffies; rxrpc_get_connection(conn, rxrpc_conn_get_idle); list_move_tail(&conn->cache_link, &local->idle_client_conns); rxrpc_set_client_reap_timer(local); } } /* * Remove a connection from a bundle. */ static void rxrpc_unbundle_conn(struct rxrpc_connection *conn) { struct rxrpc_bundle *bundle = conn->bundle; unsigned int bindex; int i; _enter("C=%x", conn->debug_id); if (conn->flags & RXRPC_CONN_FINAL_ACK_MASK) rxrpc_process_delayed_final_acks(conn, true); bindex = conn->bundle_shift / RXRPC_MAXCALLS; if (bundle->conns[bindex] == conn) { _debug("clear slot %u", bindex); bundle->conns[bindex] = NULL; bundle->conn_ids[bindex] = 0; for (i = 0; i < RXRPC_MAXCALLS; i++) clear_bit(conn->bundle_shift + i, &bundle->avail_chans); rxrpc_put_client_connection_id(bundle->local, conn); rxrpc_deactivate_bundle(bundle); rxrpc_put_connection(conn, rxrpc_conn_put_unbundle); } } /* * Drop the active count on a bundle. */ void rxrpc_deactivate_bundle(struct rxrpc_bundle *bundle) { struct rxrpc_local *local; bool need_put = false; if (!bundle) return; local = bundle->local; if (atomic_dec_and_lock(&bundle->active, &local->client_bundles_lock)) { if (!bundle->exclusive) { _debug("erase bundle"); rb_erase(&bundle->local_node, &local->client_bundles); need_put = true; } spin_unlock(&local->client_bundles_lock); if (need_put) rxrpc_put_bundle(bundle, rxrpc_bundle_put_discard); } } /* * Clean up a dead client connection. */ void rxrpc_kill_client_conn(struct rxrpc_connection *conn) { struct rxrpc_local *local = conn->local; struct rxrpc_net *rxnet = local->rxnet; _enter("C=%x", conn->debug_id); trace_rxrpc_client(conn, -1, rxrpc_client_cleanup); atomic_dec(&rxnet->nr_client_conns); rxrpc_put_client_connection_id(local, conn); } /* * Discard expired client connections from the idle list. Each conn in the * idle list has been exposed and holds an extra ref because of that. * * This may be called from conn setup or from a work item so cannot be * considered non-reentrant. */ void rxrpc_discard_expired_client_conns(struct rxrpc_local *local) { struct rxrpc_connection *conn; unsigned long expiry, conn_expires_at, now; unsigned int nr_conns; _enter(""); /* We keep an estimate of what the number of conns ought to be after * we've discarded some so that we don't overdo the discarding. */ nr_conns = atomic_read(&local->rxnet->nr_client_conns); next: conn = list_first_entry_or_null(&local->idle_client_conns, struct rxrpc_connection, cache_link); if (!conn) return; if (!local->kill_all_client_conns) { /* If the number of connections is over the reap limit, we * expedite discard by reducing the expiry timeout. We must, * however, have at least a short grace period to be able to do * final-ACK or ABORT retransmission. */ expiry = rxrpc_conn_idle_client_expiry; if (nr_conns > rxrpc_reap_client_connections) expiry = rxrpc_conn_idle_client_fast_expiry; if (conn->local->service_closed) expiry = rxrpc_closed_conn_expiry * HZ; conn_expires_at = conn->idle_timestamp + expiry; now = READ_ONCE(jiffies); if (time_after(conn_expires_at, now)) goto not_yet_expired; } atomic_dec(&conn->active); trace_rxrpc_client(conn, -1, rxrpc_client_discard); list_del_init(&conn->cache_link); rxrpc_unbundle_conn(conn); /* Drop the ->cache_link ref */ rxrpc_put_connection(conn, rxrpc_conn_put_discard_idle); nr_conns--; goto next; not_yet_expired: /* The connection at the front of the queue hasn't yet expired, so * schedule the work item for that point if we discarded something. * * We don't worry if the work item is already scheduled - it can look * after rescheduling itself at a later time. We could cancel it, but * then things get messier. */ _debug("not yet"); if (!local->kill_all_client_conns) timer_reduce(&local->client_conn_reap_timer, conn_expires_at); _leave(""); } /* * Clean up the client connections on a local endpoint. */ void rxrpc_clean_up_local_conns(struct rxrpc_local *local) { struct rxrpc_connection *conn; _enter(""); local->kill_all_client_conns = true; del_timer_sync(&local->client_conn_reap_timer); while ((conn = list_first_entry_or_null(&local->idle_client_conns, struct rxrpc_connection, cache_link))) { list_del_init(&conn->cache_link); atomic_dec(&conn->active); trace_rxrpc_client(conn, -1, rxrpc_client_discard); rxrpc_unbundle_conn(conn); rxrpc_put_connection(conn, rxrpc_conn_put_local_dead); } _leave(" [culled]"); } |
| 14 19389 22820 130 22825 22846 50 3914 9382 251 7 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 | /* SPDX-License-Identifier: GPL-2.0+ */ /* * Read-Copy Update mechanism for mutual exclusion * * Copyright IBM Corporation, 2001 * * Author: Dipankar Sarma <dipankar@in.ibm.com> * * Based on the original work by Paul McKenney <paulmck@vnet.ibm.com> * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen. * Papers: * http://www.rdrop.com/users/paulmck/paper/rclockpdcsproof.pdf * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001) * * For detailed explanation of Read-Copy Update mechanism see - * http://lse.sourceforge.net/locking/rcupdate.html * */ #ifndef __LINUX_RCUPDATE_H #define __LINUX_RCUPDATE_H #include <linux/types.h> #include <linux/compiler.h> #include <linux/atomic.h> #include <linux/irqflags.h> #include <linux/preempt.h> #include <linux/bottom_half.h> #include <linux/lockdep.h> #include <linux/cleanup.h> #include <asm/processor.h> #include <linux/cpumask.h> #include <linux/context_tracking_irq.h> #define ULONG_CMP_GE(a, b) (ULONG_MAX / 2 >= (a) - (b)) #define ULONG_CMP_LT(a, b) (ULONG_MAX / 2 < (a) - (b)) #define ulong2long(a) (*(long *)(&(a))) #define USHORT_CMP_GE(a, b) (USHRT_MAX / 2 >= (unsigned short)((a) - (b))) #define USHORT_CMP_LT(a, b) (USHRT_MAX / 2 < (unsigned short)((a) - (b))) /* Exported common interfaces */ void call_rcu(struct rcu_head *head, rcu_callback_t func); void rcu_barrier_tasks(void); void rcu_barrier_tasks_rude(void); void synchronize_rcu(void); struct rcu_gp_oldstate; unsigned long get_completed_synchronize_rcu(void); void get_completed_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp); // Maximum number of unsigned long values corresponding to // not-yet-completed RCU grace periods. #define NUM_ACTIVE_RCU_POLL_OLDSTATE 2 /** * same_state_synchronize_rcu - Are two old-state values identical? * @oldstate1: First old-state value. * @oldstate2: Second old-state value. * * The two old-state values must have been obtained from either * get_state_synchronize_rcu(), start_poll_synchronize_rcu(), or * get_completed_synchronize_rcu(). Returns @true if the two values are * identical and @false otherwise. This allows structures whose lifetimes * are tracked by old-state values to push these values to a list header, * allowing those structures to be slightly smaller. */ static inline bool same_state_synchronize_rcu(unsigned long oldstate1, unsigned long oldstate2) { return oldstate1 == oldstate2; } #ifdef CONFIG_PREEMPT_RCU void __rcu_read_lock(void); void __rcu_read_unlock(void); /* * Defined as a macro as it is a very low level header included from * areas that don't even know about current. This gives the rcu_read_lock() * nesting depth, but makes sense only if CONFIG_PREEMPT_RCU -- in other * types of kernel builds, the rcu_read_lock() nesting depth is unknowable. */ #define rcu_preempt_depth() READ_ONCE(current->rcu_read_lock_nesting) #else /* #ifdef CONFIG_PREEMPT_RCU */ #ifdef CONFIG_TINY_RCU #define rcu_read_unlock_strict() do { } while (0) #else void rcu_read_unlock_strict(void); #endif static inline void __rcu_read_lock(void) { preempt_disable(); } static inline void __rcu_read_unlock(void) { preempt_enable(); if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD)) rcu_read_unlock_strict(); } static inline int rcu_preempt_depth(void) { return 0; } #endif /* #else #ifdef CONFIG_PREEMPT_RCU */ #ifdef CONFIG_RCU_LAZY void call_rcu_hurry(struct rcu_head *head, rcu_callback_t func); #else static inline void call_rcu_hurry(struct rcu_head *head, rcu_callback_t func) { call_rcu(head, func); } #endif /* Internal to kernel */ void rcu_init(void); extern int rcu_scheduler_active; void rcu_sched_clock_irq(int user); #ifdef CONFIG_TASKS_RCU_GENERIC void rcu_init_tasks_generic(void); #else static inline void rcu_init_tasks_generic(void) { } #endif #ifdef CONFIG_RCU_STALL_COMMON void rcu_sysrq_start(void); void rcu_sysrq_end(void); #else /* #ifdef CONFIG_RCU_STALL_COMMON */ static inline void rcu_sysrq_start(void) { } static inline void rcu_sysrq_end(void) { } #endif /* #else #ifdef CONFIG_RCU_STALL_COMMON */ #if defined(CONFIG_NO_HZ_FULL) && (!defined(CONFIG_GENERIC_ENTRY) || !defined(CONFIG_KVM_XFER_TO_GUEST_WORK)) void rcu_irq_work_resched(void); #else static inline void rcu_irq_work_resched(void) { } #endif #ifdef CONFIG_RCU_NOCB_CPU void rcu_init_nohz(void); int rcu_nocb_cpu_offload(int cpu); int rcu_nocb_cpu_deoffload(int cpu); void rcu_nocb_flush_deferred_wakeup(void); #else /* #ifdef CONFIG_RCU_NOCB_CPU */ static inline void rcu_init_nohz(void) { } static inline int rcu_nocb_cpu_offload(int cpu) { return -EINVAL; } static inline int rcu_nocb_cpu_deoffload(int cpu) { return 0; } static inline void rcu_nocb_flush_deferred_wakeup(void) { } #endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */ /* * Note a quasi-voluntary context switch for RCU-tasks's benefit. * This is a macro rather than an inline function to avoid #include hell. */ #ifdef CONFIG_TASKS_RCU_GENERIC # ifdef CONFIG_TASKS_RCU # define rcu_tasks_classic_qs(t, preempt) \ do { \ if (!(preempt) && READ_ONCE((t)->rcu_tasks_holdout)) \ WRITE_ONCE((t)->rcu_tasks_holdout, false); \ } while (0) void call_rcu_tasks(struct rcu_head *head, rcu_callback_t func); void synchronize_rcu_tasks(void); # else # define rcu_tasks_classic_qs(t, preempt) do { } while (0) # define call_rcu_tasks call_rcu # define synchronize_rcu_tasks synchronize_rcu # endif # ifdef CONFIG_TASKS_TRACE_RCU // Bits for ->trc_reader_special.b.need_qs field. #define TRC_NEED_QS 0x1 // Task needs a quiescent state. #define TRC_NEED_QS_CHECKED 0x2 // Task has been checked for needing quiescent state. u8 rcu_trc_cmpxchg_need_qs(struct task_struct *t, u8 old, u8 new); void rcu_tasks_trace_qs_blkd(struct task_struct *t); # define rcu_tasks_trace_qs(t) \ do { \ int ___rttq_nesting = READ_ONCE((t)->trc_reader_nesting); \ \ if (likely(!READ_ONCE((t)->trc_reader_special.b.need_qs)) && \ likely(!___rttq_nesting)) { \ rcu_trc_cmpxchg_need_qs((t), 0, TRC_NEED_QS_CHECKED); \ } else if (___rttq_nesting && ___rttq_nesting != INT_MIN && \ !READ_ONCE((t)->trc_reader_special.b.blocked)) { \ rcu_tasks_trace_qs_blkd(t); \ } \ } while (0) # else # define rcu_tasks_trace_qs(t) do { } while (0) # endif #define rcu_tasks_qs(t, preempt) \ do { \ rcu_tasks_classic_qs((t), (preempt)); \ rcu_tasks_trace_qs(t); \ } while (0) # ifdef CONFIG_TASKS_RUDE_RCU void call_rcu_tasks_rude(struct rcu_head *head, rcu_callback_t func); void synchronize_rcu_tasks_rude(void); # endif #define rcu_note_voluntary_context_switch(t) rcu_tasks_qs(t, false) void exit_tasks_rcu_start(void); void exit_tasks_rcu_stop(void); void exit_tasks_rcu_finish(void); #else /* #ifdef CONFIG_TASKS_RCU_GENERIC */ #define rcu_tasks_classic_qs(t, preempt) do { } while (0) #define rcu_tasks_qs(t, preempt) do { } while (0) #define rcu_note_voluntary_context_switch(t) do { } while (0) #define call_rcu_tasks call_rcu #define synchronize_rcu_tasks synchronize_rcu static inline void exit_tasks_rcu_start(void) { } static inline void exit_tasks_rcu_stop(void) { } static inline void exit_tasks_rcu_finish(void) { } #endif /* #else #ifdef CONFIG_TASKS_RCU_GENERIC */ /** * rcu_trace_implies_rcu_gp - does an RCU Tasks Trace grace period imply an RCU grace period? * * As an accident of implementation, an RCU Tasks Trace grace period also * acts as an RCU grace period. However, this could change at any time. * Code relying on this accident must call this function to verify that * this accident is still happening. * * You have been warned! */ static inline bool rcu_trace_implies_rcu_gp(void) { return true; } /** * cond_resched_tasks_rcu_qs - Report potential quiescent states to RCU * * This macro resembles cond_resched(), except that it is defined to * report potential quiescent states to RCU-tasks even if the cond_resched() * machinery were to be shut off, as some advocate for PREEMPTION kernels. */ #define cond_resched_tasks_rcu_qs() \ do { \ rcu_tasks_qs(current, false); \ cond_resched(); \ } while (0) /* * Infrastructure to implement the synchronize_() primitives in * TREE_RCU and rcu_barrier_() primitives in TINY_RCU. */ #if defined(CONFIG_TREE_RCU) #include <linux/rcutree.h> #elif defined(CONFIG_TINY_RCU) #include <linux/rcutiny.h> #else #error "Unknown RCU implementation specified to kernel configuration" #endif /* * The init_rcu_head_on_stack() and destroy_rcu_head_on_stack() calls * are needed for dynamic initialization and destruction of rcu_head * on the stack, and init_rcu_head()/destroy_rcu_head() are needed for * dynamic initialization and destruction of statically allocated rcu_head * structures. However, rcu_head structures allocated dynamically in the * heap don't need any initialization. */ #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD void init_rcu_head(struct rcu_head *head); void destroy_rcu_head(struct rcu_head *head); void init_rcu_head_on_stack(struct rcu_head *head); void destroy_rcu_head_on_stack(struct rcu_head *head); #else /* !CONFIG_DEBUG_OBJECTS_RCU_HEAD */ static inline void init_rcu_head(struct rcu_head *head) { } static inline void destroy_rcu_head(struct rcu_head *head) { } static inline void init_rcu_head_on_stack(struct rcu_head *head) { } static inline void destroy_rcu_head_on_stack(struct rcu_head *head) { } #endif /* #else !CONFIG_DEBUG_OBJECTS_RCU_HEAD */ #if defined(CONFIG_HOTPLUG_CPU) && defined(CONFIG_PROVE_RCU) bool rcu_lockdep_current_cpu_online(void); #else /* #if defined(CONFIG_HOTPLUG_CPU) && defined(CONFIG_PROVE_RCU) */ static inline bool rcu_lockdep_current_cpu_online(void) { return true; } #endif /* #else #if defined(CONFIG_HOTPLUG_CPU) && defined(CONFIG_PROVE_RCU) */ extern struct lockdep_map rcu_lock_map; extern struct lockdep_map rcu_bh_lock_map; extern struct lockdep_map rcu_sched_lock_map; extern struct lockdep_map rcu_callback_map; #ifdef CONFIG_DEBUG_LOCK_ALLOC static inline void rcu_lock_acquire(struct lockdep_map *map) { lock_acquire(map, 0, 0, 2, 0, NULL, _THIS_IP_); } static inline void rcu_lock_release(struct lockdep_map *map) { lock_release(map, _THIS_IP_); } int debug_lockdep_rcu_enabled(void); int rcu_read_lock_held(void); int rcu_read_lock_bh_held(void); int rcu_read_lock_sched_held(void); int rcu_read_lock_any_held(void); #else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ # define rcu_lock_acquire(a) do { } while (0) # define rcu_lock_release(a) do { } while (0) static inline int rcu_read_lock_held(void) { return 1; } static inline int rcu_read_lock_bh_held(void) { return 1; } static inline int rcu_read_lock_sched_held(void) { return !preemptible(); } static inline int rcu_read_lock_any_held(void) { return !preemptible(); } static inline int debug_lockdep_rcu_enabled(void) { return 0; } #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ #ifdef CONFIG_PROVE_RCU /** * RCU_LOCKDEP_WARN - emit lockdep splat if specified condition is met * @c: condition to check * @s: informative message * * This checks debug_lockdep_rcu_enabled() before checking (c) to * prevent early boot splats due to lockdep not yet being initialized, * and rechecks it after checking (c) to prevent false-positive splats * due to races with lockdep being disabled. See commit 3066820034b5dd * ("rcu: Reject RCU_LOCKDEP_WARN() false positives") for more detail. */ #define RCU_LOCKDEP_WARN(c, s) \ do { \ static bool __section(".data.unlikely") __warned; \ if (debug_lockdep_rcu_enabled() && (c) && \ debug_lockdep_rcu_enabled() && !__warned) { \ __warned = true; \ lockdep_rcu_suspicious(__FILE__, __LINE__, s); \ } \ } while (0) #if defined(CONFIG_PROVE_RCU) && !defined(CONFIG_PREEMPT_RCU) static inline void rcu_preempt_sleep_check(void) { RCU_LOCKDEP_WARN(lock_is_held(&rcu_lock_map), "Illegal context switch in RCU read-side critical section"); } #else /* #ifdef CONFIG_PROVE_RCU */ static inline void rcu_preempt_sleep_check(void) { } #endif /* #else #ifdef CONFIG_PROVE_RCU */ #define rcu_sleep_check() \ do { \ rcu_preempt_sleep_check(); \ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) \ RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map), \ "Illegal context switch in RCU-bh read-side critical section"); \ RCU_LOCKDEP_WARN(lock_is_held(&rcu_sched_lock_map), \ "Illegal context switch in RCU-sched read-side critical section"); \ } while (0) #else /* #ifdef CONFIG_PROVE_RCU */ #define RCU_LOCKDEP_WARN(c, s) do { } while (0 && (c)) #define rcu_sleep_check() do { } while (0) #endif /* #else #ifdef CONFIG_PROVE_RCU */ /* * Helper functions for rcu_dereference_check(), rcu_dereference_protected() * and rcu_assign_pointer(). Some of these could be folded into their * callers, but they are left separate in order to ease introduction of * multiple pointers markings to match different RCU implementations * (e.g., __srcu), should this make sense in the future. */ #ifdef __CHECKER__ #define rcu_check_sparse(p, space) \ ((void)(((typeof(*p) space *)p) == p)) #else /* #ifdef __CHECKER__ */ #define rcu_check_sparse(p, space) #endif /* #else #ifdef __CHECKER__ */ #define __unrcu_pointer(p, local) \ ({ \ typeof(*p) *local = (typeof(*p) *__force)(p); \ rcu_check_sparse(p, __rcu); \ ((typeof(*p) __force __kernel *)(local)); \ }) /** * unrcu_pointer - mark a pointer as not being RCU protected * @p: pointer needing to lose its __rcu property * * Converts @p from an __rcu pointer to a __kernel pointer. * This allows an __rcu pointer to be used with xchg() and friends. */ #define unrcu_pointer(p) __unrcu_pointer(p, __UNIQUE_ID(rcu)) #define __rcu_access_pointer(p, local, space) \ ({ \ typeof(*p) *local = (typeof(*p) *__force)READ_ONCE(p); \ rcu_check_sparse(p, space); \ ((typeof(*p) __force __kernel *)(local)); \ }) #define __rcu_dereference_check(p, local, c, space) \ ({ \ /* Dependency order vs. p above. */ \ typeof(*p) *local = (typeof(*p) *__force)READ_ONCE(p); \ RCU_LOCKDEP_WARN(!(c), "suspicious rcu_dereference_check() usage"); \ rcu_check_sparse(p, space); \ ((typeof(*p) __force __kernel *)(local)); \ }) #define __rcu_dereference_protected(p, local, c, space) \ ({ \ RCU_LOCKDEP_WARN(!(c), "suspicious rcu_dereference_protected() usage"); \ rcu_check_sparse(p, space); \ ((typeof(*p) __force __kernel *)(p)); \ }) #define __rcu_dereference_raw(p, local) \ ({ \ /* Dependency order vs. p above. */ \ typeof(p) local = READ_ONCE(p); \ ((typeof(*p) __force __kernel *)(local)); \ }) #define rcu_dereference_raw(p) __rcu_dereference_raw(p, __UNIQUE_ID(rcu)) /** * RCU_INITIALIZER() - statically initialize an RCU-protected global variable * @v: The value to statically initialize with. */ #define RCU_INITIALIZER(v) (typeof(*(v)) __force __rcu *)(v) /** * rcu_assign_pointer() - assign to RCU-protected pointer * @p: pointer to assign to * @v: value to assign (publish) * * Assigns the specified value to the specified RCU-protected * pointer, ensuring that any concurrent RCU readers will see * any prior initialization. * * Inserts memory barriers on architectures that require them * (which is most of them), and also prevents the compiler from * reordering the code that initializes the structure after the pointer * assignment. More importantly, this call documents which pointers * will be dereferenced by RCU read-side code. * * In some special cases, you may use RCU_INIT_POINTER() instead * of rcu_assign_pointer(). RCU_INIT_POINTER() is a bit faster due * to the fact that it does not constrain either the CPU or the compiler. * That said, using RCU_INIT_POINTER() when you should have used * rcu_assign_pointer() is a very bad thing that results in * impossible-to-diagnose memory corruption. So please be careful. * See the RCU_INIT_POINTER() comment header for details. * * Note that rcu_assign_pointer() evaluates each of its arguments only * once, appearances notwithstanding. One of the "extra" evaluations * is in typeof() and the other visible only to sparse (__CHECKER__), * neither of which actually execute the argument. As with most cpp * macros, this execute-arguments-only-once property is important, so * please be careful when making changes to rcu_assign_pointer() and the * other macros that it invokes. */ #define rcu_assign_pointer(p, v) \ do { \ uintptr_t _r_a_p__v = (uintptr_t)(v); \ rcu_check_sparse(p, __rcu); \ \ if (__builtin_constant_p(v) && (_r_a_p__v) == (uintptr_t)NULL) \ WRITE_ONCE((p), (typeof(p))(_r_a_p__v)); \ else \ smp_store_release(&p, RCU_INITIALIZER((typeof(p))_r_a_p__v)); \ } while (0) /** * rcu_replace_pointer() - replace an RCU pointer, returning its old value * @rcu_ptr: RCU pointer, whose old value is returned * @ptr: regular pointer * @c: the lockdep conditions under which the dereference will take place * * Perform a replacement, where @rcu_ptr is an RCU-annotated * pointer and @c is the lockdep argument that is passed to the * rcu_dereference_protected() call used to read that pointer. The old * value of @rcu_ptr is returned, and @rcu_ptr is set to @ptr. */ #define rcu_replace_pointer(rcu_ptr, ptr, c) \ ({ \ typeof(ptr) __tmp = rcu_dereference_protected((rcu_ptr), (c)); \ rcu_assign_pointer((rcu_ptr), (ptr)); \ __tmp; \ }) /** * rcu_access_pointer() - fetch RCU pointer with no dereferencing * @p: The pointer to read * * Return the value of the specified RCU-protected pointer, but omit the * lockdep checks for being in an RCU read-side critical section. This is * useful when the value of this pointer is accessed, but the pointer is * not dereferenced, for example, when testing an RCU-protected pointer * against NULL. Although rcu_access_pointer() may also be used in cases * where update-side locks prevent the value of the pointer from changing, * you should instead use rcu_dereference_protected() for this use case. * Within an RCU read-side critical section, there is little reason to * use rcu_access_pointer(). * * It is usually best to test the rcu_access_pointer() return value * directly in order to avoid accidental dereferences being introduced * by later inattentive changes. In other words, assigning the * rcu_access_pointer() return value to a local variable results in an * accident waiting to happen. * * It is also permissible to use rcu_access_pointer() when read-side * access to the pointer was removed at least one grace period ago, as is * the case in the context of the RCU callback that is freeing up the data, * or after a synchronize_rcu() returns. This can be useful when tearing * down multi-linked structures after a grace period has elapsed. However, * rcu_dereference_protected() is normally preferred for this use case. */ #define rcu_access_pointer(p) __rcu_access_pointer((p), __UNIQUE_ID(rcu), __rcu) /** * rcu_dereference_check() - rcu_dereference with debug checking * @p: The pointer to read, prior to dereferencing * @c: The conditions under which the dereference will take place * * Do an rcu_dereference(), but check that the conditions under which the * dereference will take place are correct. Typically the conditions * indicate the various locking conditions that should be held at that * point. The check should return true if the conditions are satisfied. * An implicit check for being in an RCU read-side critical section * (rcu_read_lock()) is included. * * For example: * * bar = rcu_dereference_check(foo->bar, lockdep_is_held(&foo->lock)); * * could be used to indicate to lockdep that foo->bar may only be dereferenced * if either rcu_read_lock() is held, or that the lock required to replace * the bar struct at foo->bar is held. * * Note that the list of conditions may also include indications of when a lock * need not be held, for example during initialisation or destruction of the * target struct: * * bar = rcu_dereference_check(foo->bar, lockdep_is_held(&foo->lock) || * atomic_read(&foo->usage) == 0); * * Inserts memory barriers on architectures that require them * (currently only the Alpha), prevents the compiler from refetching * (and from merging fetches), and, more importantly, documents exactly * which pointers are protected by RCU and checks that the pointer is * annotated as __rcu. */ #define rcu_dereference_check(p, c) \ __rcu_dereference_check((p), __UNIQUE_ID(rcu), \ (c) || rcu_read_lock_held(), __rcu) /** * rcu_dereference_bh_check() - rcu_dereference_bh with debug checking * @p: The pointer to read, prior to dereferencing * @c: The conditions under which the dereference will take place * * This is the RCU-bh counterpart to rcu_dereference_check(). However, * please note that starting in v5.0 kernels, vanilla RCU grace periods * wait for local_bh_disable() regions of code in addition to regions of * code demarked by rcu_read_lock() and rcu_read_unlock(). This means * that synchronize_rcu(), call_rcu, and friends all take not only * rcu_read_lock() but also rcu_read_lock_bh() into account. */ #define rcu_dereference_bh_check(p, c) \ __rcu_dereference_check((p), __UNIQUE_ID(rcu), \ (c) || rcu_read_lock_bh_held(), __rcu) /** * rcu_dereference_sched_check() - rcu_dereference_sched with debug checking * @p: The pointer to read, prior to dereferencing * @c: The conditions under which the dereference will take place * * This is the RCU-sched counterpart to rcu_dereference_check(). * However, please note that starting in v5.0 kernels, vanilla RCU grace * periods wait for preempt_disable() regions of code in addition to * regions of code demarked by rcu_read_lock() and rcu_read_unlock(). * This means that synchronize_rcu(), call_rcu, and friends all take not * only rcu_read_lock() but also rcu_read_lock_sched() into account. */ #define rcu_dereference_sched_check(p, c) \ __rcu_dereference_check((p), __UNIQUE_ID(rcu), \ (c) || rcu_read_lock_sched_held(), \ __rcu) /* * The tracing infrastructure traces RCU (we want that), but unfortunately * some of the RCU checks causes tracing to lock up the system. * * The no-tracing version of rcu_dereference_raw() must not call * rcu_read_lock_held(). */ #define rcu_dereference_raw_check(p) \ __rcu_dereference_check((p), __UNIQUE_ID(rcu), 1, __rcu) /** * rcu_dereference_protected() - fetch RCU pointer when updates prevented * @p: The pointer to read, prior to dereferencing * @c: The conditions under which the dereference will take place * * Return the value of the specified RCU-protected pointer, but omit * the READ_ONCE(). This is useful in cases where update-side locks * prevent the value of the pointer from changing. Please note that this * primitive does *not* prevent the compiler from repeating this reference * or combining it with other references, so it should not be used without * protection of appropriate locks. * * This function is only for update-side use. Using this function * when protected only by rcu_read_lock() will result in infrequent * but very ugly failures. */ #define rcu_dereference_protected(p, c) \ __rcu_dereference_protected((p), __UNIQUE_ID(rcu), (c), __rcu) /** * rcu_dereference() - fetch RCU-protected pointer for dereferencing * @p: The pointer to read, prior to dereferencing * * This is a simple wrapper around rcu_dereference_check(). */ #define rcu_dereference(p) rcu_dereference_check(p, 0) /** * rcu_dereference_bh() - fetch an RCU-bh-protected pointer for dereferencing * @p: The pointer to read, prior to dereferencing * * Makes rcu_dereference_check() do the dirty work. */ #define rcu_dereference_bh(p) rcu_dereference_bh_check(p, 0) /** * rcu_dereference_sched() - fetch RCU-sched-protected pointer for dereferencing * @p: The pointer to read, prior to dereferencing * * Makes rcu_dereference_check() do the dirty work. */ #define rcu_dereference_sched(p) rcu_dereference_sched_check(p, 0) /** * rcu_pointer_handoff() - Hand off a pointer from RCU to other mechanism * @p: The pointer to hand off * * This is simply an identity function, but it documents where a pointer * is handed off from RCU to some other synchronization mechanism, for * example, reference counting or locking. In C11, it would map to * kill_dependency(). It could be used as follows:: * * rcu_read_lock(); * p = rcu_dereference(gp); * long_lived = is_long_lived(p); * if (long_lived) { * if (!atomic_inc_not_zero(p->refcnt)) * long_lived = false; * else * p = rcu_pointer_handoff(p); * } * rcu_read_unlock(); */ #define rcu_pointer_handoff(p) (p) /** * rcu_read_lock() - mark the beginning of an RCU read-side critical section * * When synchronize_rcu() is invoked on one CPU while other CPUs * are within RCU read-side critical sections, then the * synchronize_rcu() is guaranteed to block until after all the other * CPUs exit their critical sections. Similarly, if call_rcu() is invoked * on one CPU while other CPUs are within RCU read-side critical * sections, invocation of the corresponding RCU callback is deferred * until after the all the other CPUs exit their critical sections. * * In v5.0 and later kernels, synchronize_rcu() and call_rcu() also * wait for regions of code with preemption disabled, including regions of * code with interrupts or softirqs disabled. In pre-v5.0 kernels, which * define synchronize_sched(), only code enclosed within rcu_read_lock() * and rcu_read_unlock() are guaranteed to be waited for. * * Note, however, that RCU callbacks are permitted to run concurrently * with new RCU read-side critical sections. One way that this can happen * is via the following sequence of events: (1) CPU 0 enters an RCU * read-side critical section, (2) CPU 1 invokes call_rcu() to register * an RCU callback, (3) CPU 0 exits the RCU read-side critical section, * (4) CPU 2 enters a RCU read-side critical section, (5) the RCU * callback is invoked. This is legal, because the RCU read-side critical * section that was running concurrently with the call_rcu() (and which * therefore might be referencing something that the corresponding RCU * callback would free up) has completed before the corresponding * RCU callback is invoked. * * RCU read-side critical sections may be nested. Any deferred actions * will be deferred until the outermost RCU read-side critical section * completes. * * You can avoid reading and understanding the next paragraph by * following this rule: don't put anything in an rcu_read_lock() RCU * read-side critical section that would block in a !PREEMPTION kernel. * But if you want the full story, read on! * * In non-preemptible RCU implementations (pure TREE_RCU and TINY_RCU), * it is illegal to block while in an RCU read-side critical section. * In preemptible RCU implementations (PREEMPT_RCU) in CONFIG_PREEMPTION * kernel builds, RCU read-side critical sections may be preempted, * but explicit blocking is illegal. Finally, in preemptible RCU * implementations in real-time (with -rt patchset) kernel builds, RCU * read-side critical sections may be preempted and they may also block, but * only when acquiring spinlocks that are subject to priority inheritance. */ static __always_inline void rcu_read_lock(void) { __rcu_read_lock(); __acquire(RCU); rcu_lock_acquire(&rcu_lock_map); RCU_LOCKDEP_WARN(!rcu_is_watching(), "rcu_read_lock() used illegally while idle"); } /* * So where is rcu_write_lock()? It does not exist, as there is no * way for writers to lock out RCU readers. This is a feature, not * a bug -- this property is what provides RCU's performance benefits. * Of course, writers must coordinate with each other. The normal * spinlock primitives work well for this, but any other technique may be * used as well. RCU does not care how the writers keep out of each * others' way, as long as they do so. */ /** * rcu_read_unlock() - marks the end of an RCU read-side critical section. * * In almost all situations, rcu_read_unlock() is immune from deadlock. * In recent kernels that have consolidated synchronize_sched() and * synchronize_rcu_bh() into synchronize_rcu(), this deadlock immunity * also extends to the scheduler's runqueue and priority-inheritance * spinlocks, courtesy of the quiescent-state deferral that is carried * out when rcu_read_unlock() is invoked with interrupts disabled. * * See rcu_read_lock() for more information. */ static inline void rcu_read_unlock(void) { RCU_LOCKDEP_WARN(!rcu_is_watching(), "rcu_read_unlock() used illegally while idle"); __release(RCU); __rcu_read_unlock(); rcu_lock_release(&rcu_lock_map); /* Keep acq info for rls diags. */ } /** * rcu_read_lock_bh() - mark the beginning of an RCU-bh critical section * * This is equivalent to rcu_read_lock(), but also disables softirqs. * Note that anything else that disables softirqs can also serve as an RCU * read-side critical section. However, please note that this equivalence * applies only to v5.0 and later. Before v5.0, rcu_read_lock() and * rcu_read_lock_bh() were unrelated. * * Note that rcu_read_lock_bh() and the matching rcu_read_unlock_bh() * must occur in the same context, for example, it is illegal to invoke * rcu_read_unlock_bh() from one task if the matching rcu_read_lock_bh() * was invoked from some other task. */ static inline void rcu_read_lock_bh(void) { local_bh_disable(); __acquire(RCU_BH); rcu_lock_acquire(&rcu_bh_lock_map); RCU_LOCKDEP_WARN(!rcu_is_watching(), "rcu_read_lock_bh() used illegally while idle"); } /** * rcu_read_unlock_bh() - marks the end of a softirq-only RCU critical section * * See rcu_read_lock_bh() for more information. */ static inline void rcu_read_unlock_bh(void) { RCU_LOCKDEP_WARN(!rcu_is_watching(), "rcu_read_unlock_bh() used illegally while idle"); rcu_lock_release(&rcu_bh_lock_map); __release(RCU_BH); local_bh_enable(); } /** * rcu_read_lock_sched() - mark the beginning of a RCU-sched critical section * * This is equivalent to rcu_read_lock(), but also disables preemption. * Read-side critical sections can also be introduced by anything else that * disables preemption, including local_irq_disable() and friends. However, * please note that the equivalence to rcu_read_lock() applies only to * v5.0 and later. Before v5.0, rcu_read_lock() and rcu_read_lock_sched() * were unrelated. * * Note that rcu_read_lock_sched() and the matching rcu_read_unlock_sched() * must occur in the same context, for example, it is illegal to invoke * rcu_read_unlock_sched() from process context if the matching * rcu_read_lock_sched() was invoked from an NMI handler. */ static inline void rcu_read_lock_sched(void) { preempt_disable(); __acquire(RCU_SCHED); rcu_lock_acquire(&rcu_sched_lock_map); RCU_LOCKDEP_WARN(!rcu_is_watching(), "rcu_read_lock_sched() used illegally while idle"); } /* Used by lockdep and tracing: cannot be traced, cannot call lockdep. */ static inline notrace void rcu_read_lock_sched_notrace(void) { preempt_disable_notrace(); __acquire(RCU_SCHED); } /** * rcu_read_unlock_sched() - marks the end of a RCU-classic critical section * * See rcu_read_lock_sched() for more information. */ static inline void rcu_read_unlock_sched(void) { RCU_LOCKDEP_WARN(!rcu_is_watching(), "rcu_read_unlock_sched() used illegally while idle"); rcu_lock_release(&rcu_sched_lock_map); __release(RCU_SCHED); preempt_enable(); } /* Used by lockdep and tracing: cannot be traced, cannot call lockdep. */ static inline notrace void rcu_read_unlock_sched_notrace(void) { __release(RCU_SCHED); preempt_enable_notrace(); } /** * RCU_INIT_POINTER() - initialize an RCU protected pointer * @p: The pointer to be initialized. * @v: The value to initialized the pointer to. * * Initialize an RCU-protected pointer in special cases where readers * do not need ordering constraints on the CPU or the compiler. These * special cases are: * * 1. This use of RCU_INIT_POINTER() is NULLing out the pointer *or* * 2. The caller has taken whatever steps are required to prevent * RCU readers from concurrently accessing this pointer *or* * 3. The referenced data structure has already been exposed to * readers either at compile time or via rcu_assign_pointer() *and* * * a. You have not made *any* reader-visible changes to * this structure since then *or* * b. It is OK for readers accessing this structure from its * new location to see the old state of the structure. (For * example, the changes were to statistical counters or to * other state where exact synchronization is not required.) * * Failure to follow these rules governing use of RCU_INIT_POINTER() will * result in impossible-to-diagnose memory corruption. As in the structures * will look OK in crash dumps, but any concurrent RCU readers might * see pre-initialized values of the referenced data structure. So * please be very careful how you use RCU_INIT_POINTER()!!! * * If you are creating an RCU-protected linked structure that is accessed * by a single external-to-structure RCU-protected pointer, then you may * use RCU_INIT_POINTER() to initialize the internal RCU-protected * pointers, but you must use rcu_assign_pointer() to initialize the * external-to-structure pointer *after* you have completely initialized * the reader-accessible portions of the linked structure. * * Note that unlike rcu_assign_pointer(), RCU_INIT_POINTER() provides no * ordering guarantees for either the CPU or the compiler. */ #define RCU_INIT_POINTER(p, v) \ do { \ rcu_check_sparse(p, __rcu); \ WRITE_ONCE(p, RCU_INITIALIZER(v)); \ } while (0) /** * RCU_POINTER_INITIALIZER() - statically initialize an RCU protected pointer * @p: The pointer to be initialized. * @v: The value to initialized the pointer to. * * GCC-style initialization for an RCU-protected pointer in a structure field. */ #define RCU_POINTER_INITIALIZER(p, v) \ .p = RCU_INITIALIZER(v) /* * Does the specified offset indicate that the corresponding rcu_head * structure can be handled by kvfree_rcu()? */ #define __is_kvfree_rcu_offset(offset) ((offset) < 4096) /** * kfree_rcu() - kfree an object after a grace period. * @ptr: pointer to kfree for double-argument invocations. * @rhf: the name of the struct rcu_head within the type of @ptr. * * Many rcu callbacks functions just call kfree() on the base structure. * These functions are trivial, but their size adds up, and furthermore * when they are used in a kernel module, that module must invoke the * high-latency rcu_barrier() function at module-unload time. * * The kfree_rcu() function handles this issue. Rather than encoding a * function address in the embedded rcu_head structure, kfree_rcu() instead * encodes the offset of the rcu_head structure within the base structure. * Because the functions are not allowed in the low-order 4096 bytes of * kernel virtual memory, offsets up to 4095 bytes can be accommodated. * If the offset is larger than 4095 bytes, a compile-time error will * be generated in kvfree_rcu_arg_2(). If this error is triggered, you can * either fall back to use of call_rcu() or rearrange the structure to * position the rcu_head structure into the first 4096 bytes. * * The object to be freed can be allocated either by kmalloc() or * kmem_cache_alloc(). * * Note that the allowable offset might decrease in the future. * * The BUILD_BUG_ON check must not involve any function calls, hence the * checks are done in macros here. */ #define kfree_rcu(ptr, rhf) kvfree_rcu_arg_2(ptr, rhf) #define kvfree_rcu(ptr, rhf) kvfree_rcu_arg_2(ptr, rhf) /** * kfree_rcu_mightsleep() - kfree an object after a grace period. * @ptr: pointer to kfree for single-argument invocations. * * When it comes to head-less variant, only one argument * is passed and that is just a pointer which has to be * freed after a grace period. Therefore the semantic is * * kfree_rcu_mightsleep(ptr); * * where @ptr is the pointer to be freed by kvfree(). * * Please note, head-less way of freeing is permitted to * use from a context that has to follow might_sleep() * annotation. Otherwise, please switch and embed the * rcu_head structure within the type of @ptr. */ #define kfree_rcu_mightsleep(ptr) kvfree_rcu_arg_1(ptr) #define kvfree_rcu_mightsleep(ptr) kvfree_rcu_arg_1(ptr) #define kvfree_rcu_arg_2(ptr, rhf) \ do { \ typeof (ptr) ___p = (ptr); \ \ if (___p) { \ BUILD_BUG_ON(!__is_kvfree_rcu_offset(offsetof(typeof(*(ptr)), rhf))); \ kvfree_call_rcu(&((___p)->rhf), (void *) (___p)); \ } \ } while (0) #define kvfree_rcu_arg_1(ptr) \ do { \ typeof(ptr) ___p = (ptr); \ \ if (___p) \ kvfree_call_rcu(NULL, (void *) (___p)); \ } while (0) /* * Place this after a lock-acquisition primitive to guarantee that * an UNLOCK+LOCK pair acts as a full barrier. This guarantee applies * if the UNLOCK and LOCK are executed by the same CPU or if the * UNLOCK and LOCK operate on the same lock variable. */ #ifdef CONFIG_ARCH_WEAK_RELEASE_ACQUIRE #define smp_mb__after_unlock_lock() smp_mb() /* Full ordering for lock. */ #else /* #ifdef CONFIG_ARCH_WEAK_RELEASE_ACQUIRE */ #define smp_mb__after_unlock_lock() do { } while (0) #endif /* #else #ifdef CONFIG_ARCH_WEAK_RELEASE_ACQUIRE */ /* Has the specified rcu_head structure been handed to call_rcu()? */ /** * rcu_head_init - Initialize rcu_head for rcu_head_after_call_rcu() * @rhp: The rcu_head structure to initialize. * * If you intend to invoke rcu_head_after_call_rcu() to test whether a * given rcu_head structure has already been passed to call_rcu(), then * you must also invoke this rcu_head_init() function on it just after * allocating that structure. Calls to this function must not race with * calls to call_rcu(), rcu_head_after_call_rcu(), or callback invocation. */ static inline void rcu_head_init(struct rcu_head *rhp) { rhp->func = (rcu_callback_t)~0L; } /** * rcu_head_after_call_rcu() - Has this rcu_head been passed to call_rcu()? * @rhp: The rcu_head structure to test. * @f: The function passed to call_rcu() along with @rhp. * * Returns @true if the @rhp has been passed to call_rcu() with @func, * and @false otherwise. Emits a warning in any other case, including * the case where @rhp has already been invoked after a grace period. * Calls to this function must not race with callback invocation. One way * to avoid such races is to enclose the call to rcu_head_after_call_rcu() * in an RCU read-side critical section that includes a read-side fetch * of the pointer to the structure containing @rhp. */ static inline bool rcu_head_after_call_rcu(struct rcu_head *rhp, rcu_callback_t f) { rcu_callback_t func = READ_ONCE(rhp->func); if (func == f) return true; WARN_ON_ONCE(func != (rcu_callback_t)~0L); return false; } /* kernel/ksysfs.c definitions */ extern int rcu_expedited; extern int rcu_normal; DEFINE_LOCK_GUARD_0(rcu, rcu_read_lock(), rcu_read_unlock()) #endif /* __LINUX_RCUPDATE_H */ |
| 6235 4869 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 | /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM vmalloc #if !defined(_TRACE_VMALLOC_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_VMALLOC_H #include <linux/tracepoint.h> /** * alloc_vmap_area - called when a new vmap allocation occurs * @addr: an allocated address * @size: a requested size * @align: a requested alignment * @vstart: a requested start range * @vend: a requested end range * @failed: an allocation failed or not * * This event is used for a debug purpose, it can give an extra * information for a developer about how often it occurs and which * parameters are passed for further validation. */ TRACE_EVENT(alloc_vmap_area, TP_PROTO(unsigned long addr, unsigned long size, unsigned long align, unsigned long vstart, unsigned long vend, int failed), TP_ARGS(addr, size, align, vstart, vend, failed), TP_STRUCT__entry( __field(unsigned long, addr) __field(unsigned long, size) __field(unsigned long, align) __field(unsigned long, vstart) __field(unsigned long, vend) __field(int, failed) ), TP_fast_assign( __entry->addr = addr; __entry->size = size; __entry->align = align; __entry->vstart = vstart; __entry->vend = vend; __entry->failed = failed; ), TP_printk("va_start: %lu size=%lu align=%lu vstart=0x%lx vend=0x%lx failed=%d", __entry->addr, __entry->size, __entry->align, __entry->vstart, __entry->vend, __entry->failed) ); /** * purge_vmap_area_lazy - called when vmap areas were lazily freed * @start: purging start address * @end: purging end address * @npurged: numbed of purged vmap areas * * This event is used for a debug purpose. It gives some * indication about start:end range and how many objects * are released. */ TRACE_EVENT(purge_vmap_area_lazy, TP_PROTO(unsigned long start, unsigned long end, unsigned int npurged), TP_ARGS(start, end, npurged), TP_STRUCT__entry( __field(unsigned long, start) __field(unsigned long, end) __field(unsigned int, npurged) ), TP_fast_assign( __entry->start = start; __entry->end = end; __entry->npurged = npurged; ), TP_printk("start=0x%lx end=0x%lx num_purged=%u", __entry->start, __entry->end, __entry->npurged) ); /** * free_vmap_area_noflush - called when a vmap area is freed * @va_start: a start address of VA * @nr_lazy: number of current lazy pages * @nr_lazy_max: number of maximum lazy pages * * This event is used for a debug purpose. It gives some * indication about a VA that is released, number of current * outstanding areas and a maximum allowed threshold before * dropping all of them. */ TRACE_EVENT(free_vmap_area_noflush, TP_PROTO(unsigned long va_start, unsigned long nr_lazy, unsigned long nr_lazy_max), TP_ARGS(va_start, nr_lazy, nr_lazy_max), TP_STRUCT__entry( __field(unsigned long, va_start) __field(unsigned long, nr_lazy) __field(unsigned long, nr_lazy_max) ), TP_fast_assign( __entry->va_start = va_start; __entry->nr_lazy = nr_lazy; __entry->nr_lazy_max = nr_lazy_max; ), TP_printk("va_start=0x%lx nr_lazy=%lu nr_lazy_max=%lu", __entry->va_start, __entry->nr_lazy, __entry->nr_lazy_max) ); #endif /* _TRACE_VMALLOC_H */ /* This part must be outside protection */ #include <trace/define_trace.h> |
| 4142 3911 404 147 2657 4472 6086 4008 1104 4358 1953 2439 1953 574 1050 5789 2229 28 5796 5796 5796 6087 2440 1932 5796 575 575 4127 4128 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 | /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * linux/include/linux/jbd2.h * * Written by Stephen C. Tweedie <sct@redhat.com> * * Copyright 1998-2000 Red Hat, Inc --- All Rights Reserved * * Definitions for transaction data structures for the buffer cache * filesystem journaling support. */ #ifndef _LINUX_JBD2_H #define _LINUX_JBD2_H /* Allow this file to be included directly into e2fsprogs */ #ifndef __KERNEL__ #include "jfs_compat.h" #define JBD2_DEBUG #else #include <linux/types.h> #include <linux/buffer_head.h> #include <linux/journal-head.h> #include <linux/stddef.h> #include <linux/mutex.h> #include <linux/timer.h> #include <linux/slab.h> #include <linux/bit_spinlock.h> #include <linux/blkdev.h> #include <crypto/hash.h> #endif #define journal_oom_retry 1 /* * Define JBD2_PARANIOD_IOFAIL to cause a kernel BUG() if ext4 finds * certain classes of error which can occur due to failed IOs. Under * normal use we want ext4 to continue after such errors, because * hardware _can_ fail, but for debugging purposes when running tests on * known-good hardware we may want to trap these errors. */ #undef JBD2_PARANOID_IOFAIL /* * The default maximum commit age, in seconds. */ #define JBD2_DEFAULT_MAX_COMMIT_AGE 5 #ifdef CONFIG_JBD2_DEBUG /* * Define JBD2_EXPENSIVE_CHECKING to enable more expensive internal * consistency checks. By default we don't do this unless * CONFIG_JBD2_DEBUG is on. */ #define JBD2_EXPENSIVE_CHECKING void __jbd2_debug(int level, const char *file, const char *func, unsigned int line, const char *fmt, ...); #define jbd2_debug(n, fmt, a...) \ __jbd2_debug((n), __FILE__, __func__, __LINE__, (fmt), ##a) #else #define jbd2_debug(n, fmt, a...) no_printk(fmt, ##a) #endif extern void *jbd2_alloc(size_t size, gfp_t flags); extern void jbd2_free(void *ptr, size_t size); #define JBD2_MIN_JOURNAL_BLOCKS 1024 #define JBD2_DEFAULT_FAST_COMMIT_BLOCKS 256 #ifdef __KERNEL__ /** * typedef handle_t - The handle_t type represents a single atomic update being performed by some process. * * All filesystem modifications made by the process go * through this handle. Recursive operations (such as quota operations) * are gathered into a single update. * * The buffer credits field is used to account for journaled buffers * being modified by the running process. To ensure that there is * enough log space for all outstanding operations, we need to limit the * number of outstanding buffers possible at any time. When the * operation completes, any buffer credits not used are credited back to * the transaction, so that at all times we know how many buffers the * outstanding updates on a transaction might possibly touch. * * This is an opaque datatype. **/ typedef struct jbd2_journal_handle handle_t; /* Atomic operation type */ /** * typedef journal_t - The journal_t maintains all of the journaling state information for a single filesystem. * * journal_t is linked to from the fs superblock structure. * * We use the journal_t to keep track of all outstanding transaction * activity on the filesystem, and to manage the state of the log * writing process. * * This is an opaque datatype. **/ typedef struct journal_s journal_t; /* Journal control structure */ #endif /* * Internal structures used by the logging mechanism: */ #define JBD2_MAGIC_NUMBER 0xc03b3998U /* The first 4 bytes of /dev/random! */ /* * On-disk structures */ /* * Descriptor block types: */ #define JBD2_DESCRIPTOR_BLOCK 1 #define JBD2_COMMIT_BLOCK 2 #define JBD2_SUPERBLOCK_V1 3 #define JBD2_SUPERBLOCK_V2 4 #define JBD2_REVOKE_BLOCK 5 /* * Standard header for all descriptor blocks: */ typedef struct journal_header_s { __be32 h_magic; __be32 h_blocktype; __be32 h_sequence; } journal_header_t; /* * Checksum types. */ #define JBD2_CRC32_CHKSUM 1 #define JBD2_MD5_CHKSUM 2 #define JBD2_SHA1_CHKSUM 3 #define JBD2_CRC32C_CHKSUM 4 #define JBD2_CRC32_CHKSUM_SIZE 4 #define JBD2_CHECKSUM_BYTES (32 / sizeof(u32)) /* * Commit block header for storing transactional checksums: * * NOTE: If FEATURE_COMPAT_CHECKSUM (checksum v1) is set, the h_chksum* * fields are used to store a checksum of the descriptor and data blocks. * * If FEATURE_INCOMPAT_CSUM_V2 (checksum v2) is set, then the h_chksum * field is used to store crc32c(uuid+commit_block). Each journal metadata * block gets its own checksum, and data block checksums are stored in * journal_block_tag (in the descriptor). The other h_chksum* fields are * not used. * * If FEATURE_INCOMPAT_CSUM_V3 is set, the descriptor block uses * journal_block_tag3_t to store a full 32-bit checksum. Everything else * is the same as v2. * * Checksum v1, v2, and v3 are mutually exclusive features. */ struct commit_header { __be32 h_magic; __be32 h_blocktype; __be32 h_sequence; unsigned char h_chksum_type; unsigned char h_chksum_size; unsigned char h_padding[2]; __be32 h_chksum[JBD2_CHECKSUM_BYTES]; __be64 h_commit_sec; __be32 h_commit_nsec; }; /* * The block tag: used to describe a single buffer in the journal. * t_blocknr_high is only used if INCOMPAT_64BIT is set, so this * raw struct shouldn't be used for pointer math or sizeof() - use * journal_tag_bytes(journal) instead to compute this. */ typedef struct journal_block_tag3_s { __be32 t_blocknr; /* The on-disk block number */ __be32 t_flags; /* See below */ __be32 t_blocknr_high; /* most-significant high 32bits. */ __be32 t_checksum; /* crc32c(uuid+seq+block) */ } journal_block_tag3_t; typedef struct journal_block_tag_s { __be32 t_blocknr; /* The on-disk block number */ __be16 t_checksum; /* truncated crc32c(uuid+seq+block) */ __be16 t_flags; /* See below */ __be32 t_blocknr_high; /* most-significant high 32bits. */ } journal_block_tag_t; /* Tail of descriptor or revoke block, for checksumming */ struct jbd2_journal_block_tail { __be32 t_checksum; /* crc32c(uuid+descr_block) */ }; /* * The revoke descriptor: used on disk to describe a series of blocks to * be revoked from the log */ typedef struct jbd2_journal_revoke_header_s { journal_header_t r_header; __be32 r_count; /* Count of bytes used in the block */ } jbd2_journal_revoke_header_t; /* Definitions for the journal tag flags word: */ #define JBD2_FLAG_ESCAPE 1 /* on-disk block is escaped */ #define JBD2_FLAG_SAME_UUID 2 /* block has same uuid as previous */ #define JBD2_FLAG_DELETED 4 /* block deleted by this transaction */ #define JBD2_FLAG_LAST_TAG 8 /* last tag in this descriptor block */ /* * The journal superblock. All fields are in big-endian byte order. */ typedef struct journal_superblock_s { /* 0x0000 */ journal_header_t s_header; /* 0x000C */ /* Static information describing the journal */ __be32 s_blocksize; /* journal device blocksize */ __be32 s_maxlen; /* total blocks in journal file */ __be32 s_first; /* first block of log information */ /* 0x0018 */ /* Dynamic information describing the current state of the log */ __be32 s_sequence; /* first commit ID expected in log */ __be32 s_start; /* blocknr of start of log */ /* 0x0020 */ /* Error value, as set by jbd2_journal_abort(). */ __be32 s_errno; /* 0x0024 */ /* Remaining fields are only valid in a version-2 superblock */ __be32 s_feature_compat; /* compatible feature set */ __be32 s_feature_incompat; /* incompatible feature set */ __be32 s_feature_ro_compat; /* readonly-compatible feature set */ /* 0x0030 */ __u8 s_uuid[16]; /* 128-bit uuid for journal */ /* 0x0040 */ __be32 s_nr_users; /* Nr of filesystems sharing log */ __be32 s_dynsuper; /* Blocknr of dynamic superblock copy*/ /* 0x0048 */ __be32 s_max_transaction; /* Limit of journal blocks per trans.*/ __be32 s_max_trans_data; /* Limit of data blocks per trans. */ /* 0x0050 */ __u8 s_checksum_type; /* checksum type */ __u8 s_padding2[3]; /* 0x0054 */ __be32 s_num_fc_blks; /* Number of fast commit blocks */ __be32 s_head; /* blocknr of head of log, only uptodate * while the filesystem is clean */ /* 0x005C */ __u32 s_padding[40]; __be32 s_checksum; /* crc32c(superblock) */ /* 0x0100 */ __u8 s_users[16*48]; /* ids of all fs'es sharing the log */ /* 0x0400 */ } journal_superblock_t; #define JBD2_FEATURE_COMPAT_CHECKSUM 0x00000001 #define JBD2_FEATURE_INCOMPAT_REVOKE 0x00000001 #define JBD2_FEATURE_INCOMPAT_64BIT 0x00000002 #define JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT 0x00000004 #define JBD2_FEATURE_INCOMPAT_CSUM_V2 0x00000008 #define JBD2_FEATURE_INCOMPAT_CSUM_V3 0x00000010 #define JBD2_FEATURE_INCOMPAT_FAST_COMMIT 0x00000020 /* See "journal feature predicate functions" below */ /* Features known to this kernel version: */ #define JBD2_KNOWN_COMPAT_FEATURES JBD2_FEATURE_COMPAT_CHECKSUM #define JBD2_KNOWN_ROCOMPAT_FEATURES 0 #define JBD2_KNOWN_INCOMPAT_FEATURES (JBD2_FEATURE_INCOMPAT_REVOKE | \ JBD2_FEATURE_INCOMPAT_64BIT | \ JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT | \ JBD2_FEATURE_INCOMPAT_CSUM_V2 | \ JBD2_FEATURE_INCOMPAT_CSUM_V3 | \ JBD2_FEATURE_INCOMPAT_FAST_COMMIT) #ifdef __KERNEL__ #include <linux/fs.h> #include <linux/sched.h> enum jbd_state_bits { BH_JBD /* Has an attached ext3 journal_head */ = BH_PrivateStart, BH_JWrite, /* Being written to log (@@@ DEBUGGING) */ BH_Freed, /* Has been freed (truncated) */ BH_Revoked, /* Has been revoked from the log */ BH_RevokeValid, /* Revoked flag is valid */ BH_JBDDirty, /* Is dirty but journaled */ BH_JournalHead, /* Pins bh->b_private and jh->b_bh */ BH_Shadow, /* IO on shadow buffer is running */ BH_Verified, /* Metadata block has been verified ok */ BH_JBDPrivateStart, /* First bit available for private use by FS */ }; BUFFER_FNS(JBD, jbd) BUFFER_FNS(JWrite, jwrite) BUFFER_FNS(JBDDirty, jbddirty) TAS_BUFFER_FNS(JBDDirty, jbddirty) BUFFER_FNS(Revoked, revoked) TAS_BUFFER_FNS(Revoked, revoked) BUFFER_FNS(RevokeValid, revokevalid) TAS_BUFFER_FNS(RevokeValid, revokevalid) BUFFER_FNS(Freed, freed) BUFFER_FNS(Shadow, shadow) BUFFER_FNS(Verified, verified) static inline struct buffer_head *jh2bh(struct journal_head *jh) { return jh->b_bh; } static inline struct journal_head *bh2jh(struct buffer_head *bh) { return bh->b_private; } static inline void jbd_lock_bh_journal_head(struct buffer_head *bh) { bit_spin_lock(BH_JournalHead, &bh->b_state); } static inline void jbd_unlock_bh_journal_head(struct buffer_head *bh) { bit_spin_unlock(BH_JournalHead, &bh->b_state); } #define J_ASSERT(assert) BUG_ON(!(assert)) #define J_ASSERT_BH(bh, expr) J_ASSERT(expr) #define J_ASSERT_JH(jh, expr) J_ASSERT(expr) #if defined(JBD2_PARANOID_IOFAIL) #define J_EXPECT(expr, why...) J_ASSERT(expr) #define J_EXPECT_BH(bh, expr, why...) J_ASSERT_BH(bh, expr) #define J_EXPECT_JH(jh, expr, why...) J_ASSERT_JH(jh, expr) #else #define __journal_expect(expr, why...) \ ({ \ int val = (expr); \ if (!val) { \ printk(KERN_ERR \ "JBD2 unexpected failure: %s: %s;\n", \ __func__, #expr); \ printk(KERN_ERR why "\n"); \ } \ val; \ }) #define J_EXPECT(expr, why...) __journal_expect(expr, ## why) #define J_EXPECT_BH(bh, expr, why...) __journal_expect(expr, ## why) #define J_EXPECT_JH(jh, expr, why...) __journal_expect(expr, ## why) #endif /* Flags in jbd_inode->i_flags */ #define __JI_COMMIT_RUNNING 0 #define __JI_WRITE_DATA 1 #define __JI_WAIT_DATA 2 /* * Commit of the inode data in progress. We use this flag to protect us from * concurrent deletion of inode. We cannot use reference to inode for this * since we cannot afford doing last iput() on behalf of kjournald */ #define JI_COMMIT_RUNNING (1 << __JI_COMMIT_RUNNING) /* Write allocated dirty buffers in this inode before commit */ #define JI_WRITE_DATA (1 << __JI_WRITE_DATA) /* Wait for outstanding data writes for this inode before commit */ #define JI_WAIT_DATA (1 << __JI_WAIT_DATA) /** * struct jbd2_inode - The jbd_inode type is the structure linking inodes in * ordered mode present in a transaction so that we can sync them during commit. */ struct jbd2_inode { /** * @i_transaction: * * Which transaction does this inode belong to? Either the running * transaction or the committing one. [j_list_lock] */ transaction_t *i_transaction; /** * @i_next_transaction: * * Pointer to the running transaction modifying inode's data in case * there is already a committing transaction touching it. [j_list_lock] */ transaction_t *i_next_transaction; /** * @i_list: List of inodes in the i_transaction [j_list_lock] */ struct list_head i_list; /** * @i_vfs_inode: * * VFS inode this inode belongs to [constant for lifetime of structure] */ struct inode *i_vfs_inode; /** * @i_flags: Flags of inode [j_list_lock] */ unsigned long i_flags; /** * @i_dirty_start: * * Offset in bytes where the dirty range for this inode starts. * [j_list_lock] */ loff_t i_dirty_start; /** * @i_dirty_end: * * Inclusive offset in bytes where the dirty range for this inode * ends. [j_list_lock] */ loff_t i_dirty_end; }; struct jbd2_revoke_table_s; /** * struct jbd2_journal_handle - The jbd2_journal_handle type is the concrete * type associated with handle_t. * @h_transaction: Which compound transaction is this update a part of? * @h_journal: Which journal handle belongs to - used iff h_reserved set. * @h_rsv_handle: Handle reserved for finishing the logical operation. * @h_total_credits: Number of remaining buffers we are allowed to add to * journal. These are dirty buffers and revoke descriptor blocks. * @h_revoke_credits: Number of remaining revoke records available for handle * @h_ref: Reference count on this handle. * @h_err: Field for caller's use to track errors through large fs operations. * @h_sync: Flag for sync-on-close. * @h_jdata: Flag to force data journaling. * @h_reserved: Flag for handle for reserved credits. * @h_aborted: Flag indicating fatal error on handle. * @h_type: For handle statistics. * @h_line_no: For handle statistics. * @h_start_jiffies: Handle Start time. * @h_requested_credits: Holds @h_total_credits after handle is started. * @h_revoke_credits_requested: Holds @h_revoke_credits after handle is started. * @saved_alloc_context: Saved context while transaction is open. **/ /* Docbook can't yet cope with the bit fields, but will leave the documentation * in so it can be fixed later. */ struct jbd2_journal_handle { union { transaction_t *h_transaction; /* Which journal handle belongs to - used iff h_reserved set */ journal_t *h_journal; }; handle_t *h_rsv_handle; int h_total_credits; int h_revoke_credits; int h_revoke_credits_requested; int h_ref; int h_err; /* Flags [no locking] */ unsigned int h_sync: 1; unsigned int h_jdata: 1; unsigned int h_reserved: 1; unsigned int h_aborted: 1; unsigned int h_type: 8; unsigned int h_line_no: 16; unsigned long h_start_jiffies; unsigned int h_requested_credits; unsigned int saved_alloc_context; }; /* * Some stats for checkpoint phase */ struct transaction_chp_stats_s { unsigned long cs_chp_time; __u32 cs_forced_to_close; __u32 cs_written; __u32 cs_dropped; }; /* The transaction_t type is the guts of the journaling mechanism. It * tracks a compound transaction through its various states: * * RUNNING: accepting new updates * LOCKED: Updates still running but we don't accept new ones * RUNDOWN: Updates are tidying up but have finished requesting * new buffers to modify (state not used for now) * FLUSH: All updates complete, but we are still writing to disk * COMMIT: All data on disk, writing commit record * FINISHED: We still have to keep the transaction for checkpointing. * * The transaction keeps track of all of the buffers modified by a * running transaction, and all of the buffers committed but not yet * flushed to home for finished transactions. * (Locking Documentation improved by LockDoc) */ /* * Lock ranking: * * j_list_lock * ->jbd_lock_bh_journal_head() (This is "innermost") * * j_state_lock * ->b_state_lock * * b_state_lock * ->j_list_lock * * j_state_lock * ->j_list_lock (journal_unmap_buffer) * */ struct transaction_s { /* Pointer to the journal for this transaction. [no locking] */ journal_t *t_journal; /* Sequence number for this transaction [no locking] */ tid_t t_tid; /* * Transaction's current state * [no locking - only kjournald2 alters this] * [j_list_lock] guards transition of a transaction into T_FINISHED * state and subsequent call of __jbd2_journal_drop_transaction() * FIXME: needs barriers * KLUDGE: [use j_state_lock] */ enum { T_RUNNING, T_LOCKED, T_SWITCH, T_FLUSH, T_COMMIT, T_COMMIT_DFLUSH, T_COMMIT_JFLUSH, T_COMMIT_CALLBACK, T_FINISHED } t_state; /* * Where in the log does this transaction's commit start? [no locking] */ unsigned long t_log_start; /* * Number of buffers on the t_buffers list [j_list_lock, no locks * needed for jbd2 thread] */ int t_nr_buffers; /* * Doubly-linked circular list of all buffers reserved but not yet * modified by this transaction [j_list_lock, no locks needed fo * jbd2 thread] */ struct journal_head *t_reserved_list; /* * Doubly-linked circular list of all metadata buffers owned by this * transaction [j_list_lock, no locks needed for jbd2 thread] */ struct journal_head *t_buffers; /* * Doubly-linked circular list of all forget buffers (superseded * buffers which we can un-checkpoint once this transaction commits) * [j_list_lock] */ struct journal_head *t_forget; /* * Doubly-linked circular list of all buffers still to be flushed before * this transaction can be checkpointed. [j_list_lock] */ struct journal_head *t_checkpoint_list; /* * Doubly-linked circular list of metadata buffers being * shadowed by log IO. The IO buffers on the iobuf list and * the shadow buffers on this list match each other one for * one at all times. [j_list_lock, no locks needed for jbd2 * thread] */ struct journal_head *t_shadow_list; /* * List of inodes associated with the transaction; e.g., ext4 uses * this to track inodes in data=ordered and data=journal mode that * need special handling on transaction commit; also used by ocfs2. * [j_list_lock] */ struct list_head t_inode_list; /* * Longest time some handle had to wait for running transaction */ unsigned long t_max_wait; /* * When transaction started */ unsigned long t_start; /* * When commit was requested [j_state_lock] */ unsigned long t_requested; /* * Checkpointing stats [j_list_lock] */ struct transaction_chp_stats_s t_chp_stats; /* * Number of outstanding updates running on this transaction * [none] */ atomic_t t_updates; /* * Number of blocks reserved for this transaction in the journal. * This is including all credits reserved when starting transaction * handles as well as all journal descriptor blocks needed for this * transaction. [none] */ atomic_t t_outstanding_credits; /* * Number of revoke records for this transaction added by already * stopped handles. [none] */ atomic_t t_outstanding_revokes; /* * How many handles used this transaction? [none] */ atomic_t t_handle_count; /* * Forward and backward links for the circular list of all transactions * awaiting checkpoint. [j_list_lock] */ transaction_t *t_cpnext, *t_cpprev; /* * When will the transaction expire (become due for commit), in jiffies? * [no locking] */ unsigned long t_expires; /* * When this transaction started, in nanoseconds [no locking] */ ktime_t t_start_time; /* * This transaction is being forced and some process is * waiting for it to finish. */ unsigned int t_synchronous_commit:1; /* Disk flush needs to be sent to fs partition [no locking] */ int t_need_data_flush; /* * For use by the filesystem to store fs-specific data * structures associated with the transaction */ struct list_head t_private_list; }; struct transaction_run_stats_s { unsigned long rs_wait; unsigned long rs_request_delay; unsigned long rs_running; unsigned long rs_locked; unsigned long rs_flushing; unsigned long rs_logging; __u32 rs_handle_count; __u32 rs_blocks; __u32 rs_blocks_logged; }; struct transaction_stats_s { unsigned long ts_tid; unsigned long ts_requested; struct transaction_run_stats_s run; }; static inline unsigned long jbd2_time_diff(unsigned long start, unsigned long end) { if (end >= start) return end - start; return end + (MAX_JIFFY_OFFSET - start); } #define JBD2_NR_BATCH 64 enum passtype {PASS_SCAN, PASS_REVOKE, PASS_REPLAY}; #define JBD2_FC_REPLAY_STOP 0 #define JBD2_FC_REPLAY_CONTINUE 1 /** * struct journal_s - The journal_s type is the concrete type associated with * journal_t. */ struct journal_s { /** * @j_flags: General journaling state flags [j_state_lock, * no lock for quick racy checks] */ unsigned long j_flags; /** * @j_errno: * * Is there an outstanding uncleared error on the journal (from a prior * abort)? [j_state_lock] */ int j_errno; /** * @j_abort_mutex: Lock the whole aborting procedure. */ struct mutex j_abort_mutex; /** * @j_sb_buffer: The first part of the superblock buffer. */ struct buffer_head *j_sb_buffer; /** * @j_superblock: The second part of the superblock buffer. */ journal_superblock_t *j_superblock; /** * @j_state_lock: Protect the various scalars in the journal. */ rwlock_t j_state_lock; /** * @j_barrier_count: * * Number of processes waiting to create a barrier lock [j_state_lock, * no lock for quick racy checks] */ int j_barrier_count; /** * @j_barrier: The barrier lock itself. */ struct mutex j_barrier; /** * @j_running_transaction: * * Transactions: The current running transaction... * [j_state_lock, no lock for quick racy checks] [caller holding * open handle] */ transaction_t *j_running_transaction; /** * @j_committing_transaction: * * the transaction we are pushing to disk * [j_state_lock] [caller holding open handle] */ transaction_t *j_committing_transaction; /** * @j_checkpoint_transactions: * * ... and a linked circular list of all transactions waiting for * checkpointing. [j_list_lock] */ transaction_t *j_checkpoint_transactions; /** * @j_wait_transaction_locked: * * Wait queue for waiting for a locked transaction to start committing, * or for a barrier lock to be released. */ wait_queue_head_t j_wait_transaction_locked; /** * @j_wait_done_commit: Wait queue for waiting for commit to complete. */ wait_queue_head_t j_wait_done_commit; /** * @j_wait_commit: Wait queue to trigger commit. */ wait_queue_head_t j_wait_commit; /** * @j_wait_updates: Wait queue to wait for updates to complete. */ wait_queue_head_t j_wait_updates; /** * @j_wait_reserved: * * Wait queue to wait for reserved buffer credits to drop. */ wait_queue_head_t j_wait_reserved; /** * @j_fc_wait: * * Wait queue to wait for completion of async fast commits. */ wait_queue_head_t j_fc_wait; /** * @j_checkpoint_mutex: * * Semaphore for locking against concurrent checkpoints. */ struct mutex j_checkpoint_mutex; /** * @j_chkpt_bhs: * * List of buffer heads used by the checkpoint routine. This * was moved from jbd2_log_do_checkpoint() to reduce stack * usage. Access to this array is controlled by the * @j_checkpoint_mutex. [j_checkpoint_mutex] */ struct buffer_head *j_chkpt_bhs[JBD2_NR_BATCH]; /** * @j_shrinker: * * Journal head shrinker, reclaim buffer's journal head which * has been written back. */ struct shrinker *j_shrinker; /** * @j_checkpoint_jh_count: * * Number of journal buffers on the checkpoint list. [j_list_lock] */ struct percpu_counter j_checkpoint_jh_count; /** * @j_shrink_transaction: * * Record next transaction will shrink on the checkpoint list. * [j_list_lock] */ transaction_t *j_shrink_transaction; /** * @j_head: * * Journal head: identifies the first unused block in the journal. * [j_state_lock] */ unsigned long j_head; /** * @j_tail: * * Journal tail: identifies the oldest still-used block in the journal. * [j_state_lock] */ unsigned long j_tail; /** * @j_free: * * Journal free: how many free blocks are there in the journal? * [j_state_lock] */ unsigned long j_free; /** * @j_first: * * The block number of the first usable block in the journal * [j_state_lock]. */ unsigned long j_first; /** * @j_last: * * The block number one beyond the last usable block in the journal * [j_state_lock]. */ unsigned long j_last; /** * @j_fc_first: * * The block number of the first fast commit block in the journal * [j_state_lock]. */ unsigned long j_fc_first; /** * @j_fc_off: * * Number of fast commit blocks currently allocated. Accessed only * during fast commit. Currently only process can do fast commit, so * this field is not protected by any lock. */ unsigned long j_fc_off; /** * @j_fc_last: * * The block number one beyond the last fast commit block in the journal * [j_state_lock]. */ unsigned long j_fc_last; /** * @j_dev: Device where we store the journal. */ struct block_device *j_dev; /** * @j_blocksize: Block size for the location where we store the journal. */ int j_blocksize; /** * @j_blk_offset: * * Starting block offset into the device where we store the journal. */ unsigned long long j_blk_offset; /** * @j_devname: Journal device name. */ char j_devname[BDEVNAME_SIZE+24]; /** * @j_fs_dev: * * Device which holds the client fs. For internal journal this will be * equal to j_dev. */ struct block_device *j_fs_dev; /** * @j_fs_dev_wb_err: * * Records the errseq of the client fs's backing block device. */ errseq_t j_fs_dev_wb_err; /** * @j_total_len: Total maximum capacity of the journal region on disk. */ unsigned int j_total_len; /** * @j_reserved_credits: * * Number of buffers reserved from the running transaction. */ atomic_t j_reserved_credits; /** * @j_list_lock: Protects the buffer lists and internal buffer state. */ spinlock_t j_list_lock; /** * @j_inode: * * Optional inode where we store the journal. If present, all * journal block numbers are mapped into this inode via bmap(). */ struct inode *j_inode; /** * @j_tail_sequence: * * Sequence number of the oldest transaction in the log [j_state_lock] */ tid_t j_tail_sequence; /** * @j_transaction_sequence: * * Sequence number of the next transaction to grant [j_state_lock] */ tid_t j_transaction_sequence; /** * @j_commit_sequence: * * Sequence number of the most recently committed transaction * [j_state_lock, no lock for quick racy checks] */ tid_t j_commit_sequence; /** * @j_commit_request: * * Sequence number of the most recent transaction wanting commit * [j_state_lock, no lock for quick racy checks] */ tid_t j_commit_request; /** * @j_uuid: * * Journal uuid: identifies the object (filesystem, LVM volume etc) * backed by this journal. This will eventually be replaced by an array * of uuids, allowing us to index multiple devices within a single * journal and to perform atomic updates across them. */ __u8 j_uuid[16]; /** * @j_task: Pointer to the current commit thread for this journal. */ struct task_struct *j_task; /** * @j_max_transaction_buffers: * * Maximum number of metadata buffers to allow in a single compound * commit transaction. */ int j_max_transaction_buffers; /** * @j_revoke_records_per_block: * * Number of revoke records that fit in one descriptor block. */ int j_revoke_records_per_block; /** * @j_commit_interval: * * What is the maximum transaction lifetime before we begin a commit? */ unsigned long j_commit_interval; /** * @j_commit_timer: The timer used to wakeup the commit thread. */ struct timer_list j_commit_timer; /** * @j_revoke_lock: Protect the revoke table. */ spinlock_t j_revoke_lock; /** * @j_revoke: * * The revoke table - maintains the list of revoked blocks in the * current transaction. */ struct jbd2_revoke_table_s *j_revoke; /** * @j_revoke_table: Alternate revoke tables for j_revoke. */ struct jbd2_revoke_table_s *j_revoke_table[2]; /** * @j_wbuf: Array of bhs for jbd2_journal_commit_transaction. */ struct buffer_head **j_wbuf; /** * @j_fc_wbuf: Array of fast commit bhs for fast commit. Accessed only * during a fast commit. Currently only process can do fast commit, so * this field is not protected by any lock. */ struct buffer_head **j_fc_wbuf; /** * @j_wbufsize: * * Size of @j_wbuf array. */ int j_wbufsize; /** * @j_fc_wbufsize: * * Size of @j_fc_wbuf array. */ int j_fc_wbufsize; /** * @j_last_sync_writer: * * The pid of the last person to run a synchronous operation * through the journal. */ pid_t j_last_sync_writer; /** * @j_average_commit_time: * * The average amount of time in nanoseconds it takes to commit a * transaction to disk. [j_state_lock] */ u64 j_average_commit_time; /** * @j_min_batch_time: * * Minimum time that we should wait for additional filesystem operations * to get batched into a synchronous handle in microseconds. */ u32 j_min_batch_time; /** * @j_max_batch_time: * * Maximum time that we should wait for additional filesystem operations * to get batched into a synchronous handle in microseconds. */ u32 j_max_batch_time; /** * @j_commit_callback: * * This function is called when a transaction is closed. */ void (*j_commit_callback)(journal_t *, transaction_t *); /** * @j_submit_inode_data_buffers: * * This function is called for all inodes associated with the * committing transaction marked with JI_WRITE_DATA flag * before we start to write out the transaction to the journal. */ int (*j_submit_inode_data_buffers) (struct jbd2_inode *); /** * @j_finish_inode_data_buffers: * * This function is called for all inodes associated with the * committing transaction marked with JI_WAIT_DATA flag * after we have written the transaction to the journal * but before we write out the commit block. */ int (*j_finish_inode_data_buffers) (struct jbd2_inode *); /* * Journal statistics */ /** * @j_history_lock: Protect the transactions statistics history. */ spinlock_t j_history_lock; /** * @j_proc_entry: procfs entry for the jbd statistics directory. */ struct proc_dir_entry *j_proc_entry; /** * @j_stats: Overall statistics. */ struct transaction_stats_s j_stats; /** * @j_failed_commit: Failed journal commit ID. */ unsigned int j_failed_commit; /** * @j_private: * * An opaque pointer to fs-private information. ext3 puts its * superblock pointer here. */ void *j_private; /** * @j_chksum_driver: * * Reference to checksum algorithm driver via cryptoapi. */ struct crypto_shash *j_chksum_driver; /** * @j_csum_seed: * * Precomputed journal UUID checksum for seeding other checksums. */ __u32 j_csum_seed; #ifdef CONFIG_DEBUG_LOCK_ALLOC /** * @j_trans_commit_map: * * Lockdep entity to track transaction commit dependencies. Handles * hold this "lock" for read, when we wait for commit, we acquire the * "lock" for writing. This matches the properties of jbd2 journalling * where the running transaction has to wait for all handles to be * dropped to commit that transaction and also acquiring a handle may * require transaction commit to finish. */ struct lockdep_map j_trans_commit_map; #endif /** * @j_fc_cleanup_callback: * * Clean-up after fast commit or full commit. JBD2 calls this function * after every commit operation. */ void (*j_fc_cleanup_callback)(struct journal_s *journal, int full, tid_t tid); /** * @j_fc_replay_callback: * * File-system specific function that performs replay of a fast * commit. JBD2 calls this function for each fast commit block found in * the journal. This function should return JBD2_FC_REPLAY_CONTINUE * to indicate that the block was processed correctly and more fast * commit replay should continue. Return value of JBD2_FC_REPLAY_STOP * indicates the end of replay (no more blocks remaining). A negative * return value indicates error. */ int (*j_fc_replay_callback)(struct journal_s *journal, struct buffer_head *bh, enum passtype pass, int off, tid_t expected_commit_id); /** * @j_bmap: * * Bmap function that should be used instead of the generic * VFS bmap function. */ int (*j_bmap)(struct journal_s *journal, sector_t *block); }; #define jbd2_might_wait_for_commit(j) \ do { \ rwsem_acquire(&j->j_trans_commit_map, 0, 0, _THIS_IP_); \ rwsem_release(&j->j_trans_commit_map, _THIS_IP_); \ } while (0) /* * We can support any known requested features iff the * superblock is not in version 1. Otherwise we fail to support any * extended sb features. */ static inline bool jbd2_format_support_feature(journal_t *j) { return j->j_superblock->s_header.h_blocktype != cpu_to_be32(JBD2_SUPERBLOCK_V1); } /* journal feature predicate functions */ #define JBD2_FEATURE_COMPAT_FUNCS(name, flagname) \ static inline bool jbd2_has_feature_##name(journal_t *j) \ { \ return (jbd2_format_support_feature(j) && \ ((j)->j_superblock->s_feature_compat & \ cpu_to_be32(JBD2_FEATURE_COMPAT_##flagname)) != 0); \ } \ static inline void jbd2_set_feature_##name(journal_t *j) \ { \ (j)->j_superblock->s_feature_compat |= \ cpu_to_be32(JBD2_FEATURE_COMPAT_##flagname); \ } \ static inline void jbd2_clear_feature_##name(journal_t *j) \ { \ (j)->j_superblock->s_feature_compat &= \ ~cpu_to_be32(JBD2_FEATURE_COMPAT_##flagname); \ } #define JBD2_FEATURE_RO_COMPAT_FUNCS(name, flagname) \ static inline bool jbd2_has_feature_##name(journal_t *j) \ { \ return (jbd2_format_support_feature(j) && \ ((j)->j_superblock->s_feature_ro_compat & \ cpu_to_be32(JBD2_FEATURE_RO_COMPAT_##flagname)) != 0); \ } \ static inline void jbd2_set_feature_##name(journal_t *j) \ { \ (j)->j_superblock->s_feature_ro_compat |= \ cpu_to_be32(JBD2_FEATURE_RO_COMPAT_##flagname); \ } \ static inline void jbd2_clear_feature_##name(journal_t *j) \ { \ (j)->j_superblock->s_feature_ro_compat &= \ ~cpu_to_be32(JBD2_FEATURE_RO_COMPAT_##flagname); \ } #define JBD2_FEATURE_INCOMPAT_FUNCS(name, flagname) \ static inline bool jbd2_has_feature_##name(journal_t *j) \ { \ return (jbd2_format_support_feature(j) && \ ((j)->j_superblock->s_feature_incompat & \ cpu_to_be32(JBD2_FEATURE_INCOMPAT_##flagname)) != 0); \ } \ static inline void jbd2_set_feature_##name(journal_t *j) \ { \ (j)->j_superblock->s_feature_incompat |= \ cpu_to_be32(JBD2_FEATURE_INCOMPAT_##flagname); \ } \ static inline void jbd2_clear_feature_##name(journal_t *j) \ { \ (j)->j_superblock->s_feature_incompat &= \ ~cpu_to_be32(JBD2_FEATURE_INCOMPAT_##flagname); \ } JBD2_FEATURE_COMPAT_FUNCS(checksum, CHECKSUM) JBD2_FEATURE_INCOMPAT_FUNCS(revoke, REVOKE) JBD2_FEATURE_INCOMPAT_FUNCS(64bit, 64BIT) JBD2_FEATURE_INCOMPAT_FUNCS(async_commit, ASYNC_COMMIT) JBD2_FEATURE_INCOMPAT_FUNCS(csum2, CSUM_V2) JBD2_FEATURE_INCOMPAT_FUNCS(csum3, CSUM_V3) JBD2_FEATURE_INCOMPAT_FUNCS(fast_commit, FAST_COMMIT) /* Journal high priority write IO operation flags */ #define JBD2_JOURNAL_REQ_FLAGS (REQ_META | REQ_SYNC | REQ_IDLE) /* * Journal flag definitions */ #define JBD2_UNMOUNT 0x001 /* Journal thread is being destroyed */ #define JBD2_ABORT 0x002 /* Journaling has been aborted for errors. */ #define JBD2_ACK_ERR 0x004 /* The errno in the sb has been acked */ #define JBD2_FLUSHED 0x008 /* The journal superblock has been flushed */ #define JBD2_LOADED 0x010 /* The journal superblock has been loaded */ #define JBD2_BARRIER 0x020 /* Use IDE barriers */ #define JBD2_ABORT_ON_SYNCDATA_ERR 0x040 /* Abort the journal on file * data write error in ordered * mode */ #define JBD2_CYCLE_RECORD 0x080 /* Journal cycled record log on * clean and empty filesystem * logging area */ #define JBD2_FAST_COMMIT_ONGOING 0x100 /* Fast commit is ongoing */ #define JBD2_FULL_COMMIT_ONGOING 0x200 /* Full commit is ongoing */ #define JBD2_JOURNAL_FLUSH_DISCARD 0x0001 #define JBD2_JOURNAL_FLUSH_ZEROOUT 0x0002 #define JBD2_JOURNAL_FLUSH_VALID (JBD2_JOURNAL_FLUSH_DISCARD | \ JBD2_JOURNAL_FLUSH_ZEROOUT) /* * Function declarations for the journaling transaction and buffer * management */ /* Filing buffers */ extern void jbd2_journal_unfile_buffer(journal_t *, struct journal_head *); extern bool __jbd2_journal_refile_buffer(struct journal_head *); extern void jbd2_journal_refile_buffer(journal_t *, struct journal_head *); extern void __jbd2_journal_file_buffer(struct journal_head *, transaction_t *, int); extern void jbd2_journal_file_buffer(struct journal_head *, transaction_t *, int); static inline void jbd2_file_log_bh(struct list_head *head, struct buffer_head *bh) { list_add_tail(&bh->b_assoc_buffers, head); } static inline void jbd2_unfile_log_bh(struct buffer_head *bh) { list_del_init(&bh->b_assoc_buffers); } /* Log buffer allocation */ struct buffer_head *jbd2_journal_get_descriptor_buffer(transaction_t *, int); void jbd2_descriptor_block_csum_set(journal_t *, struct buffer_head *); int jbd2_journal_next_log_block(journal_t *, unsigned long long *); int jbd2_journal_get_log_tail(journal_t *journal, tid_t *tid, unsigned long *block); int __jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block); void jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block); /* Commit management */ extern void jbd2_journal_commit_transaction(journal_t *); /* Checkpoint list management */ void __jbd2_journal_clean_checkpoint_list(journal_t *journal, bool destroy); unsigned long jbd2_journal_shrink_checkpoint_list(journal_t *journal, unsigned long *nr_to_scan); int __jbd2_journal_remove_checkpoint(struct journal_head *); int jbd2_journal_try_remove_checkpoint(struct journal_head *jh); void jbd2_journal_destroy_checkpoint(journal_t *journal); void __jbd2_journal_insert_checkpoint(struct journal_head *, transaction_t *); /* * Triggers */ struct jbd2_buffer_trigger_type { /* * Fired a the moment data to write to the journal are known to be * stable - so either at the moment b_frozen_data is created or just * before a buffer is written to the journal. mapped_data is a mapped * buffer that is the frozen data for commit. */ void (*t_frozen)(struct jbd2_buffer_trigger_type *type, struct buffer_head *bh, void *mapped_data, size_t size); /* * Fired during journal abort for dirty buffers that will not be * committed. */ void (*t_abort)(struct jbd2_buffer_trigger_type *type, struct buffer_head *bh); }; extern void jbd2_buffer_frozen_trigger(struct journal_head *jh, void *mapped_data, struct jbd2_buffer_trigger_type *triggers); extern void jbd2_buffer_abort_trigger(struct journal_head *jh, struct jbd2_buffer_trigger_type *triggers); /* Buffer IO */ extern int jbd2_journal_write_metadata_buffer(transaction_t *transaction, struct journal_head *jh_in, struct buffer_head **bh_out, sector_t blocknr); /* Transaction cache support */ extern void jbd2_journal_destroy_transaction_cache(void); extern int __init jbd2_journal_init_transaction_cache(void); extern void jbd2_journal_free_transaction(transaction_t *); /* * Journal locking. * * We need to lock the journal during transaction state changes so that nobody * ever tries to take a handle on the running transaction while we are in the * middle of moving it to the commit phase. j_state_lock does this. * * Note that the locking is completely interrupt unsafe. We never touch * journal structures from interrupts. */ static inline handle_t *journal_current_handle(void) { return current->journal_info; } /* The journaling code user interface: * * Create and destroy handles * Register buffer modifications against the current transaction. */ extern handle_t *jbd2_journal_start(journal_t *, int nblocks); extern handle_t *jbd2__journal_start(journal_t *, int blocks, int rsv_blocks, int revoke_records, gfp_t gfp_mask, unsigned int type, unsigned int line_no); extern int jbd2_journal_restart(handle_t *, int nblocks); extern int jbd2__journal_restart(handle_t *, int nblocks, int revoke_records, gfp_t gfp_mask); extern int jbd2_journal_start_reserved(handle_t *handle, unsigned int type, unsigned int line_no); extern void jbd2_journal_free_reserved(handle_t *handle); extern int jbd2_journal_extend(handle_t *handle, int nblocks, int revoke_records); extern int jbd2_journal_get_write_access(handle_t *, struct buffer_head *); extern int jbd2_journal_get_create_access (handle_t *, struct buffer_head *); extern int jbd2_journal_get_undo_access(handle_t *, struct buffer_head *); void jbd2_journal_set_triggers(struct buffer_head *, struct jbd2_buffer_trigger_type *type); extern int jbd2_journal_dirty_metadata (handle_t *, struct buffer_head *); extern int jbd2_journal_forget (handle_t *, struct buffer_head *); int jbd2_journal_invalidate_folio(journal_t *, struct folio *, size_t offset, size_t length); bool jbd2_journal_try_to_free_buffers(journal_t *journal, struct folio *folio); extern int jbd2_journal_stop(handle_t *); extern int jbd2_journal_flush(journal_t *journal, unsigned int flags); extern void jbd2_journal_lock_updates (journal_t *); extern void jbd2_journal_unlock_updates (journal_t *); void jbd2_journal_wait_updates(journal_t *); extern journal_t * jbd2_journal_init_dev(struct block_device *bdev, struct block_device *fs_dev, unsigned long long start, int len, int bsize); extern journal_t * jbd2_journal_init_inode (struct inode *); extern int jbd2_journal_update_format (journal_t *); extern int jbd2_journal_check_used_features (journal_t *, unsigned long, unsigned long, unsigned long); extern int jbd2_journal_check_available_features (journal_t *, unsigned long, unsigned long, unsigned long); extern int jbd2_journal_set_features (journal_t *, unsigned long, unsigned long, unsigned long); extern void jbd2_journal_clear_features (journal_t *, unsigned long, unsigned long, unsigned long); extern int jbd2_journal_load (journal_t *journal); extern int jbd2_journal_destroy (journal_t *); extern int jbd2_journal_recover (journal_t *journal); extern int jbd2_journal_wipe (journal_t *, int); extern int jbd2_journal_skip_recovery (journal_t *); extern void jbd2_journal_update_sb_errno(journal_t *); extern int jbd2_journal_update_sb_log_tail (journal_t *, tid_t, unsigned long, blk_opf_t); extern void jbd2_journal_abort (journal_t *, int); extern int jbd2_journal_errno (journal_t *); extern void jbd2_journal_ack_err (journal_t *); extern int jbd2_journal_clear_err (journal_t *); extern int jbd2_journal_bmap(journal_t *, unsigned long, unsigned long long *); extern int jbd2_journal_force_commit(journal_t *); extern int jbd2_journal_force_commit_nested(journal_t *); extern int jbd2_journal_inode_ranged_write(handle_t *handle, struct jbd2_inode *inode, loff_t start_byte, loff_t length); extern int jbd2_journal_inode_ranged_wait(handle_t *handle, struct jbd2_inode *inode, loff_t start_byte, loff_t length); extern int jbd2_journal_finish_inode_data_buffers( struct jbd2_inode *jinode); extern int jbd2_journal_begin_ordered_truncate(journal_t *journal, struct jbd2_inode *inode, loff_t new_size); extern void jbd2_journal_init_jbd_inode(struct jbd2_inode *jinode, struct inode *inode); extern void jbd2_journal_release_jbd_inode(journal_t *journal, struct jbd2_inode *jinode); /* * journal_head management */ struct journal_head *jbd2_journal_add_journal_head(struct buffer_head *bh); struct journal_head *jbd2_journal_grab_journal_head(struct buffer_head *bh); void jbd2_journal_put_journal_head(struct journal_head *jh); /* * handle management */ extern struct kmem_cache *jbd2_handle_cache; static inline handle_t *jbd2_alloc_handle(gfp_t gfp_flags) { return kmem_cache_zalloc(jbd2_handle_cache, gfp_flags); } static inline void jbd2_free_handle(handle_t *handle) { kmem_cache_free(jbd2_handle_cache, handle); } /* * jbd2_inode management (optional, for those file systems that want to use * dynamically allocated jbd2_inode structures) */ extern struct kmem_cache *jbd2_inode_cache; static inline struct jbd2_inode *jbd2_alloc_inode(gfp_t gfp_flags) { return kmem_cache_alloc(jbd2_inode_cache, gfp_flags); } static inline void jbd2_free_inode(struct jbd2_inode *jinode) { kmem_cache_free(jbd2_inode_cache, jinode); } /* Primary revoke support */ #define JOURNAL_REVOKE_DEFAULT_HASH 256 extern int jbd2_journal_init_revoke(journal_t *, int); extern void jbd2_journal_destroy_revoke_record_cache(void); extern void jbd2_journal_destroy_revoke_table_cache(void); extern int __init jbd2_journal_init_revoke_record_cache(void); extern int __init jbd2_journal_init_revoke_table_cache(void); extern void jbd2_journal_destroy_revoke(journal_t *); extern int jbd2_journal_revoke (handle_t *, unsigned long long, struct buffer_head *); extern int jbd2_journal_cancel_revoke(handle_t *, struct journal_head *); extern void jbd2_journal_write_revoke_records(transaction_t *transaction, struct list_head *log_bufs); /* Recovery revoke support */ extern int jbd2_journal_set_revoke(journal_t *, unsigned long long, tid_t); extern int jbd2_journal_test_revoke(journal_t *, unsigned long long, tid_t); extern void jbd2_journal_clear_revoke(journal_t *); extern void jbd2_journal_switch_revoke_table(journal_t *journal); extern void jbd2_clear_buffer_revoked_flags(journal_t *journal); /* * The log thread user interface: * * Request space in the current transaction, and force transaction commit * transitions on demand. */ int jbd2_log_start_commit(journal_t *journal, tid_t tid); int jbd2_journal_start_commit(journal_t *journal, tid_t *tid); int jbd2_log_wait_commit(journal_t *journal, tid_t tid); int jbd2_transaction_committed(journal_t *journal, tid_t tid); int jbd2_complete_transaction(journal_t *journal, tid_t tid); int jbd2_log_do_checkpoint(journal_t *journal); int jbd2_trans_will_send_data_barrier(journal_t *journal, tid_t tid); void __jbd2_log_wait_for_space(journal_t *journal); extern void __jbd2_journal_drop_transaction(journal_t *, transaction_t *); extern int jbd2_cleanup_journal_tail(journal_t *); /* Fast commit related APIs */ int jbd2_fc_begin_commit(journal_t *journal, tid_t tid); int jbd2_fc_end_commit(journal_t *journal); int jbd2_fc_end_commit_fallback(journal_t *journal); int jbd2_fc_get_buf(journal_t *journal, struct buffer_head **bh_out); int jbd2_submit_inode_data(journal_t *journal, struct jbd2_inode *jinode); int jbd2_wait_inode_data(journal_t *journal, struct jbd2_inode *jinode); int jbd2_fc_wait_bufs(journal_t *journal, int num_blks); int jbd2_fc_release_bufs(journal_t *journal); static inline int jbd2_journal_get_max_txn_bufs(journal_t *journal) { return (journal->j_total_len - journal->j_fc_wbufsize) / 4; } /* * is_journal_abort * * Simple test wrapper function to test the JBD2_ABORT state flag. This * bit, when set, indicates that we have had a fatal error somewhere, * either inside the journaling layer or indicated to us by the client * (eg. ext3), and that we and should not commit any further * transactions. */ static inline int is_journal_aborted(journal_t *journal) { return journal->j_flags & JBD2_ABORT; } static inline int is_handle_aborted(handle_t *handle) { if (handle->h_aborted || !handle->h_transaction) return 1; return is_journal_aborted(handle->h_transaction->t_journal); } static inline void jbd2_journal_abort_handle(handle_t *handle) { handle->h_aborted = 1; } static inline void jbd2_init_fs_dev_write_error(journal_t *journal) { struct address_space *mapping = journal->j_fs_dev->bd_inode->i_mapping; /* * Save the original wb_err value of client fs's bdev mapping which * could be used to detect the client fs's metadata async write error. */ errseq_check_and_advance(&mapping->wb_err, &journal->j_fs_dev_wb_err); } static inline int jbd2_check_fs_dev_write_error(journal_t *journal) { struct address_space *mapping = journal->j_fs_dev->bd_inode->i_mapping; return errseq_check(&mapping->wb_err, READ_ONCE(journal->j_fs_dev_wb_err)); } #endif /* __KERNEL__ */ /* Comparison functions for transaction IDs: perform comparisons using * modulo arithmetic so that they work over sequence number wraps. */ static inline int tid_gt(tid_t x, tid_t y) { int difference = (x - y); return (difference > 0); } static inline int tid_geq(tid_t x, tid_t y) { int difference = (x - y); return (difference >= 0); } extern int jbd2_journal_blocks_per_page(struct inode *inode); extern size_t journal_tag_bytes(journal_t *journal); static inline bool jbd2_journal_has_csum_v2or3_feature(journal_t *j) { return jbd2_has_feature_csum2(j) || jbd2_has_feature_csum3(j); } static inline int jbd2_journal_has_csum_v2or3(journal_t *journal) { WARN_ON_ONCE(jbd2_journal_has_csum_v2or3_feature(journal) && journal->j_chksum_driver == NULL); return journal->j_chksum_driver != NULL; } static inline int jbd2_journal_get_num_fc_blks(journal_superblock_t *jsb) { int num_fc_blocks = be32_to_cpu(jsb->s_num_fc_blks); return num_fc_blocks ? num_fc_blocks : JBD2_DEFAULT_FAST_COMMIT_BLOCKS; } /* * Return number of free blocks in the log. Must be called under j_state_lock. */ static inline unsigned long jbd2_log_space_left(journal_t *journal) { /* Allow for rounding errors */ long free = journal->j_free - 32; if (journal->j_committing_transaction) { free -= atomic_read(&journal-> j_committing_transaction->t_outstanding_credits); } return max_t(long, free, 0); } /* * Definitions which augment the buffer_head layer */ /* journaling buffer types */ #define BJ_None 0 /* Not journaled */ #define BJ_Metadata 1 /* Normal journaled metadata */ #define BJ_Forget 2 /* Buffer superseded by this transaction */ #define BJ_Shadow 3 /* Buffer contents being shadowed to the log */ #define BJ_Reserved 4 /* Buffer is reserved for access by journal */ #define BJ_Types 5 /* JBD uses a CRC32 checksum */ #define JBD_MAX_CHECKSUM_SIZE 4 static inline u32 jbd2_chksum(journal_t *journal, u32 crc, const void *address, unsigned int length) { struct { struct shash_desc shash; char ctx[JBD_MAX_CHECKSUM_SIZE]; } desc; int err; BUG_ON(crypto_shash_descsize(journal->j_chksum_driver) > JBD_MAX_CHECKSUM_SIZE); desc.shash.tfm = journal->j_chksum_driver; *(u32 *)desc.ctx = crc; err = crypto_shash_update(&desc.shash, address, length); BUG_ON(err); return *(u32 *)desc.ctx; } /* Return most recent uncommitted transaction */ static inline tid_t jbd2_get_latest_transaction(journal_t *journal) { tid_t tid; read_lock(&journal->j_state_lock); tid = journal->j_commit_request; if (journal->j_running_transaction) tid = journal->j_running_transaction->t_tid; read_unlock(&journal->j_state_lock); return tid; } static inline int jbd2_handle_buffer_credits(handle_t *handle) { journal_t *journal; if (!handle->h_reserved) journal = handle->h_transaction->t_journal; else journal = handle->h_journal; return handle->h_total_credits - DIV_ROUND_UP(handle->h_revoke_credits_requested, journal->j_revoke_records_per_block); } #ifdef __KERNEL__ #define buffer_trace_init(bh) do {} while (0) #define print_buffer_fields(bh) do {} while (0) #define print_buffer_trace(bh) do {} while (0) #define BUFFER_TRACE(bh, info) do {} while (0) #define BUFFER_TRACE2(bh, bh2, info) do {} while (0) #define JBUFFER_TRACE(jh, info) do {} while (0) #endif /* __KERNEL__ */ #define EFSBADCRC EBADMSG /* Bad CRC detected */ #define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ #endif /* _LINUX_JBD2_H */ |
| 158 158 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 | /* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */ #ifndef _LINUX_RSEQ_H #define _LINUX_RSEQ_H #ifdef CONFIG_RSEQ #include <linux/preempt.h> #include <linux/sched.h> /* * Map the event mask on the user-space ABI enum rseq_cs_flags * for direct mask checks. */ enum rseq_event_mask_bits { RSEQ_EVENT_PREEMPT_BIT = RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT_BIT, RSEQ_EVENT_SIGNAL_BIT = RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL_BIT, RSEQ_EVENT_MIGRATE_BIT = RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT, }; enum rseq_event_mask { RSEQ_EVENT_PREEMPT = (1U << RSEQ_EVENT_PREEMPT_BIT), RSEQ_EVENT_SIGNAL = (1U << RSEQ_EVENT_SIGNAL_BIT), RSEQ_EVENT_MIGRATE = (1U << RSEQ_EVENT_MIGRATE_BIT), }; static inline void rseq_set_notify_resume(struct task_struct *t) { if (t->rseq) set_tsk_thread_flag(t, TIF_NOTIFY_RESUME); } void __rseq_handle_notify_resume(struct ksignal *sig, struct pt_regs *regs); static inline void rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs) { if (current->rseq) __rseq_handle_notify_resume(ksig, regs); } static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs) { preempt_disable(); __set_bit(RSEQ_EVENT_SIGNAL_BIT, ¤t->rseq_event_mask); preempt_enable(); rseq_handle_notify_resume(ksig, regs); } /* rseq_preempt() requires preemption to be disabled. */ static inline void rseq_preempt(struct task_struct *t) { __set_bit(RSEQ_EVENT_PREEMPT_BIT, &t->rseq_event_mask); rseq_set_notify_resume(t); } /* rseq_migrate() requires preemption to be disabled. */ static inline void rseq_migrate(struct task_struct *t) { __set_bit(RSEQ_EVENT_MIGRATE_BIT, &t->rseq_event_mask); rseq_set_notify_resume(t); } /* * If parent process has a registered restartable sequences area, the * child inherits. Unregister rseq for a clone with CLONE_VM set. */ static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags) { if (clone_flags & CLONE_VM) { t->rseq = NULL; t->rseq_len = 0; t->rseq_sig = 0; t->rseq_event_mask = 0; } else { t->rseq = current->rseq; t->rseq_len = current->rseq_len; t->rseq_sig = current->rseq_sig; t->rseq_event_mask = current->rseq_event_mask; } } static inline void rseq_execve(struct task_struct *t) { t->rseq = NULL; t->rseq_len = 0; t->rseq_sig = 0; t->rseq_event_mask = 0; } #else static inline void rseq_set_notify_resume(struct task_struct *t) { } static inline void rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs) { } static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs) { } static inline void rseq_preempt(struct task_struct *t) { } static inline void rseq_migrate(struct task_struct *t) { } static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags) { } static inline void rseq_execve(struct task_struct *t) { } #endif #ifdef CONFIG_DEBUG_RSEQ void rseq_syscall(struct pt_regs *regs); #else static inline void rseq_syscall(struct pt_regs *regs) { } #endif #endif /* _LINUX_RSEQ_H */ |
| 51 51 39 39 51 51 51 51 51 34 51 51 51 51 51 51 51 51 51 51 34 51 51 51 51 51 51 51 51 51 51 51 51 51 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 | // SPDX-License-Identifier: GPL-2.0-or-later /* * net/core/gen_stats.c * * Authors: Thomas Graf <tgraf@suug.ch> * Jamal Hadi Salim * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> * * See Documentation/networking/gen_stats.rst */ #include <linux/types.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/interrupt.h> #include <linux/socket.h> #include <linux/rtnetlink.h> #include <linux/gen_stats.h> #include <net/netlink.h> #include <net/gen_stats.h> #include <net/sch_generic.h> static inline int gnet_stats_copy(struct gnet_dump *d, int type, void *buf, int size, int padattr) { if (nla_put_64bit(d->skb, type, size, buf, padattr)) goto nla_put_failure; return 0; nla_put_failure: if (d->lock) spin_unlock_bh(d->lock); kfree(d->xstats); d->xstats = NULL; d->xstats_len = 0; return -1; } /** * gnet_stats_start_copy_compat - start dumping procedure in compatibility mode * @skb: socket buffer to put statistics TLVs into * @type: TLV type for top level statistic TLV * @tc_stats_type: TLV type for backward compatibility struct tc_stats TLV * @xstats_type: TLV type for backward compatibility xstats TLV * @lock: statistics lock * @d: dumping handle * @padattr: padding attribute * * Initializes the dumping handle, grabs the statistic lock and appends * an empty TLV header to the socket buffer for use a container for all * other statistic TLVS. * * The dumping handle is marked to be in backward compatibility mode telling * all gnet_stats_copy_XXX() functions to fill a local copy of struct tc_stats. * * Returns 0 on success or -1 if the room in the socket buffer was not sufficient. */ int gnet_stats_start_copy_compat(struct sk_buff *skb, int type, int tc_stats_type, int xstats_type, spinlock_t *lock, struct gnet_dump *d, int padattr) __acquires(lock) { memset(d, 0, sizeof(*d)); if (type) d->tail = (struct nlattr *)skb_tail_pointer(skb); d->skb = skb; d->compat_tc_stats = tc_stats_type; d->compat_xstats = xstats_type; d->padattr = padattr; if (lock) { d->lock = lock; spin_lock_bh(lock); } if (d->tail) { int ret = gnet_stats_copy(d, type, NULL, 0, padattr); /* The initial attribute added in gnet_stats_copy() may be * preceded by a padding attribute, in which case d->tail will * end up pointing at the padding instead of the real attribute. * Fix this so gnet_stats_finish_copy() adjusts the length of * the right attribute. */ if (ret == 0 && d->tail->nla_type == padattr) d->tail = (struct nlattr *)((char *)d->tail + NLA_ALIGN(d->tail->nla_len)); return ret; } return 0; } EXPORT_SYMBOL(gnet_stats_start_copy_compat); /** * gnet_stats_start_copy - start dumping procedure in compatibility mode * @skb: socket buffer to put statistics TLVs into * @type: TLV type for top level statistic TLV * @lock: statistics lock * @d: dumping handle * @padattr: padding attribute * * Initializes the dumping handle, grabs the statistic lock and appends * an empty TLV header to the socket buffer for use a container for all * other statistic TLVS. * * Returns 0 on success or -1 if the room in the socket buffer was not sufficient. */ int gnet_stats_start_copy(struct sk_buff *skb, int type, spinlock_t *lock, struct gnet_dump *d, int padattr) { return gnet_stats_start_copy_compat(skb, type, 0, 0, lock, d, padattr); } EXPORT_SYMBOL(gnet_stats_start_copy); /* Must not be inlined, due to u64_stats seqcount_t lockdep key */ void gnet_stats_basic_sync_init(struct gnet_stats_basic_sync *b) { u64_stats_set(&b->bytes, 0); u64_stats_set(&b->packets, 0); u64_stats_init(&b->syncp); } EXPORT_SYMBOL(gnet_stats_basic_sync_init); static void gnet_stats_add_basic_cpu(struct gnet_stats_basic_sync *bstats, struct gnet_stats_basic_sync __percpu *cpu) { u64 t_bytes = 0, t_packets = 0; int i; for_each_possible_cpu(i) { struct gnet_stats_basic_sync *bcpu = per_cpu_ptr(cpu, i); unsigned int start; u64 bytes, packets; do { start = u64_stats_fetch_begin(&bcpu->syncp); bytes = u64_stats_read(&bcpu->bytes); packets = u64_stats_read(&bcpu->packets); } while (u64_stats_fetch_retry(&bcpu->syncp, start)); t_bytes += bytes; t_packets += packets; } _bstats_update(bstats, t_bytes, t_packets); } void gnet_stats_add_basic(struct gnet_stats_basic_sync *bstats, struct gnet_stats_basic_sync __percpu *cpu, struct gnet_stats_basic_sync *b, bool running) { unsigned int start; u64 bytes = 0; u64 packets = 0; WARN_ON_ONCE((cpu || running) && in_hardirq()); if (cpu) { gnet_stats_add_basic_cpu(bstats, cpu); return; } do { if (running) start = u64_stats_fetch_begin(&b->syncp); bytes = u64_stats_read(&b->bytes); packets = u64_stats_read(&b->packets); } while (running && u64_stats_fetch_retry(&b->syncp, start)); _bstats_update(bstats, bytes, packets); } EXPORT_SYMBOL(gnet_stats_add_basic); static void gnet_stats_read_basic(u64 *ret_bytes, u64 *ret_packets, struct gnet_stats_basic_sync __percpu *cpu, struct gnet_stats_basic_sync *b, bool running) { unsigned int start; if (cpu) { u64 t_bytes = 0, t_packets = 0; int i; for_each_possible_cpu(i) { struct gnet_stats_basic_sync *bcpu = per_cpu_ptr(cpu, i); unsigned int start; u64 bytes, packets; do { start = u64_stats_fetch_begin(&bcpu->syncp); bytes = u64_stats_read(&bcpu->bytes); packets = u64_stats_read(&bcpu->packets); } while (u64_stats_fetch_retry(&bcpu->syncp, start)); t_bytes += bytes; t_packets += packets; } *ret_bytes = t_bytes; *ret_packets = t_packets; return; } do { if (running) start = u64_stats_fetch_begin(&b->syncp); *ret_bytes = u64_stats_read(&b->bytes); *ret_packets = u64_stats_read(&b->packets); } while (running && u64_stats_fetch_retry(&b->syncp, start)); } static int ___gnet_stats_copy_basic(struct gnet_dump *d, struct gnet_stats_basic_sync __percpu *cpu, struct gnet_stats_basic_sync *b, int type, bool running) { u64 bstats_bytes, bstats_packets; gnet_stats_read_basic(&bstats_bytes, &bstats_packets, cpu, b, running); if (d->compat_tc_stats && type == TCA_STATS_BASIC) { d->tc_stats.bytes = bstats_bytes; d->tc_stats.packets = bstats_packets; } if (d->tail) { struct gnet_stats_basic sb; int res; memset(&sb, 0, sizeof(sb)); sb.bytes = bstats_bytes; sb.packets = bstats_packets; res = gnet_stats_copy(d, type, &sb, sizeof(sb), TCA_STATS_PAD); if (res < 0 || sb.packets == bstats_packets) return res; /* emit 64bit stats only if needed */ return gnet_stats_copy(d, TCA_STATS_PKT64, &bstats_packets, sizeof(bstats_packets), TCA_STATS_PAD); } return 0; } /** * gnet_stats_copy_basic - copy basic statistics into statistic TLV * @d: dumping handle * @cpu: copy statistic per cpu * @b: basic statistics * @running: true if @b represents a running qdisc, thus @b's * internal values might change during basic reads. * Only used if @cpu is NULL * * Context: task; must not be run from IRQ or BH contexts * * Appends the basic statistics to the top level TLV created by * gnet_stats_start_copy(). * * Returns 0 on success or -1 with the statistic lock released * if the room in the socket buffer was not sufficient. */ int gnet_stats_copy_basic(struct gnet_dump *d, struct gnet_stats_basic_sync __percpu *cpu, struct gnet_stats_basic_sync *b, bool running) { return ___gnet_stats_copy_basic(d, cpu, b, TCA_STATS_BASIC, running); } EXPORT_SYMBOL(gnet_stats_copy_basic); /** * gnet_stats_copy_basic_hw - copy basic hw statistics into statistic TLV * @d: dumping handle * @cpu: copy statistic per cpu * @b: basic statistics * @running: true if @b represents a running qdisc, thus @b's * internal values might change during basic reads. * Only used if @cpu is NULL * * Context: task; must not be run from IRQ or BH contexts * * Appends the basic statistics to the top level TLV created by * gnet_stats_start_copy(). * * Returns 0 on success or -1 with the statistic lock released * if the room in the socket buffer was not sufficient. */ int gnet_stats_copy_basic_hw(struct gnet_dump *d, struct gnet_stats_basic_sync __percpu *cpu, struct gnet_stats_basic_sync *b, bool running) { return ___gnet_stats_copy_basic(d, cpu, b, TCA_STATS_BASIC_HW, running); } EXPORT_SYMBOL(gnet_stats_copy_basic_hw); /** * gnet_stats_copy_rate_est - copy rate estimator statistics into statistics TLV * @d: dumping handle * @rate_est: rate estimator * * Appends the rate estimator statistics to the top level TLV created by * gnet_stats_start_copy(). * * Returns 0 on success or -1 with the statistic lock released * if the room in the socket buffer was not sufficient. */ int gnet_stats_copy_rate_est(struct gnet_dump *d, struct net_rate_estimator __rcu **rate_est) { struct gnet_stats_rate_est64 sample; struct gnet_stats_rate_est est; int res; if (!gen_estimator_read(rate_est, &sample)) return 0; est.bps = min_t(u64, UINT_MAX, sample.bps); /* we have some time before reaching 2^32 packets per second */ est.pps = sample.pps; if (d->compat_tc_stats) { d->tc_stats.bps = est.bps; d->tc_stats.pps = est.pps; } if (d->tail) { res = gnet_stats_copy(d, TCA_STATS_RATE_EST, &est, sizeof(est), TCA_STATS_PAD); if (res < 0 || est.bps == sample.bps) return res; /* emit 64bit stats only if needed */ return gnet_stats_copy(d, TCA_STATS_RATE_EST64, &sample, sizeof(sample), TCA_STATS_PAD); } return 0; } EXPORT_SYMBOL(gnet_stats_copy_rate_est); static void gnet_stats_add_queue_cpu(struct gnet_stats_queue *qstats, const struct gnet_stats_queue __percpu *q) { int i; for_each_possible_cpu(i) { const struct gnet_stats_queue *qcpu = per_cpu_ptr(q, i); qstats->qlen += qcpu->qlen; qstats->backlog += qcpu->backlog; qstats->drops += qcpu->drops; qstats->requeues += qcpu->requeues; qstats->overlimits += qcpu->overlimits; } } void gnet_stats_add_queue(struct gnet_stats_queue *qstats, const struct gnet_stats_queue __percpu *cpu, const struct gnet_stats_queue *q) { if (cpu) { gnet_stats_add_queue_cpu(qstats, cpu); } else { qstats->qlen += q->qlen; qstats->backlog += q->backlog; qstats->drops += q->drops; qstats->requeues += q->requeues; qstats->overlimits += q->overlimits; } } EXPORT_SYMBOL(gnet_stats_add_queue); /** * gnet_stats_copy_queue - copy queue statistics into statistics TLV * @d: dumping handle * @cpu_q: per cpu queue statistics * @q: queue statistics * @qlen: queue length statistics * * Appends the queue statistics to the top level TLV created by * gnet_stats_start_copy(). Using per cpu queue statistics if * they are available. * * Returns 0 on success or -1 with the statistic lock released * if the room in the socket buffer was not sufficient. */ int gnet_stats_copy_queue(struct gnet_dump *d, struct gnet_stats_queue __percpu *cpu_q, struct gnet_stats_queue *q, __u32 qlen) { struct gnet_stats_queue qstats = {0}; gnet_stats_add_queue(&qstats, cpu_q, q); qstats.qlen = qlen; if (d->compat_tc_stats) { d->tc_stats.drops = qstats.drops; d->tc_stats.qlen = qstats.qlen; d->tc_stats.backlog = qstats.backlog; d->tc_stats.overlimits = qstats.overlimits; } if (d->tail) return gnet_stats_copy(d, TCA_STATS_QUEUE, &qstats, sizeof(qstats), TCA_STATS_PAD); return 0; } EXPORT_SYMBOL(gnet_stats_copy_queue); /** * gnet_stats_copy_app - copy application specific statistics into statistics TLV * @d: dumping handle * @st: application specific statistics data * @len: length of data * * Appends the application specific statistics to the top level TLV created by * gnet_stats_start_copy() and remembers the data for XSTATS if the dumping * handle is in backward compatibility mode. * * Returns 0 on success or -1 with the statistic lock released * if the room in the socket buffer was not sufficient. */ int gnet_stats_copy_app(struct gnet_dump *d, void *st, int len) { if (d->compat_xstats) { d->xstats = kmemdup(st, len, GFP_ATOMIC); if (!d->xstats) goto err_out; d->xstats_len = len; } if (d->tail) return gnet_stats_copy(d, TCA_STATS_APP, st, len, TCA_STATS_PAD); return 0; err_out: if (d->lock) spin_unlock_bh(d->lock); d->xstats_len = 0; return -1; } EXPORT_SYMBOL(gnet_stats_copy_app); /** * gnet_stats_finish_copy - finish dumping procedure * @d: dumping handle * * Corrects the length of the top level TLV to include all TLVs added * by gnet_stats_copy_XXX() calls. Adds the backward compatibility TLVs * if gnet_stats_start_copy_compat() was used and releases the statistics * lock. * * Returns 0 on success or -1 with the statistic lock released * if the room in the socket buffer was not sufficient. */ int gnet_stats_finish_copy(struct gnet_dump *d) { if (d->tail) d->tail->nla_len = skb_tail_pointer(d->skb) - (u8 *)d->tail; if (d->compat_tc_stats) if (gnet_stats_copy(d, d->compat_tc_stats, &d->tc_stats, sizeof(d->tc_stats), d->padattr) < 0) return -1; if (d->compat_xstats && d->xstats) { if (gnet_stats_copy(d, d->compat_xstats, d->xstats, d->xstats_len, d->padattr) < 0) return -1; } if (d->lock) spin_unlock_bh(d->lock); kfree(d->xstats); d->xstats = NULL; d->xstats_len = 0; return 0; } EXPORT_SYMBOL(gnet_stats_finish_copy); |
| 29 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 | // SPDX-License-Identifier: GPL-2.0-only /* * linux/kernel/profile.c * Simple profiling. Manages a direct-mapped profile hit count buffer, * with configurable resolution, support for restricting the cpus on * which profiling is done, and switching between cpu time and * schedule() calls via kernel command line parameters passed at boot. * * Scheduler profiling support, Arjan van de Ven and Ingo Molnar, * Red Hat, July 2004 * Consolidation of architecture support code for profiling, * Nadia Yvette Chambers, Oracle, July 2004 * Amortized hit count accounting via per-cpu open-addressed hashtables * to resolve timer interrupt livelocks, Nadia Yvette Chambers, * Oracle, 2004 */ #include <linux/export.h> #include <linux/profile.h> #include <linux/memblock.h> #include <linux/notifier.h> #include <linux/mm.h> #include <linux/cpumask.h> #include <linux/cpu.h> #include <linux/highmem.h> #include <linux/mutex.h> #include <linux/slab.h> #include <linux/vmalloc.h> #include <linux/sched/stat.h> #include <asm/sections.h> #include <asm/irq_regs.h> #include <asm/ptrace.h> struct profile_hit { u32 pc, hits; }; #define PROFILE_GRPSHIFT 3 #define PROFILE_GRPSZ (1 << PROFILE_GRPSHIFT) #define NR_PROFILE_HIT (PAGE_SIZE/sizeof(struct profile_hit)) #define NR_PROFILE_GRP (NR_PROFILE_HIT/PROFILE_GRPSZ) static atomic_t *prof_buffer; static unsigned long prof_len; static unsigned short int prof_shift; int prof_on __read_mostly; EXPORT_SYMBOL_GPL(prof_on); static cpumask_var_t prof_cpu_mask; #if defined(CONFIG_SMP) && defined(CONFIG_PROC_FS) static DEFINE_PER_CPU(struct profile_hit *[2], cpu_profile_hits); static DEFINE_PER_CPU(int, cpu_profile_flip); static DEFINE_MUTEX(profile_flip_mutex); #endif /* CONFIG_SMP */ int profile_setup(char *str) { static const char schedstr[] = "schedule"; static const char sleepstr[] = "sleep"; static const char kvmstr[] = "kvm"; const char *select = NULL; int par; if (!strncmp(str, sleepstr, strlen(sleepstr))) { #ifdef CONFIG_SCHEDSTATS force_schedstat_enabled(); prof_on = SLEEP_PROFILING; select = sleepstr; #else pr_warn("kernel sleep profiling requires CONFIG_SCHEDSTATS\n"); #endif /* CONFIG_SCHEDSTATS */ } else if (!strncmp(str, schedstr, strlen(schedstr))) { prof_on = SCHED_PROFILING; select = schedstr; } else if (!strncmp(str, kvmstr, strlen(kvmstr))) { prof_on = KVM_PROFILING; select = kvmstr; } else if (get_option(&str, &par)) { prof_shift = clamp(par, 0, BITS_PER_LONG - 1); prof_on = CPU_PROFILING; pr_info("kernel profiling enabled (shift: %u)\n", prof_shift); } if (select) { if (str[strlen(select)] == ',') str += strlen(select) + 1; if (get_option(&str, &par)) prof_shift = clamp(par, 0, BITS_PER_LONG - 1); pr_info("kernel %s profiling enabled (shift: %u)\n", select, prof_shift); } return 1; } __setup("profile=", profile_setup); int __ref profile_init(void) { int buffer_bytes; if (!prof_on) return 0; /* only text is profiled */ prof_len = (_etext - _stext) >> prof_shift; if (!prof_len) { pr_warn("profiling shift: %u too large\n", prof_shift); prof_on = 0; return -EINVAL; } buffer_bytes = prof_len*sizeof(atomic_t); if (!alloc_cpumask_var(&prof_cpu_mask, GFP_KERNEL)) return -ENOMEM; cpumask_copy(prof_cpu_mask, cpu_possible_mask); prof_buffer = kzalloc(buffer_bytes, GFP_KERNEL|__GFP_NOWARN); if (prof_buffer) return 0; prof_buffer = alloc_pages_exact(buffer_bytes, GFP_KERNEL|__GFP_ZERO|__GFP_NOWARN); if (prof_buffer) return 0; prof_buffer = vzalloc(buffer_bytes); if (prof_buffer) return 0; free_cpumask_var(prof_cpu_mask); return -ENOMEM; } #if defined(CONFIG_SMP) && defined(CONFIG_PROC_FS) /* * Each cpu has a pair of open-addressed hashtables for pending * profile hits. read_profile() IPI's all cpus to request them * to flip buffers and flushes their contents to prof_buffer itself. * Flip requests are serialized by the profile_flip_mutex. The sole * use of having a second hashtable is for avoiding cacheline * contention that would otherwise happen during flushes of pending * profile hits required for the accuracy of reported profile hits * and so resurrect the interrupt livelock issue. * * The open-addressed hashtables are indexed by profile buffer slot * and hold the number of pending hits to that profile buffer slot on * a cpu in an entry. When the hashtable overflows, all pending hits * are accounted to their corresponding profile buffer slots with * atomic_add() and the hashtable emptied. As numerous pending hits * may be accounted to a profile buffer slot in a hashtable entry, * this amortizes a number of atomic profile buffer increments likely * to be far larger than the number of entries in the hashtable, * particularly given that the number of distinct profile buffer * positions to which hits are accounted during short intervals (e.g. * several seconds) is usually very small. Exclusion from buffer * flipping is provided by interrupt disablement (note that for * SCHED_PROFILING or SLEEP_PROFILING profile_hit() may be called from * process context). * The hash function is meant to be lightweight as opposed to strong, * and was vaguely inspired by ppc64 firmware-supported inverted * pagetable hash functions, but uses a full hashtable full of finite * collision chains, not just pairs of them. * * -- nyc */ static void __profile_flip_buffers(void *unused) { int cpu = smp_processor_id(); per_cpu(cpu_profile_flip, cpu) = !per_cpu(cpu_profile_flip, cpu); } static void profile_flip_buffers(void) { int i, j, cpu; mutex_lock(&profile_flip_mutex); j = per_cpu(cpu_profile_flip, get_cpu()); put_cpu(); on_each_cpu(__profile_flip_buffers, NULL, 1); for_each_online_cpu(cpu) { struct profile_hit *hits = per_cpu(cpu_profile_hits, cpu)[j]; for (i = 0; i < NR_PROFILE_HIT; ++i) { if (!hits[i].hits) { if (hits[i].pc) hits[i].pc = 0; continue; } atomic_add(hits[i].hits, &prof_buffer[hits[i].pc]); hits[i].hits = hits[i].pc = 0; } } mutex_unlock(&profile_flip_mutex); } static void profile_discard_flip_buffers(void) { int i, cpu; mutex_lock(&profile_flip_mutex); i = per_cpu(cpu_profile_flip, get_cpu()); put_cpu(); on_each_cpu(__profile_flip_buffers, NULL, 1); for_each_online_cpu(cpu) { struct profile_hit *hits = per_cpu(cpu_profile_hits, cpu)[i]; memset(hits, 0, NR_PROFILE_HIT*sizeof(struct profile_hit)); } mutex_unlock(&profile_flip_mutex); } static void do_profile_hits(int type, void *__pc, unsigned int nr_hits) { unsigned long primary, secondary, flags, pc = (unsigned long)__pc; int i, j, cpu; struct profile_hit *hits; pc = min((pc - (unsigned long)_stext) >> prof_shift, prof_len - 1); i = primary = (pc & (NR_PROFILE_GRP - 1)) << PROFILE_GRPSHIFT; secondary = (~(pc << 1) & (NR_PROFILE_GRP - 1)) << PROFILE_GRPSHIFT; cpu = get_cpu(); hits = per_cpu(cpu_profile_hits, cpu)[per_cpu(cpu_profile_flip, cpu)]; if (!hits) { put_cpu(); return; } /* * We buffer the global profiler buffer into a per-CPU * queue and thus reduce the number of global (and possibly * NUMA-alien) accesses. The write-queue is self-coalescing: */ local_irq_save(flags); do { for (j = 0; j < PROFILE_GRPSZ; ++j) { if (hits[i + j].pc == pc) { hits[i + j].hits += nr_hits; goto out; } else if (!hits[i + j].hits) { hits[i + j].pc = pc; hits[i + j].hits = nr_hits; goto out; } } i = (i + secondary) & (NR_PROFILE_HIT - 1); } while (i != primary); /* * Add the current hit(s) and flush the write-queue out * to the global buffer: */ atomic_add(nr_hits, &prof_buffer[pc]); for (i = 0; i < NR_PROFILE_HIT; ++i) { atomic_add(hits[i].hits, &prof_buffer[hits[i].pc]); hits[i].pc = hits[i].hits = 0; } out: local_irq_restore(flags); put_cpu(); } static int profile_dead_cpu(unsigned int cpu) { struct page *page; int i; if (cpumask_available(prof_cpu_mask)) cpumask_clear_cpu(cpu, prof_cpu_mask); for (i = 0; i < 2; i++) { if (per_cpu(cpu_profile_hits, cpu)[i]) { page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[i]); per_cpu(cpu_profile_hits, cpu)[i] = NULL; __free_page(page); } } return 0; } static int profile_prepare_cpu(unsigned int cpu) { int i, node = cpu_to_mem(cpu); struct page *page; per_cpu(cpu_profile_flip, cpu) = 0; for (i = 0; i < 2; i++) { if (per_cpu(cpu_profile_hits, cpu)[i]) continue; page = __alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0); if (!page) { profile_dead_cpu(cpu); return -ENOMEM; } per_cpu(cpu_profile_hits, cpu)[i] = page_address(page); } return 0; } static int profile_online_cpu(unsigned int cpu) { if (cpumask_available(prof_cpu_mask)) cpumask_set_cpu(cpu, prof_cpu_mask); return 0; } #else /* !CONFIG_SMP */ #define profile_flip_buffers() do { } while (0) #define profile_discard_flip_buffers() do { } while (0) static void do_profile_hits(int type, void *__pc, unsigned int nr_hits) { unsigned long pc; pc = ((unsigned long)__pc - (unsigned long)_stext) >> prof_shift; atomic_add(nr_hits, &prof_buffer[min(pc, prof_len - 1)]); } #endif /* !CONFIG_SMP */ void profile_hits(int type, void *__pc, unsigned int nr_hits) { if (prof_on != type || !prof_buffer) return; do_profile_hits(type, __pc, nr_hits); } EXPORT_SYMBOL_GPL(profile_hits); void profile_tick(int type) { struct pt_regs *regs = get_irq_regs(); if (!user_mode(regs) && cpumask_available(prof_cpu_mask) && cpumask_test_cpu(smp_processor_id(), prof_cpu_mask)) profile_hit(type, (void *)profile_pc(regs)); } #ifdef CONFIG_PROC_FS #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/uaccess.h> static int prof_cpu_mask_proc_show(struct seq_file *m, void *v) { seq_printf(m, "%*pb\n", cpumask_pr_args(prof_cpu_mask)); return 0; } static int prof_cpu_mask_proc_open(struct inode *inode, struct file *file) { return single_open(file, prof_cpu_mask_proc_show, NULL); } static ssize_t prof_cpu_mask_proc_write(struct file *file, const char __user *buffer, size_t count, loff_t *pos) { cpumask_var_t new_value; int err; if (!zalloc_cpumask_var(&new_value, GFP_KERNEL)) return -ENOMEM; err = cpumask_parse_user(buffer, count, new_value); if (!err) { cpumask_copy(prof_cpu_mask, new_value); err = count; } free_cpumask_var(new_value); return err; } static const struct proc_ops prof_cpu_mask_proc_ops = { .proc_open = prof_cpu_mask_proc_open, .proc_read = seq_read, .proc_lseek = seq_lseek, .proc_release = single_release, .proc_write = prof_cpu_mask_proc_write, }; void create_prof_cpu_mask(void) { /* create /proc/irq/prof_cpu_mask */ proc_create("irq/prof_cpu_mask", 0600, NULL, &prof_cpu_mask_proc_ops); } /* * This function accesses profiling information. The returned data is * binary: the sampling step and the actual contents of the profile * buffer. Use of the program readprofile is recommended in order to * get meaningful info out of these data. */ static ssize_t read_profile(struct file *file, char __user *buf, size_t count, loff_t *ppos) { unsigned long p = *ppos; ssize_t read; char *pnt; unsigned long sample_step = 1UL << prof_shift; profile_flip_buffers(); if (p >= (prof_len+1)*sizeof(unsigned int)) return 0; if (count > (prof_len+1)*sizeof(unsigned int) - p) count = (prof_len+1)*sizeof(unsigned int) - p; read = 0; while (p < sizeof(unsigned int) && count > 0) { if (put_user(*((char *)(&sample_step)+p), buf)) return -EFAULT; buf++; p++; count--; read++; } pnt = (char *)prof_buffer + p - sizeof(atomic_t); if (copy_to_user(buf, (void *)pnt, count)) return -EFAULT; read += count; *ppos += read; return read; } /* default is to not implement this call */ int __weak setup_profiling_timer(unsigned mult) { return -EINVAL; } /* * Writing to /proc/profile resets the counters * * Writing a 'profiling multiplier' value into it also re-sets the profiling * interrupt frequency, on architectures that support this. */ static ssize_t write_profile(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { #ifdef CONFIG_SMP if (count == sizeof(int)) { unsigned int multiplier; if (copy_from_user(&multiplier, buf, sizeof(int))) return -EFAULT; if (setup_profiling_timer(multiplier)) return -EINVAL; } #endif profile_discard_flip_buffers(); memset(prof_buffer, 0, prof_len * sizeof(atomic_t)); return count; } static const struct proc_ops profile_proc_ops = { .proc_read = read_profile, .proc_write = write_profile, .proc_lseek = default_llseek, }; int __ref create_proc_profile(void) { struct proc_dir_entry *entry; #ifdef CONFIG_SMP enum cpuhp_state online_state; #endif int err = 0; if (!prof_on) return 0; #ifdef CONFIG_SMP err = cpuhp_setup_state(CPUHP_PROFILE_PREPARE, "PROFILE_PREPARE", profile_prepare_cpu, profile_dead_cpu); if (err) return err; err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "AP_PROFILE_ONLINE", profile_online_cpu, NULL); if (err < 0) goto err_state_prep; online_state = err; err = 0; #endif entry = proc_create("profile", S_IWUSR | S_IRUGO, NULL, &profile_proc_ops); if (!entry) goto err_state_onl; proc_set_size(entry, (1 + prof_len) * sizeof(atomic_t)); return err; err_state_onl: #ifdef CONFIG_SMP cpuhp_remove_state(online_state); err_state_prep: cpuhp_remove_state(CPUHP_PROFILE_PREPARE); #endif return err; } subsys_initcall(create_proc_profile); #endif /* CONFIG_PROC_FS */ |
| 73 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 | /* SPDX-License-Identifier: GPL-2.0 */ /* * sysfs.h - definitions for the device driver filesystem * * Copyright (c) 2001,2002 Patrick Mochel * Copyright (c) 2004 Silicon Graphics, Inc. * Copyright (c) 2007 SUSE Linux Products GmbH * Copyright (c) 2007 Tejun Heo <teheo@suse.de> * * Please see Documentation/filesystems/sysfs.rst for more information. */ #ifndef _SYSFS_H_ #define _SYSFS_H_ #include <linux/kernfs.h> #include <linux/compiler.h> #include <linux/errno.h> #include <linux/list.h> #include <linux/lockdep.h> #include <linux/kobject_ns.h> #include <linux/stat.h> #include <linux/atomic.h> struct kobject; struct module; struct bin_attribute; enum kobj_ns_type; struct attribute { const char *name; umode_t mode; #ifdef CONFIG_DEBUG_LOCK_ALLOC bool ignore_lockdep:1; struct lock_class_key *key; struct lock_class_key skey; #endif }; /** * sysfs_attr_init - initialize a dynamically allocated sysfs attribute * @attr: struct attribute to initialize * * Initialize a dynamically allocated struct attribute so we can * make lockdep happy. This is a new requirement for attributes * and initially this is only needed when lockdep is enabled. * Lockdep gives a nice error when your attribute is added to * sysfs if you don't have this. */ #ifdef CONFIG_DEBUG_LOCK_ALLOC #define sysfs_attr_init(attr) \ do { \ static struct lock_class_key __key; \ \ (attr)->key = &__key; \ } while (0) #else #define sysfs_attr_init(attr) do {} while (0) #endif /** * struct attribute_group - data structure used to declare an attribute group. * @name: Optional: Attribute group name * If specified, the attribute group will be created in * a new subdirectory with this name. * @is_visible: Optional: Function to return permissions associated with an * attribute of the group. Will be called repeatedly for each * non-binary attribute in the group. Only read/write * permissions as well as SYSFS_PREALLOC are accepted. Must * return 0 if an attribute is not visible. The returned value * will replace static permissions defined in struct attribute. * @is_bin_visible: * Optional: Function to return permissions associated with a * binary attribute of the group. Will be called repeatedly * for each binary attribute in the group. Only read/write * permissions as well as SYSFS_PREALLOC are accepted. Must * return 0 if a binary attribute is not visible. The returned * value will replace static permissions defined in * struct bin_attribute. * @attrs: Pointer to NULL terminated list of attributes. * @bin_attrs: Pointer to NULL terminated list of binary attributes. * Either attrs or bin_attrs or both must be provided. */ struct attribute_group { const char *name; umode_t (*is_visible)(struct kobject *, struct attribute *, int); umode_t (*is_bin_visible)(struct kobject *, struct bin_attribute *, int); struct attribute **attrs; struct bin_attribute **bin_attrs; }; /* * Use these macros to make defining attributes easier. * See include/linux/device.h for examples.. */ #define SYSFS_PREALLOC 010000 #define __ATTR(_name, _mode, _show, _store) { \ .attr = {.name = __stringify(_name), \ .mode = VERIFY_OCTAL_PERMISSIONS(_mode) }, \ .show = _show, \ .store = _store, \ } #define __ATTR_PREALLOC(_name, _mode, _show, _store) { \ .attr = {.name = __stringify(_name), \ .mode = SYSFS_PREALLOC | VERIFY_OCTAL_PERMISSIONS(_mode) },\ .show = _show, \ .store = _store, \ } #define __ATTR_RO(_name) { \ .attr = { .name = __stringify(_name), .mode = 0444 }, \ .show = _name##_show, \ } #define __ATTR_RO_MODE(_name, _mode) { \ .attr = { .name = __stringify(_name), \ .mode = VERIFY_OCTAL_PERMISSIONS(_mode) }, \ .show = _name##_show, \ } #define __ATTR_RW_MODE(_name, _mode) { \ .attr = { .name = __stringify(_name), \ .mode = VERIFY_OCTAL_PERMISSIONS(_mode) }, \ .show = _name##_show, \ .store = _name##_store, \ } #define __ATTR_WO(_name) { \ .attr = { .name = __stringify(_name), .mode = 0200 }, \ .store = _name##_store, \ } #define __ATTR_RW(_name) __ATTR(_name, 0644, _name##_show, _name##_store) #define __ATTR_NULL { .attr = { .name = NULL } } #ifdef CONFIG_DEBUG_LOCK_ALLOC #define __ATTR_IGNORE_LOCKDEP(_name, _mode, _show, _store) { \ .attr = {.name = __stringify(_name), .mode = _mode, \ .ignore_lockdep = true }, \ .show = _show, \ .store = _store, \ } #else #define __ATTR_IGNORE_LOCKDEP __ATTR #endif #define __ATTRIBUTE_GROUPS(_name) \ static const struct attribute_group *_name##_groups[] = { \ &_name##_group, \ NULL, \ } #define ATTRIBUTE_GROUPS(_name) \ static const struct attribute_group _name##_group = { \ .attrs = _name##_attrs, \ }; \ __ATTRIBUTE_GROUPS(_name) #define BIN_ATTRIBUTE_GROUPS(_name) \ static const struct attribute_group _name##_group = { \ .bin_attrs = _name##_attrs, \ }; \ __ATTRIBUTE_GROUPS(_name) struct file; struct vm_area_struct; struct address_space; struct bin_attribute { struct attribute attr; size_t size; void *private; struct address_space *(*f_mapping)(void); ssize_t (*read)(struct file *, struct kobject *, struct bin_attribute *, char *, loff_t, size_t); ssize_t (*write)(struct file *, struct kobject *, struct bin_attribute *, char *, loff_t, size_t); loff_t (*llseek)(struct file *, struct kobject *, struct bin_attribute *, loff_t, int); int (*mmap)(struct file *, struct kobject *, struct bin_attribute *attr, struct vm_area_struct *vma); }; /** * sysfs_bin_attr_init - initialize a dynamically allocated bin_attribute * @attr: struct bin_attribute to initialize * * Initialize a dynamically allocated struct bin_attribute so we * can make lockdep happy. This is a new requirement for * attributes and initially this is only needed when lockdep is * enabled. Lockdep gives a nice error when your attribute is * added to sysfs if you don't have this. */ #define sysfs_bin_attr_init(bin_attr) sysfs_attr_init(&(bin_attr)->attr) /* macros to create static binary attributes easier */ #define __BIN_ATTR(_name, _mode, _read, _write, _size) { \ .attr = { .name = __stringify(_name), .mode = _mode }, \ .read = _read, \ .write = _write, \ .size = _size, \ } #define __BIN_ATTR_RO(_name, _size) { \ .attr = { .name = __stringify(_name), .mode = 0444 }, \ .read = _name##_read, \ .size = _size, \ } #define __BIN_ATTR_WO(_name, _size) { \ .attr = { .name = __stringify(_name), .mode = 0200 }, \ .write = _name##_write, \ .size = _size, \ } #define __BIN_ATTR_RW(_name, _size) \ __BIN_ATTR(_name, 0644, _name##_read, _name##_write, _size) #define __BIN_ATTR_NULL __ATTR_NULL #define BIN_ATTR(_name, _mode, _read, _write, _size) \ struct bin_attribute bin_attr_##_name = __BIN_ATTR(_name, _mode, _read, \ _write, _size) #define BIN_ATTR_RO(_name, _size) \ struct bin_attribute bin_attr_##_name = __BIN_ATTR_RO(_name, _size) #define BIN_ATTR_WO(_name, _size) \ struct bin_attribute bin_attr_##_name = __BIN_ATTR_WO(_name, _size) #define BIN_ATTR_RW(_name, _size) \ struct bin_attribute bin_attr_##_name = __BIN_ATTR_RW(_name, _size) #define __BIN_ATTR_ADMIN_RO(_name, _size) { \ .attr = { .name = __stringify(_name), .mode = 0400 }, \ .read = _name##_read, \ .size = _size, \ } #define __BIN_ATTR_ADMIN_RW(_name, _size) \ __BIN_ATTR(_name, 0600, _name##_read, _name##_write, _size) #define BIN_ATTR_ADMIN_RO(_name, _size) \ struct bin_attribute bin_attr_##_name = __BIN_ATTR_ADMIN_RO(_name, _size) #define BIN_ATTR_ADMIN_RW(_name, _size) \ struct bin_attribute bin_attr_##_name = __BIN_ATTR_ADMIN_RW(_name, _size) struct sysfs_ops { ssize_t (*show)(struct kobject *, struct attribute *, char *); ssize_t (*store)(struct kobject *, struct attribute *, const char *, size_t); }; #ifdef CONFIG_SYSFS int __must_check sysfs_create_dir_ns(struct kobject *kobj, const void *ns); void sysfs_remove_dir(struct kobject *kobj); int __must_check sysfs_rename_dir_ns(struct kobject *kobj, const char *new_name, const void *new_ns); int __must_check sysfs_move_dir_ns(struct kobject *kobj, struct kobject *new_parent_kobj, const void *new_ns); int __must_check sysfs_create_mount_point(struct kobject *parent_kobj, const char *name); void sysfs_remove_mount_point(struct kobject *parent_kobj, const char *name); int __must_check sysfs_create_file_ns(struct kobject *kobj, const struct attribute *attr, const void *ns); int __must_check sysfs_create_files(struct kobject *kobj, const struct attribute * const *attr); int __must_check sysfs_chmod_file(struct kobject *kobj, const struct attribute *attr, umode_t mode); struct kernfs_node *sysfs_break_active_protection(struct kobject *kobj, const struct attribute *attr); void sysfs_unbreak_active_protection(struct kernfs_node *kn); void sysfs_remove_file_ns(struct kobject *kobj, const struct attribute *attr, const void *ns); bool sysfs_remove_file_self(struct kobject *kobj, const struct attribute *attr); void sysfs_remove_files(struct kobject *kobj, const struct attribute * const *attr); int __must_check sysfs_create_bin_file(struct kobject *kobj, const struct bin_attribute *attr); void sysfs_remove_bin_file(struct kobject *kobj, const struct bin_attribute *attr); int __must_check sysfs_create_link(struct kobject *kobj, struct kobject *target, const char *name); int __must_check sysfs_create_link_nowarn(struct kobject *kobj, struct kobject *target, const char *name); void sysfs_remove_link(struct kobject *kobj, const char *name); int sysfs_rename_link_ns(struct kobject *kobj, struct kobject *target, const char *old_name, const char *new_name, const void *new_ns); void sysfs_delete_link(struct kobject *dir, struct kobject *targ, const char *name); int __must_check sysfs_create_group(struct kobject *kobj, const struct attribute_group *grp); int __must_check sysfs_create_groups(struct kobject *kobj, const struct attribute_group **groups); int __must_check sysfs_update_groups(struct kobject *kobj, const struct attribute_group **groups); int sysfs_update_group(struct kobject *kobj, const struct attribute_group *grp); void sysfs_remove_group(struct kobject *kobj, const struct attribute_group *grp); void sysfs_remove_groups(struct kobject *kobj, const struct attribute_group **groups); int sysfs_add_file_to_group(struct kobject *kobj, const struct attribute *attr, const char *group); void sysfs_remove_file_from_group(struct kobject *kobj, const struct attribute *attr, const char *group); int sysfs_merge_group(struct kobject *kobj, const struct attribute_group *grp); void sysfs_unmerge_group(struct kobject *kobj, const struct attribute_group *grp); int sysfs_add_link_to_group(struct kobject *kobj, const char *group_name, struct kobject *target, const char *link_name); void sysfs_remove_link_from_group(struct kobject *kobj, const char *group_name, const char *link_name); int compat_only_sysfs_link_entry_to_kobj(struct kobject *kobj, struct kobject *target_kobj, const char *target_name, const char *symlink_name); void sysfs_notify(struct kobject *kobj, const char *dir, const char *attr); int __must_check sysfs_init(void); static inline void sysfs_enable_ns(struct kernfs_node *kn) { return kernfs_enable_ns(kn); } int sysfs_file_change_owner(struct kobject *kobj, const char *name, kuid_t kuid, kgid_t kgid); int sysfs_change_owner(struct kobject *kobj, kuid_t kuid, kgid_t kgid); int sysfs_link_change_owner(struct kobject *kobj, struct kobject *targ, const char *name, kuid_t kuid, kgid_t kgid); int sysfs_groups_change_owner(struct kobject *kobj, const struct attribute_group **groups, kuid_t kuid, kgid_t kgid); int sysfs_group_change_owner(struct kobject *kobj, const struct attribute_group *groups, kuid_t kuid, kgid_t kgid); __printf(2, 3) int sysfs_emit(char *buf, const char *fmt, ...); __printf(3, 4) int sysfs_emit_at(char *buf, int at, const char *fmt, ...); #else /* CONFIG_SYSFS */ static inline int sysfs_create_dir_ns(struct kobject *kobj, const void *ns) { return 0; } static inline void sysfs_remove_dir(struct kobject *kobj) { } static inline int sysfs_rename_dir_ns(struct kobject *kobj, const char *new_name, const void *new_ns) { return 0; } static inline int sysfs_move_dir_ns(struct kobject *kobj, struct kobject *new_parent_kobj, const void *new_ns) { return 0; } static inline int sysfs_create_mount_point(struct kobject *parent_kobj, const char *name) { return 0; } static inline void sysfs_remove_mount_point(struct kobject *parent_kobj, const char *name) { } static inline int sysfs_create_file_ns(struct kobject *kobj, const struct attribute *attr, const void *ns) { return 0; } static inline int sysfs_create_files(struct kobject *kobj, const struct attribute * const *attr) { return 0; } static inline int sysfs_chmod_file(struct kobject *kobj, const struct attribute *attr, umode_t mode) { return 0; } static inline struct kernfs_node * sysfs_break_active_protection(struct kobject *kobj, const struct attribute *attr) { return NULL; } static inline void sysfs_unbreak_active_protection(struct kernfs_node *kn) { } static inline void sysfs_remove_file_ns(struct kobject *kobj, const struct attribute *attr, const void *ns) { } static inline bool sysfs_remove_file_self(struct kobject *kobj, const struct attribute *attr) { return false; } static inline void sysfs_remove_files(struct kobject *kobj, const struct attribute * const *attr) { } static inline int sysfs_create_bin_file(struct kobject *kobj, const struct bin_attribute *attr) { return 0; } static inline void sysfs_remove_bin_file(struct kobject *kobj, const struct bin_attribute *attr) { } static inline int sysfs_create_link(struct kobject *kobj, struct kobject *target, const char *name) { return 0; } static inline int sysfs_create_link_nowarn(struct kobject *kobj, struct kobject *target, const char *name) { return 0; } static inline void sysfs_remove_link(struct kobject *kobj, const char *name) { } static inline int sysfs_rename_link_ns(struct kobject *k, struct kobject *t, const char *old_name, const char *new_name, const void *ns) { return 0; } static inline void sysfs_delete_link(struct kobject *k, struct kobject *t, const char *name) { } static inline int sysfs_create_group(struct kobject *kobj, const struct attribute_group *grp) { return 0; } static inline int sysfs_create_groups(struct kobject *kobj, const struct attribute_group **groups) { return 0; } static inline int sysfs_update_groups(struct kobject *kobj, const struct attribute_group **groups) { return 0; } static inline int sysfs_update_group(struct kobject *kobj, const struct attribute_group *grp) { return 0; } static inline void sysfs_remove_group(struct kobject *kobj, const struct attribute_group *grp) { } static inline void sysfs_remove_groups(struct kobject *kobj, const struct attribute_group **groups) { } static inline int sysfs_add_file_to_group(struct kobject *kobj, const struct attribute *attr, const char *group) { return 0; } static inline void sysfs_remove_file_from_group(struct kobject *kobj, const struct attribute *attr, const char *group) { } static inline int sysfs_merge_group(struct kobject *kobj, const struct attribute_group *grp) { return 0; } static inline void sysfs_unmerge_group(struct kobject *kobj, const struct attribute_group *grp) { } static inline int sysfs_add_link_to_group(struct kobject *kobj, const char *group_name, struct kobject *target, const char *link_name) { return 0; } static inline void sysfs_remove_link_from_group(struct kobject *kobj, const char *group_name, const char *link_name) { } static inline int compat_only_sysfs_link_entry_to_kobj(struct kobject *kobj, struct kobject *target_kobj, const char *target_name, const char *symlink_name) { return 0; } static inline void sysfs_notify(struct kobject *kobj, const char *dir, const char *attr) { } static inline int __must_check sysfs_init(void) { return 0; } static inline void sysfs_enable_ns(struct kernfs_node *kn) { } static inline int sysfs_file_change_owner(struct kobject *kobj, const char *name, kuid_t kuid, kgid_t kgid) { return 0; } static inline int sysfs_link_change_owner(struct kobject *kobj, struct kobject *targ, const char *name, kuid_t kuid, kgid_t kgid) { return 0; } static inline int sysfs_change_owner(struct kobject *kobj, kuid_t kuid, kgid_t kgid) { return 0; } static inline int sysfs_groups_change_owner(struct kobject *kobj, const struct attribute_group **groups, kuid_t kuid, kgid_t kgid) { return 0; } static inline int sysfs_group_change_owner(struct kobject *kobj, const struct attribute_group *groups, kuid_t kuid, kgid_t kgid) { return 0; } __printf(2, 3) static inline int sysfs_emit(char *buf, const char *fmt, ...) { return 0; } __printf(3, 4) static inline int sysfs_emit_at(char *buf, int at, const char *fmt, ...) { return 0; } #endif /* CONFIG_SYSFS */ static inline int __must_check sysfs_create_file(struct kobject *kobj, const struct attribute *attr) { return sysfs_create_file_ns(kobj, attr, NULL); } static inline void sysfs_remove_file(struct kobject *kobj, const struct attribute *attr) { sysfs_remove_file_ns(kobj, attr, NULL); } static inline int sysfs_rename_link(struct kobject *kobj, struct kobject *target, const char *old_name, const char *new_name) { return sysfs_rename_link_ns(kobj, target, old_name, new_name, NULL); } static inline void sysfs_notify_dirent(struct kernfs_node *kn) { kernfs_notify(kn); } static inline struct kernfs_node *sysfs_get_dirent(struct kernfs_node *parent, const char *name) { return kernfs_find_and_get(parent, name); } static inline struct kernfs_node *sysfs_get(struct kernfs_node *kn) { kernfs_get(kn); return kn; } static inline void sysfs_put(struct kernfs_node *kn) { kernfs_put(kn); } #endif /* _SYSFS_H_ */ |
| 29 7 1208 1208 1 424 3 48 48 48 48 48 42 27 6 6 4 21 48 48 48 21 48 2 1 2 29 29 29 29 29 29 29 29 29 29 29 29 29 29 29 29 29 29 29 29 29 29 6 29 29 29 29 29 29 29 86 355 352 352 271 268 268 271 106 264 5 157 1859 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Neighbour Discovery for IPv6 * Linux INET6 implementation * * Authors: * Pedro Roque <roque@di.fc.ul.pt> * Mike Shaver <shaver@ingenia.com> */ /* * Changes: * * Alexey I. Froloff : RFC6106 (DNSSL) support * Pierre Ynard : export userland ND options * through netlink (RDNSS support) * Lars Fenneberg : fixed MTU setting on receipt * of an RA. * Janos Farkas : kmalloc failure checks * Alexey Kuznetsov : state machine reworked * and moved to net/core. * Pekka Savola : RFC2461 validation * YOSHIFUJI Hideaki @USAGI : Verify ND options properly */ #define pr_fmt(fmt) "ICMPv6: " fmt #include <linux/module.h> #include <linux/errno.h> #include <linux/types.h> #include <linux/socket.h> #include <linux/sockios.h> #include <linux/sched.h> #include <linux/net.h> #include <linux/in6.h> #include <linux/route.h> #include <linux/init.h> #include <linux/rcupdate.h> #include <linux/slab.h> #ifdef CONFIG_SYSCTL #include <linux/sysctl.h> #endif #include <linux/if_addr.h> #include <linux/if_ether.h> #include <linux/if_arp.h> #include <linux/ipv6.h> #include <linux/icmpv6.h> #include <linux/jhash.h> #include <net/sock.h> #include <net/snmp.h> #include <net/ipv6.h> #include <net/protocol.h> #include <net/ndisc.h> #include <net/ip6_route.h> #include <net/addrconf.h> #include <net/icmp.h> #include <net/netlink.h> #include <linux/rtnetlink.h> #include <net/flow.h> #include <net/ip6_checksum.h> #include <net/inet_common.h> #include <linux/proc_fs.h> #include <linux/netfilter.h> #include <linux/netfilter_ipv6.h> static u32 ndisc_hash(const void *pkey, const struct net_device *dev, __u32 *hash_rnd); static bool ndisc_key_eq(const struct neighbour *neigh, const void *pkey); static bool ndisc_allow_add(const struct net_device *dev, struct netlink_ext_ack *extack); static int ndisc_constructor(struct neighbour *neigh); static void ndisc_solicit(struct neighbour *neigh, struct sk_buff *skb); static void ndisc_error_report(struct neighbour *neigh, struct sk_buff *skb); static int pndisc_constructor(struct pneigh_entry *n); static void pndisc_destructor(struct pneigh_entry *n); static void pndisc_redo(struct sk_buff *skb); static int ndisc_is_multicast(const void *pkey); static const struct neigh_ops ndisc_generic_ops = { .family = AF_INET6, .solicit = ndisc_solicit, .error_report = ndisc_error_report, .output = neigh_resolve_output, .connected_output = neigh_connected_output, }; static const struct neigh_ops ndisc_hh_ops = { .family = AF_INET6, .solicit = ndisc_solicit, .error_report = ndisc_error_report, .output = neigh_resolve_output, .connected_output = neigh_resolve_output, }; static const struct neigh_ops ndisc_direct_ops = { .family = AF_INET6, .output = neigh_direct_output, .connected_output = neigh_direct_output, }; struct neigh_table nd_tbl = { .family = AF_INET6, .key_len = sizeof(struct in6_addr), .protocol = cpu_to_be16(ETH_P_IPV6), .hash = ndisc_hash, .key_eq = ndisc_key_eq, .constructor = ndisc_constructor, .pconstructor = pndisc_constructor, .pdestructor = pndisc_destructor, .proxy_redo = pndisc_redo, .is_multicast = ndisc_is_multicast, .allow_add = ndisc_allow_add, .id = "ndisc_cache", .parms = { .tbl = &nd_tbl, .reachable_time = ND_REACHABLE_TIME, .data = { [NEIGH_VAR_MCAST_PROBES] = 3, [NEIGH_VAR_UCAST_PROBES] = 3, [NEIGH_VAR_RETRANS_TIME] = ND_RETRANS_TIMER, [NEIGH_VAR_BASE_REACHABLE_TIME] = ND_REACHABLE_TIME, [NEIGH_VAR_DELAY_PROBE_TIME] = 5 * HZ, [NEIGH_VAR_INTERVAL_PROBE_TIME_MS] = 5 * HZ, [NEIGH_VAR_GC_STALETIME] = 60 * HZ, [NEIGH_VAR_QUEUE_LEN_BYTES] = SK_WMEM_MAX, [NEIGH_VAR_PROXY_QLEN] = 64, [NEIGH_VAR_ANYCAST_DELAY] = 1 * HZ, [NEIGH_VAR_PROXY_DELAY] = (8 * HZ) / 10, }, }, .gc_interval = 30 * HZ, .gc_thresh1 = 128, .gc_thresh2 = 512, .gc_thresh3 = 1024, }; EXPORT_SYMBOL_GPL(nd_tbl); void __ndisc_fill_addr_option(struct sk_buff *skb, int type, const void *data, int data_len, int pad) { int space = __ndisc_opt_addr_space(data_len, pad); u8 *opt = skb_put(skb, space); opt[0] = type; opt[1] = space>>3; memset(opt + 2, 0, pad); opt += pad; space -= pad; memcpy(opt+2, data, data_len); data_len += 2; opt += data_len; space -= data_len; if (space > 0) memset(opt, 0, space); } EXPORT_SYMBOL_GPL(__ndisc_fill_addr_option); static inline void ndisc_fill_addr_option(struct sk_buff *skb, int type, const void *data, u8 icmp6_type) { __ndisc_fill_addr_option(skb, type, data, skb->dev->addr_len, ndisc_addr_option_pad(skb->dev->type)); ndisc_ops_fill_addr_option(skb->dev, skb, icmp6_type); } static inline void ndisc_fill_redirect_addr_option(struct sk_buff *skb, void *ha, const u8 *ops_data) { ndisc_fill_addr_option(skb, ND_OPT_TARGET_LL_ADDR, ha, NDISC_REDIRECT); ndisc_ops_fill_redirect_addr_option(skb->dev, skb, ops_data); } static struct nd_opt_hdr *ndisc_next_option(struct nd_opt_hdr *cur, struct nd_opt_hdr *end) { int type; if (!cur || !end || cur >= end) return NULL; type = cur->nd_opt_type; do { cur = ((void *)cur) + (cur->nd_opt_len << 3); } while (cur < end && cur->nd_opt_type != type); return cur <= end && cur->nd_opt_type == type ? cur : NULL; } static inline int ndisc_is_useropt(const struct net_device *dev, struct nd_opt_hdr *opt) { return opt->nd_opt_type == ND_OPT_PREFIX_INFO || opt->nd_opt_type == ND_OPT_RDNSS || opt->nd_opt_type == ND_OPT_DNSSL || opt->nd_opt_type == ND_OPT_CAPTIVE_PORTAL || opt->nd_opt_type == ND_OPT_PREF64 || ndisc_ops_is_useropt(dev, opt->nd_opt_type); } static struct nd_opt_hdr *ndisc_next_useropt(const struct net_device *dev, struct nd_opt_hdr *cur, struct nd_opt_hdr *end) { if (!cur || !end || cur >= end) return NULL; do { cur = ((void *)cur) + (cur->nd_opt_len << 3); } while (cur < end && !ndisc_is_useropt(dev, cur)); return cur <= end && ndisc_is_useropt(dev, cur) ? cur : NULL; } struct ndisc_options *ndisc_parse_options(const struct net_device *dev, u8 *opt, int opt_len, struct ndisc_options *ndopts) { struct nd_opt_hdr *nd_opt = (struct nd_opt_hdr *)opt; if (!nd_opt || opt_len < 0 || !ndopts) return NULL; memset(ndopts, 0, sizeof(*ndopts)); while (opt_len) { int l; if (opt_len < sizeof(struct nd_opt_hdr)) return NULL; l = nd_opt->nd_opt_len << 3; if (opt_len < l || l == 0) return NULL; if (ndisc_ops_parse_options(dev, nd_opt, ndopts)) goto next_opt; switch (nd_opt->nd_opt_type) { case ND_OPT_SOURCE_LL_ADDR: case ND_OPT_TARGET_LL_ADDR: case ND_OPT_MTU: case ND_OPT_NONCE: case ND_OPT_REDIRECT_HDR: if (ndopts->nd_opt_array[nd_opt->nd_opt_type]) { ND_PRINTK(2, warn, "%s: duplicated ND6 option found: type=%d\n", __func__, nd_opt->nd_opt_type); } else { ndopts->nd_opt_array[nd_opt->nd_opt_type] = nd_opt; } break; case ND_OPT_PREFIX_INFO: ndopts->nd_opts_pi_end = nd_opt; if (!ndopts->nd_opt_array[nd_opt->nd_opt_type]) ndopts->nd_opt_array[nd_opt->nd_opt_type] = nd_opt; break; #ifdef CONFIG_IPV6_ROUTE_INFO case ND_OPT_ROUTE_INFO: ndopts->nd_opts_ri_end = nd_opt; if (!ndopts->nd_opts_ri) ndopts->nd_opts_ri = nd_opt; break; #endif default: if (ndisc_is_useropt(dev, nd_opt)) { ndopts->nd_useropts_end = nd_opt; if (!ndopts->nd_useropts) ndopts->nd_useropts = nd_opt; } else { /* * Unknown options must be silently ignored, * to accommodate future extension to the * protocol. */ ND_PRINTK(2, notice, "%s: ignored unsupported option; type=%d, len=%d\n", __func__, nd_opt->nd_opt_type, nd_opt->nd_opt_len); } } next_opt: opt_len -= l; nd_opt = ((void *)nd_opt) + l; } return ndopts; } int ndisc_mc_map(const struct in6_addr *addr, char *buf, struct net_device *dev, int dir) { switch (dev->type) { case ARPHRD_ETHER: case ARPHRD_IEEE802: /* Not sure. Check it later. --ANK */ case ARPHRD_FDDI: ipv6_eth_mc_map(addr, buf); return 0; case ARPHRD_ARCNET: ipv6_arcnet_mc_map(addr, buf); return 0; case ARPHRD_INFINIBAND: ipv6_ib_mc_map(addr, dev->broadcast, buf); return 0; case ARPHRD_IPGRE: return ipv6_ipgre_mc_map(addr, dev->broadcast, buf); default: if (dir) { memcpy(buf, dev->broadcast, dev->addr_len); return 0; } } return -EINVAL; } EXPORT_SYMBOL(ndisc_mc_map); static u32 ndisc_hash(const void *pkey, const struct net_device *dev, __u32 *hash_rnd) { return ndisc_hashfn(pkey, dev, hash_rnd); } static bool ndisc_key_eq(const struct neighbour *n, const void *pkey) { return neigh_key_eq128(n, pkey); } static int ndisc_constructor(struct neighbour *neigh) { struct in6_addr *addr = (struct in6_addr *)&neigh->primary_key; struct net_device *dev = neigh->dev; struct inet6_dev *in6_dev; struct neigh_parms *parms; bool is_multicast = ipv6_addr_is_multicast(addr); in6_dev = in6_dev_get(dev); if (!in6_dev) { return -EINVAL; } parms = in6_dev->nd_parms; __neigh_parms_put(neigh->parms); neigh->parms = neigh_parms_clone(parms); neigh->type = is_multicast ? RTN_MULTICAST : RTN_UNICAST; if (!dev->header_ops) { neigh->nud_state = NUD_NOARP; neigh->ops = &ndisc_direct_ops; neigh->output = neigh_direct_output; } else { if (is_multicast) { neigh->nud_state = NUD_NOARP; ndisc_mc_map(addr, neigh->ha, dev, 1); } else if (dev->flags&(IFF_NOARP|IFF_LOOPBACK)) { neigh->nud_state = NUD_NOARP; memcpy(neigh->ha, dev->dev_addr, dev->addr_len); if (dev->flags&IFF_LOOPBACK) neigh->type = RTN_LOCAL; } else if (dev->flags&IFF_POINTOPOINT) { neigh->nud_state = NUD_NOARP; memcpy(neigh->ha, dev->broadcast, dev->addr_len); } if (dev->header_ops->cache) neigh->ops = &ndisc_hh_ops; else neigh->ops = &ndisc_generic_ops; if (neigh->nud_state&NUD_VALID) neigh->output = neigh->ops->connected_output; else neigh->output = neigh->ops->output; } in6_dev_put(in6_dev); return 0; } static int pndisc_constructor(struct pneigh_entry *n) { struct in6_addr *addr = (struct in6_addr *)&n->key; struct in6_addr maddr; struct net_device *dev = n->dev; if (!dev || !__in6_dev_get(dev)) return -EINVAL; addrconf_addr_solict_mult(addr, &maddr); ipv6_dev_mc_inc(dev, &maddr); return 0; } static void pndisc_destructor(struct pneigh_entry *n) { struct in6_addr *addr = (struct in6_addr *)&n->key; struct in6_addr maddr; struct net_device *dev = n->dev; if (!dev || !__in6_dev_get(dev)) return; addrconf_addr_solict_mult(addr, &maddr); ipv6_dev_mc_dec(dev, &maddr); } /* called with rtnl held */ static bool ndisc_allow_add(const struct net_device *dev, struct netlink_ext_ack *extack) { struct inet6_dev *idev = __in6_dev_get(dev); if (!idev || idev->cnf.disable_ipv6) { NL_SET_ERR_MSG(extack, "IPv6 is disabled on this device"); return false; } return true; } static struct sk_buff *ndisc_alloc_skb(struct net_device *dev, int len) { int hlen = LL_RESERVED_SPACE(dev); int tlen = dev->needed_tailroom; struct sock *sk = dev_net(dev)->ipv6.ndisc_sk; struct sk_buff *skb; skb = alloc_skb(hlen + sizeof(struct ipv6hdr) + len + tlen, GFP_ATOMIC); if (!skb) { ND_PRINTK(0, err, "ndisc: %s failed to allocate an skb\n", __func__); return NULL; } skb->protocol = htons(ETH_P_IPV6); skb->dev = dev; skb_reserve(skb, hlen + sizeof(struct ipv6hdr)); skb_reset_transport_header(skb); /* Manually assign socket ownership as we avoid calling * sock_alloc_send_pskb() to bypass wmem buffer limits */ skb_set_owner_w(skb, sk); return skb; } static void ip6_nd_hdr(struct sk_buff *skb, const struct in6_addr *saddr, const struct in6_addr *daddr, int hop_limit, int len) { struct ipv6hdr *hdr; struct inet6_dev *idev; unsigned tclass; rcu_read_lock(); idev = __in6_dev_get(skb->dev); tclass = idev ? idev->cnf.ndisc_tclass : 0; rcu_read_unlock(); skb_push(skb, sizeof(*hdr)); skb_reset_network_header(skb); hdr = ipv6_hdr(skb); ip6_flow_hdr(hdr, tclass, 0); hdr->payload_len = htons(len); hdr->nexthdr = IPPROTO_ICMPV6; hdr->hop_limit = hop_limit; hdr->saddr = *saddr; hdr->daddr = *daddr; } void ndisc_send_skb(struct sk_buff *skb, const struct in6_addr *daddr, const struct in6_addr *saddr) { struct dst_entry *dst = skb_dst(skb); struct net *net = dev_net(skb->dev); struct sock *sk = net->ipv6.ndisc_sk; struct inet6_dev *idev; int err; struct icmp6hdr *icmp6h = icmp6_hdr(skb); u8 type; type = icmp6h->icmp6_type; if (!dst) { struct flowi6 fl6; int oif = skb->dev->ifindex; icmpv6_flow_init(sk, &fl6, type, saddr, daddr, oif); dst = icmp6_dst_alloc(skb->dev, &fl6); if (IS_ERR(dst)) { kfree_skb(skb); return; } skb_dst_set(skb, dst); } icmp6h->icmp6_cksum = csum_ipv6_magic(saddr, daddr, skb->len, IPPROTO_ICMPV6, csum_partial(icmp6h, skb->len, 0)); ip6_nd_hdr(skb, saddr, daddr, READ_ONCE(inet6_sk(sk)->hop_limit), skb->len); rcu_read_lock(); idev = __in6_dev_get(dst->dev); IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTREQUESTS); err = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, net, sk, skb, NULL, dst->dev, dst_output); if (!err) { ICMP6MSGOUT_INC_STATS(net, idev, type); ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS); } rcu_read_unlock(); } EXPORT_SYMBOL(ndisc_send_skb); void ndisc_send_na(struct net_device *dev, const struct in6_addr *daddr, const struct in6_addr *solicited_addr, bool router, bool solicited, bool override, bool inc_opt) { struct sk_buff *skb; struct in6_addr tmpaddr; struct inet6_ifaddr *ifp; const struct in6_addr *src_addr; struct nd_msg *msg; int optlen = 0; /* for anycast or proxy, solicited_addr != src_addr */ ifp = ipv6_get_ifaddr(dev_net(dev), solicited_addr, dev, 1); if (ifp) { src_addr = solicited_addr; if (ifp->flags & IFA_F_OPTIMISTIC) override = false; inc_opt |= ifp->idev->cnf.force_tllao; in6_ifa_put(ifp); } else { if (ipv6_dev_get_saddr(dev_net(dev), dev, daddr, inet6_sk(dev_net(dev)->ipv6.ndisc_sk)->srcprefs, &tmpaddr)) return; src_addr = &tmpaddr; } if (!dev->addr_len) inc_opt = false; if (inc_opt) optlen += ndisc_opt_addr_space(dev, NDISC_NEIGHBOUR_ADVERTISEMENT); skb = ndisc_alloc_skb(dev, sizeof(*msg) + optlen); if (!skb) return; msg = skb_put(skb, sizeof(*msg)); *msg = (struct nd_msg) { .icmph = { .icmp6_type = NDISC_NEIGHBOUR_ADVERTISEMENT, .icmp6_router = router, .icmp6_solicited = solicited, .icmp6_override = override, }, .target = *solicited_addr, }; if (inc_opt) ndisc_fill_addr_option(skb, ND_OPT_TARGET_LL_ADDR, dev->dev_addr, NDISC_NEIGHBOUR_ADVERTISEMENT); ndisc_send_skb(skb, daddr, src_addr); } static void ndisc_send_unsol_na(struct net_device *dev) { struct inet6_dev *idev; struct inet6_ifaddr *ifa; idev = in6_dev_get(dev); if (!idev) return; read_lock_bh(&idev->lock); list_for_each_entry(ifa, &idev->addr_list, if_list) { /* skip tentative addresses until dad completes */ if (ifa->flags & IFA_F_TENTATIVE && !(ifa->flags & IFA_F_OPTIMISTIC)) continue; ndisc_send_na(dev, &in6addr_linklocal_allnodes, &ifa->addr, /*router=*/ !!idev->cnf.forwarding, /*solicited=*/ false, /*override=*/ true, /*inc_opt=*/ true); } read_unlock_bh(&idev->lock); in6_dev_put(idev); } struct sk_buff *ndisc_ns_create(struct net_device *dev, const struct in6_addr *solicit, const struct in6_addr *saddr, u64 nonce) { int inc_opt = dev->addr_len; struct sk_buff *skb; struct nd_msg *msg; int optlen = 0; if (!saddr) return NULL; if (ipv6_addr_any(saddr)) inc_opt = false; if (inc_opt) optlen += ndisc_opt_addr_space(dev, NDISC_NEIGHBOUR_SOLICITATION); if (nonce != 0) optlen += 8; skb = ndisc_alloc_skb(dev, sizeof(*msg) + optlen); if (!skb) return NULL; msg = skb_put(skb, sizeof(*msg)); *msg = (struct nd_msg) { .icmph = { .icmp6_type = NDISC_NEIGHBOUR_SOLICITATION, }, .target = *solicit, }; if (inc_opt) ndisc_fill_addr_option(skb, ND_OPT_SOURCE_LL_ADDR, dev->dev_addr, NDISC_NEIGHBOUR_SOLICITATION); if (nonce != 0) { u8 *opt = skb_put(skb, 8); opt[0] = ND_OPT_NONCE; opt[1] = 8 >> 3; memcpy(opt + 2, &nonce, 6); } return skb; } EXPORT_SYMBOL(ndisc_ns_create); void ndisc_send_ns(struct net_device *dev, const struct in6_addr *solicit, const struct in6_addr *daddr, const struct in6_addr *saddr, u64 nonce) { struct in6_addr addr_buf; struct sk_buff *skb; if (!saddr) { if (ipv6_get_lladdr(dev, &addr_buf, (IFA_F_TENTATIVE | IFA_F_OPTIMISTIC))) return; saddr = &addr_buf; } skb = ndisc_ns_create(dev, solicit, saddr, nonce); if (skb) ndisc_send_skb(skb, daddr, saddr); } void ndisc_send_rs(struct net_device *dev, const struct in6_addr *saddr, const struct in6_addr *daddr) { struct sk_buff *skb; struct rs_msg *msg; int send_sllao = dev->addr_len; int optlen = 0; #ifdef CONFIG_IPV6_OPTIMISTIC_DAD /* * According to section 2.2 of RFC 4429, we must not * send router solicitations with a sllao from * optimistic addresses, but we may send the solicitation * if we don't include the sllao. So here we check * if our address is optimistic, and if so, we * suppress the inclusion of the sllao. */ if (send_sllao) { struct inet6_ifaddr *ifp = ipv6_get_ifaddr(dev_net(dev), saddr, dev, 1); if (ifp) { if (ifp->flags & IFA_F_OPTIMISTIC) { send_sllao = 0; } in6_ifa_put(ifp); } else { send_sllao = 0; } } #endif if (send_sllao) optlen += ndisc_opt_addr_space(dev, NDISC_ROUTER_SOLICITATION); skb = ndisc_alloc_skb(dev, sizeof(*msg) + optlen); if (!skb) return; msg = skb_put(skb, sizeof(*msg)); *msg = (struct rs_msg) { .icmph = { .icmp6_type = NDISC_ROUTER_SOLICITATION, }, }; if (send_sllao) ndisc_fill_addr_option(skb, ND_OPT_SOURCE_LL_ADDR, dev->dev_addr, NDISC_ROUTER_SOLICITATION); ndisc_send_skb(skb, daddr, saddr); } static void ndisc_error_report(struct neighbour *neigh, struct sk_buff *skb) { /* * "The sender MUST return an ICMP * destination unreachable" */ dst_link_failure(skb); kfree_skb(skb); } /* Called with locked neigh: either read or both */ static void ndisc_solicit(struct neighbour *neigh, struct sk_buff *skb) { struct in6_addr *saddr = NULL; struct in6_addr mcaddr; struct net_device *dev = neigh->dev; struct in6_addr *target = (struct in6_addr *)&neigh->primary_key; int probes = atomic_read(&neigh->probes); if (skb && ipv6_chk_addr_and_flags(dev_net(dev), &ipv6_hdr(skb)->saddr, dev, false, 1, IFA_F_TENTATIVE|IFA_F_OPTIMISTIC)) saddr = &ipv6_hdr(skb)->saddr; probes -= NEIGH_VAR(neigh->parms, UCAST_PROBES); if (probes < 0) { if (!(READ_ONCE(neigh->nud_state) & NUD_VALID)) { ND_PRINTK(1, dbg, "%s: trying to ucast probe in NUD_INVALID: %pI6\n", __func__, target); } ndisc_send_ns(dev, target, target, saddr, 0); } else if ((probes -= NEIGH_VAR(neigh->parms, APP_PROBES)) < 0) { neigh_app_ns(neigh); } else { addrconf_addr_solict_mult(target, &mcaddr); ndisc_send_ns(dev, target, &mcaddr, saddr, 0); } } static int pndisc_is_router(const void *pkey, struct net_device *dev) { struct pneigh_entry *n; int ret = -1; read_lock_bh(&nd_tbl.lock); n = __pneigh_lookup(&nd_tbl, dev_net(dev), pkey, dev); if (n) ret = !!(n->flags & NTF_ROUTER); read_unlock_bh(&nd_tbl.lock); return ret; } void ndisc_update(const struct net_device *dev, struct neighbour *neigh, const u8 *lladdr, u8 new, u32 flags, u8 icmp6_type, struct ndisc_options *ndopts) { neigh_update(neigh, lladdr, new, flags, 0); /* report ndisc ops about neighbour update */ ndisc_ops_update(dev, neigh, flags, icmp6_type, ndopts); } static enum skb_drop_reason ndisc_recv_ns(struct sk_buff *skb) { struct nd_msg *msg = (struct nd_msg *)skb_transport_header(skb); const struct in6_addr *saddr = &ipv6_hdr(skb)->saddr; const struct in6_addr *daddr = &ipv6_hdr(skb)->daddr; u8 *lladdr = NULL; u32 ndoptlen = skb_tail_pointer(skb) - (skb_transport_header(skb) + offsetof(struct nd_msg, opt)); struct ndisc_options ndopts; struct net_device *dev = skb->dev; struct inet6_ifaddr *ifp; struct inet6_dev *idev = NULL; struct neighbour *neigh; int dad = ipv6_addr_any(saddr); int is_router = -1; SKB_DR(reason); u64 nonce = 0; bool inc; if (skb->len < sizeof(struct nd_msg)) return SKB_DROP_REASON_PKT_TOO_SMALL; if (ipv6_addr_is_multicast(&msg->target)) { ND_PRINTK(2, warn, "NS: multicast target address\n"); return reason; } /* * RFC2461 7.1.1: * DAD has to be destined for solicited node multicast address. */ if (dad && !ipv6_addr_is_solict_mult(daddr)) { ND_PRINTK(2, warn, "NS: bad DAD packet (wrong destination)\n"); return reason; } if (!ndisc_parse_options(dev, msg->opt, ndoptlen, &ndopts)) return SKB_DROP_REASON_IPV6_NDISC_BAD_OPTIONS; if (ndopts.nd_opts_src_lladdr) { lladdr = ndisc_opt_addr_data(ndopts.nd_opts_src_lladdr, dev); if (!lladdr) { ND_PRINTK(2, warn, "NS: invalid link-layer address length\n"); return reason; } /* RFC2461 7.1.1: * If the IP source address is the unspecified address, * there MUST NOT be source link-layer address option * in the message. */ if (dad) { ND_PRINTK(2, warn, "NS: bad DAD packet (link-layer address option)\n"); return reason; } } if (ndopts.nd_opts_nonce && ndopts.nd_opts_nonce->nd_opt_len == 1) memcpy(&nonce, (u8 *)(ndopts.nd_opts_nonce + 1), 6); inc = ipv6_addr_is_multicast(daddr); ifp = ipv6_get_ifaddr(dev_net(dev), &msg->target, dev, 1); if (ifp) { have_ifp: if (ifp->flags & (IFA_F_TENTATIVE|IFA_F_OPTIMISTIC)) { if (dad) { if (nonce != 0 && ifp->dad_nonce == nonce) { u8 *np = (u8 *)&nonce; /* Matching nonce if looped back */ ND_PRINTK(2, notice, "%s: IPv6 DAD loopback for address %pI6c nonce %pM ignored\n", ifp->idev->dev->name, &ifp->addr, np); goto out; } /* * We are colliding with another node * who is doing DAD * so fail our DAD process */ addrconf_dad_failure(skb, ifp); return reason; } else { /* * This is not a dad solicitation. * If we are an optimistic node, * we should respond. * Otherwise, we should ignore it. */ if (!(ifp->flags & IFA_F_OPTIMISTIC)) goto out; } } idev = ifp->idev; } else { struct net *net = dev_net(dev); /* perhaps an address on the master device */ if (netif_is_l3_slave(dev)) { struct net_device *mdev; mdev = netdev_master_upper_dev_get_rcu(dev); if (mdev) { ifp = ipv6_get_ifaddr(net, &msg->target, mdev, 1); if (ifp) goto have_ifp; } } idev = in6_dev_get(dev); if (!idev) { /* XXX: count this drop? */ return reason; } if (ipv6_chk_acast_addr(net, dev, &msg->target) || (idev->cnf.forwarding && (net->ipv6.devconf_all->proxy_ndp || idev->cnf.proxy_ndp) && (is_router = pndisc_is_router(&msg->target, dev)) >= 0)) { if (!(NEIGH_CB(skb)->flags & LOCALLY_ENQUEUED) && skb->pkt_type != PACKET_HOST && inc && NEIGH_VAR(idev->nd_parms, PROXY_DELAY) != 0) { /* * for anycast or proxy, * sender should delay its response * by a random time between 0 and * MAX_ANYCAST_DELAY_TIME seconds. * (RFC2461) -- yoshfuji */ struct sk_buff *n = skb_clone(skb, GFP_ATOMIC); if (n) pneigh_enqueue(&nd_tbl, idev->nd_parms, n); goto out; } } else { SKB_DR_SET(reason, IPV6_NDISC_NS_OTHERHOST); goto out; } } if (is_router < 0) is_router = idev->cnf.forwarding; if (dad) { ndisc_send_na(dev, &in6addr_linklocal_allnodes, &msg->target, !!is_router, false, (ifp != NULL), true); goto out; } if (inc) NEIGH_CACHE_STAT_INC(&nd_tbl, rcv_probes_mcast); else NEIGH_CACHE_STAT_INC(&nd_tbl, rcv_probes_ucast); /* * update / create cache entry * for the source address */ neigh = __neigh_lookup(&nd_tbl, saddr, dev, !inc || lladdr || !dev->addr_len); if (neigh) ndisc_update(dev, neigh, lladdr, NUD_STALE, NEIGH_UPDATE_F_WEAK_OVERRIDE| NEIGH_UPDATE_F_OVERRIDE, NDISC_NEIGHBOUR_SOLICITATION, &ndopts); if (neigh || !dev->header_ops) { ndisc_send_na(dev, saddr, &msg->target, !!is_router, true, (ifp != NULL && inc), inc); if (neigh) neigh_release(neigh); reason = SKB_CONSUMED; } out: if (ifp) in6_ifa_put(ifp); else in6_dev_put(idev); return reason; } static int accept_untracked_na(struct net_device *dev, struct in6_addr *saddr) { struct inet6_dev *idev = __in6_dev_get(dev); switch (idev->cnf.accept_untracked_na) { case 0: /* Don't accept untracked na (absent in neighbor cache) */ return 0; case 1: /* Create new entries from na if currently untracked */ return 1; case 2: /* Create new entries from untracked na only if saddr is in the * same subnet as an address configured on the interface that * received the na */ return !!ipv6_chk_prefix(saddr, dev); default: return 0; } } static enum skb_drop_reason ndisc_recv_na(struct sk_buff *skb) { struct nd_msg *msg = (struct nd_msg *)skb_transport_header(skb); struct in6_addr *saddr = &ipv6_hdr(skb)->saddr; const struct in6_addr *daddr = &ipv6_hdr(skb)->daddr; u8 *lladdr = NULL; u32 ndoptlen = skb_tail_pointer(skb) - (skb_transport_header(skb) + offsetof(struct nd_msg, opt)); struct ndisc_options ndopts; struct net_device *dev = skb->dev; struct inet6_dev *idev = __in6_dev_get(dev); struct inet6_ifaddr *ifp; struct neighbour *neigh; SKB_DR(reason); u8 new_state; if (skb->len < sizeof(struct nd_msg)) return SKB_DROP_REASON_PKT_TOO_SMALL; if (ipv6_addr_is_multicast(&msg->target)) { ND_PRINTK(2, warn, "NA: target address is multicast\n"); return reason; } if (ipv6_addr_is_multicast(daddr) && msg->icmph.icmp6_solicited) { ND_PRINTK(2, warn, "NA: solicited NA is multicasted\n"); return reason; } /* For some 802.11 wireless deployments (and possibly other networks), * there will be a NA proxy and unsolicitd packets are attacks * and thus should not be accepted. * drop_unsolicited_na takes precedence over accept_untracked_na */ if (!msg->icmph.icmp6_solicited && idev && idev->cnf.drop_unsolicited_na) return reason; if (!ndisc_parse_options(dev, msg->opt, ndoptlen, &ndopts)) return SKB_DROP_REASON_IPV6_NDISC_BAD_OPTIONS; if (ndopts.nd_opts_tgt_lladdr) { lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr, dev); if (!lladdr) { ND_PRINTK(2, warn, "NA: invalid link-layer address length\n"); return reason; } } ifp = ipv6_get_ifaddr(dev_net(dev), &msg->target, dev, 1); if (ifp) { if (skb->pkt_type != PACKET_LOOPBACK && (ifp->flags & IFA_F_TENTATIVE)) { addrconf_dad_failure(skb, ifp); return reason; } /* What should we make now? The advertisement is invalid, but ndisc specs say nothing about it. It could be misconfiguration, or an smart proxy agent tries to help us :-) We should not print the error if NA has been received from loopback - it is just our own unsolicited advertisement. */ if (skb->pkt_type != PACKET_LOOPBACK) ND_PRINTK(1, warn, "NA: %pM advertised our address %pI6c on %s!\n", eth_hdr(skb)->h_source, &ifp->addr, ifp->idev->dev->name); in6_ifa_put(ifp); return reason; } neigh = neigh_lookup(&nd_tbl, &msg->target, dev); /* RFC 9131 updates original Neighbour Discovery RFC 4861. * NAs with Target LL Address option without a corresponding * entry in the neighbour cache can now create a STALE neighbour * cache entry on routers. * * entry accept fwding solicited behaviour * ------- ------ ------ --------- ---------------------- * present X X 0 Set state to STALE * present X X 1 Set state to REACHABLE * absent 0 X X Do nothing * absent 1 0 X Do nothing * absent 1 1 X Add a new STALE entry * * Note that we don't do a (daddr == all-routers-mcast) check. */ new_state = msg->icmph.icmp6_solicited ? NUD_REACHABLE : NUD_STALE; if (!neigh && lladdr && idev && idev->cnf.forwarding) { if (accept_untracked_na(dev, saddr)) { neigh = neigh_create(&nd_tbl, &msg->target, dev); new_state = NUD_STALE; } } if (neigh && !IS_ERR(neigh)) { u8 old_flags = neigh->flags; struct net *net = dev_net(dev); if (READ_ONCE(neigh->nud_state) & NUD_FAILED) goto out; /* * Don't update the neighbor cache entry on a proxy NA from * ourselves because either the proxied node is off link or it * has already sent a NA to us. */ if (lladdr && !memcmp(lladdr, dev->dev_addr, dev->addr_len) && net->ipv6.devconf_all->forwarding && net->ipv6.devconf_all->proxy_ndp && pneigh_lookup(&nd_tbl, net, &msg->target, dev, 0)) { /* XXX: idev->cnf.proxy_ndp */ goto out; } ndisc_update(dev, neigh, lladdr, new_state, NEIGH_UPDATE_F_WEAK_OVERRIDE| (msg->icmph.icmp6_override ? NEIGH_UPDATE_F_OVERRIDE : 0)| NEIGH_UPDATE_F_OVERRIDE_ISROUTER| (msg->icmph.icmp6_router ? NEIGH_UPDATE_F_ISROUTER : 0), NDISC_NEIGHBOUR_ADVERTISEMENT, &ndopts); if ((old_flags & ~neigh->flags) & NTF_ROUTER) { /* * Change: router to host */ rt6_clean_tohost(dev_net(dev), saddr); } reason = SKB_CONSUMED; out: neigh_release(neigh); } return reason; } static enum skb_drop_reason ndisc_recv_rs(struct sk_buff *skb) { struct rs_msg *rs_msg = (struct rs_msg *)skb_transport_header(skb); unsigned long ndoptlen = skb->len - sizeof(*rs_msg); struct neighbour *neigh; struct inet6_dev *idev; const struct in6_addr *saddr = &ipv6_hdr(skb)->saddr; struct ndisc_options ndopts; u8 *lladdr = NULL; SKB_DR(reason); if (skb->len < sizeof(*rs_msg)) return SKB_DROP_REASON_PKT_TOO_SMALL; idev = __in6_dev_get(skb->dev); if (!idev) { ND_PRINTK(1, err, "RS: can't find in6 device\n"); return reason; } /* Don't accept RS if we're not in router mode */ if (!idev->cnf.forwarding) goto out; /* * Don't update NCE if src = ::; * this implies that the source node has no ip address assigned yet. */ if (ipv6_addr_any(saddr)) goto out; /* Parse ND options */ if (!ndisc_parse_options(skb->dev, rs_msg->opt, ndoptlen, &ndopts)) return SKB_DROP_REASON_IPV6_NDISC_BAD_OPTIONS; if (ndopts.nd_opts_src_lladdr) { lladdr = ndisc_opt_addr_data(ndopts.nd_opts_src_lladdr, skb->dev); if (!lladdr) goto out; } neigh = __neigh_lookup(&nd_tbl, saddr, skb->dev, 1); if (neigh) { ndisc_update(skb->dev, neigh, lladdr, NUD_STALE, NEIGH_UPDATE_F_WEAK_OVERRIDE| NEIGH_UPDATE_F_OVERRIDE| NEIGH_UPDATE_F_OVERRIDE_ISROUTER, NDISC_ROUTER_SOLICITATION, &ndopts); neigh_release(neigh); reason = SKB_CONSUMED; } out: return reason; } static void ndisc_ra_useropt(struct sk_buff *ra, struct nd_opt_hdr *opt) { struct icmp6hdr *icmp6h = (struct icmp6hdr *)skb_transport_header(ra); struct sk_buff *skb; struct nlmsghdr *nlh; struct nduseroptmsg *ndmsg; struct net *net = dev_net(ra->dev); int err; int base_size = NLMSG_ALIGN(sizeof(struct nduseroptmsg) + (opt->nd_opt_len << 3)); size_t msg_size = base_size + nla_total_size(sizeof(struct in6_addr)); skb = nlmsg_new(msg_size, GFP_ATOMIC); if (!skb) { err = -ENOBUFS; goto errout; } nlh = nlmsg_put(skb, 0, 0, RTM_NEWNDUSEROPT, base_size, 0); if (!nlh) { goto nla_put_failure; } ndmsg = nlmsg_data(nlh); ndmsg->nduseropt_family = AF_INET6; ndmsg->nduseropt_ifindex = ra->dev->ifindex; ndmsg->nduseropt_icmp_type = icmp6h->icmp6_type; ndmsg->nduseropt_icmp_code = icmp6h->icmp6_code; ndmsg->nduseropt_opts_len = opt->nd_opt_len << 3; memcpy(ndmsg + 1, opt, opt->nd_opt_len << 3); if (nla_put_in6_addr(skb, NDUSEROPT_SRCADDR, &ipv6_hdr(ra)->saddr)) goto nla_put_failure; nlmsg_end(skb, nlh); rtnl_notify(skb, net, 0, RTNLGRP_ND_USEROPT, NULL, GFP_ATOMIC); return; nla_put_failure: nlmsg_free(skb); err = -EMSGSIZE; errout: rtnl_set_sk_err(net, RTNLGRP_ND_USEROPT, err); } static enum skb_drop_reason ndisc_router_discovery(struct sk_buff *skb) { struct ra_msg *ra_msg = (struct ra_msg *)skb_transport_header(skb); bool send_ifinfo_notify = false; struct neighbour *neigh = NULL; struct ndisc_options ndopts; struct fib6_info *rt = NULL; struct inet6_dev *in6_dev; u32 defrtr_usr_metric; unsigned int pref = 0; __u32 old_if_flags; struct net *net; SKB_DR(reason); int lifetime; int optlen; __u8 *opt = (__u8 *)(ra_msg + 1); optlen = (skb_tail_pointer(skb) - skb_transport_header(skb)) - sizeof(struct ra_msg); ND_PRINTK(2, info, "RA: %s, dev: %s\n", __func__, skb->dev->name); if (!(ipv6_addr_type(&ipv6_hdr(skb)->saddr) & IPV6_ADDR_LINKLOCAL)) { ND_PRINTK(2, warn, "RA: source address is not link-local\n"); return reason; } if (optlen < 0) return SKB_DROP_REASON_PKT_TOO_SMALL; #ifdef CONFIG_IPV6_NDISC_NODETYPE if (skb->ndisc_nodetype == NDISC_NODETYPE_HOST) { ND_PRINTK(2, warn, "RA: from host or unauthorized router\n"); return reason; } #endif in6_dev = __in6_dev_get(skb->dev); if (!in6_dev) { ND_PRINTK(0, err, "RA: can't find inet6 device for %s\n", skb->dev->name); return reason; } if (!ndisc_parse_options(skb->dev, opt, optlen, &ndopts)) return SKB_DROP_REASON_IPV6_NDISC_BAD_OPTIONS; if (!ipv6_accept_ra(in6_dev)) { ND_PRINTK(2, info, "RA: %s, did not accept ra for dev: %s\n", __func__, skb->dev->name); goto skip_linkparms; } #ifdef CONFIG_IPV6_NDISC_NODETYPE /* skip link-specific parameters from interior routers */ if (skb->ndisc_nodetype == NDISC_NODETYPE_NODEFAULT) { ND_PRINTK(2, info, "RA: %s, nodetype is NODEFAULT, dev: %s\n", __func__, skb->dev->name); goto skip_linkparms; } #endif if (in6_dev->if_flags & IF_RS_SENT) { /* * flag that an RA was received after an RS was sent * out on this interface. */ in6_dev->if_flags |= IF_RA_RCVD; } /* * Remember the managed/otherconf flags from most recently * received RA message (RFC 2462) -- yoshfuji */ old_if_flags = in6_dev->if_flags; in6_dev->if_flags = (in6_dev->if_flags & ~(IF_RA_MANAGED | IF_RA_OTHERCONF)) | (ra_msg->icmph.icmp6_addrconf_managed ? IF_RA_MANAGED : 0) | (ra_msg->icmph.icmp6_addrconf_other ? IF_RA_OTHERCONF : 0); if (old_if_flags != in6_dev->if_flags) send_ifinfo_notify = true; if (!in6_dev->cnf.accept_ra_defrtr) { ND_PRINTK(2, info, "RA: %s, defrtr is false for dev: %s\n", __func__, skb->dev->name); goto skip_defrtr; } lifetime = ntohs(ra_msg->icmph.icmp6_rt_lifetime); if (lifetime != 0 && lifetime < in6_dev->cnf.accept_ra_min_lft) { ND_PRINTK(2, info, "RA: router lifetime (%ds) is too short: %s\n", lifetime, skb->dev->name); goto skip_defrtr; } /* Do not accept RA with source-addr found on local machine unless * accept_ra_from_local is set to true. */ net = dev_net(in6_dev->dev); if (!in6_dev->cnf.accept_ra_from_local && ipv6_chk_addr(net, &ipv6_hdr(skb)->saddr, in6_dev->dev, 0)) { ND_PRINTK(2, info, "RA from local address detected on dev: %s: default router ignored\n", skb->dev->name); goto skip_defrtr; } #ifdef CONFIG_IPV6_ROUTER_PREF pref = ra_msg->icmph.icmp6_router_pref; /* 10b is handled as if it were 00b (medium) */ if (pref == ICMPV6_ROUTER_PREF_INVALID || !in6_dev->cnf.accept_ra_rtr_pref) pref = ICMPV6_ROUTER_PREF_MEDIUM; #endif /* routes added from RAs do not use nexthop objects */ rt = rt6_get_dflt_router(net, &ipv6_hdr(skb)->saddr, skb->dev); if (rt) { neigh = ip6_neigh_lookup(&rt->fib6_nh->fib_nh_gw6, rt->fib6_nh->fib_nh_dev, NULL, &ipv6_hdr(skb)->saddr); if (!neigh) { ND_PRINTK(0, err, "RA: %s got default router without neighbour\n", __func__); fib6_info_release(rt); return reason; } } /* Set default route metric as specified by user */ defrtr_usr_metric = in6_dev->cnf.ra_defrtr_metric; /* delete the route if lifetime is 0 or if metric needs change */ if (rt && (lifetime == 0 || rt->fib6_metric != defrtr_usr_metric)) { ip6_del_rt(net, rt, false); rt = NULL; } ND_PRINTK(3, info, "RA: rt: %p lifetime: %d, metric: %d, for dev: %s\n", rt, lifetime, defrtr_usr_metric, skb->dev->name); if (!rt && lifetime) { ND_PRINTK(3, info, "RA: adding default router\n"); if (neigh) neigh_release(neigh); rt = rt6_add_dflt_router(net, &ipv6_hdr(skb)->saddr, skb->dev, pref, defrtr_usr_metric); if (!rt) { ND_PRINTK(0, err, "RA: %s failed to add default route\n", __func__); return reason; } neigh = ip6_neigh_lookup(&rt->fib6_nh->fib_nh_gw6, rt->fib6_nh->fib_nh_dev, NULL, &ipv6_hdr(skb)->saddr); if (!neigh) { ND_PRINTK(0, err, "RA: %s got default router without neighbour\n", __func__); fib6_info_release(rt); return reason; } neigh->flags |= NTF_ROUTER; } else if (rt && IPV6_EXTRACT_PREF(rt->fib6_flags) != pref) { struct nl_info nlinfo = { .nl_net = net, }; rt->fib6_flags = (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref); inet6_rt_notify(RTM_NEWROUTE, rt, &nlinfo, NLM_F_REPLACE); } if (rt) fib6_set_expires(rt, jiffies + (HZ * lifetime)); if (in6_dev->cnf.accept_ra_min_hop_limit < 256 && ra_msg->icmph.icmp6_hop_limit) { if (in6_dev->cnf.accept_ra_min_hop_limit <= ra_msg->icmph.icmp6_hop_limit) { in6_dev->cnf.hop_limit = ra_msg->icmph.icmp6_hop_limit; fib6_metric_set(rt, RTAX_HOPLIMIT, ra_msg->icmph.icmp6_hop_limit); } else { ND_PRINTK(2, warn, "RA: Got route advertisement with lower hop_limit than minimum\n"); } } skip_defrtr: /* * Update Reachable Time and Retrans Timer */ if (in6_dev->nd_parms) { unsigned long rtime = ntohl(ra_msg->retrans_timer); if (rtime && rtime/1000 < MAX_SCHEDULE_TIMEOUT/HZ) { rtime = (rtime*HZ)/1000; if (rtime < HZ/100) rtime = HZ/100; NEIGH_VAR_SET(in6_dev->nd_parms, RETRANS_TIME, rtime); in6_dev->tstamp = jiffies; send_ifinfo_notify = true; } rtime = ntohl(ra_msg->reachable_time); if (rtime && rtime/1000 < MAX_SCHEDULE_TIMEOUT/(3*HZ)) { rtime = (rtime*HZ)/1000; if (rtime < HZ/10) rtime = HZ/10; if (rtime != NEIGH_VAR(in6_dev->nd_parms, BASE_REACHABLE_TIME)) { NEIGH_VAR_SET(in6_dev->nd_parms, BASE_REACHABLE_TIME, rtime); NEIGH_VAR_SET(in6_dev->nd_parms, GC_STALETIME, 3 * rtime); in6_dev->nd_parms->reachable_time = neigh_rand_reach_time(rtime); in6_dev->tstamp = jiffies; send_ifinfo_notify = true; } } } skip_linkparms: /* * Process options. */ if (!neigh) neigh = __neigh_lookup(&nd_tbl, &ipv6_hdr(skb)->saddr, skb->dev, 1); if (neigh) { u8 *lladdr = NULL; if (ndopts.nd_opts_src_lladdr) { lladdr = ndisc_opt_addr_data(ndopts.nd_opts_src_lladdr, skb->dev); if (!lladdr) { ND_PRINTK(2, warn, "RA: invalid link-layer address length\n"); goto out; } } ndisc_update(skb->dev, neigh, lladdr, NUD_STALE, NEIGH_UPDATE_F_WEAK_OVERRIDE| NEIGH_UPDATE_F_OVERRIDE| NEIGH_UPDATE_F_OVERRIDE_ISROUTER| NEIGH_UPDATE_F_ISROUTER, NDISC_ROUTER_ADVERTISEMENT, &ndopts); reason = SKB_CONSUMED; } if (!ipv6_accept_ra(in6_dev)) { ND_PRINTK(2, info, "RA: %s, accept_ra is false for dev: %s\n", __func__, skb->dev->name); goto out; } #ifdef CONFIG_IPV6_ROUTE_INFO if (!in6_dev->cnf.accept_ra_from_local && ipv6_chk_addr(dev_net(in6_dev->dev), &ipv6_hdr(skb)->saddr, in6_dev->dev, 0)) { ND_PRINTK(2, info, "RA from local address detected on dev: %s: router info ignored.\n", skb->dev->name); goto skip_routeinfo; } if (in6_dev->cnf.accept_ra_rtr_pref && ndopts.nd_opts_ri) { struct nd_opt_hdr *p; for (p = ndopts.nd_opts_ri; p; p = ndisc_next_option(p, ndopts.nd_opts_ri_end)) { struct route_info *ri = (struct route_info *)p; #ifdef CONFIG_IPV6_NDISC_NODETYPE if (skb->ndisc_nodetype == NDISC_NODETYPE_NODEFAULT && ri->prefix_len == 0) continue; #endif if (ri->prefix_len == 0 && !in6_dev->cnf.accept_ra_defrtr) continue; if (ri->lifetime != 0 && ntohl(ri->lifetime) < in6_dev->cnf.accept_ra_min_lft) continue; if (ri->prefix_len < in6_dev->cnf.accept_ra_rt_info_min_plen) continue; if (ri->prefix_len > in6_dev->cnf.accept_ra_rt_info_max_plen) continue; rt6_route_rcv(skb->dev, (u8 *)p, (p->nd_opt_len) << 3, &ipv6_hdr(skb)->saddr); } } skip_routeinfo: #endif #ifdef CONFIG_IPV6_NDISC_NODETYPE /* skip link-specific ndopts from interior routers */ if (skb->ndisc_nodetype == NDISC_NODETYPE_NODEFAULT) { ND_PRINTK(2, info, "RA: %s, nodetype is NODEFAULT (interior routes), dev: %s\n", __func__, skb->dev->name); goto out; } #endif if (in6_dev->cnf.accept_ra_pinfo && ndopts.nd_opts_pi) { struct nd_opt_hdr *p; for (p = ndopts.nd_opts_pi; p; p = ndisc_next_option(p, ndopts.nd_opts_pi_end)) { addrconf_prefix_rcv(skb->dev, (u8 *)p, (p->nd_opt_len) << 3, ndopts.nd_opts_src_lladdr != NULL); } } if (ndopts.nd_opts_mtu && in6_dev->cnf.accept_ra_mtu) { __be32 n; u32 mtu; memcpy(&n, ((u8 *)(ndopts.nd_opts_mtu+1))+2, sizeof(mtu)); mtu = ntohl(n); if (in6_dev->ra_mtu != mtu) { in6_dev->ra_mtu = mtu; send_ifinfo_notify = true; } if (mtu < IPV6_MIN_MTU || mtu > skb->dev->mtu) { ND_PRINTK(2, warn, "RA: invalid mtu: %d\n", mtu); } else if (in6_dev->cnf.mtu6 != mtu) { in6_dev->cnf.mtu6 = mtu; fib6_metric_set(rt, RTAX_MTU, mtu); rt6_mtu_change(skb->dev, mtu); } } if (ndopts.nd_useropts) { struct nd_opt_hdr *p; for (p = ndopts.nd_useropts; p; p = ndisc_next_useropt(skb->dev, p, ndopts.nd_useropts_end)) { ndisc_ra_useropt(skb, p); } } if (ndopts.nd_opts_tgt_lladdr || ndopts.nd_opts_rh) { ND_PRINTK(2, warn, "RA: invalid RA options\n"); } out: /* Send a notify if RA changed managed/otherconf flags or * timer settings or ra_mtu value */ if (send_ifinfo_notify) inet6_ifinfo_notify(RTM_NEWLINK, in6_dev); fib6_info_release(rt); if (neigh) neigh_release(neigh); return reason; } static enum skb_drop_reason ndisc_redirect_rcv(struct sk_buff *skb) { struct rd_msg *msg = (struct rd_msg *)skb_transport_header(skb); u32 ndoptlen = skb_tail_pointer(skb) - (skb_transport_header(skb) + offsetof(struct rd_msg, opt)); struct ndisc_options ndopts; SKB_DR(reason); u8 *hdr; #ifdef CONFIG_IPV6_NDISC_NODETYPE switch (skb->ndisc_nodetype) { case NDISC_NODETYPE_HOST: case NDISC_NODETYPE_NODEFAULT: ND_PRINTK(2, warn, "Redirect: from host or unauthorized router\n"); return reason; } #endif if (!(ipv6_addr_type(&ipv6_hdr(skb)->saddr) & IPV6_ADDR_LINKLOCAL)) { ND_PRINTK(2, warn, "Redirect: source address is not link-local\n"); return reason; } if (!ndisc_parse_options(skb->dev, msg->opt, ndoptlen, &ndopts)) return SKB_DROP_REASON_IPV6_NDISC_BAD_OPTIONS; if (!ndopts.nd_opts_rh) { ip6_redirect_no_header(skb, dev_net(skb->dev), skb->dev->ifindex); return reason; } hdr = (u8 *)ndopts.nd_opts_rh; hdr += 8; if (!pskb_pull(skb, hdr - skb_transport_header(skb))) return SKB_DROP_REASON_PKT_TOO_SMALL; return icmpv6_notify(skb, NDISC_REDIRECT, 0, 0); } static void ndisc_fill_redirect_hdr_option(struct sk_buff *skb, struct sk_buff *orig_skb, int rd_len) { u8 *opt = skb_put(skb, rd_len); memset(opt, 0, 8); *(opt++) = ND_OPT_REDIRECT_HDR; *(opt++) = (rd_len >> 3); opt += 6; skb_copy_bits(orig_skb, skb_network_offset(orig_skb), opt, rd_len - 8); } void ndisc_send_redirect(struct sk_buff *skb, const struct in6_addr *target) { struct net_device *dev = skb->dev; struct net *net = dev_net(dev); struct sock *sk = net->ipv6.ndisc_sk; int optlen = 0; struct inet_peer *peer; struct sk_buff *buff; struct rd_msg *msg; struct in6_addr saddr_buf; struct rt6_info *rt; struct dst_entry *dst; struct flowi6 fl6; int rd_len; u8 ha_buf[MAX_ADDR_LEN], *ha = NULL, ops_data_buf[NDISC_OPS_REDIRECT_DATA_SPACE], *ops_data = NULL; bool ret; if (netif_is_l3_master(skb->dev)) { dev = __dev_get_by_index(dev_net(skb->dev), IPCB(skb)->iif); if (!dev) return; } if (ipv6_get_lladdr(dev, &saddr_buf, IFA_F_TENTATIVE)) { ND_PRINTK(2, warn, "Redirect: no link-local address on %s\n", dev->name); return; } if (!ipv6_addr_equal(&ipv6_hdr(skb)->daddr, target) && ipv6_addr_type(target) != (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) { ND_PRINTK(2, warn, "Redirect: target address is not link-local unicast\n"); return; } icmpv6_flow_init(sk, &fl6, NDISC_REDIRECT, &saddr_buf, &ipv6_hdr(skb)->saddr, dev->ifindex); dst = ip6_route_output(net, NULL, &fl6); if (dst->error) { dst_release(dst); return; } dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), NULL, 0); if (IS_ERR(dst)) return; rt = (struct rt6_info *) dst; if (rt->rt6i_flags & RTF_GATEWAY) { ND_PRINTK(2, warn, "Redirect: destination is not a neighbour\n"); goto release; } peer = inet_getpeer_v6(net->ipv6.peers, &ipv6_hdr(skb)->saddr, 1); ret = inet_peer_xrlim_allow(peer, 1*HZ); if (peer) inet_putpeer(peer); if (!ret) goto release; if (dev->addr_len) { struct neighbour *neigh = dst_neigh_lookup(skb_dst(skb), target); if (!neigh) { ND_PRINTK(2, warn, "Redirect: no neigh for target address\n"); goto release; } read_lock_bh(&neigh->lock); if (neigh->nud_state & NUD_VALID) { memcpy(ha_buf, neigh->ha, dev->addr_len); read_unlock_bh(&neigh->lock); ha = ha_buf; optlen += ndisc_redirect_opt_addr_space(dev, neigh, ops_data_buf, &ops_data); } else read_unlock_bh(&neigh->lock); neigh_release(neigh); } rd_len = min_t(unsigned int, IPV6_MIN_MTU - sizeof(struct ipv6hdr) - sizeof(*msg) - optlen, skb->len + 8); rd_len &= ~0x7; optlen += rd_len; buff = ndisc_alloc_skb(dev, sizeof(*msg) + optlen); if (!buff) goto release; msg = skb_put(buff, sizeof(*msg)); *msg = (struct rd_msg) { .icmph = { .icmp6_type = NDISC_REDIRECT, }, .target = *target, .dest = ipv6_hdr(skb)->daddr, }; /* * include target_address option */ if (ha) ndisc_fill_redirect_addr_option(buff, ha, ops_data); /* * build redirect option and copy skb over to the new packet. */ if (rd_len) ndisc_fill_redirect_hdr_option(buff, skb, rd_len); skb_dst_set(buff, dst); ndisc_send_skb(buff, &ipv6_hdr(skb)->saddr, &saddr_buf); return; release: dst_release(dst); } static void pndisc_redo(struct sk_buff *skb) { enum skb_drop_reason reason = ndisc_recv_ns(skb); kfree_skb_reason(skb, reason); } static int ndisc_is_multicast(const void *pkey) { return ipv6_addr_is_multicast((struct in6_addr *)pkey); } static bool ndisc_suppress_frag_ndisc(struct sk_buff *skb) { struct inet6_dev *idev = __in6_dev_get(skb->dev); if (!idev) return true; if (IP6CB(skb)->flags & IP6SKB_FRAGMENTED && idev->cnf.suppress_frag_ndisc) { net_warn_ratelimited("Received fragmented ndisc packet. Carefully consider disabling suppress_frag_ndisc.\n"); return true; } return false; } enum skb_drop_reason ndisc_rcv(struct sk_buff *skb) { struct nd_msg *msg; SKB_DR(reason); if (ndisc_suppress_frag_ndisc(skb)) return SKB_DROP_REASON_IPV6_NDISC_FRAG; if (skb_linearize(skb)) return SKB_DROP_REASON_NOMEM; msg = (struct nd_msg *)skb_transport_header(skb); __skb_push(skb, skb->data - skb_transport_header(skb)); if (ipv6_hdr(skb)->hop_limit != 255) { ND_PRINTK(2, warn, "NDISC: invalid hop-limit: %d\n", ipv6_hdr(skb)->hop_limit); return SKB_DROP_REASON_IPV6_NDISC_HOP_LIMIT; } if (msg->icmph.icmp6_code != 0) { ND_PRINTK(2, warn, "NDISC: invalid ICMPv6 code: %d\n", msg->icmph.icmp6_code); return SKB_DROP_REASON_IPV6_NDISC_BAD_CODE; } switch (msg->icmph.icmp6_type) { case NDISC_NEIGHBOUR_SOLICITATION: memset(NEIGH_CB(skb), 0, sizeof(struct neighbour_cb)); reason = ndisc_recv_ns(skb); break; case NDISC_NEIGHBOUR_ADVERTISEMENT: reason = ndisc_recv_na(skb); break; case NDISC_ROUTER_SOLICITATION: reason = ndisc_recv_rs(skb); break; case NDISC_ROUTER_ADVERTISEMENT: reason = ndisc_router_discovery(skb); break; case NDISC_REDIRECT: reason = ndisc_redirect_rcv(skb); break; } return reason; } static int ndisc_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct netdev_notifier_change_info *change_info; struct net *net = dev_net(dev); struct inet6_dev *idev; bool evict_nocarrier; switch (event) { case NETDEV_CHANGEADDR: neigh_changeaddr(&nd_tbl, dev); fib6_run_gc(0, net, false); fallthrough; case NETDEV_UP: idev = in6_dev_get(dev); if (!idev) break; if (idev->cnf.ndisc_notify || net->ipv6.devconf_all->ndisc_notify) ndisc_send_unsol_na(dev); in6_dev_put(idev); break; case NETDEV_CHANGE: idev = in6_dev_get(dev); if (!idev) evict_nocarrier = true; else { evict_nocarrier = idev->cnf.ndisc_evict_nocarrier && net->ipv6.devconf_all->ndisc_evict_nocarrier; in6_dev_put(idev); } change_info = ptr; if (change_info->flags_changed & IFF_NOARP) neigh_changeaddr(&nd_tbl, dev); if (evict_nocarrier && !netif_carrier_ok(dev)) neigh_carrier_down(&nd_tbl, dev); break; case NETDEV_DOWN: neigh_ifdown(&nd_tbl, dev); fib6_run_gc(0, net, false); break; case NETDEV_NOTIFY_PEERS: ndisc_send_unsol_na(dev); break; default: break; } return NOTIFY_DONE; } static struct notifier_block ndisc_netdev_notifier = { .notifier_call = ndisc_netdev_event, .priority = ADDRCONF_NOTIFY_PRIORITY - 5, }; #ifdef CONFIG_SYSCTL static void ndisc_warn_deprecated_sysctl(struct ctl_table *ctl, const char *func, const char *dev_name) { static char warncomm[TASK_COMM_LEN]; static int warned; if (strcmp(warncomm, current->comm) && warned < 5) { strcpy(warncomm, current->comm); pr_warn("process `%s' is using deprecated sysctl (%s) net.ipv6.neigh.%s.%s - use net.ipv6.neigh.%s.%s_ms instead\n", warncomm, func, dev_name, ctl->procname, dev_name, ctl->procname); warned++; } } int ndisc_ifinfo_sysctl_change(struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct net_device *dev = ctl->extra1; struct inet6_dev *idev; int ret; if ((strcmp(ctl->procname, "retrans_time") == 0) || (strcmp(ctl->procname, "base_reachable_time") == 0)) ndisc_warn_deprecated_sysctl(ctl, "syscall", dev ? dev->name : "default"); if (strcmp(ctl->procname, "retrans_time") == 0) ret = neigh_proc_dointvec(ctl, write, buffer, lenp, ppos); else if (strcmp(ctl->procname, "base_reachable_time") == 0) ret = neigh_proc_dointvec_jiffies(ctl, write, buffer, lenp, ppos); else if ((strcmp(ctl->procname, "retrans_time_ms") == 0) || (strcmp(ctl->procname, "base_reachable_time_ms") == 0)) ret = neigh_proc_dointvec_ms_jiffies(ctl, write, buffer, lenp, ppos); else ret = -1; if (write && ret == 0 && dev && (idev = in6_dev_get(dev)) != NULL) { if (ctl->data == &NEIGH_VAR(idev->nd_parms, BASE_REACHABLE_TIME)) idev->nd_parms->reachable_time = neigh_rand_reach_time(NEIGH_VAR(idev->nd_parms, BASE_REACHABLE_TIME)); idev->tstamp = jiffies; inet6_ifinfo_notify(RTM_NEWLINK, idev); in6_dev_put(idev); } return ret; } #endif static int __net_init ndisc_net_init(struct net *net) { struct ipv6_pinfo *np; struct sock *sk; int err; err = inet_ctl_sock_create(&sk, PF_INET6, SOCK_RAW, IPPROTO_ICMPV6, net); if (err < 0) { ND_PRINTK(0, err, "NDISC: Failed to initialize the control socket (err %d)\n", err); return err; } net->ipv6.ndisc_sk = sk; np = inet6_sk(sk); np->hop_limit = 255; /* Do not loopback ndisc messages */ inet6_clear_bit(MC6_LOOP, sk); return 0; } static void __net_exit ndisc_net_exit(struct net *net) { inet_ctl_sock_destroy(net->ipv6.ndisc_sk); } static struct pernet_operations ndisc_net_ops = { .init = ndisc_net_init, .exit = ndisc_net_exit, }; int __init ndisc_init(void) { int err; err = register_pernet_subsys(&ndisc_net_ops); if (err) return err; /* * Initialize the neighbour table */ neigh_table_init(NEIGH_ND_TABLE, &nd_tbl); #ifdef CONFIG_SYSCTL err = neigh_sysctl_register(NULL, &nd_tbl.parms, ndisc_ifinfo_sysctl_change); if (err) goto out_unregister_pernet; out: #endif return err; #ifdef CONFIG_SYSCTL out_unregister_pernet: unregister_pernet_subsys(&ndisc_net_ops); goto out; #endif } int __init ndisc_late_init(void) { return register_netdevice_notifier(&ndisc_netdev_notifier); } void ndisc_late_cleanup(void) { unregister_netdevice_notifier(&ndisc_netdev_notifier); } void ndisc_cleanup(void) { #ifdef CONFIG_SYSCTL neigh_sysctl_unregister(&nd_tbl.parms); #endif neigh_table_clear(NEIGH_ND_TABLE, &nd_tbl); unregister_pernet_subsys(&ndisc_net_ops); } |
| 1 1 145 15 14 145 17 17 264 264 264 264 369 369 369 369 368 347 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 | // SPDX-License-Identifier: GPL-2.0-only /* * IPv6 library code, needed by static components when full IPv6 support is * not configured or static. These functions are needed by GSO/GRO implementation. */ #include <linux/export.h> #include <net/ip.h> #include <net/ipv6.h> #include <net/ip6_fib.h> #include <net/addrconf.h> #include <net/secure_seq.h> #include <linux/netfilter.h> static u32 __ipv6_select_ident(struct net *net, const struct in6_addr *dst, const struct in6_addr *src) { return get_random_u32_above(0); } /* This function exists only for tap drivers that must support broken * clients requesting UFO without specifying an IPv6 fragment ID. * * This is similar to ipv6_select_ident() but we use an independent hash * seed to limit information leakage. * * The network header must be set before calling this. */ __be32 ipv6_proxy_select_ident(struct net *net, struct sk_buff *skb) { struct in6_addr buf[2]; struct in6_addr *addrs; u32 id; addrs = skb_header_pointer(skb, skb_network_offset(skb) + offsetof(struct ipv6hdr, saddr), sizeof(buf), buf); if (!addrs) return 0; id = __ipv6_select_ident(net, &addrs[1], &addrs[0]); return htonl(id); } EXPORT_SYMBOL_GPL(ipv6_proxy_select_ident); __be32 ipv6_select_ident(struct net *net, const struct in6_addr *daddr, const struct in6_addr *saddr) { u32 id; id = __ipv6_select_ident(net, daddr, saddr); return htonl(id); } EXPORT_SYMBOL(ipv6_select_ident); int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr) { unsigned int offset = sizeof(struct ipv6hdr); unsigned int packet_len = skb_tail_pointer(skb) - skb_network_header(skb); int found_rhdr = 0; *nexthdr = &ipv6_hdr(skb)->nexthdr; while (offset <= packet_len) { struct ipv6_opt_hdr *exthdr; switch (**nexthdr) { case NEXTHDR_HOP: break; case NEXTHDR_ROUTING: found_rhdr = 1; break; case NEXTHDR_DEST: #if IS_ENABLED(CONFIG_IPV6_MIP6) if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0) break; #endif if (found_rhdr) return offset; break; default: return offset; } if (offset + sizeof(struct ipv6_opt_hdr) > packet_len) return -EINVAL; exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) + offset); offset += ipv6_optlen(exthdr); if (offset > IPV6_MAXPLEN) return -EINVAL; *nexthdr = &exthdr->nexthdr; } return -EINVAL; } EXPORT_SYMBOL(ip6_find_1stfragopt); #if IS_ENABLED(CONFIG_IPV6) int ip6_dst_hoplimit(struct dst_entry *dst) { int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT); if (hoplimit == 0) { struct net_device *dev = dst->dev; struct inet6_dev *idev; rcu_read_lock(); idev = __in6_dev_get(dev); if (idev) hoplimit = idev->cnf.hop_limit; else hoplimit = dev_net(dev)->ipv6.devconf_all->hop_limit; rcu_read_unlock(); } return hoplimit; } EXPORT_SYMBOL(ip6_dst_hoplimit); #endif int __ip6_local_out(struct net *net, struct sock *sk, struct sk_buff *skb) { int len; len = skb->len - sizeof(struct ipv6hdr); if (len > IPV6_MAXPLEN) len = 0; ipv6_hdr(skb)->payload_len = htons(len); IP6CB(skb)->nhoff = offsetof(struct ipv6hdr, nexthdr); /* if egress device is enslaved to an L3 master device pass the * skb to its handler for processing */ skb = l3mdev_ip6_out(sk, skb); if (unlikely(!skb)) return 0; skb->protocol = htons(ETH_P_IPV6); return nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, net, sk, skb, NULL, skb_dst(skb)->dev, dst_output); } EXPORT_SYMBOL_GPL(__ip6_local_out); int ip6_local_out(struct net *net, struct sock *sk, struct sk_buff *skb) { int err; err = __ip6_local_out(net, sk, skb); if (likely(err == 1)) err = dst_output(net, sk, skb); return err; } EXPORT_SYMBOL_GPL(ip6_local_out); |
| 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 | // SPDX-License-Identifier: GPL-2.0-only /* * Copyright 2003-2005 Devicescape Software, Inc. * Copyright (c) 2006 Jiri Benc <jbenc@suse.cz> * Copyright 2007 Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright(c) 2016 Intel Deutschland GmbH * Copyright (C) 2018 - 2023 Intel Corporation */ #include <linux/debugfs.h> #include <linux/ieee80211.h> #include "ieee80211_i.h" #include "debugfs.h" #include "debugfs_sta.h" #include "sta_info.h" #include "driver-ops.h" /* sta attributes */ #define STA_READ(name, field, format_string) \ static ssize_t sta_ ##name## _read(struct file *file, \ char __user *userbuf, \ size_t count, loff_t *ppos) \ { \ struct sta_info *sta = file->private_data; \ return mac80211_format_buffer(userbuf, count, ppos, \ format_string, sta->field); \ } #define STA_READ_D(name, field) STA_READ(name, field, "%d\n") #define STA_OPS(name) \ static const struct file_operations sta_ ##name## _ops = { \ .read = sta_##name##_read, \ .open = simple_open, \ .llseek = generic_file_llseek, \ } #define STA_OPS_RW(name) \ static const struct file_operations sta_ ##name## _ops = { \ .read = sta_##name##_read, \ .write = sta_##name##_write, \ .open = simple_open, \ .llseek = generic_file_llseek, \ } #define STA_FILE(name, field, format) \ STA_READ_##format(name, field) \ STA_OPS(name) STA_FILE(aid, sta.aid, D); static const char * const sta_flag_names[] = { #define FLAG(F) [WLAN_STA_##F] = #F FLAG(AUTH), FLAG(ASSOC), FLAG(PS_STA), FLAG(AUTHORIZED), FLAG(SHORT_PREAMBLE), FLAG(WDS), FLAG(CLEAR_PS_FILT), FLAG(MFP), FLAG(BLOCK_BA), FLAG(PS_DRIVER), FLAG(PSPOLL), FLAG(TDLS_PEER), FLAG(TDLS_PEER_AUTH), FLAG(TDLS_INITIATOR), FLAG(TDLS_CHAN_SWITCH), FLAG(TDLS_OFF_CHANNEL), FLAG(TDLS_WIDER_BW), FLAG(UAPSD), FLAG(SP), FLAG(4ADDR_EVENT), FLAG(INSERTED), FLAG(RATE_CONTROL), FLAG(TOFFSET_KNOWN), FLAG(MPSP_OWNER), FLAG(MPSP_RECIPIENT), FLAG(PS_DELIVER), FLAG(USES_ENCRYPTION), FLAG(DECAP_OFFLOAD), #undef FLAG }; static ssize_t sta_flags_read(struct file *file, char __user *userbuf, size_t count, loff_t *ppos) { char buf[16 * NUM_WLAN_STA_FLAGS], *pos = buf; char *end = buf + sizeof(buf) - 1; struct sta_info *sta = file->private_data; unsigned int flg; BUILD_BUG_ON(ARRAY_SIZE(sta_flag_names) != NUM_WLAN_STA_FLAGS); for (flg = 0; flg < NUM_WLAN_STA_FLAGS; flg++) { if (test_sta_flag(sta, flg)) pos += scnprintf(pos, end - pos, "%s\n", sta_flag_names[flg]); } return simple_read_from_buffer(userbuf, count, ppos, buf, strlen(buf)); } STA_OPS(flags); static ssize_t sta_num_ps_buf_frames_read(struct file *file, char __user *userbuf, size_t count, loff_t *ppos) { struct sta_info *sta = file->private_data; char buf[17*IEEE80211_NUM_ACS], *p = buf; int ac; for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) p += scnprintf(p, sizeof(buf)+buf-p, "AC%d: %d\n", ac, skb_queue_len(&sta->ps_tx_buf[ac]) + skb_queue_len(&sta->tx_filtered[ac])); return simple_read_from_buffer(userbuf, count, ppos, buf, p - buf); } STA_OPS(num_ps_buf_frames); static ssize_t sta_last_seq_ctrl_read(struct file *file, char __user *userbuf, size_t count, loff_t *ppos) { char buf[15*IEEE80211_NUM_TIDS], *p = buf; int i; struct sta_info *sta = file->private_data; for (i = 0; i < IEEE80211_NUM_TIDS; i++) p += scnprintf(p, sizeof(buf)+buf-p, "%x ", le16_to_cpu(sta->last_seq_ctrl[i])); p += scnprintf(p, sizeof(buf)+buf-p, "\n"); return simple_read_from_buffer(userbuf, count, ppos, buf, p - buf); } STA_OPS(last_seq_ctrl); #define AQM_TXQ_ENTRY_LEN 130 static ssize_t sta_aqm_read(struct file *file, char __user *userbuf, size_t count, loff_t *ppos) { struct sta_info *sta = file->private_data; struct ieee80211_local *local = sta->local; size_t bufsz = AQM_TXQ_ENTRY_LEN * (IEEE80211_NUM_TIDS + 2); char *buf = kzalloc(bufsz, GFP_KERNEL), *p = buf; struct txq_info *txqi; ssize_t rv; int i; if (!buf) return -ENOMEM; spin_lock_bh(&local->fq.lock); rcu_read_lock(); p += scnprintf(p, bufsz + buf - p, "target %uus interval %uus ecn %s\n", codel_time_to_us(sta->cparams.target), codel_time_to_us(sta->cparams.interval), sta->cparams.ecn ? "yes" : "no"); p += scnprintf(p, bufsz + buf - p, "tid ac backlog-bytes backlog-packets new-flows drops marks overlimit collisions tx-bytes tx-packets flags\n"); for (i = 0; i < ARRAY_SIZE(sta->sta.txq); i++) { if (!sta->sta.txq[i]) continue; txqi = to_txq_info(sta->sta.txq[i]); p += scnprintf(p, bufsz + buf - p, "%d %d %u %u %u %u %u %u %u %u %u 0x%lx(%s%s%s%s)\n", txqi->txq.tid, txqi->txq.ac, txqi->tin.backlog_bytes, txqi->tin.backlog_packets, txqi->tin.flows, txqi->cstats.drop_count, txqi->cstats.ecn_mark, txqi->tin.overlimit, txqi->tin.collisions, txqi->tin.tx_bytes, txqi->tin.tx_packets, txqi->flags, test_bit(IEEE80211_TXQ_STOP, &txqi->flags) ? "STOP" : "RUN", test_bit(IEEE80211_TXQ_AMPDU, &txqi->flags) ? " AMPDU" : "", test_bit(IEEE80211_TXQ_NO_AMSDU, &txqi->flags) ? " NO-AMSDU" : "", test_bit(IEEE80211_TXQ_DIRTY, &txqi->flags) ? " DIRTY" : ""); } rcu_read_unlock(); spin_unlock_bh(&local->fq.lock); rv = simple_read_from_buffer(userbuf, count, ppos, buf, p - buf); kfree(buf); return rv; } STA_OPS(aqm); static ssize_t sta_airtime_read(struct file *file, char __user *userbuf, size_t count, loff_t *ppos) { struct sta_info *sta = file->private_data; struct ieee80211_local *local = sta->sdata->local; size_t bufsz = 400; char *buf = kzalloc(bufsz, GFP_KERNEL), *p = buf; u64 rx_airtime = 0, tx_airtime = 0; s32 deficit[IEEE80211_NUM_ACS]; ssize_t rv; int ac; if (!buf) return -ENOMEM; for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) { spin_lock_bh(&local->active_txq_lock[ac]); rx_airtime += sta->airtime[ac].rx_airtime; tx_airtime += sta->airtime[ac].tx_airtime; deficit[ac] = sta->airtime[ac].deficit; spin_unlock_bh(&local->active_txq_lock[ac]); } p += scnprintf(p, bufsz + buf - p, "RX: %llu us\nTX: %llu us\nWeight: %u\n" "Deficit: VO: %d us VI: %d us BE: %d us BK: %d us\n", rx_airtime, tx_airtime, sta->airtime_weight, deficit[0], deficit[1], deficit[2], deficit[3]); rv = simple_read_from_buffer(userbuf, count, ppos, buf, p - buf); kfree(buf); return rv; } static ssize_t sta_airtime_write(struct file *file, const char __user *userbuf, size_t count, loff_t *ppos) { struct sta_info *sta = file->private_data; struct ieee80211_local *local = sta->sdata->local; int ac; for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) { spin_lock_bh(&local->active_txq_lock[ac]); sta->airtime[ac].rx_airtime = 0; sta->airtime[ac].tx_airtime = 0; sta->airtime[ac].deficit = sta->airtime_weight; spin_unlock_bh(&local->active_txq_lock[ac]); } return count; } STA_OPS_RW(airtime); static ssize_t sta_aql_read(struct file *file, char __user *userbuf, size_t count, loff_t *ppos) { struct sta_info *sta = file->private_data; struct ieee80211_local *local = sta->sdata->local; size_t bufsz = 400; char *buf = kzalloc(bufsz, GFP_KERNEL), *p = buf; u32 q_depth[IEEE80211_NUM_ACS]; u32 q_limit_l[IEEE80211_NUM_ACS], q_limit_h[IEEE80211_NUM_ACS]; ssize_t rv; int ac; if (!buf) return -ENOMEM; for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) { spin_lock_bh(&local->active_txq_lock[ac]); q_limit_l[ac] = sta->airtime[ac].aql_limit_low; q_limit_h[ac] = sta->airtime[ac].aql_limit_high; spin_unlock_bh(&local->active_txq_lock[ac]); q_depth[ac] = atomic_read(&sta->airtime[ac].aql_tx_pending); } p += scnprintf(p, bufsz + buf - p, "Q depth: VO: %u us VI: %u us BE: %u us BK: %u us\n" "Q limit[low/high]: VO: %u/%u VI: %u/%u BE: %u/%u BK: %u/%u\n", q_depth[0], q_depth[1], q_depth[2], q_depth[3], q_limit_l[0], q_limit_h[0], q_limit_l[1], q_limit_h[1], q_limit_l[2], q_limit_h[2], q_limit_l[3], q_limit_h[3]); rv = simple_read_from_buffer(userbuf, count, ppos, buf, p - buf); kfree(buf); return rv; } static ssize_t sta_aql_write(struct file *file, const char __user *userbuf, size_t count, loff_t *ppos) { struct sta_info *sta = file->private_data; u32 ac, q_limit_l, q_limit_h; char _buf[100] = {}, *buf = _buf; if (count > sizeof(_buf)) return -EINVAL; if (copy_from_user(buf, userbuf, count)) return -EFAULT; buf[sizeof(_buf) - 1] = '\0'; if (sscanf(buf, "limit %u %u %u", &ac, &q_limit_l, &q_limit_h) != 3) return -EINVAL; if (ac >= IEEE80211_NUM_ACS) return -EINVAL; sta->airtime[ac].aql_limit_low = q_limit_l; sta->airtime[ac].aql_limit_high = q_limit_h; return count; } STA_OPS_RW(aql); static ssize_t sta_agg_status_do_read(struct wiphy *wiphy, struct file *file, char *buf, size_t bufsz, void *data) { struct sta_info *sta = data; char *p = buf; int i; struct tid_ampdu_rx *tid_rx; struct tid_ampdu_tx *tid_tx; p += scnprintf(p, bufsz + buf - p, "next dialog_token: %#02x\n", sta->ampdu_mlme.dialog_token_allocator + 1); p += scnprintf(p, bufsz + buf - p, "TID\t\tRX\tDTKN\tSSN\t\tTX\tDTKN\tpending\n"); for (i = 0; i < IEEE80211_NUM_TIDS; i++) { bool tid_rx_valid; tid_rx = wiphy_dereference(wiphy, sta->ampdu_mlme.tid_rx[i]); tid_tx = wiphy_dereference(wiphy, sta->ampdu_mlme.tid_tx[i]); tid_rx_valid = test_bit(i, sta->ampdu_mlme.agg_session_valid); p += scnprintf(p, bufsz + buf - p, "%02d", i); p += scnprintf(p, bufsz + buf - p, "\t\t%x", tid_rx_valid); p += scnprintf(p, bufsz + buf - p, "\t%#.2x", tid_rx_valid ? sta->ampdu_mlme.tid_rx_token[i] : 0); p += scnprintf(p, bufsz + buf - p, "\t%#.3x", tid_rx ? tid_rx->ssn : 0); p += scnprintf(p, bufsz + buf - p, "\t\t%x", !!tid_tx); p += scnprintf(p, bufsz + buf - p, "\t%#.2x", tid_tx ? tid_tx->dialog_token : 0); p += scnprintf(p, bufsz + buf - p, "\t%03d", tid_tx ? skb_queue_len(&tid_tx->pending) : 0); p += scnprintf(p, bufsz + buf - p, "\n"); } return p - buf; } static ssize_t sta_agg_status_read(struct file *file, char __user *userbuf, size_t count, loff_t *ppos) { struct sta_info *sta = file->private_data; struct wiphy *wiphy = sta->local->hw.wiphy; size_t bufsz = 71 + IEEE80211_NUM_TIDS * 40; char *buf = kmalloc(bufsz, GFP_KERNEL); ssize_t ret; if (!buf) return -ENOMEM; ret = wiphy_locked_debugfs_read(wiphy, file, buf, bufsz, userbuf, count, ppos, sta_agg_status_do_read, sta); kfree(buf); return ret; } static ssize_t sta_agg_status_do_write(struct wiphy *wiphy, struct file *file, char *buf, size_t count, void *data) { struct sta_info *sta = data; bool start, tx; unsigned long tid; char *pos = buf; int ret, timeout = 5000; buf = strsep(&pos, " "); if (!buf) return -EINVAL; if (!strcmp(buf, "tx")) tx = true; else if (!strcmp(buf, "rx")) tx = false; else return -EINVAL; buf = strsep(&pos, " "); if (!buf) return -EINVAL; if (!strcmp(buf, "start")) { start = true; if (!tx) return -EINVAL; } else if (!strcmp(buf, "stop")) { start = false; } else { return -EINVAL; } buf = strsep(&pos, " "); if (!buf) return -EINVAL; if (sscanf(buf, "timeout=%d", &timeout) == 1) { buf = strsep(&pos, " "); if (!buf || !tx || !start) return -EINVAL; } ret = kstrtoul(buf, 0, &tid); if (ret || tid >= IEEE80211_NUM_TIDS) return -EINVAL; if (tx) { if (start) ret = ieee80211_start_tx_ba_session(&sta->sta, tid, timeout); else ret = ieee80211_stop_tx_ba_session(&sta->sta, tid); } else { __ieee80211_stop_rx_ba_session(sta, tid, WLAN_BACK_RECIPIENT, 3, true); ret = 0; } return ret ?: count; } static ssize_t sta_agg_status_write(struct file *file, const char __user *userbuf, size_t count, loff_t *ppos) { struct sta_info *sta = file->private_data; struct wiphy *wiphy = sta->local->hw.wiphy; char _buf[26]; return wiphy_locked_debugfs_write(wiphy, file, _buf, sizeof(_buf), userbuf, count, sta_agg_status_do_write, sta); } STA_OPS_RW(agg_status); /* link sta attributes */ #define LINK_STA_OPS(name) \ static const struct file_operations link_sta_ ##name## _ops = { \ .read = link_sta_##name##_read, \ .open = simple_open, \ .llseek = generic_file_llseek, \ } static ssize_t link_sta_addr_read(struct file *file, char __user *userbuf, size_t count, loff_t *ppos) { struct link_sta_info *link_sta = file->private_data; u8 mac[3 * ETH_ALEN + 1]; snprintf(mac, sizeof(mac), "%pM\n", link_sta->pub->addr); return simple_read_from_buffer(userbuf, count, ppos, mac, 3 * ETH_ALEN); } LINK_STA_OPS(addr); static ssize_t link_sta_ht_capa_read(struct file *file, char __user *userbuf, size_t count, loff_t *ppos) { #define PRINT_HT_CAP(_cond, _str) \ do { \ if (_cond) \ p += scnprintf(p, bufsz + buf - p, "\t" _str "\n"); \ } while (0) char *buf, *p; int i; ssize_t bufsz = 512; struct link_sta_info *link_sta = file->private_data; struct ieee80211_sta_ht_cap *htc = &link_sta->pub->ht_cap; ssize_t ret; buf = kzalloc(bufsz, GFP_KERNEL); if (!buf) return -ENOMEM; p = buf; p += scnprintf(p, bufsz + buf - p, "ht %ssupported\n", htc->ht_supported ? "" : "not "); if (htc->ht_supported) { p += scnprintf(p, bufsz + buf - p, "cap: %#.4x\n", htc->cap); PRINT_HT_CAP((htc->cap & BIT(0)), "RX LDPC"); PRINT_HT_CAP((htc->cap & BIT(1)), "HT20/HT40"); PRINT_HT_CAP(!(htc->cap & BIT(1)), "HT20"); PRINT_HT_CAP(((htc->cap >> 2) & 0x3) == 0, "Static SM Power Save"); PRINT_HT_CAP(((htc->cap >> 2) & 0x3) == 1, "Dynamic SM Power Save"); PRINT_HT_CAP(((htc->cap >> 2) & 0x3) == 3, "SM Power Save disabled"); PRINT_HT_CAP((htc->cap & BIT(4)), "RX Greenfield"); PRINT_HT_CAP((htc->cap & BIT(5)), "RX HT20 SGI"); PRINT_HT_CAP((htc->cap & BIT(6)), "RX HT40 SGI"); PRINT_HT_CAP((htc->cap & BIT(7)), "TX STBC"); PRINT_HT_CAP(((htc->cap >> 8) & 0x3) == 0, "No RX STBC"); PRINT_HT_CAP(((htc->cap >> 8) & 0x3) == 1, "RX STBC 1-stream"); PRINT_HT_CAP(((htc->cap >> 8) & 0x3) == 2, "RX STBC 2-streams"); PRINT_HT_CAP(((htc->cap >> 8) & 0x3) == 3, "RX STBC 3-streams"); PRINT_HT_CAP((htc->cap & BIT(10)), "HT Delayed Block Ack"); PRINT_HT_CAP(!(htc->cap & BIT(11)), "Max AMSDU length: " "3839 bytes"); PRINT_HT_CAP((htc->cap & BIT(11)), "Max AMSDU length: " "7935 bytes"); /* * For beacons and probe response this would mean the BSS * does or does not allow the usage of DSSS/CCK HT40. * Otherwise it means the STA does or does not use * DSSS/CCK HT40. */ PRINT_HT_CAP((htc->cap & BIT(12)), "DSSS/CCK HT40"); PRINT_HT_CAP(!(htc->cap & BIT(12)), "No DSSS/CCK HT40"); /* BIT(13) is reserved */ PRINT_HT_CAP((htc->cap & BIT(14)), "40 MHz Intolerant"); PRINT_HT_CAP((htc->cap & BIT(15)), "L-SIG TXOP protection"); p += scnprintf(p, bufsz + buf - p, "ampdu factor/density: %d/%d\n", htc->ampdu_factor, htc->ampdu_density); p += scnprintf(p, bufsz + buf - p, "MCS mask:"); for (i = 0; i < IEEE80211_HT_MCS_MASK_LEN; i++) p += scnprintf(p, bufsz + buf - p, " %.2x", htc->mcs.rx_mask[i]); p += scnprintf(p, bufsz + buf - p, "\n"); /* If not set this is meaningless */ if (le16_to_cpu(htc->mcs.rx_highest)) { p += scnprintf(p, bufsz + buf - p, "MCS rx highest: %d Mbps\n", le16_to_cpu(htc->mcs.rx_highest)); } p += scnprintf(p, bufsz + buf - p, "MCS tx params: %x\n", htc->mcs.tx_params); } ret = simple_read_from_buffer(userbuf, count, ppos, buf, p - buf); kfree(buf); return ret; } LINK_STA_OPS(ht_capa); static ssize_t link_sta_vht_capa_read(struct file *file, char __user *userbuf, size_t count, loff_t *ppos) { char *buf, *p; struct link_sta_info *link_sta = file->private_data; struct ieee80211_sta_vht_cap *vhtc = &link_sta->pub->vht_cap; ssize_t ret; ssize_t bufsz = 512; buf = kzalloc(bufsz, GFP_KERNEL); if (!buf) return -ENOMEM; p = buf; p += scnprintf(p, bufsz + buf - p, "VHT %ssupported\n", vhtc->vht_supported ? "" : "not "); if (vhtc->vht_supported) { p += scnprintf(p, bufsz + buf - p, "cap: %#.8x\n", vhtc->cap); #define PFLAG(a, b) \ do { \ if (vhtc->cap & IEEE80211_VHT_CAP_ ## a) \ p += scnprintf(p, bufsz + buf - p, \ "\t\t%s\n", b); \ } while (0) switch (vhtc->cap & 0x3) { case IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_3895: p += scnprintf(p, bufsz + buf - p, "\t\tMAX-MPDU-3895\n"); break; case IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_7991: p += scnprintf(p, bufsz + buf - p, "\t\tMAX-MPDU-7991\n"); break; case IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_11454: p += scnprintf(p, bufsz + buf - p, "\t\tMAX-MPDU-11454\n"); break; default: p += scnprintf(p, bufsz + buf - p, "\t\tMAX-MPDU-UNKNOWN\n"); } switch (vhtc->cap & IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_MASK) { case 0: p += scnprintf(p, bufsz + buf - p, "\t\t80Mhz\n"); break; case IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160MHZ: p += scnprintf(p, bufsz + buf - p, "\t\t160Mhz\n"); break; case IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160_80PLUS80MHZ: p += scnprintf(p, bufsz + buf - p, "\t\t80+80Mhz\n"); break; default: p += scnprintf(p, bufsz + buf - p, "\t\tUNKNOWN-MHZ: 0x%x\n", (vhtc->cap >> 2) & 0x3); } PFLAG(RXLDPC, "RXLDPC"); PFLAG(SHORT_GI_80, "SHORT-GI-80"); PFLAG(SHORT_GI_160, "SHORT-GI-160"); PFLAG(TXSTBC, "TXSTBC"); p += scnprintf(p, bufsz + buf - p, "\t\tRXSTBC_%d\n", (vhtc->cap >> 8) & 0x7); PFLAG(SU_BEAMFORMER_CAPABLE, "SU-BEAMFORMER-CAPABLE"); PFLAG(SU_BEAMFORMEE_CAPABLE, "SU-BEAMFORMEE-CAPABLE"); p += scnprintf(p, bufsz + buf - p, "\t\tBEAMFORMEE-STS: 0x%x\n", (vhtc->cap & IEEE80211_VHT_CAP_BEAMFORMEE_STS_MASK) >> IEEE80211_VHT_CAP_BEAMFORMEE_STS_SHIFT); p += scnprintf(p, bufsz + buf - p, "\t\tSOUNDING-DIMENSIONS: 0x%x\n", (vhtc->cap & IEEE80211_VHT_CAP_SOUNDING_DIMENSIONS_MASK) >> IEEE80211_VHT_CAP_SOUNDING_DIMENSIONS_SHIFT); PFLAG(MU_BEAMFORMER_CAPABLE, "MU-BEAMFORMER-CAPABLE"); PFLAG(MU_BEAMFORMEE_CAPABLE, "MU-BEAMFORMEE-CAPABLE"); PFLAG(VHT_TXOP_PS, "TXOP-PS"); PFLAG(HTC_VHT, "HTC-VHT"); p += scnprintf(p, bufsz + buf - p, "\t\tMPDU-LENGTH-EXPONENT: 0x%x\n", (vhtc->cap & IEEE80211_VHT_CAP_MAX_A_MPDU_LENGTH_EXPONENT_MASK) >> IEEE80211_VHT_CAP_MAX_A_MPDU_LENGTH_EXPONENT_SHIFT); PFLAG(VHT_LINK_ADAPTATION_VHT_UNSOL_MFB, "LINK-ADAPTATION-VHT-UNSOL-MFB"); p += scnprintf(p, bufsz + buf - p, "\t\tLINK-ADAPTATION-VHT-MRQ-MFB: 0x%x\n", (vhtc->cap & IEEE80211_VHT_CAP_VHT_LINK_ADAPTATION_VHT_MRQ_MFB) >> 26); PFLAG(RX_ANTENNA_PATTERN, "RX-ANTENNA-PATTERN"); PFLAG(TX_ANTENNA_PATTERN, "TX-ANTENNA-PATTERN"); p += scnprintf(p, bufsz + buf - p, "RX MCS: %.4x\n", le16_to_cpu(vhtc->vht_mcs.rx_mcs_map)); if (vhtc->vht_mcs.rx_highest) p += scnprintf(p, bufsz + buf - p, "MCS RX highest: %d Mbps\n", le16_to_cpu(vhtc->vht_mcs.rx_highest)); p += scnprintf(p, bufsz + buf - p, "TX MCS: %.4x\n", le16_to_cpu(vhtc->vht_mcs.tx_mcs_map)); if (vhtc->vht_mcs.tx_highest) p += scnprintf(p, bufsz + buf - p, "MCS TX highest: %d Mbps\n", le16_to_cpu(vhtc->vht_mcs.tx_highest)); #undef PFLAG } ret = simple_read_from_buffer(userbuf, count, ppos, buf, p - buf); kfree(buf); return ret; } LINK_STA_OPS(vht_capa); static ssize_t link_sta_he_capa_read(struct file *file, char __user *userbuf, size_t count, loff_t *ppos) { char *buf, *p; size_t buf_sz = PAGE_SIZE; struct link_sta_info *link_sta = file->private_data; struct ieee80211_sta_he_cap *hec = &link_sta->pub->he_cap; struct ieee80211_he_mcs_nss_supp *nss = &hec->he_mcs_nss_supp; u8 ppe_size; u8 *cap; int i; ssize_t ret; buf = kmalloc(buf_sz, GFP_KERNEL); if (!buf) return -ENOMEM; p = buf; p += scnprintf(p, buf_sz + buf - p, "HE %ssupported\n", hec->has_he ? "" : "not "); if (!hec->has_he) goto out; cap = hec->he_cap_elem.mac_cap_info; p += scnprintf(p, buf_sz + buf - p, "MAC-CAP: %#.2x %#.2x %#.2x %#.2x %#.2x %#.2x\n", cap[0], cap[1], cap[2], cap[3], cap[4], cap[5]); #define PRINT(fmt, ...) \ p += scnprintf(p, buf_sz + buf - p, "\t\t" fmt "\n", \ ##__VA_ARGS__) #define PFLAG(t, n, a, b) \ do { \ if (cap[n] & IEEE80211_HE_##t##_CAP##n##_##a) \ PRINT("%s", b); \ } while (0) #define PFLAG_RANGE(t, i, n, s, m, off, fmt) \ do { \ u8 msk = IEEE80211_HE_##t##_CAP##i##_##n##_MASK; \ u8 idx = ((cap[i] & msk) >> (ffs(msk) - 1)) + off; \ PRINT(fmt, (s << idx) + (m * idx)); \ } while (0) #define PFLAG_RANGE_DEFAULT(t, i, n, s, m, off, fmt, a, b) \ do { \ if (cap[i] == IEEE80211_HE_##t ##_CAP##i##_##n##_##a) { \ PRINT("%s", b); \ break; \ } \ PFLAG_RANGE(t, i, n, s, m, off, fmt); \ } while (0) PFLAG(MAC, 0, HTC_HE, "HTC-HE"); PFLAG(MAC, 0, TWT_REQ, "TWT-REQ"); PFLAG(MAC, 0, TWT_RES, "TWT-RES"); PFLAG_RANGE_DEFAULT(MAC, 0, DYNAMIC_FRAG, 0, 1, 0, "DYNAMIC-FRAG-LEVEL-%d", NOT_SUPP, "NOT-SUPP"); PFLAG_RANGE_DEFAULT(MAC, 0, MAX_NUM_FRAG_MSDU, 1, 0, 0, "MAX-NUM-FRAG-MSDU-%d", UNLIMITED, "UNLIMITED"); PFLAG_RANGE_DEFAULT(MAC, 1, MIN_FRAG_SIZE, 128, 0, -1, "MIN-FRAG-SIZE-%d", UNLIMITED, "UNLIMITED"); PFLAG_RANGE_DEFAULT(MAC, 1, TF_MAC_PAD_DUR, 0, 8, 0, "TF-MAC-PAD-DUR-%dUS", MASK, "UNKNOWN"); PFLAG_RANGE(MAC, 1, MULTI_TID_AGG_RX_QOS, 0, 1, 1, "MULTI-TID-AGG-RX-QOS-%d"); if (cap[0] & IEEE80211_HE_MAC_CAP0_HTC_HE) { switch (((cap[2] << 1) | (cap[1] >> 7)) & 0x3) { case 0: PRINT("LINK-ADAPTATION-NO-FEEDBACK"); break; case 1: PRINT("LINK-ADAPTATION-RESERVED"); break; case 2: PRINT("LINK-ADAPTATION-UNSOLICITED-FEEDBACK"); break; case 3: PRINT("LINK-ADAPTATION-BOTH"); break; } } PFLAG(MAC, 2, ALL_ACK, "ALL-ACK"); PFLAG(MAC, 2, TRS, "TRS"); PFLAG(MAC, 2, BSR, "BSR"); PFLAG(MAC, 2, BCAST_TWT, "BCAST-TWT"); PFLAG(MAC, 2, 32BIT_BA_BITMAP, "32BIT-BA-BITMAP"); PFLAG(MAC, 2, MU_CASCADING, "MU-CASCADING"); PFLAG(MAC, 2, ACK_EN, "ACK-EN"); PFLAG(MAC, 3, OMI_CONTROL, "OMI-CONTROL"); PFLAG(MAC, 3, OFDMA_RA, "OFDMA-RA"); switch (cap[3] & IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_MASK) { case IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_EXT_0: PRINT("MAX-AMPDU-LEN-EXP-USE-EXT-0"); break; case IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_EXT_1: PRINT("MAX-AMPDU-LEN-EXP-VHT-EXT-1"); break; case IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_EXT_2: PRINT("MAX-AMPDU-LEN-EXP-VHT-EXT-2"); break; case IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_EXT_3: PRINT("MAX-AMPDU-LEN-EXP-VHT-EXT-3"); break; } PFLAG(MAC, 3, AMSDU_FRAG, "AMSDU-FRAG"); PFLAG(MAC, 3, FLEX_TWT_SCHED, "FLEX-TWT-SCHED"); PFLAG(MAC, 3, RX_CTRL_FRAME_TO_MULTIBSS, "RX-CTRL-FRAME-TO-MULTIBSS"); PFLAG(MAC, 4, BSRP_BQRP_A_MPDU_AGG, "BSRP-BQRP-A-MPDU-AGG"); PFLAG(MAC, 4, QTP, "QTP"); PFLAG(MAC, 4, BQR, "BQR"); PFLAG(MAC, 4, PSR_RESP, "PSR-RESP"); PFLAG(MAC, 4, NDP_FB_REP, "NDP-FB-REP"); PFLAG(MAC, 4, OPS, "OPS"); PFLAG(MAC, 4, AMSDU_IN_AMPDU, "AMSDU-IN-AMPDU"); PRINT("MULTI-TID-AGG-TX-QOS-%d", ((cap[5] << 1) | (cap[4] >> 7)) & 0x7); PFLAG(MAC, 5, SUBCHAN_SELECTIVE_TRANSMISSION, "SUBCHAN-SELECTIVE-TRANSMISSION"); PFLAG(MAC, 5, UL_2x996_TONE_RU, "UL-2x996-TONE-RU"); PFLAG(MAC, 5, OM_CTRL_UL_MU_DATA_DIS_RX, "OM-CTRL-UL-MU-DATA-DIS-RX"); PFLAG(MAC, 5, HE_DYNAMIC_SM_PS, "HE-DYNAMIC-SM-PS"); PFLAG(MAC, 5, PUNCTURED_SOUNDING, "PUNCTURED-SOUNDING"); PFLAG(MAC, 5, HT_VHT_TRIG_FRAME_RX, "HT-VHT-TRIG-FRAME-RX"); cap = hec->he_cap_elem.phy_cap_info; p += scnprintf(p, buf_sz + buf - p, "PHY CAP: %#.2x %#.2x %#.2x %#.2x %#.2x %#.2x %#.2x %#.2x %#.2x %#.2x %#.2x\n", cap[0], cap[1], cap[2], cap[3], cap[4], cap[5], cap[6], cap[7], cap[8], cap[9], cap[10]); PFLAG(PHY, 0, CHANNEL_WIDTH_SET_40MHZ_IN_2G, "CHANNEL-WIDTH-SET-40MHZ-IN-2G"); PFLAG(PHY, 0, CHANNEL_WIDTH_SET_40MHZ_80MHZ_IN_5G, "CHANNEL-WIDTH-SET-40MHZ-80MHZ-IN-5G"); PFLAG(PHY, 0, CHANNEL_WIDTH_SET_160MHZ_IN_5G, "CHANNEL-WIDTH-SET-160MHZ-IN-5G"); PFLAG(PHY, 0, CHANNEL_WIDTH_SET_80PLUS80_MHZ_IN_5G, "CHANNEL-WIDTH-SET-80PLUS80-MHZ-IN-5G"); PFLAG(PHY, 0, CHANNEL_WIDTH_SET_RU_MAPPING_IN_2G, "CHANNEL-WIDTH-SET-RU-MAPPING-IN-2G"); PFLAG(PHY, 0, CHANNEL_WIDTH_SET_RU_MAPPING_IN_5G, "CHANNEL-WIDTH-SET-RU-MAPPING-IN-5G"); switch (cap[1] & IEEE80211_HE_PHY_CAP1_PREAMBLE_PUNC_RX_MASK) { case IEEE80211_HE_PHY_CAP1_PREAMBLE_PUNC_RX_80MHZ_ONLY_SECOND_20MHZ: PRINT("PREAMBLE-PUNC-RX-80MHZ-ONLY-SECOND-20MHZ"); break; case IEEE80211_HE_PHY_CAP1_PREAMBLE_PUNC_RX_80MHZ_ONLY_SECOND_40MHZ: PRINT("PREAMBLE-PUNC-RX-80MHZ-ONLY-SECOND-40MHZ"); break; case IEEE80211_HE_PHY_CAP1_PREAMBLE_PUNC_RX_160MHZ_ONLY_SECOND_20MHZ: PRINT("PREAMBLE-PUNC-RX-160MHZ-ONLY-SECOND-20MHZ"); break; case IEEE80211_HE_PHY_CAP1_PREAMBLE_PUNC_RX_160MHZ_ONLY_SECOND_40MHZ: PRINT("PREAMBLE-PUNC-RX-160MHZ-ONLY-SECOND-40MHZ"); break; } PFLAG(PHY, 1, DEVICE_CLASS_A, "IEEE80211-HE-PHY-CAP1-DEVICE-CLASS-A"); PFLAG(PHY, 1, LDPC_CODING_IN_PAYLOAD, "LDPC-CODING-IN-PAYLOAD"); PFLAG(PHY, 1, HE_LTF_AND_GI_FOR_HE_PPDUS_0_8US, "HY-CAP1-HE-LTF-AND-GI-FOR-HE-PPDUS-0-8US"); PRINT("MIDAMBLE-RX-MAX-NSTS-%d", ((cap[2] << 1) | (cap[1] >> 7)) & 0x3); PFLAG(PHY, 2, NDP_4x_LTF_AND_3_2US, "NDP-4X-LTF-AND-3-2US"); PFLAG(PHY, 2, STBC_TX_UNDER_80MHZ, "STBC-TX-UNDER-80MHZ"); PFLAG(PHY, 2, STBC_RX_UNDER_80MHZ, "STBC-RX-UNDER-80MHZ"); PFLAG(PHY, 2, DOPPLER_TX, "DOPPLER-TX"); PFLAG(PHY, 2, DOPPLER_RX, "DOPPLER-RX"); PFLAG(PHY, 2, UL_MU_FULL_MU_MIMO, "UL-MU-FULL-MU-MIMO"); PFLAG(PHY, 2, UL_MU_PARTIAL_MU_MIMO, "UL-MU-PARTIAL-MU-MIMO"); switch (cap[3] & IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_TX_MASK) { case IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_TX_NO_DCM: PRINT("DCM-MAX-CONST-TX-NO-DCM"); break; case IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_TX_BPSK: PRINT("DCM-MAX-CONST-TX-BPSK"); break; case IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_TX_QPSK: PRINT("DCM-MAX-CONST-TX-QPSK"); break; case IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_TX_16_QAM: PRINT("DCM-MAX-CONST-TX-16-QAM"); break; } PFLAG(PHY, 3, DCM_MAX_TX_NSS_1, "DCM-MAX-TX-NSS-1"); PFLAG(PHY, 3, DCM_MAX_TX_NSS_2, "DCM-MAX-TX-NSS-2"); switch (cap[3] & IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_RX_MASK) { case IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_RX_NO_DCM: PRINT("DCM-MAX-CONST-RX-NO-DCM"); break; case IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_RX_BPSK: PRINT("DCM-MAX-CONST-RX-BPSK"); break; case IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_RX_QPSK: PRINT("DCM-MAX-CONST-RX-QPSK"); break; case IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_RX_16_QAM: PRINT("DCM-MAX-CONST-RX-16-QAM"); break; } PFLAG(PHY, 3, DCM_MAX_RX_NSS_1, "DCM-MAX-RX-NSS-1"); PFLAG(PHY, 3, DCM_MAX_RX_NSS_2, "DCM-MAX-RX-NSS-2"); PFLAG(PHY, 3, RX_PARTIAL_BW_SU_IN_20MHZ_MU, "RX-PARTIAL-BW-SU-IN-20MHZ-MU"); PFLAG(PHY, 3, SU_BEAMFORMER, "SU-BEAMFORMER"); PFLAG(PHY, 4, SU_BEAMFORMEE, "SU-BEAMFORMEE"); PFLAG(PHY, 4, MU_BEAMFORMER, "MU-BEAMFORMER"); PFLAG_RANGE(PHY, 4, BEAMFORMEE_MAX_STS_UNDER_80MHZ, 0, 1, 4, "BEAMFORMEE-MAX-STS-UNDER-%d"); PFLAG_RANGE(PHY, 4, BEAMFORMEE_MAX_STS_ABOVE_80MHZ, 0, 1, 4, "BEAMFORMEE-MAX-STS-ABOVE-%d"); PFLAG_RANGE(PHY, 5, BEAMFORMEE_NUM_SND_DIM_UNDER_80MHZ, 0, 1, 1, "NUM-SND-DIM-UNDER-80MHZ-%d"); PFLAG_RANGE(PHY, 5, BEAMFORMEE_NUM_SND_DIM_ABOVE_80MHZ, 0, 1, 1, "NUM-SND-DIM-ABOVE-80MHZ-%d"); PFLAG(PHY, 5, NG16_SU_FEEDBACK, "NG16-SU-FEEDBACK"); PFLAG(PHY, 5, NG16_MU_FEEDBACK, "NG16-MU-FEEDBACK"); PFLAG(PHY, 6, CODEBOOK_SIZE_42_SU, "CODEBOOK-SIZE-42-SU"); PFLAG(PHY, 6, CODEBOOK_SIZE_75_MU, "CODEBOOK-SIZE-75-MU"); PFLAG(PHY, 6, TRIG_SU_BEAMFORMING_FB, "TRIG-SU-BEAMFORMING-FB"); PFLAG(PHY, 6, TRIG_MU_BEAMFORMING_PARTIAL_BW_FB, "MU-BEAMFORMING-PARTIAL-BW-FB"); PFLAG(PHY, 6, TRIG_CQI_FB, "TRIG-CQI-FB"); PFLAG(PHY, 6, PARTIAL_BW_EXT_RANGE, "PARTIAL-BW-EXT-RANGE"); PFLAG(PHY, 6, PARTIAL_BANDWIDTH_DL_MUMIMO, "PARTIAL-BANDWIDTH-DL-MUMIMO"); PFLAG(PHY, 6, PPE_THRESHOLD_PRESENT, "PPE-THRESHOLD-PRESENT"); PFLAG(PHY, 7, PSR_BASED_SR, "PSR-BASED-SR"); PFLAG(PHY, 7, POWER_BOOST_FACTOR_SUPP, "POWER-BOOST-FACTOR-SUPP"); PFLAG(PHY, 7, HE_SU_MU_PPDU_4XLTF_AND_08_US_GI, "HE-SU-MU-PPDU-4XLTF-AND-08-US-GI"); PFLAG_RANGE(PHY, 7, MAX_NC, 0, 1, 1, "MAX-NC-%d"); PFLAG(PHY, 7, STBC_TX_ABOVE_80MHZ, "STBC-TX-ABOVE-80MHZ"); PFLAG(PHY, 7, STBC_RX_ABOVE_80MHZ, "STBC-RX-ABOVE-80MHZ"); PFLAG(PHY, 8, HE_ER_SU_PPDU_4XLTF_AND_08_US_GI, "HE-ER-SU-PPDU-4XLTF-AND-08-US-GI"); PFLAG(PHY, 8, 20MHZ_IN_40MHZ_HE_PPDU_IN_2G, "20MHZ-IN-40MHZ-HE-PPDU-IN-2G"); PFLAG(PHY, 8, 20MHZ_IN_160MHZ_HE_PPDU, "20MHZ-IN-160MHZ-HE-PPDU"); PFLAG(PHY, 8, 80MHZ_IN_160MHZ_HE_PPDU, "80MHZ-IN-160MHZ-HE-PPDU"); PFLAG(PHY, 8, HE_ER_SU_1XLTF_AND_08_US_GI, "HE-ER-SU-1XLTF-AND-08-US-GI"); PFLAG(PHY, 8, MIDAMBLE_RX_TX_2X_AND_1XLTF, "MIDAMBLE-RX-TX-2X-AND-1XLTF"); switch (cap[8] & IEEE80211_HE_PHY_CAP8_DCM_MAX_RU_MASK) { case IEEE80211_HE_PHY_CAP8_DCM_MAX_RU_242: PRINT("DCM-MAX-RU-242"); break; case IEEE80211_HE_PHY_CAP8_DCM_MAX_RU_484: PRINT("DCM-MAX-RU-484"); break; case IEEE80211_HE_PHY_CAP8_DCM_MAX_RU_996: PRINT("DCM-MAX-RU-996"); break; case IEEE80211_HE_PHY_CAP8_DCM_MAX_RU_2x996: PRINT("DCM-MAX-RU-2x996"); break; } PFLAG(PHY, 9, LONGER_THAN_16_SIGB_OFDM_SYM, "LONGER-THAN-16-SIGB-OFDM-SYM"); PFLAG(PHY, 9, NON_TRIGGERED_CQI_FEEDBACK, "NON-TRIGGERED-CQI-FEEDBACK"); PFLAG(PHY, 9, TX_1024_QAM_LESS_THAN_242_TONE_RU, "TX-1024-QAM-LESS-THAN-242-TONE-RU"); PFLAG(PHY, 9, RX_1024_QAM_LESS_THAN_242_TONE_RU, "RX-1024-QAM-LESS-THAN-242-TONE-RU"); PFLAG(PHY, 9, RX_FULL_BW_SU_USING_MU_WITH_COMP_SIGB, "RX-FULL-BW-SU-USING-MU-WITH-COMP-SIGB"); PFLAG(PHY, 9, RX_FULL_BW_SU_USING_MU_WITH_NON_COMP_SIGB, "RX-FULL-BW-SU-USING-MU-WITH-NON-COMP-SIGB"); switch (u8_get_bits(cap[9], IEEE80211_HE_PHY_CAP9_NOMINAL_PKT_PADDING_MASK)) { case IEEE80211_HE_PHY_CAP9_NOMINAL_PKT_PADDING_0US: PRINT("NOMINAL-PACKET-PADDING-0US"); break; case IEEE80211_HE_PHY_CAP9_NOMINAL_PKT_PADDING_8US: PRINT("NOMINAL-PACKET-PADDING-8US"); break; case IEEE80211_HE_PHY_CAP9_NOMINAL_PKT_PADDING_16US: PRINT("NOMINAL-PACKET-PADDING-16US"); break; } #undef PFLAG_RANGE_DEFAULT #undef PFLAG_RANGE #undef PFLAG #define PRINT_NSS_SUPP(f, n) \ do { \ int _i; \ u16 v = le16_to_cpu(nss->f); \ p += scnprintf(p, buf_sz + buf - p, n ": %#.4x\n", v); \ for (_i = 0; _i < 8; _i += 2) { \ switch ((v >> _i) & 0x3) { \ case 0: \ PRINT(n "-%d-SUPPORT-0-7", _i / 2); \ break; \ case 1: \ PRINT(n "-%d-SUPPORT-0-9", _i / 2); \ break; \ case 2: \ PRINT(n "-%d-SUPPORT-0-11", _i / 2); \ break; \ case 3: \ PRINT(n "-%d-NOT-SUPPORTED", _i / 2); \ break; \ } \ } \ } while (0) PRINT_NSS_SUPP(rx_mcs_80, "RX-MCS-80"); PRINT_NSS_SUPP(tx_mcs_80, "TX-MCS-80"); if (cap[0] & IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_160MHZ_IN_5G) { PRINT_NSS_SUPP(rx_mcs_160, "RX-MCS-160"); PRINT_NSS_SUPP(tx_mcs_160, "TX-MCS-160"); } if (cap[0] & IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_80PLUS80_MHZ_IN_5G) { PRINT_NSS_SUPP(rx_mcs_80p80, "RX-MCS-80P80"); PRINT_NSS_SUPP(tx_mcs_80p80, "TX-MCS-80P80"); } #undef PRINT_NSS_SUPP #undef PRINT if (!(cap[6] & IEEE80211_HE_PHY_CAP6_PPE_THRESHOLD_PRESENT)) goto out; p += scnprintf(p, buf_sz + buf - p, "PPE-THRESHOLDS: %#.2x", hec->ppe_thres[0]); ppe_size = ieee80211_he_ppe_size(hec->ppe_thres[0], cap); for (i = 1; i < ppe_size; i++) { p += scnprintf(p, buf_sz + buf - p, " %#.2x", hec->ppe_thres[i]); } p += scnprintf(p, buf_sz + buf - p, "\n"); out: ret = simple_read_from_buffer(userbuf, count, ppos, buf, p - buf); kfree(buf); return ret; } LINK_STA_OPS(he_capa); static ssize_t link_sta_eht_capa_read(struct file *file, char __user *userbuf, size_t count, loff_t *ppos) { char *buf, *p; size_t buf_sz = PAGE_SIZE; struct link_sta_info *link_sta = file->private_data; struct ieee80211_sta_eht_cap *bec = &link_sta->pub->eht_cap; struct ieee80211_eht_cap_elem_fixed *fixed = &bec->eht_cap_elem; struct ieee80211_eht_mcs_nss_supp *nss = &bec->eht_mcs_nss_supp; u8 *cap; int i; ssize_t ret; static const char *mcs_desc[] = { "0-7", "8-9", "10-11", "12-13"}; buf = kmalloc(buf_sz, GFP_KERNEL); if (!buf) return -ENOMEM; p = buf; p += scnprintf(p, buf_sz + buf - p, "EHT %ssupported\n", bec->has_eht ? "" : "not "); if (!bec->has_eht) goto out; p += scnprintf(p, buf_sz + buf - p, "MAC-CAP: %#.2x %#.2x\n", fixed->mac_cap_info[0], fixed->mac_cap_info[1]); p += scnprintf(p, buf_sz + buf - p, "PHY-CAP: %#.2x %#.2x %#.2x %#.2x %#.2x %#.2x %#.2x %#.2x %#.2x\n", fixed->phy_cap_info[0], fixed->phy_cap_info[1], fixed->phy_cap_info[2], fixed->phy_cap_info[3], fixed->phy_cap_info[4], fixed->phy_cap_info[5], fixed->phy_cap_info[6], fixed->phy_cap_info[7], fixed->phy_cap_info[8]); #define PRINT(fmt, ...) \ p += scnprintf(p, buf_sz + buf - p, "\t\t" fmt "\n", \ ##__VA_ARGS__) #define PFLAG(t, n, a, b) \ do { \ if (cap[n] & IEEE80211_EHT_##t##_CAP##n##_##a) \ PRINT("%s", b); \ } while (0) cap = fixed->mac_cap_info; PFLAG(MAC, 0, EPCS_PRIO_ACCESS, "EPCS-PRIO-ACCESS"); PFLAG(MAC, 0, OM_CONTROL, "OM-CONTROL"); PFLAG(MAC, 0, TRIG_TXOP_SHARING_MODE1, "TRIG-TXOP-SHARING-MODE1"); PFLAG(MAC, 0, TRIG_TXOP_SHARING_MODE2, "TRIG-TXOP-SHARING-MODE2"); PFLAG(MAC, 0, RESTRICTED_TWT, "RESTRICTED-TWT"); PFLAG(MAC, 0, SCS_TRAFFIC_DESC, "SCS-TRAFFIC-DESC"); switch ((cap[0] & 0xc0) >> 6) { case IEEE80211_EHT_MAC_CAP0_MAX_MPDU_LEN_3895: PRINT("MAX-MPDU-LEN: 3985"); break; case IEEE80211_EHT_MAC_CAP0_MAX_MPDU_LEN_7991: PRINT("MAX-MPDU-LEN: 7991"); break; case IEEE80211_EHT_MAC_CAP0_MAX_MPDU_LEN_11454: PRINT("MAX-MPDU-LEN: 11454"); break; } cap = fixed->phy_cap_info; PFLAG(PHY, 0, 320MHZ_IN_6GHZ, "320MHZ-IN-6GHZ"); PFLAG(PHY, 0, 242_TONE_RU_GT20MHZ, "242-TONE-RU-GT20MHZ"); PFLAG(PHY, 0, NDP_4_EHT_LFT_32_GI, "NDP-4-EHT-LFT-32-GI"); PFLAG(PHY, 0, PARTIAL_BW_UL_MU_MIMO, "PARTIAL-BW-UL-MU-MIMO"); PFLAG(PHY, 0, SU_BEAMFORMER, "SU-BEAMFORMER"); PFLAG(PHY, 0, SU_BEAMFORMEE, "SU-BEAMFORMEE"); i = cap[0] >> 7; i |= (cap[1] & 0x3) << 1; PRINT("BEAMFORMEE-80-NSS: %i", i); PRINT("BEAMFORMEE-160-NSS: %i", (cap[1] >> 2) & 0x7); PRINT("BEAMFORMEE-320-NSS: %i", (cap[1] >> 5) & 0x7); PRINT("SOUNDING-DIM-80-NSS: %i", (cap[2] & 0x7)); PRINT("SOUNDING-DIM-160-NSS: %i", (cap[2] >> 3) & 0x7); i = cap[2] >> 6; i |= (cap[3] & 0x1) << 3; PRINT("SOUNDING-DIM-320-NSS: %i", i); PFLAG(PHY, 3, NG_16_SU_FEEDBACK, "NG-16-SU-FEEDBACK"); PFLAG(PHY, 3, NG_16_MU_FEEDBACK, "NG-16-MU-FEEDBACK"); PFLAG(PHY, 3, CODEBOOK_4_2_SU_FDBK, "CODEBOOK-4-2-SU-FDBK"); PFLAG(PHY, 3, CODEBOOK_7_5_MU_FDBK, "CODEBOOK-7-5-MU-FDBK"); PFLAG(PHY, 3, TRIG_SU_BF_FDBK, "TRIG-SU-BF-FDBK"); PFLAG(PHY, 3, TRIG_MU_BF_PART_BW_FDBK, "TRIG-MU-BF-PART-BW-FDBK"); PFLAG(PHY, 3, TRIG_CQI_FDBK, "TRIG-CQI-FDBK"); PFLAG(PHY, 4, PART_BW_DL_MU_MIMO, "PART-BW-DL-MU-MIMO"); PFLAG(PHY, 4, PSR_SR_SUPP, "PSR-SR-SUPP"); PFLAG(PHY, 4, POWER_BOOST_FACT_SUPP, "POWER-BOOST-FACT-SUPP"); PFLAG(PHY, 4, EHT_MU_PPDU_4_EHT_LTF_08_GI, "EHT-MU-PPDU-4-EHT-LTF-08-GI"); PRINT("MAX_NC: %i", cap[4] >> 4); PFLAG(PHY, 5, NON_TRIG_CQI_FEEDBACK, "NON-TRIG-CQI-FEEDBACK"); PFLAG(PHY, 5, TX_LESS_242_TONE_RU_SUPP, "TX-LESS-242-TONE-RU-SUPP"); PFLAG(PHY, 5, RX_LESS_242_TONE_RU_SUPP, "RX-LESS-242-TONE-RU-SUPP"); PFLAG(PHY, 5, PPE_THRESHOLD_PRESENT, "PPE_THRESHOLD_PRESENT"); switch (cap[5] >> 4 & 0x3) { case IEEE80211_EHT_PHY_CAP5_COMMON_NOMINAL_PKT_PAD_0US: PRINT("NOMINAL_PKT_PAD: 0us"); break; case IEEE80211_EHT_PHY_CAP5_COMMON_NOMINAL_PKT_PAD_8US: PRINT("NOMINAL_PKT_PAD: 8us"); break; case IEEE80211_EHT_PHY_CAP5_COMMON_NOMINAL_PKT_PAD_16US: PRINT("NOMINAL_PKT_PAD: 16us"); break; case IEEE80211_EHT_PHY_CAP5_COMMON_NOMINAL_PKT_PAD_20US: PRINT("NOMINAL_PKT_PAD: 20us"); break; } i = cap[5] >> 6; i |= cap[6] & 0x7; PRINT("MAX-NUM-SUPP-EHT-LTF: %i", i); PFLAG(PHY, 5, SUPP_EXTRA_EHT_LTF, "SUPP-EXTRA-EHT-LTF"); i = (cap[6] >> 3) & 0xf; PRINT("MCS15-SUPP-MASK: %i", i); PFLAG(PHY, 6, EHT_DUP_6GHZ_SUPP, "EHT-DUP-6GHZ-SUPP"); PFLAG(PHY, 7, 20MHZ_STA_RX_NDP_WIDER_BW, "20MHZ-STA-RX-NDP-WIDER-BW"); PFLAG(PHY, 7, NON_OFDMA_UL_MU_MIMO_80MHZ, "NON-OFDMA-UL-MU-MIMO-80MHZ"); PFLAG(PHY, 7, NON_OFDMA_UL_MU_MIMO_160MHZ, "NON-OFDMA-UL-MU-MIMO-160MHZ"); PFLAG(PHY, 7, NON_OFDMA_UL_MU_MIMO_320MHZ, "NON-OFDMA-UL-MU-MIMO-320MHZ"); PFLAG(PHY, 7, MU_BEAMFORMER_80MHZ, "MU-BEAMFORMER-80MHZ"); PFLAG(PHY, 7, MU_BEAMFORMER_160MHZ, "MU-BEAMFORMER-160MHZ"); PFLAG(PHY, 7, MU_BEAMFORMER_320MHZ, "MU-BEAMFORMER-320MHZ"); PFLAG(PHY, 7, TB_SOUNDING_FDBK_RATE_LIMIT, "TB-SOUNDING-FDBK-RATE-LIMIT"); PFLAG(PHY, 8, RX_1024QAM_WIDER_BW_DL_OFDMA, "RX-1024QAM-WIDER-BW-DL-OFDMA"); PFLAG(PHY, 8, RX_4096QAM_WIDER_BW_DL_OFDMA, "RX-4096QAM-WIDER-BW-DL-OFDMA"); #undef PFLAG PRINT(""); /* newline */ if (!(link_sta->pub->he_cap.he_cap_elem.phy_cap_info[0] & IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_MASK_ALL)) { u8 *mcs_vals = (u8 *)(&nss->only_20mhz); for (i = 0; i < 4; i++) PRINT("EHT bw=20 MHz, max NSS for MCS %s: Rx=%u, Tx=%u", mcs_desc[i], mcs_vals[i] & 0xf, mcs_vals[i] >> 4); } else { u8 *mcs_vals = (u8 *)(&nss->bw._80); for (i = 0; i < 3; i++) PRINT("EHT bw <= 80 MHz, max NSS for MCS %s: Rx=%u, Tx=%u", mcs_desc[i + 1], mcs_vals[i] & 0xf, mcs_vals[i] >> 4); mcs_vals = (u8 *)(&nss->bw._160); for (i = 0; i < 3; i++) PRINT("EHT bw <= 160 MHz, max NSS for MCS %s: Rx=%u, Tx=%u", mcs_desc[i + 1], mcs_vals[i] & 0xf, mcs_vals[i] >> 4); mcs_vals = (u8 *)(&nss->bw._320); for (i = 0; i < 3; i++) PRINT("EHT bw <= 320 MHz, max NSS for MCS %s: Rx=%u, Tx=%u", mcs_desc[i + 1], mcs_vals[i] & 0xf, mcs_vals[i] >> 4); } if (cap[5] & IEEE80211_EHT_PHY_CAP5_PPE_THRESHOLD_PRESENT) { u8 ppe_size = ieee80211_eht_ppe_size(bec->eht_ppe_thres[0], cap); p += scnprintf(p, buf_sz + buf - p, "EHT PPE Thresholds: "); for (i = 0; i < ppe_size; i++) p += scnprintf(p, buf_sz + buf - p, "0x%02x ", bec->eht_ppe_thres[i]); PRINT(""); /* newline */ } out: ret = simple_read_from_buffer(userbuf, count, ppos, buf, p - buf); kfree(buf); return ret; } LINK_STA_OPS(eht_capa); #define DEBUGFS_ADD(name) \ debugfs_create_file(#name, 0400, \ sta->debugfs_dir, sta, &sta_ ##name## _ops) #define DEBUGFS_ADD_COUNTER(name, field) \ debugfs_create_ulong(#name, 0400, sta->debugfs_dir, &sta->field); void ieee80211_sta_debugfs_add(struct sta_info *sta) { struct ieee80211_local *local = sta->local; struct ieee80211_sub_if_data *sdata = sta->sdata; struct dentry *stations_dir = sta->sdata->debugfs.subdir_stations; u8 mac[3*ETH_ALEN]; if (!stations_dir) return; snprintf(mac, sizeof(mac), "%pM", sta->sta.addr); /* * This might fail due to a race condition: * When mac80211 unlinks a station, the debugfs entries * remain, but it is already possible to link a new * station with the same address which triggers adding * it to debugfs; therefore, if the old station isn't * destroyed quickly enough the old station's debugfs * dir might still be around. */ sta->debugfs_dir = debugfs_create_dir(mac, stations_dir); DEBUGFS_ADD(flags); DEBUGFS_ADD(aid); DEBUGFS_ADD(num_ps_buf_frames); DEBUGFS_ADD(last_seq_ctrl); DEBUGFS_ADD(agg_status); /* FIXME: Kept here as the statistics are only done on the deflink */ DEBUGFS_ADD_COUNTER(tx_filtered, deflink.status_stats.filtered); DEBUGFS_ADD(aqm); DEBUGFS_ADD(airtime); if (wiphy_ext_feature_isset(local->hw.wiphy, NL80211_EXT_FEATURE_AQL)) DEBUGFS_ADD(aql); debugfs_create_xul("driver_buffered_tids", 0400, sta->debugfs_dir, &sta->driver_buffered_tids); drv_sta_add_debugfs(local, sdata, &sta->sta, sta->debugfs_dir); } void ieee80211_sta_debugfs_remove(struct sta_info *sta) { debugfs_remove_recursive(sta->debugfs_dir); sta->debugfs_dir = NULL; } #undef DEBUGFS_ADD #undef DEBUGFS_ADD_COUNTER #define DEBUGFS_ADD(name) \ debugfs_create_file(#name, 0400, \ link_sta->debugfs_dir, link_sta, &link_sta_ ##name## _ops) #define DEBUGFS_ADD_COUNTER(name, field) \ debugfs_create_ulong(#name, 0400, link_sta->debugfs_dir, &link_sta->field) void ieee80211_link_sta_debugfs_add(struct link_sta_info *link_sta) { if (WARN_ON(!link_sta->sta->debugfs_dir)) return; /* For non-MLO, leave the files in the main directory. */ if (link_sta->sta->sta.valid_links) { char link_dir_name[10]; snprintf(link_dir_name, sizeof(link_dir_name), "link-%d", link_sta->link_id); link_sta->debugfs_dir = debugfs_create_dir(link_dir_name, link_sta->sta->debugfs_dir); DEBUGFS_ADD(addr); } else { if (WARN_ON(link_sta != &link_sta->sta->deflink)) return; link_sta->debugfs_dir = link_sta->sta->debugfs_dir; } DEBUGFS_ADD(ht_capa); DEBUGFS_ADD(vht_capa); DEBUGFS_ADD(he_capa); DEBUGFS_ADD(eht_capa); DEBUGFS_ADD_COUNTER(rx_duplicates, rx_stats.num_duplicates); DEBUGFS_ADD_COUNTER(rx_fragments, rx_stats.fragments); } void ieee80211_link_sta_debugfs_remove(struct link_sta_info *link_sta) { if (!link_sta->debugfs_dir || !link_sta->sta->debugfs_dir) { link_sta->debugfs_dir = NULL; return; } if (link_sta->debugfs_dir == link_sta->sta->debugfs_dir) { WARN_ON(link_sta != &link_sta->sta->deflink); link_sta->sta->debugfs_dir = NULL; return; } debugfs_remove_recursive(link_sta->debugfs_dir); link_sta->debugfs_dir = NULL; } void ieee80211_link_sta_debugfs_drv_add(struct link_sta_info *link_sta) { if (WARN_ON(!link_sta->debugfs_dir)) return; drv_link_sta_add_debugfs(link_sta->sta->local, link_sta->sta->sdata, link_sta->pub, link_sta->debugfs_dir); } void ieee80211_link_sta_debugfs_drv_remove(struct link_sta_info *link_sta) { if (!link_sta->debugfs_dir) return; if (WARN_ON(link_sta->debugfs_dir == link_sta->sta->debugfs_dir)) return; /* Recreate the directory excluding the driver data */ debugfs_remove_recursive(link_sta->debugfs_dir); link_sta->debugfs_dir = NULL; ieee80211_link_sta_debugfs_add(link_sta); } |
| 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 | #ifndef LLC_CONN_H #define LLC_CONN_H /* * Copyright (c) 1997 by Procom Technology, Inc. * 2001, 2002 by Arnaldo Carvalho de Melo <acme@conectiva.com.br> * * This program can be redistributed or modified under the terms of the * GNU General Public License as published by the Free Software Foundation. * This program is distributed without any warranty or implied warranty * of merchantability or fitness for a particular purpose. * * See the GNU General Public License for more details. */ #include <linux/timer.h> #include <net/llc_if.h> #include <net/sock.h> #include <linux/llc.h> #define LLC_EVENT 1 #define LLC_PACKET 2 #define LLC2_P_TIME 2 #define LLC2_ACK_TIME 1 #define LLC2_REJ_TIME 3 #define LLC2_BUSY_TIME 3 struct llc_timer { struct timer_list timer; unsigned long expire; /* timer expire time */ }; struct llc_sock { /* struct sock must be the first member of llc_sock */ struct sock sk; struct sockaddr_llc addr; /* address sock is bound to */ u8 state; /* state of connection */ struct llc_sap *sap; /* pointer to parent SAP */ struct llc_addr laddr; /* lsap/mac pair */ struct llc_addr daddr; /* dsap/mac pair */ struct net_device *dev; /* device to send to remote */ netdevice_tracker dev_tracker; u32 copied_seq; /* head of yet unread data */ u8 retry_count; /* number of retries */ u8 ack_must_be_send; u8 first_pdu_Ns; u8 npta; struct llc_timer ack_timer; struct llc_timer pf_cycle_timer; struct llc_timer rej_sent_timer; struct llc_timer busy_state_timer; /* ind busy clr at remote LLC */ u8 vS; /* seq# next in-seq I-PDU tx'd*/ u8 vR; /* seq# next in-seq I-PDU rx'd*/ u32 n2; /* max nbr re-tx's for timeout*/ u32 n1; /* max nbr octets in I PDU */ u8 k; /* tx window size; max = 127 */ u8 rw; /* rx window size; max = 127 */ u8 p_flag; /* state flags */ u8 f_flag; u8 s_flag; u8 data_flag; u8 remote_busy_flag; u8 cause_flag; struct sk_buff_head pdu_unack_q; /* PUDs sent/waiting ack */ u16 link; /* network layer link number */ u8 X; /* a temporary variable */ u8 ack_pf; /* this flag indicates what is the P-bit of acknowledge */ u8 failed_data_req; /* recognize that already exist a failed llc_data_req_handler (tx_buffer_full or unacceptable state */ u8 dec_step; u8 inc_cntr; u8 dec_cntr; u8 connect_step; u8 last_nr; /* NR of last pdu received */ u32 rx_pdu_hdr; /* used for saving header of last pdu received and caused sending FRMR. Used for resending FRMR */ u32 cmsg_flags; struct hlist_node dev_hash_node; }; static inline struct llc_sock *llc_sk(const struct sock *sk) { return (struct llc_sock *)sk; } static __inline__ void llc_set_backlog_type(struct sk_buff *skb, char type) { skb->cb[sizeof(skb->cb) - 1] = type; } static __inline__ char llc_backlog_type(struct sk_buff *skb) { return skb->cb[sizeof(skb->cb) - 1]; } struct sock *llc_sk_alloc(struct net *net, int family, gfp_t priority, struct proto *prot, int kern); void llc_sk_stop_all_timers(struct sock *sk, bool sync); void llc_sk_free(struct sock *sk); void llc_sk_reset(struct sock *sk); /* Access to a connection */ int llc_conn_state_process(struct sock *sk, struct sk_buff *skb); void llc_conn_send_pdu(struct sock *sk, struct sk_buff *skb); void llc_conn_rtn_pdu(struct sock *sk, struct sk_buff *skb); void llc_conn_resend_i_pdu_as_cmd(struct sock *sk, u8 nr, u8 first_p_bit); void llc_conn_resend_i_pdu_as_rsp(struct sock *sk, u8 nr, u8 first_f_bit); int llc_conn_remove_acked_pdus(struct sock *conn, u8 nr, u16 *how_many_unacked); struct sock *llc_lookup_established(struct llc_sap *sap, struct llc_addr *daddr, struct llc_addr *laddr, const struct net *net); void llc_sap_add_socket(struct llc_sap *sap, struct sock *sk); void llc_sap_remove_socket(struct llc_sap *sap, struct sock *sk); u8 llc_data_accept_state(u8 state); void llc_build_offset_table(void); #endif /* LLC_CONN_H */ |
| 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 | // SPDX-License-Identifier: GPL-2.0 /* Copyright(c) 2018 Oracle and/or its affiliates. All rights reserved. */ #include <crypto/aead.h> #include <linux/debugfs.h> #include <net/xfrm.h> #include "netdevsim.h" #define NSIM_IPSEC_AUTH_BITS 128 static ssize_t nsim_dbg_netdev_ops_read(struct file *filp, char __user *buffer, size_t count, loff_t *ppos) { struct netdevsim *ns = filp->private_data; struct nsim_ipsec *ipsec = &ns->ipsec; size_t bufsize; char *buf, *p; int len; int i; /* the buffer needed is * (num SAs * 3 lines each * ~60 bytes per line) + one more line */ bufsize = (ipsec->count * 4 * 60) + 60; buf = kzalloc(bufsize, GFP_KERNEL); if (!buf) return -ENOMEM; p = buf; p += scnprintf(p, bufsize - (p - buf), "SA count=%u tx=%u\n", ipsec->count, ipsec->tx); for (i = 0; i < NSIM_IPSEC_MAX_SA_COUNT; i++) { struct nsim_sa *sap = &ipsec->sa[i]; if (!sap->used) continue; p += scnprintf(p, bufsize - (p - buf), "sa[%i] %cx ipaddr=0x%08x %08x %08x %08x\n", i, (sap->rx ? 'r' : 't'), sap->ipaddr[0], sap->ipaddr[1], sap->ipaddr[2], sap->ipaddr[3]); p += scnprintf(p, bufsize - (p - buf), "sa[%i] spi=0x%08x proto=0x%x salt=0x%08x crypt=%d\n", i, be32_to_cpu(sap->xs->id.spi), sap->xs->id.proto, sap->salt, sap->crypt); p += scnprintf(p, bufsize - (p - buf), "sa[%i] key=0x%08x %08x %08x %08x\n", i, sap->key[0], sap->key[1], sap->key[2], sap->key[3]); } len = simple_read_from_buffer(buffer, count, ppos, buf, p - buf); kfree(buf); return len; } static const struct file_operations ipsec_dbg_fops = { .owner = THIS_MODULE, .open = simple_open, .read = nsim_dbg_netdev_ops_read, }; static int nsim_ipsec_find_empty_idx(struct nsim_ipsec *ipsec) { u32 i; if (ipsec->count == NSIM_IPSEC_MAX_SA_COUNT) return -ENOSPC; /* search sa table */ for (i = 0; i < NSIM_IPSEC_MAX_SA_COUNT; i++) { if (!ipsec->sa[i].used) return i; } return -ENOSPC; } static int nsim_ipsec_parse_proto_keys(struct xfrm_state *xs, u32 *mykey, u32 *mysalt) { const char aes_gcm_name[] = "rfc4106(gcm(aes))"; struct net_device *dev = xs->xso.real_dev; unsigned char *key_data; char *alg_name = NULL; int key_len; if (!xs->aead) { netdev_err(dev, "Unsupported IPsec algorithm\n"); return -EINVAL; } if (xs->aead->alg_icv_len != NSIM_IPSEC_AUTH_BITS) { netdev_err(dev, "IPsec offload requires %d bit authentication\n", NSIM_IPSEC_AUTH_BITS); return -EINVAL; } key_data = &xs->aead->alg_key[0]; key_len = xs->aead->alg_key_len; alg_name = xs->aead->alg_name; if (strcmp(alg_name, aes_gcm_name)) { netdev_err(dev, "Unsupported IPsec algorithm - please use %s\n", aes_gcm_name); return -EINVAL; } /* 160 accounts for 16 byte key and 4 byte salt */ if (key_len > NSIM_IPSEC_AUTH_BITS) { *mysalt = ((u32 *)key_data)[4]; } else if (key_len == NSIM_IPSEC_AUTH_BITS) { *mysalt = 0; } else { netdev_err(dev, "IPsec hw offload only supports 128 bit keys with optional 32 bit salt\n"); return -EINVAL; } memcpy(mykey, key_data, 16); return 0; } static int nsim_ipsec_add_sa(struct xfrm_state *xs, struct netlink_ext_ack *extack) { struct nsim_ipsec *ipsec; struct net_device *dev; struct netdevsim *ns; struct nsim_sa sa; u16 sa_idx; int ret; dev = xs->xso.real_dev; ns = netdev_priv(dev); ipsec = &ns->ipsec; if (xs->id.proto != IPPROTO_ESP && xs->id.proto != IPPROTO_AH) { NL_SET_ERR_MSG_MOD(extack, "Unsupported protocol for ipsec offload"); return -EINVAL; } if (xs->calg) { NL_SET_ERR_MSG_MOD(extack, "Compression offload not supported"); return -EINVAL; } if (xs->xso.type != XFRM_DEV_OFFLOAD_CRYPTO) { NL_SET_ERR_MSG_MOD(extack, "Unsupported ipsec offload type"); return -EINVAL; } /* find the first unused index */ ret = nsim_ipsec_find_empty_idx(ipsec); if (ret < 0) { NL_SET_ERR_MSG_MOD(extack, "No space for SA in Rx table!"); return ret; } sa_idx = (u16)ret; memset(&sa, 0, sizeof(sa)); sa.used = true; sa.xs = xs; if (sa.xs->id.proto & IPPROTO_ESP) sa.crypt = xs->ealg || xs->aead; /* get the key and salt */ ret = nsim_ipsec_parse_proto_keys(xs, sa.key, &sa.salt); if (ret) { NL_SET_ERR_MSG_MOD(extack, "Failed to get key data for SA table"); return ret; } if (xs->xso.dir == XFRM_DEV_OFFLOAD_IN) { sa.rx = true; if (xs->props.family == AF_INET6) memcpy(sa.ipaddr, &xs->id.daddr.a6, 16); else memcpy(&sa.ipaddr[3], &xs->id.daddr.a4, 4); } /* the preparations worked, so save the info */ memcpy(&ipsec->sa[sa_idx], &sa, sizeof(sa)); /* the XFRM stack doesn't like offload_handle == 0, * so add a bitflag in case our array index is 0 */ xs->xso.offload_handle = sa_idx | NSIM_IPSEC_VALID; ipsec->count++; return 0; } static void nsim_ipsec_del_sa(struct xfrm_state *xs) { struct netdevsim *ns = netdev_priv(xs->xso.real_dev); struct nsim_ipsec *ipsec = &ns->ipsec; u16 sa_idx; sa_idx = xs->xso.offload_handle & ~NSIM_IPSEC_VALID; if (!ipsec->sa[sa_idx].used) { netdev_err(ns->netdev, "Invalid SA for delete sa_idx=%d\n", sa_idx); return; } memset(&ipsec->sa[sa_idx], 0, sizeof(struct nsim_sa)); ipsec->count--; } static bool nsim_ipsec_offload_ok(struct sk_buff *skb, struct xfrm_state *xs) { struct netdevsim *ns = netdev_priv(xs->xso.real_dev); struct nsim_ipsec *ipsec = &ns->ipsec; ipsec->ok++; return true; } static const struct xfrmdev_ops nsim_xfrmdev_ops = { .xdo_dev_state_add = nsim_ipsec_add_sa, .xdo_dev_state_delete = nsim_ipsec_del_sa, .xdo_dev_offload_ok = nsim_ipsec_offload_ok, }; bool nsim_ipsec_tx(struct netdevsim *ns, struct sk_buff *skb) { struct sec_path *sp = skb_sec_path(skb); struct nsim_ipsec *ipsec = &ns->ipsec; struct xfrm_state *xs; struct nsim_sa *tsa; u32 sa_idx; /* do we even need to check this packet? */ if (!sp) return true; if (unlikely(!sp->len)) { netdev_err(ns->netdev, "no xfrm state len = %d\n", sp->len); return false; } xs = xfrm_input_state(skb); if (unlikely(!xs)) { netdev_err(ns->netdev, "no xfrm_input_state() xs = %p\n", xs); return false; } sa_idx = xs->xso.offload_handle & ~NSIM_IPSEC_VALID; if (unlikely(sa_idx >= NSIM_IPSEC_MAX_SA_COUNT)) { netdev_err(ns->netdev, "bad sa_idx=%d max=%d\n", sa_idx, NSIM_IPSEC_MAX_SA_COUNT); return false; } tsa = &ipsec->sa[sa_idx]; if (unlikely(!tsa->used)) { netdev_err(ns->netdev, "unused sa_idx=%d\n", sa_idx); return false; } if (xs->id.proto != IPPROTO_ESP && xs->id.proto != IPPROTO_AH) { netdev_err(ns->netdev, "unexpected proto=%d\n", xs->id.proto); return false; } ipsec->tx++; return true; } void nsim_ipsec_init(struct netdevsim *ns) { ns->netdev->xfrmdev_ops = &nsim_xfrmdev_ops; #define NSIM_ESP_FEATURES (NETIF_F_HW_ESP | \ NETIF_F_HW_ESP_TX_CSUM | \ NETIF_F_GSO_ESP) ns->netdev->features |= NSIM_ESP_FEATURES; ns->netdev->hw_enc_features |= NSIM_ESP_FEATURES; ns->ipsec.pfile = debugfs_create_file("ipsec", 0400, ns->nsim_dev_port->ddir, ns, &ipsec_dbg_fops); } void nsim_ipsec_teardown(struct netdevsim *ns) { struct nsim_ipsec *ipsec = &ns->ipsec; if (ipsec->count) netdev_err(ns->netdev, "tearing down IPsec offload with %d SAs left\n", ipsec->count); debugfs_remove_recursive(ipsec->pfile); } |
| 4 33 52 52 52 52 26 98 2 96 4 4 98 4 1 58 10 10 10 10 10 10 10 507 507 2 2 2 2 349 20 1 20 20 20 20 20 20 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 | // SPDX-License-Identifier: GPL-2.0-or-later /* * lwtunnel Infrastructure for light weight tunnels like mpls * * Authors: Roopa Prabhu, <roopa@cumulusnetworks.com> */ #include <linux/capability.h> #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/slab.h> #include <linux/uaccess.h> #include <linux/skbuff.h> #include <linux/netdevice.h> #include <linux/lwtunnel.h> #include <linux/in.h> #include <linux/init.h> #include <linux/err.h> #include <net/lwtunnel.h> #include <net/rtnetlink.h> #include <net/ip6_fib.h> #include <net/rtnh.h> DEFINE_STATIC_KEY_FALSE(nf_hooks_lwtunnel_enabled); EXPORT_SYMBOL_GPL(nf_hooks_lwtunnel_enabled); #ifdef CONFIG_MODULES static const char *lwtunnel_encap_str(enum lwtunnel_encap_types encap_type) { /* Only lwt encaps implemented without using an interface for * the encap need to return a string here. */ switch (encap_type) { case LWTUNNEL_ENCAP_MPLS: return "MPLS"; case LWTUNNEL_ENCAP_ILA: return "ILA"; case LWTUNNEL_ENCAP_SEG6: return "SEG6"; case LWTUNNEL_ENCAP_BPF: return "BPF"; case LWTUNNEL_ENCAP_SEG6_LOCAL: return "SEG6LOCAL"; case LWTUNNEL_ENCAP_RPL: return "RPL"; case LWTUNNEL_ENCAP_IOAM6: return "IOAM6"; case LWTUNNEL_ENCAP_XFRM: /* module autoload not supported for encap type */ return NULL; case LWTUNNEL_ENCAP_IP6: case LWTUNNEL_ENCAP_IP: case LWTUNNEL_ENCAP_NONE: case __LWTUNNEL_ENCAP_MAX: /* should not have got here */ WARN_ON(1); break; } return NULL; } #endif /* CONFIG_MODULES */ struct lwtunnel_state *lwtunnel_state_alloc(int encap_len) { struct lwtunnel_state *lws; lws = kzalloc(sizeof(*lws) + encap_len, GFP_ATOMIC); return lws; } EXPORT_SYMBOL_GPL(lwtunnel_state_alloc); static const struct lwtunnel_encap_ops __rcu * lwtun_encaps[LWTUNNEL_ENCAP_MAX + 1] __read_mostly; int lwtunnel_encap_add_ops(const struct lwtunnel_encap_ops *ops, unsigned int num) { if (num > LWTUNNEL_ENCAP_MAX) return -ERANGE; return !cmpxchg((const struct lwtunnel_encap_ops **) &lwtun_encaps[num], NULL, ops) ? 0 : -1; } EXPORT_SYMBOL_GPL(lwtunnel_encap_add_ops); int lwtunnel_encap_del_ops(const struct lwtunnel_encap_ops *ops, unsigned int encap_type) { int ret; if (encap_type == LWTUNNEL_ENCAP_NONE || encap_type > LWTUNNEL_ENCAP_MAX) return -ERANGE; ret = (cmpxchg((const struct lwtunnel_encap_ops **) &lwtun_encaps[encap_type], ops, NULL) == ops) ? 0 : -1; synchronize_net(); return ret; } EXPORT_SYMBOL_GPL(lwtunnel_encap_del_ops); int lwtunnel_build_state(struct net *net, u16 encap_type, struct nlattr *encap, unsigned int family, const void *cfg, struct lwtunnel_state **lws, struct netlink_ext_ack *extack) { const struct lwtunnel_encap_ops *ops; bool found = false; int ret = -EINVAL; if (encap_type == LWTUNNEL_ENCAP_NONE || encap_type > LWTUNNEL_ENCAP_MAX) { NL_SET_ERR_MSG_ATTR(extack, encap, "Unknown LWT encapsulation type"); return ret; } ret = -EOPNOTSUPP; rcu_read_lock(); ops = rcu_dereference(lwtun_encaps[encap_type]); if (likely(ops && ops->build_state && try_module_get(ops->owner))) found = true; rcu_read_unlock(); if (found) { ret = ops->build_state(net, encap, family, cfg, lws, extack); if (ret) module_put(ops->owner); } else { /* don't rely on -EOPNOTSUPP to detect match as build_state * handlers could return it */ NL_SET_ERR_MSG_ATTR(extack, encap, "LWT encapsulation type not supported"); } return ret; } EXPORT_SYMBOL_GPL(lwtunnel_build_state); int lwtunnel_valid_encap_type(u16 encap_type, struct netlink_ext_ack *extack) { const struct lwtunnel_encap_ops *ops; int ret = -EINVAL; if (encap_type == LWTUNNEL_ENCAP_NONE || encap_type > LWTUNNEL_ENCAP_MAX) { NL_SET_ERR_MSG(extack, "Unknown lwt encapsulation type"); return ret; } rcu_read_lock(); ops = rcu_dereference(lwtun_encaps[encap_type]); rcu_read_unlock(); #ifdef CONFIG_MODULES if (!ops) { const char *encap_type_str = lwtunnel_encap_str(encap_type); if (encap_type_str) { __rtnl_unlock(); request_module("rtnl-lwt-%s", encap_type_str); rtnl_lock(); rcu_read_lock(); ops = rcu_dereference(lwtun_encaps[encap_type]); rcu_read_unlock(); } } #endif ret = ops ? 0 : -EOPNOTSUPP; if (ret < 0) NL_SET_ERR_MSG(extack, "lwt encapsulation type not supported"); return ret; } EXPORT_SYMBOL_GPL(lwtunnel_valid_encap_type); int lwtunnel_valid_encap_type_attr(struct nlattr *attr, int remaining, struct netlink_ext_ack *extack) { struct rtnexthop *rtnh = (struct rtnexthop *)attr; struct nlattr *nla_entype; struct nlattr *attrs; u16 encap_type; int attrlen; while (rtnh_ok(rtnh, remaining)) { attrlen = rtnh_attrlen(rtnh); if (attrlen > 0) { attrs = rtnh_attrs(rtnh); nla_entype = nla_find(attrs, attrlen, RTA_ENCAP_TYPE); if (nla_entype) { if (nla_len(nla_entype) < sizeof(u16)) { NL_SET_ERR_MSG(extack, "Invalid RTA_ENCAP_TYPE"); return -EINVAL; } encap_type = nla_get_u16(nla_entype); if (lwtunnel_valid_encap_type(encap_type, extack) != 0) return -EOPNOTSUPP; } } rtnh = rtnh_next(rtnh, &remaining); } return 0; } EXPORT_SYMBOL_GPL(lwtunnel_valid_encap_type_attr); void lwtstate_free(struct lwtunnel_state *lws) { const struct lwtunnel_encap_ops *ops = lwtun_encaps[lws->type]; if (ops->destroy_state) { ops->destroy_state(lws); kfree_rcu(lws, rcu); } else { kfree(lws); } module_put(ops->owner); } EXPORT_SYMBOL_GPL(lwtstate_free); int lwtunnel_fill_encap(struct sk_buff *skb, struct lwtunnel_state *lwtstate, int encap_attr, int encap_type_attr) { const struct lwtunnel_encap_ops *ops; struct nlattr *nest; int ret; if (!lwtstate) return 0; if (lwtstate->type == LWTUNNEL_ENCAP_NONE || lwtstate->type > LWTUNNEL_ENCAP_MAX) return 0; nest = nla_nest_start_noflag(skb, encap_attr); if (!nest) return -EMSGSIZE; ret = -EOPNOTSUPP; rcu_read_lock(); ops = rcu_dereference(lwtun_encaps[lwtstate->type]); if (likely(ops && ops->fill_encap)) ret = ops->fill_encap(skb, lwtstate); rcu_read_unlock(); if (ret) goto nla_put_failure; nla_nest_end(skb, nest); ret = nla_put_u16(skb, encap_type_attr, lwtstate->type); if (ret) goto nla_put_failure; return 0; nla_put_failure: nla_nest_cancel(skb, nest); return (ret == -EOPNOTSUPP ? 0 : ret); } EXPORT_SYMBOL_GPL(lwtunnel_fill_encap); int lwtunnel_get_encap_size(struct lwtunnel_state *lwtstate) { const struct lwtunnel_encap_ops *ops; int ret = 0; if (!lwtstate) return 0; if (lwtstate->type == LWTUNNEL_ENCAP_NONE || lwtstate->type > LWTUNNEL_ENCAP_MAX) return 0; rcu_read_lock(); ops = rcu_dereference(lwtun_encaps[lwtstate->type]); if (likely(ops && ops->get_encap_size)) ret = nla_total_size(ops->get_encap_size(lwtstate)); rcu_read_unlock(); return ret; } EXPORT_SYMBOL_GPL(lwtunnel_get_encap_size); int lwtunnel_cmp_encap(struct lwtunnel_state *a, struct lwtunnel_state *b) { const struct lwtunnel_encap_ops *ops; int ret = 0; if (!a && !b) return 0; if (!a || !b) return 1; if (a->type != b->type) return 1; if (a->type == LWTUNNEL_ENCAP_NONE || a->type > LWTUNNEL_ENCAP_MAX) return 0; rcu_read_lock(); ops = rcu_dereference(lwtun_encaps[a->type]); if (likely(ops && ops->cmp_encap)) ret = ops->cmp_encap(a, b); rcu_read_unlock(); return ret; } EXPORT_SYMBOL_GPL(lwtunnel_cmp_encap); int lwtunnel_output(struct net *net, struct sock *sk, struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); const struct lwtunnel_encap_ops *ops; struct lwtunnel_state *lwtstate; int ret = -EINVAL; if (!dst) goto drop; lwtstate = dst->lwtstate; if (lwtstate->type == LWTUNNEL_ENCAP_NONE || lwtstate->type > LWTUNNEL_ENCAP_MAX) return 0; ret = -EOPNOTSUPP; rcu_read_lock(); ops = rcu_dereference(lwtun_encaps[lwtstate->type]); if (likely(ops && ops->output)) ret = ops->output(net, sk, skb); rcu_read_unlock(); if (ret == -EOPNOTSUPP) goto drop; return ret; drop: kfree_skb(skb); return ret; } EXPORT_SYMBOL_GPL(lwtunnel_output); int lwtunnel_xmit(struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); const struct lwtunnel_encap_ops *ops; struct lwtunnel_state *lwtstate; int ret = -EINVAL; if (!dst) goto drop; lwtstate = dst->lwtstate; if (lwtstate->type == LWTUNNEL_ENCAP_NONE || lwtstate->type > LWTUNNEL_ENCAP_MAX) return 0; ret = -EOPNOTSUPP; rcu_read_lock(); ops = rcu_dereference(lwtun_encaps[lwtstate->type]); if (likely(ops && ops->xmit)) ret = ops->xmit(skb); rcu_read_unlock(); if (ret == -EOPNOTSUPP) goto drop; return ret; drop: kfree_skb(skb); return ret; } EXPORT_SYMBOL_GPL(lwtunnel_xmit); int lwtunnel_input(struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); const struct lwtunnel_encap_ops *ops; struct lwtunnel_state *lwtstate; int ret = -EINVAL; if (!dst) goto drop; lwtstate = dst->lwtstate; if (lwtstate->type == LWTUNNEL_ENCAP_NONE || lwtstate->type > LWTUNNEL_ENCAP_MAX) return 0; ret = -EOPNOTSUPP; rcu_read_lock(); ops = rcu_dereference(lwtun_encaps[lwtstate->type]); if (likely(ops && ops->input)) ret = ops->input(skb); rcu_read_unlock(); if (ret == -EOPNOTSUPP) goto drop; return ret; drop: kfree_skb(skb); return ret; } EXPORT_SYMBOL_GPL(lwtunnel_input); |
| 8 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 | // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) ST-Ericsson AB 2010 * Author: Sjur Brendeland */ #define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__ #include <linux/kernel.h> #include <linux/types.h> #include <linux/errno.h> #include <linux/slab.h> #include <linux/module.h> #include <linux/pkt_sched.h> #include <net/caif/caif_layer.h> #include <net/caif/cfsrvl.h> #include <net/caif/cfpkt.h> #include <net/caif/caif_dev.h> #define SRVL_CTRL_PKT_SIZE 1 #define SRVL_FLOW_OFF 0x81 #define SRVL_FLOW_ON 0x80 #define SRVL_SET_PIN 0x82 #define container_obj(layr) container_of(layr, struct cfsrvl, layer) static void cfservl_ctrlcmd(struct cflayer *layr, enum caif_ctrlcmd ctrl, int phyid) { struct cfsrvl *service = container_obj(layr); if (layr->up == NULL || layr->up->ctrlcmd == NULL) return; switch (ctrl) { case CAIF_CTRLCMD_INIT_RSP: service->open = true; layr->up->ctrlcmd(layr->up, ctrl, phyid); break; case CAIF_CTRLCMD_DEINIT_RSP: case CAIF_CTRLCMD_INIT_FAIL_RSP: service->open = false; layr->up->ctrlcmd(layr->up, ctrl, phyid); break; case _CAIF_CTRLCMD_PHYIF_FLOW_OFF_IND: if (phyid != service->dev_info.id) break; if (service->modem_flow_on) layr->up->ctrlcmd(layr->up, CAIF_CTRLCMD_FLOW_OFF_IND, phyid); service->phy_flow_on = false; break; case _CAIF_CTRLCMD_PHYIF_FLOW_ON_IND: if (phyid != service->dev_info.id) return; if (service->modem_flow_on) { layr->up->ctrlcmd(layr->up, CAIF_CTRLCMD_FLOW_ON_IND, phyid); } service->phy_flow_on = true; break; case CAIF_CTRLCMD_FLOW_OFF_IND: if (service->phy_flow_on) { layr->up->ctrlcmd(layr->up, CAIF_CTRLCMD_FLOW_OFF_IND, phyid); } service->modem_flow_on = false; break; case CAIF_CTRLCMD_FLOW_ON_IND: if (service->phy_flow_on) { layr->up->ctrlcmd(layr->up, CAIF_CTRLCMD_FLOW_ON_IND, phyid); } service->modem_flow_on = true; break; case _CAIF_CTRLCMD_PHYIF_DOWN_IND: /* In case interface is down, let's fake a remove shutdown */ layr->up->ctrlcmd(layr->up, CAIF_CTRLCMD_REMOTE_SHUTDOWN_IND, phyid); break; case CAIF_CTRLCMD_REMOTE_SHUTDOWN_IND: layr->up->ctrlcmd(layr->up, ctrl, phyid); break; default: pr_warn("Unexpected ctrl in cfsrvl (%d)\n", ctrl); /* We have both modem and phy flow on, send flow on */ layr->up->ctrlcmd(layr->up, ctrl, phyid); service->phy_flow_on = true; break; } } static int cfservl_modemcmd(struct cflayer *layr, enum caif_modemcmd ctrl) { struct cfsrvl *service = container_obj(layr); caif_assert(layr != NULL); caif_assert(layr->dn != NULL); caif_assert(layr->dn->transmit != NULL); if (!service->supports_flowctrl) return 0; switch (ctrl) { case CAIF_MODEMCMD_FLOW_ON_REQ: { struct cfpkt *pkt; struct caif_payload_info *info; u8 flow_on = SRVL_FLOW_ON; pkt = cfpkt_create(SRVL_CTRL_PKT_SIZE); if (!pkt) return -ENOMEM; if (cfpkt_add_head(pkt, &flow_on, 1) < 0) { pr_err("Packet is erroneous!\n"); cfpkt_destroy(pkt); return -EPROTO; } info = cfpkt_info(pkt); info->channel_id = service->layer.id; info->hdr_len = 1; info->dev_info = &service->dev_info; cfpkt_set_prio(pkt, TC_PRIO_CONTROL); return layr->dn->transmit(layr->dn, pkt); } case CAIF_MODEMCMD_FLOW_OFF_REQ: { struct cfpkt *pkt; struct caif_payload_info *info; u8 flow_off = SRVL_FLOW_OFF; pkt = cfpkt_create(SRVL_CTRL_PKT_SIZE); if (!pkt) return -ENOMEM; if (cfpkt_add_head(pkt, &flow_off, 1) < 0) { pr_err("Packet is erroneous!\n"); cfpkt_destroy(pkt); return -EPROTO; } info = cfpkt_info(pkt); info->channel_id = service->layer.id; info->hdr_len = 1; info->dev_info = &service->dev_info; cfpkt_set_prio(pkt, TC_PRIO_CONTROL); return layr->dn->transmit(layr->dn, pkt); } default: break; } return -EINVAL; } static void cfsrvl_release(struct cflayer *layer) { struct cfsrvl *service = container_of(layer, struct cfsrvl, layer); kfree(service); } void cfsrvl_init(struct cfsrvl *service, u8 channel_id, struct dev_info *dev_info, bool supports_flowctrl) { caif_assert(offsetof(struct cfsrvl, layer) == 0); service->open = false; service->modem_flow_on = true; service->phy_flow_on = true; service->layer.id = channel_id; service->layer.ctrlcmd = cfservl_ctrlcmd; service->layer.modemcmd = cfservl_modemcmd; service->dev_info = *dev_info; service->supports_flowctrl = supports_flowctrl; service->release = cfsrvl_release; } bool cfsrvl_ready(struct cfsrvl *service, int *err) { if (!service->open) { *err = -ENOTCONN; return false; } return true; } u8 cfsrvl_getphyid(struct cflayer *layer) { struct cfsrvl *servl = container_obj(layer); return servl->dev_info.id; } bool cfsrvl_phyid_match(struct cflayer *layer, int phyid) { struct cfsrvl *servl = container_obj(layer); return servl->dev_info.id == phyid; } void caif_free_client(struct cflayer *adap_layer) { struct cfsrvl *servl; if (adap_layer == NULL || adap_layer->dn == NULL) return; servl = container_obj(adap_layer->dn); servl->release(&servl->layer); } EXPORT_SYMBOL(caif_free_client); void caif_client_register_refcnt(struct cflayer *adapt_layer, void (*hold)(struct cflayer *lyr), void (*put)(struct cflayer *lyr)) { struct cfsrvl *service; if (WARN_ON(adapt_layer == NULL || adapt_layer->dn == NULL)) return; service = container_of(adapt_layer->dn, struct cfsrvl, layer); service->hold = hold; service->put = put; } EXPORT_SYMBOL(caif_client_register_refcnt); |
| 1566 2433 963 246 172 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 | /* SPDX-License-Identifier: GPL-2.0-only */ /* * fs/kernfs/kernfs-internal.h - kernfs internal header file * * Copyright (c) 2001-3 Patrick Mochel * Copyright (c) 2007 SUSE Linux Products GmbH * Copyright (c) 2007, 2013 Tejun Heo <teheo@suse.de> */ #ifndef __KERNFS_INTERNAL_H #define __KERNFS_INTERNAL_H #include <linux/lockdep.h> #include <linux/fs.h> #include <linux/mutex.h> #include <linux/rwsem.h> #include <linux/xattr.h> #include <linux/kernfs.h> #include <linux/fs_context.h> struct kernfs_iattrs { kuid_t ia_uid; kgid_t ia_gid; struct timespec64 ia_atime; struct timespec64 ia_mtime; struct timespec64 ia_ctime; struct simple_xattrs xattrs; atomic_t nr_user_xattrs; atomic_t user_xattr_size; }; struct kernfs_root { /* published fields */ struct kernfs_node *kn; unsigned int flags; /* KERNFS_ROOT_* flags */ /* private fields, do not use outside kernfs proper */ struct idr ino_idr; u32 last_id_lowbits; u32 id_highbits; struct kernfs_syscall_ops *syscall_ops; /* list of kernfs_super_info of this root, protected by kernfs_rwsem */ struct list_head supers; wait_queue_head_t deactivate_waitq; struct rw_semaphore kernfs_rwsem; struct rw_semaphore kernfs_iattr_rwsem; struct rw_semaphore kernfs_supers_rwsem; }; /* +1 to avoid triggering overflow warning when negating it */ #define KN_DEACTIVATED_BIAS (INT_MIN + 1) /* KERNFS_TYPE_MASK and types are defined in include/linux/kernfs.h */ /** * kernfs_root - find out the kernfs_root a kernfs_node belongs to * @kn: kernfs_node of interest * * Return: the kernfs_root @kn belongs to. */ static inline struct kernfs_root *kernfs_root(struct kernfs_node *kn) { /* if parent exists, it's always a dir; otherwise, @sd is a dir */ if (kn->parent) kn = kn->parent; return kn->dir.root; } /* * mount.c */ struct kernfs_super_info { struct super_block *sb; /* * The root associated with this super_block. Each super_block is * identified by the root and ns it's associated with. */ struct kernfs_root *root; /* * Each sb is associated with one namespace tag, currently the * network namespace of the task which mounted this kernfs * instance. If multiple tags become necessary, make the following * an array and compare kernfs_node tag against every entry. */ const void *ns; /* anchored at kernfs_root->supers, protected by kernfs_rwsem */ struct list_head node; }; #define kernfs_info(SB) ((struct kernfs_super_info *)(SB->s_fs_info)) static inline struct kernfs_node *kernfs_dentry_node(struct dentry *dentry) { if (d_really_is_negative(dentry)) return NULL; return d_inode(dentry)->i_private; } static inline void kernfs_set_rev(struct kernfs_node *parent, struct dentry *dentry) { dentry->d_time = parent->dir.rev; } static inline void kernfs_inc_rev(struct kernfs_node *parent) { parent->dir.rev++; } static inline bool kernfs_dir_changed(struct kernfs_node *parent, struct dentry *dentry) { if (parent->dir.rev != dentry->d_time) return true; return false; } extern const struct super_operations kernfs_sops; extern struct kmem_cache *kernfs_node_cache, *kernfs_iattrs_cache; /* * inode.c */ extern const struct xattr_handler * const kernfs_xattr_handlers[]; void kernfs_evict_inode(struct inode *inode); int kernfs_iop_permission(struct mnt_idmap *idmap, struct inode *inode, int mask); int kernfs_iop_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *iattr); int kernfs_iop_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags); ssize_t kernfs_iop_listxattr(struct dentry *dentry, char *buf, size_t size); int __kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr); /* * dir.c */ extern const struct dentry_operations kernfs_dops; extern const struct file_operations kernfs_dir_fops; extern const struct inode_operations kernfs_dir_iops; struct kernfs_node *kernfs_get_active(struct kernfs_node *kn); void kernfs_put_active(struct kernfs_node *kn); int kernfs_add_one(struct kernfs_node *kn); struct kernfs_node *kernfs_new_node(struct kernfs_node *parent, const char *name, umode_t mode, kuid_t uid, kgid_t gid, unsigned flags); /* * file.c */ extern const struct file_operations kernfs_file_fops; bool kernfs_should_drain_open_files(struct kernfs_node *kn); void kernfs_drain_open_files(struct kernfs_node *kn); /* * symlink.c */ extern const struct inode_operations kernfs_symlink_iops; /* * kernfs locks */ extern struct kernfs_global_locks *kernfs_locks; #endif /* __KERNFS_INTERNAL_H */ |
| 18 4 18 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 | /* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright (c) 2007-2014 Nicira, Inc. */ #ifndef DATAPATH_H #define DATAPATH_H 1 #include <asm/page.h> #include <linux/kernel.h> #include <linux/mutex.h> #include <linux/netdevice.h> #include <linux/skbuff.h> #include <linux/u64_stats_sync.h> #include <net/ip_tunnels.h> #include "conntrack.h" #include "flow.h" #include "flow_table.h" #include "meter.h" #include "vport-internal_dev.h" #define DP_MAX_PORTS USHRT_MAX #define DP_VPORT_HASH_BUCKETS 1024 #define DP_MASKS_REBALANCE_INTERVAL 4000 /** * struct dp_stats_percpu - per-cpu packet processing statistics for a given * datapath. * @n_hit: Number of received packets for which a matching flow was found in * the flow table. * @n_miss: Number of received packets that had no matching flow in the flow * table. The sum of @n_hit and @n_miss is the number of packets that have * been received by the datapath. * @n_lost: Number of received packets that had no matching flow in the flow * table that could not be sent to userspace (normally due to an overflow in * one of the datapath's queues). * @n_mask_hit: Number of masks looked up for flow match. * @n_mask_hit / (@n_hit + @n_missed) will be the average masks looked * up per packet. * @n_cache_hit: The number of received packets that had their mask found using * the mask cache. */ struct dp_stats_percpu { u64 n_hit; u64 n_missed; u64 n_lost; u64 n_mask_hit; u64 n_cache_hit; struct u64_stats_sync syncp; }; /** * struct dp_nlsk_pids - array of netlink portids of for a datapath. * This is used when OVS_DP_F_DISPATCH_UPCALL_PER_CPU * is enabled and must be protected by rcu. * @rcu: RCU callback head for deferred destruction. * @n_pids: Size of @pids array. * @pids: Array storing the Netlink socket PIDs indexed by CPU ID for packets * that miss the flow table. */ struct dp_nlsk_pids { struct rcu_head rcu; u32 n_pids; u32 pids[]; }; /** * struct datapath - datapath for flow-based packet switching * @rcu: RCU callback head for deferred destruction. * @list_node: Element in global 'dps' list. * @table: flow table. * @ports: Hash table for ports. %OVSP_LOCAL port always exists. Protected by * ovs_mutex and RCU. * @stats_percpu: Per-CPU datapath statistics. * @net: Reference to net namespace. * @max_headroom: the maximum headroom of all vports in this datapath; it will * be used by all the internal vports in this dp. * @upcall_portids: RCU protected 'struct dp_nlsk_pids'. * * Context: See the comment on locking at the top of datapath.c for additional * locking information. */ struct datapath { struct rcu_head rcu; struct list_head list_node; /* Flow table. */ struct flow_table table; /* Switch ports. */ struct hlist_head *ports; /* Stats. */ struct dp_stats_percpu __percpu *stats_percpu; /* Network namespace ref. */ possible_net_t net; u32 user_features; u32 max_headroom; /* Switch meters. */ struct dp_meter_table meter_tbl; struct dp_nlsk_pids __rcu *upcall_portids; }; /** * struct ovs_skb_cb - OVS data in skb CB * @input_vport: The original vport packet came in on. This value is cached * when a packet is received by OVS. * @mru: The maximum received fragement size; 0 if the packet is not * fragmented. * @acts_origlen: The netlink size of the flow actions applied to this skb. * @cutlen: The number of bytes from the packet end to be removed. */ struct ovs_skb_cb { struct vport *input_vport; u16 mru; u16 acts_origlen; u32 cutlen; }; #define OVS_CB(skb) ((struct ovs_skb_cb *)(skb)->cb) /** * struct dp_upcall - metadata to include with a packet to send to userspace * @cmd: One of %OVS_PACKET_CMD_*. * @userdata: If nonnull, its variable-length value is passed to userspace as * %OVS_PACKET_ATTR_USERDATA. * @portid: Netlink portid to which packet should be sent. If @portid is 0 * then no packet is sent and the packet is accounted in the datapath's @n_lost * counter. * @egress_tun_info: If nonnull, becomes %OVS_PACKET_ATTR_EGRESS_TUN_KEY. * @mru: If not zero, Maximum received IP fragment size. */ struct dp_upcall_info { struct ip_tunnel_info *egress_tun_info; const struct nlattr *userdata; const struct nlattr *actions; int actions_len; u32 portid; u8 cmd; u16 mru; }; /** * struct ovs_net - Per net-namespace data for ovs. * @dps: List of datapaths to enable dumping them all out. * Protected by genl_mutex. */ struct ovs_net { struct list_head dps; struct work_struct dp_notify_work; struct delayed_work masks_rebalance; #if IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT) struct ovs_ct_limit_info *ct_limit_info; #endif /* Module reference for configuring conntrack. */ bool xt_label; }; /** * enum ovs_pkt_hash_types - hash info to include with a packet * to send to userspace. * @OVS_PACKET_HASH_SW_BIT: indicates hash was computed in software stack. * @OVS_PACKET_HASH_L4_BIT: indicates hash is a canonical 4-tuple hash * over transport ports. */ enum ovs_pkt_hash_types { OVS_PACKET_HASH_SW_BIT = (1ULL << 32), OVS_PACKET_HASH_L4_BIT = (1ULL << 33), }; extern unsigned int ovs_net_id; void ovs_lock(void); void ovs_unlock(void); #ifdef CONFIG_LOCKDEP int lockdep_ovsl_is_held(void); #else #define lockdep_ovsl_is_held() 1 #endif #define ASSERT_OVSL() WARN_ON(!lockdep_ovsl_is_held()) #define ovsl_dereference(p) \ rcu_dereference_protected(p, lockdep_ovsl_is_held()) #define rcu_dereference_ovsl(p) \ rcu_dereference_check(p, lockdep_ovsl_is_held()) static inline struct net *ovs_dp_get_net(const struct datapath *dp) { return read_pnet(&dp->net); } static inline void ovs_dp_set_net(struct datapath *dp, struct net *net) { write_pnet(&dp->net, net); } struct vport *ovs_lookup_vport(const struct datapath *dp, u16 port_no); static inline struct vport *ovs_vport_rcu(const struct datapath *dp, int port_no) { WARN_ON_ONCE(!rcu_read_lock_held()); return ovs_lookup_vport(dp, port_no); } static inline struct vport *ovs_vport_ovsl_rcu(const struct datapath *dp, int port_no) { WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_ovsl_is_held()); return ovs_lookup_vport(dp, port_no); } static inline struct vport *ovs_vport_ovsl(const struct datapath *dp, int port_no) { ASSERT_OVSL(); return ovs_lookup_vport(dp, port_no); } /* Must be called with rcu_read_lock. */ static inline struct datapath *get_dp_rcu(struct net *net, int dp_ifindex) { struct net_device *dev = dev_get_by_index_rcu(net, dp_ifindex); if (dev) { struct vport *vport = ovs_internal_dev_get_vport(dev); if (vport) return vport->dp; } return NULL; } /* The caller must hold either ovs_mutex or rcu_read_lock to keep the * returned dp pointer valid. */ static inline struct datapath *get_dp(struct net *net, int dp_ifindex) { struct datapath *dp; WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_ovsl_is_held()); rcu_read_lock(); dp = get_dp_rcu(net, dp_ifindex); rcu_read_unlock(); return dp; } extern struct notifier_block ovs_dp_device_notifier; extern struct genl_family dp_vport_genl_family; void ovs_dp_process_packet(struct sk_buff *skb, struct sw_flow_key *key); void ovs_dp_detach_port(struct vport *); int ovs_dp_upcall(struct datapath *, struct sk_buff *, const struct sw_flow_key *, const struct dp_upcall_info *, uint32_t cutlen); u32 ovs_dp_get_upcall_portid(const struct datapath *dp, uint32_t cpu_id); const char *ovs_dp_name(const struct datapath *dp); struct sk_buff *ovs_vport_cmd_build_info(struct vport *vport, struct net *net, u32 portid, u32 seq, u8 cmd); int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb, const struct sw_flow_actions *, struct sw_flow_key *); void ovs_dp_notify_wq(struct work_struct *work); int action_fifos_init(void); void action_fifos_exit(void); /* 'KEY' must not have any bits set outside of the 'MASK' */ #define OVS_MASKED(OLD, KEY, MASK) ((KEY) | ((OLD) & ~(MASK))) #define OVS_SET_MASKED(OLD, KEY, MASK) ((OLD) = OVS_MASKED(OLD, KEY, MASK)) #define OVS_NLERR(logging_allowed, fmt, ...) \ do { \ if (logging_allowed && net_ratelimit()) \ pr_info("netlink: " fmt "\n", ##__VA_ARGS__); \ } while (0) #endif /* datapath.h */ |
| 6 18 20 22 23 14 2 12 12 12 12 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 | /* SPDX-License-Identifier: GPL-2.0 */ /* Copyright (c) 2017 - 2018 Covalent IO, Inc. http://covalent.io */ #ifndef _LINUX_SKMSG_H #define _LINUX_SKMSG_H #include <linux/bpf.h> #include <linux/filter.h> #include <linux/scatterlist.h> #include <linux/skbuff.h> #include <net/sock.h> #include <net/tcp.h> #include <net/strparser.h> #define MAX_MSG_FRAGS MAX_SKB_FRAGS #define NR_MSG_FRAG_IDS (MAX_MSG_FRAGS + 1) enum __sk_action { __SK_DROP = 0, __SK_PASS, __SK_REDIRECT, __SK_NONE, }; struct sk_msg_sg { u32 start; u32 curr; u32 end; u32 size; u32 copybreak; DECLARE_BITMAP(copy, MAX_MSG_FRAGS + 2); /* The extra two elements: * 1) used for chaining the front and sections when the list becomes * partitioned (e.g. end < start). The crypto APIs require the * chaining; * 2) to chain tailer SG entries after the message. */ struct scatterlist data[MAX_MSG_FRAGS + 2]; }; /* UAPI in filter.c depends on struct sk_msg_sg being first element. */ struct sk_msg { struct sk_msg_sg sg; void *data; void *data_end; u32 apply_bytes; u32 cork_bytes; u32 flags; struct sk_buff *skb; struct sock *sk_redir; struct sock *sk; struct list_head list; }; struct sk_psock_progs { struct bpf_prog *msg_parser; struct bpf_prog *stream_parser; struct bpf_prog *stream_verdict; struct bpf_prog *skb_verdict; }; enum sk_psock_state_bits { SK_PSOCK_TX_ENABLED, SK_PSOCK_RX_STRP_ENABLED, }; struct sk_psock_link { struct list_head list; struct bpf_map *map; void *link_raw; }; struct sk_psock_work_state { u32 len; u32 off; }; struct sk_psock { struct sock *sk; struct sock *sk_redir; u32 apply_bytes; u32 cork_bytes; u32 eval; bool redir_ingress; /* undefined if sk_redir is null */ struct sk_msg *cork; struct sk_psock_progs progs; #if IS_ENABLED(CONFIG_BPF_STREAM_PARSER) struct strparser strp; #endif struct sk_buff_head ingress_skb; struct list_head ingress_msg; spinlock_t ingress_lock; unsigned long state; struct list_head link; spinlock_t link_lock; refcount_t refcnt; void (*saved_unhash)(struct sock *sk); void (*saved_destroy)(struct sock *sk); void (*saved_close)(struct sock *sk, long timeout); void (*saved_write_space)(struct sock *sk); void (*saved_data_ready)(struct sock *sk); /* psock_update_sk_prot may be called with restore=false many times * so the handler must be safe for this case. It will be called * exactly once with restore=true when the psock is being destroyed * and psock refcnt is zero, but before an RCU grace period. */ int (*psock_update_sk_prot)(struct sock *sk, struct sk_psock *psock, bool restore); struct proto *sk_proto; struct mutex work_mutex; struct sk_psock_work_state work_state; struct delayed_work work; struct sock *sk_pair; struct rcu_work rwork; }; int sk_msg_alloc(struct sock *sk, struct sk_msg *msg, int len, int elem_first_coalesce); int sk_msg_clone(struct sock *sk, struct sk_msg *dst, struct sk_msg *src, u32 off, u32 len); void sk_msg_trim(struct sock *sk, struct sk_msg *msg, int len); int sk_msg_free(struct sock *sk, struct sk_msg *msg); int sk_msg_free_nocharge(struct sock *sk, struct sk_msg *msg); void sk_msg_free_partial(struct sock *sk, struct sk_msg *msg, u32 bytes); void sk_msg_free_partial_nocharge(struct sock *sk, struct sk_msg *msg, u32 bytes); void sk_msg_return(struct sock *sk, struct sk_msg *msg, int bytes); void sk_msg_return_zero(struct sock *sk, struct sk_msg *msg, int bytes); int sk_msg_zerocopy_from_iter(struct sock *sk, struct iov_iter *from, struct sk_msg *msg, u32 bytes); int sk_msg_memcopy_from_iter(struct sock *sk, struct iov_iter *from, struct sk_msg *msg, u32 bytes); int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg, int len, int flags); bool sk_msg_is_readable(struct sock *sk); static inline void sk_msg_check_to_free(struct sk_msg *msg, u32 i, u32 bytes) { WARN_ON(i == msg->sg.end && bytes); } static inline void sk_msg_apply_bytes(struct sk_psock *psock, u32 bytes) { if (psock->apply_bytes) { if (psock->apply_bytes < bytes) psock->apply_bytes = 0; else psock->apply_bytes -= bytes; } } static inline u32 sk_msg_iter_dist(u32 start, u32 end) { return end >= start ? end - start : end + (NR_MSG_FRAG_IDS - start); } #define sk_msg_iter_var_prev(var) \ do { \ if (var == 0) \ var = NR_MSG_FRAG_IDS - 1; \ else \ var--; \ } while (0) #define sk_msg_iter_var_next(var) \ do { \ var++; \ if (var == NR_MSG_FRAG_IDS) \ var = 0; \ } while (0) #define sk_msg_iter_prev(msg, which) \ sk_msg_iter_var_prev(msg->sg.which) #define sk_msg_iter_next(msg, which) \ sk_msg_iter_var_next(msg->sg.which) static inline void sk_msg_init(struct sk_msg *msg) { BUILD_BUG_ON(ARRAY_SIZE(msg->sg.data) - 1 != NR_MSG_FRAG_IDS); memset(msg, 0, sizeof(*msg)); sg_init_marker(msg->sg.data, NR_MSG_FRAG_IDS); } static inline void sk_msg_xfer(struct sk_msg *dst, struct sk_msg *src, int which, u32 size) { dst->sg.data[which] = src->sg.data[which]; dst->sg.data[which].length = size; dst->sg.size += size; src->sg.size -= size; src->sg.data[which].length -= size; src->sg.data[which].offset += size; } static inline void sk_msg_xfer_full(struct sk_msg *dst, struct sk_msg *src) { memcpy(dst, src, sizeof(*src)); sk_msg_init(src); } static inline bool sk_msg_full(const struct sk_msg *msg) { return sk_msg_iter_dist(msg->sg.start, msg->sg.end) == MAX_MSG_FRAGS; } static inline u32 sk_msg_elem_used(const struct sk_msg *msg) { return sk_msg_iter_dist(msg->sg.start, msg->sg.end); } static inline struct scatterlist *sk_msg_elem(struct sk_msg *msg, int which) { return &msg->sg.data[which]; } static inline struct scatterlist sk_msg_elem_cpy(struct sk_msg *msg, int which) { return msg->sg.data[which]; } static inline struct page *sk_msg_page(struct sk_msg *msg, int which) { return sg_page(sk_msg_elem(msg, which)); } static inline bool sk_msg_to_ingress(const struct sk_msg *msg) { return msg->flags & BPF_F_INGRESS; } static inline void sk_msg_compute_data_pointers(struct sk_msg *msg) { struct scatterlist *sge = sk_msg_elem(msg, msg->sg.start); if (test_bit(msg->sg.start, msg->sg.copy)) { msg->data = NULL; msg->data_end = NULL; } else { msg->data = sg_virt(sge); msg->data_end = msg->data + sge->length; } } static inline void sk_msg_page_add(struct sk_msg *msg, struct page *page, u32 len, u32 offset) { struct scatterlist *sge; get_page(page); sge = sk_msg_elem(msg, msg->sg.end); sg_set_page(sge, page, len, offset); sg_unmark_end(sge); __set_bit(msg->sg.end, msg->sg.copy); msg->sg.size += len; sk_msg_iter_next(msg, end); } static inline void sk_msg_sg_copy(struct sk_msg *msg, u32 i, bool copy_state) { do { if (copy_state) __set_bit(i, msg->sg.copy); else __clear_bit(i, msg->sg.copy); sk_msg_iter_var_next(i); if (i == msg->sg.end) break; } while (1); } static inline void sk_msg_sg_copy_set(struct sk_msg *msg, u32 start) { sk_msg_sg_copy(msg, start, true); } static inline void sk_msg_sg_copy_clear(struct sk_msg *msg, u32 start) { sk_msg_sg_copy(msg, start, false); } static inline struct sk_psock *sk_psock(const struct sock *sk) { return __rcu_dereference_sk_user_data_with_flags(sk, SK_USER_DATA_PSOCK); } static inline void sk_psock_set_state(struct sk_psock *psock, enum sk_psock_state_bits bit) { set_bit(bit, &psock->state); } static inline void sk_psock_clear_state(struct sk_psock *psock, enum sk_psock_state_bits bit) { clear_bit(bit, &psock->state); } static inline bool sk_psock_test_state(const struct sk_psock *psock, enum sk_psock_state_bits bit) { return test_bit(bit, &psock->state); } static inline void sock_drop(struct sock *sk, struct sk_buff *skb) { sk_drops_add(sk, skb); kfree_skb(skb); } static inline void sk_psock_queue_msg(struct sk_psock *psock, struct sk_msg *msg) { spin_lock_bh(&psock->ingress_lock); if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) list_add_tail(&msg->list, &psock->ingress_msg); else { sk_msg_free(psock->sk, msg); kfree(msg); } spin_unlock_bh(&psock->ingress_lock); } static inline struct sk_msg *sk_psock_dequeue_msg(struct sk_psock *psock) { struct sk_msg *msg; spin_lock_bh(&psock->ingress_lock); msg = list_first_entry_or_null(&psock->ingress_msg, struct sk_msg, list); if (msg) list_del(&msg->list); spin_unlock_bh(&psock->ingress_lock); return msg; } static inline struct sk_msg *sk_psock_peek_msg(struct sk_psock *psock) { struct sk_msg *msg; spin_lock_bh(&psock->ingress_lock); msg = list_first_entry_or_null(&psock->ingress_msg, struct sk_msg, list); spin_unlock_bh(&psock->ingress_lock); return msg; } static inline struct sk_msg *sk_psock_next_msg(struct sk_psock *psock, struct sk_msg *msg) { struct sk_msg *ret; spin_lock_bh(&psock->ingress_lock); if (list_is_last(&msg->list, &psock->ingress_msg)) ret = NULL; else ret = list_next_entry(msg, list); spin_unlock_bh(&psock->ingress_lock); return ret; } static inline bool sk_psock_queue_empty(const struct sk_psock *psock) { return psock ? list_empty(&psock->ingress_msg) : true; } static inline void kfree_sk_msg(struct sk_msg *msg) { if (msg->skb) consume_skb(msg->skb); kfree(msg); } static inline void sk_psock_report_error(struct sk_psock *psock, int err) { struct sock *sk = psock->sk; sk->sk_err = err; sk_error_report(sk); } struct sk_psock *sk_psock_init(struct sock *sk, int node); void sk_psock_stop(struct sk_psock *psock); #if IS_ENABLED(CONFIG_BPF_STREAM_PARSER) int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock); void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock); void sk_psock_stop_strp(struct sock *sk, struct sk_psock *psock); #else static inline int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock) { return -EOPNOTSUPP; } static inline void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock) { } static inline void sk_psock_stop_strp(struct sock *sk, struct sk_psock *psock) { } #endif void sk_psock_start_verdict(struct sock *sk, struct sk_psock *psock); void sk_psock_stop_verdict(struct sock *sk, struct sk_psock *psock); int sk_psock_msg_verdict(struct sock *sk, struct sk_psock *psock, struct sk_msg *msg); static inline struct sk_psock_link *sk_psock_init_link(void) { return kzalloc(sizeof(struct sk_psock_link), GFP_ATOMIC | __GFP_NOWARN); } static inline void sk_psock_free_link(struct sk_psock_link *link) { kfree(link); } struct sk_psock_link *sk_psock_link_pop(struct sk_psock *psock); static inline void sk_psock_cork_free(struct sk_psock *psock) { if (psock->cork) { sk_msg_free(psock->sk, psock->cork); kfree(psock->cork); psock->cork = NULL; } } static inline void sk_psock_restore_proto(struct sock *sk, struct sk_psock *psock) { if (psock->psock_update_sk_prot) psock->psock_update_sk_prot(sk, psock, true); } static inline struct sk_psock *sk_psock_get(struct sock *sk) { struct sk_psock *psock; rcu_read_lock(); psock = sk_psock(sk); if (psock && !refcount_inc_not_zero(&psock->refcnt)) psock = NULL; rcu_read_unlock(); return psock; } void sk_psock_drop(struct sock *sk, struct sk_psock *psock); static inline void sk_psock_put(struct sock *sk, struct sk_psock *psock) { if (refcount_dec_and_test(&psock->refcnt)) sk_psock_drop(sk, psock); } static inline void sk_psock_data_ready(struct sock *sk, struct sk_psock *psock) { if (psock->saved_data_ready) psock->saved_data_ready(sk); else sk->sk_data_ready(sk); } static inline void psock_set_prog(struct bpf_prog **pprog, struct bpf_prog *prog) { prog = xchg(pprog, prog); if (prog) bpf_prog_put(prog); } static inline int psock_replace_prog(struct bpf_prog **pprog, struct bpf_prog *prog, struct bpf_prog *old) { if (cmpxchg(pprog, old, prog) != old) return -ENOENT; if (old) bpf_prog_put(old); return 0; } static inline void psock_progs_drop(struct sk_psock_progs *progs) { psock_set_prog(&progs->msg_parser, NULL); psock_set_prog(&progs->stream_parser, NULL); psock_set_prog(&progs->stream_verdict, NULL); psock_set_prog(&progs->skb_verdict, NULL); } int sk_psock_tls_strp_read(struct sk_psock *psock, struct sk_buff *skb); static inline bool sk_psock_strp_enabled(struct sk_psock *psock) { if (!psock) return false; return !!psock->saved_data_ready; } static inline bool sk_is_udp(const struct sock *sk) { return sk->sk_type == SOCK_DGRAM && sk->sk_protocol == IPPROTO_UDP; } #if IS_ENABLED(CONFIG_NET_SOCK_MSG) #define BPF_F_STRPARSER (1UL << 1) /* We only have two bits so far. */ #define BPF_F_PTR_MASK ~(BPF_F_INGRESS | BPF_F_STRPARSER) static inline bool skb_bpf_strparser(const struct sk_buff *skb) { unsigned long sk_redir = skb->_sk_redir; return sk_redir & BPF_F_STRPARSER; } static inline void skb_bpf_set_strparser(struct sk_buff *skb) { skb->_sk_redir |= BPF_F_STRPARSER; } static inline bool skb_bpf_ingress(const struct sk_buff *skb) { unsigned long sk_redir = skb->_sk_redir; return sk_redir & BPF_F_INGRESS; } static inline void skb_bpf_set_ingress(struct sk_buff *skb) { skb->_sk_redir |= BPF_F_INGRESS; } static inline void skb_bpf_set_redir(struct sk_buff *skb, struct sock *sk_redir, bool ingress) { skb->_sk_redir = (unsigned long)sk_redir; if (ingress) skb->_sk_redir |= BPF_F_INGRESS; } static inline struct sock *skb_bpf_redirect_fetch(const struct sk_buff *skb) { unsigned long sk_redir = skb->_sk_redir; return (struct sock *)(sk_redir & BPF_F_PTR_MASK); } static inline void skb_bpf_redirect_clear(struct sk_buff *skb) { skb->_sk_redir = 0; } #endif /* CONFIG_NET_SOCK_MSG */ #endif /* _LINUX_SKMSG_H */ |
| 252 252 252 252 107 69 69 69 252 252 252 252 252 252 252 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 | // SPDX-License-Identifier: GPL-2.0-only /* * Pluggable TCP upper layer protocol support. * * Copyright (c) 2016-2017, Mellanox Technologies. All rights reserved. * Copyright (c) 2016-2017, Dave Watson <davejwatson@fb.com>. All rights reserved. * */ #include <linux/module.h> #include <linux/mm.h> #include <linux/types.h> #include <linux/list.h> #include <linux/gfp.h> #include <net/tcp.h> static DEFINE_SPINLOCK(tcp_ulp_list_lock); static LIST_HEAD(tcp_ulp_list); /* Simple linear search, don't expect many entries! */ static struct tcp_ulp_ops *tcp_ulp_find(const char *name) { struct tcp_ulp_ops *e; list_for_each_entry_rcu(e, &tcp_ulp_list, list, lockdep_is_held(&tcp_ulp_list_lock)) { if (strcmp(e->name, name) == 0) return e; } return NULL; } static const struct tcp_ulp_ops *__tcp_ulp_find_autoload(const char *name) { const struct tcp_ulp_ops *ulp = NULL; rcu_read_lock(); ulp = tcp_ulp_find(name); #ifdef CONFIG_MODULES if (!ulp && capable(CAP_NET_ADMIN)) { rcu_read_unlock(); request_module("tcp-ulp-%s", name); rcu_read_lock(); ulp = tcp_ulp_find(name); } #endif if (!ulp || !try_module_get(ulp->owner)) ulp = NULL; rcu_read_unlock(); return ulp; } /* Attach new upper layer protocol to the list * of available protocols. */ int tcp_register_ulp(struct tcp_ulp_ops *ulp) { int ret = 0; spin_lock(&tcp_ulp_list_lock); if (tcp_ulp_find(ulp->name)) ret = -EEXIST; else list_add_tail_rcu(&ulp->list, &tcp_ulp_list); spin_unlock(&tcp_ulp_list_lock); return ret; } EXPORT_SYMBOL_GPL(tcp_register_ulp); void tcp_unregister_ulp(struct tcp_ulp_ops *ulp) { spin_lock(&tcp_ulp_list_lock); list_del_rcu(&ulp->list); spin_unlock(&tcp_ulp_list_lock); synchronize_rcu(); } EXPORT_SYMBOL_GPL(tcp_unregister_ulp); /* Build string with list of available upper layer protocl values */ void tcp_get_available_ulp(char *buf, size_t maxlen) { struct tcp_ulp_ops *ulp_ops; size_t offs = 0; *buf = '\0'; rcu_read_lock(); list_for_each_entry_rcu(ulp_ops, &tcp_ulp_list, list) { offs += snprintf(buf + offs, maxlen - offs, "%s%s", offs == 0 ? "" : " ", ulp_ops->name); if (WARN_ON_ONCE(offs >= maxlen)) break; } rcu_read_unlock(); } void tcp_update_ulp(struct sock *sk, struct proto *proto, void (*write_space)(struct sock *sk)) { struct inet_connection_sock *icsk = inet_csk(sk); if (icsk->icsk_ulp_ops->update) icsk->icsk_ulp_ops->update(sk, proto, write_space); } void tcp_cleanup_ulp(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); /* No sock_owned_by_me() check here as at the time the * stack calls this function, the socket is dead and * about to be destroyed. */ if (!icsk->icsk_ulp_ops) return; if (icsk->icsk_ulp_ops->release) icsk->icsk_ulp_ops->release(sk); module_put(icsk->icsk_ulp_ops->owner); icsk->icsk_ulp_ops = NULL; } static int __tcp_set_ulp(struct sock *sk, const struct tcp_ulp_ops *ulp_ops) { struct inet_connection_sock *icsk = inet_csk(sk); int err; err = -EEXIST; if (icsk->icsk_ulp_ops) goto out_err; if (sk->sk_socket) clear_bit(SOCK_SUPPORT_ZC, &sk->sk_socket->flags); err = -ENOTCONN; if (!ulp_ops->clone && sk->sk_state == TCP_LISTEN) goto out_err; err = ulp_ops->init(sk); if (err) goto out_err; icsk->icsk_ulp_ops = ulp_ops; return 0; out_err: module_put(ulp_ops->owner); return err; } int tcp_set_ulp(struct sock *sk, const char *name) { const struct tcp_ulp_ops *ulp_ops; sock_owned_by_me(sk); ulp_ops = __tcp_ulp_find_autoload(name); if (!ulp_ops) return -ENOENT; return __tcp_set_ulp(sk, ulp_ops); } |
| 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_PKRU_H #define _ASM_X86_PKRU_H #include <asm/cpufeature.h> #define PKRU_AD_BIT 0x1u #define PKRU_WD_BIT 0x2u #define PKRU_BITS_PER_PKEY 2 #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS extern u32 init_pkru_value; #define pkru_get_init_value() READ_ONCE(init_pkru_value) #else #define init_pkru_value 0 #define pkru_get_init_value() 0 #endif static inline bool __pkru_allows_read(u32 pkru, u16 pkey) { int pkru_pkey_bits = pkey * PKRU_BITS_PER_PKEY; return !(pkru & (PKRU_AD_BIT << pkru_pkey_bits)); } static inline bool __pkru_allows_write(u32 pkru, u16 pkey) { int pkru_pkey_bits = pkey * PKRU_BITS_PER_PKEY; /* * Access-disable disables writes too so we need to check * both bits here. */ return !(pkru & ((PKRU_AD_BIT|PKRU_WD_BIT) << pkru_pkey_bits)); } static inline u32 read_pkru(void) { if (cpu_feature_enabled(X86_FEATURE_OSPKE)) return rdpkru(); return 0; } static inline void write_pkru(u32 pkru) { if (!cpu_feature_enabled(X86_FEATURE_OSPKE)) return; /* * WRPKRU is relatively expensive compared to RDPKRU. * Avoid WRPKRU when it would not change the value. */ if (pkru != rdpkru()) wrpkru(pkru); } static inline void pkru_write_default(void) { if (!cpu_feature_enabled(X86_FEATURE_OSPKE)) return; wrpkru(pkru_get_init_value()); } #endif |
| 1 1 1 1 1 1 35 35 35 35 2 2 442 443 443 443 295 503 439 438 1 1 1 1 1 7 7 5 5 5 5 5 5 5 7 7 492 431 358 155 493 456 425 425 88 322 322 358 1865 1047 157 275 271 358 3 163 163 162 71 71 1 1 1 1 5 4 4 2 2 1 1 1 5 2 2 2 3 3 3 24 24 24 24 24 24 24 1 1 2 1 1 1 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 | // SPDX-License-Identifier: GPL-2.0-only #include <linux/types.h> #include <linux/skbuff.h> #include <linux/socket.h> #include <linux/sysctl.h> #include <linux/net.h> #include <linux/module.h> #include <linux/if_arp.h> #include <linux/ipv6.h> #include <linux/mpls.h> #include <linux/netconf.h> #include <linux/nospec.h> #include <linux/vmalloc.h> #include <linux/percpu.h> #include <net/gso.h> #include <net/ip.h> #include <net/dst.h> #include <net/sock.h> #include <net/arp.h> #include <net/ip_fib.h> #include <net/netevent.h> #include <net/ip_tunnels.h> #include <net/netns/generic.h> #if IS_ENABLED(CONFIG_IPV6) #include <net/ipv6.h> #endif #include <net/ipv6_stubs.h> #include <net/rtnh.h> #include "internal.h" /* max memory we will use for mpls_route */ #define MAX_MPLS_ROUTE_MEM 4096 /* Maximum number of labels to look ahead at when selecting a path of * a multipath route */ #define MAX_MP_SELECT_LABELS 4 #define MPLS_NEIGH_TABLE_UNSPEC (NEIGH_LINK_TABLE + 1) static int label_limit = (1 << 20) - 1; static int ttl_max = 255; #if IS_ENABLED(CONFIG_NET_IP_TUNNEL) static size_t ipgre_mpls_encap_hlen(struct ip_tunnel_encap *e) { return sizeof(struct mpls_shim_hdr); } static const struct ip_tunnel_encap_ops mpls_iptun_ops = { .encap_hlen = ipgre_mpls_encap_hlen, }; static int ipgre_tunnel_encap_add_mpls_ops(void) { return ip_tunnel_encap_add_ops(&mpls_iptun_ops, TUNNEL_ENCAP_MPLS); } static void ipgre_tunnel_encap_del_mpls_ops(void) { ip_tunnel_encap_del_ops(&mpls_iptun_ops, TUNNEL_ENCAP_MPLS); } #else static int ipgre_tunnel_encap_add_mpls_ops(void) { return 0; } static void ipgre_tunnel_encap_del_mpls_ops(void) { } #endif static void rtmsg_lfib(int event, u32 label, struct mpls_route *rt, struct nlmsghdr *nlh, struct net *net, u32 portid, unsigned int nlm_flags); static struct mpls_route *mpls_route_input_rcu(struct net *net, unsigned index) { struct mpls_route *rt = NULL; if (index < net->mpls.platform_labels) { struct mpls_route __rcu **platform_label = rcu_dereference(net->mpls.platform_label); rt = rcu_dereference(platform_label[index]); } return rt; } bool mpls_output_possible(const struct net_device *dev) { return dev && (dev->flags & IFF_UP) && netif_carrier_ok(dev); } EXPORT_SYMBOL_GPL(mpls_output_possible); static u8 *__mpls_nh_via(struct mpls_route *rt, struct mpls_nh *nh) { return (u8 *)nh + rt->rt_via_offset; } static const u8 *mpls_nh_via(const struct mpls_route *rt, const struct mpls_nh *nh) { return __mpls_nh_via((struct mpls_route *)rt, (struct mpls_nh *)nh); } static unsigned int mpls_nh_header_size(const struct mpls_nh *nh) { /* The size of the layer 2.5 labels to be added for this route */ return nh->nh_labels * sizeof(struct mpls_shim_hdr); } unsigned int mpls_dev_mtu(const struct net_device *dev) { /* The amount of data the layer 2 frame can hold */ return dev->mtu; } EXPORT_SYMBOL_GPL(mpls_dev_mtu); bool mpls_pkt_too_big(const struct sk_buff *skb, unsigned int mtu) { if (skb->len <= mtu) return false; if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu)) return false; return true; } EXPORT_SYMBOL_GPL(mpls_pkt_too_big); void mpls_stats_inc_outucastpkts(struct net_device *dev, const struct sk_buff *skb) { struct mpls_dev *mdev; if (skb->protocol == htons(ETH_P_MPLS_UC)) { mdev = mpls_dev_get(dev); if (mdev) MPLS_INC_STATS_LEN(mdev, skb->len, tx_packets, tx_bytes); } else if (skb->protocol == htons(ETH_P_IP)) { IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len); #if IS_ENABLED(CONFIG_IPV6) } else if (skb->protocol == htons(ETH_P_IPV6)) { struct inet6_dev *in6dev = __in6_dev_get(dev); if (in6dev) IP6_UPD_PO_STATS(dev_net(dev), in6dev, IPSTATS_MIB_OUT, skb->len); #endif } } EXPORT_SYMBOL_GPL(mpls_stats_inc_outucastpkts); static u32 mpls_multipath_hash(struct mpls_route *rt, struct sk_buff *skb) { struct mpls_entry_decoded dec; unsigned int mpls_hdr_len = 0; struct mpls_shim_hdr *hdr; bool eli_seen = false; int label_index; u32 hash = 0; for (label_index = 0; label_index < MAX_MP_SELECT_LABELS; label_index++) { mpls_hdr_len += sizeof(*hdr); if (!pskb_may_pull(skb, mpls_hdr_len)) break; /* Read and decode the current label */ hdr = mpls_hdr(skb) + label_index; dec = mpls_entry_decode(hdr); /* RFC6790 - reserved labels MUST NOT be used as keys * for the load-balancing function */ if (likely(dec.label >= MPLS_LABEL_FIRST_UNRESERVED)) { hash = jhash_1word(dec.label, hash); /* The entropy label follows the entropy label * indicator, so this means that the entropy * label was just added to the hash - no need to * go any deeper either in the label stack or in the * payload */ if (eli_seen) break; } else if (dec.label == MPLS_LABEL_ENTROPY) { eli_seen = true; } if (!dec.bos) continue; /* found bottom label; does skb have room for a header? */ if (pskb_may_pull(skb, mpls_hdr_len + sizeof(struct iphdr))) { const struct iphdr *v4hdr; v4hdr = (const struct iphdr *)(hdr + 1); if (v4hdr->version == 4) { hash = jhash_3words(ntohl(v4hdr->saddr), ntohl(v4hdr->daddr), v4hdr->protocol, hash); } else if (v4hdr->version == 6 && pskb_may_pull(skb, mpls_hdr_len + sizeof(struct ipv6hdr))) { const struct ipv6hdr *v6hdr; v6hdr = (const struct ipv6hdr *)(hdr + 1); hash = __ipv6_addr_jhash(&v6hdr->saddr, hash); hash = __ipv6_addr_jhash(&v6hdr->daddr, hash); hash = jhash_1word(v6hdr->nexthdr, hash); } } break; } return hash; } static struct mpls_nh *mpls_get_nexthop(struct mpls_route *rt, u8 index) { return (struct mpls_nh *)((u8 *)rt->rt_nh + index * rt->rt_nh_size); } /* number of alive nexthops (rt->rt_nhn_alive) and the flags for * a next hop (nh->nh_flags) are modified by netdev event handlers. * Since those fields can change at any moment, use READ_ONCE to * access both. */ static const struct mpls_nh *mpls_select_multipath(struct mpls_route *rt, struct sk_buff *skb) { u32 hash = 0; int nh_index = 0; int n = 0; u8 alive; /* No need to look further into packet if there's only * one path */ if (rt->rt_nhn == 1) return rt->rt_nh; alive = READ_ONCE(rt->rt_nhn_alive); if (alive == 0) return NULL; hash = mpls_multipath_hash(rt, skb); nh_index = hash % alive; if (alive == rt->rt_nhn) goto out; for_nexthops(rt) { unsigned int nh_flags = READ_ONCE(nh->nh_flags); if (nh_flags & (RTNH_F_DEAD | RTNH_F_LINKDOWN)) continue; if (n == nh_index) return nh; n++; } endfor_nexthops(rt); out: return mpls_get_nexthop(rt, nh_index); } static bool mpls_egress(struct net *net, struct mpls_route *rt, struct sk_buff *skb, struct mpls_entry_decoded dec) { enum mpls_payload_type payload_type; bool success = false; /* The IPv4 code below accesses through the IPv4 header * checksum, which is 12 bytes into the packet. * The IPv6 code below accesses through the IPv6 hop limit * which is 8 bytes into the packet. * * For all supported cases there should always be at least 12 * bytes of packet data present. The IPv4 header is 20 bytes * without options and the IPv6 header is always 40 bytes * long. */ if (!pskb_may_pull(skb, 12)) return false; payload_type = rt->rt_payload_type; if (payload_type == MPT_UNSPEC) payload_type = ip_hdr(skb)->version; switch (payload_type) { case MPT_IPV4: { struct iphdr *hdr4 = ip_hdr(skb); u8 new_ttl; skb->protocol = htons(ETH_P_IP); /* If propagating TTL, take the decremented TTL from * the incoming MPLS header, otherwise decrement the * TTL, but only if not 0 to avoid underflow. */ if (rt->rt_ttl_propagate == MPLS_TTL_PROP_ENABLED || (rt->rt_ttl_propagate == MPLS_TTL_PROP_DEFAULT && net->mpls.ip_ttl_propagate)) new_ttl = dec.ttl; else new_ttl = hdr4->ttl ? hdr4->ttl - 1 : 0; csum_replace2(&hdr4->check, htons(hdr4->ttl << 8), htons(new_ttl << 8)); hdr4->ttl = new_ttl; success = true; break; } case MPT_IPV6: { struct ipv6hdr *hdr6 = ipv6_hdr(skb); skb->protocol = htons(ETH_P_IPV6); /* If propagating TTL, take the decremented TTL from * the incoming MPLS header, otherwise decrement the * hop limit, but only if not 0 to avoid underflow. */ if (rt->rt_ttl_propagate == MPLS_TTL_PROP_ENABLED || (rt->rt_ttl_propagate == MPLS_TTL_PROP_DEFAULT && net->mpls.ip_ttl_propagate)) hdr6->hop_limit = dec.ttl; else if (hdr6->hop_limit) hdr6->hop_limit = hdr6->hop_limit - 1; success = true; break; } case MPT_UNSPEC: /* Should have decided which protocol it is by now */ break; } return success; } static int mpls_forward(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { struct net *net = dev_net(dev); struct mpls_shim_hdr *hdr; const struct mpls_nh *nh; struct mpls_route *rt; struct mpls_entry_decoded dec; struct net_device *out_dev; struct mpls_dev *out_mdev; struct mpls_dev *mdev; unsigned int hh_len; unsigned int new_header_size; unsigned int mtu; int err; /* Careful this entire function runs inside of an rcu critical section */ mdev = mpls_dev_get(dev); if (!mdev) goto drop; MPLS_INC_STATS_LEN(mdev, skb->len, rx_packets, rx_bytes); if (!mdev->input_enabled) { MPLS_INC_STATS(mdev, rx_dropped); goto drop; } if (skb->pkt_type != PACKET_HOST) goto err; if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL) goto err; if (!pskb_may_pull(skb, sizeof(*hdr))) goto err; skb_dst_drop(skb); /* Read and decode the label */ hdr = mpls_hdr(skb); dec = mpls_entry_decode(hdr); rt = mpls_route_input_rcu(net, dec.label); if (!rt) { MPLS_INC_STATS(mdev, rx_noroute); goto drop; } nh = mpls_select_multipath(rt, skb); if (!nh) goto err; /* Pop the label */ skb_pull(skb, sizeof(*hdr)); skb_reset_network_header(skb); skb_orphan(skb); if (skb_warn_if_lro(skb)) goto err; skb_forward_csum(skb); /* Verify ttl is valid */ if (dec.ttl <= 1) goto err; /* Find the output device */ out_dev = nh->nh_dev; if (!mpls_output_possible(out_dev)) goto tx_err; /* Verify the destination can hold the packet */ new_header_size = mpls_nh_header_size(nh); mtu = mpls_dev_mtu(out_dev); if (mpls_pkt_too_big(skb, mtu - new_header_size)) goto tx_err; hh_len = LL_RESERVED_SPACE(out_dev); if (!out_dev->header_ops) hh_len = 0; /* Ensure there is enough space for the headers in the skb */ if (skb_cow(skb, hh_len + new_header_size)) goto tx_err; skb->dev = out_dev; skb->protocol = htons(ETH_P_MPLS_UC); dec.ttl -= 1; if (unlikely(!new_header_size && dec.bos)) { /* Penultimate hop popping */ if (!mpls_egress(dev_net(out_dev), rt, skb, dec)) goto err; } else { bool bos; int i; skb_push(skb, new_header_size); skb_reset_network_header(skb); /* Push the new labels */ hdr = mpls_hdr(skb); bos = dec.bos; for (i = nh->nh_labels - 1; i >= 0; i--) { hdr[i] = mpls_entry_encode(nh->nh_label[i], dec.ttl, 0, bos); bos = false; } } mpls_stats_inc_outucastpkts(out_dev, skb); /* If via wasn't specified then send out using device address */ if (nh->nh_via_table == MPLS_NEIGH_TABLE_UNSPEC) err = neigh_xmit(NEIGH_LINK_TABLE, out_dev, out_dev->dev_addr, skb); else err = neigh_xmit(nh->nh_via_table, out_dev, mpls_nh_via(rt, nh), skb); if (err) net_dbg_ratelimited("%s: packet transmission failed: %d\n", __func__, err); return 0; tx_err: out_mdev = out_dev ? mpls_dev_get(out_dev) : NULL; if (out_mdev) MPLS_INC_STATS(out_mdev, tx_errors); goto drop; err: MPLS_INC_STATS(mdev, rx_errors); drop: kfree_skb(skb); return NET_RX_DROP; } static struct packet_type mpls_packet_type __read_mostly = { .type = cpu_to_be16(ETH_P_MPLS_UC), .func = mpls_forward, }; static const struct nla_policy rtm_mpls_policy[RTA_MAX+1] = { [RTA_DST] = { .type = NLA_U32 }, [RTA_OIF] = { .type = NLA_U32 }, [RTA_TTL_PROPAGATE] = { .type = NLA_U8 }, }; struct mpls_route_config { u32 rc_protocol; u32 rc_ifindex; u8 rc_via_table; u8 rc_via_alen; u8 rc_via[MAX_VIA_ALEN]; u32 rc_label; u8 rc_ttl_propagate; u8 rc_output_labels; u32 rc_output_label[MAX_NEW_LABELS]; u32 rc_nlflags; enum mpls_payload_type rc_payload_type; struct nl_info rc_nlinfo; struct rtnexthop *rc_mp; int rc_mp_len; }; /* all nexthops within a route have the same size based on max * number of labels and max via length for a hop */ static struct mpls_route *mpls_rt_alloc(u8 num_nh, u8 max_alen, u8 max_labels) { u8 nh_size = MPLS_NH_SIZE(max_labels, max_alen); struct mpls_route *rt; size_t size; size = sizeof(*rt) + num_nh * nh_size; if (size > MAX_MPLS_ROUTE_MEM) return ERR_PTR(-EINVAL); rt = kzalloc(size, GFP_KERNEL); if (!rt) return ERR_PTR(-ENOMEM); rt->rt_nhn = num_nh; rt->rt_nhn_alive = num_nh; rt->rt_nh_size = nh_size; rt->rt_via_offset = MPLS_NH_VIA_OFF(max_labels); return rt; } static void mpls_rt_free(struct mpls_route *rt) { if (rt) kfree_rcu(rt, rt_rcu); } static void mpls_notify_route(struct net *net, unsigned index, struct mpls_route *old, struct mpls_route *new, const struct nl_info *info) { struct nlmsghdr *nlh = info ? info->nlh : NULL; unsigned portid = info ? info->portid : 0; int event = new ? RTM_NEWROUTE : RTM_DELROUTE; struct mpls_route *rt = new ? new : old; unsigned nlm_flags = (old && new) ? NLM_F_REPLACE : 0; /* Ignore reserved labels for now */ if (rt && (index >= MPLS_LABEL_FIRST_UNRESERVED)) rtmsg_lfib(event, index, rt, nlh, net, portid, nlm_flags); } static void mpls_route_update(struct net *net, unsigned index, struct mpls_route *new, const struct nl_info *info) { struct mpls_route __rcu **platform_label; struct mpls_route *rt; ASSERT_RTNL(); platform_label = rtnl_dereference(net->mpls.platform_label); rt = rtnl_dereference(platform_label[index]); rcu_assign_pointer(platform_label[index], new); mpls_notify_route(net, index, rt, new, info); /* If we removed a route free it now */ mpls_rt_free(rt); } static unsigned find_free_label(struct net *net) { struct mpls_route __rcu **platform_label; size_t platform_labels; unsigned index; platform_label = rtnl_dereference(net->mpls.platform_label); platform_labels = net->mpls.platform_labels; for (index = MPLS_LABEL_FIRST_UNRESERVED; index < platform_labels; index++) { if (!rtnl_dereference(platform_label[index])) return index; } return LABEL_NOT_SPECIFIED; } #if IS_ENABLED(CONFIG_INET) static struct net_device *inet_fib_lookup_dev(struct net *net, const void *addr) { struct net_device *dev; struct rtable *rt; struct in_addr daddr; memcpy(&daddr, addr, sizeof(struct in_addr)); rt = ip_route_output(net, daddr.s_addr, 0, 0, 0); if (IS_ERR(rt)) return ERR_CAST(rt); dev = rt->dst.dev; dev_hold(dev); ip_rt_put(rt); return dev; } #else static struct net_device *inet_fib_lookup_dev(struct net *net, const void *addr) { return ERR_PTR(-EAFNOSUPPORT); } #endif #if IS_ENABLED(CONFIG_IPV6) static struct net_device *inet6_fib_lookup_dev(struct net *net, const void *addr) { struct net_device *dev; struct dst_entry *dst; struct flowi6 fl6; if (!ipv6_stub) return ERR_PTR(-EAFNOSUPPORT); memset(&fl6, 0, sizeof(fl6)); memcpy(&fl6.daddr, addr, sizeof(struct in6_addr)); dst = ipv6_stub->ipv6_dst_lookup_flow(net, NULL, &fl6, NULL); if (IS_ERR(dst)) return ERR_CAST(dst); dev = dst->dev; dev_hold(dev); dst_release(dst); return dev; } #else static struct net_device *inet6_fib_lookup_dev(struct net *net, const void *addr) { return ERR_PTR(-EAFNOSUPPORT); } #endif static struct net_device *find_outdev(struct net *net, struct mpls_route *rt, struct mpls_nh *nh, int oif) { struct net_device *dev = NULL; if (!oif) { switch (nh->nh_via_table) { case NEIGH_ARP_TABLE: dev = inet_fib_lookup_dev(net, mpls_nh_via(rt, nh)); break; case NEIGH_ND_TABLE: dev = inet6_fib_lookup_dev(net, mpls_nh_via(rt, nh)); break; case NEIGH_LINK_TABLE: break; } } else { dev = dev_get_by_index(net, oif); } if (!dev) return ERR_PTR(-ENODEV); if (IS_ERR(dev)) return dev; /* The caller is holding rtnl anyways, so release the dev reference */ dev_put(dev); return dev; } static int mpls_nh_assign_dev(struct net *net, struct mpls_route *rt, struct mpls_nh *nh, int oif) { struct net_device *dev = NULL; int err = -ENODEV; dev = find_outdev(net, rt, nh, oif); if (IS_ERR(dev)) { err = PTR_ERR(dev); dev = NULL; goto errout; } /* Ensure this is a supported device */ err = -EINVAL; if (!mpls_dev_get(dev)) goto errout; if ((nh->nh_via_table == NEIGH_LINK_TABLE) && (dev->addr_len != nh->nh_via_alen)) goto errout; nh->nh_dev = dev; if (!(dev->flags & IFF_UP)) { nh->nh_flags |= RTNH_F_DEAD; } else { unsigned int flags; flags = dev_get_flags(dev); if (!(flags & (IFF_RUNNING | IFF_LOWER_UP))) nh->nh_flags |= RTNH_F_LINKDOWN; } return 0; errout: return err; } static int nla_get_via(const struct nlattr *nla, u8 *via_alen, u8 *via_table, u8 via_addr[], struct netlink_ext_ack *extack) { struct rtvia *via = nla_data(nla); int err = -EINVAL; int alen; if (nla_len(nla) < offsetof(struct rtvia, rtvia_addr)) { NL_SET_ERR_MSG_ATTR(extack, nla, "Invalid attribute length for RTA_VIA"); goto errout; } alen = nla_len(nla) - offsetof(struct rtvia, rtvia_addr); if (alen > MAX_VIA_ALEN) { NL_SET_ERR_MSG_ATTR(extack, nla, "Invalid address length for RTA_VIA"); goto errout; } /* Validate the address family */ switch (via->rtvia_family) { case AF_PACKET: *via_table = NEIGH_LINK_TABLE; break; case AF_INET: *via_table = NEIGH_ARP_TABLE; if (alen != 4) goto errout; break; case AF_INET6: *via_table = NEIGH_ND_TABLE; if (alen != 16) goto errout; break; default: /* Unsupported address family */ goto errout; } memcpy(via_addr, via->rtvia_addr, alen); *via_alen = alen; err = 0; errout: return err; } static int mpls_nh_build_from_cfg(struct mpls_route_config *cfg, struct mpls_route *rt) { struct net *net = cfg->rc_nlinfo.nl_net; struct mpls_nh *nh = rt->rt_nh; int err; int i; if (!nh) return -ENOMEM; nh->nh_labels = cfg->rc_output_labels; for (i = 0; i < nh->nh_labels; i++) nh->nh_label[i] = cfg->rc_output_label[i]; nh->nh_via_table = cfg->rc_via_table; memcpy(__mpls_nh_via(rt, nh), cfg->rc_via, cfg->rc_via_alen); nh->nh_via_alen = cfg->rc_via_alen; err = mpls_nh_assign_dev(net, rt, nh, cfg->rc_ifindex); if (err) goto errout; if (nh->nh_flags & (RTNH_F_DEAD | RTNH_F_LINKDOWN)) rt->rt_nhn_alive--; return 0; errout: return err; } static int mpls_nh_build(struct net *net, struct mpls_route *rt, struct mpls_nh *nh, int oif, struct nlattr *via, struct nlattr *newdst, u8 max_labels, struct netlink_ext_ack *extack) { int err = -ENOMEM; if (!nh) goto errout; if (newdst) { err = nla_get_labels(newdst, max_labels, &nh->nh_labels, nh->nh_label, extack); if (err) goto errout; } if (via) { err = nla_get_via(via, &nh->nh_via_alen, &nh->nh_via_table, __mpls_nh_via(rt, nh), extack); if (err) goto errout; } else { nh->nh_via_table = MPLS_NEIGH_TABLE_UNSPEC; } err = mpls_nh_assign_dev(net, rt, nh, oif); if (err) goto errout; return 0; errout: return err; } static u8 mpls_count_nexthops(struct rtnexthop *rtnh, int len, u8 cfg_via_alen, u8 *max_via_alen, u8 *max_labels) { int remaining = len; u8 nhs = 0; *max_via_alen = 0; *max_labels = 0; while (rtnh_ok(rtnh, remaining)) { struct nlattr *nla, *attrs = rtnh_attrs(rtnh); int attrlen; u8 n_labels = 0; attrlen = rtnh_attrlen(rtnh); nla = nla_find(attrs, attrlen, RTA_VIA); if (nla && nla_len(nla) >= offsetof(struct rtvia, rtvia_addr)) { int via_alen = nla_len(nla) - offsetof(struct rtvia, rtvia_addr); if (via_alen <= MAX_VIA_ALEN) *max_via_alen = max_t(u16, *max_via_alen, via_alen); } nla = nla_find(attrs, attrlen, RTA_NEWDST); if (nla && nla_get_labels(nla, MAX_NEW_LABELS, &n_labels, NULL, NULL) != 0) return 0; *max_labels = max_t(u8, *max_labels, n_labels); /* number of nexthops is tracked by a u8. * Check for overflow. */ if (nhs == 255) return 0; nhs++; rtnh = rtnh_next(rtnh, &remaining); } /* leftover implies invalid nexthop configuration, discard it */ return remaining > 0 ? 0 : nhs; } static int mpls_nh_build_multi(struct mpls_route_config *cfg, struct mpls_route *rt, u8 max_labels, struct netlink_ext_ack *extack) { struct rtnexthop *rtnh = cfg->rc_mp; struct nlattr *nla_via, *nla_newdst; int remaining = cfg->rc_mp_len; int err = 0; u8 nhs = 0; change_nexthops(rt) { int attrlen; nla_via = NULL; nla_newdst = NULL; err = -EINVAL; if (!rtnh_ok(rtnh, remaining)) goto errout; /* neither weighted multipath nor any flags * are supported */ if (rtnh->rtnh_hops || rtnh->rtnh_flags) goto errout; attrlen = rtnh_attrlen(rtnh); if (attrlen > 0) { struct nlattr *attrs = rtnh_attrs(rtnh); nla_via = nla_find(attrs, attrlen, RTA_VIA); nla_newdst = nla_find(attrs, attrlen, RTA_NEWDST); } err = mpls_nh_build(cfg->rc_nlinfo.nl_net, rt, nh, rtnh->rtnh_ifindex, nla_via, nla_newdst, max_labels, extack); if (err) goto errout; if (nh->nh_flags & (RTNH_F_DEAD | RTNH_F_LINKDOWN)) rt->rt_nhn_alive--; rtnh = rtnh_next(rtnh, &remaining); nhs++; } endfor_nexthops(rt); rt->rt_nhn = nhs; return 0; errout: return err; } static bool mpls_label_ok(struct net *net, unsigned int *index, struct netlink_ext_ack *extack) { bool is_ok = true; /* Reserved labels may not be set */ if (*index < MPLS_LABEL_FIRST_UNRESERVED) { NL_SET_ERR_MSG(extack, "Invalid label - must be MPLS_LABEL_FIRST_UNRESERVED or higher"); is_ok = false; } /* The full 20 bit range may not be supported. */ if (is_ok && *index >= net->mpls.platform_labels) { NL_SET_ERR_MSG(extack, "Label >= configured maximum in platform_labels"); is_ok = false; } *index = array_index_nospec(*index, net->mpls.platform_labels); return is_ok; } static int mpls_route_add(struct mpls_route_config *cfg, struct netlink_ext_ack *extack) { struct mpls_route __rcu **platform_label; struct net *net = cfg->rc_nlinfo.nl_net; struct mpls_route *rt, *old; int err = -EINVAL; u8 max_via_alen; unsigned index; u8 max_labels; u8 nhs; index = cfg->rc_label; /* If a label was not specified during insert pick one */ if ((index == LABEL_NOT_SPECIFIED) && (cfg->rc_nlflags & NLM_F_CREATE)) { index = find_free_label(net); } if (!mpls_label_ok(net, &index, extack)) goto errout; /* Append makes no sense with mpls */ err = -EOPNOTSUPP; if (cfg->rc_nlflags & NLM_F_APPEND) { NL_SET_ERR_MSG(extack, "MPLS does not support route append"); goto errout; } err = -EEXIST; platform_label = rtnl_dereference(net->mpls.platform_label); old = rtnl_dereference(platform_label[index]); if ((cfg->rc_nlflags & NLM_F_EXCL) && old) goto errout; err = -EEXIST; if (!(cfg->rc_nlflags & NLM_F_REPLACE) && old) goto errout; err = -ENOENT; if (!(cfg->rc_nlflags & NLM_F_CREATE) && !old) goto errout; err = -EINVAL; if (cfg->rc_mp) { nhs = mpls_count_nexthops(cfg->rc_mp, cfg->rc_mp_len, cfg->rc_via_alen, &max_via_alen, &max_labels); } else { max_via_alen = cfg->rc_via_alen; max_labels = cfg->rc_output_labels; nhs = 1; } if (nhs == 0) { NL_SET_ERR_MSG(extack, "Route does not contain a nexthop"); goto errout; } rt = mpls_rt_alloc(nhs, max_via_alen, max_labels); if (IS_ERR(rt)) { err = PTR_ERR(rt); goto errout; } rt->rt_protocol = cfg->rc_protocol; rt->rt_payload_type = cfg->rc_payload_type; rt->rt_ttl_propagate = cfg->rc_ttl_propagate; if (cfg->rc_mp) err = mpls_nh_build_multi(cfg, rt, max_labels, extack); else err = mpls_nh_build_from_cfg(cfg, rt); if (err) goto freert; mpls_route_update(net, index, rt, &cfg->rc_nlinfo); return 0; freert: mpls_rt_free(rt); errout: return err; } static int mpls_route_del(struct mpls_route_config *cfg, struct netlink_ext_ack *extack) { struct net *net = cfg->rc_nlinfo.nl_net; unsigned index; int err = -EINVAL; index = cfg->rc_label; if (!mpls_label_ok(net, &index, extack)) goto errout; mpls_route_update(net, index, NULL, &cfg->rc_nlinfo); err = 0; errout: return err; } static void mpls_get_stats(struct mpls_dev *mdev, struct mpls_link_stats *stats) { struct mpls_pcpu_stats *p; int i; memset(stats, 0, sizeof(*stats)); for_each_possible_cpu(i) { struct mpls_link_stats local; unsigned int start; p = per_cpu_ptr(mdev->stats, i); do { start = u64_stats_fetch_begin(&p->syncp); local = p->stats; } while (u64_stats_fetch_retry(&p->syncp, start)); stats->rx_packets += local.rx_packets; stats->rx_bytes += local.rx_bytes; stats->tx_packets += local.tx_packets; stats->tx_bytes += local.tx_bytes; stats->rx_errors += local.rx_errors; stats->tx_errors += local.tx_errors; stats->rx_dropped += local.rx_dropped; stats->tx_dropped += local.tx_dropped; stats->rx_noroute += local.rx_noroute; } } static int mpls_fill_stats_af(struct sk_buff *skb, const struct net_device *dev) { struct mpls_link_stats *stats; struct mpls_dev *mdev; struct nlattr *nla; mdev = mpls_dev_get(dev); if (!mdev) return -ENODATA; nla = nla_reserve_64bit(skb, MPLS_STATS_LINK, sizeof(struct mpls_link_stats), MPLS_STATS_UNSPEC); if (!nla) return -EMSGSIZE; stats = nla_data(nla); mpls_get_stats(mdev, stats); return 0; } static size_t mpls_get_stats_af_size(const struct net_device *dev) { struct mpls_dev *mdev; mdev = mpls_dev_get(dev); if (!mdev) return 0; return nla_total_size_64bit(sizeof(struct mpls_link_stats)); } static int mpls_netconf_fill_devconf(struct sk_buff *skb, struct mpls_dev *mdev, u32 portid, u32 seq, int event, unsigned int flags, int type) { struct nlmsghdr *nlh; struct netconfmsg *ncm; bool all = false; nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct netconfmsg), flags); if (!nlh) return -EMSGSIZE; if (type == NETCONFA_ALL) all = true; ncm = nlmsg_data(nlh); ncm->ncm_family = AF_MPLS; if (nla_put_s32(skb, NETCONFA_IFINDEX, mdev->dev->ifindex) < 0) goto nla_put_failure; if ((all || type == NETCONFA_INPUT) && nla_put_s32(skb, NETCONFA_INPUT, mdev->input_enabled) < 0) goto nla_put_failure; nlmsg_end(skb, nlh); return 0; nla_put_failure: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } static int mpls_netconf_msgsize_devconf(int type) { int size = NLMSG_ALIGN(sizeof(struct netconfmsg)) + nla_total_size(4); /* NETCONFA_IFINDEX */ bool all = false; if (type == NETCONFA_ALL) all = true; if (all || type == NETCONFA_INPUT) size += nla_total_size(4); return size; } static void mpls_netconf_notify_devconf(struct net *net, int event, int type, struct mpls_dev *mdev) { struct sk_buff *skb; int err = -ENOBUFS; skb = nlmsg_new(mpls_netconf_msgsize_devconf(type), GFP_KERNEL); if (!skb) goto errout; err = mpls_netconf_fill_devconf(skb, mdev, 0, 0, event, 0, type); if (err < 0) { /* -EMSGSIZE implies BUG in mpls_netconf_msgsize_devconf() */ WARN_ON(err == -EMSGSIZE); kfree_skb(skb); goto errout; } rtnl_notify(skb, net, 0, RTNLGRP_MPLS_NETCONF, NULL, GFP_KERNEL); return; errout: if (err < 0) rtnl_set_sk_err(net, RTNLGRP_MPLS_NETCONF, err); } static const struct nla_policy devconf_mpls_policy[NETCONFA_MAX + 1] = { [NETCONFA_IFINDEX] = { .len = sizeof(int) }, }; static int mpls_netconf_valid_get_req(struct sk_buff *skb, const struct nlmsghdr *nlh, struct nlattr **tb, struct netlink_ext_ack *extack) { int i, err; if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(struct netconfmsg))) { NL_SET_ERR_MSG_MOD(extack, "Invalid header for netconf get request"); return -EINVAL; } if (!netlink_strict_get_check(skb)) return nlmsg_parse_deprecated(nlh, sizeof(struct netconfmsg), tb, NETCONFA_MAX, devconf_mpls_policy, extack); err = nlmsg_parse_deprecated_strict(nlh, sizeof(struct netconfmsg), tb, NETCONFA_MAX, devconf_mpls_policy, extack); if (err) return err; for (i = 0; i <= NETCONFA_MAX; i++) { if (!tb[i]) continue; switch (i) { case NETCONFA_IFINDEX: break; default: NL_SET_ERR_MSG_MOD(extack, "Unsupported attribute in netconf get request"); return -EINVAL; } } return 0; } static int mpls_netconf_get_devconf(struct sk_buff *in_skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(in_skb->sk); struct nlattr *tb[NETCONFA_MAX + 1]; struct net_device *dev; struct mpls_dev *mdev; struct sk_buff *skb; int ifindex; int err; err = mpls_netconf_valid_get_req(in_skb, nlh, tb, extack); if (err < 0) goto errout; err = -EINVAL; if (!tb[NETCONFA_IFINDEX]) goto errout; ifindex = nla_get_s32(tb[NETCONFA_IFINDEX]); dev = __dev_get_by_index(net, ifindex); if (!dev) goto errout; mdev = mpls_dev_get(dev); if (!mdev) goto errout; err = -ENOBUFS; skb = nlmsg_new(mpls_netconf_msgsize_devconf(NETCONFA_ALL), GFP_KERNEL); if (!skb) goto errout; err = mpls_netconf_fill_devconf(skb, mdev, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, RTM_NEWNETCONF, 0, NETCONFA_ALL); if (err < 0) { /* -EMSGSIZE implies BUG in mpls_netconf_msgsize_devconf() */ WARN_ON(err == -EMSGSIZE); kfree_skb(skb); goto errout; } err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); errout: return err; } static int mpls_netconf_dump_devconf(struct sk_buff *skb, struct netlink_callback *cb) { const struct nlmsghdr *nlh = cb->nlh; struct net *net = sock_net(skb->sk); struct hlist_head *head; struct net_device *dev; struct mpls_dev *mdev; int idx, s_idx; int h, s_h; if (cb->strict_check) { struct netlink_ext_ack *extack = cb->extack; struct netconfmsg *ncm; if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ncm))) { NL_SET_ERR_MSG_MOD(extack, "Invalid header for netconf dump request"); return -EINVAL; } if (nlmsg_attrlen(nlh, sizeof(*ncm))) { NL_SET_ERR_MSG_MOD(extack, "Invalid data after header in netconf dump request"); return -EINVAL; } } s_h = cb->args[0]; s_idx = idx = cb->args[1]; for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { idx = 0; head = &net->dev_index_head[h]; rcu_read_lock(); cb->seq = net->dev_base_seq; hlist_for_each_entry_rcu(dev, head, index_hlist) { if (idx < s_idx) goto cont; mdev = mpls_dev_get(dev); if (!mdev) goto cont; if (mpls_netconf_fill_devconf(skb, mdev, NETLINK_CB(cb->skb).portid, nlh->nlmsg_seq, RTM_NEWNETCONF, NLM_F_MULTI, NETCONFA_ALL) < 0) { rcu_read_unlock(); goto done; } nl_dump_check_consistent(cb, nlmsg_hdr(skb)); cont: idx++; } rcu_read_unlock(); } done: cb->args[0] = h; cb->args[1] = idx; return skb->len; } #define MPLS_PERDEV_SYSCTL_OFFSET(field) \ (&((struct mpls_dev *)0)->field) static int mpls_conf_proc(struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { int oval = *(int *)ctl->data; int ret = proc_dointvec(ctl, write, buffer, lenp, ppos); if (write) { struct mpls_dev *mdev = ctl->extra1; int i = (int *)ctl->data - (int *)mdev; struct net *net = ctl->extra2; int val = *(int *)ctl->data; if (i == offsetof(struct mpls_dev, input_enabled) && val != oval) { mpls_netconf_notify_devconf(net, RTM_NEWNETCONF, NETCONFA_INPUT, mdev); } } return ret; } static const struct ctl_table mpls_dev_table[] = { { .procname = "input", .maxlen = sizeof(int), .mode = 0644, .proc_handler = mpls_conf_proc, .data = MPLS_PERDEV_SYSCTL_OFFSET(input_enabled), }, { } }; static int mpls_dev_sysctl_register(struct net_device *dev, struct mpls_dev *mdev) { char path[sizeof("net/mpls/conf/") + IFNAMSIZ]; struct net *net = dev_net(dev); struct ctl_table *table; int i; table = kmemdup(&mpls_dev_table, sizeof(mpls_dev_table), GFP_KERNEL); if (!table) goto out; /* Table data contains only offsets relative to the base of * the mdev at this point, so make them absolute. */ for (i = 0; i < ARRAY_SIZE(mpls_dev_table); i++) { table[i].data = (char *)mdev + (uintptr_t)table[i].data; table[i].extra1 = mdev; table[i].extra2 = net; } snprintf(path, sizeof(path), "net/mpls/conf/%s", dev->name); mdev->sysctl = register_net_sysctl_sz(net, path, table, ARRAY_SIZE(mpls_dev_table)); if (!mdev->sysctl) goto free; mpls_netconf_notify_devconf(net, RTM_NEWNETCONF, NETCONFA_ALL, mdev); return 0; free: kfree(table); out: mdev->sysctl = NULL; return -ENOBUFS; } static void mpls_dev_sysctl_unregister(struct net_device *dev, struct mpls_dev *mdev) { struct net *net = dev_net(dev); struct ctl_table *table; if (!mdev->sysctl) return; table = mdev->sysctl->ctl_table_arg; unregister_net_sysctl_table(mdev->sysctl); kfree(table); mpls_netconf_notify_devconf(net, RTM_DELNETCONF, 0, mdev); } static struct mpls_dev *mpls_add_dev(struct net_device *dev) { struct mpls_dev *mdev; int err = -ENOMEM; int i; ASSERT_RTNL(); mdev = kzalloc(sizeof(*mdev), GFP_KERNEL); if (!mdev) return ERR_PTR(err); mdev->stats = alloc_percpu(struct mpls_pcpu_stats); if (!mdev->stats) goto free; for_each_possible_cpu(i) { struct mpls_pcpu_stats *mpls_stats; mpls_stats = per_cpu_ptr(mdev->stats, i); u64_stats_init(&mpls_stats->syncp); } mdev->dev = dev; err = mpls_dev_sysctl_register(dev, mdev); if (err) goto free; rcu_assign_pointer(dev->mpls_ptr, mdev); return mdev; free: free_percpu(mdev->stats); kfree(mdev); return ERR_PTR(err); } static void mpls_dev_destroy_rcu(struct rcu_head *head) { struct mpls_dev *mdev = container_of(head, struct mpls_dev, rcu); free_percpu(mdev->stats); kfree(mdev); } static int mpls_ifdown(struct net_device *dev, int event) { struct mpls_route __rcu **platform_label; struct net *net = dev_net(dev); unsigned index; platform_label = rtnl_dereference(net->mpls.platform_label); for (index = 0; index < net->mpls.platform_labels; index++) { struct mpls_route *rt = rtnl_dereference(platform_label[index]); bool nh_del = false; u8 alive = 0; if (!rt) continue; if (event == NETDEV_UNREGISTER) { u8 deleted = 0; for_nexthops(rt) { if (!nh->nh_dev || nh->nh_dev == dev) deleted++; if (nh->nh_dev == dev) nh_del = true; } endfor_nexthops(rt); /* if there are no more nexthops, delete the route */ if (deleted == rt->rt_nhn) { mpls_route_update(net, index, NULL, NULL); continue; } if (nh_del) { size_t size = sizeof(*rt) + rt->rt_nhn * rt->rt_nh_size; struct mpls_route *orig = rt; rt = kmemdup(orig, size, GFP_KERNEL); if (!rt) return -ENOMEM; } } change_nexthops(rt) { unsigned int nh_flags = nh->nh_flags; if (nh->nh_dev != dev) goto next; switch (event) { case NETDEV_DOWN: case NETDEV_UNREGISTER: nh_flags |= RTNH_F_DEAD; fallthrough; case NETDEV_CHANGE: nh_flags |= RTNH_F_LINKDOWN; break; } if (event == NETDEV_UNREGISTER) nh->nh_dev = NULL; if (nh->nh_flags != nh_flags) WRITE_ONCE(nh->nh_flags, nh_flags); next: if (!(nh_flags & (RTNH_F_DEAD | RTNH_F_LINKDOWN))) alive++; } endfor_nexthops(rt); WRITE_ONCE(rt->rt_nhn_alive, alive); if (nh_del) mpls_route_update(net, index, rt, NULL); } return 0; } static void mpls_ifup(struct net_device *dev, unsigned int flags) { struct mpls_route __rcu **platform_label; struct net *net = dev_net(dev); unsigned index; u8 alive; platform_label = rtnl_dereference(net->mpls.platform_label); for (index = 0; index < net->mpls.platform_labels; index++) { struct mpls_route *rt = rtnl_dereference(platform_label[index]); if (!rt) continue; alive = 0; change_nexthops(rt) { unsigned int nh_flags = nh->nh_flags; if (!(nh_flags & flags)) { alive++; continue; } if (nh->nh_dev != dev) continue; alive++; nh_flags &= ~flags; WRITE_ONCE(nh->nh_flags, nh_flags); } endfor_nexthops(rt); WRITE_ONCE(rt->rt_nhn_alive, alive); } } static int mpls_dev_notify(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct mpls_dev *mdev; unsigned int flags; int err; if (event == NETDEV_REGISTER) { mdev = mpls_add_dev(dev); if (IS_ERR(mdev)) return notifier_from_errno(PTR_ERR(mdev)); return NOTIFY_OK; } mdev = mpls_dev_get(dev); if (!mdev) return NOTIFY_OK; switch (event) { case NETDEV_DOWN: err = mpls_ifdown(dev, event); if (err) return notifier_from_errno(err); break; case NETDEV_UP: flags = dev_get_flags(dev); if (flags & (IFF_RUNNING | IFF_LOWER_UP)) mpls_ifup(dev, RTNH_F_DEAD | RTNH_F_LINKDOWN); else mpls_ifup(dev, RTNH_F_DEAD); break; case NETDEV_CHANGE: flags = dev_get_flags(dev); if (flags & (IFF_RUNNING | IFF_LOWER_UP)) { mpls_ifup(dev, RTNH_F_DEAD | RTNH_F_LINKDOWN); } else { err = mpls_ifdown(dev, event); if (err) return notifier_from_errno(err); } break; case NETDEV_UNREGISTER: err = mpls_ifdown(dev, event); if (err) return notifier_from_errno(err); mdev = mpls_dev_get(dev); if (mdev) { mpls_dev_sysctl_unregister(dev, mdev); RCU_INIT_POINTER(dev->mpls_ptr, NULL); call_rcu(&mdev->rcu, mpls_dev_destroy_rcu); } break; case NETDEV_CHANGENAME: mdev = mpls_dev_get(dev); if (mdev) { mpls_dev_sysctl_unregister(dev, mdev); err = mpls_dev_sysctl_register(dev, mdev); if (err) return notifier_from_errno(err); } break; } return NOTIFY_OK; } static struct notifier_block mpls_dev_notifier = { .notifier_call = mpls_dev_notify, }; static int nla_put_via(struct sk_buff *skb, u8 table, const void *addr, int alen) { static const int table_to_family[NEIGH_NR_TABLES + 1] = { AF_INET, AF_INET6, AF_DECnet, AF_PACKET, }; struct nlattr *nla; struct rtvia *via; int family = AF_UNSPEC; nla = nla_reserve(skb, RTA_VIA, alen + 2); if (!nla) return -EMSGSIZE; if (table <= NEIGH_NR_TABLES) family = table_to_family[table]; via = nla_data(nla); via->rtvia_family = family; memcpy(via->rtvia_addr, addr, alen); return 0; } int nla_put_labels(struct sk_buff *skb, int attrtype, u8 labels, const u32 label[]) { struct nlattr *nla; struct mpls_shim_hdr *nla_label; bool bos; int i; nla = nla_reserve(skb, attrtype, labels*4); if (!nla) return -EMSGSIZE; nla_label = nla_data(nla); bos = true; for (i = labels - 1; i >= 0; i--) { nla_label[i] = mpls_entry_encode(label[i], 0, 0, bos); bos = false; } return 0; } EXPORT_SYMBOL_GPL(nla_put_labels); int nla_get_labels(const struct nlattr *nla, u8 max_labels, u8 *labels, u32 label[], struct netlink_ext_ack *extack) { unsigned len = nla_len(nla); struct mpls_shim_hdr *nla_label; u8 nla_labels; bool bos; int i; /* len needs to be an even multiple of 4 (the label size). Number * of labels is a u8 so check for overflow. */ if (len & 3 || len / 4 > 255) { NL_SET_ERR_MSG_ATTR(extack, nla, "Invalid length for labels attribute"); return -EINVAL; } /* Limit the number of new labels allowed */ nla_labels = len/4; if (nla_labels > max_labels) { NL_SET_ERR_MSG(extack, "Too many labels"); return -EINVAL; } /* when label == NULL, caller wants number of labels */ if (!label) goto out; nla_label = nla_data(nla); bos = true; for (i = nla_labels - 1; i >= 0; i--, bos = false) { struct mpls_entry_decoded dec; dec = mpls_entry_decode(nla_label + i); /* Ensure the bottom of stack flag is properly set * and ttl and tc are both clear. */ if (dec.ttl) { NL_SET_ERR_MSG_ATTR(extack, nla, "TTL in label must be 0"); return -EINVAL; } if (dec.tc) { NL_SET_ERR_MSG_ATTR(extack, nla, "Traffic class in label must be 0"); return -EINVAL; } if (dec.bos != bos) { NL_SET_BAD_ATTR(extack, nla); if (bos) { NL_SET_ERR_MSG(extack, "BOS bit must be set in first label"); } else { NL_SET_ERR_MSG(extack, "BOS bit can only be set in first label"); } return -EINVAL; } switch (dec.label) { case MPLS_LABEL_IMPLNULL: /* RFC3032: This is a label that an LSR may * assign and distribute, but which never * actually appears in the encapsulation. */ NL_SET_ERR_MSG_ATTR(extack, nla, "Implicit NULL Label (3) can not be used in encapsulation"); return -EINVAL; } label[i] = dec.label; } out: *labels = nla_labels; return 0; } EXPORT_SYMBOL_GPL(nla_get_labels); static int rtm_to_route_config(struct sk_buff *skb, struct nlmsghdr *nlh, struct mpls_route_config *cfg, struct netlink_ext_ack *extack) { struct rtmsg *rtm; struct nlattr *tb[RTA_MAX+1]; int index; int err; err = nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_mpls_policy, extack); if (err < 0) goto errout; err = -EINVAL; rtm = nlmsg_data(nlh); if (rtm->rtm_family != AF_MPLS) { NL_SET_ERR_MSG(extack, "Invalid address family in rtmsg"); goto errout; } if (rtm->rtm_dst_len != 20) { NL_SET_ERR_MSG(extack, "rtm_dst_len must be 20 for MPLS"); goto errout; } if (rtm->rtm_src_len != 0) { NL_SET_ERR_MSG(extack, "rtm_src_len must be 0 for MPLS"); goto errout; } if (rtm->rtm_tos != 0) { NL_SET_ERR_MSG(extack, "rtm_tos must be 0 for MPLS"); goto errout; } if (rtm->rtm_table != RT_TABLE_MAIN) { NL_SET_ERR_MSG(extack, "MPLS only supports the main route table"); goto errout; } /* Any value is acceptable for rtm_protocol */ /* As mpls uses destination specific addresses * (or source specific address in the case of multicast) * all addresses have universal scope. */ if (rtm->rtm_scope != RT_SCOPE_UNIVERSE) { NL_SET_ERR_MSG(extack, "Invalid route scope - MPLS only supports UNIVERSE"); goto errout; } if (rtm->rtm_type != RTN_UNICAST) { NL_SET_ERR_MSG(extack, "Invalid route type - MPLS only supports UNICAST"); goto errout; } if (rtm->rtm_flags != 0) { NL_SET_ERR_MSG(extack, "rtm_flags must be 0 for MPLS"); goto errout; } cfg->rc_label = LABEL_NOT_SPECIFIED; cfg->rc_protocol = rtm->rtm_protocol; cfg->rc_via_table = MPLS_NEIGH_TABLE_UNSPEC; cfg->rc_ttl_propagate = MPLS_TTL_PROP_DEFAULT; cfg->rc_nlflags = nlh->nlmsg_flags; cfg->rc_nlinfo.portid = NETLINK_CB(skb).portid; cfg->rc_nlinfo.nlh = nlh; cfg->rc_nlinfo.nl_net = sock_net(skb->sk); for (index = 0; index <= RTA_MAX; index++) { struct nlattr *nla = tb[index]; if (!nla) continue; switch (index) { case RTA_OIF: cfg->rc_ifindex = nla_get_u32(nla); break; case RTA_NEWDST: if (nla_get_labels(nla, MAX_NEW_LABELS, &cfg->rc_output_labels, cfg->rc_output_label, extack)) goto errout; break; case RTA_DST: { u8 label_count; if (nla_get_labels(nla, 1, &label_count, &cfg->rc_label, extack)) goto errout; if (!mpls_label_ok(cfg->rc_nlinfo.nl_net, &cfg->rc_label, extack)) goto errout; break; } case RTA_GATEWAY: NL_SET_ERR_MSG(extack, "MPLS does not support RTA_GATEWAY attribute"); goto errout; case RTA_VIA: { if (nla_get_via(nla, &cfg->rc_via_alen, &cfg->rc_via_table, cfg->rc_via, extack)) goto errout; break; } case RTA_MULTIPATH: { cfg->rc_mp = nla_data(nla); cfg->rc_mp_len = nla_len(nla); break; } case RTA_TTL_PROPAGATE: { u8 ttl_propagate = nla_get_u8(nla); if (ttl_propagate > 1) { NL_SET_ERR_MSG_ATTR(extack, nla, "RTA_TTL_PROPAGATE can only be 0 or 1"); goto errout; } cfg->rc_ttl_propagate = ttl_propagate ? MPLS_TTL_PROP_ENABLED : MPLS_TTL_PROP_DISABLED; break; } default: NL_SET_ERR_MSG_ATTR(extack, nla, "Unknown attribute"); /* Unsupported attribute */ goto errout; } } err = 0; errout: return err; } static int mpls_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct mpls_route_config *cfg; int err; cfg = kzalloc(sizeof(*cfg), GFP_KERNEL); if (!cfg) return -ENOMEM; err = rtm_to_route_config(skb, nlh, cfg, extack); if (err < 0) goto out; err = mpls_route_del(cfg, extack); out: kfree(cfg); return err; } static int mpls_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct mpls_route_config *cfg; int err; cfg = kzalloc(sizeof(*cfg), GFP_KERNEL); if (!cfg) return -ENOMEM; err = rtm_to_route_config(skb, nlh, cfg, extack); if (err < 0) goto out; err = mpls_route_add(cfg, extack); out: kfree(cfg); return err; } static int mpls_dump_route(struct sk_buff *skb, u32 portid, u32 seq, int event, u32 label, struct mpls_route *rt, int flags) { struct net_device *dev; struct nlmsghdr *nlh; struct rtmsg *rtm; nlh = nlmsg_put(skb, portid, seq, event, sizeof(*rtm), flags); if (nlh == NULL) return -EMSGSIZE; rtm = nlmsg_data(nlh); rtm->rtm_family = AF_MPLS; rtm->rtm_dst_len = 20; rtm->rtm_src_len = 0; rtm->rtm_tos = 0; rtm->rtm_table = RT_TABLE_MAIN; rtm->rtm_protocol = rt->rt_protocol; rtm->rtm_scope = RT_SCOPE_UNIVERSE; rtm->rtm_type = RTN_UNICAST; rtm->rtm_flags = 0; if (nla_put_labels(skb, RTA_DST, 1, &label)) goto nla_put_failure; if (rt->rt_ttl_propagate != MPLS_TTL_PROP_DEFAULT) { bool ttl_propagate = rt->rt_ttl_propagate == MPLS_TTL_PROP_ENABLED; if (nla_put_u8(skb, RTA_TTL_PROPAGATE, ttl_propagate)) goto nla_put_failure; } if (rt->rt_nhn == 1) { const struct mpls_nh *nh = rt->rt_nh; if (nh->nh_labels && nla_put_labels(skb, RTA_NEWDST, nh->nh_labels, nh->nh_label)) goto nla_put_failure; if (nh->nh_via_table != MPLS_NEIGH_TABLE_UNSPEC && nla_put_via(skb, nh->nh_via_table, mpls_nh_via(rt, nh), nh->nh_via_alen)) goto nla_put_failure; dev = nh->nh_dev; if (dev && nla_put_u32(skb, RTA_OIF, dev->ifindex)) goto nla_put_failure; if (nh->nh_flags & RTNH_F_LINKDOWN) rtm->rtm_flags |= RTNH_F_LINKDOWN; if (nh->nh_flags & RTNH_F_DEAD) rtm->rtm_flags |= RTNH_F_DEAD; } else { struct rtnexthop *rtnh; struct nlattr *mp; u8 linkdown = 0; u8 dead = 0; mp = nla_nest_start_noflag(skb, RTA_MULTIPATH); if (!mp) goto nla_put_failure; for_nexthops(rt) { dev = nh->nh_dev; if (!dev) continue; rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh)); if (!rtnh) goto nla_put_failure; rtnh->rtnh_ifindex = dev->ifindex; if (nh->nh_flags & RTNH_F_LINKDOWN) { rtnh->rtnh_flags |= RTNH_F_LINKDOWN; linkdown++; } if (nh->nh_flags & RTNH_F_DEAD) { rtnh->rtnh_flags |= RTNH_F_DEAD; dead++; } if (nh->nh_labels && nla_put_labels(skb, RTA_NEWDST, nh->nh_labels, nh->nh_label)) goto nla_put_failure; if (nh->nh_via_table != MPLS_NEIGH_TABLE_UNSPEC && nla_put_via(skb, nh->nh_via_table, mpls_nh_via(rt, nh), nh->nh_via_alen)) goto nla_put_failure; /* length of rtnetlink header + attributes */ rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh; } endfor_nexthops(rt); if (linkdown == rt->rt_nhn) rtm->rtm_flags |= RTNH_F_LINKDOWN; if (dead == rt->rt_nhn) rtm->rtm_flags |= RTNH_F_DEAD; nla_nest_end(skb, mp); } nlmsg_end(skb, nlh); return 0; nla_put_failure: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } #if IS_ENABLED(CONFIG_INET) static int mpls_valid_fib_dump_req(struct net *net, const struct nlmsghdr *nlh, struct fib_dump_filter *filter, struct netlink_callback *cb) { return ip_valid_fib_dump_req(net, nlh, filter, cb); } #else static int mpls_valid_fib_dump_req(struct net *net, const struct nlmsghdr *nlh, struct fib_dump_filter *filter, struct netlink_callback *cb) { struct netlink_ext_ack *extack = cb->extack; struct nlattr *tb[RTA_MAX + 1]; struct rtmsg *rtm; int err, i; if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) { NL_SET_ERR_MSG_MOD(extack, "Invalid header for FIB dump request"); return -EINVAL; } rtm = nlmsg_data(nlh); if (rtm->rtm_dst_len || rtm->rtm_src_len || rtm->rtm_tos || rtm->rtm_table || rtm->rtm_scope || rtm->rtm_type || rtm->rtm_flags) { NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for FIB dump request"); return -EINVAL; } if (rtm->rtm_protocol) { filter->protocol = rtm->rtm_protocol; filter->filter_set = 1; cb->answer_flags = NLM_F_DUMP_FILTERED; } err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_mpls_policy, extack); if (err < 0) return err; for (i = 0; i <= RTA_MAX; ++i) { int ifindex; if (i == RTA_OIF) { ifindex = nla_get_u32(tb[i]); filter->dev = __dev_get_by_index(net, ifindex); if (!filter->dev) return -ENODEV; filter->filter_set = 1; } else if (tb[i]) { NL_SET_ERR_MSG_MOD(extack, "Unsupported attribute in dump request"); return -EINVAL; } } return 0; } #endif static bool mpls_rt_uses_dev(struct mpls_route *rt, const struct net_device *dev) { if (rt->rt_nhn == 1) { struct mpls_nh *nh = rt->rt_nh; if (nh->nh_dev == dev) return true; } else { for_nexthops(rt) { if (nh->nh_dev == dev) return true; } endfor_nexthops(rt); } return false; } static int mpls_dump_routes(struct sk_buff *skb, struct netlink_callback *cb) { const struct nlmsghdr *nlh = cb->nlh; struct net *net = sock_net(skb->sk); struct mpls_route __rcu **platform_label; struct fib_dump_filter filter = {}; unsigned int flags = NLM_F_MULTI; size_t platform_labels; unsigned int index; ASSERT_RTNL(); if (cb->strict_check) { int err; err = mpls_valid_fib_dump_req(net, nlh, &filter, cb); if (err < 0) return err; /* for MPLS, there is only 1 table with fixed type and flags. * If either are set in the filter then return nothing. */ if ((filter.table_id && filter.table_id != RT_TABLE_MAIN) || (filter.rt_type && filter.rt_type != RTN_UNICAST) || filter.flags) return skb->len; } index = cb->args[0]; if (index < MPLS_LABEL_FIRST_UNRESERVED) index = MPLS_LABEL_FIRST_UNRESERVED; platform_label = rtnl_dereference(net->mpls.platform_label); platform_labels = net->mpls.platform_labels; if (filter.filter_set) flags |= NLM_F_DUMP_FILTERED; for (; index < platform_labels; index++) { struct mpls_route *rt; rt = rtnl_dereference(platform_label[index]); if (!rt) continue; if ((filter.dev && !mpls_rt_uses_dev(rt, filter.dev)) || (filter.protocol && rt->rt_protocol != filter.protocol)) continue; if (mpls_dump_route(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, RTM_NEWROUTE, index, rt, flags) < 0) break; } cb->args[0] = index; return skb->len; } static inline size_t lfib_nlmsg_size(struct mpls_route *rt) { size_t payload = NLMSG_ALIGN(sizeof(struct rtmsg)) + nla_total_size(4) /* RTA_DST */ + nla_total_size(1); /* RTA_TTL_PROPAGATE */ if (rt->rt_nhn == 1) { struct mpls_nh *nh = rt->rt_nh; if (nh->nh_dev) payload += nla_total_size(4); /* RTA_OIF */ if (nh->nh_via_table != MPLS_NEIGH_TABLE_UNSPEC) /* RTA_VIA */ payload += nla_total_size(2 + nh->nh_via_alen); if (nh->nh_labels) /* RTA_NEWDST */ payload += nla_total_size(nh->nh_labels * 4); } else { /* each nexthop is packed in an attribute */ size_t nhsize = 0; for_nexthops(rt) { if (!nh->nh_dev) continue; nhsize += nla_total_size(sizeof(struct rtnexthop)); /* RTA_VIA */ if (nh->nh_via_table != MPLS_NEIGH_TABLE_UNSPEC) nhsize += nla_total_size(2 + nh->nh_via_alen); if (nh->nh_labels) nhsize += nla_total_size(nh->nh_labels * 4); } endfor_nexthops(rt); /* nested attribute */ payload += nla_total_size(nhsize); } return payload; } static void rtmsg_lfib(int event, u32 label, struct mpls_route *rt, struct nlmsghdr *nlh, struct net *net, u32 portid, unsigned int nlm_flags) { struct sk_buff *skb; u32 seq = nlh ? nlh->nlmsg_seq : 0; int err = -ENOBUFS; skb = nlmsg_new(lfib_nlmsg_size(rt), GFP_KERNEL); if (skb == NULL) goto errout; err = mpls_dump_route(skb, portid, seq, event, label, rt, nlm_flags); if (err < 0) { /* -EMSGSIZE implies BUG in lfib_nlmsg_size */ WARN_ON(err == -EMSGSIZE); kfree_skb(skb); goto errout; } rtnl_notify(skb, net, portid, RTNLGRP_MPLS_ROUTE, nlh, GFP_KERNEL); return; errout: if (err < 0) rtnl_set_sk_err(net, RTNLGRP_MPLS_ROUTE, err); } static int mpls_valid_getroute_req(struct sk_buff *skb, const struct nlmsghdr *nlh, struct nlattr **tb, struct netlink_ext_ack *extack) { struct rtmsg *rtm; int i, err; if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) { NL_SET_ERR_MSG_MOD(extack, "Invalid header for get route request"); return -EINVAL; } if (!netlink_strict_get_check(skb)) return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_mpls_policy, extack); rtm = nlmsg_data(nlh); if ((rtm->rtm_dst_len && rtm->rtm_dst_len != 20) || rtm->rtm_src_len || rtm->rtm_tos || rtm->rtm_table || rtm->rtm_protocol || rtm->rtm_scope || rtm->rtm_type) { NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for get route request"); return -EINVAL; } if (rtm->rtm_flags & ~RTM_F_FIB_MATCH) { NL_SET_ERR_MSG_MOD(extack, "Invalid flags for get route request"); return -EINVAL; } err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_mpls_policy, extack); if (err) return err; if ((tb[RTA_DST] || tb[RTA_NEWDST]) && !rtm->rtm_dst_len) { NL_SET_ERR_MSG_MOD(extack, "rtm_dst_len must be 20 for MPLS"); return -EINVAL; } for (i = 0; i <= RTA_MAX; i++) { if (!tb[i]) continue; switch (i) { case RTA_DST: case RTA_NEWDST: break; default: NL_SET_ERR_MSG_MOD(extack, "Unsupported attribute in get route request"); return -EINVAL; } } return 0; } static int mpls_getroute(struct sk_buff *in_skb, struct nlmsghdr *in_nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(in_skb->sk); u32 portid = NETLINK_CB(in_skb).portid; u32 in_label = LABEL_NOT_SPECIFIED; struct nlattr *tb[RTA_MAX + 1]; u32 labels[MAX_NEW_LABELS]; struct mpls_shim_hdr *hdr; unsigned int hdr_size = 0; const struct mpls_nh *nh; struct net_device *dev; struct mpls_route *rt; struct rtmsg *rtm, *r; struct nlmsghdr *nlh; struct sk_buff *skb; u8 n_labels; int err; err = mpls_valid_getroute_req(in_skb, in_nlh, tb, extack); if (err < 0) goto errout; rtm = nlmsg_data(in_nlh); if (tb[RTA_DST]) { u8 label_count; if (nla_get_labels(tb[RTA_DST], 1, &label_count, &in_label, extack)) { err = -EINVAL; goto errout; } if (!mpls_label_ok(net, &in_label, extack)) { err = -EINVAL; goto errout; } } rt = mpls_route_input_rcu(net, in_label); if (!rt) { err = -ENETUNREACH; goto errout; } if (rtm->rtm_flags & RTM_F_FIB_MATCH) { skb = nlmsg_new(lfib_nlmsg_size(rt), GFP_KERNEL); if (!skb) { err = -ENOBUFS; goto errout; } err = mpls_dump_route(skb, portid, in_nlh->nlmsg_seq, RTM_NEWROUTE, in_label, rt, 0); if (err < 0) { /* -EMSGSIZE implies BUG in lfib_nlmsg_size */ WARN_ON(err == -EMSGSIZE); goto errout_free; } return rtnl_unicast(skb, net, portid); } if (tb[RTA_NEWDST]) { if (nla_get_labels(tb[RTA_NEWDST], MAX_NEW_LABELS, &n_labels, labels, extack) != 0) { err = -EINVAL; goto errout; } hdr_size = n_labels * sizeof(struct mpls_shim_hdr); } skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb) { err = -ENOBUFS; goto errout; } skb->protocol = htons(ETH_P_MPLS_UC); if (hdr_size) { bool bos; int i; if (skb_cow(skb, hdr_size)) { err = -ENOBUFS; goto errout_free; } skb_reserve(skb, hdr_size); skb_push(skb, hdr_size); skb_reset_network_header(skb); /* Push new labels */ hdr = mpls_hdr(skb); bos = true; for (i = n_labels - 1; i >= 0; i--) { hdr[i] = mpls_entry_encode(labels[i], 1, 0, bos); bos = false; } } nh = mpls_select_multipath(rt, skb); if (!nh) { err = -ENETUNREACH; goto errout_free; } if (hdr_size) { skb_pull(skb, hdr_size); skb_reset_network_header(skb); } nlh = nlmsg_put(skb, portid, in_nlh->nlmsg_seq, RTM_NEWROUTE, sizeof(*r), 0); if (!nlh) { err = -EMSGSIZE; goto errout_free; } r = nlmsg_data(nlh); r->rtm_family = AF_MPLS; r->rtm_dst_len = 20; r->rtm_src_len = 0; r->rtm_table = RT_TABLE_MAIN; r->rtm_type = RTN_UNICAST; r->rtm_scope = RT_SCOPE_UNIVERSE; r->rtm_protocol = rt->rt_protocol; r->rtm_flags = 0; if (nla_put_labels(skb, RTA_DST, 1, &in_label)) goto nla_put_failure; if (nh->nh_labels && nla_put_labels(skb, RTA_NEWDST, nh->nh_labels, nh->nh_label)) goto nla_put_failure; if (nh->nh_via_table != MPLS_NEIGH_TABLE_UNSPEC && nla_put_via(skb, nh->nh_via_table, mpls_nh_via(rt, nh), nh->nh_via_alen)) goto nla_put_failure; dev = nh->nh_dev; if (dev && nla_put_u32(skb, RTA_OIF, dev->ifindex)) goto nla_put_failure; nlmsg_end(skb, nlh); err = rtnl_unicast(skb, net, portid); errout: return err; nla_put_failure: nlmsg_cancel(skb, nlh); err = -EMSGSIZE; errout_free: kfree_skb(skb); return err; } static int resize_platform_label_table(struct net *net, size_t limit) { size_t size = sizeof(struct mpls_route *) * limit; size_t old_limit; size_t cp_size; struct mpls_route __rcu **labels = NULL, **old; struct mpls_route *rt0 = NULL, *rt2 = NULL; unsigned index; if (size) { labels = kvzalloc(size, GFP_KERNEL); if (!labels) goto nolabels; } /* In case the predefined labels need to be populated */ if (limit > MPLS_LABEL_IPV4NULL) { struct net_device *lo = net->loopback_dev; rt0 = mpls_rt_alloc(1, lo->addr_len, 0); if (IS_ERR(rt0)) goto nort0; rt0->rt_nh->nh_dev = lo; rt0->rt_protocol = RTPROT_KERNEL; rt0->rt_payload_type = MPT_IPV4; rt0->rt_ttl_propagate = MPLS_TTL_PROP_DEFAULT; rt0->rt_nh->nh_via_table = NEIGH_LINK_TABLE; rt0->rt_nh->nh_via_alen = lo->addr_len; memcpy(__mpls_nh_via(rt0, rt0->rt_nh), lo->dev_addr, lo->addr_len); } if (limit > MPLS_LABEL_IPV6NULL) { struct net_device *lo = net->loopback_dev; rt2 = mpls_rt_alloc(1, lo->addr_len, 0); if (IS_ERR(rt2)) goto nort2; rt2->rt_nh->nh_dev = lo; rt2->rt_protocol = RTPROT_KERNEL; rt2->rt_payload_type = MPT_IPV6; rt2->rt_ttl_propagate = MPLS_TTL_PROP_DEFAULT; rt2->rt_nh->nh_via_table = NEIGH_LINK_TABLE; rt2->rt_nh->nh_via_alen = lo->addr_len; memcpy(__mpls_nh_via(rt2, rt2->rt_nh), lo->dev_addr, lo->addr_len); } rtnl_lock(); /* Remember the original table */ old = rtnl_dereference(net->mpls.platform_label); old_limit = net->mpls.platform_labels; /* Free any labels beyond the new table */ for (index = limit; index < old_limit; index++) mpls_route_update(net, index, NULL, NULL); /* Copy over the old labels */ cp_size = size; if (old_limit < limit) cp_size = old_limit * sizeof(struct mpls_route *); memcpy(labels, old, cp_size); /* If needed set the predefined labels */ if ((old_limit <= MPLS_LABEL_IPV6NULL) && (limit > MPLS_LABEL_IPV6NULL)) { RCU_INIT_POINTER(labels[MPLS_LABEL_IPV6NULL], rt2); rt2 = NULL; } if ((old_limit <= MPLS_LABEL_IPV4NULL) && (limit > MPLS_LABEL_IPV4NULL)) { RCU_INIT_POINTER(labels[MPLS_LABEL_IPV4NULL], rt0); rt0 = NULL; } /* Update the global pointers */ net->mpls.platform_labels = limit; rcu_assign_pointer(net->mpls.platform_label, labels); rtnl_unlock(); mpls_rt_free(rt2); mpls_rt_free(rt0); if (old) { synchronize_rcu(); kvfree(old); } return 0; nort2: mpls_rt_free(rt0); nort0: kvfree(labels); nolabels: return -ENOMEM; } static int mpls_platform_labels(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct net *net = table->data; int platform_labels = net->mpls.platform_labels; int ret; struct ctl_table tmp = { .procname = table->procname, .data = &platform_labels, .maxlen = sizeof(int), .mode = table->mode, .extra1 = SYSCTL_ZERO, .extra2 = &label_limit, }; ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); if (write && ret == 0) ret = resize_platform_label_table(net, platform_labels); return ret; } #define MPLS_NS_SYSCTL_OFFSET(field) \ (&((struct net *)0)->field) static const struct ctl_table mpls_table[] = { { .procname = "platform_labels", .data = NULL, .maxlen = sizeof(int), .mode = 0644, .proc_handler = mpls_platform_labels, }, { .procname = "ip_ttl_propagate", .data = MPLS_NS_SYSCTL_OFFSET(mpls.ip_ttl_propagate), .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, { .procname = "default_ttl", .data = MPLS_NS_SYSCTL_OFFSET(mpls.default_ttl), .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ONE, .extra2 = &ttl_max, }, { } }; static int mpls_net_init(struct net *net) { struct ctl_table *table; int i; net->mpls.platform_labels = 0; net->mpls.platform_label = NULL; net->mpls.ip_ttl_propagate = 1; net->mpls.default_ttl = 255; table = kmemdup(mpls_table, sizeof(mpls_table), GFP_KERNEL); if (table == NULL) return -ENOMEM; /* Table data contains only offsets relative to the base of * the mdev at this point, so make them absolute. */ for (i = 0; i < ARRAY_SIZE(mpls_table) - 1; i++) table[i].data = (char *)net + (uintptr_t)table[i].data; net->mpls.ctl = register_net_sysctl_sz(net, "net/mpls", table, ARRAY_SIZE(mpls_table)); if (net->mpls.ctl == NULL) { kfree(table); return -ENOMEM; } return 0; } static void mpls_net_exit(struct net *net) { struct mpls_route __rcu **platform_label; size_t platform_labels; struct ctl_table *table; unsigned int index; table = net->mpls.ctl->ctl_table_arg; unregister_net_sysctl_table(net->mpls.ctl); kfree(table); /* An rcu grace period has passed since there was a device in * the network namespace (and thus the last in flight packet) * left this network namespace. This is because * unregister_netdevice_many and netdev_run_todo has completed * for each network device that was in this network namespace. * * As such no additional rcu synchronization is necessary when * freeing the platform_label table. */ rtnl_lock(); platform_label = rtnl_dereference(net->mpls.platform_label); platform_labels = net->mpls.platform_labels; for (index = 0; index < platform_labels; index++) { struct mpls_route *rt = rtnl_dereference(platform_label[index]); RCU_INIT_POINTER(platform_label[index], NULL); mpls_notify_route(net, index, rt, NULL, NULL); mpls_rt_free(rt); } rtnl_unlock(); kvfree(platform_label); } static struct pernet_operations mpls_net_ops = { .init = mpls_net_init, .exit = mpls_net_exit, }; static struct rtnl_af_ops mpls_af_ops __read_mostly = { .family = AF_MPLS, .fill_stats_af = mpls_fill_stats_af, .get_stats_af_size = mpls_get_stats_af_size, }; static int __init mpls_init(void) { int err; BUILD_BUG_ON(sizeof(struct mpls_shim_hdr) != 4); err = register_pernet_subsys(&mpls_net_ops); if (err) goto out; err = register_netdevice_notifier(&mpls_dev_notifier); if (err) goto out_unregister_pernet; dev_add_pack(&mpls_packet_type); rtnl_af_register(&mpls_af_ops); rtnl_register_module(THIS_MODULE, PF_MPLS, RTM_NEWROUTE, mpls_rtm_newroute, NULL, 0); rtnl_register_module(THIS_MODULE, PF_MPLS, RTM_DELROUTE, mpls_rtm_delroute, NULL, 0); rtnl_register_module(THIS_MODULE, PF_MPLS, RTM_GETROUTE, mpls_getroute, mpls_dump_routes, 0); rtnl_register_module(THIS_MODULE, PF_MPLS, RTM_GETNETCONF, mpls_netconf_get_devconf, mpls_netconf_dump_devconf, 0); err = ipgre_tunnel_encap_add_mpls_ops(); if (err) pr_err("Can't add mpls over gre tunnel ops\n"); err = 0; out: return err; out_unregister_pernet: unregister_pernet_subsys(&mpls_net_ops); goto out; } module_init(mpls_init); static void __exit mpls_exit(void) { rtnl_unregister_all(PF_MPLS); rtnl_af_unregister(&mpls_af_ops); dev_remove_pack(&mpls_packet_type); unregister_netdevice_notifier(&mpls_dev_notifier); unregister_pernet_subsys(&mpls_net_ops); ipgre_tunnel_encap_del_mpls_ops(); } module_exit(mpls_exit); MODULE_DESCRIPTION("MultiProtocol Label Switching"); MODULE_LICENSE("GPL v2"); MODULE_ALIAS_NETPROTO(PF_MPLS); |
| 2 1 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 | // SPDX-License-Identifier: GPL-2.0-or-later /* RxRPC key management * * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) * * RxRPC keys should have a description of describing their purpose: * "afs@CAMBRIDGE.REDHAT.COM> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <crypto/skcipher.h> #include <linux/module.h> #include <linux/net.h> #include <linux/skbuff.h> #include <linux/key-type.h> #include <linux/ctype.h> #include <linux/slab.h> #include <net/sock.h> #include <net/af_rxrpc.h> #include <keys/rxrpc-type.h> #include <keys/user-type.h> #include "ar-internal.h" static int rxrpc_vet_description_s(const char *); static int rxrpc_preparse_s(struct key_preparsed_payload *); static void rxrpc_free_preparse_s(struct key_preparsed_payload *); static void rxrpc_destroy_s(struct key *); static void rxrpc_describe_s(const struct key *, struct seq_file *); /* * rxrpc server keys take "<serviceId>:<securityIndex>[:<sec-specific>]" as the * description and the key material as the payload. */ struct key_type key_type_rxrpc_s = { .name = "rxrpc_s", .flags = KEY_TYPE_NET_DOMAIN, .vet_description = rxrpc_vet_description_s, .preparse = rxrpc_preparse_s, .free_preparse = rxrpc_free_preparse_s, .instantiate = generic_key_instantiate, .destroy = rxrpc_destroy_s, .describe = rxrpc_describe_s, }; /* * Vet the description for an RxRPC server key. */ static int rxrpc_vet_description_s(const char *desc) { unsigned long service, sec_class; char *p; service = simple_strtoul(desc, &p, 10); if (*p != ':' || service > 65535) return -EINVAL; sec_class = simple_strtoul(p + 1, &p, 10); if ((*p && *p != ':') || sec_class < 1 || sec_class > 255) return -EINVAL; return 0; } /* * Preparse a server secret key. */ static int rxrpc_preparse_s(struct key_preparsed_payload *prep) { const struct rxrpc_security *sec; unsigned int service, sec_class; int n; _enter("%zu", prep->datalen); if (!prep->orig_description) return -EINVAL; if (sscanf(prep->orig_description, "%u:%u%n", &service, &sec_class, &n) != 2) return -EINVAL; sec = rxrpc_security_lookup(sec_class); if (!sec) return -ENOPKG; prep->payload.data[1] = (struct rxrpc_security *)sec; if (!sec->preparse_server_key) return -EINVAL; return sec->preparse_server_key(prep); } static void rxrpc_free_preparse_s(struct key_preparsed_payload *prep) { const struct rxrpc_security *sec = prep->payload.data[1]; if (sec && sec->free_preparse_server_key) sec->free_preparse_server_key(prep); } static void rxrpc_destroy_s(struct key *key) { const struct rxrpc_security *sec = key->payload.data[1]; if (sec && sec->destroy_server_key) sec->destroy_server_key(key); } static void rxrpc_describe_s(const struct key *key, struct seq_file *m) { const struct rxrpc_security *sec = key->payload.data[1]; seq_puts(m, key->description); if (sec && sec->describe_server_key) sec->describe_server_key(key, m); } /* * grab the security keyring for a server socket */ int rxrpc_server_keyring(struct rxrpc_sock *rx, sockptr_t optval, int optlen) { struct key *key; char *description; _enter(""); if (optlen <= 0 || optlen > PAGE_SIZE - 1) return -EINVAL; description = memdup_sockptr_nul(optval, optlen); if (IS_ERR(description)) return PTR_ERR(description); key = request_key(&key_type_keyring, description, NULL); if (IS_ERR(key)) { kfree(description); _leave(" = %ld", PTR_ERR(key)); return PTR_ERR(key); } rx->securities = key; kfree(description); _leave(" = 0 [key %x]", key->serial); return 0; } /** * rxrpc_sock_set_security_keyring - Set the security keyring for a kernel service * @sk: The socket to set the keyring on * @keyring: The keyring to set * * Set the server security keyring on an rxrpc socket. This is used to provide * the encryption keys for a kernel service. */ int rxrpc_sock_set_security_keyring(struct sock *sk, struct key *keyring) { struct rxrpc_sock *rx = rxrpc_sk(sk); int ret = 0; lock_sock(sk); if (rx->securities) ret = -EINVAL; else if (rx->sk.sk_state != RXRPC_UNBOUND) ret = -EISCONN; else rx->securities = key_get(keyring); release_sock(sk); return ret; } EXPORT_SYMBOL(rxrpc_sock_set_security_keyring); |
| 805 804 531 530 530 389 431 431 431 68 74 3 409 408 409 476 475 475 476 474 177 5 6 35 35 12 35 1 35 35 24 24 24 24 33 10 5 5 35 36 33 23 36 33 33 36 36 36 35 35 35 31 35 36 36 409 409 409 63 1 1 9 409 409 377 21 372 26 20 322 119 385 46 8 8 38 62 60 60 27 55 60 52 12 2 2 2 314 314 314 95 95 314 314 314 177 177 139 30 177 314 400 17 400 293 358 357 17 352 9 60 1 60 2 2 291 1 4 293 298 298 93 375 375 366 366 368 366 84 367 368 367 67 9 2 9 4 9 4 9 9 9 350 2 450 450 450 358 409 26 26 3 409 406 25 10 10 6 4 17 19 19 1 18 374 374 65 8 7 8 7 7 7 11 408 71 71 10 6 816 818 817 595 734 816 817 817 817 817 818 818 818 2 818 433 433 815 816 817 818 50 50 50 50 50 50 18 13 17 3 50 50 5 5 4 2 50 158 158 158 158 157 7 1 6 158 172 17 172 172 172 172 172 172 171 172 172 172 172 77 167 151 151 172 167 2 2 167 167 132 124 124 167 166 167 145 145 144 145 144 94 74 167 165 165 305 305 177 177 6 305 305 305 273 273 272 266 305 177 167 304 305 255 305 305 305 301 304 305 305 304 305 2 253 807 808 807 807 806 251 805 806 805 805 808 806 808 808 807 806 808 807 808 808 808 251 251 808 807 802 801 806 806 773 229 229 739 773 773 741 741 272 741 9 9 741 773 773 772 773 690 229 234 27 27 27 235 235 235 235 27 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Linux INET6 implementation * Forwarding Information Database * * Authors: * Pedro Roque <roque@di.fc.ul.pt> * * Changes: * Yuji SEKIYA @USAGI: Support default route on router node; * remove ip6_null_entry from the top of * routing table. * Ville Nuorvala: Fixed routing subtrees. */ #define pr_fmt(fmt) "IPv6: " fmt #include <linux/bpf.h> #include <linux/errno.h> #include <linux/types.h> #include <linux/net.h> #include <linux/route.h> #include <linux/netdevice.h> #include <linux/in6.h> #include <linux/init.h> #include <linux/list.h> #include <linux/slab.h> #include <net/ip.h> #include <net/ipv6.h> #include <net/ndisc.h> #include <net/addrconf.h> #include <net/lwtunnel.h> #include <net/fib_notifier.h> #include <net/ip_fib.h> #include <net/ip6_fib.h> #include <net/ip6_route.h> static struct kmem_cache *fib6_node_kmem __read_mostly; struct fib6_cleaner { struct fib6_walker w; struct net *net; int (*func)(struct fib6_info *, void *arg); int sernum; void *arg; bool skip_notify; }; #ifdef CONFIG_IPV6_SUBTREES #define FWS_INIT FWS_S #else #define FWS_INIT FWS_L #endif static struct fib6_info *fib6_find_prefix(struct net *net, struct fib6_table *table, struct fib6_node *fn); static struct fib6_node *fib6_repair_tree(struct net *net, struct fib6_table *table, struct fib6_node *fn); static int fib6_walk(struct net *net, struct fib6_walker *w); static int fib6_walk_continue(struct fib6_walker *w); /* * A routing update causes an increase of the serial number on the * affected subtree. This allows for cached routes to be asynchronously * tested when modifications are made to the destination cache as a * result of redirects, path MTU changes, etc. */ static void fib6_gc_timer_cb(struct timer_list *t); #define FOR_WALKERS(net, w) \ list_for_each_entry(w, &(net)->ipv6.fib6_walkers, lh) static void fib6_walker_link(struct net *net, struct fib6_walker *w) { write_lock_bh(&net->ipv6.fib6_walker_lock); list_add(&w->lh, &net->ipv6.fib6_walkers); write_unlock_bh(&net->ipv6.fib6_walker_lock); } static void fib6_walker_unlink(struct net *net, struct fib6_walker *w) { write_lock_bh(&net->ipv6.fib6_walker_lock); list_del(&w->lh); write_unlock_bh(&net->ipv6.fib6_walker_lock); } static int fib6_new_sernum(struct net *net) { int new, old = atomic_read(&net->ipv6.fib6_sernum); do { new = old < INT_MAX ? old + 1 : 1; } while (!atomic_try_cmpxchg(&net->ipv6.fib6_sernum, &old, new)); return new; } enum { FIB6_NO_SERNUM_CHANGE = 0, }; void fib6_update_sernum(struct net *net, struct fib6_info *f6i) { struct fib6_node *fn; fn = rcu_dereference_protected(f6i->fib6_node, lockdep_is_held(&f6i->fib6_table->tb6_lock)); if (fn) WRITE_ONCE(fn->fn_sernum, fib6_new_sernum(net)); } /* * Auxiliary address test functions for the radix tree. * * These assume a 32bit processor (although it will work on * 64bit processors) */ /* * test bit */ #if defined(__LITTLE_ENDIAN) # define BITOP_BE32_SWIZZLE (0x1F & ~7) #else # define BITOP_BE32_SWIZZLE 0 #endif static __be32 addr_bit_set(const void *token, int fn_bit) { const __be32 *addr = token; /* * Here, * 1 << ((~fn_bit ^ BITOP_BE32_SWIZZLE) & 0x1f) * is optimized version of * htonl(1 << ((~fn_bit)&0x1F)) * See include/asm-generic/bitops/le.h. */ return (__force __be32)(1 << ((~fn_bit ^ BITOP_BE32_SWIZZLE) & 0x1f)) & addr[fn_bit >> 5]; } struct fib6_info *fib6_info_alloc(gfp_t gfp_flags, bool with_fib6_nh) { struct fib6_info *f6i; size_t sz = sizeof(*f6i); if (with_fib6_nh) sz += sizeof(struct fib6_nh); f6i = kzalloc(sz, gfp_flags); if (!f6i) return NULL; /* fib6_siblings is a union with nh_list, so this initializes both */ INIT_LIST_HEAD(&f6i->fib6_siblings); refcount_set(&f6i->fib6_ref, 1); return f6i; } void fib6_info_destroy_rcu(struct rcu_head *head) { struct fib6_info *f6i = container_of(head, struct fib6_info, rcu); WARN_ON(f6i->fib6_node); if (f6i->nh) nexthop_put(f6i->nh); else fib6_nh_release(f6i->fib6_nh); ip_fib_metrics_put(f6i->fib6_metrics); kfree(f6i); } EXPORT_SYMBOL_GPL(fib6_info_destroy_rcu); static struct fib6_node *node_alloc(struct net *net) { struct fib6_node *fn; fn = kmem_cache_zalloc(fib6_node_kmem, GFP_ATOMIC); if (fn) net->ipv6.rt6_stats->fib_nodes++; return fn; } static void node_free_immediate(struct net *net, struct fib6_node *fn) { kmem_cache_free(fib6_node_kmem, fn); net->ipv6.rt6_stats->fib_nodes--; } static void node_free_rcu(struct rcu_head *head) { struct fib6_node *fn = container_of(head, struct fib6_node, rcu); kmem_cache_free(fib6_node_kmem, fn); } static void node_free(struct net *net, struct fib6_node *fn) { call_rcu(&fn->rcu, node_free_rcu); net->ipv6.rt6_stats->fib_nodes--; } static void fib6_free_table(struct fib6_table *table) { inetpeer_invalidate_tree(&table->tb6_peers); kfree(table); } static void fib6_link_table(struct net *net, struct fib6_table *tb) { unsigned int h; /* * Initialize table lock at a single place to give lockdep a key, * tables aren't visible prior to being linked to the list. */ spin_lock_init(&tb->tb6_lock); h = tb->tb6_id & (FIB6_TABLE_HASHSZ - 1); /* * No protection necessary, this is the only list mutatation * operation, tables never disappear once they exist. */ hlist_add_head_rcu(&tb->tb6_hlist, &net->ipv6.fib_table_hash[h]); } #ifdef CONFIG_IPV6_MULTIPLE_TABLES static struct fib6_table *fib6_alloc_table(struct net *net, u32 id) { struct fib6_table *table; table = kzalloc(sizeof(*table), GFP_ATOMIC); if (table) { table->tb6_id = id; rcu_assign_pointer(table->tb6_root.leaf, net->ipv6.fib6_null_entry); table->tb6_root.fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO; inet_peer_base_init(&table->tb6_peers); } return table; } struct fib6_table *fib6_new_table(struct net *net, u32 id) { struct fib6_table *tb; if (id == 0) id = RT6_TABLE_MAIN; tb = fib6_get_table(net, id); if (tb) return tb; tb = fib6_alloc_table(net, id); if (tb) fib6_link_table(net, tb); return tb; } EXPORT_SYMBOL_GPL(fib6_new_table); struct fib6_table *fib6_get_table(struct net *net, u32 id) { struct fib6_table *tb; struct hlist_head *head; unsigned int h; if (id == 0) id = RT6_TABLE_MAIN; h = id & (FIB6_TABLE_HASHSZ - 1); rcu_read_lock(); head = &net->ipv6.fib_table_hash[h]; hlist_for_each_entry_rcu(tb, head, tb6_hlist) { if (tb->tb6_id == id) { rcu_read_unlock(); return tb; } } rcu_read_unlock(); return NULL; } EXPORT_SYMBOL_GPL(fib6_get_table); static void __net_init fib6_tables_init(struct net *net) { fib6_link_table(net, net->ipv6.fib6_main_tbl); fib6_link_table(net, net->ipv6.fib6_local_tbl); } #else struct fib6_table *fib6_new_table(struct net *net, u32 id) { return fib6_get_table(net, id); } struct fib6_table *fib6_get_table(struct net *net, u32 id) { return net->ipv6.fib6_main_tbl; } struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6, const struct sk_buff *skb, int flags, pol_lookup_t lookup) { struct rt6_info *rt; rt = pol_lookup_func(lookup, net, net->ipv6.fib6_main_tbl, fl6, skb, flags); if (rt->dst.error == -EAGAIN) { ip6_rt_put_flags(rt, flags); rt = net->ipv6.ip6_null_entry; if (!(flags & RT6_LOOKUP_F_DST_NOREF)) dst_hold(&rt->dst); } return &rt->dst; } /* called with rcu lock held; no reference taken on fib6_info */ int fib6_lookup(struct net *net, int oif, struct flowi6 *fl6, struct fib6_result *res, int flags) { return fib6_table_lookup(net, net->ipv6.fib6_main_tbl, oif, fl6, res, flags); } static void __net_init fib6_tables_init(struct net *net) { fib6_link_table(net, net->ipv6.fib6_main_tbl); } #endif unsigned int fib6_tables_seq_read(struct net *net) { unsigned int h, fib_seq = 0; rcu_read_lock(); for (h = 0; h < FIB6_TABLE_HASHSZ; h++) { struct hlist_head *head = &net->ipv6.fib_table_hash[h]; struct fib6_table *tb; hlist_for_each_entry_rcu(tb, head, tb6_hlist) fib_seq += tb->fib_seq; } rcu_read_unlock(); return fib_seq; } static int call_fib6_entry_notifier(struct notifier_block *nb, enum fib_event_type event_type, struct fib6_info *rt, struct netlink_ext_ack *extack) { struct fib6_entry_notifier_info info = { .info.extack = extack, .rt = rt, }; return call_fib6_notifier(nb, event_type, &info.info); } static int call_fib6_multipath_entry_notifier(struct notifier_block *nb, enum fib_event_type event_type, struct fib6_info *rt, unsigned int nsiblings, struct netlink_ext_ack *extack) { struct fib6_entry_notifier_info info = { .info.extack = extack, .rt = rt, .nsiblings = nsiblings, }; return call_fib6_notifier(nb, event_type, &info.info); } int call_fib6_entry_notifiers(struct net *net, enum fib_event_type event_type, struct fib6_info *rt, struct netlink_ext_ack *extack) { struct fib6_entry_notifier_info info = { .info.extack = extack, .rt = rt, }; rt->fib6_table->fib_seq++; return call_fib6_notifiers(net, event_type, &info.info); } int call_fib6_multipath_entry_notifiers(struct net *net, enum fib_event_type event_type, struct fib6_info *rt, unsigned int nsiblings, struct netlink_ext_ack *extack) { struct fib6_entry_notifier_info info = { .info.extack = extack, .rt = rt, .nsiblings = nsiblings, }; rt->fib6_table->fib_seq++; return call_fib6_notifiers(net, event_type, &info.info); } int call_fib6_entry_notifiers_replace(struct net *net, struct fib6_info *rt) { struct fib6_entry_notifier_info info = { .rt = rt, .nsiblings = rt->fib6_nsiblings, }; rt->fib6_table->fib_seq++; return call_fib6_notifiers(net, FIB_EVENT_ENTRY_REPLACE, &info.info); } struct fib6_dump_arg { struct net *net; struct notifier_block *nb; struct netlink_ext_ack *extack; }; static int fib6_rt_dump(struct fib6_info *rt, struct fib6_dump_arg *arg) { enum fib_event_type fib_event = FIB_EVENT_ENTRY_REPLACE; int err; if (!rt || rt == arg->net->ipv6.fib6_null_entry) return 0; if (rt->fib6_nsiblings) err = call_fib6_multipath_entry_notifier(arg->nb, fib_event, rt, rt->fib6_nsiblings, arg->extack); else err = call_fib6_entry_notifier(arg->nb, fib_event, rt, arg->extack); return err; } static int fib6_node_dump(struct fib6_walker *w) { int err; err = fib6_rt_dump(w->leaf, w->args); w->leaf = NULL; return err; } static int fib6_table_dump(struct net *net, struct fib6_table *tb, struct fib6_walker *w) { int err; w->root = &tb->tb6_root; spin_lock_bh(&tb->tb6_lock); err = fib6_walk(net, w); spin_unlock_bh(&tb->tb6_lock); return err; } /* Called with rcu_read_lock() */ int fib6_tables_dump(struct net *net, struct notifier_block *nb, struct netlink_ext_ack *extack) { struct fib6_dump_arg arg; struct fib6_walker *w; unsigned int h; int err = 0; w = kzalloc(sizeof(*w), GFP_ATOMIC); if (!w) return -ENOMEM; w->func = fib6_node_dump; arg.net = net; arg.nb = nb; arg.extack = extack; w->args = &arg; for (h = 0; h < FIB6_TABLE_HASHSZ; h++) { struct hlist_head *head = &net->ipv6.fib_table_hash[h]; struct fib6_table *tb; hlist_for_each_entry_rcu(tb, head, tb6_hlist) { err = fib6_table_dump(net, tb, w); if (err) goto out; } } out: kfree(w); /* The tree traversal function should never return a positive value. */ return err > 0 ? -EINVAL : err; } static int fib6_dump_node(struct fib6_walker *w) { int res; struct fib6_info *rt; for_each_fib6_walker_rt(w) { res = rt6_dump_route(rt, w->args, w->skip_in_node); if (res >= 0) { /* Frame is full, suspend walking */ w->leaf = rt; /* We'll restart from this node, so if some routes were * already dumped, skip them next time. */ w->skip_in_node += res; return 1; } w->skip_in_node = 0; /* Multipath routes are dumped in one route with the * RTA_MULTIPATH attribute. Jump 'rt' to point to the * last sibling of this route (no need to dump the * sibling routes again) */ if (rt->fib6_nsiblings) rt = list_last_entry(&rt->fib6_siblings, struct fib6_info, fib6_siblings); } w->leaf = NULL; return 0; } static void fib6_dump_end(struct netlink_callback *cb) { struct net *net = sock_net(cb->skb->sk); struct fib6_walker *w = (void *)cb->args[2]; if (w) { if (cb->args[4]) { cb->args[4] = 0; fib6_walker_unlink(net, w); } cb->args[2] = 0; kfree(w); } cb->done = (void *)cb->args[3]; cb->args[1] = 3; } static int fib6_dump_done(struct netlink_callback *cb) { fib6_dump_end(cb); return cb->done ? cb->done(cb) : 0; } static int fib6_dump_table(struct fib6_table *table, struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); struct fib6_walker *w; int res; w = (void *)cb->args[2]; w->root = &table->tb6_root; if (cb->args[4] == 0) { w->count = 0; w->skip = 0; w->skip_in_node = 0; spin_lock_bh(&table->tb6_lock); res = fib6_walk(net, w); spin_unlock_bh(&table->tb6_lock); if (res > 0) { cb->args[4] = 1; cb->args[5] = READ_ONCE(w->root->fn_sernum); } } else { int sernum = READ_ONCE(w->root->fn_sernum); if (cb->args[5] != sernum) { /* Begin at the root if the tree changed */ cb->args[5] = sernum; w->state = FWS_INIT; w->node = w->root; w->skip = w->count; w->skip_in_node = 0; } else w->skip = 0; spin_lock_bh(&table->tb6_lock); res = fib6_walk_continue(w); spin_unlock_bh(&table->tb6_lock); if (res <= 0) { fib6_walker_unlink(net, w); cb->args[4] = 0; } } return res; } static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) { struct rt6_rtnl_dump_arg arg = { .filter.dump_exceptions = true, .filter.dump_routes = true }; const struct nlmsghdr *nlh = cb->nlh; struct net *net = sock_net(skb->sk); unsigned int h, s_h; unsigned int e = 0, s_e; struct fib6_walker *w; struct fib6_table *tb; struct hlist_head *head; int res = 0; if (cb->strict_check) { int err; err = ip_valid_fib_dump_req(net, nlh, &arg.filter, cb); if (err < 0) return err; } else if (nlmsg_len(nlh) >= sizeof(struct rtmsg)) { struct rtmsg *rtm = nlmsg_data(nlh); if (rtm->rtm_flags & RTM_F_PREFIX) arg.filter.flags = RTM_F_PREFIX; } w = (void *)cb->args[2]; if (!w) { /* New dump: * * 1. hook callback destructor. */ cb->args[3] = (long)cb->done; cb->done = fib6_dump_done; /* * 2. allocate and initialize walker. */ w = kzalloc(sizeof(*w), GFP_ATOMIC); if (!w) return -ENOMEM; w->func = fib6_dump_node; cb->args[2] = (long)w; } arg.skb = skb; arg.cb = cb; arg.net = net; w->args = &arg; if (arg.filter.table_id) { tb = fib6_get_table(net, arg.filter.table_id); if (!tb) { if (rtnl_msg_family(cb->nlh) != PF_INET6) goto out; NL_SET_ERR_MSG_MOD(cb->extack, "FIB table does not exist"); return -ENOENT; } if (!cb->args[0]) { res = fib6_dump_table(tb, skb, cb); if (!res) cb->args[0] = 1; } goto out; } s_h = cb->args[0]; s_e = cb->args[1]; rcu_read_lock(); for (h = s_h; h < FIB6_TABLE_HASHSZ; h++, s_e = 0) { e = 0; head = &net->ipv6.fib_table_hash[h]; hlist_for_each_entry_rcu(tb, head, tb6_hlist) { if (e < s_e) goto next; res = fib6_dump_table(tb, skb, cb); if (res != 0) goto out_unlock; next: e++; } } out_unlock: rcu_read_unlock(); cb->args[1] = e; cb->args[0] = h; out: res = res < 0 ? res : skb->len; if (res <= 0) fib6_dump_end(cb); return res; } void fib6_metric_set(struct fib6_info *f6i, int metric, u32 val) { if (!f6i) return; if (f6i->fib6_metrics == &dst_default_metrics) { struct dst_metrics *p = kzalloc(sizeof(*p), GFP_ATOMIC); if (!p) return; refcount_set(&p->refcnt, 1); f6i->fib6_metrics = p; } f6i->fib6_metrics->metrics[metric - 1] = val; } /* * Routing Table * * return the appropriate node for a routing tree "add" operation * by either creating and inserting or by returning an existing * node. */ static struct fib6_node *fib6_add_1(struct net *net, struct fib6_table *table, struct fib6_node *root, struct in6_addr *addr, int plen, int offset, int allow_create, int replace_required, struct netlink_ext_ack *extack) { struct fib6_node *fn, *in, *ln; struct fib6_node *pn = NULL; struct rt6key *key; int bit; __be32 dir = 0; RT6_TRACE("fib6_add_1\n"); /* insert node in tree */ fn = root; do { struct fib6_info *leaf = rcu_dereference_protected(fn->leaf, lockdep_is_held(&table->tb6_lock)); key = (struct rt6key *)((u8 *)leaf + offset); /* * Prefix match */ if (plen < fn->fn_bit || !ipv6_prefix_equal(&key->addr, addr, fn->fn_bit)) { if (!allow_create) { if (replace_required) { NL_SET_ERR_MSG(extack, "Can not replace route - no match found"); pr_warn("Can't replace route, no match found\n"); return ERR_PTR(-ENOENT); } pr_warn("NLM_F_CREATE should be set when creating new route\n"); } goto insert_above; } /* * Exact match ? */ if (plen == fn->fn_bit) { /* clean up an intermediate node */ if (!(fn->fn_flags & RTN_RTINFO)) { RCU_INIT_POINTER(fn->leaf, NULL); fib6_info_release(leaf); /* remove null_entry in the root node */ } else if (fn->fn_flags & RTN_TL_ROOT && rcu_access_pointer(fn->leaf) == net->ipv6.fib6_null_entry) { RCU_INIT_POINTER(fn->leaf, NULL); } return fn; } /* * We have more bits to go */ /* Try to walk down on tree. */ dir = addr_bit_set(addr, fn->fn_bit); pn = fn; fn = dir ? rcu_dereference_protected(fn->right, lockdep_is_held(&table->tb6_lock)) : rcu_dereference_protected(fn->left, lockdep_is_held(&table->tb6_lock)); } while (fn); if (!allow_create) { /* We should not create new node because * NLM_F_REPLACE was specified without NLM_F_CREATE * I assume it is safe to require NLM_F_CREATE when * REPLACE flag is used! Later we may want to remove the * check for replace_required, because according * to netlink specification, NLM_F_CREATE * MUST be specified if new route is created. * That would keep IPv6 consistent with IPv4 */ if (replace_required) { NL_SET_ERR_MSG(extack, "Can not replace route - no match found"); pr_warn("Can't replace route, no match found\n"); return ERR_PTR(-ENOENT); } pr_warn("NLM_F_CREATE should be set when creating new route\n"); } /* * We walked to the bottom of tree. * Create new leaf node without children. */ ln = node_alloc(net); if (!ln) return ERR_PTR(-ENOMEM); ln->fn_bit = plen; RCU_INIT_POINTER(ln->parent, pn); if (dir) rcu_assign_pointer(pn->right, ln); else rcu_assign_pointer(pn->left, ln); return ln; insert_above: /* * split since we don't have a common prefix anymore or * we have a less significant route. * we've to insert an intermediate node on the list * this new node will point to the one we need to create * and the current */ pn = rcu_dereference_protected(fn->parent, lockdep_is_held(&table->tb6_lock)); /* find 1st bit in difference between the 2 addrs. See comment in __ipv6_addr_diff: bit may be an invalid value, but if it is >= plen, the value is ignored in any case. */ bit = __ipv6_addr_diff(addr, &key->addr, sizeof(*addr)); /* * (intermediate)[in] * / \ * (new leaf node)[ln] (old node)[fn] */ if (plen > bit) { in = node_alloc(net); ln = node_alloc(net); if (!in || !ln) { if (in) node_free_immediate(net, in); if (ln) node_free_immediate(net, ln); return ERR_PTR(-ENOMEM); } /* * new intermediate node. * RTN_RTINFO will * be off since that an address that chooses one of * the branches would not match less specific routes * in the other branch */ in->fn_bit = bit; RCU_INIT_POINTER(in->parent, pn); in->leaf = fn->leaf; fib6_info_hold(rcu_dereference_protected(in->leaf, lockdep_is_held(&table->tb6_lock))); /* update parent pointer */ if (dir) rcu_assign_pointer(pn->right, in); else rcu_assign_pointer(pn->left, in); ln->fn_bit = plen; RCU_INIT_POINTER(ln->parent, in); rcu_assign_pointer(fn->parent, in); if (addr_bit_set(addr, bit)) { rcu_assign_pointer(in->right, ln); rcu_assign_pointer(in->left, fn); } else { rcu_assign_pointer(in->left, ln); rcu_assign_pointer(in->right, fn); } } else { /* plen <= bit */ /* * (new leaf node)[ln] * / \ * (old node)[fn] NULL */ ln = node_alloc(net); if (!ln) return ERR_PTR(-ENOMEM); ln->fn_bit = plen; RCU_INIT_POINTER(ln->parent, pn); if (addr_bit_set(&key->addr, plen)) RCU_INIT_POINTER(ln->right, fn); else RCU_INIT_POINTER(ln->left, fn); rcu_assign_pointer(fn->parent, ln); if (dir) rcu_assign_pointer(pn->right, ln); else rcu_assign_pointer(pn->left, ln); } return ln; } static void __fib6_drop_pcpu_from(struct fib6_nh *fib6_nh, const struct fib6_info *match, const struct fib6_table *table) { int cpu; if (!fib6_nh->rt6i_pcpu) return; /* release the reference to this fib entry from * all of its cached pcpu routes */ for_each_possible_cpu(cpu) { struct rt6_info **ppcpu_rt; struct rt6_info *pcpu_rt; ppcpu_rt = per_cpu_ptr(fib6_nh->rt6i_pcpu, cpu); pcpu_rt = *ppcpu_rt; /* only dropping the 'from' reference if the cached route * is using 'match'. The cached pcpu_rt->from only changes * from a fib6_info to NULL (ip6_dst_destroy); it can never * change from one fib6_info reference to another */ if (pcpu_rt && rcu_access_pointer(pcpu_rt->from) == match) { struct fib6_info *from; from = xchg((__force struct fib6_info **)&pcpu_rt->from, NULL); fib6_info_release(from); } } } struct fib6_nh_pcpu_arg { struct fib6_info *from; const struct fib6_table *table; }; static int fib6_nh_drop_pcpu_from(struct fib6_nh *nh, void *_arg) { struct fib6_nh_pcpu_arg *arg = _arg; __fib6_drop_pcpu_from(nh, arg->from, arg->table); return 0; } static void fib6_drop_pcpu_from(struct fib6_info *f6i, const struct fib6_table *table) { /* Make sure rt6_make_pcpu_route() wont add other percpu routes * while we are cleaning them here. */ f6i->fib6_destroying = 1; mb(); /* paired with the cmpxchg() in rt6_make_pcpu_route() */ if (f6i->nh) { struct fib6_nh_pcpu_arg arg = { .from = f6i, .table = table }; nexthop_for_each_fib6_nh(f6i->nh, fib6_nh_drop_pcpu_from, &arg); } else { struct fib6_nh *fib6_nh; fib6_nh = f6i->fib6_nh; __fib6_drop_pcpu_from(fib6_nh, f6i, table); } } static void fib6_purge_rt(struct fib6_info *rt, struct fib6_node *fn, struct net *net) { struct fib6_table *table = rt->fib6_table; /* Flush all cached dst in exception table */ rt6_flush_exceptions(rt); fib6_drop_pcpu_from(rt, table); if (rt->nh && !list_empty(&rt->nh_list)) list_del_init(&rt->nh_list); if (refcount_read(&rt->fib6_ref) != 1) { /* This route is used as dummy address holder in some split * nodes. It is not leaked, but it still holds other resources, * which must be released in time. So, scan ascendant nodes * and replace dummy references to this route with references * to still alive ones. */ while (fn) { struct fib6_info *leaf = rcu_dereference_protected(fn->leaf, lockdep_is_held(&table->tb6_lock)); struct fib6_info *new_leaf; if (!(fn->fn_flags & RTN_RTINFO) && leaf == rt) { new_leaf = fib6_find_prefix(net, table, fn); fib6_info_hold(new_leaf); rcu_assign_pointer(fn->leaf, new_leaf); fib6_info_release(rt); } fn = rcu_dereference_protected(fn->parent, lockdep_is_held(&table->tb6_lock)); } } } /* * Insert routing information in a node. */ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt, struct nl_info *info, struct netlink_ext_ack *extack) { struct fib6_info *leaf = rcu_dereference_protected(fn->leaf, lockdep_is_held(&rt->fib6_table->tb6_lock)); struct fib6_info *iter = NULL; struct fib6_info __rcu **ins; struct fib6_info __rcu **fallback_ins = NULL; int replace = (info->nlh && (info->nlh->nlmsg_flags & NLM_F_REPLACE)); int add = (!info->nlh || (info->nlh->nlmsg_flags & NLM_F_CREATE)); int found = 0; bool rt_can_ecmp = rt6_qualify_for_ecmp(rt); bool notify_sibling_rt = false; u16 nlflags = NLM_F_EXCL; int err; if (info->nlh && (info->nlh->nlmsg_flags & NLM_F_APPEND)) nlflags |= NLM_F_APPEND; ins = &fn->leaf; for (iter = leaf; iter; iter = rcu_dereference_protected(iter->fib6_next, lockdep_is_held(&rt->fib6_table->tb6_lock))) { /* * Search for duplicates */ if (iter->fib6_metric == rt->fib6_metric) { /* * Same priority level */ if (info->nlh && (info->nlh->nlmsg_flags & NLM_F_EXCL)) return -EEXIST; nlflags &= ~NLM_F_EXCL; if (replace) { if (rt_can_ecmp == rt6_qualify_for_ecmp(iter)) { found++; break; } fallback_ins = fallback_ins ?: ins; goto next_iter; } if (rt6_duplicate_nexthop(iter, rt)) { if (rt->fib6_nsiblings) rt->fib6_nsiblings = 0; if (!(iter->fib6_flags & RTF_EXPIRES)) return -EEXIST; if (!(rt->fib6_flags & RTF_EXPIRES)) fib6_clean_expires(iter); else fib6_set_expires(iter, rt->expires); if (rt->fib6_pmtu) fib6_metric_set(iter, RTAX_MTU, rt->fib6_pmtu); return -EEXIST; } /* If we have the same destination and the same metric, * but not the same gateway, then the route we try to * add is sibling to this route, increment our counter * of siblings, and later we will add our route to the * list. * Only static routes (which don't have flag * RTF_EXPIRES) are used for ECMPv6. * * To avoid long list, we only had siblings if the * route have a gateway. */ if (rt_can_ecmp && rt6_qualify_for_ecmp(iter)) rt->fib6_nsiblings++; } if (iter->fib6_metric > rt->fib6_metric) break; next_iter: ins = &iter->fib6_next; } if (fallback_ins && !found) { /* No matching route with same ecmp-able-ness found, replace * first matching route */ ins = fallback_ins; iter = rcu_dereference_protected(*ins, lockdep_is_held(&rt->fib6_table->tb6_lock)); found++; } /* Reset round-robin state, if necessary */ if (ins == &fn->leaf) fn->rr_ptr = NULL; /* Link this route to others same route. */ if (rt->fib6_nsiblings) { unsigned int fib6_nsiblings; struct fib6_info *sibling, *temp_sibling; /* Find the first route that have the same metric */ sibling = leaf; notify_sibling_rt = true; while (sibling) { if (sibling->fib6_metric == rt->fib6_metric && rt6_qualify_for_ecmp(sibling)) { list_add_tail(&rt->fib6_siblings, &sibling->fib6_siblings); break; } sibling = rcu_dereference_protected(sibling->fib6_next, lockdep_is_held(&rt->fib6_table->tb6_lock)); notify_sibling_rt = false; } /* For each sibling in the list, increment the counter of * siblings. BUG() if counters does not match, list of siblings * is broken! */ fib6_nsiblings = 0; list_for_each_entry_safe(sibling, temp_sibling, &rt->fib6_siblings, fib6_siblings) { sibling->fib6_nsiblings++; BUG_ON(sibling->fib6_nsiblings != rt->fib6_nsiblings); fib6_nsiblings++; } BUG_ON(fib6_nsiblings != rt->fib6_nsiblings); rt6_multipath_rebalance(temp_sibling); } /* * insert node */ if (!replace) { if (!add) pr_warn("NLM_F_CREATE should be set when creating new route\n"); add: nlflags |= NLM_F_CREATE; /* The route should only be notified if it is the first * route in the node or if it is added as a sibling * route to the first route in the node. */ if (!info->skip_notify_kernel && (notify_sibling_rt || ins == &fn->leaf)) { enum fib_event_type fib_event; if (notify_sibling_rt) fib_event = FIB_EVENT_ENTRY_APPEND; else fib_event = FIB_EVENT_ENTRY_REPLACE; err = call_fib6_entry_notifiers(info->nl_net, fib_event, rt, extack); if (err) { struct fib6_info *sibling, *next_sibling; /* If the route has siblings, then it first * needs to be unlinked from them. */ if (!rt->fib6_nsiblings) return err; list_for_each_entry_safe(sibling, next_sibling, &rt->fib6_siblings, fib6_siblings) sibling->fib6_nsiblings--; rt->fib6_nsiblings = 0; list_del_init(&rt->fib6_siblings); rt6_multipath_rebalance(next_sibling); return err; } } rcu_assign_pointer(rt->fib6_next, iter); fib6_info_hold(rt); rcu_assign_pointer(rt->fib6_node, fn); rcu_assign_pointer(*ins, rt); if (!info->skip_notify) inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags); info->nl_net->ipv6.rt6_stats->fib_rt_entries++; if (!(fn->fn_flags & RTN_RTINFO)) { info->nl_net->ipv6.rt6_stats->fib_route_nodes++; fn->fn_flags |= RTN_RTINFO; } } else { int nsiblings; if (!found) { if (add) goto add; pr_warn("NLM_F_REPLACE set, but no existing node found!\n"); return -ENOENT; } if (!info->skip_notify_kernel && ins == &fn->leaf) { err = call_fib6_entry_notifiers(info->nl_net, FIB_EVENT_ENTRY_REPLACE, rt, extack); if (err) return err; } fib6_info_hold(rt); rcu_assign_pointer(rt->fib6_node, fn); rt->fib6_next = iter->fib6_next; rcu_assign_pointer(*ins, rt); if (!info->skip_notify) inet6_rt_notify(RTM_NEWROUTE, rt, info, NLM_F_REPLACE); if (!(fn->fn_flags & RTN_RTINFO)) { info->nl_net->ipv6.rt6_stats->fib_route_nodes++; fn->fn_flags |= RTN_RTINFO; } nsiblings = iter->fib6_nsiblings; iter->fib6_node = NULL; fib6_purge_rt(iter, fn, info->nl_net); if (rcu_access_pointer(fn->rr_ptr) == iter) fn->rr_ptr = NULL; fib6_info_release(iter); if (nsiblings) { /* Replacing an ECMP route, remove all siblings */ ins = &rt->fib6_next; iter = rcu_dereference_protected(*ins, lockdep_is_held(&rt->fib6_table->tb6_lock)); while (iter) { if (iter->fib6_metric > rt->fib6_metric) break; if (rt6_qualify_for_ecmp(iter)) { *ins = iter->fib6_next; iter->fib6_node = NULL; fib6_purge_rt(iter, fn, info->nl_net); if (rcu_access_pointer(fn->rr_ptr) == iter) fn->rr_ptr = NULL; fib6_info_release(iter); nsiblings--; info->nl_net->ipv6.rt6_stats->fib_rt_entries--; } else { ins = &iter->fib6_next; } iter = rcu_dereference_protected(*ins, lockdep_is_held(&rt->fib6_table->tb6_lock)); } WARN_ON(nsiblings != 0); } } return 0; } static void fib6_start_gc(struct net *net, struct fib6_info *rt) { if (!timer_pending(&net->ipv6.ip6_fib_timer) && (rt->fib6_flags & RTF_EXPIRES)) mod_timer(&net->ipv6.ip6_fib_timer, jiffies + net->ipv6.sysctl.ip6_rt_gc_interval); } void fib6_force_start_gc(struct net *net) { if (!timer_pending(&net->ipv6.ip6_fib_timer)) mod_timer(&net->ipv6.ip6_fib_timer, jiffies + net->ipv6.sysctl.ip6_rt_gc_interval); } static void __fib6_update_sernum_upto_root(struct fib6_info *rt, int sernum) { struct fib6_node *fn = rcu_dereference_protected(rt->fib6_node, lockdep_is_held(&rt->fib6_table->tb6_lock)); /* paired with smp_rmb() in fib6_get_cookie_safe() */ smp_wmb(); while (fn) { WRITE_ONCE(fn->fn_sernum, sernum); fn = rcu_dereference_protected(fn->parent, lockdep_is_held(&rt->fib6_table->tb6_lock)); } } void fib6_update_sernum_upto_root(struct net *net, struct fib6_info *rt) { __fib6_update_sernum_upto_root(rt, fib6_new_sernum(net)); } /* allow ipv4 to update sernum via ipv6_stub */ void fib6_update_sernum_stub(struct net *net, struct fib6_info *f6i) { spin_lock_bh(&f6i->fib6_table->tb6_lock); fib6_update_sernum_upto_root(net, f6i); spin_unlock_bh(&f6i->fib6_table->tb6_lock); } /* * Add routing information to the routing tree. * <destination addr>/<source addr> * with source addr info in sub-trees * Need to own table->tb6_lock */ int fib6_add(struct fib6_node *root, struct fib6_info *rt, struct nl_info *info, struct netlink_ext_ack *extack) { struct fib6_table *table = rt->fib6_table; struct fib6_node *fn, *pn = NULL; int err = -ENOMEM; int allow_create = 1; int replace_required = 0; if (info->nlh) { if (!(info->nlh->nlmsg_flags & NLM_F_CREATE)) allow_create = 0; if (info->nlh->nlmsg_flags & NLM_F_REPLACE) replace_required = 1; } if (!allow_create && !replace_required) pr_warn("RTM_NEWROUTE with no NLM_F_CREATE or NLM_F_REPLACE\n"); fn = fib6_add_1(info->nl_net, table, root, &rt->fib6_dst.addr, rt->fib6_dst.plen, offsetof(struct fib6_info, fib6_dst), allow_create, replace_required, extack); if (IS_ERR(fn)) { err = PTR_ERR(fn); fn = NULL; goto out; } pn = fn; #ifdef CONFIG_IPV6_SUBTREES if (rt->fib6_src.plen) { struct fib6_node *sn; if (!rcu_access_pointer(fn->subtree)) { struct fib6_node *sfn; /* * Create subtree. * * fn[main tree] * | * sfn[subtree root] * \ * sn[new leaf node] */ /* Create subtree root node */ sfn = node_alloc(info->nl_net); if (!sfn) goto failure; fib6_info_hold(info->nl_net->ipv6.fib6_null_entry); rcu_assign_pointer(sfn->leaf, info->nl_net->ipv6.fib6_null_entry); sfn->fn_flags = RTN_ROOT; /* Now add the first leaf node to new subtree */ sn = fib6_add_1(info->nl_net, table, sfn, &rt->fib6_src.addr, rt->fib6_src.plen, offsetof(struct fib6_info, fib6_src), allow_create, replace_required, extack); if (IS_ERR(sn)) { /* If it is failed, discard just allocated root, and then (in failure) stale node in main tree. */ node_free_immediate(info->nl_net, sfn); err = PTR_ERR(sn); goto failure; } /* Now link new subtree to main tree */ rcu_assign_pointer(sfn->parent, fn); rcu_assign_pointer(fn->subtree, sfn); } else { sn = fib6_add_1(info->nl_net, table, FIB6_SUBTREE(fn), &rt->fib6_src.addr, rt->fib6_src.plen, offsetof(struct fib6_info, fib6_src), allow_create, replace_required, extack); if (IS_ERR(sn)) { err = PTR_ERR(sn); goto failure; } } if (!rcu_access_pointer(fn->leaf)) { if (fn->fn_flags & RTN_TL_ROOT) { /* put back null_entry for root node */ rcu_assign_pointer(fn->leaf, info->nl_net->ipv6.fib6_null_entry); } else { fib6_info_hold(rt); rcu_assign_pointer(fn->leaf, rt); } } fn = sn; } #endif err = fib6_add_rt2node(fn, rt, info, extack); if (!err) { if (rt->nh) list_add(&rt->nh_list, &rt->nh->f6i_list); __fib6_update_sernum_upto_root(rt, fib6_new_sernum(info->nl_net)); fib6_start_gc(info->nl_net, rt); } out: if (err) { #ifdef CONFIG_IPV6_SUBTREES /* * If fib6_add_1 has cleared the old leaf pointer in the * super-tree leaf node we have to find a new one for it. */ if (pn != fn) { struct fib6_info *pn_leaf = rcu_dereference_protected(pn->leaf, lockdep_is_held(&table->tb6_lock)); if (pn_leaf == rt) { pn_leaf = NULL; RCU_INIT_POINTER(pn->leaf, NULL); fib6_info_release(rt); } if (!pn_leaf && !(pn->fn_flags & RTN_RTINFO)) { pn_leaf = fib6_find_prefix(info->nl_net, table, pn); if (!pn_leaf) pn_leaf = info->nl_net->ipv6.fib6_null_entry; fib6_info_hold(pn_leaf); rcu_assign_pointer(pn->leaf, pn_leaf); } } #endif goto failure; } else if (fib6_requires_src(rt)) { fib6_routes_require_src_inc(info->nl_net); } return err; failure: /* fn->leaf could be NULL and fib6_repair_tree() needs to be called if: * 1. fn is an intermediate node and we failed to add the new * route to it in both subtree creation failure and fib6_add_rt2node() * failure case. * 2. fn is the root node in the table and we fail to add the first * default route to it. */ if (fn && (!(fn->fn_flags & (RTN_RTINFO|RTN_ROOT)) || (fn->fn_flags & RTN_TL_ROOT && !rcu_access_pointer(fn->leaf)))) fib6_repair_tree(info->nl_net, table, fn); return err; } /* * Routing tree lookup * */ struct lookup_args { int offset; /* key offset on fib6_info */ const struct in6_addr *addr; /* search key */ }; static struct fib6_node *fib6_node_lookup_1(struct fib6_node *root, struct lookup_args *args) { struct fib6_node *fn; __be32 dir; if (unlikely(args->offset == 0)) return NULL; /* * Descend on a tree */ fn = root; for (;;) { struct fib6_node *next; dir = addr_bit_set(args->addr, fn->fn_bit); next = dir ? rcu_dereference(fn->right) : rcu_dereference(fn->left); if (next) { fn = next; continue; } break; } while (fn) { struct fib6_node *subtree = FIB6_SUBTREE(fn); if (subtree || fn->fn_flags & RTN_RTINFO) { struct fib6_info *leaf = rcu_dereference(fn->leaf); struct rt6key *key; if (!leaf) goto backtrack; key = (struct rt6key *) ((u8 *)leaf + args->offset); if (ipv6_prefix_equal(&key->addr, args->addr, key->plen)) { #ifdef CONFIG_IPV6_SUBTREES if (subtree) { struct fib6_node *sfn; sfn = fib6_node_lookup_1(subtree, args + 1); if (!sfn) goto backtrack; fn = sfn; } #endif if (fn->fn_flags & RTN_RTINFO) return fn; } } backtrack: if (fn->fn_flags & RTN_ROOT) break; fn = rcu_dereference(fn->parent); } return NULL; } /* called with rcu_read_lock() held */ struct fib6_node *fib6_node_lookup(struct fib6_node *root, const struct in6_addr *daddr, const struct in6_addr *saddr) { struct fib6_node *fn; struct lookup_args args[] = { { .offset = offsetof(struct fib6_info, fib6_dst), .addr = daddr, }, #ifdef CONFIG_IPV6_SUBTREES { .offset = offsetof(struct fib6_info, fib6_src), .addr = saddr, }, #endif { .offset = 0, /* sentinel */ } }; fn = fib6_node_lookup_1(root, daddr ? args : args + 1); if (!fn || fn->fn_flags & RTN_TL_ROOT) fn = root; return fn; } /* * Get node with specified destination prefix (and source prefix, * if subtrees are used) * exact_match == true means we try to find fn with exact match of * the passed in prefix addr * exact_match == false means we try to find fn with longest prefix * match of the passed in prefix addr. This is useful for finding fn * for cached route as it will be stored in the exception table under * the node with longest prefix length. */ static struct fib6_node *fib6_locate_1(struct fib6_node *root, const struct in6_addr *addr, int plen, int offset, bool exact_match) { struct fib6_node *fn, *prev = NULL; for (fn = root; fn ; ) { struct fib6_info *leaf = rcu_dereference(fn->leaf); struct rt6key *key; /* This node is being deleted */ if (!leaf) { if (plen <= fn->fn_bit) goto out; else goto next; } key = (struct rt6key *)((u8 *)leaf + offset); /* * Prefix match */ if (plen < fn->fn_bit || !ipv6_prefix_equal(&key->addr, addr, fn->fn_bit)) goto out; if (plen == fn->fn_bit) return fn; if (fn->fn_flags & RTN_RTINFO) prev = fn; next: /* * We have more bits to go */ if (addr_bit_set(addr, fn->fn_bit)) fn = rcu_dereference(fn->right); else fn = rcu_dereference(fn->left); } out: if (exact_match) return NULL; else return prev; } struct fib6_node *fib6_locate(struct fib6_node *root, const struct in6_addr *daddr, int dst_len, const struct in6_addr *saddr, int src_len, bool exact_match) { struct fib6_node *fn; fn = fib6_locate_1(root, daddr, dst_len, offsetof(struct fib6_info, fib6_dst), exact_match); #ifdef CONFIG_IPV6_SUBTREES if (src_len) { WARN_ON(saddr == NULL); if (fn) { struct fib6_node *subtree = FIB6_SUBTREE(fn); if (subtree) { fn = fib6_locate_1(subtree, saddr, src_len, offsetof(struct fib6_info, fib6_src), exact_match); } } } #endif if (fn && fn->fn_flags & RTN_RTINFO) return fn; return NULL; } /* * Deletion * */ static struct fib6_info *fib6_find_prefix(struct net *net, struct fib6_table *table, struct fib6_node *fn) { struct fib6_node *child_left, *child_right; if (fn->fn_flags & RTN_ROOT) return net->ipv6.fib6_null_entry; while (fn) { child_left = rcu_dereference_protected(fn->left, lockdep_is_held(&table->tb6_lock)); child_right = rcu_dereference_protected(fn->right, lockdep_is_held(&table->tb6_lock)); if (child_left) return rcu_dereference_protected(child_left->leaf, lockdep_is_held(&table->tb6_lock)); if (child_right) return rcu_dereference_protected(child_right->leaf, lockdep_is_held(&table->tb6_lock)); fn = FIB6_SUBTREE(fn); } return NULL; } /* * Called to trim the tree of intermediate nodes when possible. "fn" * is the node we want to try and remove. * Need to own table->tb6_lock */ static struct fib6_node *fib6_repair_tree(struct net *net, struct fib6_table *table, struct fib6_node *fn) { int children; int nstate; struct fib6_node *child; struct fib6_walker *w; int iter = 0; /* Set fn->leaf to null_entry for root node. */ if (fn->fn_flags & RTN_TL_ROOT) { rcu_assign_pointer(fn->leaf, net->ipv6.fib6_null_entry); return fn; } for (;;) { struct fib6_node *fn_r = rcu_dereference_protected(fn->right, lockdep_is_held(&table->tb6_lock)); struct fib6_node *fn_l = rcu_dereference_protected(fn->left, lockdep_is_held(&table->tb6_lock)); struct fib6_node *pn = rcu_dereference_protected(fn->parent, lockdep_is_held(&table->tb6_lock)); struct fib6_node *pn_r = rcu_dereference_protected(pn->right, lockdep_is_held(&table->tb6_lock)); struct fib6_node *pn_l = rcu_dereference_protected(pn->left, lockdep_is_held(&table->tb6_lock)); struct fib6_info *fn_leaf = rcu_dereference_protected(fn->leaf, lockdep_is_held(&table->tb6_lock)); struct fib6_info *pn_leaf = rcu_dereference_protected(pn->leaf, lockdep_is_held(&table->tb6_lock)); struct fib6_info *new_fn_leaf; RT6_TRACE("fixing tree: plen=%d iter=%d\n", fn->fn_bit, iter); iter++; WARN_ON(fn->fn_flags & RTN_RTINFO); WARN_ON(fn->fn_flags & RTN_TL_ROOT); WARN_ON(fn_leaf); children = 0; child = NULL; if (fn_r) { child = fn_r; children |= 1; } if (fn_l) { child = fn_l; children |= 2; } if (children == 3 || FIB6_SUBTREE(fn) #ifdef CONFIG_IPV6_SUBTREES /* Subtree root (i.e. fn) may have one child */ || (children && fn->fn_flags & RTN_ROOT) #endif ) { new_fn_leaf = fib6_find_prefix(net, table, fn); #if RT6_DEBUG >= 2 if (!new_fn_leaf) { WARN_ON(!new_fn_leaf); new_fn_leaf = net->ipv6.fib6_null_entry; } #endif fib6_info_hold(new_fn_leaf); rcu_assign_pointer(fn->leaf, new_fn_leaf); return pn; } #ifdef CONFIG_IPV6_SUBTREES if (FIB6_SUBTREE(pn) == fn) { WARN_ON(!(fn->fn_flags & RTN_ROOT)); RCU_INIT_POINTER(pn->subtree, NULL); nstate = FWS_L; } else { WARN_ON(fn->fn_flags & RTN_ROOT); #endif if (pn_r == fn) rcu_assign_pointer(pn->right, child); else if (pn_l == fn) rcu_assign_pointer(pn->left, child); #if RT6_DEBUG >= 2 else WARN_ON(1); #endif if (child) rcu_assign_pointer(child->parent, pn); nstate = FWS_R; #ifdef CONFIG_IPV6_SUBTREES } #endif read_lock(&net->ipv6.fib6_walker_lock); FOR_WALKERS(net, w) { if (!child) { if (w->node == fn) { RT6_TRACE("W %p adjusted by delnode 1, s=%d/%d\n", w, w->state, nstate); w->node = pn; w->state = nstate; } } else { if (w->node == fn) { w->node = child; if (children&2) { RT6_TRACE("W %p adjusted by delnode 2, s=%d\n", w, w->state); w->state = w->state >= FWS_R ? FWS_U : FWS_INIT; } else { RT6_TRACE("W %p adjusted by delnode 2, s=%d\n", w, w->state); w->state = w->state >= FWS_C ? FWS_U : FWS_INIT; } } } } read_unlock(&net->ipv6.fib6_walker_lock); node_free(net, fn); if (pn->fn_flags & RTN_RTINFO || FIB6_SUBTREE(pn)) return pn; RCU_INIT_POINTER(pn->leaf, NULL); fib6_info_release(pn_leaf); fn = pn; } } static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn, struct fib6_info __rcu **rtp, struct nl_info *info) { struct fib6_info *leaf, *replace_rt = NULL; struct fib6_walker *w; struct fib6_info *rt = rcu_dereference_protected(*rtp, lockdep_is_held(&table->tb6_lock)); struct net *net = info->nl_net; bool notify_del = false; RT6_TRACE("fib6_del_route\n"); /* If the deleted route is the first in the node and it is not part of * a multipath route, then we need to replace it with the next route * in the node, if exists. */ leaf = rcu_dereference_protected(fn->leaf, lockdep_is_held(&table->tb6_lock)); if (leaf == rt && !rt->fib6_nsiblings) { if (rcu_access_pointer(rt->fib6_next)) replace_rt = rcu_dereference_protected(rt->fib6_next, lockdep_is_held(&table->tb6_lock)); else notify_del = true; } /* Unlink it */ *rtp = rt->fib6_next; rt->fib6_node = NULL; net->ipv6.rt6_stats->fib_rt_entries--; net->ipv6.rt6_stats->fib_discarded_routes++; /* Reset round-robin state, if necessary */ if (rcu_access_pointer(fn->rr_ptr) == rt) fn->rr_ptr = NULL; /* Remove this entry from other siblings */ if (rt->fib6_nsiblings) { struct fib6_info *sibling, *next_sibling; /* The route is deleted from a multipath route. If this * multipath route is the first route in the node, then we need * to emit a delete notification. Otherwise, we need to skip * the notification. */ if (rt->fib6_metric == leaf->fib6_metric && rt6_qualify_for_ecmp(leaf)) notify_del = true; list_for_each_entry_safe(sibling, next_sibling, &rt->fib6_siblings, fib6_siblings) sibling->fib6_nsiblings--; rt->fib6_nsiblings = 0; list_del_init(&rt->fib6_siblings); rt6_multipath_rebalance(next_sibling); } /* Adjust walkers */ read_lock(&net->ipv6.fib6_walker_lock); FOR_WALKERS(net, w) { if (w->state == FWS_C && w->leaf == rt) { RT6_TRACE("walker %p adjusted by delroute\n", w); w->leaf = rcu_dereference_protected(rt->fib6_next, lockdep_is_held(&table->tb6_lock)); if (!w->leaf) w->state = FWS_U; } } read_unlock(&net->ipv6.fib6_walker_lock); /* If it was last route, call fib6_repair_tree() to: * 1. For root node, put back null_entry as how the table was created. * 2. For other nodes, expunge its radix tree node. */ if (!rcu_access_pointer(fn->leaf)) { if (!(fn->fn_flags & RTN_TL_ROOT)) { fn->fn_flags &= ~RTN_RTINFO; net->ipv6.rt6_stats->fib_route_nodes--; } fn = fib6_repair_tree(net, table, fn); } fib6_purge_rt(rt, fn, net); if (!info->skip_notify_kernel) { if (notify_del) call_fib6_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, rt, NULL); else if (replace_rt) call_fib6_entry_notifiers_replace(net, replace_rt); } if (!info->skip_notify) inet6_rt_notify(RTM_DELROUTE, rt, info, 0); fib6_info_release(rt); } /* Need to own table->tb6_lock */ int fib6_del(struct fib6_info *rt, struct nl_info *info) { struct net *net = info->nl_net; struct fib6_info __rcu **rtp; struct fib6_info __rcu **rtp_next; struct fib6_table *table; struct fib6_node *fn; if (rt == net->ipv6.fib6_null_entry) return -ENOENT; table = rt->fib6_table; fn = rcu_dereference_protected(rt->fib6_node, lockdep_is_held(&table->tb6_lock)); if (!fn) return -ENOENT; WARN_ON(!(fn->fn_flags & RTN_RTINFO)); /* * Walk the leaf entries looking for ourself */ for (rtp = &fn->leaf; *rtp; rtp = rtp_next) { struct fib6_info *cur = rcu_dereference_protected(*rtp, lockdep_is_held(&table->tb6_lock)); if (rt == cur) { if (fib6_requires_src(cur)) fib6_routes_require_src_dec(info->nl_net); fib6_del_route(table, fn, rtp, info); return 0; } rtp_next = &cur->fib6_next; } return -ENOENT; } /* * Tree traversal function. * * Certainly, it is not interrupt safe. * However, it is internally reenterable wrt itself and fib6_add/fib6_del. * It means, that we can modify tree during walking * and use this function for garbage collection, clone pruning, * cleaning tree when a device goes down etc. etc. * * It guarantees that every node will be traversed, * and that it will be traversed only once. * * Callback function w->func may return: * 0 -> continue walking. * positive value -> walking is suspended (used by tree dumps, * and probably by gc, if it will be split to several slices) * negative value -> terminate walking. * * The function itself returns: * 0 -> walk is complete. * >0 -> walk is incomplete (i.e. suspended) * <0 -> walk is terminated by an error. * * This function is called with tb6_lock held. */ static int fib6_walk_continue(struct fib6_walker *w) { struct fib6_node *fn, *pn, *left, *right; /* w->root should always be table->tb6_root */ WARN_ON_ONCE(!(w->root->fn_flags & RTN_TL_ROOT)); for (;;) { fn = w->node; if (!fn) return 0; switch (w->state) { #ifdef CONFIG_IPV6_SUBTREES case FWS_S: if (FIB6_SUBTREE(fn)) { w->node = FIB6_SUBTREE(fn); continue; } w->state = FWS_L; fallthrough; #endif case FWS_L: left = rcu_dereference_protected(fn->left, 1); if (left) { w->node = left; w->state = FWS_INIT; continue; } w->state = FWS_R; fallthrough; case FWS_R: right = rcu_dereference_protected(fn->right, 1); if (right) { w->node = right; w->state = FWS_INIT; continue; } w->state = FWS_C; w->leaf = rcu_dereference_protected(fn->leaf, 1); fallthrough; case FWS_C: if (w->leaf && fn->fn_flags & RTN_RTINFO) { int err; if (w->skip) { w->skip--; goto skip; } err = w->func(w); if (err) return err; w->count++; continue; } skip: w->state = FWS_U; fallthrough; case FWS_U: if (fn == w->root) return 0; pn = rcu_dereference_protected(fn->parent, 1); left = rcu_dereference_protected(pn->left, 1); right = rcu_dereference_protected(pn->right, 1); w->node = pn; #ifdef CONFIG_IPV6_SUBTREES if (FIB6_SUBTREE(pn) == fn) { WARN_ON(!(fn->fn_flags & RTN_ROOT)); w->state = FWS_L; continue; } #endif if (left == fn) { w->state = FWS_R; continue; } if (right == fn) { w->state = FWS_C; w->leaf = rcu_dereference_protected(w->node->leaf, 1); continue; } #if RT6_DEBUG >= 2 WARN_ON(1); #endif } } } static int fib6_walk(struct net *net, struct fib6_walker *w) { int res; w->state = FWS_INIT; w->node = w->root; fib6_walker_link(net, w); res = fib6_walk_continue(w); if (res <= 0) fib6_walker_unlink(net, w); return res; } static int fib6_clean_node(struct fib6_walker *w) { int res; struct fib6_info *rt; struct fib6_cleaner *c = container_of(w, struct fib6_cleaner, w); struct nl_info info = { .nl_net = c->net, .skip_notify = c->skip_notify, }; if (c->sernum != FIB6_NO_SERNUM_CHANGE && READ_ONCE(w->node->fn_sernum) != c->sernum) WRITE_ONCE(w->node->fn_sernum, c->sernum); if (!c->func) { WARN_ON_ONCE(c->sernum == FIB6_NO_SERNUM_CHANGE); w->leaf = NULL; return 0; } for_each_fib6_walker_rt(w) { res = c->func(rt, c->arg); if (res == -1) { w->leaf = rt; res = fib6_del(rt, &info); if (res) { #if RT6_DEBUG >= 2 pr_debug("%s: del failed: rt=%p@%p err=%d\n", __func__, rt, rcu_access_pointer(rt->fib6_node), res); #endif continue; } return 0; } else if (res == -2) { if (WARN_ON(!rt->fib6_nsiblings)) continue; rt = list_last_entry(&rt->fib6_siblings, struct fib6_info, fib6_siblings); continue; } WARN_ON(res != 0); } w->leaf = rt; return 0; } /* * Convenient frontend to tree walker. * * func is called on each route. * It may return -2 -> skip multipath route. * -1 -> delete this route. * 0 -> continue walking */ static void fib6_clean_tree(struct net *net, struct fib6_node *root, int (*func)(struct fib6_info *, void *arg), int sernum, void *arg, bool skip_notify) { struct fib6_cleaner c; c.w.root = root; c.w.func = fib6_clean_node; c.w.count = 0; c.w.skip = 0; c.w.skip_in_node = 0; c.func = func; c.sernum = sernum; c.arg = arg; c.net = net; c.skip_notify = skip_notify; fib6_walk(net, &c.w); } static void __fib6_clean_all(struct net *net, int (*func)(struct fib6_info *, void *), int sernum, void *arg, bool skip_notify) { struct fib6_table *table; struct hlist_head *head; unsigned int h; rcu_read_lock(); for (h = 0; h < FIB6_TABLE_HASHSZ; h++) { head = &net->ipv6.fib_table_hash[h]; hlist_for_each_entry_rcu(table, head, tb6_hlist) { spin_lock_bh(&table->tb6_lock); fib6_clean_tree(net, &table->tb6_root, func, sernum, arg, skip_notify); spin_unlock_bh(&table->tb6_lock); } } rcu_read_unlock(); } void fib6_clean_all(struct net *net, int (*func)(struct fib6_info *, void *), void *arg) { __fib6_clean_all(net, func, FIB6_NO_SERNUM_CHANGE, arg, false); } void fib6_clean_all_skip_notify(struct net *net, int (*func)(struct fib6_info *, void *), void *arg) { __fib6_clean_all(net, func, FIB6_NO_SERNUM_CHANGE, arg, true); } static void fib6_flush_trees(struct net *net) { int new_sernum = fib6_new_sernum(net); __fib6_clean_all(net, NULL, new_sernum, NULL, false); } /* * Garbage collection */ static int fib6_age(struct fib6_info *rt, void *arg) { struct fib6_gc_args *gc_args = arg; unsigned long now = jiffies; /* * check addrconf expiration here. * Routes are expired even if they are in use. */ if (rt->fib6_flags & RTF_EXPIRES && rt->expires) { if (time_after(now, rt->expires)) { RT6_TRACE("expiring %p\n", rt); return -1; } gc_args->more++; } /* Also age clones in the exception table. * Note, that clones are aged out * only if they are not in use now. */ rt6_age_exceptions(rt, gc_args, now); return 0; } void fib6_run_gc(unsigned long expires, struct net *net, bool force) { struct fib6_gc_args gc_args; unsigned long now; if (force) { spin_lock_bh(&net->ipv6.fib6_gc_lock); } else if (!spin_trylock_bh(&net->ipv6.fib6_gc_lock)) { mod_timer(&net->ipv6.ip6_fib_timer, jiffies + HZ); return; } gc_args.timeout = expires ? (int)expires : net->ipv6.sysctl.ip6_rt_gc_interval; gc_args.more = 0; fib6_clean_all(net, fib6_age, &gc_args); now = jiffies; net->ipv6.ip6_rt_last_gc = now; if (gc_args.more) mod_timer(&net->ipv6.ip6_fib_timer, round_jiffies(now + net->ipv6.sysctl.ip6_rt_gc_interval)); else del_timer(&net->ipv6.ip6_fib_timer); spin_unlock_bh(&net->ipv6.fib6_gc_lock); } static void fib6_gc_timer_cb(struct timer_list *t) { struct net *arg = from_timer(arg, t, ipv6.ip6_fib_timer); fib6_run_gc(0, arg, true); } static int __net_init fib6_net_init(struct net *net) { size_t size = sizeof(struct hlist_head) * FIB6_TABLE_HASHSZ; int err; err = fib6_notifier_init(net); if (err) return err; /* Default to 3-tuple */ net->ipv6.sysctl.multipath_hash_fields = FIB_MULTIPATH_HASH_FIELD_DEFAULT_MASK; spin_lock_init(&net->ipv6.fib6_gc_lock); rwlock_init(&net->ipv6.fib6_walker_lock); INIT_LIST_HEAD(&net->ipv6.fib6_walkers); timer_setup(&net->ipv6.ip6_fib_timer, fib6_gc_timer_cb, 0); net->ipv6.rt6_stats = kzalloc(sizeof(*net->ipv6.rt6_stats), GFP_KERNEL); if (!net->ipv6.rt6_stats) goto out_notifier; /* Avoid false sharing : Use at least a full cache line */ size = max_t(size_t, size, L1_CACHE_BYTES); net->ipv6.fib_table_hash = kzalloc(size, GFP_KERNEL); if (!net->ipv6.fib_table_hash) goto out_rt6_stats; net->ipv6.fib6_main_tbl = kzalloc(sizeof(*net->ipv6.fib6_main_tbl), GFP_KERNEL); if (!net->ipv6.fib6_main_tbl) goto out_fib_table_hash; net->ipv6.fib6_main_tbl->tb6_id = RT6_TABLE_MAIN; rcu_assign_pointer(net->ipv6.fib6_main_tbl->tb6_root.leaf, net->ipv6.fib6_null_entry); net->ipv6.fib6_main_tbl->tb6_root.fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO; inet_peer_base_init(&net->ipv6.fib6_main_tbl->tb6_peers); #ifdef CONFIG_IPV6_MULTIPLE_TABLES net->ipv6.fib6_local_tbl = kzalloc(sizeof(*net->ipv6.fib6_local_tbl), GFP_KERNEL); if (!net->ipv6.fib6_local_tbl) goto out_fib6_main_tbl; net->ipv6.fib6_local_tbl->tb6_id = RT6_TABLE_LOCAL; rcu_assign_pointer(net->ipv6.fib6_local_tbl->tb6_root.leaf, net->ipv6.fib6_null_entry); net->ipv6.fib6_local_tbl->tb6_root.fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO; inet_peer_base_init(&net->ipv6.fib6_local_tbl->tb6_peers); #endif fib6_tables_init(net); return 0; #ifdef CONFIG_IPV6_MULTIPLE_TABLES out_fib6_main_tbl: kfree(net->ipv6.fib6_main_tbl); #endif out_fib_table_hash: kfree(net->ipv6.fib_table_hash); out_rt6_stats: kfree(net->ipv6.rt6_stats); out_notifier: fib6_notifier_exit(net); return -ENOMEM; } static void fib6_net_exit(struct net *net) { unsigned int i; del_timer_sync(&net->ipv6.ip6_fib_timer); for (i = 0; i < FIB6_TABLE_HASHSZ; i++) { struct hlist_head *head = &net->ipv6.fib_table_hash[i]; struct hlist_node *tmp; struct fib6_table *tb; hlist_for_each_entry_safe(tb, tmp, head, tb6_hlist) { hlist_del(&tb->tb6_hlist); fib6_free_table(tb); } } kfree(net->ipv6.fib_table_hash); kfree(net->ipv6.rt6_stats); fib6_notifier_exit(net); } static struct pernet_operations fib6_net_ops = { .init = fib6_net_init, .exit = fib6_net_exit, }; int __init fib6_init(void) { int ret = -ENOMEM; fib6_node_kmem = kmem_cache_create("fib6_nodes", sizeof(struct fib6_node), 0, SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, NULL); if (!fib6_node_kmem) goto out; ret = register_pernet_subsys(&fib6_net_ops); if (ret) goto out_kmem_cache_create; ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE, NULL, inet6_dump_fib, 0); if (ret) goto out_unregister_subsys; __fib6_flush_trees = fib6_flush_trees; out: return ret; out_unregister_subsys: unregister_pernet_subsys(&fib6_net_ops); out_kmem_cache_create: kmem_cache_destroy(fib6_node_kmem); goto out; } void fib6_gc_cleanup(void) { unregister_pernet_subsys(&fib6_net_ops); kmem_cache_destroy(fib6_node_kmem); } #ifdef CONFIG_PROC_FS static int ipv6_route_native_seq_show(struct seq_file *seq, void *v) { struct fib6_info *rt = v; struct ipv6_route_iter *iter = seq->private; struct fib6_nh *fib6_nh = rt->fib6_nh; unsigned int flags = rt->fib6_flags; const struct net_device *dev; if (rt->nh) fib6_nh = nexthop_fib6_nh(rt->nh); seq_printf(seq, "%pi6 %02x ", &rt->fib6_dst.addr, rt->fib6_dst.plen); #ifdef CONFIG_IPV6_SUBTREES seq_printf(seq, "%pi6 %02x ", &rt->fib6_src.addr, rt->fib6_src.plen); #else seq_puts(seq, "00000000000000000000000000000000 00 "); #endif if (fib6_nh->fib_nh_gw_family) { flags |= RTF_GATEWAY; seq_printf(seq, "%pi6", &fib6_nh->fib_nh_gw6); } else { seq_puts(seq, "00000000000000000000000000000000"); } dev = fib6_nh->fib_nh_dev; seq_printf(seq, " %08x %08x %08x %08x %8s\n", rt->fib6_metric, refcount_read(&rt->fib6_ref), 0, flags, dev ? dev->name : ""); iter->w.leaf = NULL; return 0; } static int ipv6_route_yield(struct fib6_walker *w) { struct ipv6_route_iter *iter = w->args; if (!iter->skip) return 1; do { iter->w.leaf = rcu_dereference_protected( iter->w.leaf->fib6_next, lockdep_is_held(&iter->tbl->tb6_lock)); iter->skip--; if (!iter->skip && iter->w.leaf) return 1; } while (iter->w.leaf); return 0; } static void ipv6_route_seq_setup_walk(struct ipv6_route_iter *iter, struct net *net) { memset(&iter->w, 0, sizeof(iter->w)); iter->w.func = ipv6_route_yield; iter->w.root = &iter->tbl->tb6_root; iter->w.state = FWS_INIT; iter->w.node = iter->w.root; iter->w.args = iter; iter->sernum = READ_ONCE(iter->w.root->fn_sernum); INIT_LIST_HEAD(&iter->w.lh); fib6_walker_link(net, &iter->w); } static struct fib6_table *ipv6_route_seq_next_table(struct fib6_table *tbl, struct net *net) { unsigned int h; struct hlist_node *node; if (tbl) { h = (tbl->tb6_id & (FIB6_TABLE_HASHSZ - 1)) + 1; node = rcu_dereference(hlist_next_rcu(&tbl->tb6_hlist)); } else { h = 0; node = NULL; } while (!node && h < FIB6_TABLE_HASHSZ) { node = rcu_dereference( hlist_first_rcu(&net->ipv6.fib_table_hash[h++])); } return hlist_entry_safe(node, struct fib6_table, tb6_hlist); } static void ipv6_route_check_sernum(struct ipv6_route_iter *iter) { int sernum = READ_ONCE(iter->w.root->fn_sernum); if (iter->sernum != sernum) { iter->sernum = sernum; iter->w.state = FWS_INIT; iter->w.node = iter->w.root; WARN_ON(iter->w.skip); iter->w.skip = iter->w.count; } } static void *ipv6_route_seq_next(struct seq_file *seq, void *v, loff_t *pos) { int r; struct fib6_info *n; struct net *net = seq_file_net(seq); struct ipv6_route_iter *iter = seq->private; ++(*pos); if (!v) goto iter_table; n = rcu_dereference(((struct fib6_info *)v)->fib6_next); if (n) return n; iter_table: ipv6_route_check_sernum(iter); spin_lock_bh(&iter->tbl->tb6_lock); r = fib6_walk_continue(&iter->w); spin_unlock_bh(&iter->tbl->tb6_lock); if (r > 0) { return iter->w.leaf; } else if (r < 0) { fib6_walker_unlink(net, &iter->w); return NULL; } fib6_walker_unlink(net, &iter->w); iter->tbl = ipv6_route_seq_next_table(iter->tbl, net); if (!iter->tbl) return NULL; ipv6_route_seq_setup_walk(iter, net); goto iter_table; } static void *ipv6_route_seq_start(struct seq_file *seq, loff_t *pos) __acquires(RCU) { struct net *net = seq_file_net(seq); struct ipv6_route_iter *iter = seq->private; rcu_read_lock(); iter->tbl = ipv6_route_seq_next_table(NULL, net); iter->skip = *pos; if (iter->tbl) { loff_t p = 0; ipv6_route_seq_setup_walk(iter, net); return ipv6_route_seq_next(seq, NULL, &p); } else { return NULL; } } static bool ipv6_route_iter_active(struct ipv6_route_iter *iter) { struct fib6_walker *w = &iter->w; return w->node && !(w->state == FWS_U && w->node == w->root); } static void ipv6_route_native_seq_stop(struct seq_file *seq, void *v) __releases(RCU) { struct net *net = seq_file_net(seq); struct ipv6_route_iter *iter = seq->private; if (ipv6_route_iter_active(iter)) fib6_walker_unlink(net, &iter->w); rcu_read_unlock(); } #if IS_BUILTIN(CONFIG_IPV6) && defined(CONFIG_BPF_SYSCALL) static int ipv6_route_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta, void *v) { struct bpf_iter__ipv6_route ctx; ctx.meta = meta; ctx.rt = v; return bpf_iter_run_prog(prog, &ctx); } static int ipv6_route_seq_show(struct seq_file *seq, void *v) { struct ipv6_route_iter *iter = seq->private; struct bpf_iter_meta meta; struct bpf_prog *prog; int ret; meta.seq = seq; prog = bpf_iter_get_info(&meta, false); if (!prog) return ipv6_route_native_seq_show(seq, v); ret = ipv6_route_prog_seq_show(prog, &meta, v); iter->w.leaf = NULL; return ret; } static void ipv6_route_seq_stop(struct seq_file *seq, void *v) { struct bpf_iter_meta meta; struct bpf_prog *prog; if (!v) { meta.seq = seq; prog = bpf_iter_get_info(&meta, true); if (prog) (void)ipv6_route_prog_seq_show(prog, &meta, v); } ipv6_route_native_seq_stop(seq, v); } #else static int ipv6_route_seq_show(struct seq_file *seq, void *v) { return ipv6_route_native_seq_show(seq, v); } static void ipv6_route_seq_stop(struct seq_file *seq, void *v) { ipv6_route_native_seq_stop(seq, v); } #endif const struct seq_operations ipv6_route_seq_ops = { .start = ipv6_route_seq_start, .next = ipv6_route_seq_next, .stop = ipv6_route_seq_stop, .show = ipv6_route_seq_show }; #endif /* CONFIG_PROC_FS */ |
| 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 | /* BlueZ - Bluetooth protocol stack for Linux Copyright (C) 2000-2001 Qualcomm Incorporated Copyright 2023 NXP Written 2000,2001 by Maxim Krasnyansky <maxk@qualcomm.com> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License version 2 as published by the Free Software Foundation; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS SOFTWARE IS DISCLAIMED. */ #ifndef __HCI_H #define __HCI_H #define HCI_MAX_ACL_SIZE 1024 #define HCI_MAX_SCO_SIZE 255 #define HCI_MAX_ISO_SIZE 251 #define HCI_MAX_EVENT_SIZE 260 #define HCI_MAX_FRAME_SIZE (HCI_MAX_ACL_SIZE + 4) #define HCI_LINK_KEY_SIZE 16 #define HCI_AMP_LINK_KEY_SIZE (2 * HCI_LINK_KEY_SIZE) #define HCI_MAX_AMP_ASSOC_SIZE 672 #define HCI_MAX_CPB_DATA_SIZE 252 /* HCI dev events */ #define HCI_DEV_REG 1 #define HCI_DEV_UNREG 2 #define HCI_DEV_UP 3 #define HCI_DEV_DOWN 4 #define HCI_DEV_SUSPEND 5 #define HCI_DEV_RESUME 6 #define HCI_DEV_OPEN 7 #define HCI_DEV_CLOSE 8 #define HCI_DEV_SETUP 9 /* HCI notify events */ #define HCI_NOTIFY_CONN_ADD 1 #define HCI_NOTIFY_CONN_DEL 2 #define HCI_NOTIFY_VOICE_SETTING 3 #define HCI_NOTIFY_ENABLE_SCO_CVSD 4 #define HCI_NOTIFY_ENABLE_SCO_TRANSP 5 #define HCI_NOTIFY_DISABLE_SCO 6 /* HCI bus types */ #define HCI_VIRTUAL 0 #define HCI_USB 1 #define HCI_PCCARD 2 #define HCI_UART 3 #define HCI_RS232 4 #define HCI_PCI 5 #define HCI_SDIO 6 #define HCI_SPI 7 #define HCI_I2C 8 #define HCI_SMD 9 #define HCI_VIRTIO 10 /* HCI controller types */ #define HCI_PRIMARY 0x00 #define HCI_AMP 0x01 /* First BR/EDR Controller shall have ID = 0 */ #define AMP_ID_BREDR 0x00 /* AMP controller types */ #define AMP_TYPE_BREDR 0x00 #define AMP_TYPE_80211 0x01 /* AMP controller status */ #define AMP_STATUS_POWERED_DOWN 0x00 #define AMP_STATUS_BLUETOOTH_ONLY 0x01 #define AMP_STATUS_NO_CAPACITY 0x02 #define AMP_STATUS_LOW_CAPACITY 0x03 #define AMP_STATUS_MEDIUM_CAPACITY 0x04 #define AMP_STATUS_HIGH_CAPACITY 0x05 #define AMP_STATUS_FULL_CAPACITY 0x06 /* HCI device quirks */ enum { /* When this quirk is set, the HCI Reset command is send when * closing the transport instead of when opening it. * * This quirk must be set before hci_register_dev is called. */ HCI_QUIRK_RESET_ON_CLOSE, /* When this quirk is set, the device is turned into a raw-only * device and it will stay in unconfigured state. * * This quirk must be set before hci_register_dev is called. */ HCI_QUIRK_RAW_DEVICE, /* When this quirk is set, the buffer sizes reported by * HCI Read Buffer Size command are corrected if invalid. * * This quirk must be set before hci_register_dev is called. */ HCI_QUIRK_FIXUP_BUFFER_SIZE, /* When this quirk is set, then a controller that does not * indicate support for Inquiry Result with RSSI is assumed to * support it anyway. Some early Bluetooth 1.2 controllers had * wrongly configured local features that will require forcing * them to enable this mode. Getting RSSI information with the * inquiry responses is preferred since it allows for a better * user experience. * * This quirk must be set before hci_register_dev is called. */ HCI_QUIRK_FIXUP_INQUIRY_MODE, /* When this quirk is set, then the HCI Read Local Supported * Commands command is not supported. In general Bluetooth 1.2 * and later controllers should support this command. However * some controllers indicate Bluetooth 1.2 support, but do * not support this command. * * This quirk must be set before hci_register_dev is called. */ HCI_QUIRK_BROKEN_LOCAL_COMMANDS, /* When this quirk is set, then no stored link key handling * is performed. This is mainly due to the fact that the * HCI Delete Stored Link Key command is advertised, but * not supported. * * This quirk must be set before hci_register_dev is called. */ HCI_QUIRK_BROKEN_STORED_LINK_KEY, /* When this quirk is set, an external configuration step * is required and will be indicated with the controller * configuration. * * This quirk can be set before hci_register_dev is called or * during the hdev->setup vendor callback. */ HCI_QUIRK_EXTERNAL_CONFIG, /* When this quirk is set, the public Bluetooth address * initially reported by HCI Read BD Address command * is considered invalid. Controller configuration is * required before this device can be used. * * This quirk can be set before hci_register_dev is called or * during the hdev->setup vendor callback. */ HCI_QUIRK_INVALID_BDADDR, /* When this quirk is set, the public Bluetooth address * initially reported by HCI Read BD Address command * is considered invalid. The public BD Address can be * specified in the fwnode property 'local-bd-address'. * If this property does not exist or is invalid controller * configuration is required before this device can be used. * * This quirk can be set before hci_register_dev is called or * during the hdev->setup vendor callback. */ HCI_QUIRK_USE_BDADDR_PROPERTY, /* When this quirk is set, the duplicate filtering during * scanning is based on Bluetooth devices addresses. To allow * RSSI based updates, restart scanning if needed. * * This quirk can be set before hci_register_dev is called or * during the hdev->setup vendor callback. */ HCI_QUIRK_STRICT_DUPLICATE_FILTER, /* When this quirk is set, LE scan and BR/EDR inquiry is done * simultaneously, otherwise it's interleaved. * * This quirk can be set before hci_register_dev is called or * during the hdev->setup vendor callback. */ HCI_QUIRK_SIMULTANEOUS_DISCOVERY, /* When this quirk is set, the enabling of diagnostic mode is * not persistent over HCI Reset. Every time the controller * is brought up it needs to be reprogrammed. * * This quirk can be set before hci_register_dev is called or * during the hdev->setup vendor callback. */ HCI_QUIRK_NON_PERSISTENT_DIAG, /* When this quirk is set, setup() would be run after every * open() and not just after the first open(). * * This quirk can be set before hci_register_dev is called or * during the hdev->setup vendor callback. * */ HCI_QUIRK_NON_PERSISTENT_SETUP, /* When this quirk is set, wide band speech is supported by * the driver since no reliable mechanism exist to report * this from the hardware, a driver flag is use to convey * this support * * This quirk must be set before hci_register_dev is called. */ HCI_QUIRK_WIDEBAND_SPEECH_SUPPORTED, /* When this quirk is set, the controller has validated that * LE states reported through the HCI_LE_READ_SUPPORTED_STATES are * valid. This mechanism is necessary as many controllers have * been seen has having trouble initiating a connectable * advertisement despite the state combination being reported as * supported. */ HCI_QUIRK_VALID_LE_STATES, /* When this quirk is set, then erroneous data reporting * is ignored. This is mainly due to the fact that the HCI * Read Default Erroneous Data Reporting command is advertised, * but not supported; these controllers often reply with unknown * command and tend to lock up randomly. Needing a hard reset. * * This quirk can be set before hci_register_dev is called or * during the hdev->setup vendor callback. */ HCI_QUIRK_BROKEN_ERR_DATA_REPORTING, /* * When this quirk is set, then the hci_suspend_notifier is not * registered. This is intended for devices which drop completely * from the bus on system-suspend and which will show up as a new * HCI after resume. */ HCI_QUIRK_NO_SUSPEND_NOTIFIER, /* * When this quirk is set, LE tx power is not queried on startup * and the min/max tx power values default to HCI_TX_POWER_INVALID. * * This quirk can be set before hci_register_dev is called or * during the hdev->setup vendor callback. */ HCI_QUIRK_BROKEN_READ_TRANSMIT_POWER, /* When this quirk is set, HCI_OP_SET_EVENT_FLT requests with * HCI_FLT_CLEAR_ALL are ignored and event filtering is * completely avoided. A subset of the CSR controller * clones struggle with this and instantly lock up. * * Note that devices using this must (separately) disable * runtime suspend, because event filtering takes place there. */ HCI_QUIRK_BROKEN_FILTER_CLEAR_ALL, /* * When this quirk is set, disables the use of * HCI_OP_ENHANCED_SETUP_SYNC_CONN command to setup SCO connections. * * This quirk can be set before hci_register_dev is called or * during the hdev->setup vendor callback. */ HCI_QUIRK_BROKEN_ENHANCED_SETUP_SYNC_CONN, /* * When this quirk is set, the HCI_OP_LE_SET_EXT_SCAN_ENABLE command is * disabled. This is required for some Broadcom controllers which * erroneously claim to support extended scanning. * * This quirk can be set before hci_register_dev is called or * during the hdev->setup vendor callback. */ HCI_QUIRK_BROKEN_EXT_SCAN, /* * When this quirk is set, the HCI_OP_GET_MWS_TRANSPORT_CONFIG command is * disabled. This is required for some Broadcom controllers which * erroneously claim to support MWS Transport Layer Configuration. * * This quirk can be set before hci_register_dev is called or * during the hdev->setup vendor callback. */ HCI_QUIRK_BROKEN_MWS_TRANSPORT_CONFIG, /* When this quirk is set, max_page for local extended features * is set to 1, even if controller reports higher number. Some * controllers (e.g. RTL8723CS) report more pages, but they * don't actually support features declared there. */ HCI_QUIRK_BROKEN_LOCAL_EXT_FEATURES_PAGE_2, /* * When this quirk is set, the HCI_OP_LE_SET_RPA_TIMEOUT command is * skipped during initialization. This is required for the Actions * Semiconductor ATS2851 based controllers, which erroneously claims * to support it. */ HCI_QUIRK_BROKEN_SET_RPA_TIMEOUT, /* When this quirk is set, MSFT extension monitor tracking by * address filter is supported. Since tracking quantity of each * pattern is limited, this feature supports tracking multiple * devices concurrently if controller supports multiple * address filters. * * This quirk must be set before hci_register_dev is called. */ HCI_QUIRK_USE_MSFT_EXT_ADDRESS_FILTER, /* * When this quirk is set, LE Coded PHY shall not be used. This is * required for some Intel controllers which erroneously claim to * support it but it causes problems with extended scanning. * * This quirk can be set before hci_register_dev is called or * during the hdev->setup vendor callback. */ HCI_QUIRK_BROKEN_LE_CODED, }; /* HCI device flags */ enum { HCI_UP, HCI_INIT, HCI_RUNNING, HCI_PSCAN, HCI_ISCAN, HCI_AUTH, HCI_ENCRYPT, HCI_INQUIRY, HCI_RAW, HCI_RESET, }; /* HCI socket flags */ enum { HCI_SOCK_TRUSTED, HCI_MGMT_INDEX_EVENTS, HCI_MGMT_UNCONF_INDEX_EVENTS, HCI_MGMT_EXT_INDEX_EVENTS, HCI_MGMT_EXT_INFO_EVENTS, HCI_MGMT_OPTION_EVENTS, HCI_MGMT_SETTING_EVENTS, HCI_MGMT_DEV_CLASS_EVENTS, HCI_MGMT_LOCAL_NAME_EVENTS, HCI_MGMT_OOB_DATA_EVENTS, HCI_MGMT_EXP_FEATURE_EVENTS, }; /* * BR/EDR and/or LE controller flags: the flags defined here should represent * states from the controller. */ enum { HCI_SETUP, HCI_CONFIG, HCI_DEBUGFS_CREATED, HCI_AUTO_OFF, HCI_RFKILLED, HCI_MGMT, HCI_BONDABLE, HCI_SERVICE_CACHE, HCI_KEEP_DEBUG_KEYS, HCI_USE_DEBUG_KEYS, HCI_UNREGISTER, HCI_UNCONFIGURED, HCI_USER_CHANNEL, HCI_EXT_CONFIGURED, HCI_LE_ADV, HCI_LE_PER_ADV, HCI_LE_SCAN, HCI_SSP_ENABLED, HCI_SC_ENABLED, HCI_SC_ONLY, HCI_PRIVACY, HCI_LIMITED_PRIVACY, HCI_RPA_EXPIRED, HCI_RPA_RESOLVING, HCI_HS_ENABLED, HCI_LE_ENABLED, HCI_ADVERTISING, HCI_ADVERTISING_CONNECTABLE, HCI_CONNECTABLE, HCI_DISCOVERABLE, HCI_LIMITED_DISCOVERABLE, HCI_LINK_SECURITY, HCI_PERIODIC_INQ, HCI_FAST_CONNECTABLE, HCI_BREDR_ENABLED, HCI_LE_SCAN_INTERRUPTED, HCI_WIDEBAND_SPEECH_ENABLED, HCI_EVENT_FILTER_CONFIGURED, HCI_PA_SYNC, HCI_DUT_MODE, HCI_VENDOR_DIAG, HCI_FORCE_BREDR_SMP, HCI_FORCE_STATIC_ADDR, HCI_LL_RPA_RESOLUTION, HCI_ENABLE_LL_PRIVACY, HCI_CMD_PENDING, HCI_FORCE_NO_MITM, HCI_QUALITY_REPORT, HCI_OFFLOAD_CODECS_ENABLED, HCI_LE_SIMULTANEOUS_ROLES, HCI_CMD_DRAIN_WORKQUEUE, HCI_MESH_EXPERIMENTAL, HCI_MESH, HCI_MESH_SENDING, __HCI_NUM_FLAGS, }; /* HCI timeouts */ #define HCI_DISCONN_TIMEOUT msecs_to_jiffies(2000) /* 2 seconds */ #define HCI_PAIRING_TIMEOUT msecs_to_jiffies(60000) /* 60 seconds */ #define HCI_INIT_TIMEOUT msecs_to_jiffies(10000) /* 10 seconds */ #define HCI_CMD_TIMEOUT msecs_to_jiffies(2000) /* 2 seconds */ #define HCI_NCMD_TIMEOUT msecs_to_jiffies(4000) /* 4 seconds */ #define HCI_ACL_TX_TIMEOUT msecs_to_jiffies(45000) /* 45 seconds */ #define HCI_AUTO_OFF_TIMEOUT msecs_to_jiffies(2000) /* 2 seconds */ #define HCI_POWER_OFF_TIMEOUT msecs_to_jiffies(5000) /* 5 seconds */ #define HCI_LE_CONN_TIMEOUT msecs_to_jiffies(20000) /* 20 seconds */ #define HCI_LE_AUTOCONN_TIMEOUT msecs_to_jiffies(4000) /* 4 seconds */ /* HCI data types */ #define HCI_COMMAND_PKT 0x01 #define HCI_ACLDATA_PKT 0x02 #define HCI_SCODATA_PKT 0x03 #define HCI_EVENT_PKT 0x04 #define HCI_ISODATA_PKT 0x05 #define HCI_DIAG_PKT 0xf0 #define HCI_VENDOR_PKT 0xff /* HCI packet types */ #define HCI_DM1 0x0008 #define HCI_DM3 0x0400 #define HCI_DM5 0x4000 #define HCI_DH1 0x0010 #define HCI_DH3 0x0800 #define HCI_DH5 0x8000 /* HCI packet types inverted masks */ #define HCI_2DH1 0x0002 #define HCI_3DH1 0x0004 #define HCI_2DH3 0x0100 #define HCI_3DH3 0x0200 #define HCI_2DH5 0x1000 #define HCI_3DH5 0x2000 #define HCI_HV1 0x0020 #define HCI_HV2 0x0040 #define HCI_HV3 0x0080 #define SCO_PTYPE_MASK (HCI_HV1 | HCI_HV2 | HCI_HV3) #define ACL_PTYPE_MASK (~SCO_PTYPE_MASK) /* eSCO packet types */ #define ESCO_HV1 0x0001 #define ESCO_HV2 0x0002 #define ESCO_HV3 0x0004 #define ESCO_EV3 0x0008 #define ESCO_EV4 0x0010 #define ESCO_EV5 0x0020 #define ESCO_2EV3 0x0040 #define ESCO_3EV3 0x0080 #define ESCO_2EV5 0x0100 #define ESCO_3EV5 0x0200 #define SCO_ESCO_MASK (ESCO_HV1 | ESCO_HV2 | ESCO_HV3) #define EDR_ESCO_MASK (ESCO_2EV3 | ESCO_3EV3 | ESCO_2EV5 | ESCO_3EV5) /* ACL flags */ #define ACL_START_NO_FLUSH 0x00 #define ACL_CONT 0x01 #define ACL_START 0x02 #define ACL_COMPLETE 0x03 #define ACL_ACTIVE_BCAST 0x04 #define ACL_PICO_BCAST 0x08 /* ISO PB flags */ #define ISO_START 0x00 #define ISO_CONT 0x01 #define ISO_SINGLE 0x02 #define ISO_END 0x03 /* ISO TS flags */ #define ISO_TS 0x01 /* Baseband links */ #define SCO_LINK 0x00 #define ACL_LINK 0x01 #define ESCO_LINK 0x02 /* Low Energy links do not have defined link type. Use invented one */ #define LE_LINK 0x80 #define AMP_LINK 0x81 #define ISO_LINK 0x82 #define INVALID_LINK 0xff /* LMP features */ #define LMP_3SLOT 0x01 #define LMP_5SLOT 0x02 #define LMP_ENCRYPT 0x04 #define LMP_SOFFSET 0x08 #define LMP_TACCURACY 0x10 #define LMP_RSWITCH 0x20 #define LMP_HOLD 0x40 #define LMP_SNIFF 0x80 #define LMP_PARK 0x01 #define LMP_RSSI 0x02 #define LMP_QUALITY 0x04 #define LMP_SCO 0x08 #define LMP_HV2 0x10 #define LMP_HV3 0x20 #define LMP_ULAW 0x40 #define LMP_ALAW 0x80 #define LMP_CVSD 0x01 #define LMP_PSCHEME 0x02 #define LMP_PCONTROL 0x04 #define LMP_TRANSPARENT 0x08 #define LMP_EDR_2M 0x02 #define LMP_EDR_3M 0x04 #define LMP_RSSI_INQ 0x40 #define LMP_ESCO 0x80 #define LMP_EV4 0x01 #define LMP_EV5 0x02 #define LMP_NO_BREDR 0x20 #define LMP_LE 0x40 #define LMP_EDR_3SLOT 0x80 #define LMP_EDR_5SLOT 0x01 #define LMP_SNIFF_SUBR 0x02 #define LMP_PAUSE_ENC 0x04 #define LMP_EDR_ESCO_2M 0x20 #define LMP_EDR_ESCO_3M 0x40 #define LMP_EDR_3S_ESCO 0x80 #define LMP_EXT_INQ 0x01 #define LMP_SIMUL_LE_BR 0x02 #define LMP_SIMPLE_PAIR 0x08 #define LMP_ERR_DATA_REPORTING 0x20 #define LMP_NO_FLUSH 0x40 #define LMP_LSTO 0x01 #define LMP_INQ_TX_PWR 0x02 #define LMP_EXTFEATURES 0x80 /* Extended LMP features */ #define LMP_CPB_CENTRAL 0x01 #define LMP_CPB_PERIPHERAL 0x02 #define LMP_SYNC_TRAIN 0x04 #define LMP_SYNC_SCAN 0x08 #define LMP_SC 0x01 #define LMP_PING 0x02 /* Host features */ #define LMP_HOST_SSP 0x01 #define LMP_HOST_LE 0x02 #define LMP_HOST_LE_BREDR 0x04 #define LMP_HOST_SC 0x08 /* LE features */ #define HCI_LE_ENCRYPTION 0x01 #define HCI_LE_CONN_PARAM_REQ_PROC 0x02 #define HCI_LE_PERIPHERAL_FEATURES 0x08 #define HCI_LE_PING 0x10 #define HCI_LE_DATA_LEN_EXT 0x20 #define HCI_LE_LL_PRIVACY 0x40 #define HCI_LE_EXT_SCAN_POLICY 0x80 #define HCI_LE_PHY_2M 0x01 #define HCI_LE_PHY_CODED 0x08 #define HCI_LE_EXT_ADV 0x10 #define HCI_LE_PERIODIC_ADV 0x20 #define HCI_LE_CHAN_SEL_ALG2 0x40 #define HCI_LE_CIS_CENTRAL 0x10 #define HCI_LE_CIS_PERIPHERAL 0x20 #define HCI_LE_ISO_BROADCASTER 0x40 #define HCI_LE_ISO_SYNC_RECEIVER 0x80 /* Connection modes */ #define HCI_CM_ACTIVE 0x0000 #define HCI_CM_HOLD 0x0001 #define HCI_CM_SNIFF 0x0002 #define HCI_CM_PARK 0x0003 /* Link policies */ #define HCI_LP_RSWITCH 0x0001 #define HCI_LP_HOLD 0x0002 #define HCI_LP_SNIFF 0x0004 #define HCI_LP_PARK 0x0008 /* Link modes */ #define HCI_LM_ACCEPT 0x8000 #define HCI_LM_MASTER 0x0001 #define HCI_LM_AUTH 0x0002 #define HCI_LM_ENCRYPT 0x0004 #define HCI_LM_TRUSTED 0x0008 #define HCI_LM_RELIABLE 0x0010 #define HCI_LM_SECURE 0x0020 #define HCI_LM_FIPS 0x0040 /* Authentication types */ #define HCI_AT_NO_BONDING 0x00 #define HCI_AT_NO_BONDING_MITM 0x01 #define HCI_AT_DEDICATED_BONDING 0x02 #define HCI_AT_DEDICATED_BONDING_MITM 0x03 #define HCI_AT_GENERAL_BONDING 0x04 #define HCI_AT_GENERAL_BONDING_MITM 0x05 /* I/O capabilities */ #define HCI_IO_DISPLAY_ONLY 0x00 #define HCI_IO_DISPLAY_YESNO 0x01 #define HCI_IO_KEYBOARD_ONLY 0x02 #define HCI_IO_NO_INPUT_OUTPUT 0x03 /* Link Key types */ #define HCI_LK_COMBINATION 0x00 #define HCI_LK_LOCAL_UNIT 0x01 #define HCI_LK_REMOTE_UNIT 0x02 #define HCI_LK_DEBUG_COMBINATION 0x03 #define HCI_LK_UNAUTH_COMBINATION_P192 0x04 #define HCI_LK_AUTH_COMBINATION_P192 0x05 #define HCI_LK_CHANGED_COMBINATION 0x06 #define HCI_LK_UNAUTH_COMBINATION_P256 0x07 #define HCI_LK_AUTH_COMBINATION_P256 0x08 /* ---- HCI Error Codes ---- */ #define HCI_ERROR_UNKNOWN_CONN_ID 0x02 #define HCI_ERROR_AUTH_FAILURE 0x05 #define HCI_ERROR_PIN_OR_KEY_MISSING 0x06 #define HCI_ERROR_MEMORY_EXCEEDED 0x07 #define HCI_ERROR_CONNECTION_TIMEOUT 0x08 #define HCI_ERROR_REJ_LIMITED_RESOURCES 0x0d #define HCI_ERROR_REJ_BAD_ADDR 0x0f #define HCI_ERROR_INVALID_PARAMETERS 0x12 #define HCI_ERROR_REMOTE_USER_TERM 0x13 #define HCI_ERROR_REMOTE_LOW_RESOURCES 0x14 #define HCI_ERROR_REMOTE_POWER_OFF 0x15 #define HCI_ERROR_LOCAL_HOST_TERM 0x16 #define HCI_ERROR_PAIRING_NOT_ALLOWED 0x18 #define HCI_ERROR_INVALID_LL_PARAMS 0x1e #define HCI_ERROR_UNSPECIFIED 0x1f #define HCI_ERROR_ADVERTISING_TIMEOUT 0x3c #define HCI_ERROR_CANCELLED_BY_HOST 0x44 /* Flow control modes */ #define HCI_FLOW_CTL_MODE_PACKET_BASED 0x00 #define HCI_FLOW_CTL_MODE_BLOCK_BASED 0x01 /* The core spec defines 127 as the "not available" value */ #define HCI_TX_POWER_INVALID 127 #define HCI_RSSI_INVALID 127 #define HCI_SYNC_HANDLE_INVALID 0xffff #define HCI_ROLE_MASTER 0x00 #define HCI_ROLE_SLAVE 0x01 /* Extended Inquiry Response field types */ #define EIR_FLAGS 0x01 /* flags */ #define EIR_UUID16_SOME 0x02 /* 16-bit UUID, more available */ #define EIR_UUID16_ALL 0x03 /* 16-bit UUID, all listed */ #define EIR_UUID32_SOME 0x04 /* 32-bit UUID, more available */ #define EIR_UUID32_ALL 0x05 /* 32-bit UUID, all listed */ #define EIR_UUID128_SOME 0x06 /* 128-bit UUID, more available */ #define EIR_UUID128_ALL 0x07 /* 128-bit UUID, all listed */ #define EIR_NAME_SHORT 0x08 /* shortened local name */ #define EIR_NAME_COMPLETE 0x09 /* complete local name */ #define EIR_TX_POWER 0x0A /* transmit power level */ #define EIR_CLASS_OF_DEV 0x0D /* Class of Device */ #define EIR_SSP_HASH_C192 0x0E /* Simple Pairing Hash C-192 */ #define EIR_SSP_RAND_R192 0x0F /* Simple Pairing Randomizer R-192 */ #define EIR_DEVICE_ID 0x10 /* device ID */ #define EIR_APPEARANCE 0x19 /* Device appearance */ #define EIR_SERVICE_DATA 0x16 /* Service Data */ #define EIR_LE_BDADDR 0x1B /* LE Bluetooth device address */ #define EIR_LE_ROLE 0x1C /* LE role */ #define EIR_SSP_HASH_C256 0x1D /* Simple Pairing Hash C-256 */ #define EIR_SSP_RAND_R256 0x1E /* Simple Pairing Rand R-256 */ #define EIR_LE_SC_CONFIRM 0x22 /* LE SC Confirmation Value */ #define EIR_LE_SC_RANDOM 0x23 /* LE SC Random Value */ /* Low Energy Advertising Flags */ #define LE_AD_LIMITED 0x01 /* Limited Discoverable */ #define LE_AD_GENERAL 0x02 /* General Discoverable */ #define LE_AD_NO_BREDR 0x04 /* BR/EDR not supported */ #define LE_AD_SIM_LE_BREDR_CTRL 0x08 /* Simultaneous LE & BR/EDR Controller */ #define LE_AD_SIM_LE_BREDR_HOST 0x10 /* Simultaneous LE & BR/EDR Host */ /* ----- HCI Commands ---- */ #define HCI_OP_NOP 0x0000 #define HCI_OP_INQUIRY 0x0401 struct hci_cp_inquiry { __u8 lap[3]; __u8 length; __u8 num_rsp; } __packed; #define HCI_OP_INQUIRY_CANCEL 0x0402 #define HCI_OP_PERIODIC_INQ 0x0403 #define HCI_OP_EXIT_PERIODIC_INQ 0x0404 #define HCI_OP_CREATE_CONN 0x0405 struct hci_cp_create_conn { bdaddr_t bdaddr; __le16 pkt_type; __u8 pscan_rep_mode; __u8 pscan_mode; __le16 clock_offset; __u8 role_switch; } __packed; #define HCI_OP_DISCONNECT 0x0406 struct hci_cp_disconnect { __le16 handle; __u8 reason; } __packed; #define HCI_OP_ADD_SCO 0x0407 struct hci_cp_add_sco { __le16 handle; __le16 pkt_type; } __packed; #define HCI_OP_CREATE_CONN_CANCEL 0x0408 struct hci_cp_create_conn_cancel { bdaddr_t bdaddr; } __packed; #define HCI_OP_ACCEPT_CONN_REQ 0x0409 struct hci_cp_accept_conn_req { bdaddr_t bdaddr; __u8 role; } __packed; #define HCI_OP_REJECT_CONN_REQ 0x040a struct hci_cp_reject_conn_req { bdaddr_t bdaddr; __u8 reason; } __packed; #define HCI_OP_LINK_KEY_REPLY 0x040b struct hci_cp_link_key_reply { bdaddr_t bdaddr; __u8 link_key[HCI_LINK_KEY_SIZE]; } __packed; #define HCI_OP_LINK_KEY_NEG_REPLY 0x040c struct hci_cp_link_key_neg_reply { bdaddr_t bdaddr; } __packed; #define HCI_OP_PIN_CODE_REPLY 0x040d struct hci_cp_pin_code_reply { bdaddr_t bdaddr; __u8 pin_len; __u8 pin_code[16]; } __packed; struct hci_rp_pin_code_reply { __u8 status; bdaddr_t bdaddr; } __packed; #define HCI_OP_PIN_CODE_NEG_REPLY 0x040e struct hci_cp_pin_code_neg_reply { bdaddr_t bdaddr; } __packed; struct hci_rp_pin_code_neg_reply { __u8 status; bdaddr_t bdaddr; } __packed; #define HCI_OP_CHANGE_CONN_PTYPE 0x040f struct hci_cp_change_conn_ptype { __le16 handle; __le16 pkt_type; } __packed; #define HCI_OP_AUTH_REQUESTED 0x0411 struct hci_cp_auth_requested { __le16 handle; } __packed; #define HCI_OP_SET_CONN_ENCRYPT 0x0413 struct hci_cp_set_conn_encrypt { __le16 handle; __u8 encrypt; } __packed; #define HCI_OP_CHANGE_CONN_LINK_KEY 0x0415 struct hci_cp_change_conn_link_key { __le16 handle; } __packed; #define HCI_OP_REMOTE_NAME_REQ 0x0419 struct hci_cp_remote_name_req { bdaddr_t bdaddr; __u8 pscan_rep_mode; __u8 pscan_mode; __le16 clock_offset; } __packed; #define HCI_OP_REMOTE_NAME_REQ_CANCEL 0x041a struct hci_cp_remote_name_req_cancel { bdaddr_t bdaddr; } __packed; #define HCI_OP_READ_REMOTE_FEATURES 0x041b struct hci_cp_read_remote_features { __le16 handle; } __packed; #define HCI_OP_READ_REMOTE_EXT_FEATURES 0x041c struct hci_cp_read_remote_ext_features { __le16 handle; __u8 page; } __packed; #define HCI_OP_READ_REMOTE_VERSION 0x041d struct hci_cp_read_remote_version { __le16 handle; } __packed; #define HCI_OP_READ_CLOCK_OFFSET 0x041f struct hci_cp_read_clock_offset { __le16 handle; } __packed; #define HCI_OP_SETUP_SYNC_CONN 0x0428 struct hci_cp_setup_sync_conn { __le16 handle; __le32 tx_bandwidth; __le32 rx_bandwidth; __le16 max_latency; __le16 voice_setting; __u8 retrans_effort; __le16 pkt_type; } __packed; #define HCI_OP_ACCEPT_SYNC_CONN_REQ 0x0429 struct hci_cp_accept_sync_conn_req { bdaddr_t bdaddr; __le32 tx_bandwidth; __le32 rx_bandwidth; __le16 max_latency; __le16 content_format; __u8 retrans_effort; __le16 pkt_type; } __packed; #define HCI_OP_REJECT_SYNC_CONN_REQ 0x042a struct hci_cp_reject_sync_conn_req { bdaddr_t bdaddr; __u8 reason; } __packed; #define HCI_OP_IO_CAPABILITY_REPLY 0x042b struct hci_cp_io_capability_reply { bdaddr_t bdaddr; __u8 capability; __u8 oob_data; __u8 authentication; } __packed; #define HCI_OP_USER_CONFIRM_REPLY 0x042c struct hci_cp_user_confirm_reply { bdaddr_t bdaddr; } __packed; struct hci_rp_user_confirm_reply { __u8 status; bdaddr_t bdaddr; } __packed; #define HCI_OP_USER_CONFIRM_NEG_REPLY 0x042d #define HCI_OP_USER_PASSKEY_REPLY 0x042e struct hci_cp_user_passkey_reply { bdaddr_t bdaddr; __le32 passkey; } __packed; #define HCI_OP_USER_PASSKEY_NEG_REPLY 0x042f #define HCI_OP_REMOTE_OOB_DATA_REPLY 0x0430 struct hci_cp_remote_oob_data_reply { bdaddr_t bdaddr; __u8 hash[16]; __u8 rand[16]; } __packed; #define HCI_OP_REMOTE_OOB_DATA_NEG_REPLY 0x0433 struct hci_cp_remote_oob_data_neg_reply { bdaddr_t bdaddr; } __packed; #define HCI_OP_IO_CAPABILITY_NEG_REPLY 0x0434 struct hci_cp_io_capability_neg_reply { bdaddr_t bdaddr; __u8 reason; } __packed; #define HCI_OP_CREATE_PHY_LINK 0x0435 struct hci_cp_create_phy_link { __u8 phy_handle; __u8 key_len; __u8 key_type; __u8 key[HCI_AMP_LINK_KEY_SIZE]; } __packed; #define HCI_OP_ACCEPT_PHY_LINK 0x0436 struct hci_cp_accept_phy_link { __u8 phy_handle; __u8 key_len; __u8 key_type; __u8 key[HCI_AMP_LINK_KEY_SIZE]; } __packed; #define HCI_OP_DISCONN_PHY_LINK 0x0437 struct hci_cp_disconn_phy_link { __u8 phy_handle; __u8 reason; } __packed; struct ext_flow_spec { __u8 id; __u8 stype; __le16 msdu; __le32 sdu_itime; __le32 acc_lat; __le32 flush_to; } __packed; #define HCI_OP_CREATE_LOGICAL_LINK 0x0438 #define HCI_OP_ACCEPT_LOGICAL_LINK 0x0439 struct hci_cp_create_accept_logical_link { __u8 phy_handle; struct ext_flow_spec tx_flow_spec; struct ext_flow_spec rx_flow_spec; } __packed; #define HCI_OP_DISCONN_LOGICAL_LINK 0x043a struct hci_cp_disconn_logical_link { __le16 log_handle; } __packed; #define HCI_OP_LOGICAL_LINK_CANCEL 0x043b struct hci_cp_logical_link_cancel { __u8 phy_handle; __u8 flow_spec_id; } __packed; #define HCI_OP_ENHANCED_SETUP_SYNC_CONN 0x043d struct hci_coding_format { __u8 id; __le16 cid; __le16 vid; } __packed; struct hci_cp_enhanced_setup_sync_conn { __le16 handle; __le32 tx_bandwidth; __le32 rx_bandwidth; struct hci_coding_format tx_coding_format; struct hci_coding_format rx_coding_format; __le16 tx_codec_frame_size; __le16 rx_codec_frame_size; __le32 in_bandwidth; __le32 out_bandwidth; struct hci_coding_format in_coding_format; struct hci_coding_format out_coding_format; __le16 in_coded_data_size; __le16 out_coded_data_size; __u8 in_pcm_data_format; __u8 out_pcm_data_format; __u8 in_pcm_sample_payload_msb_pos; __u8 out_pcm_sample_payload_msb_pos; __u8 in_data_path; __u8 out_data_path; __u8 in_transport_unit_size; __u8 out_transport_unit_size; __le16 max_latency; __le16 pkt_type; __u8 retrans_effort; } __packed; struct hci_rp_logical_link_cancel { __u8 status; __u8 phy_handle; __u8 flow_spec_id; } __packed; #define HCI_OP_SET_CPB 0x0441 struct hci_cp_set_cpb { __u8 enable; __u8 lt_addr; __u8 lpo_allowed; __le16 packet_type; __le16 interval_min; __le16 interval_max; __le16 cpb_sv_tout; } __packed; struct hci_rp_set_cpb { __u8 status; __u8 lt_addr; __le16 interval; } __packed; #define HCI_OP_START_SYNC_TRAIN 0x0443 #define HCI_OP_REMOTE_OOB_EXT_DATA_REPLY 0x0445 struct hci_cp_remote_oob_ext_data_reply { bdaddr_t bdaddr; __u8 hash192[16]; __u8 rand192[16]; __u8 hash256[16]; __u8 rand256[16]; } __packed; #define HCI_OP_SNIFF_MODE 0x0803 struct hci_cp_sniff_mode { __le16 handle; __le16 max_interval; __le16 min_interval; __le16 attempt; __le16 timeout; } __packed; #define HCI_OP_EXIT_SNIFF_MODE 0x0804 struct hci_cp_exit_sniff_mode { __le16 handle; } __packed; #define HCI_OP_ROLE_DISCOVERY 0x0809 struct hci_cp_role_discovery { __le16 handle; } __packed; struct hci_rp_role_discovery { __u8 status; __le16 handle; __u8 role; } __packed; #define HCI_OP_SWITCH_ROLE 0x080b struct hci_cp_switch_role { bdaddr_t bdaddr; __u8 role; } __packed; #define HCI_OP_READ_LINK_POLICY 0x080c struct hci_cp_read_link_policy { __le16 handle; } __packed; struct hci_rp_read_link_policy { __u8 status; __le16 handle; __le16 policy; } __packed; #define HCI_OP_WRITE_LINK_POLICY 0x080d struct hci_cp_write_link_policy { __le16 handle; __le16 policy; } __packed; struct hci_rp_write_link_policy { __u8 status; __le16 handle; } __packed; #define HCI_OP_READ_DEF_LINK_POLICY 0x080e struct hci_rp_read_def_link_policy { __u8 status; __le16 policy; } __packed; #define HCI_OP_WRITE_DEF_LINK_POLICY 0x080f struct hci_cp_write_def_link_policy { __le16 policy; } __packed; #define HCI_OP_SNIFF_SUBRATE 0x0811 struct hci_cp_sniff_subrate { __le16 handle; __le16 max_latency; __le16 min_remote_timeout; __le16 min_local_timeout; } __packed; #define HCI_OP_SET_EVENT_MASK 0x0c01 #define HCI_OP_RESET 0x0c03 #define HCI_OP_SET_EVENT_FLT 0x0c05 #define HCI_SET_EVENT_FLT_SIZE 9 struct hci_cp_set_event_filter { __u8 flt_type; __u8 cond_type; struct { bdaddr_t bdaddr; __u8 auto_accept; } __packed addr_conn_flt; } __packed; /* Filter types */ #define HCI_FLT_CLEAR_ALL 0x00 #define HCI_FLT_INQ_RESULT 0x01 #define HCI_FLT_CONN_SETUP 0x02 /* CONN_SETUP Condition types */ #define HCI_CONN_SETUP_ALLOW_ALL 0x00 #define HCI_CONN_SETUP_ALLOW_CLASS 0x01 #define HCI_CONN_SETUP_ALLOW_BDADDR 0x02 /* CONN_SETUP Conditions */ #define HCI_CONN_SETUP_AUTO_OFF 0x01 #define HCI_CONN_SETUP_AUTO_ON 0x02 #define HCI_CONN_SETUP_AUTO_ON_WITH_RS 0x03 #define HCI_OP_READ_STORED_LINK_KEY 0x0c0d struct hci_cp_read_stored_link_key { bdaddr_t bdaddr; __u8 read_all; } __packed; struct hci_rp_read_stored_link_key { __u8 status; __le16 max_keys; __le16 num_keys; } __packed; #define HCI_OP_DELETE_STORED_LINK_KEY 0x0c12 struct hci_cp_delete_stored_link_key { bdaddr_t bdaddr; __u8 delete_all; } __packed; struct hci_rp_delete_stored_link_key { __u8 status; __le16 num_keys; } __packed; #define HCI_MAX_NAME_LENGTH 248 #define HCI_OP_WRITE_LOCAL_NAME 0x0c13 struct hci_cp_write_local_name { __u8 name[HCI_MAX_NAME_LENGTH]; } __packed; #define HCI_OP_READ_LOCAL_NAME 0x0c14 struct hci_rp_read_local_name { __u8 status; __u8 name[HCI_MAX_NAME_LENGTH]; } __packed; #define HCI_OP_WRITE_CA_TIMEOUT 0x0c16 #define HCI_OP_WRITE_PG_TIMEOUT 0x0c18 #define HCI_OP_WRITE_SCAN_ENABLE 0x0c1a #define SCAN_DISABLED 0x00 #define SCAN_INQUIRY 0x01 #define SCAN_PAGE 0x02 #define HCI_OP_READ_AUTH_ENABLE 0x0c1f #define HCI_OP_WRITE_AUTH_ENABLE 0x0c20 #define AUTH_DISABLED 0x00 #define AUTH_ENABLED 0x01 #define HCI_OP_READ_ENCRYPT_MODE 0x0c21 #define HCI_OP_WRITE_ENCRYPT_MODE 0x0c22 #define ENCRYPT_DISABLED 0x00 #define ENCRYPT_P2P 0x01 #define ENCRYPT_BOTH 0x02 #define HCI_OP_READ_CLASS_OF_DEV 0x0c23 struct hci_rp_read_class_of_dev { __u8 status; __u8 dev_class[3]; } __packed; #define HCI_OP_WRITE_CLASS_OF_DEV 0x0c24 struct hci_cp_write_class_of_dev { __u8 dev_class[3]; } __packed; #define HCI_OP_READ_VOICE_SETTING 0x0c25 struct hci_rp_read_voice_setting { __u8 status; __le16 voice_setting; } __packed; #define HCI_OP_WRITE_VOICE_SETTING 0x0c26 struct hci_cp_write_voice_setting { __le16 voice_setting; } __packed; #define HCI_OP_HOST_BUFFER_SIZE 0x0c33 struct hci_cp_host_buffer_size { __le16 acl_mtu; __u8 sco_mtu; __le16 acl_max_pkt; __le16 sco_max_pkt; } __packed; #define HCI_OP_READ_NUM_SUPPORTED_IAC 0x0c38 struct hci_rp_read_num_supported_iac { __u8 status; __u8 num_iac; } __packed; #define HCI_OP_READ_CURRENT_IAC_LAP 0x0c39 #define HCI_OP_WRITE_CURRENT_IAC_LAP 0x0c3a struct hci_cp_write_current_iac_lap { __u8 num_iac; __u8 iac_lap[6]; } __packed; #define HCI_OP_WRITE_INQUIRY_MODE 0x0c45 #define HCI_MAX_EIR_LENGTH 240 #define HCI_OP_WRITE_EIR 0x0c52 struct hci_cp_write_eir { __u8 fec; __u8 data[HCI_MAX_EIR_LENGTH]; } __packed; #define HCI_OP_READ_SSP_MODE 0x0c55 struct hci_rp_read_ssp_mode { __u8 status; __u8 mode; } __packed; #define HCI_OP_WRITE_SSP_MODE 0x0c56 struct hci_cp_write_ssp_mode { __u8 mode; } __packed; #define HCI_OP_READ_LOCAL_OOB_DATA 0x0c57 struct hci_rp_read_local_oob_data { __u8 status; __u8 hash[16]; __u8 rand[16]; } __packed; #define HCI_OP_READ_INQ_RSP_TX_POWER 0x0c58 struct hci_rp_read_inq_rsp_tx_power { __u8 status; __s8 tx_power; } __packed; #define HCI_OP_READ_DEF_ERR_DATA_REPORTING 0x0c5a #define ERR_DATA_REPORTING_DISABLED 0x00 #define ERR_DATA_REPORTING_ENABLED 0x01 struct hci_rp_read_def_err_data_reporting { __u8 status; __u8 err_data_reporting; } __packed; #define HCI_OP_WRITE_DEF_ERR_DATA_REPORTING 0x0c5b struct hci_cp_write_def_err_data_reporting { __u8 err_data_reporting; } __packed; #define HCI_OP_SET_EVENT_MASK_PAGE_2 0x0c63 #define HCI_OP_READ_LOCATION_DATA 0x0c64 #define HCI_OP_READ_FLOW_CONTROL_MODE 0x0c66 struct hci_rp_read_flow_control_mode { __u8 status; __u8 mode; } __packed; #define HCI_OP_WRITE_LE_HOST_SUPPORTED 0x0c6d struct hci_cp_write_le_host_supported { __u8 le; __u8 simul; } __packed; #define HCI_OP_SET_RESERVED_LT_ADDR 0x0c74 struct hci_cp_set_reserved_lt_addr { __u8 lt_addr; } __packed; struct hci_rp_set_reserved_lt_addr { __u8 status; __u8 lt_addr; } __packed; #define HCI_OP_DELETE_RESERVED_LT_ADDR 0x0c75 struct hci_cp_delete_reserved_lt_addr { __u8 lt_addr; } __packed; struct hci_rp_delete_reserved_lt_addr { __u8 status; __u8 lt_addr; } __packed; #define HCI_OP_SET_CPB_DATA 0x0c76 struct hci_cp_set_cpb_data { __u8 lt_addr; __u8 fragment; __u8 data_length; __u8 data[HCI_MAX_CPB_DATA_SIZE]; } __packed; struct hci_rp_set_cpb_data { __u8 status; __u8 lt_addr; } __packed; #define HCI_OP_READ_SYNC_TRAIN_PARAMS 0x0c77 #define HCI_OP_WRITE_SYNC_TRAIN_PARAMS 0x0c78 struct hci_cp_write_sync_train_params { __le16 interval_min; __le16 interval_max; __le32 sync_train_tout; __u8 service_data; } __packed; struct hci_rp_write_sync_train_params { __u8 status; __le16 sync_train_int; } __packed; #define HCI_OP_READ_SC_SUPPORT 0x0c79 struct hci_rp_read_sc_support { __u8 status; __u8 support; } __packed; #define HCI_OP_WRITE_SC_SUPPORT 0x0c7a struct hci_cp_write_sc_support { __u8 support; } __packed; #define HCI_OP_READ_AUTH_PAYLOAD_TO 0x0c7b struct hci_cp_read_auth_payload_to { __le16 handle; } __packed; struct hci_rp_read_auth_payload_to { __u8 status; __le16 handle; __le16 timeout; } __packed; #define HCI_OP_WRITE_AUTH_PAYLOAD_TO 0x0c7c struct hci_cp_write_auth_payload_to { __le16 handle; __le16 timeout; } __packed; struct hci_rp_write_auth_payload_to { __u8 status; __le16 handle; } __packed; #define HCI_OP_READ_LOCAL_OOB_EXT_DATA 0x0c7d struct hci_rp_read_local_oob_ext_data { __u8 status; __u8 hash192[16]; __u8 rand192[16]; __u8 hash256[16]; __u8 rand256[16]; } __packed; #define HCI_CONFIGURE_DATA_PATH 0x0c83 struct hci_op_configure_data_path { __u8 direction; __u8 data_path_id; __u8 vnd_len; __u8 vnd_data[]; } __packed; #define HCI_OP_READ_LOCAL_VERSION 0x1001 struct hci_rp_read_local_version { __u8 status; __u8 hci_ver; __le16 hci_rev; __u8 lmp_ver; __le16 manufacturer; __le16 lmp_subver; } __packed; #define HCI_OP_READ_LOCAL_COMMANDS 0x1002 struct hci_rp_read_local_commands { __u8 status; __u8 commands[64]; } __packed; #define HCI_OP_READ_LOCAL_FEATURES 0x1003 struct hci_rp_read_local_features { __u8 status; __u8 features[8]; } __packed; #define HCI_OP_READ_LOCAL_EXT_FEATURES 0x1004 struct hci_cp_read_local_ext_features { __u8 page; } __packed; struct hci_rp_read_local_ext_features { __u8 status; __u8 page; __u8 max_page; __u8 features[8]; } __packed; #define HCI_OP_READ_BUFFER_SIZE 0x1005 struct hci_rp_read_buffer_size { __u8 status; __le16 acl_mtu; __u8 sco_mtu; __le16 acl_max_pkt; __le16 sco_max_pkt; } __packed; #define HCI_OP_READ_BD_ADDR 0x1009 struct hci_rp_read_bd_addr { __u8 status; bdaddr_t bdaddr; } __packed; #define HCI_OP_READ_DATA_BLOCK_SIZE 0x100a struct hci_rp_read_data_block_size { __u8 status; __le16 max_acl_len; __le16 block_len; __le16 num_blocks; } __packed; #define HCI_OP_READ_LOCAL_CODECS 0x100b struct hci_std_codecs { __u8 num; __u8 codec[]; } __packed; struct hci_vnd_codec { /* company id */ __le16 cid; /* vendor codec id */ __le16 vid; } __packed; struct hci_vnd_codecs { __u8 num; struct hci_vnd_codec codec[]; } __packed; struct hci_rp_read_local_supported_codecs { __u8 status; struct hci_std_codecs std_codecs; struct hci_vnd_codecs vnd_codecs; } __packed; #define HCI_OP_READ_LOCAL_PAIRING_OPTS 0x100c struct hci_rp_read_local_pairing_opts { __u8 status; __u8 pairing_opts; __u8 max_key_size; } __packed; #define HCI_OP_READ_LOCAL_CODECS_V2 0x100d struct hci_std_codec_v2 { __u8 id; __u8 transport; } __packed; struct hci_std_codecs_v2 { __u8 num; struct hci_std_codec_v2 codec[]; } __packed; struct hci_vnd_codec_v2 { __le16 cid; __le16 vid; __u8 transport; } __packed; struct hci_vnd_codecs_v2 { __u8 num; struct hci_vnd_codec_v2 codec[]; } __packed; struct hci_rp_read_local_supported_codecs_v2 { __u8 status; struct hci_std_codecs_v2 std_codecs; struct hci_vnd_codecs_v2 vendor_codecs; } __packed; #define HCI_OP_READ_LOCAL_CODEC_CAPS 0x100e struct hci_op_read_local_codec_caps { __u8 id; __le16 cid; __le16 vid; __u8 transport; __u8 direction; } __packed; struct hci_codec_caps { __u8 len; __u8 data[]; } __packed; struct hci_rp_read_local_codec_caps { __u8 status; __u8 num_caps; } __packed; #define HCI_OP_READ_PAGE_SCAN_ACTIVITY 0x0c1b struct hci_rp_read_page_scan_activity { __u8 status; __le16 interval; __le16 window; } __packed; #define HCI_OP_WRITE_PAGE_SCAN_ACTIVITY 0x0c1c struct hci_cp_write_page_scan_activity { __le16 interval; __le16 window; } __packed; #define HCI_OP_READ_TX_POWER 0x0c2d struct hci_cp_read_tx_power { __le16 handle; __u8 type; } __packed; struct hci_rp_read_tx_power { __u8 status; __le16 handle; __s8 tx_power; } __packed; #define HCI_OP_READ_PAGE_SCAN_TYPE 0x0c46 struct hci_rp_read_page_scan_type { __u8 status; __u8 type; } __packed; #define HCI_OP_WRITE_PAGE_SCAN_TYPE 0x0c47 #define PAGE_SCAN_TYPE_STANDARD 0x00 #define PAGE_SCAN_TYPE_INTERLACED 0x01 #define HCI_OP_READ_RSSI 0x1405 struct hci_cp_read_rssi { __le16 handle; } __packed; struct hci_rp_read_rssi { __u8 status; __le16 handle; __s8 rssi; } __packed; #define HCI_OP_READ_CLOCK 0x1407 struct hci_cp_read_clock { __le16 handle; __u8 which; } __packed; struct hci_rp_read_clock { __u8 status; __le16 handle; __le32 clock; __le16 accuracy; } __packed; #define HCI_OP_READ_ENC_KEY_SIZE 0x1408 struct hci_cp_read_enc_key_size { __le16 handle; } __packed; struct hci_rp_read_enc_key_size { __u8 status; __le16 handle; __u8 key_size; } __packed; #define HCI_OP_READ_LOCAL_AMP_INFO 0x1409 struct hci_rp_read_local_amp_info { __u8 status; __u8 amp_status; __le32 total_bw; __le32 max_bw; __le32 min_latency; __le32 max_pdu; __u8 amp_type; __le16 pal_cap; __le16 max_assoc_size; __le32 max_flush_to; __le32 be_flush_to; } __packed; #define HCI_OP_READ_LOCAL_AMP_ASSOC 0x140a struct hci_cp_read_local_amp_assoc { __u8 phy_handle; __le16 len_so_far; __le16 max_len; } __packed; struct hci_rp_read_local_amp_assoc { __u8 status; __u8 phy_handle; __le16 rem_len; __u8 frag[]; } __packed; #define HCI_OP_WRITE_REMOTE_AMP_ASSOC 0x140b struct hci_cp_write_remote_amp_assoc { __u8 phy_handle; __le16 len_so_far; __le16 rem_len; __u8 frag[]; } __packed; struct hci_rp_write_remote_amp_assoc { __u8 status; __u8 phy_handle; } __packed; #define HCI_OP_GET_MWS_TRANSPORT_CONFIG 0x140c #define HCI_OP_ENABLE_DUT_MODE 0x1803 #define HCI_OP_WRITE_SSP_DEBUG_MODE 0x1804 #define HCI_OP_LE_SET_EVENT_MASK 0x2001 struct hci_cp_le_set_event_mask { __u8 mask[8]; } __packed; #define HCI_OP_LE_READ_BUFFER_SIZE 0x2002 struct hci_rp_le_read_buffer_size { __u8 status; __le16 le_mtu; __u8 le_max_pkt; } __packed; #define HCI_OP_LE_READ_LOCAL_FEATURES 0x2003 struct hci_rp_le_read_local_features { __u8 status; __u8 features[8]; } __packed; #define HCI_OP_LE_SET_RANDOM_ADDR 0x2005 #define HCI_OP_LE_SET_ADV_PARAM 0x2006 struct hci_cp_le_set_adv_param { __le16 min_interval; __le16 max_interval; __u8 type; __u8 own_address_type; __u8 direct_addr_type; bdaddr_t direct_addr; __u8 channel_map; __u8 filter_policy; } __packed; #define HCI_OP_LE_READ_ADV_TX_POWER 0x2007 struct hci_rp_le_read_adv_tx_power { __u8 status; __s8 tx_power; } __packed; #define HCI_MAX_AD_LENGTH 31 #define HCI_OP_LE_SET_ADV_DATA 0x2008 struct hci_cp_le_set_adv_data { __u8 length; __u8 data[HCI_MAX_AD_LENGTH]; } __packed; #define HCI_OP_LE_SET_SCAN_RSP_DATA 0x2009 struct hci_cp_le_set_scan_rsp_data { __u8 length; __u8 data[HCI_MAX_AD_LENGTH]; } __packed; #define HCI_OP_LE_SET_ADV_ENABLE 0x200a #define LE_SCAN_PASSIVE 0x00 #define LE_SCAN_ACTIVE 0x01 #define HCI_OP_LE_SET_SCAN_PARAM 0x200b struct hci_cp_le_set_scan_param { __u8 type; __le16 interval; __le16 window; __u8 own_address_type; __u8 filter_policy; } __packed; #define LE_SCAN_DISABLE 0x00 #define LE_SCAN_ENABLE 0x01 #define LE_SCAN_FILTER_DUP_DISABLE 0x00 #define LE_SCAN_FILTER_DUP_ENABLE 0x01 #define HCI_OP_LE_SET_SCAN_ENABLE 0x200c struct hci_cp_le_set_scan_enable { __u8 enable; __u8 filter_dup; } __packed; #define HCI_LE_USE_PEER_ADDR 0x00 #define HCI_LE_USE_ACCEPT_LIST 0x01 #define HCI_OP_LE_CREATE_CONN 0x200d struct hci_cp_le_create_conn { __le16 scan_interval; __le16 scan_window; __u8 filter_policy; __u8 peer_addr_type; bdaddr_t peer_addr; __u8 own_address_type; __le16 conn_interval_min; __le16 conn_interval_max; __le16 conn_latency; __le16 supervision_timeout; __le16 min_ce_len; __le16 max_ce_len; } __packed; #define HCI_OP_LE_CREATE_CONN_CANCEL 0x200e #define HCI_OP_LE_READ_ACCEPT_LIST_SIZE 0x200f struct hci_rp_le_read_accept_list_size { __u8 status; __u8 size; } __packed; #define HCI_OP_LE_CLEAR_ACCEPT_LIST 0x2010 #define HCI_OP_LE_ADD_TO_ACCEPT_LIST 0x2011 struct hci_cp_le_add_to_accept_list { __u8 bdaddr_type; bdaddr_t bdaddr; } __packed; #define HCI_OP_LE_DEL_FROM_ACCEPT_LIST 0x2012 struct hci_cp_le_del_from_accept_list { __u8 bdaddr_type; bdaddr_t bdaddr; } __packed; #define HCI_OP_LE_CONN_UPDATE 0x2013 struct hci_cp_le_conn_update { __le16 handle; __le16 conn_interval_min; __le16 conn_interval_max; __le16 conn_latency; __le16 supervision_timeout; __le16 min_ce_len; __le16 max_ce_len; } __packed; #define HCI_OP_LE_READ_REMOTE_FEATURES 0x2016 struct hci_cp_le_read_remote_features { __le16 handle; } __packed; #define HCI_OP_LE_START_ENC 0x2019 struct hci_cp_le_start_enc { __le16 handle; __le64 rand; __le16 ediv; __u8 ltk[16]; } __packed; #define HCI_OP_LE_LTK_REPLY 0x201a struct hci_cp_le_ltk_reply { __le16 handle; __u8 ltk[16]; } __packed; struct hci_rp_le_ltk_reply { __u8 status; __le16 handle; } __packed; #define HCI_OP_LE_LTK_NEG_REPLY 0x201b struct hci_cp_le_ltk_neg_reply { __le16 handle; } __packed; struct hci_rp_le_ltk_neg_reply { __u8 status; __le16 handle; } __packed; #define HCI_OP_LE_READ_SUPPORTED_STATES 0x201c struct hci_rp_le_read_supported_states { __u8 status; __u8 le_states[8]; } __packed; #define HCI_OP_LE_CONN_PARAM_REQ_REPLY 0x2020 struct hci_cp_le_conn_param_req_reply { __le16 handle; __le16 interval_min; __le16 interval_max; __le16 latency; __le16 timeout; __le16 min_ce_len; __le16 max_ce_len; } __packed; #define HCI_OP_LE_CONN_PARAM_REQ_NEG_REPLY 0x2021 struct hci_cp_le_conn_param_req_neg_reply { __le16 handle; __u8 reason; } __packed; #define HCI_OP_LE_SET_DATA_LEN 0x2022 struct hci_cp_le_set_data_len { __le16 handle; __le16 tx_len; __le16 tx_time; } __packed; struct hci_rp_le_set_data_len { __u8 status; __le16 handle; } __packed; #define HCI_OP_LE_READ_DEF_DATA_LEN 0x2023 struct hci_rp_le_read_def_data_len { __u8 status; __le16 tx_len; __le16 tx_time; } __packed; #define HCI_OP_LE_WRITE_DEF_DATA_LEN 0x2024 struct hci_cp_le_write_def_data_len { __le16 tx_len; __le16 tx_time; } __packed; #define HCI_OP_LE_ADD_TO_RESOLV_LIST 0x2027 struct hci_cp_le_add_to_resolv_list { __u8 bdaddr_type; bdaddr_t bdaddr; __u8 peer_irk[16]; __u8 local_irk[16]; } __packed; #define HCI_OP_LE_DEL_FROM_RESOLV_LIST 0x2028 struct hci_cp_le_del_from_resolv_list { __u8 bdaddr_type; bdaddr_t bdaddr; } __packed; #define HCI_OP_LE_CLEAR_RESOLV_LIST 0x2029 #define HCI_OP_LE_READ_RESOLV_LIST_SIZE 0x202a struct hci_rp_le_read_resolv_list_size { __u8 status; __u8 size; } __packed; #define HCI_OP_LE_SET_ADDR_RESOLV_ENABLE 0x202d #define HCI_OP_LE_SET_RPA_TIMEOUT 0x202e #define HCI_OP_LE_READ_MAX_DATA_LEN 0x202f struct hci_rp_le_read_max_data_len { __u8 status; __le16 tx_len; __le16 tx_time; __le16 rx_len; __le16 rx_time; } __packed; #define HCI_OP_LE_SET_DEFAULT_PHY 0x2031 struct hci_cp_le_set_default_phy { __u8 all_phys; __u8 tx_phys; __u8 rx_phys; } __packed; #define HCI_LE_SET_PHY_1M 0x01 #define HCI_LE_SET_PHY_2M 0x02 #define HCI_LE_SET_PHY_CODED 0x04 #define HCI_OP_LE_SET_EXT_SCAN_PARAMS 0x2041 struct hci_cp_le_set_ext_scan_params { __u8 own_addr_type; __u8 filter_policy; __u8 scanning_phys; __u8 data[]; } __packed; #define LE_SCAN_PHY_1M 0x01 #define LE_SCAN_PHY_2M 0x02 #define LE_SCAN_PHY_CODED 0x04 struct hci_cp_le_scan_phy_params { __u8 type; __le16 interval; __le16 window; } __packed; #define HCI_OP_LE_SET_EXT_SCAN_ENABLE 0x2042 struct hci_cp_le_set_ext_scan_enable { __u8 enable; __u8 filter_dup; __le16 duration; __le16 period; } __packed; #define HCI_OP_LE_EXT_CREATE_CONN 0x2043 struct hci_cp_le_ext_create_conn { __u8 filter_policy; __u8 own_addr_type; __u8 peer_addr_type; bdaddr_t peer_addr; __u8 phys; __u8 data[]; } __packed; struct hci_cp_le_ext_conn_param { __le16 scan_interval; __le16 scan_window; __le16 conn_interval_min; __le16 conn_interval_max; __le16 conn_latency; __le16 supervision_timeout; __le16 min_ce_len; __le16 max_ce_len; } __packed; #define HCI_OP_LE_PA_CREATE_SYNC 0x2044 struct hci_cp_le_pa_create_sync { __u8 options; __u8 sid; __u8 addr_type; bdaddr_t addr; __le16 skip; __le16 sync_timeout; __u8 sync_cte_type; } __packed; #define HCI_OP_LE_PA_TERM_SYNC 0x2046 struct hci_cp_le_pa_term_sync { __le16 handle; } __packed; #define HCI_OP_LE_READ_NUM_SUPPORTED_ADV_SETS 0x203b struct hci_rp_le_read_num_supported_adv_sets { __u8 status; __u8 num_of_sets; } __packed; #define HCI_OP_LE_SET_EXT_ADV_PARAMS 0x2036 struct hci_cp_le_set_ext_adv_params { __u8 handle; __le16 evt_properties; __u8 min_interval[3]; __u8 max_interval[3]; __u8 channel_map; __u8 own_addr_type; __u8 peer_addr_type; bdaddr_t peer_addr; __u8 filter_policy; __u8 tx_power; __u8 primary_phy; __u8 secondary_max_skip; __u8 secondary_phy; __u8 sid; __u8 notif_enable; } __packed; #define HCI_ADV_PHY_1M 0X01 #define HCI_ADV_PHY_2M 0x02 #define HCI_ADV_PHY_CODED 0x03 struct hci_rp_le_set_ext_adv_params { __u8 status; __u8 tx_power; } __packed; struct hci_cp_ext_adv_set { __u8 handle; __le16 duration; __u8 max_events; } __packed; #define HCI_MAX_EXT_AD_LENGTH 251 #define HCI_OP_LE_SET_EXT_ADV_DATA 0x2037 struct hci_cp_le_set_ext_adv_data { __u8 handle; __u8 operation; __u8 frag_pref; __u8 length; __u8 data[]; } __packed; #define HCI_OP_LE_SET_EXT_SCAN_RSP_DATA 0x2038 struct hci_cp_le_set_ext_scan_rsp_data { __u8 handle; __u8 operation; __u8 frag_pref; __u8 length; __u8 data[]; } __packed; #define HCI_OP_LE_SET_EXT_ADV_ENABLE 0x2039 struct hci_cp_le_set_ext_adv_enable { __u8 enable; __u8 num_of_sets; __u8 data[]; } __packed; #define HCI_OP_LE_SET_PER_ADV_PARAMS 0x203e struct hci_cp_le_set_per_adv_params { __u8 handle; __le16 min_interval; __le16 max_interval; __le16 periodic_properties; } __packed; #define HCI_MAX_PER_AD_LENGTH 252 #define HCI_OP_LE_SET_PER_ADV_DATA 0x203f struct hci_cp_le_set_per_adv_data { __u8 handle; __u8 operation; __u8 length; __u8 data[]; } __packed; #define HCI_OP_LE_SET_PER_ADV_ENABLE 0x2040 struct hci_cp_le_set_per_adv_enable { __u8 enable; __u8 handle; } __packed; #define LE_SET_ADV_DATA_OP_COMPLETE 0x03 #define LE_SET_ADV_DATA_NO_FRAG 0x01 #define HCI_OP_LE_REMOVE_ADV_SET 0x203c #define HCI_OP_LE_CLEAR_ADV_SETS 0x203d #define HCI_OP_LE_SET_ADV_SET_RAND_ADDR 0x2035 struct hci_cp_le_set_adv_set_rand_addr { __u8 handle; bdaddr_t bdaddr; } __packed; #define HCI_OP_LE_READ_TRANSMIT_POWER 0x204b struct hci_rp_le_read_transmit_power { __u8 status; __s8 min_le_tx_power; __s8 max_le_tx_power; } __packed; #define HCI_NETWORK_PRIVACY 0x00 #define HCI_DEVICE_PRIVACY 0x01 #define HCI_OP_LE_SET_PRIVACY_MODE 0x204e struct hci_cp_le_set_privacy_mode { __u8 bdaddr_type; bdaddr_t bdaddr; __u8 mode; } __packed; #define HCI_OP_LE_READ_BUFFER_SIZE_V2 0x2060 struct hci_rp_le_read_buffer_size_v2 { __u8 status; __le16 acl_mtu; __u8 acl_max_pkt; __le16 iso_mtu; __u8 iso_max_pkt; } __packed; #define HCI_OP_LE_READ_ISO_TX_SYNC 0x2061 struct hci_cp_le_read_iso_tx_sync { __le16 handle; } __packed; struct hci_rp_le_read_iso_tx_sync { __u8 status; __le16 handle; __le16 seq; __le32 imestamp; __u8 offset[3]; } __packed; #define HCI_OP_LE_SET_CIG_PARAMS 0x2062 struct hci_cis_params { __u8 cis_id; __le16 c_sdu; __le16 p_sdu; __u8 c_phy; __u8 p_phy; __u8 c_rtn; __u8 p_rtn; } __packed; struct hci_cp_le_set_cig_params { __u8 cig_id; __u8 c_interval[3]; __u8 p_interval[3]; __u8 sca; __u8 packing; __u8 framing; __le16 c_latency; __le16 p_latency; __u8 num_cis; struct hci_cis_params cis[]; } __packed; struct hci_rp_le_set_cig_params { __u8 status; __u8 cig_id; __u8 num_handles; __le16 handle[]; } __packed; #define HCI_OP_LE_CREATE_CIS 0x2064 struct hci_cis { __le16 cis_handle; __le16 acl_handle; } __packed; struct hci_cp_le_create_cis { __u8 num_cis; struct hci_cis cis[]; } __packed; #define HCI_OP_LE_REMOVE_CIG 0x2065 struct hci_cp_le_remove_cig { __u8 cig_id; } __packed; #define HCI_OP_LE_ACCEPT_CIS 0x2066 struct hci_cp_le_accept_cis { __le16 handle; } __packed; #define HCI_OP_LE_REJECT_CIS 0x2067 struct hci_cp_le_reject_cis { __le16 handle; __u8 reason; } __packed; #define HCI_OP_LE_CREATE_BIG 0x2068 struct hci_bis { __u8 sdu_interval[3]; __le16 sdu; __le16 latency; __u8 rtn; __u8 phy; __u8 packing; __u8 framing; __u8 encryption; __u8 bcode[16]; } __packed; struct hci_cp_le_create_big { __u8 handle; __u8 adv_handle; __u8 num_bis; struct hci_bis bis; } __packed; #define HCI_OP_LE_TERM_BIG 0x206a struct hci_cp_le_term_big { __u8 handle; __u8 reason; } __packed; #define HCI_OP_LE_BIG_CREATE_SYNC 0x206b struct hci_cp_le_big_create_sync { __u8 handle; __le16 sync_handle; __u8 encryption; __u8 bcode[16]; __u8 mse; __le16 timeout; __u8 num_bis; __u8 bis[]; } __packed; #define HCI_OP_LE_BIG_TERM_SYNC 0x206c struct hci_cp_le_big_term_sync { __u8 handle; } __packed; #define HCI_OP_LE_SETUP_ISO_PATH 0x206e struct hci_cp_le_setup_iso_path { __le16 handle; __u8 direction; __u8 path; __u8 codec; __le16 codec_cid; __le16 codec_vid; __u8 delay[3]; __u8 codec_cfg_len; __u8 codec_cfg[]; } __packed; struct hci_rp_le_setup_iso_path { __u8 status; __le16 handle; } __packed; #define HCI_OP_LE_SET_HOST_FEATURE 0x2074 struct hci_cp_le_set_host_feature { __u8 bit_number; __u8 bit_value; } __packed; /* ---- HCI Events ---- */ struct hci_ev_status { __u8 status; } __packed; #define HCI_EV_INQUIRY_COMPLETE 0x01 #define HCI_EV_INQUIRY_RESULT 0x02 struct inquiry_info { bdaddr_t bdaddr; __u8 pscan_rep_mode; __u8 pscan_period_mode; __u8 pscan_mode; __u8 dev_class[3]; __le16 clock_offset; } __packed; struct hci_ev_inquiry_result { __u8 num; struct inquiry_info info[]; }; #define HCI_EV_CONN_COMPLETE 0x03 struct hci_ev_conn_complete { __u8 status; __le16 handle; bdaddr_t bdaddr; __u8 link_type; __u8 encr_mode; } __packed; #define HCI_EV_CONN_REQUEST 0x04 struct hci_ev_conn_request { bdaddr_t bdaddr; __u8 dev_class[3]; __u8 link_type; } __packed; #define HCI_EV_DISCONN_COMPLETE 0x05 struct hci_ev_disconn_complete { __u8 status; __le16 handle; __u8 reason; } __packed; #define HCI_EV_AUTH_COMPLETE 0x06 struct hci_ev_auth_complete { __u8 status; __le16 handle; } __packed; #define HCI_EV_REMOTE_NAME 0x07 struct hci_ev_remote_name { __u8 status; bdaddr_t bdaddr; __u8 name[HCI_MAX_NAME_LENGTH]; } __packed; #define HCI_EV_ENCRYPT_CHANGE 0x08 struct hci_ev_encrypt_change { __u8 status; __le16 handle; __u8 encrypt; } __packed; #define HCI_EV_CHANGE_LINK_KEY_COMPLETE 0x09 struct hci_ev_change_link_key_complete { __u8 status; __le16 handle; } __packed; #define HCI_EV_REMOTE_FEATURES 0x0b struct hci_ev_remote_features { __u8 status; __le16 handle; __u8 features[8]; } __packed; #define HCI_EV_REMOTE_VERSION 0x0c struct hci_ev_remote_version { __u8 status; __le16 handle; __u8 lmp_ver; __le16 manufacturer; __le16 lmp_subver; } __packed; #define HCI_EV_QOS_SETUP_COMPLETE 0x0d struct hci_qos { __u8 service_type; __u32 token_rate; __u32 peak_bandwidth; __u32 latency; __u32 delay_variation; } __packed; struct hci_ev_qos_setup_complete { __u8 status; __le16 handle; struct hci_qos qos; } __packed; #define HCI_EV_CMD_COMPLETE 0x0e struct hci_ev_cmd_complete { __u8 ncmd; __le16 opcode; } __packed; #define HCI_EV_CMD_STATUS 0x0f struct hci_ev_cmd_status { __u8 status; __u8 ncmd; __le16 opcode; } __packed; #define HCI_EV_HARDWARE_ERROR 0x10 struct hci_ev_hardware_error { __u8 code; } __packed; #define HCI_EV_ROLE_CHANGE 0x12 struct hci_ev_role_change { __u8 status; bdaddr_t bdaddr; __u8 role; } __packed; #define HCI_EV_NUM_COMP_PKTS 0x13 struct hci_comp_pkts_info { __le16 handle; __le16 count; } __packed; struct hci_ev_num_comp_pkts { __u8 num; struct hci_comp_pkts_info handles[]; } __packed; #define HCI_EV_MODE_CHANGE 0x14 struct hci_ev_mode_change { __u8 status; __le16 handle; __u8 mode; __le16 interval; } __packed; #define HCI_EV_PIN_CODE_REQ 0x16 struct hci_ev_pin_code_req { bdaddr_t bdaddr; } __packed; #define HCI_EV_LINK_KEY_REQ 0x17 struct hci_ev_link_key_req { bdaddr_t bdaddr; } __packed; #define HCI_EV_LINK_KEY_NOTIFY 0x18 struct hci_ev_link_key_notify { bdaddr_t bdaddr; __u8 link_key[HCI_LINK_KEY_SIZE]; __u8 key_type; } __packed; #define HCI_EV_CLOCK_OFFSET 0x1c struct hci_ev_clock_offset { __u8 status; __le16 handle; __le16 clock_offset; } __packed; #define HCI_EV_PKT_TYPE_CHANGE 0x1d struct hci_ev_pkt_type_change { __u8 status; __le16 handle; __le16 pkt_type; } __packed; #define HCI_EV_PSCAN_REP_MODE 0x20 struct hci_ev_pscan_rep_mode { bdaddr_t bdaddr; __u8 pscan_rep_mode; } __packed; #define HCI_EV_INQUIRY_RESULT_WITH_RSSI 0x22 struct inquiry_info_rssi { bdaddr_t bdaddr; __u8 pscan_rep_mode; __u8 pscan_period_mode; __u8 dev_class[3]; __le16 clock_offset; __s8 rssi; } __packed; struct inquiry_info_rssi_pscan { bdaddr_t bdaddr; __u8 pscan_rep_mode; __u8 pscan_period_mode; __u8 pscan_mode; __u8 dev_class[3]; __le16 clock_offset; __s8 rssi; } __packed; struct hci_ev_inquiry_result_rssi { __u8 num; __u8 data[]; } __packed; #define HCI_EV_REMOTE_EXT_FEATURES 0x23 struct hci_ev_remote_ext_features { __u8 status; __le16 handle; __u8 page; __u8 max_page; __u8 features[8]; } __packed; #define HCI_EV_SYNC_CONN_COMPLETE 0x2c struct hci_ev_sync_conn_complete { __u8 status; __le16 handle; bdaddr_t bdaddr; __u8 link_type; __u8 tx_interval; __u8 retrans_window; __le16 rx_pkt_len; __le16 tx_pkt_len; __u8 air_mode; } __packed; #define HCI_EV_SYNC_CONN_CHANGED 0x2d struct hci_ev_sync_conn_changed { __u8 status; __le16 handle; __u8 tx_interval; __u8 retrans_window; __le16 rx_pkt_len; __le16 tx_pkt_len; } __packed; #define HCI_EV_SNIFF_SUBRATE 0x2e struct hci_ev_sniff_subrate { __u8 status; __le16 handle; __le16 max_tx_latency; __le16 max_rx_latency; __le16 max_remote_timeout; __le16 max_local_timeout; } __packed; #define HCI_EV_EXTENDED_INQUIRY_RESULT 0x2f struct extended_inquiry_info { bdaddr_t bdaddr; __u8 pscan_rep_mode; __u8 pscan_period_mode; __u8 dev_class[3]; __le16 clock_offset; __s8 rssi; __u8 data[240]; } __packed; struct hci_ev_ext_inquiry_result { __u8 num; struct extended_inquiry_info info[]; } __packed; #define HCI_EV_KEY_REFRESH_COMPLETE 0x30 struct hci_ev_key_refresh_complete { __u8 status; __le16 handle; } __packed; #define HCI_EV_IO_CAPA_REQUEST 0x31 struct hci_ev_io_capa_request { bdaddr_t bdaddr; } __packed; #define HCI_EV_IO_CAPA_REPLY 0x32 struct hci_ev_io_capa_reply { bdaddr_t bdaddr; __u8 capability; __u8 oob_data; __u8 authentication; } __packed; #define HCI_EV_USER_CONFIRM_REQUEST 0x33 struct hci_ev_user_confirm_req { bdaddr_t bdaddr; __le32 passkey; } __packed; #define HCI_EV_USER_PASSKEY_REQUEST 0x34 struct hci_ev_user_passkey_req { bdaddr_t bdaddr; } __packed; #define HCI_EV_REMOTE_OOB_DATA_REQUEST 0x35 struct hci_ev_remote_oob_data_request { bdaddr_t bdaddr; } __packed; #define HCI_EV_SIMPLE_PAIR_COMPLETE 0x36 struct hci_ev_simple_pair_complete { __u8 status; bdaddr_t bdaddr; } __packed; #define HCI_EV_USER_PASSKEY_NOTIFY 0x3b struct hci_ev_user_passkey_notify { bdaddr_t bdaddr; __le32 passkey; } __packed; #define HCI_KEYPRESS_STARTED 0 #define HCI_KEYPRESS_ENTERED 1 #define HCI_KEYPRESS_ERASED 2 #define HCI_KEYPRESS_CLEARED 3 #define HCI_KEYPRESS_COMPLETED 4 #define HCI_EV_KEYPRESS_NOTIFY 0x3c struct hci_ev_keypress_notify { bdaddr_t bdaddr; __u8 type; } __packed; #define HCI_EV_REMOTE_HOST_FEATURES 0x3d struct hci_ev_remote_host_features { bdaddr_t bdaddr; __u8 features[8]; } __packed; #define HCI_EV_LE_META 0x3e struct hci_ev_le_meta { __u8 subevent; } __packed; #define HCI_EV_PHY_LINK_COMPLETE 0x40 struct hci_ev_phy_link_complete { __u8 status; __u8 phy_handle; } __packed; #define HCI_EV_CHANNEL_SELECTED 0x41 struct hci_ev_channel_selected { __u8 phy_handle; } __packed; #define HCI_EV_DISCONN_PHY_LINK_COMPLETE 0x42 struct hci_ev_disconn_phy_link_complete { __u8 status; __u8 phy_handle; __u8 reason; } __packed; #define HCI_EV_LOGICAL_LINK_COMPLETE 0x45 struct hci_ev_logical_link_complete { __u8 status; __le16 handle; __u8 phy_handle; __u8 flow_spec_id; } __packed; #define HCI_EV_DISCONN_LOGICAL_LINK_COMPLETE 0x46 struct hci_ev_disconn_logical_link_complete { __u8 status; __le16 handle; __u8 reason; } __packed; #define HCI_EV_NUM_COMP_BLOCKS 0x48 struct hci_comp_blocks_info { __le16 handle; __le16 pkts; __le16 blocks; } __packed; struct hci_ev_num_comp_blocks { __le16 num_blocks; __u8 num_hndl; struct hci_comp_blocks_info handles[]; } __packed; #define HCI_EV_SYNC_TRAIN_COMPLETE 0x4F struct hci_ev_sync_train_complete { __u8 status; } __packed; #define HCI_EV_PERIPHERAL_PAGE_RESP_TIMEOUT 0x54 #define HCI_EV_LE_CONN_COMPLETE 0x01 struct hci_ev_le_conn_complete { __u8 status; __le16 handle; __u8 role; __u8 bdaddr_type; bdaddr_t bdaddr; __le16 interval; __le16 latency; __le16 supervision_timeout; __u8 clk_accurancy; } __packed; /* Advertising report event types */ #define LE_ADV_IND 0x00 #define LE_ADV_DIRECT_IND 0x01 #define LE_ADV_SCAN_IND 0x02 #define LE_ADV_NONCONN_IND 0x03 #define LE_ADV_SCAN_RSP 0x04 #define LE_ADV_INVALID 0x05 /* Legacy event types in extended adv report */ #define LE_LEGACY_ADV_IND 0x0013 #define LE_LEGACY_ADV_DIRECT_IND 0x0015 #define LE_LEGACY_ADV_SCAN_IND 0x0012 #define LE_LEGACY_NONCONN_IND 0x0010 #define LE_LEGACY_SCAN_RSP_ADV 0x001b #define LE_LEGACY_SCAN_RSP_ADV_SCAN 0x001a /* Extended Advertising event types */ #define LE_EXT_ADV_NON_CONN_IND 0x0000 #define LE_EXT_ADV_CONN_IND 0x0001 #define LE_EXT_ADV_SCAN_IND 0x0002 #define LE_EXT_ADV_DIRECT_IND 0x0004 #define LE_EXT_ADV_SCAN_RSP 0x0008 #define LE_EXT_ADV_LEGACY_PDU 0x0010 #define LE_EXT_ADV_EVT_TYPE_MASK 0x007f #define ADDR_LE_DEV_PUBLIC 0x00 #define ADDR_LE_DEV_RANDOM 0x01 #define ADDR_LE_DEV_PUBLIC_RESOLVED 0x02 #define ADDR_LE_DEV_RANDOM_RESOLVED 0x03 #define HCI_EV_LE_ADVERTISING_REPORT 0x02 struct hci_ev_le_advertising_info { __u8 type; __u8 bdaddr_type; bdaddr_t bdaddr; __u8 length; __u8 data[]; } __packed; struct hci_ev_le_advertising_report { __u8 num; struct hci_ev_le_advertising_info info[]; } __packed; #define HCI_EV_LE_CONN_UPDATE_COMPLETE 0x03 struct hci_ev_le_conn_update_complete { __u8 status; __le16 handle; __le16 interval; __le16 latency; __le16 supervision_timeout; } __packed; #define HCI_EV_LE_REMOTE_FEAT_COMPLETE 0x04 struct hci_ev_le_remote_feat_complete { __u8 status; __le16 handle; __u8 features[8]; } __packed; #define HCI_EV_LE_LTK_REQ 0x05 struct hci_ev_le_ltk_req { __le16 handle; __le64 rand; __le16 ediv; } __packed; #define HCI_EV_LE_REMOTE_CONN_PARAM_REQ 0x06 struct hci_ev_le_remote_conn_param_req { __le16 handle; __le16 interval_min; __le16 interval_max; __le16 latency; __le16 timeout; } __packed; #define HCI_EV_LE_DATA_LEN_CHANGE 0x07 struct hci_ev_le_data_len_change { __le16 handle; __le16 tx_len; __le16 tx_time; __le16 rx_len; __le16 rx_time; } __packed; #define HCI_EV_LE_DIRECT_ADV_REPORT 0x0B struct hci_ev_le_direct_adv_info { __u8 type; __u8 bdaddr_type; bdaddr_t bdaddr; __u8 direct_addr_type; bdaddr_t direct_addr; __s8 rssi; } __packed; struct hci_ev_le_direct_adv_report { __u8 num; struct hci_ev_le_direct_adv_info info[]; } __packed; #define HCI_EV_LE_PHY_UPDATE_COMPLETE 0x0c struct hci_ev_le_phy_update_complete { __u8 status; __le16 handle; __u8 tx_phy; __u8 rx_phy; } __packed; #define HCI_EV_LE_EXT_ADV_REPORT 0x0d struct hci_ev_le_ext_adv_info { __le16 type; __u8 bdaddr_type; bdaddr_t bdaddr; __u8 primary_phy; __u8 secondary_phy; __u8 sid; __u8 tx_power; __s8 rssi; __le16 interval; __u8 direct_addr_type; bdaddr_t direct_addr; __u8 length; __u8 data[]; } __packed; struct hci_ev_le_ext_adv_report { __u8 num; struct hci_ev_le_ext_adv_info info[]; } __packed; #define HCI_EV_LE_PA_SYNC_ESTABLISHED 0x0e struct hci_ev_le_pa_sync_established { __u8 status; __le16 handle; __u8 sid; __u8 bdaddr_type; bdaddr_t bdaddr; __u8 phy; __le16 interval; __u8 clock_accuracy; } __packed; #define HCI_EV_LE_ENHANCED_CONN_COMPLETE 0x0a struct hci_ev_le_enh_conn_complete { __u8 status; __le16 handle; __u8 role; __u8 bdaddr_type; bdaddr_t bdaddr; bdaddr_t local_rpa; bdaddr_t peer_rpa; __le16 interval; __le16 latency; __le16 supervision_timeout; __u8 clk_accurancy; } __packed; #define HCI_EV_LE_PER_ADV_REPORT 0x0f struct hci_ev_le_per_adv_report { __le16 sync_handle; __u8 tx_power; __u8 rssi; __u8 cte_type; __u8 data_status; __u8 length; __u8 data[]; } __packed; #define HCI_EV_LE_EXT_ADV_SET_TERM 0x12 struct hci_evt_le_ext_adv_set_term { __u8 status; __u8 handle; __le16 conn_handle; __u8 num_evts; } __packed; #define HCI_EVT_LE_CIS_ESTABLISHED 0x19 struct hci_evt_le_cis_established { __u8 status; __le16 handle; __u8 cig_sync_delay[3]; __u8 cis_sync_delay[3]; __u8 c_latency[3]; __u8 p_latency[3]; __u8 c_phy; __u8 p_phy; __u8 nse; __u8 c_bn; __u8 p_bn; __u8 c_ft; __u8 p_ft; __le16 c_mtu; __le16 p_mtu; __le16 interval; } __packed; #define HCI_EVT_LE_CIS_REQ 0x1a struct hci_evt_le_cis_req { __le16 acl_handle; __le16 cis_handle; __u8 cig_id; __u8 cis_id; } __packed; #define HCI_EVT_LE_CREATE_BIG_COMPLETE 0x1b struct hci_evt_le_create_big_complete { __u8 status; __u8 handle; __u8 sync_delay[3]; __u8 transport_delay[3]; __u8 phy; __u8 nse; __u8 bn; __u8 pto; __u8 irc; __le16 max_pdu; __le16 interval; __u8 num_bis; __le16 bis_handle[]; } __packed; #define HCI_EVT_LE_BIG_SYNC_ESTABILISHED 0x1d struct hci_evt_le_big_sync_estabilished { __u8 status; __u8 handle; __u8 latency[3]; __u8 nse; __u8 bn; __u8 pto; __u8 irc; __le16 max_pdu; __le16 interval; __u8 num_bis; __le16 bis[]; } __packed; #define HCI_EVT_LE_BIG_INFO_ADV_REPORT 0x22 struct hci_evt_le_big_info_adv_report { __le16 sync_handle; __u8 num_bis; __u8 nse; __le16 iso_interval; __u8 bn; __u8 pto; __u8 irc; __le16 max_pdu; __u8 sdu_interval[3]; __le16 max_sdu; __u8 phy; __u8 framing; __u8 encryption; } __packed; #define HCI_EV_VENDOR 0xff /* Internal events generated by Bluetooth stack */ #define HCI_EV_STACK_INTERNAL 0xfd struct hci_ev_stack_internal { __u16 type; __u8 data[]; } __packed; #define HCI_EV_SI_DEVICE 0x01 struct hci_ev_si_device { __u16 event; __u16 dev_id; } __packed; #define HCI_EV_SI_SECURITY 0x02 struct hci_ev_si_security { __u16 event; __u16 proto; __u16 subproto; __u8 incoming; } __packed; /* ---- HCI Packet structures ---- */ #define HCI_COMMAND_HDR_SIZE 3 #define HCI_EVENT_HDR_SIZE 2 #define HCI_ACL_HDR_SIZE 4 #define HCI_SCO_HDR_SIZE 3 #define HCI_ISO_HDR_SIZE 4 struct hci_command_hdr { __le16 opcode; /* OCF & OGF */ __u8 plen; } __packed; struct hci_event_hdr { __u8 evt; __u8 plen; } __packed; struct hci_acl_hdr { __le16 handle; /* Handle & Flags(PB, BC) */ __le16 dlen; } __packed; struct hci_sco_hdr { __le16 handle; __u8 dlen; } __packed; struct hci_iso_hdr { __le16 handle; __le16 dlen; __u8 data[]; } __packed; /* ISO data packet status flags */ #define HCI_ISO_STATUS_VALID 0x00 #define HCI_ISO_STATUS_INVALID 0x01 #define HCI_ISO_STATUS_NOP 0x02 #define HCI_ISO_DATA_HDR_SIZE 4 struct hci_iso_data_hdr { __le16 sn; __le16 slen; }; #define HCI_ISO_TS_DATA_HDR_SIZE 8 struct hci_iso_ts_data_hdr { __le32 ts; __le16 sn; __le16 slen; }; static inline struct hci_event_hdr *hci_event_hdr(const struct sk_buff *skb) { return (struct hci_event_hdr *) skb->data; } static inline struct hci_acl_hdr *hci_acl_hdr(const struct sk_buff *skb) { return (struct hci_acl_hdr *) skb->data; } static inline struct hci_sco_hdr *hci_sco_hdr(const struct sk_buff *skb) { return (struct hci_sco_hdr *) skb->data; } /* Command opcode pack/unpack */ #define hci_opcode_pack(ogf, ocf) ((__u16) ((ocf & 0x03ff)|(ogf << 10))) #define hci_opcode_ogf(op) (op >> 10) #define hci_opcode_ocf(op) (op & 0x03ff) /* ACL handle and flags pack/unpack */ #define hci_handle_pack(h, f) ((__u16) ((h & 0x0fff)|(f << 12))) #define hci_handle(h) (h & 0x0fff) #define hci_flags(h) (h >> 12) /* ISO handle and flags pack/unpack */ #define hci_iso_flags_pb(f) (f & 0x0003) #define hci_iso_flags_ts(f) ((f >> 2) & 0x0001) #define hci_iso_flags_pack(pb, ts) ((pb & 0x03) | ((ts & 0x01) << 2)) /* ISO data length and flags pack/unpack */ #define hci_iso_data_len_pack(h, f) ((__u16) ((h) | ((f) << 14))) #define hci_iso_data_len(h) ((h) & 0x3fff) #define hci_iso_data_flags(h) ((h) >> 14) /* codec transport types */ #define HCI_TRANSPORT_SCO_ESCO 0x01 /* le24 support */ static inline void hci_cpu_to_le24(__u32 val, __u8 dst[3]) { dst[0] = val & 0xff; dst[1] = (val & 0xff00) >> 8; dst[2] = (val & 0xff0000) >> 16; } #endif /* __HCI_H */ |
| 4 4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 | // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/proc/root.c * * Copyright (C) 1991, 1992 Linus Torvalds * * proc root directory handling functions */ #include <linux/errno.h> #include <linux/time.h> #include <linux/proc_fs.h> #include <linux/stat.h> #include <linux/init.h> #include <linux/sched.h> #include <linux/sched/stat.h> #include <linux/module.h> #include <linux/bitops.h> #include <linux/user_namespace.h> #include <linux/fs_context.h> #include <linux/mount.h> #include <linux/pid_namespace.h> #include <linux/fs_parser.h> #include <linux/cred.h> #include <linux/magic.h> #include <linux/slab.h> #include "internal.h" struct proc_fs_context { struct pid_namespace *pid_ns; unsigned int mask; enum proc_hidepid hidepid; int gid; enum proc_pidonly pidonly; }; enum proc_param { Opt_gid, Opt_hidepid, Opt_subset, }; static const struct fs_parameter_spec proc_fs_parameters[] = { fsparam_u32("gid", Opt_gid), fsparam_string("hidepid", Opt_hidepid), fsparam_string("subset", Opt_subset), {} }; static inline int valid_hidepid(unsigned int value) { return (value == HIDEPID_OFF || value == HIDEPID_NO_ACCESS || value == HIDEPID_INVISIBLE || value == HIDEPID_NOT_PTRACEABLE); } static int proc_parse_hidepid_param(struct fs_context *fc, struct fs_parameter *param) { struct proc_fs_context *ctx = fc->fs_private; struct fs_parameter_spec hidepid_u32_spec = fsparam_u32("hidepid", Opt_hidepid); struct fs_parse_result result; int base = (unsigned long)hidepid_u32_spec.data; if (param->type != fs_value_is_string) return invalf(fc, "proc: unexpected type of hidepid value\n"); if (!kstrtouint(param->string, base, &result.uint_32)) { if (!valid_hidepid(result.uint_32)) return invalf(fc, "proc: unknown value of hidepid - %s\n", param->string); ctx->hidepid = result.uint_32; return 0; } if (!strcmp(param->string, "off")) ctx->hidepid = HIDEPID_OFF; else if (!strcmp(param->string, "noaccess")) ctx->hidepid = HIDEPID_NO_ACCESS; else if (!strcmp(param->string, "invisible")) ctx->hidepid = HIDEPID_INVISIBLE; else if (!strcmp(param->string, "ptraceable")) ctx->hidepid = HIDEPID_NOT_PTRACEABLE; else return invalf(fc, "proc: unknown value of hidepid - %s\n", param->string); return 0; } static int proc_parse_subset_param(struct fs_context *fc, char *value) { struct proc_fs_context *ctx = fc->fs_private; while (value) { char *ptr = strchr(value, ','); if (ptr != NULL) *ptr++ = '\0'; if (*value != '\0') { if (!strcmp(value, "pid")) { ctx->pidonly = PROC_PIDONLY_ON; } else { return invalf(fc, "proc: unsupported subset option - %s\n", value); } } value = ptr; } return 0; } static int proc_parse_param(struct fs_context *fc, struct fs_parameter *param) { struct proc_fs_context *ctx = fc->fs_private; struct fs_parse_result result; int opt; opt = fs_parse(fc, proc_fs_parameters, param, &result); if (opt < 0) return opt; switch (opt) { case Opt_gid: ctx->gid = result.uint_32; break; case Opt_hidepid: if (proc_parse_hidepid_param(fc, param)) return -EINVAL; break; case Opt_subset: if (proc_parse_subset_param(fc, param->string) < 0) return -EINVAL; break; default: return -EINVAL; } ctx->mask |= 1 << opt; return 0; } static void proc_apply_options(struct proc_fs_info *fs_info, struct fs_context *fc, struct user_namespace *user_ns) { struct proc_fs_context *ctx = fc->fs_private; if (ctx->mask & (1 << Opt_gid)) fs_info->pid_gid = make_kgid(user_ns, ctx->gid); if (ctx->mask & (1 << Opt_hidepid)) fs_info->hide_pid = ctx->hidepid; if (ctx->mask & (1 << Opt_subset)) fs_info->pidonly = ctx->pidonly; } static int proc_fill_super(struct super_block *s, struct fs_context *fc) { struct proc_fs_context *ctx = fc->fs_private; struct inode *root_inode; struct proc_fs_info *fs_info; int ret; fs_info = kzalloc(sizeof(*fs_info), GFP_KERNEL); if (!fs_info) return -ENOMEM; fs_info->pid_ns = get_pid_ns(ctx->pid_ns); proc_apply_options(fs_info, fc, current_user_ns()); /* User space would break if executables or devices appear on proc */ s->s_iflags |= SB_I_USERNS_VISIBLE | SB_I_NOEXEC | SB_I_NODEV; s->s_flags |= SB_NODIRATIME | SB_NOSUID | SB_NOEXEC; s->s_blocksize = 1024; s->s_blocksize_bits = 10; s->s_magic = PROC_SUPER_MAGIC; s->s_op = &proc_sops; s->s_time_gran = 1; s->s_fs_info = fs_info; /* * procfs isn't actually a stacking filesystem; however, there is * too much magic going on inside it to permit stacking things on * top of it */ s->s_stack_depth = FILESYSTEM_MAX_STACK_DEPTH; /* procfs dentries and inodes don't require IO to create */ s->s_shrink->seeks = 0; pde_get(&proc_root); root_inode = proc_get_inode(s, &proc_root); if (!root_inode) { pr_err("proc_fill_super: get root inode failed\n"); return -ENOMEM; } s->s_root = d_make_root(root_inode); if (!s->s_root) { pr_err("proc_fill_super: allocate dentry failed\n"); return -ENOMEM; } ret = proc_setup_self(s); if (ret) { return ret; } return proc_setup_thread_self(s); } static int proc_reconfigure(struct fs_context *fc) { struct super_block *sb = fc->root->d_sb; struct proc_fs_info *fs_info = proc_sb_info(sb); sync_filesystem(sb); proc_apply_options(fs_info, fc, current_user_ns()); return 0; } static int proc_get_tree(struct fs_context *fc) { return get_tree_nodev(fc, proc_fill_super); } static void proc_fs_context_free(struct fs_context *fc) { struct proc_fs_context *ctx = fc->fs_private; put_pid_ns(ctx->pid_ns); kfree(ctx); } static const struct fs_context_operations proc_fs_context_ops = { .free = proc_fs_context_free, .parse_param = proc_parse_param, .get_tree = proc_get_tree, .reconfigure = proc_reconfigure, }; static int proc_init_fs_context(struct fs_context *fc) { struct proc_fs_context *ctx; ctx = kzalloc(sizeof(struct proc_fs_context), GFP_KERNEL); if (!ctx) return -ENOMEM; ctx->pid_ns = get_pid_ns(task_active_pid_ns(current)); put_user_ns(fc->user_ns); fc->user_ns = get_user_ns(ctx->pid_ns->user_ns); fc->fs_private = ctx; fc->ops = &proc_fs_context_ops; return 0; } static void proc_kill_sb(struct super_block *sb) { struct proc_fs_info *fs_info = proc_sb_info(sb); if (!fs_info) { kill_anon_super(sb); return; } dput(fs_info->proc_self); dput(fs_info->proc_thread_self); kill_anon_super(sb); put_pid_ns(fs_info->pid_ns); kfree(fs_info); } static struct file_system_type proc_fs_type = { .name = "proc", .init_fs_context = proc_init_fs_context, .parameters = proc_fs_parameters, .kill_sb = proc_kill_sb, .fs_flags = FS_USERNS_MOUNT | FS_DISALLOW_NOTIFY_PERM, }; void __init proc_root_init(void) { proc_init_kmemcache(); set_proc_pid_nlink(); proc_self_init(); proc_thread_self_init(); proc_symlink("mounts", NULL, "self/mounts"); proc_net_init(); proc_mkdir("fs", NULL); proc_mkdir("driver", NULL); proc_create_mount_point("fs/nfsd"); /* somewhere for the nfsd filesystem to be mounted */ #if defined(CONFIG_SUN_OPENPROMFS) || defined(CONFIG_SUN_OPENPROMFS_MODULE) /* just give it a mountpoint */ proc_create_mount_point("openprom"); #endif proc_tty_init(); proc_mkdir("bus", NULL); proc_sys_init(); /* * Last things last. It is not like userspace processes eager * to open /proc files exist at this point but register last * anyway. */ register_filesystem(&proc_fs_type); } static int proc_root_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { generic_fillattr(&nop_mnt_idmap, request_mask, d_inode(path->dentry), stat); stat->nlink = proc_root.nlink + nr_processes(); return 0; } static struct dentry *proc_root_lookup(struct inode * dir, struct dentry * dentry, unsigned int flags) { if (!proc_pid_lookup(dentry, flags)) return NULL; return proc_lookup(dir, dentry, flags); } static int proc_root_readdir(struct file *file, struct dir_context *ctx) { if (ctx->pos < FIRST_PROCESS_ENTRY) { int error = proc_readdir(file, ctx); if (unlikely(error <= 0)) return error; ctx->pos = FIRST_PROCESS_ENTRY; } return proc_pid_readdir(file, ctx); } /* * The root /proc directory is special, as it has the * <pid> directories. Thus we don't use the generic * directory handling functions for that.. */ static const struct file_operations proc_root_operations = { .read = generic_read_dir, .iterate_shared = proc_root_readdir, .llseek = generic_file_llseek, }; /* * proc root can do almost nothing.. */ static const struct inode_operations proc_root_inode_operations = { .lookup = proc_root_lookup, .getattr = proc_root_getattr, }; /* * This is the root "inode" in the /proc tree.. */ struct proc_dir_entry proc_root = { .low_ino = PROC_ROOT_INO, .namelen = 5, .mode = S_IFDIR | S_IRUGO | S_IXUGO, .nlink = 2, .refcnt = REFCOUNT_INIT(1), .proc_iops = &proc_root_inode_operations, .proc_dir_ops = &proc_root_operations, .parent = &proc_root, .subdir = RB_ROOT, .name = "/proc", }; |
| 2302 2968 118 5782 5708 2967 2890 1575 1575 1573 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 | // SPDX-License-Identifier: GPL-2.0 /* * fs/ext4/fast_commit.c * * Written by Harshad Shirwadkar <harshadshirwadkar@gmail.com> * * Ext4 fast commits routines. */ #include "ext4.h" #include "ext4_jbd2.h" #include "ext4_extents.h" #include "mballoc.h" /* * Ext4 Fast Commits * ----------------- * * Ext4 fast commits implement fine grained journalling for Ext4. * * Fast commits are organized as a log of tag-length-value (TLV) structs. (See * struct ext4_fc_tl). Each TLV contains some delta that is replayed TLV by * TLV during the recovery phase. For the scenarios for which we currently * don't have replay code, fast commit falls back to full commits. * Fast commits record delta in one of the following three categories. * * (A) Directory entry updates: * * - EXT4_FC_TAG_UNLINK - records directory entry unlink * - EXT4_FC_TAG_LINK - records directory entry link * - EXT4_FC_TAG_CREAT - records inode and directory entry creation * * (B) File specific data range updates: * * - EXT4_FC_TAG_ADD_RANGE - records addition of new blocks to an inode * - EXT4_FC_TAG_DEL_RANGE - records deletion of blocks from an inode * * (C) Inode metadata (mtime / ctime etc): * * - EXT4_FC_TAG_INODE - record the inode that should be replayed * during recovery. Note that iblocks field is * not replayed and instead derived during * replay. * Commit Operation * ---------------- * With fast commits, we maintain all the directory entry operations in the * order in which they are issued in an in-memory queue. This queue is flushed * to disk during the commit operation. We also maintain a list of inodes * that need to be committed during a fast commit in another in memory queue of * inodes. During the commit operation, we commit in the following order: * * [1] Lock inodes for any further data updates by setting COMMITTING state * [2] Submit data buffers of all the inodes * [3] Wait for [2] to complete * [4] Commit all the directory entry updates in the fast commit space * [5] Commit all the changed inode structures * [6] Write tail tag (this tag ensures the atomicity, please read the following * section for more details). * [7] Wait for [4], [5] and [6] to complete. * * All the inode updates must call ext4_fc_start_update() before starting an * update. If such an ongoing update is present, fast commit waits for it to * complete. The completion of such an update is marked by * ext4_fc_stop_update(). * * Fast Commit Ineligibility * ------------------------- * * Not all operations are supported by fast commits today (e.g extended * attributes). Fast commit ineligibility is marked by calling * ext4_fc_mark_ineligible(): This makes next fast commit operation to fall back * to full commit. * * Atomicity of commits * -------------------- * In order to guarantee atomicity during the commit operation, fast commit * uses "EXT4_FC_TAG_TAIL" tag that marks a fast commit as complete. Tail * tag contains CRC of the contents and TID of the transaction after which * this fast commit should be applied. Recovery code replays fast commit * logs only if there's at least 1 valid tail present. For every fast commit * operation, there is 1 tail. This means, we may end up with multiple tails * in the fast commit space. Here's an example: * * - Create a new file A and remove existing file B * - fsync() * - Append contents to file A * - Truncate file A * - fsync() * * The fast commit space at the end of above operations would look like this: * [HEAD] [CREAT A] [UNLINK B] [TAIL] [ADD_RANGE A] [DEL_RANGE A] [TAIL] * |<--- Fast Commit 1 --->|<--- Fast Commit 2 ---->| * * Replay code should thus check for all the valid tails in the FC area. * * Fast Commit Replay Idempotence * ------------------------------ * * Fast commits tags are idempotent in nature provided the recovery code follows * certain rules. The guiding principle that the commit path follows while * committing is that it stores the result of a particular operation instead of * storing the procedure. * * Let's consider this rename operation: 'mv /a /b'. Let's assume dirent '/a' * was associated with inode 10. During fast commit, instead of storing this * operation as a procedure "rename a to b", we store the resulting file system * state as a "series" of outcomes: * * - Link dirent b to inode 10 * - Unlink dirent a * - Inode <10> with valid refcount * * Now when recovery code runs, it needs "enforce" this state on the file * system. This is what guarantees idempotence of fast commit replay. * * Let's take an example of a procedure that is not idempotent and see how fast * commits make it idempotent. Consider following sequence of operations: * * rm A; mv B A; read A * (x) (y) (z) * * (x), (y) and (z) are the points at which we can crash. If we store this * sequence of operations as is then the replay is not idempotent. Let's say * while in replay, we crash at (z). During the second replay, file A (which was * actually created as a result of "mv B A" operation) would get deleted. Thus, * file named A would be absent when we try to read A. So, this sequence of * operations is not idempotent. However, as mentioned above, instead of storing * the procedure fast commits store the outcome of each procedure. Thus the fast * commit log for above procedure would be as follows: * * (Let's assume dirent A was linked to inode 10 and dirent B was linked to * inode 11 before the replay) * * [Unlink A] [Link A to inode 11] [Unlink B] [Inode 11] * (w) (x) (y) (z) * * If we crash at (z), we will have file A linked to inode 11. During the second * replay, we will remove file A (inode 11). But we will create it back and make * it point to inode 11. We won't find B, so we'll just skip that step. At this * point, the refcount for inode 11 is not reliable, but that gets fixed by the * replay of last inode 11 tag. Crashes at points (w), (x) and (y) get handled * similarly. Thus, by converting a non-idempotent procedure into a series of * idempotent outcomes, fast commits ensured idempotence during the replay. * * TODOs * ----- * * 0) Fast commit replay path hardening: Fast commit replay code should use * journal handles to make sure all the updates it does during the replay * path are atomic. With that if we crash during fast commit replay, after * trying to do recovery again, we will find a file system where fast commit * area is invalid (because new full commit would be found). In order to deal * with that, fast commit replay code should ensure that the "FC_REPLAY" * superblock state is persisted before starting the replay, so that after * the crash, fast commit recovery code can look at that flag and perform * fast commit recovery even if that area is invalidated by later full * commits. * * 1) Fast commit's commit path locks the entire file system during fast * commit. This has significant performance penalty. Instead of that, we * should use ext4_fc_start/stop_update functions to start inode level * updates from ext4_journal_start/stop. Once we do that we can drop file * system locking during commit path. * * 2) Handle more ineligible cases. */ #include <trace/events/ext4.h> static struct kmem_cache *ext4_fc_dentry_cachep; static void ext4_end_buffer_io_sync(struct buffer_head *bh, int uptodate) { BUFFER_TRACE(bh, ""); if (uptodate) { ext4_debug("%s: Block %lld up-to-date", __func__, bh->b_blocknr); set_buffer_uptodate(bh); } else { ext4_debug("%s: Block %lld not up-to-date", __func__, bh->b_blocknr); clear_buffer_uptodate(bh); } unlock_buffer(bh); } static inline void ext4_fc_reset_inode(struct inode *inode) { struct ext4_inode_info *ei = EXT4_I(inode); ei->i_fc_lblk_start = 0; ei->i_fc_lblk_len = 0; } void ext4_fc_init_inode(struct inode *inode) { struct ext4_inode_info *ei = EXT4_I(inode); ext4_fc_reset_inode(inode); ext4_clear_inode_state(inode, EXT4_STATE_FC_COMMITTING); INIT_LIST_HEAD(&ei->i_fc_list); INIT_LIST_HEAD(&ei->i_fc_dilist); init_waitqueue_head(&ei->i_fc_wait); atomic_set(&ei->i_fc_updates, 0); } /* This function must be called with sbi->s_fc_lock held. */ static void ext4_fc_wait_committing_inode(struct inode *inode) __releases(&EXT4_SB(inode->i_sb)->s_fc_lock) { wait_queue_head_t *wq; struct ext4_inode_info *ei = EXT4_I(inode); #if (BITS_PER_LONG < 64) DEFINE_WAIT_BIT(wait, &ei->i_state_flags, EXT4_STATE_FC_COMMITTING); wq = bit_waitqueue(&ei->i_state_flags, EXT4_STATE_FC_COMMITTING); #else DEFINE_WAIT_BIT(wait, &ei->i_flags, EXT4_STATE_FC_COMMITTING); wq = bit_waitqueue(&ei->i_flags, EXT4_STATE_FC_COMMITTING); #endif lockdep_assert_held(&EXT4_SB(inode->i_sb)->s_fc_lock); prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock); schedule(); finish_wait(wq, &wait.wq_entry); } static bool ext4_fc_disabled(struct super_block *sb) { return (!test_opt2(sb, JOURNAL_FAST_COMMIT) || (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY)); } /* * Inform Ext4's fast about start of an inode update * * This function is called by the high level call VFS callbacks before * performing any inode update. This function blocks if there's an ongoing * fast commit on the inode in question. */ void ext4_fc_start_update(struct inode *inode) { struct ext4_inode_info *ei = EXT4_I(inode); if (ext4_fc_disabled(inode->i_sb)) return; restart: spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock); if (list_empty(&ei->i_fc_list)) goto out; if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) { ext4_fc_wait_committing_inode(inode); goto restart; } out: atomic_inc(&ei->i_fc_updates); spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock); } /* * Stop inode update and wake up waiting fast commits if any. */ void ext4_fc_stop_update(struct inode *inode) { struct ext4_inode_info *ei = EXT4_I(inode); if (ext4_fc_disabled(inode->i_sb)) return; if (atomic_dec_and_test(&ei->i_fc_updates)) wake_up_all(&ei->i_fc_wait); } /* * Remove inode from fast commit list. If the inode is being committed * we wait until inode commit is done. */ void ext4_fc_del(struct inode *inode) { struct ext4_inode_info *ei = EXT4_I(inode); struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_fc_dentry_update *fc_dentry; if (ext4_fc_disabled(inode->i_sb)) return; restart: spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock); if (list_empty(&ei->i_fc_list) && list_empty(&ei->i_fc_dilist)) { spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock); return; } if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) { ext4_fc_wait_committing_inode(inode); goto restart; } if (!list_empty(&ei->i_fc_list)) list_del_init(&ei->i_fc_list); /* * Since this inode is getting removed, let's also remove all FC * dentry create references, since it is not needed to log it anyways. */ if (list_empty(&ei->i_fc_dilist)) { spin_unlock(&sbi->s_fc_lock); return; } fc_dentry = list_first_entry(&ei->i_fc_dilist, struct ext4_fc_dentry_update, fcd_dilist); WARN_ON(fc_dentry->fcd_op != EXT4_FC_TAG_CREAT); list_del_init(&fc_dentry->fcd_list); list_del_init(&fc_dentry->fcd_dilist); WARN_ON(!list_empty(&ei->i_fc_dilist)); spin_unlock(&sbi->s_fc_lock); if (fc_dentry->fcd_name.name && fc_dentry->fcd_name.len > DNAME_INLINE_LEN) kfree(fc_dentry->fcd_name.name); kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry); return; } /* * Mark file system as fast commit ineligible, and record latest * ineligible transaction tid. This means until the recorded * transaction, commit operation would result in a full jbd2 commit. */ void ext4_fc_mark_ineligible(struct super_block *sb, int reason, handle_t *handle) { struct ext4_sb_info *sbi = EXT4_SB(sb); tid_t tid; if (ext4_fc_disabled(sb)) return; ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); if (handle && !IS_ERR(handle)) tid = handle->h_transaction->t_tid; else { read_lock(&sbi->s_journal->j_state_lock); tid = sbi->s_journal->j_running_transaction ? sbi->s_journal->j_running_transaction->t_tid : 0; read_unlock(&sbi->s_journal->j_state_lock); } spin_lock(&sbi->s_fc_lock); if (sbi->s_fc_ineligible_tid < tid) sbi->s_fc_ineligible_tid = tid; spin_unlock(&sbi->s_fc_lock); WARN_ON(reason >= EXT4_FC_REASON_MAX); sbi->s_fc_stats.fc_ineligible_reason_count[reason]++; } /* * Generic fast commit tracking function. If this is the first time this we are * called after a full commit, we initialize fast commit fields and then call * __fc_track_fn() with update = 0. If we have already been called after a full * commit, we pass update = 1. Based on that, the track function can determine * if it needs to track a field for the first time or if it needs to just * update the previously tracked value. * * If enqueue is set, this function enqueues the inode in fast commit list. */ static int ext4_fc_track_template( handle_t *handle, struct inode *inode, int (*__fc_track_fn)(struct inode *, void *, bool), void *args, int enqueue) { bool update = false; struct ext4_inode_info *ei = EXT4_I(inode); struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); tid_t tid = 0; int ret; tid = handle->h_transaction->t_tid; mutex_lock(&ei->i_fc_lock); if (tid == ei->i_sync_tid) { update = true; } else { ext4_fc_reset_inode(inode); ei->i_sync_tid = tid; } ret = __fc_track_fn(inode, args, update); mutex_unlock(&ei->i_fc_lock); if (!enqueue) return ret; spin_lock(&sbi->s_fc_lock); if (list_empty(&EXT4_I(inode)->i_fc_list)) list_add_tail(&EXT4_I(inode)->i_fc_list, (sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING || sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING) ? &sbi->s_fc_q[FC_Q_STAGING] : &sbi->s_fc_q[FC_Q_MAIN]); spin_unlock(&sbi->s_fc_lock); return ret; } struct __track_dentry_update_args { struct dentry *dentry; int op; }; /* __track_fn for directory entry updates. Called with ei->i_fc_lock. */ static int __track_dentry_update(struct inode *inode, void *arg, bool update) { struct ext4_fc_dentry_update *node; struct ext4_inode_info *ei = EXT4_I(inode); struct __track_dentry_update_args *dentry_update = (struct __track_dentry_update_args *)arg; struct dentry *dentry = dentry_update->dentry; struct inode *dir = dentry->d_parent->d_inode; struct super_block *sb = inode->i_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); mutex_unlock(&ei->i_fc_lock); if (IS_ENCRYPTED(dir)) { ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_ENCRYPTED_FILENAME, NULL); mutex_lock(&ei->i_fc_lock); return -EOPNOTSUPP; } node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS); if (!node) { ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_NOMEM, NULL); mutex_lock(&ei->i_fc_lock); return -ENOMEM; } node->fcd_op = dentry_update->op; node->fcd_parent = dir->i_ino; node->fcd_ino = inode->i_ino; if (dentry->d_name.len > DNAME_INLINE_LEN) { node->fcd_name.name = kmalloc(dentry->d_name.len, GFP_NOFS); if (!node->fcd_name.name) { kmem_cache_free(ext4_fc_dentry_cachep, node); ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_NOMEM, NULL); mutex_lock(&ei->i_fc_lock); return -ENOMEM; } memcpy((u8 *)node->fcd_name.name, dentry->d_name.name, dentry->d_name.len); } else { memcpy(node->fcd_iname, dentry->d_name.name, dentry->d_name.len); node->fcd_name.name = node->fcd_iname; } node->fcd_name.len = dentry->d_name.len; INIT_LIST_HEAD(&node->fcd_dilist); spin_lock(&sbi->s_fc_lock); if (sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING || sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING) list_add_tail(&node->fcd_list, &sbi->s_fc_dentry_q[FC_Q_STAGING]); else list_add_tail(&node->fcd_list, &sbi->s_fc_dentry_q[FC_Q_MAIN]); /* * This helps us keep a track of all fc_dentry updates which is part of * this ext4 inode. So in case the inode is getting unlinked, before * even we get a chance to fsync, we could remove all fc_dentry * references while evicting the inode in ext4_fc_del(). * Also with this, we don't need to loop over all the inodes in * sbi->s_fc_q to get the corresponding inode in * ext4_fc_commit_dentry_updates(). */ if (dentry_update->op == EXT4_FC_TAG_CREAT) { WARN_ON(!list_empty(&ei->i_fc_dilist)); list_add_tail(&node->fcd_dilist, &ei->i_fc_dilist); } spin_unlock(&sbi->s_fc_lock); mutex_lock(&ei->i_fc_lock); return 0; } void __ext4_fc_track_unlink(handle_t *handle, struct inode *inode, struct dentry *dentry) { struct __track_dentry_update_args args; int ret; args.dentry = dentry; args.op = EXT4_FC_TAG_UNLINK; ret = ext4_fc_track_template(handle, inode, __track_dentry_update, (void *)&args, 0); trace_ext4_fc_track_unlink(handle, inode, dentry, ret); } void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry) { struct inode *inode = d_inode(dentry); if (ext4_fc_disabled(inode->i_sb)) return; if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE)) return; __ext4_fc_track_unlink(handle, inode, dentry); } void __ext4_fc_track_link(handle_t *handle, struct inode *inode, struct dentry *dentry) { struct __track_dentry_update_args args; int ret; args.dentry = dentry; args.op = EXT4_FC_TAG_LINK; ret = ext4_fc_track_template(handle, inode, __track_dentry_update, (void *)&args, 0); trace_ext4_fc_track_link(handle, inode, dentry, ret); } void ext4_fc_track_link(handle_t *handle, struct dentry *dentry) { struct inode *inode = d_inode(dentry); if (ext4_fc_disabled(inode->i_sb)) return; if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE)) return; __ext4_fc_track_link(handle, inode, dentry); } void __ext4_fc_track_create(handle_t *handle, struct inode *inode, struct dentry *dentry) { struct __track_dentry_update_args args; int ret; args.dentry = dentry; args.op = EXT4_FC_TAG_CREAT; ret = ext4_fc_track_template(handle, inode, __track_dentry_update, (void *)&args, 0); trace_ext4_fc_track_create(handle, inode, dentry, ret); } void ext4_fc_track_create(handle_t *handle, struct dentry *dentry) { struct inode *inode = d_inode(dentry); if (ext4_fc_disabled(inode->i_sb)) return; if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE)) return; __ext4_fc_track_create(handle, inode, dentry); } /* __track_fn for inode tracking */ static int __track_inode(struct inode *inode, void *arg, bool update) { if (update) return -EEXIST; EXT4_I(inode)->i_fc_lblk_len = 0; return 0; } void ext4_fc_track_inode(handle_t *handle, struct inode *inode) { int ret; if (S_ISDIR(inode->i_mode)) return; if (ext4_fc_disabled(inode->i_sb)) return; if (ext4_should_journal_data(inode)) { ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_INODE_JOURNAL_DATA, handle); return; } if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE)) return; ret = ext4_fc_track_template(handle, inode, __track_inode, NULL, 1); trace_ext4_fc_track_inode(handle, inode, ret); } struct __track_range_args { ext4_lblk_t start, end; }; /* __track_fn for tracking data updates */ static int __track_range(struct inode *inode, void *arg, bool update) { struct ext4_inode_info *ei = EXT4_I(inode); ext4_lblk_t oldstart; struct __track_range_args *__arg = (struct __track_range_args *)arg; if (inode->i_ino < EXT4_FIRST_INO(inode->i_sb)) { ext4_debug("Special inode %ld being modified\n", inode->i_ino); return -ECANCELED; } oldstart = ei->i_fc_lblk_start; if (update && ei->i_fc_lblk_len > 0) { ei->i_fc_lblk_start = min(ei->i_fc_lblk_start, __arg->start); ei->i_fc_lblk_len = max(oldstart + ei->i_fc_lblk_len - 1, __arg->end) - ei->i_fc_lblk_start + 1; } else { ei->i_fc_lblk_start = __arg->start; ei->i_fc_lblk_len = __arg->end - __arg->start + 1; } return 0; } void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t start, ext4_lblk_t end) { struct __track_range_args args; int ret; if (S_ISDIR(inode->i_mode)) return; if (ext4_fc_disabled(inode->i_sb)) return; if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE)) return; args.start = start; args.end = end; ret = ext4_fc_track_template(handle, inode, __track_range, &args, 1); trace_ext4_fc_track_range(handle, inode, start, end, ret); } static void ext4_fc_submit_bh(struct super_block *sb, bool is_tail) { blk_opf_t write_flags = REQ_SYNC; struct buffer_head *bh = EXT4_SB(sb)->s_fc_bh; /* Add REQ_FUA | REQ_PREFLUSH only its tail */ if (test_opt(sb, BARRIER) && is_tail) write_flags |= REQ_FUA | REQ_PREFLUSH; lock_buffer(bh); set_buffer_dirty(bh); set_buffer_uptodate(bh); bh->b_end_io = ext4_end_buffer_io_sync; submit_bh(REQ_OP_WRITE | write_flags, bh); EXT4_SB(sb)->s_fc_bh = NULL; } /* Ext4 commit path routines */ /* * Allocate len bytes on a fast commit buffer. * * During the commit time this function is used to manage fast commit * block space. We don't split a fast commit log onto different * blocks. So this function makes sure that if there's not enough space * on the current block, the remaining space in the current block is * marked as unused by adding EXT4_FC_TAG_PAD tag. In that case, * new block is from jbd2 and CRC is updated to reflect the padding * we added. */ static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc) { struct ext4_fc_tl tl; struct ext4_sb_info *sbi = EXT4_SB(sb); struct buffer_head *bh; int bsize = sbi->s_journal->j_blocksize; int ret, off = sbi->s_fc_bytes % bsize; int remaining; u8 *dst; /* * If 'len' is too long to fit in any block alongside a PAD tlv, then we * cannot fulfill the request. */ if (len > bsize - EXT4_FC_TAG_BASE_LEN) return NULL; if (!sbi->s_fc_bh) { ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh); if (ret) return NULL; sbi->s_fc_bh = bh; } dst = sbi->s_fc_bh->b_data + off; /* * Allocate the bytes in the current block if we can do so while still * leaving enough space for a PAD tlv. */ remaining = bsize - EXT4_FC_TAG_BASE_LEN - off; if (len <= remaining) { sbi->s_fc_bytes += len; return dst; } /* * Else, terminate the current block with a PAD tlv, then allocate a new * block and allocate the bytes at the start of that new block. */ tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_PAD); tl.fc_len = cpu_to_le16(remaining); memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN); memset(dst + EXT4_FC_TAG_BASE_LEN, 0, remaining); *crc = ext4_chksum(sbi, *crc, sbi->s_fc_bh->b_data, bsize); ext4_fc_submit_bh(sb, false); ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh); if (ret) return NULL; sbi->s_fc_bh = bh; sbi->s_fc_bytes += bsize - off + len; return sbi->s_fc_bh->b_data; } /* * Complete a fast commit by writing tail tag. * * Writing tail tag marks the end of a fast commit. In order to guarantee * atomicity, after writing tail tag, even if there's space remaining * in the block, next commit shouldn't use it. That's why tail tag * has the length as that of the remaining space on the block. */ static int ext4_fc_write_tail(struct super_block *sb, u32 crc) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_fc_tl tl; struct ext4_fc_tail tail; int off, bsize = sbi->s_journal->j_blocksize; u8 *dst; /* * ext4_fc_reserve_space takes care of allocating an extra block if * there's no enough space on this block for accommodating this tail. */ dst = ext4_fc_reserve_space(sb, EXT4_FC_TAG_BASE_LEN + sizeof(tail), &crc); if (!dst) return -ENOSPC; off = sbi->s_fc_bytes % bsize; tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_TAIL); tl.fc_len = cpu_to_le16(bsize - off + sizeof(struct ext4_fc_tail)); sbi->s_fc_bytes = round_up(sbi->s_fc_bytes, bsize); memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN); dst += EXT4_FC_TAG_BASE_LEN; tail.fc_tid = cpu_to_le32(sbi->s_journal->j_running_transaction->t_tid); memcpy(dst, &tail.fc_tid, sizeof(tail.fc_tid)); dst += sizeof(tail.fc_tid); crc = ext4_chksum(sbi, crc, sbi->s_fc_bh->b_data, dst - (u8 *)sbi->s_fc_bh->b_data); tail.fc_crc = cpu_to_le32(crc); memcpy(dst, &tail.fc_crc, sizeof(tail.fc_crc)); dst += sizeof(tail.fc_crc); memset(dst, 0, bsize - off); /* Don't leak uninitialized memory. */ ext4_fc_submit_bh(sb, true); return 0; } /* * Adds tag, length, value and updates CRC. Returns true if tlv was added. * Returns false if there's not enough space. */ static bool ext4_fc_add_tlv(struct super_block *sb, u16 tag, u16 len, u8 *val, u32 *crc) { struct ext4_fc_tl tl; u8 *dst; dst = ext4_fc_reserve_space(sb, EXT4_FC_TAG_BASE_LEN + len, crc); if (!dst) return false; tl.fc_tag = cpu_to_le16(tag); tl.fc_len = cpu_to_le16(len); memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN); memcpy(dst + EXT4_FC_TAG_BASE_LEN, val, len); return true; } /* Same as above, but adds dentry tlv. */ static bool ext4_fc_add_dentry_tlv(struct super_block *sb, u32 *crc, struct ext4_fc_dentry_update *fc_dentry) { struct ext4_fc_dentry_info fcd; struct ext4_fc_tl tl; int dlen = fc_dentry->fcd_name.len; u8 *dst = ext4_fc_reserve_space(sb, EXT4_FC_TAG_BASE_LEN + sizeof(fcd) + dlen, crc); if (!dst) return false; fcd.fc_parent_ino = cpu_to_le32(fc_dentry->fcd_parent); fcd.fc_ino = cpu_to_le32(fc_dentry->fcd_ino); tl.fc_tag = cpu_to_le16(fc_dentry->fcd_op); tl.fc_len = cpu_to_le16(sizeof(fcd) + dlen); memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN); dst += EXT4_FC_TAG_BASE_LEN; memcpy(dst, &fcd, sizeof(fcd)); dst += sizeof(fcd); memcpy(dst, fc_dentry->fcd_name.name, dlen); return true; } /* * Writes inode in the fast commit space under TLV with tag @tag. * Returns 0 on success, error on failure. */ static int ext4_fc_write_inode(struct inode *inode, u32 *crc) { struct ext4_inode_info *ei = EXT4_I(inode); int inode_len = EXT4_GOOD_OLD_INODE_SIZE; int ret; struct ext4_iloc iloc; struct ext4_fc_inode fc_inode; struct ext4_fc_tl tl; u8 *dst; ret = ext4_get_inode_loc(inode, &iloc); if (ret) return ret; if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA)) inode_len = EXT4_INODE_SIZE(inode->i_sb); else if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) inode_len += ei->i_extra_isize; fc_inode.fc_ino = cpu_to_le32(inode->i_ino); tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_INODE); tl.fc_len = cpu_to_le16(inode_len + sizeof(fc_inode.fc_ino)); ret = -ECANCELED; dst = ext4_fc_reserve_space(inode->i_sb, EXT4_FC_TAG_BASE_LEN + inode_len + sizeof(fc_inode.fc_ino), crc); if (!dst) goto err; memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN); dst += EXT4_FC_TAG_BASE_LEN; memcpy(dst, &fc_inode, sizeof(fc_inode)); dst += sizeof(fc_inode); memcpy(dst, (u8 *)ext4_raw_inode(&iloc), inode_len); ret = 0; err: brelse(iloc.bh); return ret; } /* * Writes updated data ranges for the inode in question. Updates CRC. * Returns 0 on success, error otherwise. */ static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc) { ext4_lblk_t old_blk_size, cur_lblk_off, new_blk_size; struct ext4_inode_info *ei = EXT4_I(inode); struct ext4_map_blocks map; struct ext4_fc_add_range fc_ext; struct ext4_fc_del_range lrange; struct ext4_extent *ex; int ret; mutex_lock(&ei->i_fc_lock); if (ei->i_fc_lblk_len == 0) { mutex_unlock(&ei->i_fc_lock); return 0; } old_blk_size = ei->i_fc_lblk_start; new_blk_size = ei->i_fc_lblk_start + ei->i_fc_lblk_len - 1; ei->i_fc_lblk_len = 0; mutex_unlock(&ei->i_fc_lock); cur_lblk_off = old_blk_size; ext4_debug("will try writing %d to %d for inode %ld\n", cur_lblk_off, new_blk_size, inode->i_ino); while (cur_lblk_off <= new_blk_size) { map.m_lblk = cur_lblk_off; map.m_len = new_blk_size - cur_lblk_off + 1; ret = ext4_map_blocks(NULL, inode, &map, 0); if (ret < 0) return -ECANCELED; if (map.m_len == 0) { cur_lblk_off++; continue; } if (ret == 0) { lrange.fc_ino = cpu_to_le32(inode->i_ino); lrange.fc_lblk = cpu_to_le32(map.m_lblk); lrange.fc_len = cpu_to_le32(map.m_len); if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_DEL_RANGE, sizeof(lrange), (u8 *)&lrange, crc)) return -ENOSPC; } else { unsigned int max = (map.m_flags & EXT4_MAP_UNWRITTEN) ? EXT_UNWRITTEN_MAX_LEN : EXT_INIT_MAX_LEN; /* Limit the number of blocks in one extent */ map.m_len = min(max, map.m_len); fc_ext.fc_ino = cpu_to_le32(inode->i_ino); ex = (struct ext4_extent *)&fc_ext.fc_ex; ex->ee_block = cpu_to_le32(map.m_lblk); ex->ee_len = cpu_to_le16(map.m_len); ext4_ext_store_pblock(ex, map.m_pblk); if (map.m_flags & EXT4_MAP_UNWRITTEN) ext4_ext_mark_unwritten(ex); else ext4_ext_mark_initialized(ex); if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_ADD_RANGE, sizeof(fc_ext), (u8 *)&fc_ext, crc)) return -ENOSPC; } cur_lblk_off += map.m_len; } return 0; } /* Submit data for all the fast commit inodes */ static int ext4_fc_submit_inode_data_all(journal_t *journal) { struct super_block *sb = journal->j_private; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_inode_info *ei; int ret = 0; spin_lock(&sbi->s_fc_lock); list_for_each_entry(ei, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { ext4_set_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING); while (atomic_read(&ei->i_fc_updates)) { DEFINE_WAIT(wait); prepare_to_wait(&ei->i_fc_wait, &wait, TASK_UNINTERRUPTIBLE); if (atomic_read(&ei->i_fc_updates)) { spin_unlock(&sbi->s_fc_lock); schedule(); spin_lock(&sbi->s_fc_lock); } finish_wait(&ei->i_fc_wait, &wait); } spin_unlock(&sbi->s_fc_lock); ret = jbd2_submit_inode_data(journal, ei->jinode); if (ret) return ret; spin_lock(&sbi->s_fc_lock); } spin_unlock(&sbi->s_fc_lock); return ret; } /* Wait for completion of data for all the fast commit inodes */ static int ext4_fc_wait_inode_data_all(journal_t *journal) { struct super_block *sb = journal->j_private; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_inode_info *pos, *n; int ret = 0; spin_lock(&sbi->s_fc_lock); list_for_each_entry_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { if (!ext4_test_inode_state(&pos->vfs_inode, EXT4_STATE_FC_COMMITTING)) continue; spin_unlock(&sbi->s_fc_lock); ret = jbd2_wait_inode_data(journal, pos->jinode); if (ret) return ret; spin_lock(&sbi->s_fc_lock); } spin_unlock(&sbi->s_fc_lock); return 0; } /* Commit all the directory entry updates */ static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc) __acquires(&sbi->s_fc_lock) __releases(&sbi->s_fc_lock) { struct super_block *sb = journal->j_private; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_fc_dentry_update *fc_dentry, *fc_dentry_n; struct inode *inode; struct ext4_inode_info *ei; int ret; if (list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN])) return 0; list_for_each_entry_safe(fc_dentry, fc_dentry_n, &sbi->s_fc_dentry_q[FC_Q_MAIN], fcd_list) { if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT) { spin_unlock(&sbi->s_fc_lock); if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) { ret = -ENOSPC; goto lock_and_exit; } spin_lock(&sbi->s_fc_lock); continue; } /* * With fcd_dilist we need not loop in sbi->s_fc_q to get the * corresponding inode pointer */ WARN_ON(list_empty(&fc_dentry->fcd_dilist)); ei = list_first_entry(&fc_dentry->fcd_dilist, struct ext4_inode_info, i_fc_dilist); inode = &ei->vfs_inode; WARN_ON(inode->i_ino != fc_dentry->fcd_ino); spin_unlock(&sbi->s_fc_lock); /* * We first write the inode and then the create dirent. This * allows the recovery code to create an unnamed inode first * and then link it to a directory entry. This allows us * to use namei.c routines almost as is and simplifies * the recovery code. */ ret = ext4_fc_write_inode(inode, crc); if (ret) goto lock_and_exit; ret = ext4_fc_write_inode_data(inode, crc); if (ret) goto lock_and_exit; if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) { ret = -ENOSPC; goto lock_and_exit; } spin_lock(&sbi->s_fc_lock); } return 0; lock_and_exit: spin_lock(&sbi->s_fc_lock); return ret; } static int ext4_fc_perform_commit(journal_t *journal) { struct super_block *sb = journal->j_private; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_inode_info *iter; struct ext4_fc_head head; struct inode *inode; struct blk_plug plug; int ret = 0; u32 crc = 0; ret = ext4_fc_submit_inode_data_all(journal); if (ret) return ret; ret = ext4_fc_wait_inode_data_all(journal); if (ret) return ret; /* * If file system device is different from journal device, issue a cache * flush before we start writing fast commit blocks. */ if (journal->j_fs_dev != journal->j_dev) blkdev_issue_flush(journal->j_fs_dev); blk_start_plug(&plug); if (sbi->s_fc_bytes == 0) { /* * Add a head tag only if this is the first fast commit * in this TID. */ head.fc_features = cpu_to_le32(EXT4_FC_SUPPORTED_FEATURES); head.fc_tid = cpu_to_le32( sbi->s_journal->j_running_transaction->t_tid); if (!ext4_fc_add_tlv(sb, EXT4_FC_TAG_HEAD, sizeof(head), (u8 *)&head, &crc)) { ret = -ENOSPC; goto out; } } spin_lock(&sbi->s_fc_lock); ret = ext4_fc_commit_dentry_updates(journal, &crc); if (ret) { spin_unlock(&sbi->s_fc_lock); goto out; } list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { inode = &iter->vfs_inode; if (!ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) continue; spin_unlock(&sbi->s_fc_lock); ret = ext4_fc_write_inode_data(inode, &crc); if (ret) goto out; ret = ext4_fc_write_inode(inode, &crc); if (ret) goto out; spin_lock(&sbi->s_fc_lock); } spin_unlock(&sbi->s_fc_lock); ret = ext4_fc_write_tail(sb, crc); out: blk_finish_plug(&plug); return ret; } static void ext4_fc_update_stats(struct super_block *sb, int status, u64 commit_time, int nblks, tid_t commit_tid) { struct ext4_fc_stats *stats = &EXT4_SB(sb)->s_fc_stats; ext4_debug("Fast commit ended with status = %d for tid %u", status, commit_tid); if (status == EXT4_FC_STATUS_OK) { stats->fc_num_commits++; stats->fc_numblks += nblks; if (likely(stats->s_fc_avg_commit_time)) stats->s_fc_avg_commit_time = (commit_time + stats->s_fc_avg_commit_time * 3) / 4; else stats->s_fc_avg_commit_time = commit_time; } else if (status == EXT4_FC_STATUS_FAILED || status == EXT4_FC_STATUS_INELIGIBLE) { if (status == EXT4_FC_STATUS_FAILED) stats->fc_failed_commits++; stats->fc_ineligible_commits++; } else { stats->fc_skipped_commits++; } trace_ext4_fc_commit_stop(sb, nblks, status, commit_tid); } /* * The main commit entry point. Performs a fast commit for transaction * commit_tid if needed. If it's not possible to perform a fast commit * due to various reasons, we fall back to full commit. Returns 0 * on success, error otherwise. */ int ext4_fc_commit(journal_t *journal, tid_t commit_tid) { struct super_block *sb = journal->j_private; struct ext4_sb_info *sbi = EXT4_SB(sb); int nblks = 0, ret, bsize = journal->j_blocksize; int subtid = atomic_read(&sbi->s_fc_subtid); int status = EXT4_FC_STATUS_OK, fc_bufs_before = 0; ktime_t start_time, commit_time; if (!test_opt2(sb, JOURNAL_FAST_COMMIT)) return jbd2_complete_transaction(journal, commit_tid); trace_ext4_fc_commit_start(sb, commit_tid); start_time = ktime_get(); restart_fc: ret = jbd2_fc_begin_commit(journal, commit_tid); if (ret == -EALREADY) { /* There was an ongoing commit, check if we need to restart */ if (atomic_read(&sbi->s_fc_subtid) <= subtid && commit_tid > journal->j_commit_sequence) goto restart_fc; ext4_fc_update_stats(sb, EXT4_FC_STATUS_SKIPPED, 0, 0, commit_tid); return 0; } else if (ret) { /* * Commit couldn't start. Just update stats and perform a * full commit. */ ext4_fc_update_stats(sb, EXT4_FC_STATUS_FAILED, 0, 0, commit_tid); return jbd2_complete_transaction(journal, commit_tid); } /* * After establishing journal barrier via jbd2_fc_begin_commit(), check * if we are fast commit ineligible. */ if (ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE)) { status = EXT4_FC_STATUS_INELIGIBLE; goto fallback; } fc_bufs_before = (sbi->s_fc_bytes + bsize - 1) / bsize; ret = ext4_fc_perform_commit(journal); if (ret < 0) { status = EXT4_FC_STATUS_FAILED; goto fallback; } nblks = (sbi->s_fc_bytes + bsize - 1) / bsize - fc_bufs_before; ret = jbd2_fc_wait_bufs(journal, nblks); if (ret < 0) { status = EXT4_FC_STATUS_FAILED; goto fallback; } atomic_inc(&sbi->s_fc_subtid); ret = jbd2_fc_end_commit(journal); /* * weight the commit time higher than the average time so we * don't react too strongly to vast changes in the commit time */ commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time)); ext4_fc_update_stats(sb, status, commit_time, nblks, commit_tid); return ret; fallback: ret = jbd2_fc_end_commit_fallback(journal); ext4_fc_update_stats(sb, status, 0, 0, commit_tid); return ret; } /* * Fast commit cleanup routine. This is called after every fast commit and * full commit. full is true if we are called after a full commit. */ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid) { struct super_block *sb = journal->j_private; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_inode_info *iter, *iter_n; struct ext4_fc_dentry_update *fc_dentry; if (full && sbi->s_fc_bh) sbi->s_fc_bh = NULL; trace_ext4_fc_cleanup(journal, full, tid); jbd2_fc_release_bufs(journal); spin_lock(&sbi->s_fc_lock); list_for_each_entry_safe(iter, iter_n, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { list_del_init(&iter->i_fc_list); ext4_clear_inode_state(&iter->vfs_inode, EXT4_STATE_FC_COMMITTING); if (iter->i_sync_tid <= tid) ext4_fc_reset_inode(&iter->vfs_inode); /* Make sure EXT4_STATE_FC_COMMITTING bit is clear */ smp_mb(); #if (BITS_PER_LONG < 64) wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_COMMITTING); #else wake_up_bit(&iter->i_flags, EXT4_STATE_FC_COMMITTING); #endif } while (!list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN])) { fc_dentry = list_first_entry(&sbi->s_fc_dentry_q[FC_Q_MAIN], struct ext4_fc_dentry_update, fcd_list); list_del_init(&fc_dentry->fcd_list); list_del_init(&fc_dentry->fcd_dilist); spin_unlock(&sbi->s_fc_lock); if (fc_dentry->fcd_name.name && fc_dentry->fcd_name.len > DNAME_INLINE_LEN) kfree(fc_dentry->fcd_name.name); kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry); spin_lock(&sbi->s_fc_lock); } list_splice_init(&sbi->s_fc_dentry_q[FC_Q_STAGING], &sbi->s_fc_dentry_q[FC_Q_MAIN]); list_splice_init(&sbi->s_fc_q[FC_Q_STAGING], &sbi->s_fc_q[FC_Q_MAIN]); if (tid >= sbi->s_fc_ineligible_tid) { sbi->s_fc_ineligible_tid = 0; ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); } if (full) sbi->s_fc_bytes = 0; spin_unlock(&sbi->s_fc_lock); trace_ext4_fc_stats(sb); } /* Ext4 Replay Path Routines */ /* Helper struct for dentry replay routines */ struct dentry_info_args { int parent_ino, dname_len, ino, inode_len; char *dname; }; /* Same as struct ext4_fc_tl, but uses native endianness fields */ struct ext4_fc_tl_mem { u16 fc_tag; u16 fc_len; }; static inline void tl_to_darg(struct dentry_info_args *darg, struct ext4_fc_tl_mem *tl, u8 *val) { struct ext4_fc_dentry_info fcd; memcpy(&fcd, val, sizeof(fcd)); darg->parent_ino = le32_to_cpu(fcd.fc_parent_ino); darg->ino = le32_to_cpu(fcd.fc_ino); darg->dname = val + offsetof(struct ext4_fc_dentry_info, fc_dname); darg->dname_len = tl->fc_len - sizeof(struct ext4_fc_dentry_info); } static inline void ext4_fc_get_tl(struct ext4_fc_tl_mem *tl, u8 *val) { struct ext4_fc_tl tl_disk; memcpy(&tl_disk, val, EXT4_FC_TAG_BASE_LEN); tl->fc_len = le16_to_cpu(tl_disk.fc_len); tl->fc_tag = le16_to_cpu(tl_disk.fc_tag); } /* Unlink replay function */ static int ext4_fc_replay_unlink(struct super_block *sb, struct ext4_fc_tl_mem *tl, u8 *val) { struct inode *inode, *old_parent; struct qstr entry; struct dentry_info_args darg; int ret = 0; tl_to_darg(&darg, tl, val); trace_ext4_fc_replay(sb, EXT4_FC_TAG_UNLINK, darg.ino, darg.parent_ino, darg.dname_len); entry.name = darg.dname; entry.len = darg.dname_len; inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL); if (IS_ERR(inode)) { ext4_debug("Inode %d not found", darg.ino); return 0; } old_parent = ext4_iget(sb, darg.parent_ino, EXT4_IGET_NORMAL); if (IS_ERR(old_parent)) { ext4_debug("Dir with inode %d not found", darg.parent_ino); iput(inode); return 0; } ret = __ext4_unlink(old_parent, &entry, inode, NULL); /* -ENOENT ok coz it might not exist anymore. */ if (ret == -ENOENT) ret = 0; iput(old_parent); iput(inode); return ret; } static int ext4_fc_replay_link_internal(struct super_block *sb, struct dentry_info_args *darg, struct inode *inode) { struct inode *dir = NULL; struct dentry *dentry_dir = NULL, *dentry_inode = NULL; struct qstr qstr_dname = QSTR_INIT(darg->dname, darg->dname_len); int ret = 0; dir = ext4_iget(sb, darg->parent_ino, EXT4_IGET_NORMAL); if (IS_ERR(dir)) { ext4_debug("Dir with inode %d not found.", darg->parent_ino); dir = NULL; goto out; } dentry_dir = d_obtain_alias(dir); if (IS_ERR(dentry_dir)) { ext4_debug("Failed to obtain dentry"); dentry_dir = NULL; goto out; } dentry_inode = d_alloc(dentry_dir, &qstr_dname); if (!dentry_inode) { ext4_debug("Inode dentry not created."); ret = -ENOMEM; goto out; } ret = __ext4_link(dir, inode, dentry_inode); /* * It's possible that link already existed since data blocks * for the dir in question got persisted before we crashed OR * we replayed this tag and crashed before the entire replay * could complete. */ if (ret && ret != -EEXIST) { ext4_debug("Failed to link\n"); goto out; } ret = 0; out: if (dentry_dir) { d_drop(dentry_dir); dput(dentry_dir); } else if (dir) { iput(dir); } if (dentry_inode) { d_drop(dentry_inode); dput(dentry_inode); } return ret; } /* Link replay function */ static int ext4_fc_replay_link(struct super_block *sb, struct ext4_fc_tl_mem *tl, u8 *val) { struct inode *inode; struct dentry_info_args darg; int ret = 0; tl_to_darg(&darg, tl, val); trace_ext4_fc_replay(sb, EXT4_FC_TAG_LINK, darg.ino, darg.parent_ino, darg.dname_len); inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL); if (IS_ERR(inode)) { ext4_debug("Inode not found."); return 0; } ret = ext4_fc_replay_link_internal(sb, &darg, inode); iput(inode); return ret; } /* * Record all the modified inodes during replay. We use this later to setup * block bitmaps correctly. */ static int ext4_fc_record_modified_inode(struct super_block *sb, int ino) { struct ext4_fc_replay_state *state; int i; state = &EXT4_SB(sb)->s_fc_replay_state; for (i = 0; i < state->fc_modified_inodes_used; i++) if (state->fc_modified_inodes[i] == ino) return 0; if (state->fc_modified_inodes_used == state->fc_modified_inodes_size) { int *fc_modified_inodes; fc_modified_inodes = krealloc(state->fc_modified_inodes, sizeof(int) * (state->fc_modified_inodes_size + EXT4_FC_REPLAY_REALLOC_INCREMENT), GFP_KERNEL); if (!fc_modified_inodes) return -ENOMEM; state->fc_modified_inodes = fc_modified_inodes; state->fc_modified_inodes_size += EXT4_FC_REPLAY_REALLOC_INCREMENT; } state->fc_modified_inodes[state->fc_modified_inodes_used++] = ino; return 0; } /* * Inode replay function */ static int ext4_fc_replay_inode(struct super_block *sb, struct ext4_fc_tl_mem *tl, u8 *val) { struct ext4_fc_inode fc_inode; struct ext4_inode *raw_inode; struct ext4_inode *raw_fc_inode; struct inode *inode = NULL; struct ext4_iloc iloc; int inode_len, ino, ret, tag = tl->fc_tag; struct ext4_extent_header *eh; size_t off_gen = offsetof(struct ext4_inode, i_generation); memcpy(&fc_inode, val, sizeof(fc_inode)); ino = le32_to_cpu(fc_inode.fc_ino); trace_ext4_fc_replay(sb, tag, ino, 0, 0); inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL); if (!IS_ERR(inode)) { ext4_ext_clear_bb(inode); iput(inode); } inode = NULL; ret = ext4_fc_record_modified_inode(sb, ino); if (ret) goto out; raw_fc_inode = (struct ext4_inode *) (val + offsetof(struct ext4_fc_inode, fc_raw_inode)); ret = ext4_get_fc_inode_loc(sb, ino, &iloc); if (ret) goto out; inode_len = tl->fc_len - sizeof(struct ext4_fc_inode); raw_inode = ext4_raw_inode(&iloc); memcpy(raw_inode, raw_fc_inode, offsetof(struct ext4_inode, i_block)); memcpy((u8 *)raw_inode + off_gen, (u8 *)raw_fc_inode + off_gen, inode_len - off_gen); if (le32_to_cpu(raw_inode->i_flags) & EXT4_EXTENTS_FL) { eh = (struct ext4_extent_header *)(&raw_inode->i_block[0]); if (eh->eh_magic != EXT4_EXT_MAGIC) { memset(eh, 0, sizeof(*eh)); eh->eh_magic = EXT4_EXT_MAGIC; eh->eh_max = cpu_to_le16( (sizeof(raw_inode->i_block) - sizeof(struct ext4_extent_header)) / sizeof(struct ext4_extent)); } } else if (le32_to_cpu(raw_inode->i_flags) & EXT4_INLINE_DATA_FL) { memcpy(raw_inode->i_block, raw_fc_inode->i_block, sizeof(raw_inode->i_block)); } /* Immediately update the inode on disk. */ ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh); if (ret) goto out; ret = sync_dirty_buffer(iloc.bh); if (ret) goto out; ret = ext4_mark_inode_used(sb, ino); if (ret) goto out; /* Given that we just wrote the inode on disk, this SHOULD succeed. */ inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL); if (IS_ERR(inode)) { ext4_debug("Inode not found."); return -EFSCORRUPTED; } /* * Our allocator could have made different decisions than before * crashing. This should be fixed but until then, we calculate * the number of blocks the inode. */ if (!ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA)) ext4_ext_replay_set_iblocks(inode); inode->i_generation = le32_to_cpu(ext4_raw_inode(&iloc)->i_generation); ext4_reset_inode_seed(inode); ext4_inode_csum_set(inode, ext4_raw_inode(&iloc), EXT4_I(inode)); ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh); sync_dirty_buffer(iloc.bh); brelse(iloc.bh); out: iput(inode); if (!ret) blkdev_issue_flush(sb->s_bdev); return 0; } /* * Dentry create replay function. * * EXT4_FC_TAG_CREAT is preceded by EXT4_FC_TAG_INODE_FULL. Which means, the * inode for which we are trying to create a dentry here, should already have * been replayed before we start here. */ static int ext4_fc_replay_create(struct super_block *sb, struct ext4_fc_tl_mem *tl, u8 *val) { int ret = 0; struct inode *inode = NULL; struct inode *dir = NULL; struct dentry_info_args darg; tl_to_darg(&darg, tl, val); trace_ext4_fc_replay(sb, EXT4_FC_TAG_CREAT, darg.ino, darg.parent_ino, darg.dname_len); /* This takes care of update group descriptor and other metadata */ ret = ext4_mark_inode_used(sb, darg.ino); if (ret) goto out; inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL); if (IS_ERR(inode)) { ext4_debug("inode %d not found.", darg.ino); inode = NULL; ret = -EINVAL; goto out; } if (S_ISDIR(inode->i_mode)) { /* * If we are creating a directory, we need to make sure that the * dot and dot dot dirents are setup properly. */ dir = ext4_iget(sb, darg.parent_ino, EXT4_IGET_NORMAL); if (IS_ERR(dir)) { ext4_debug("Dir %d not found.", darg.ino); goto out; } ret = ext4_init_new_dir(NULL, dir, inode); iput(dir); if (ret) { ret = 0; goto out; } } ret = ext4_fc_replay_link_internal(sb, &darg, inode); if (ret) goto out; set_nlink(inode, 1); ext4_mark_inode_dirty(NULL, inode); out: iput(inode); return ret; } /* * Record physical disk regions which are in use as per fast commit area, * and used by inodes during replay phase. Our simple replay phase * allocator excludes these regions from allocation. */ int ext4_fc_record_regions(struct super_block *sb, int ino, ext4_lblk_t lblk, ext4_fsblk_t pblk, int len, int replay) { struct ext4_fc_replay_state *state; struct ext4_fc_alloc_region *region; state = &EXT4_SB(sb)->s_fc_replay_state; /* * during replay phase, the fc_regions_valid may not same as * fc_regions_used, update it when do new additions. */ if (replay && state->fc_regions_used != state->fc_regions_valid) state->fc_regions_used = state->fc_regions_valid; if (state->fc_regions_used == state->fc_regions_size) { struct ext4_fc_alloc_region *fc_regions; fc_regions = krealloc(state->fc_regions, sizeof(struct ext4_fc_alloc_region) * (state->fc_regions_size + EXT4_FC_REPLAY_REALLOC_INCREMENT), GFP_KERNEL); if (!fc_regions) return -ENOMEM; state->fc_regions_size += EXT4_FC_REPLAY_REALLOC_INCREMENT; state->fc_regions = fc_regions; } region = &state->fc_regions[state->fc_regions_used++]; region->ino = ino; region->lblk = lblk; region->pblk = pblk; region->len = len; if (replay) state->fc_regions_valid++; return 0; } /* Replay add range tag */ static int ext4_fc_replay_add_range(struct super_block *sb, struct ext4_fc_tl_mem *tl, u8 *val) { struct ext4_fc_add_range fc_add_ex; struct ext4_extent newex, *ex; struct inode *inode; ext4_lblk_t start, cur; int remaining, len; ext4_fsblk_t start_pblk; struct ext4_map_blocks map; struct ext4_ext_path *path = NULL; int ret; memcpy(&fc_add_ex, val, sizeof(fc_add_ex)); ex = (struct ext4_extent *)&fc_add_ex.fc_ex; trace_ext4_fc_replay(sb, EXT4_FC_TAG_ADD_RANGE, le32_to_cpu(fc_add_ex.fc_ino), le32_to_cpu(ex->ee_block), ext4_ext_get_actual_len(ex)); inode = ext4_iget(sb, le32_to_cpu(fc_add_ex.fc_ino), EXT4_IGET_NORMAL); if (IS_ERR(inode)) { ext4_debug("Inode not found."); return 0; } ret = ext4_fc_record_modified_inode(sb, inode->i_ino); if (ret) goto out; start = le32_to_cpu(ex->ee_block); start_pblk = ext4_ext_pblock(ex); len = ext4_ext_get_actual_len(ex); cur = start; remaining = len; ext4_debug("ADD_RANGE, lblk %d, pblk %lld, len %d, unwritten %d, inode %ld\n", start, start_pblk, len, ext4_ext_is_unwritten(ex), inode->i_ino); while (remaining > 0) { map.m_lblk = cur; map.m_len = remaining; map.m_pblk = 0; ret = ext4_map_blocks(NULL, inode, &map, 0); if (ret < 0) goto out; if (ret == 0) { /* Range is not mapped */ path = ext4_find_extent(inode, cur, NULL, 0); if (IS_ERR(path)) goto out; memset(&newex, 0, sizeof(newex)); newex.ee_block = cpu_to_le32(cur); ext4_ext_store_pblock( &newex, start_pblk + cur - start); newex.ee_len = cpu_to_le16(map.m_len); if (ext4_ext_is_unwritten(ex)) ext4_ext_mark_unwritten(&newex); down_write(&EXT4_I(inode)->i_data_sem); ret = ext4_ext_insert_extent( NULL, inode, &path, &newex, 0); up_write((&EXT4_I(inode)->i_data_sem)); ext4_free_ext_path(path); if (ret) goto out; goto next; } if (start_pblk + cur - start != map.m_pblk) { /* * Logical to physical mapping changed. This can happen * if this range was removed and then reallocated to * map to new physical blocks during a fast commit. */ ret = ext4_ext_replay_update_ex(inode, cur, map.m_len, ext4_ext_is_unwritten(ex), start_pblk + cur - start); if (ret) goto out; /* * Mark the old blocks as free since they aren't used * anymore. We maintain an array of all the modified * inodes. In case these blocks are still used at either * a different logical range in the same inode or in * some different inode, we will mark them as allocated * at the end of the FC replay using our array of * modified inodes. */ ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, false); goto next; } /* Range is mapped and needs a state change */ ext4_debug("Converting from %ld to %d %lld", map.m_flags & EXT4_MAP_UNWRITTEN, ext4_ext_is_unwritten(ex), map.m_pblk); ret = ext4_ext_replay_update_ex(inode, cur, map.m_len, ext4_ext_is_unwritten(ex), map.m_pblk); if (ret) goto out; /* * We may have split the extent tree while toggling the state. * Try to shrink the extent tree now. */ ext4_ext_replay_shrink_inode(inode, start + len); next: cur += map.m_len; remaining -= map.m_len; } ext4_ext_replay_shrink_inode(inode, i_size_read(inode) >> sb->s_blocksize_bits); out: iput(inode); return 0; } /* Replay DEL_RANGE tag */ static int ext4_fc_replay_del_range(struct super_block *sb, struct ext4_fc_tl_mem *tl, u8 *val) { struct inode *inode; struct ext4_fc_del_range lrange; struct ext4_map_blocks map; ext4_lblk_t cur, remaining; int ret; memcpy(&lrange, val, sizeof(lrange)); cur = le32_to_cpu(lrange.fc_lblk); remaining = le32_to_cpu(lrange.fc_len); trace_ext4_fc_replay(sb, EXT4_FC_TAG_DEL_RANGE, le32_to_cpu(lrange.fc_ino), cur, remaining); inode = ext4_iget(sb, le32_to_cpu(lrange.fc_ino), EXT4_IGET_NORMAL); if (IS_ERR(inode)) { ext4_debug("Inode %d not found", le32_to_cpu(lrange.fc_ino)); return 0; } ret = ext4_fc_record_modified_inode(sb, inode->i_ino); if (ret) goto out; ext4_debug("DEL_RANGE, inode %ld, lblk %d, len %d\n", inode->i_ino, le32_to_cpu(lrange.fc_lblk), le32_to_cpu(lrange.fc_len)); while (remaining > 0) { map.m_lblk = cur; map.m_len = remaining; ret = ext4_map_blocks(NULL, inode, &map, 0); if (ret < 0) goto out; if (ret > 0) { remaining -= ret; cur += ret; ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, false); } else { remaining -= map.m_len; cur += map.m_len; } } down_write(&EXT4_I(inode)->i_data_sem); ret = ext4_ext_remove_space(inode, le32_to_cpu(lrange.fc_lblk), le32_to_cpu(lrange.fc_lblk) + le32_to_cpu(lrange.fc_len) - 1); up_write(&EXT4_I(inode)->i_data_sem); if (ret) goto out; ext4_ext_replay_shrink_inode(inode, i_size_read(inode) >> sb->s_blocksize_bits); ext4_mark_inode_dirty(NULL, inode); out: iput(inode); return 0; } static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb) { struct ext4_fc_replay_state *state; struct inode *inode; struct ext4_ext_path *path = NULL; struct ext4_map_blocks map; int i, ret, j; ext4_lblk_t cur, end; state = &EXT4_SB(sb)->s_fc_replay_state; for (i = 0; i < state->fc_modified_inodes_used; i++) { inode = ext4_iget(sb, state->fc_modified_inodes[i], EXT4_IGET_NORMAL); if (IS_ERR(inode)) { ext4_debug("Inode %d not found.", state->fc_modified_inodes[i]); continue; } cur = 0; end = EXT_MAX_BLOCKS; if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA)) { iput(inode); continue; } while (cur < end) { map.m_lblk = cur; map.m_len = end - cur; ret = ext4_map_blocks(NULL, inode, &map, 0); if (ret < 0) break; if (ret > 0) { path = ext4_find_extent(inode, map.m_lblk, NULL, 0); if (!IS_ERR(path)) { for (j = 0; j < path->p_depth; j++) ext4_mb_mark_bb(inode->i_sb, path[j].p_block, 1, true); ext4_free_ext_path(path); } cur += ret; ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, true); } else { cur = cur + (map.m_len ? map.m_len : 1); } } iput(inode); } } /* * Check if block is in excluded regions for block allocation. The simple * allocator that runs during replay phase is calls this function to see * if it is okay to use a block. */ bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t blk) { int i; struct ext4_fc_replay_state *state; state = &EXT4_SB(sb)->s_fc_replay_state; for (i = 0; i < state->fc_regions_valid; i++) { if (state->fc_regions[i].ino == 0 || state->fc_regions[i].len == 0) continue; if (in_range(blk, state->fc_regions[i].pblk, state->fc_regions[i].len)) return true; } return false; } /* Cleanup function called after replay */ void ext4_fc_replay_cleanup(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); sbi->s_mount_state &= ~EXT4_FC_REPLAY; kfree(sbi->s_fc_replay_state.fc_regions); kfree(sbi->s_fc_replay_state.fc_modified_inodes); } static bool ext4_fc_value_len_isvalid(struct ext4_sb_info *sbi, int tag, int len) { switch (tag) { case EXT4_FC_TAG_ADD_RANGE: return len == sizeof(struct ext4_fc_add_range); case EXT4_FC_TAG_DEL_RANGE: return len == sizeof(struct ext4_fc_del_range); case EXT4_FC_TAG_CREAT: case EXT4_FC_TAG_LINK: case EXT4_FC_TAG_UNLINK: len -= sizeof(struct ext4_fc_dentry_info); return len >= 1 && len <= EXT4_NAME_LEN; case EXT4_FC_TAG_INODE: len -= sizeof(struct ext4_fc_inode); return len >= EXT4_GOOD_OLD_INODE_SIZE && len <= sbi->s_inode_size; case EXT4_FC_TAG_PAD: return true; /* padding can have any length */ case EXT4_FC_TAG_TAIL: return len >= sizeof(struct ext4_fc_tail); case EXT4_FC_TAG_HEAD: return len == sizeof(struct ext4_fc_head); } return false; } /* * Recovery Scan phase handler * * This function is called during the scan phase and is responsible * for doing following things: * - Make sure the fast commit area has valid tags for replay * - Count number of tags that need to be replayed by the replay handler * - Verify CRC * - Create a list of excluded blocks for allocation during replay phase * * This function returns JBD2_FC_REPLAY_CONTINUE to indicate that SCAN is * incomplete and JBD2 should send more blocks. It returns JBD2_FC_REPLAY_STOP * to indicate that scan has finished and JBD2 can now start replay phase. * It returns a negative error to indicate that there was an error. At the end * of a successful scan phase, sbi->s_fc_replay_state.fc_replay_num_tags is set * to indicate the number of tags that need to replayed during the replay phase. */ static int ext4_fc_replay_scan(journal_t *journal, struct buffer_head *bh, int off, tid_t expected_tid) { struct super_block *sb = journal->j_private; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_fc_replay_state *state; int ret = JBD2_FC_REPLAY_CONTINUE; struct ext4_fc_add_range ext; struct ext4_fc_tl_mem tl; struct ext4_fc_tail tail; __u8 *start, *end, *cur, *val; struct ext4_fc_head head; struct ext4_extent *ex; state = &sbi->s_fc_replay_state; start = (u8 *)bh->b_data; end = start + journal->j_blocksize; if (state->fc_replay_expected_off == 0) { state->fc_cur_tag = 0; state->fc_replay_num_tags = 0; state->fc_crc = 0; state->fc_regions = NULL; state->fc_regions_valid = state->fc_regions_used = state->fc_regions_size = 0; /* Check if we can stop early */ if (le16_to_cpu(((struct ext4_fc_tl *)start)->fc_tag) != EXT4_FC_TAG_HEAD) return 0; } if (off != state->fc_replay_expected_off) { ret = -EFSCORRUPTED; goto out_err; } state->fc_replay_expected_off++; for (cur = start; cur <= end - EXT4_FC_TAG_BASE_LEN; cur = cur + EXT4_FC_TAG_BASE_LEN + tl.fc_len) { ext4_fc_get_tl(&tl, cur); val = cur + EXT4_FC_TAG_BASE_LEN; if (tl.fc_len > end - val || !ext4_fc_value_len_isvalid(sbi, tl.fc_tag, tl.fc_len)) { ret = state->fc_replay_num_tags ? JBD2_FC_REPLAY_STOP : -ECANCELED; goto out_err; } ext4_debug("Scan phase, tag:%s, blk %lld\n", tag2str(tl.fc_tag), bh->b_blocknr); switch (tl.fc_tag) { case EXT4_FC_TAG_ADD_RANGE: memcpy(&ext, val, sizeof(ext)); ex = (struct ext4_extent *)&ext.fc_ex; ret = ext4_fc_record_regions(sb, le32_to_cpu(ext.fc_ino), le32_to_cpu(ex->ee_block), ext4_ext_pblock(ex), ext4_ext_get_actual_len(ex), 0); if (ret < 0) break; ret = JBD2_FC_REPLAY_CONTINUE; fallthrough; case EXT4_FC_TAG_DEL_RANGE: case EXT4_FC_TAG_LINK: case EXT4_FC_TAG_UNLINK: case EXT4_FC_TAG_CREAT: case EXT4_FC_TAG_INODE: case EXT4_FC_TAG_PAD: state->fc_cur_tag++; state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur, EXT4_FC_TAG_BASE_LEN + tl.fc_len); break; case EXT4_FC_TAG_TAIL: state->fc_cur_tag++; memcpy(&tail, val, sizeof(tail)); state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur, EXT4_FC_TAG_BASE_LEN + offsetof(struct ext4_fc_tail, fc_crc)); if (le32_to_cpu(tail.fc_tid) == expected_tid && le32_to_cpu(tail.fc_crc) == state->fc_crc) { state->fc_replay_num_tags = state->fc_cur_tag; state->fc_regions_valid = state->fc_regions_used; } else { ret = state->fc_replay_num_tags ? JBD2_FC_REPLAY_STOP : -EFSBADCRC; } state->fc_crc = 0; break; case EXT4_FC_TAG_HEAD: memcpy(&head, val, sizeof(head)); if (le32_to_cpu(head.fc_features) & ~EXT4_FC_SUPPORTED_FEATURES) { ret = -EOPNOTSUPP; break; } if (le32_to_cpu(head.fc_tid) != expected_tid) { ret = JBD2_FC_REPLAY_STOP; break; } state->fc_cur_tag++; state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur, EXT4_FC_TAG_BASE_LEN + tl.fc_len); break; default: ret = state->fc_replay_num_tags ? JBD2_FC_REPLAY_STOP : -ECANCELED; } if (ret < 0 || ret == JBD2_FC_REPLAY_STOP) break; } out_err: trace_ext4_fc_replay_scan(sb, ret, off); return ret; } /* * Main recovery path entry point. * The meaning of return codes is similar as above. */ static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh, enum passtype pass, int off, tid_t expected_tid) { struct super_block *sb = journal->j_private; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_fc_tl_mem tl; __u8 *start, *end, *cur, *val; int ret = JBD2_FC_REPLAY_CONTINUE; struct ext4_fc_replay_state *state = &sbi->s_fc_replay_state; struct ext4_fc_tail tail; if (pass == PASS_SCAN) { state->fc_current_pass = PASS_SCAN; return ext4_fc_replay_scan(journal, bh, off, expected_tid); } if (state->fc_current_pass != pass) { state->fc_current_pass = pass; sbi->s_mount_state |= EXT4_FC_REPLAY; } if (!sbi->s_fc_replay_state.fc_replay_num_tags) { ext4_debug("Replay stops\n"); ext4_fc_set_bitmaps_and_counters(sb); return 0; } #ifdef CONFIG_EXT4_DEBUG if (sbi->s_fc_debug_max_replay && off >= sbi->s_fc_debug_max_replay) { pr_warn("Dropping fc block %d because max_replay set\n", off); return JBD2_FC_REPLAY_STOP; } #endif start = (u8 *)bh->b_data; end = start + journal->j_blocksize; for (cur = start; cur <= end - EXT4_FC_TAG_BASE_LEN; cur = cur + EXT4_FC_TAG_BASE_LEN + tl.fc_len) { ext4_fc_get_tl(&tl, cur); val = cur + EXT4_FC_TAG_BASE_LEN; if (state->fc_replay_num_tags == 0) { ret = JBD2_FC_REPLAY_STOP; ext4_fc_set_bitmaps_and_counters(sb); break; } ext4_debug("Replay phase, tag:%s\n", tag2str(tl.fc_tag)); state->fc_replay_num_tags--; switch (tl.fc_tag) { case EXT4_FC_TAG_LINK: ret = ext4_fc_replay_link(sb, &tl, val); break; case EXT4_FC_TAG_UNLINK: ret = ext4_fc_replay_unlink(sb, &tl, val); break; case EXT4_FC_TAG_ADD_RANGE: ret = ext4_fc_replay_add_range(sb, &tl, val); break; case EXT4_FC_TAG_CREAT: ret = ext4_fc_replay_create(sb, &tl, val); break; case EXT4_FC_TAG_DEL_RANGE: ret = ext4_fc_replay_del_range(sb, &tl, val); break; case EXT4_FC_TAG_INODE: ret = ext4_fc_replay_inode(sb, &tl, val); break; case EXT4_FC_TAG_PAD: trace_ext4_fc_replay(sb, EXT4_FC_TAG_PAD, 0, tl.fc_len, 0); break; case EXT4_FC_TAG_TAIL: trace_ext4_fc_replay(sb, EXT4_FC_TAG_TAIL, 0, tl.fc_len, 0); memcpy(&tail, val, sizeof(tail)); WARN_ON(le32_to_cpu(tail.fc_tid) != expected_tid); break; case EXT4_FC_TAG_HEAD: break; default: trace_ext4_fc_replay(sb, tl.fc_tag, 0, tl.fc_len, 0); ret = -ECANCELED; break; } if (ret < 0) break; ret = JBD2_FC_REPLAY_CONTINUE; } return ret; } void ext4_fc_init(struct super_block *sb, journal_t *journal) { /* * We set replay callback even if fast commit disabled because we may * could still have fast commit blocks that need to be replayed even if * fast commit has now been turned off. */ journal->j_fc_replay_callback = ext4_fc_replay; if (!test_opt2(sb, JOURNAL_FAST_COMMIT)) return; journal->j_fc_cleanup_callback = ext4_fc_cleanup; } static const char * const fc_ineligible_reasons[] = { [EXT4_FC_REASON_XATTR] = "Extended attributes changed", [EXT4_FC_REASON_CROSS_RENAME] = "Cross rename", [EXT4_FC_REASON_JOURNAL_FLAG_CHANGE] = "Journal flag changed", [EXT4_FC_REASON_NOMEM] = "Insufficient memory", [EXT4_FC_REASON_SWAP_BOOT] = "Swap boot", [EXT4_FC_REASON_RESIZE] = "Resize", [EXT4_FC_REASON_RENAME_DIR] = "Dir renamed", [EXT4_FC_REASON_FALLOC_RANGE] = "Falloc range op", [EXT4_FC_REASON_INODE_JOURNAL_DATA] = "Data journalling", [EXT4_FC_REASON_ENCRYPTED_FILENAME] = "Encrypted filename", }; int ext4_fc_info_show(struct seq_file *seq, void *v) { struct ext4_sb_info *sbi = EXT4_SB((struct super_block *)seq->private); struct ext4_fc_stats *stats = &sbi->s_fc_stats; int i; if (v != SEQ_START_TOKEN) return 0; seq_printf(seq, "fc stats:\n%ld commits\n%ld ineligible\n%ld numblks\n%lluus avg_commit_time\n", stats->fc_num_commits, stats->fc_ineligible_commits, stats->fc_numblks, div_u64(stats->s_fc_avg_commit_time, 1000)); seq_puts(seq, "Ineligible reasons:\n"); for (i = 0; i < EXT4_FC_REASON_MAX; i++) seq_printf(seq, "\"%s\":\t%d\n", fc_ineligible_reasons[i], stats->fc_ineligible_reason_count[i]); return 0; } int __init ext4_fc_init_dentry_cache(void) { ext4_fc_dentry_cachep = KMEM_CACHE(ext4_fc_dentry_update, SLAB_RECLAIM_ACCOUNT); if (ext4_fc_dentry_cachep == NULL) return -ENOMEM; return 0; } void ext4_fc_destroy_dentry_cache(void) { kmem_cache_destroy(ext4_fc_dentry_cachep); } |
| 9 9 9 9 6 3 6 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 6 9 9 9 9 9 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Forwarding decision * Linux ethernet bridge * * Authors: * Lennert Buytenhek <buytenh@gnu.org> */ #include <linux/err.h> #include <linux/slab.h> #include <linux/kernel.h> #include <linux/netdevice.h> #include <linux/netpoll.h> #include <linux/skbuff.h> #include <linux/if_vlan.h> #include <linux/netfilter_bridge.h> #include "br_private.h" /* Don't forward packets to originating port or forwarding disabled */ static inline int should_deliver(const struct net_bridge_port *p, const struct sk_buff *skb) { struct net_bridge_vlan_group *vg; vg = nbp_vlan_group_rcu(p); return ((p->flags & BR_HAIRPIN_MODE) || skb->dev != p->dev) && p->state == BR_STATE_FORWARDING && br_allowed_egress(vg, skb) && nbp_switchdev_allowed_egress(p, skb) && !br_skb_isolated(p, skb); } int br_dev_queue_push_xmit(struct net *net, struct sock *sk, struct sk_buff *skb) { skb_push(skb, ETH_HLEN); if (!is_skb_forwardable(skb->dev, skb)) goto drop; br_drop_fake_rtable(skb); if (skb->ip_summed == CHECKSUM_PARTIAL && eth_type_vlan(skb->protocol)) { int depth; if (!vlan_get_protocol_and_depth(skb, skb->protocol, &depth)) goto drop; skb_set_network_header(skb, depth); } br_switchdev_frame_set_offload_fwd_mark(skb); dev_queue_xmit(skb); return 0; drop: kfree_skb(skb); return 0; } EXPORT_SYMBOL_GPL(br_dev_queue_push_xmit); int br_forward_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { skb_clear_tstamp(skb); return NF_HOOK(NFPROTO_BRIDGE, NF_BR_POST_ROUTING, net, sk, skb, NULL, skb->dev, br_dev_queue_push_xmit); } EXPORT_SYMBOL_GPL(br_forward_finish); static void __br_forward(const struct net_bridge_port *to, struct sk_buff *skb, bool local_orig) { struct net_bridge_vlan_group *vg; struct net_device *indev; struct net *net; int br_hook; /* Mark the skb for forwarding offload early so that br_handle_vlan() * can know whether to pop the VLAN header on egress or keep it. */ nbp_switchdev_frame_mark_tx_fwd_offload(to, skb); vg = nbp_vlan_group_rcu(to); skb = br_handle_vlan(to->br, to, vg, skb); if (!skb) return; indev = skb->dev; skb->dev = to->dev; if (!local_orig) { if (skb_warn_if_lro(skb)) { kfree_skb(skb); return; } br_hook = NF_BR_FORWARD; skb_forward_csum(skb); net = dev_net(indev); } else { if (unlikely(netpoll_tx_running(to->br->dev))) { skb_push(skb, ETH_HLEN); if (!is_skb_forwardable(skb->dev, skb)) kfree_skb(skb); else br_netpoll_send_skb(to, skb); return; } br_hook = NF_BR_LOCAL_OUT; net = dev_net(skb->dev); indev = NULL; } NF_HOOK(NFPROTO_BRIDGE, br_hook, net, NULL, skb, indev, skb->dev, br_forward_finish); } static int deliver_clone(const struct net_bridge_port *prev, struct sk_buff *skb, bool local_orig) { struct net_device *dev = BR_INPUT_SKB_CB(skb)->brdev; skb = skb_clone(skb, GFP_ATOMIC); if (!skb) { DEV_STATS_INC(dev, tx_dropped); return -ENOMEM; } __br_forward(prev, skb, local_orig); return 0; } /** * br_forward - forward a packet to a specific port * @to: destination port * @skb: packet being forwarded * @local_rcv: packet will be received locally after forwarding * @local_orig: packet is locally originated * * Should be called with rcu_read_lock. */ void br_forward(const struct net_bridge_port *to, struct sk_buff *skb, bool local_rcv, bool local_orig) { if (unlikely(!to)) goto out; /* redirect to backup link if the destination port is down */ if (rcu_access_pointer(to->backup_port) && !netif_carrier_ok(to->dev)) { struct net_bridge_port *backup_port; backup_port = rcu_dereference(to->backup_port); if (unlikely(!backup_port)) goto out; BR_INPUT_SKB_CB(skb)->backup_nhid = READ_ONCE(to->backup_nhid); to = backup_port; } if (should_deliver(to, skb)) { if (local_rcv) deliver_clone(to, skb, local_orig); else __br_forward(to, skb, local_orig); return; } out: if (!local_rcv) kfree_skb(skb); } EXPORT_SYMBOL_GPL(br_forward); static struct net_bridge_port *maybe_deliver( struct net_bridge_port *prev, struct net_bridge_port *p, struct sk_buff *skb, bool local_orig) { u8 igmp_type = br_multicast_igmp_type(skb); int err; if (!should_deliver(p, skb)) return prev; nbp_switchdev_frame_mark_tx_fwd_to_hwdom(p, skb); if (!prev) goto out; err = deliver_clone(prev, skb, local_orig); if (err) return ERR_PTR(err); out: br_multicast_count(p->br, p, skb, igmp_type, BR_MCAST_DIR_TX); return p; } /* called under rcu_read_lock */ void br_flood(struct net_bridge *br, struct sk_buff *skb, enum br_pkt_type pkt_type, bool local_rcv, bool local_orig, u16 vid) { struct net_bridge_port *prev = NULL; struct net_bridge_port *p; br_tc_skb_miss_set(skb, pkt_type != BR_PKT_BROADCAST); list_for_each_entry_rcu(p, &br->port_list, list) { /* Do not flood unicast traffic to ports that turn it off, nor * other traffic if flood off, except for traffic we originate */ switch (pkt_type) { case BR_PKT_UNICAST: if (!(p->flags & BR_FLOOD)) continue; break; case BR_PKT_MULTICAST: if (!(p->flags & BR_MCAST_FLOOD) && skb->dev != br->dev) continue; break; case BR_PKT_BROADCAST: if (!(p->flags & BR_BCAST_FLOOD) && skb->dev != br->dev) continue; break; } /* Do not flood to ports that enable proxy ARP */ if (p->flags & BR_PROXYARP) continue; if (BR_INPUT_SKB_CB(skb)->proxyarp_replied && ((p->flags & BR_PROXYARP_WIFI) || br_is_neigh_suppress_enabled(p, vid))) continue; prev = maybe_deliver(prev, p, skb, local_orig); if (IS_ERR(prev)) goto out; } if (!prev) goto out; if (local_rcv) deliver_clone(prev, skb, local_orig); else __br_forward(prev, skb, local_orig); return; out: if (!local_rcv) kfree_skb(skb); } #ifdef CONFIG_BRIDGE_IGMP_SNOOPING static void maybe_deliver_addr(struct net_bridge_port *p, struct sk_buff *skb, const unsigned char *addr, bool local_orig) { struct net_device *dev = BR_INPUT_SKB_CB(skb)->brdev; const unsigned char *src = eth_hdr(skb)->h_source; if (!should_deliver(p, skb)) return; /* Even with hairpin, no soliloquies - prevent breaking IPv6 DAD */ if (skb->dev == p->dev && ether_addr_equal(src, addr)) return; skb = skb_copy(skb, GFP_ATOMIC); if (!skb) { DEV_STATS_INC(dev, tx_dropped); return; } if (!is_broadcast_ether_addr(addr)) memcpy(eth_hdr(skb)->h_dest, addr, ETH_ALEN); __br_forward(p, skb, local_orig); } /* called with rcu_read_lock */ void br_multicast_flood(struct net_bridge_mdb_entry *mdst, struct sk_buff *skb, struct net_bridge_mcast *brmctx, bool local_rcv, bool local_orig) { struct net_bridge_port *prev = NULL; struct net_bridge_port_group *p; bool allow_mode_include = true; struct hlist_node *rp; rp = br_multicast_get_first_rport_node(brmctx, skb); if (mdst) { p = rcu_dereference(mdst->ports); if (br_multicast_should_handle_mode(brmctx, mdst->addr.proto) && br_multicast_is_star_g(&mdst->addr)) allow_mode_include = false; } else { p = NULL; br_tc_skb_miss_set(skb, true); } while (p || rp) { struct net_bridge_port *port, *lport, *rport; lport = p ? p->key.port : NULL; rport = br_multicast_rport_from_node_skb(rp, skb); if ((unsigned long)lport > (unsigned long)rport) { port = lport; if (port->flags & BR_MULTICAST_TO_UNICAST) { maybe_deliver_addr(lport, skb, p->eth_addr, local_orig); goto delivered; } if ((!allow_mode_include && p->filter_mode == MCAST_INCLUDE) || (p->flags & MDB_PG_FLAGS_BLOCKED)) goto delivered; } else { port = rport; } prev = maybe_deliver(prev, port, skb, local_orig); if (IS_ERR(prev)) goto out; delivered: if ((unsigned long)lport >= (unsigned long)port) p = rcu_dereference(p->next); if ((unsigned long)rport >= (unsigned long)port) rp = rcu_dereference(hlist_next_rcu(rp)); } if (!prev) goto out; if (local_rcv) deliver_clone(prev, skb, local_orig); else __br_forward(prev, skb, local_orig); return; out: if (!local_rcv) kfree_skb(skb); } #endif |
| 157 2336 2326 7255 2194 12 11 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_UACCESS_H__ #define __LINUX_UACCESS_H__ #include <linux/fault-inject-usercopy.h> #include <linux/instrumented.h> #include <linux/minmax.h> #include <linux/sched.h> #include <linux/thread_info.h> #include <asm/uaccess.h> /* * Architectures that support memory tagging (assigning tags to memory regions, * embedding these tags into addresses that point to these memory regions, and * checking that the memory and the pointer tags match on memory accesses) * redefine this macro to strip tags from pointers. * * Passing down mm_struct allows to define untagging rules on per-process * basis. * * It's defined as noop for architectures that don't support memory tagging. */ #ifndef untagged_addr #define untagged_addr(addr) (addr) #endif #ifndef untagged_addr_remote #define untagged_addr_remote(mm, addr) ({ \ mmap_assert_locked(mm); \ untagged_addr(addr); \ }) #endif /* * Architectures should provide two primitives (raw_copy_{to,from}_user()) * and get rid of their private instances of copy_{to,from}_user() and * __copy_{to,from}_user{,_inatomic}(). * * raw_copy_{to,from}_user(to, from, size) should copy up to size bytes and * return the amount left to copy. They should assume that access_ok() has * already been checked (and succeeded); they should *not* zero-pad anything. * No KASAN or object size checks either - those belong here. * * Both of these functions should attempt to copy size bytes starting at from * into the area starting at to. They must not fetch or store anything * outside of those areas. Return value must be between 0 (everything * copied successfully) and size (nothing copied). * * If raw_copy_{to,from}_user(to, from, size) returns N, size - N bytes starting * at to must become equal to the bytes fetched from the corresponding area * starting at from. All data past to + size - N must be left unmodified. * * If copying succeeds, the return value must be 0. If some data cannot be * fetched, it is permitted to copy less than had been fetched; the only * hard requirement is that not storing anything at all (i.e. returning size) * should happen only when nothing could be copied. In other words, you don't * have to squeeze as much as possible - it is allowed, but not necessary. * * For raw_copy_from_user() to always points to kernel memory and no faults * on store should happen. Interpretation of from is affected by set_fs(). * For raw_copy_to_user() it's the other way round. * * Both can be inlined - it's up to architectures whether it wants to bother * with that. They should not be used directly; they are used to implement * the 6 functions (copy_{to,from}_user(), __copy_{to,from}_user_inatomic()) * that are used instead. Out of those, __... ones are inlined. Plain * copy_{to,from}_user() might or might not be inlined. If you want them * inlined, have asm/uaccess.h define INLINE_COPY_{TO,FROM}_USER. * * NOTE: only copy_from_user() zero-pads the destination in case of short copy. * Neither __copy_from_user() nor __copy_from_user_inatomic() zero anything * at all; their callers absolutely must check the return value. * * Biarch ones should also provide raw_copy_in_user() - similar to the above, * but both source and destination are __user pointers (affected by set_fs() * as usual) and both source and destination can trigger faults. */ static __always_inline __must_check unsigned long __copy_from_user_inatomic(void *to, const void __user *from, unsigned long n) { unsigned long res; instrument_copy_from_user_before(to, from, n); check_object_size(to, n, false); res = raw_copy_from_user(to, from, n); instrument_copy_from_user_after(to, from, n, res); return res; } static __always_inline __must_check unsigned long __copy_from_user(void *to, const void __user *from, unsigned long n) { unsigned long res; might_fault(); instrument_copy_from_user_before(to, from, n); if (should_fail_usercopy()) return n; check_object_size(to, n, false); res = raw_copy_from_user(to, from, n); instrument_copy_from_user_after(to, from, n, res); return res; } /** * __copy_to_user_inatomic: - Copy a block of data into user space, with less checking. * @to: Destination address, in user space. * @from: Source address, in kernel space. * @n: Number of bytes to copy. * * Context: User context only. * * Copy data from kernel space to user space. Caller must check * the specified block with access_ok() before calling this function. * The caller should also make sure he pins the user space address * so that we don't result in page fault and sleep. */ static __always_inline __must_check unsigned long __copy_to_user_inatomic(void __user *to, const void *from, unsigned long n) { if (should_fail_usercopy()) return n; instrument_copy_to_user(to, from, n); check_object_size(from, n, true); return raw_copy_to_user(to, from, n); } static __always_inline __must_check unsigned long __copy_to_user(void __user *to, const void *from, unsigned long n) { might_fault(); if (should_fail_usercopy()) return n; instrument_copy_to_user(to, from, n); check_object_size(from, n, true); return raw_copy_to_user(to, from, n); } #ifdef INLINE_COPY_FROM_USER static inline __must_check unsigned long _copy_from_user(void *to, const void __user *from, unsigned long n) { unsigned long res = n; might_fault(); if (!should_fail_usercopy() && likely(access_ok(from, n))) { instrument_copy_from_user_before(to, from, n); res = raw_copy_from_user(to, from, n); instrument_copy_from_user_after(to, from, n, res); } if (unlikely(res)) memset(to + (n - res), 0, res); return res; } #else extern __must_check unsigned long _copy_from_user(void *, const void __user *, unsigned long); #endif #ifdef INLINE_COPY_TO_USER static inline __must_check unsigned long _copy_to_user(void __user *to, const void *from, unsigned long n) { might_fault(); if (should_fail_usercopy()) return n; if (access_ok(to, n)) { instrument_copy_to_user(to, from, n); n = raw_copy_to_user(to, from, n); } return n; } #else extern __must_check unsigned long _copy_to_user(void __user *, const void *, unsigned long); #endif static __always_inline unsigned long __must_check copy_from_user(void *to, const void __user *from, unsigned long n) { if (check_copy_size(to, n, false)) n = _copy_from_user(to, from, n); return n; } static __always_inline unsigned long __must_check copy_to_user(void __user *to, const void *from, unsigned long n) { if (check_copy_size(from, n, true)) n = _copy_to_user(to, from, n); return n; } #ifndef copy_mc_to_kernel /* * Without arch opt-in this generic copy_mc_to_kernel() will not handle * #MC (or arch equivalent) during source read. */ static inline unsigned long __must_check copy_mc_to_kernel(void *dst, const void *src, size_t cnt) { memcpy(dst, src, cnt); return 0; } #endif static __always_inline void pagefault_disabled_inc(void) { current->pagefault_disabled++; } static __always_inline void pagefault_disabled_dec(void) { current->pagefault_disabled--; } /* * These routines enable/disable the pagefault handler. If disabled, it will * not take any locks and go straight to the fixup table. * * User access methods will not sleep when called from a pagefault_disabled() * environment. */ static inline void pagefault_disable(void) { pagefault_disabled_inc(); /* * make sure to have issued the store before a pagefault * can hit. */ barrier(); } static inline void pagefault_enable(void) { /* * make sure to issue those last loads/stores before enabling * the pagefault handler again. */ barrier(); pagefault_disabled_dec(); } /* * Is the pagefault handler disabled? If so, user access methods will not sleep. */ static inline bool pagefault_disabled(void) { return current->pagefault_disabled != 0; } /* * The pagefault handler is in general disabled by pagefault_disable() or * when in irq context (via in_atomic()). * * This function should only be used by the fault handlers. Other users should * stick to pagefault_disabled(). * Please NEVER use preempt_disable() to disable the fault handler. With * !CONFIG_PREEMPT_COUNT, this is like a NOP. So the handler won't be disabled. * in_atomic() will report different values based on !CONFIG_PREEMPT_COUNT. */ #define faulthandler_disabled() (pagefault_disabled() || in_atomic()) #ifndef CONFIG_ARCH_HAS_SUBPAGE_FAULTS /** * probe_subpage_writeable: probe the user range for write faults at sub-page * granularity (e.g. arm64 MTE) * @uaddr: start of address range * @size: size of address range * * Returns 0 on success, the number of bytes not probed on fault. * * It is expected that the caller checked for the write permission of each * page in the range either by put_user() or GUP. The architecture port can * implement a more efficient get_user() probing if the same sub-page faults * are triggered by either a read or a write. */ static inline size_t probe_subpage_writeable(char __user *uaddr, size_t size) { return 0; } #endif /* CONFIG_ARCH_HAS_SUBPAGE_FAULTS */ #ifndef ARCH_HAS_NOCACHE_UACCESS static inline __must_check unsigned long __copy_from_user_inatomic_nocache(void *to, const void __user *from, unsigned long n) { return __copy_from_user_inatomic(to, from, n); } #endif /* ARCH_HAS_NOCACHE_UACCESS */ extern __must_check int check_zeroed_user(const void __user *from, size_t size); /** * copy_struct_from_user: copy a struct from userspace * @dst: Destination address, in kernel space. This buffer must be @ksize * bytes long. * @ksize: Size of @dst struct. * @src: Source address, in userspace. * @usize: (Alleged) size of @src struct. * * Copies a struct from userspace to kernel space, in a way that guarantees * backwards-compatibility for struct syscall arguments (as long as future * struct extensions are made such that all new fields are *appended* to the * old struct, and zeroed-out new fields have the same meaning as the old * struct). * * @ksize is just sizeof(*dst), and @usize should've been passed by userspace. * The recommended usage is something like the following: * * SYSCALL_DEFINE2(foobar, const struct foo __user *, uarg, size_t, usize) * { * int err; * struct foo karg = {}; * * if (usize > PAGE_SIZE) * return -E2BIG; * if (usize < FOO_SIZE_VER0) * return -EINVAL; * * err = copy_struct_from_user(&karg, sizeof(karg), uarg, usize); * if (err) * return err; * * // ... * } * * There are three cases to consider: * * If @usize == @ksize, then it's copied verbatim. * * If @usize < @ksize, then the userspace has passed an old struct to a * newer kernel. The rest of the trailing bytes in @dst (@ksize - @usize) * are to be zero-filled. * * If @usize > @ksize, then the userspace has passed a new struct to an * older kernel. The trailing bytes unknown to the kernel (@usize - @ksize) * are checked to ensure they are zeroed, otherwise -E2BIG is returned. * * Returns (in all cases, some data may have been copied): * * -E2BIG: (@usize > @ksize) and there are non-zero trailing bytes in @src. * * -EFAULT: access to userspace failed. */ static __always_inline __must_check int copy_struct_from_user(void *dst, size_t ksize, const void __user *src, size_t usize) { size_t size = min(ksize, usize); size_t rest = max(ksize, usize) - size; /* Double check if ksize is larger than a known object size. */ if (WARN_ON_ONCE(ksize > __builtin_object_size(dst, 1))) return -E2BIG; /* Deal with trailing bytes. */ if (usize < ksize) { memset(dst + size, 0, rest); } else if (usize > ksize) { int ret = check_zeroed_user(src + size, rest); if (ret <= 0) return ret ?: -E2BIG; } /* Copy the interoperable parts of the struct. */ if (copy_from_user(dst, src, size)) return -EFAULT; return 0; } bool copy_from_kernel_nofault_allowed(const void *unsafe_src, size_t size); long copy_from_kernel_nofault(void *dst, const void *src, size_t size); long notrace copy_to_kernel_nofault(void *dst, const void *src, size_t size); long copy_from_user_nofault(void *dst, const void __user *src, size_t size); long notrace copy_to_user_nofault(void __user *dst, const void *src, size_t size); long strncpy_from_kernel_nofault(char *dst, const void *unsafe_addr, long count); long strncpy_from_user_nofault(char *dst, const void __user *unsafe_addr, long count); long strnlen_user_nofault(const void __user *unsafe_addr, long count); #ifndef __get_kernel_nofault #define __get_kernel_nofault(dst, src, type, label) \ do { \ type __user *p = (type __force __user *)(src); \ type data; \ if (__get_user(data, p)) \ goto label; \ *(type *)dst = data; \ } while (0) #define __put_kernel_nofault(dst, src, type, label) \ do { \ type __user *p = (type __force __user *)(dst); \ type data = *(type *)src; \ if (__put_user(data, p)) \ goto label; \ } while (0) #endif /** * get_kernel_nofault(): safely attempt to read from a location * @val: read into this variable * @ptr: address to read from * * Returns 0 on success, or -EFAULT. */ #define get_kernel_nofault(val, ptr) ({ \ const typeof(val) *__gk_ptr = (ptr); \ copy_from_kernel_nofault(&(val), __gk_ptr, sizeof(val));\ }) #ifndef user_access_begin #define user_access_begin(ptr,len) access_ok(ptr, len) #define user_access_end() do { } while (0) #define unsafe_op_wrap(op, err) do { if (unlikely(op)) goto err; } while (0) #define unsafe_get_user(x,p,e) unsafe_op_wrap(__get_user(x,p),e) #define unsafe_put_user(x,p,e) unsafe_op_wrap(__put_user(x,p),e) #define unsafe_copy_to_user(d,s,l,e) unsafe_op_wrap(__copy_to_user(d,s,l),e) #define unsafe_copy_from_user(d,s,l,e) unsafe_op_wrap(__copy_from_user(d,s,l),e) static inline unsigned long user_access_save(void) { return 0UL; } static inline void user_access_restore(unsigned long flags) { } #endif #ifndef user_write_access_begin #define user_write_access_begin user_access_begin #define user_write_access_end user_access_end #endif #ifndef user_read_access_begin #define user_read_access_begin user_access_begin #define user_read_access_end user_access_end #endif #ifdef CONFIG_HARDENED_USERCOPY void __noreturn usercopy_abort(const char *name, const char *detail, bool to_user, unsigned long offset, unsigned long len); #endif #endif /* __LINUX_UACCESS_H__ */ |
| 9640 1648 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 | /* SPDX-License-Identifier: GPL-2.0-only */ /* * Landlock LSM - Filesystem management and hooks * * Copyright © 2017-2020 Mickaël Salaün <mic@digikod.net> * Copyright © 2018-2020 ANSSI */ #ifndef _SECURITY_LANDLOCK_FS_H #define _SECURITY_LANDLOCK_FS_H #include <linux/fs.h> #include <linux/init.h> #include <linux/rcupdate.h> #include "ruleset.h" #include "setup.h" /** * struct landlock_inode_security - Inode security blob * * Enable to reference a &struct landlock_object tied to an inode (i.e. * underlying object). */ struct landlock_inode_security { /** * @object: Weak pointer to an allocated object. All assignments of a * new object are protected by the underlying inode->i_lock. However, * atomically disassociating @object from the inode is only protected * by @object->lock, from the time @object's usage refcount drops to * zero to the time this pointer is nulled out (cf. release_inode() and * hook_sb_delete()). Indeed, such disassociation doesn't require * inode->i_lock thanks to the careful rcu_access_pointer() check * performed by get_inode_object(). */ struct landlock_object __rcu *object; }; /** * struct landlock_file_security - File security blob * * This information is populated when opening a file in hook_file_open, and * tracks the relevant Landlock access rights that were available at the time * of opening the file. Other LSM hooks use these rights in order to authorize * operations on already opened files. */ struct landlock_file_security { /** * @allowed_access: Access rights that were available at the time of * opening the file. This is not necessarily the full set of access * rights available at that time, but it's the necessary subset as * needed to authorize later operations on the open file. */ access_mask_t allowed_access; }; /** * struct landlock_superblock_security - Superblock security blob * * Enable hook_sb_delete() to wait for concurrent calls to release_inode(). */ struct landlock_superblock_security { /** * @inode_refs: Number of pending inodes (from this superblock) that * are being released by release_inode(). * Cf. struct super_block->s_fsnotify_inode_refs . */ atomic_long_t inode_refs; }; static inline struct landlock_file_security * landlock_file(const struct file *const file) { return file->f_security + landlock_blob_sizes.lbs_file; } static inline struct landlock_inode_security * landlock_inode(const struct inode *const inode) { return inode->i_security + landlock_blob_sizes.lbs_inode; } static inline struct landlock_superblock_security * landlock_superblock(const struct super_block *const superblock) { return superblock->s_security + landlock_blob_sizes.lbs_superblock; } __init void landlock_add_fs_hooks(void); int landlock_append_fs_rule(struct landlock_ruleset *const ruleset, const struct path *const path, access_mask_t access_hierarchy); #endif /* _SECURITY_LANDLOCK_FS_H */ |
| 310 310 310 310 310 310 310 310 310 310 310 310 6700 7 6703 6702 6700 6701 6700 6700 6702 6699 6704 6542 6541 6542 6541 6540 6542 6540 6542 6700 6385 174 1 172 5662 6382 2 1 2 2303 5782 27 25 25 7 1 5 6 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 | // SPDX-License-Identifier: GPL-2.0-only #include <linux/export.h> #include <linux/nsproxy.h> #include <linux/slab.h> #include <linux/sched/signal.h> #include <linux/user_namespace.h> #include <linux/proc_ns.h> #include <linux/highuid.h> #include <linux/cred.h> #include <linux/securebits.h> #include <linux/security.h> #include <linux/keyctl.h> #include <linux/key-type.h> #include <keys/user-type.h> #include <linux/seq_file.h> #include <linux/fs.h> #include <linux/uaccess.h> #include <linux/ctype.h> #include <linux/projid.h> #include <linux/fs_struct.h> #include <linux/bsearch.h> #include <linux/sort.h> static struct kmem_cache *user_ns_cachep __ro_after_init; static DEFINE_MUTEX(userns_state_mutex); static bool new_idmap_permitted(const struct file *file, struct user_namespace *ns, int cap_setid, struct uid_gid_map *map); static void free_user_ns(struct work_struct *work); static struct ucounts *inc_user_namespaces(struct user_namespace *ns, kuid_t uid) { return inc_ucount(ns, uid, UCOUNT_USER_NAMESPACES); } static void dec_user_namespaces(struct ucounts *ucounts) { return dec_ucount(ucounts, UCOUNT_USER_NAMESPACES); } static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns) { /* Start with the same capabilities as init but useless for doing * anything as the capabilities are bound to the new user namespace. */ cred->securebits = SECUREBITS_DEFAULT; cred->cap_inheritable = CAP_EMPTY_SET; cred->cap_permitted = CAP_FULL_SET; cred->cap_effective = CAP_FULL_SET; cred->cap_ambient = CAP_EMPTY_SET; cred->cap_bset = CAP_FULL_SET; #ifdef CONFIG_KEYS key_put(cred->request_key_auth); cred->request_key_auth = NULL; #endif /* tgcred will be cleared in our caller bc CLONE_THREAD won't be set */ cred->user_ns = user_ns; } static unsigned long enforced_nproc_rlimit(void) { unsigned long limit = RLIM_INFINITY; /* Is RLIMIT_NPROC currently enforced? */ if (!uid_eq(current_uid(), GLOBAL_ROOT_UID) || (current_user_ns() != &init_user_ns)) limit = rlimit(RLIMIT_NPROC); return limit; } /* * Create a new user namespace, deriving the creator from the user in the * passed credentials, and replacing that user with the new root user for the * new namespace. * * This is called by copy_creds(), which will finish setting the target task's * credentials. */ int create_user_ns(struct cred *new) { struct user_namespace *ns, *parent_ns = new->user_ns; kuid_t owner = new->euid; kgid_t group = new->egid; struct ucounts *ucounts; int ret, i; ret = -ENOSPC; if (parent_ns->level > 32) goto fail; ucounts = inc_user_namespaces(parent_ns, owner); if (!ucounts) goto fail; /* * Verify that we can not violate the policy of which files * may be accessed that is specified by the root directory, * by verifying that the root directory is at the root of the * mount namespace which allows all files to be accessed. */ ret = -EPERM; if (current_chrooted()) goto fail_dec; /* The creator needs a mapping in the parent user namespace * or else we won't be able to reasonably tell userspace who * created a user_namespace. */ ret = -EPERM; if (!kuid_has_mapping(parent_ns, owner) || !kgid_has_mapping(parent_ns, group)) goto fail_dec; ret = security_create_user_ns(new); if (ret < 0) goto fail_dec; ret = -ENOMEM; ns = kmem_cache_zalloc(user_ns_cachep, GFP_KERNEL); if (!ns) goto fail_dec; ns->parent_could_setfcap = cap_raised(new->cap_effective, CAP_SETFCAP); ret = ns_alloc_inum(&ns->ns); if (ret) goto fail_free; ns->ns.ops = &userns_operations; refcount_set(&ns->ns.count, 1); /* Leave the new->user_ns reference with the new user namespace. */ ns->parent = parent_ns; ns->level = parent_ns->level + 1; ns->owner = owner; ns->group = group; INIT_WORK(&ns->work, free_user_ns); for (i = 0; i < UCOUNT_COUNTS; i++) { ns->ucount_max[i] = INT_MAX; } set_userns_rlimit_max(ns, UCOUNT_RLIMIT_NPROC, enforced_nproc_rlimit()); set_userns_rlimit_max(ns, UCOUNT_RLIMIT_MSGQUEUE, rlimit(RLIMIT_MSGQUEUE)); set_userns_rlimit_max(ns, UCOUNT_RLIMIT_SIGPENDING, rlimit(RLIMIT_SIGPENDING)); set_userns_rlimit_max(ns, UCOUNT_RLIMIT_MEMLOCK, rlimit(RLIMIT_MEMLOCK)); ns->ucounts = ucounts; /* Inherit USERNS_SETGROUPS_ALLOWED from our parent */ mutex_lock(&userns_state_mutex); ns->flags = parent_ns->flags; mutex_unlock(&userns_state_mutex); #ifdef CONFIG_KEYS INIT_LIST_HEAD(&ns->keyring_name_list); init_rwsem(&ns->keyring_sem); #endif ret = -ENOMEM; if (!setup_userns_sysctls(ns)) goto fail_keyring; set_cred_user_ns(new, ns); return 0; fail_keyring: #ifdef CONFIG_PERSISTENT_KEYRINGS key_put(ns->persistent_keyring_register); #endif ns_free_inum(&ns->ns); fail_free: kmem_cache_free(user_ns_cachep, ns); fail_dec: dec_user_namespaces(ucounts); fail: return ret; } int unshare_userns(unsigned long unshare_flags, struct cred **new_cred) { struct cred *cred; int err = -ENOMEM; if (!(unshare_flags & CLONE_NEWUSER)) return 0; cred = prepare_creds(); if (cred) { err = create_user_ns(cred); if (err) put_cred(cred); else *new_cred = cred; } return err; } static void free_user_ns(struct work_struct *work) { struct user_namespace *parent, *ns = container_of(work, struct user_namespace, work); do { struct ucounts *ucounts = ns->ucounts; parent = ns->parent; if (ns->gid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) { kfree(ns->gid_map.forward); kfree(ns->gid_map.reverse); } if (ns->uid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) { kfree(ns->uid_map.forward); kfree(ns->uid_map.reverse); } if (ns->projid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) { kfree(ns->projid_map.forward); kfree(ns->projid_map.reverse); } #if IS_ENABLED(CONFIG_BINFMT_MISC) kfree(ns->binfmt_misc); #endif retire_userns_sysctls(ns); key_free_user_ns(ns); ns_free_inum(&ns->ns); kmem_cache_free(user_ns_cachep, ns); dec_user_namespaces(ucounts); ns = parent; } while (refcount_dec_and_test(&parent->ns.count)); } void __put_user_ns(struct user_namespace *ns) { schedule_work(&ns->work); } EXPORT_SYMBOL(__put_user_ns); /* * struct idmap_key - holds the information necessary to find an idmapping in a * sorted idmap array. It is passed to cmp_map_id() as first argument. */ struct idmap_key { bool map_up; /* true -> id from kid; false -> kid from id */ u32 id; /* id to find */ u32 count; /* == 0 unless used with map_id_range_down() */ }; /* * cmp_map_id - Function to be passed to bsearch() to find the requested * idmapping. Expects struct idmap_key to be passed via @k. */ static int cmp_map_id(const void *k, const void *e) { u32 first, last, id2; const struct idmap_key *key = k; const struct uid_gid_extent *el = e; id2 = key->id + key->count - 1; /* handle map_id_{down,up}() */ if (key->map_up) first = el->lower_first; else first = el->first; last = first + el->count - 1; if (key->id >= first && key->id <= last && (id2 >= first && id2 <= last)) return 0; if (key->id < first || id2 < first) return -1; return 1; } /* * map_id_range_down_max - Find idmap via binary search in ordered idmap array. * Can only be called if number of mappings exceeds UID_GID_MAP_MAX_BASE_EXTENTS. */ static struct uid_gid_extent * map_id_range_down_max(unsigned extents, struct uid_gid_map *map, u32 id, u32 count) { struct idmap_key key; key.map_up = false; key.count = count; key.id = id; return bsearch(&key, map->forward, extents, sizeof(struct uid_gid_extent), cmp_map_id); } /* * map_id_range_down_base - Find idmap via binary search in static extent array. * Can only be called if number of mappings is equal or less than * UID_GID_MAP_MAX_BASE_EXTENTS. */ static struct uid_gid_extent * map_id_range_down_base(unsigned extents, struct uid_gid_map *map, u32 id, u32 count) { unsigned idx; u32 first, last, id2; id2 = id + count - 1; /* Find the matching extent */ for (idx = 0; idx < extents; idx++) { first = map->extent[idx].first; last = first + map->extent[idx].count - 1; if (id >= first && id <= last && (id2 >= first && id2 <= last)) return &map->extent[idx]; } return NULL; } static u32 map_id_range_down(struct uid_gid_map *map, u32 id, u32 count) { struct uid_gid_extent *extent; unsigned extents = map->nr_extents; smp_rmb(); if (extents <= UID_GID_MAP_MAX_BASE_EXTENTS) extent = map_id_range_down_base(extents, map, id, count); else extent = map_id_range_down_max(extents, map, id, count); /* Map the id or note failure */ if (extent) id = (id - extent->first) + extent->lower_first; else id = (u32) -1; return id; } u32 map_id_down(struct uid_gid_map *map, u32 id) { return map_id_range_down(map, id, 1); } /* * map_id_up_base - Find idmap via binary search in static extent array. * Can only be called if number of mappings is equal or less than * UID_GID_MAP_MAX_BASE_EXTENTS. */ static struct uid_gid_extent * map_id_up_base(unsigned extents, struct uid_gid_map *map, u32 id) { unsigned idx; u32 first, last; /* Find the matching extent */ for (idx = 0; idx < extents; idx++) { first = map->extent[idx].lower_first; last = first + map->extent[idx].count - 1; if (id >= first && id <= last) return &map->extent[idx]; } return NULL; } /* * map_id_up_max - Find idmap via binary search in ordered idmap array. * Can only be called if number of mappings exceeds UID_GID_MAP_MAX_BASE_EXTENTS. */ static struct uid_gid_extent * map_id_up_max(unsigned extents, struct uid_gid_map *map, u32 id) { struct idmap_key key; key.map_up = true; key.count = 1; key.id = id; return bsearch(&key, map->reverse, extents, sizeof(struct uid_gid_extent), cmp_map_id); } u32 map_id_up(struct uid_gid_map *map, u32 id) { struct uid_gid_extent *extent; unsigned extents = map->nr_extents; smp_rmb(); if (extents <= UID_GID_MAP_MAX_BASE_EXTENTS) extent = map_id_up_base(extents, map, id); else extent = map_id_up_max(extents, map, id); /* Map the id or note failure */ if (extent) id = (id - extent->lower_first) + extent->first; else id = (u32) -1; return id; } /** * make_kuid - Map a user-namespace uid pair into a kuid. * @ns: User namespace that the uid is in * @uid: User identifier * * Maps a user-namespace uid pair into a kernel internal kuid, * and returns that kuid. * * When there is no mapping defined for the user-namespace uid * pair INVALID_UID is returned. Callers are expected to test * for and handle INVALID_UID being returned. INVALID_UID * may be tested for using uid_valid(). */ kuid_t make_kuid(struct user_namespace *ns, uid_t uid) { /* Map the uid to a global kernel uid */ return KUIDT_INIT(map_id_down(&ns->uid_map, uid)); } EXPORT_SYMBOL(make_kuid); /** * from_kuid - Create a uid from a kuid user-namespace pair. * @targ: The user namespace we want a uid in. * @kuid: The kernel internal uid to start with. * * Map @kuid into the user-namespace specified by @targ and * return the resulting uid. * * There is always a mapping into the initial user_namespace. * * If @kuid has no mapping in @targ (uid_t)-1 is returned. */ uid_t from_kuid(struct user_namespace *targ, kuid_t kuid) { /* Map the uid from a global kernel uid */ return map_id_up(&targ->uid_map, __kuid_val(kuid)); } EXPORT_SYMBOL(from_kuid); /** * from_kuid_munged - Create a uid from a kuid user-namespace pair. * @targ: The user namespace we want a uid in. * @kuid: The kernel internal uid to start with. * * Map @kuid into the user-namespace specified by @targ and * return the resulting uid. * * There is always a mapping into the initial user_namespace. * * Unlike from_kuid from_kuid_munged never fails and always * returns a valid uid. This makes from_kuid_munged appropriate * for use in syscalls like stat and getuid where failing the * system call and failing to provide a valid uid are not an * options. * * If @kuid has no mapping in @targ overflowuid is returned. */ uid_t from_kuid_munged(struct user_namespace *targ, kuid_t kuid) { uid_t uid; uid = from_kuid(targ, kuid); if (uid == (uid_t) -1) uid = overflowuid; return uid; } EXPORT_SYMBOL(from_kuid_munged); /** * make_kgid - Map a user-namespace gid pair into a kgid. * @ns: User namespace that the gid is in * @gid: group identifier * * Maps a user-namespace gid pair into a kernel internal kgid, * and returns that kgid. * * When there is no mapping defined for the user-namespace gid * pair INVALID_GID is returned. Callers are expected to test * for and handle INVALID_GID being returned. INVALID_GID may be * tested for using gid_valid(). */ kgid_t make_kgid(struct user_namespace *ns, gid_t gid) { /* Map the gid to a global kernel gid */ return KGIDT_INIT(map_id_down(&ns->gid_map, gid)); } EXPORT_SYMBOL(make_kgid); /** * from_kgid - Create a gid from a kgid user-namespace pair. * @targ: The user namespace we want a gid in. * @kgid: The kernel internal gid to start with. * * Map @kgid into the user-namespace specified by @targ and * return the resulting gid. * * There is always a mapping into the initial user_namespace. * * If @kgid has no mapping in @targ (gid_t)-1 is returned. */ gid_t from_kgid(struct user_namespace *targ, kgid_t kgid) { /* Map the gid from a global kernel gid */ return map_id_up(&targ->gid_map, __kgid_val(kgid)); } EXPORT_SYMBOL(from_kgid); /** * from_kgid_munged - Create a gid from a kgid user-namespace pair. * @targ: The user namespace we want a gid in. * @kgid: The kernel internal gid to start with. * * Map @kgid into the user-namespace specified by @targ and * return the resulting gid. * * There is always a mapping into the initial user_namespace. * * Unlike from_kgid from_kgid_munged never fails and always * returns a valid gid. This makes from_kgid_munged appropriate * for use in syscalls like stat and getgid where failing the * system call and failing to provide a valid gid are not options. * * If @kgid has no mapping in @targ overflowgid is returned. */ gid_t from_kgid_munged(struct user_namespace *targ, kgid_t kgid) { gid_t gid; gid = from_kgid(targ, kgid); if (gid == (gid_t) -1) gid = overflowgid; return gid; } EXPORT_SYMBOL(from_kgid_munged); /** * make_kprojid - Map a user-namespace projid pair into a kprojid. * @ns: User namespace that the projid is in * @projid: Project identifier * * Maps a user-namespace uid pair into a kernel internal kuid, * and returns that kuid. * * When there is no mapping defined for the user-namespace projid * pair INVALID_PROJID is returned. Callers are expected to test * for and handle INVALID_PROJID being returned. INVALID_PROJID * may be tested for using projid_valid(). */ kprojid_t make_kprojid(struct user_namespace *ns, projid_t projid) { /* Map the uid to a global kernel uid */ return KPROJIDT_INIT(map_id_down(&ns->projid_map, projid)); } EXPORT_SYMBOL(make_kprojid); /** * from_kprojid - Create a projid from a kprojid user-namespace pair. * @targ: The user namespace we want a projid in. * @kprojid: The kernel internal project identifier to start with. * * Map @kprojid into the user-namespace specified by @targ and * return the resulting projid. * * There is always a mapping into the initial user_namespace. * * If @kprojid has no mapping in @targ (projid_t)-1 is returned. */ projid_t from_kprojid(struct user_namespace *targ, kprojid_t kprojid) { /* Map the uid from a global kernel uid */ return map_id_up(&targ->projid_map, __kprojid_val(kprojid)); } EXPORT_SYMBOL(from_kprojid); /** * from_kprojid_munged - Create a projiid from a kprojid user-namespace pair. * @targ: The user namespace we want a projid in. * @kprojid: The kernel internal projid to start with. * * Map @kprojid into the user-namespace specified by @targ and * return the resulting projid. * * There is always a mapping into the initial user_namespace. * * Unlike from_kprojid from_kprojid_munged never fails and always * returns a valid projid. This makes from_kprojid_munged * appropriate for use in syscalls like stat and where * failing the system call and failing to provide a valid projid are * not an options. * * If @kprojid has no mapping in @targ OVERFLOW_PROJID is returned. */ projid_t from_kprojid_munged(struct user_namespace *targ, kprojid_t kprojid) { projid_t projid; projid = from_kprojid(targ, kprojid); if (projid == (projid_t) -1) projid = OVERFLOW_PROJID; return projid; } EXPORT_SYMBOL(from_kprojid_munged); static int uid_m_show(struct seq_file *seq, void *v) { struct user_namespace *ns = seq->private; struct uid_gid_extent *extent = v; struct user_namespace *lower_ns; uid_t lower; lower_ns = seq_user_ns(seq); if ((lower_ns == ns) && lower_ns->parent) lower_ns = lower_ns->parent; lower = from_kuid(lower_ns, KUIDT_INIT(extent->lower_first)); seq_printf(seq, "%10u %10u %10u\n", extent->first, lower, extent->count); return 0; } static int gid_m_show(struct seq_file *seq, void *v) { struct user_namespace *ns = seq->private; struct uid_gid_extent *extent = v; struct user_namespace *lower_ns; gid_t lower; lower_ns = seq_user_ns(seq); if ((lower_ns == ns) && lower_ns->parent) lower_ns = lower_ns->parent; lower = from_kgid(lower_ns, KGIDT_INIT(extent->lower_first)); seq_printf(seq, "%10u %10u %10u\n", extent->first, lower, extent->count); return 0; } static int projid_m_show(struct seq_file *seq, void *v) { struct user_namespace *ns = seq->private; struct uid_gid_extent *extent = v; struct user_namespace *lower_ns; projid_t lower; lower_ns = seq_user_ns(seq); if ((lower_ns == ns) && lower_ns->parent) lower_ns = lower_ns->parent; lower = from_kprojid(lower_ns, KPROJIDT_INIT(extent->lower_first)); seq_printf(seq, "%10u %10u %10u\n", extent->first, lower, extent->count); return 0; } static void *m_start(struct seq_file *seq, loff_t *ppos, struct uid_gid_map *map) { loff_t pos = *ppos; unsigned extents = map->nr_extents; smp_rmb(); if (pos >= extents) return NULL; if (extents <= UID_GID_MAP_MAX_BASE_EXTENTS) return &map->extent[pos]; return &map->forward[pos]; } static void *uid_m_start(struct seq_file *seq, loff_t *ppos) { struct user_namespace *ns = seq->private; return m_start(seq, ppos, &ns->uid_map); } static void *gid_m_start(struct seq_file *seq, loff_t *ppos) { struct user_namespace *ns = seq->private; return m_start(seq, ppos, &ns->gid_map); } static void *projid_m_start(struct seq_file *seq, loff_t *ppos) { struct user_namespace *ns = seq->private; return m_start(seq, ppos, &ns->projid_map); } static void *m_next(struct seq_file *seq, void *v, loff_t *pos) { (*pos)++; return seq->op->start(seq, pos); } static void m_stop(struct seq_file *seq, void *v) { return; } const struct seq_operations proc_uid_seq_operations = { .start = uid_m_start, .stop = m_stop, .next = m_next, .show = uid_m_show, }; const struct seq_operations proc_gid_seq_operations = { .start = gid_m_start, .stop = m_stop, .next = m_next, .show = gid_m_show, }; const struct seq_operations proc_projid_seq_operations = { .start = projid_m_start, .stop = m_stop, .next = m_next, .show = projid_m_show, }; static bool mappings_overlap(struct uid_gid_map *new_map, struct uid_gid_extent *extent) { u32 upper_first, lower_first, upper_last, lower_last; unsigned idx; upper_first = extent->first; lower_first = extent->lower_first; upper_last = upper_first + extent->count - 1; lower_last = lower_first + extent->count - 1; for (idx = 0; idx < new_map->nr_extents; idx++) { u32 prev_upper_first, prev_lower_first; u32 prev_upper_last, prev_lower_last; struct uid_gid_extent *prev; if (new_map->nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) prev = &new_map->extent[idx]; else prev = &new_map->forward[idx]; prev_upper_first = prev->first; prev_lower_first = prev->lower_first; prev_upper_last = prev_upper_first + prev->count - 1; prev_lower_last = prev_lower_first + prev->count - 1; /* Does the upper range intersect a previous extent? */ if ((prev_upper_first <= upper_last) && (prev_upper_last >= upper_first)) return true; /* Does the lower range intersect a previous extent? */ if ((prev_lower_first <= lower_last) && (prev_lower_last >= lower_first)) return true; } return false; } /* * insert_extent - Safely insert a new idmap extent into struct uid_gid_map. * Takes care to allocate a 4K block of memory if the number of mappings exceeds * UID_GID_MAP_MAX_BASE_EXTENTS. */ static int insert_extent(struct uid_gid_map *map, struct uid_gid_extent *extent) { struct uid_gid_extent *dest; if (map->nr_extents == UID_GID_MAP_MAX_BASE_EXTENTS) { struct uid_gid_extent *forward; /* Allocate memory for 340 mappings. */ forward = kmalloc_array(UID_GID_MAP_MAX_EXTENTS, sizeof(struct uid_gid_extent), GFP_KERNEL); if (!forward) return -ENOMEM; /* Copy over memory. Only set up memory for the forward pointer. * Defer the memory setup for the reverse pointer. */ memcpy(forward, map->extent, map->nr_extents * sizeof(map->extent[0])); map->forward = forward; map->reverse = NULL; } if (map->nr_extents < UID_GID_MAP_MAX_BASE_EXTENTS) dest = &map->extent[map->nr_extents]; else dest = &map->forward[map->nr_extents]; *dest = *extent; map->nr_extents++; return 0; } /* cmp function to sort() forward mappings */ static int cmp_extents_forward(const void *a, const void *b) { const struct uid_gid_extent *e1 = a; const struct uid_gid_extent *e2 = b; if (e1->first < e2->first) return -1; if (e1->first > e2->first) return 1; return 0; } /* cmp function to sort() reverse mappings */ static int cmp_extents_reverse(const void *a, const void *b) { const struct uid_gid_extent *e1 = a; const struct uid_gid_extent *e2 = b; if (e1->lower_first < e2->lower_first) return -1; if (e1->lower_first > e2->lower_first) return 1; return 0; } /* * sort_idmaps - Sorts an array of idmap entries. * Can only be called if number of mappings exceeds UID_GID_MAP_MAX_BASE_EXTENTS. */ static int sort_idmaps(struct uid_gid_map *map) { if (map->nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) return 0; /* Sort forward array. */ sort(map->forward, map->nr_extents, sizeof(struct uid_gid_extent), cmp_extents_forward, NULL); /* Only copy the memory from forward we actually need. */ map->reverse = kmemdup(map->forward, map->nr_extents * sizeof(struct uid_gid_extent), GFP_KERNEL); if (!map->reverse) return -ENOMEM; /* Sort reverse array. */ sort(map->reverse, map->nr_extents, sizeof(struct uid_gid_extent), cmp_extents_reverse, NULL); return 0; } /** * verify_root_map() - check the uid 0 mapping * @file: idmapping file * @map_ns: user namespace of the target process * @new_map: requested idmap * * If a process requests mapping parent uid 0 into the new ns, verify that the * process writing the map had the CAP_SETFCAP capability as the target process * will be able to write fscaps that are valid in ancestor user namespaces. * * Return: true if the mapping is allowed, false if not. */ static bool verify_root_map(const struct file *file, struct user_namespace *map_ns, struct uid_gid_map *new_map) { int idx; const struct user_namespace *file_ns = file->f_cred->user_ns; struct uid_gid_extent *extent0 = NULL; for (idx = 0; idx < new_map->nr_extents; idx++) { if (new_map->nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) extent0 = &new_map->extent[idx]; else extent0 = &new_map->forward[idx]; if (extent0->lower_first == 0) break; extent0 = NULL; } if (!extent0) return true; if (map_ns == file_ns) { /* The process unshared its ns and is writing to its own * /proc/self/uid_map. User already has full capabilites in * the new namespace. Verify that the parent had CAP_SETFCAP * when it unshared. * */ if (!file_ns->parent_could_setfcap) return false; } else { /* Process p1 is writing to uid_map of p2, who is in a child * user namespace to p1's. Verify that the opener of the map * file has CAP_SETFCAP against the parent of the new map * namespace */ if (!file_ns_capable(file, map_ns->parent, CAP_SETFCAP)) return false; } return true; } static ssize_t map_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos, int cap_setid, struct uid_gid_map *map, struct uid_gid_map *parent_map) { struct seq_file *seq = file->private_data; struct user_namespace *map_ns = seq->private; struct uid_gid_map new_map; unsigned idx; struct uid_gid_extent extent; char *kbuf = NULL, *pos, *next_line; ssize_t ret; /* Only allow < page size writes at the beginning of the file */ if ((*ppos != 0) || (count >= PAGE_SIZE)) return -EINVAL; /* Slurp in the user data */ kbuf = memdup_user_nul(buf, count); if (IS_ERR(kbuf)) return PTR_ERR(kbuf); /* * The userns_state_mutex serializes all writes to any given map. * * Any map is only ever written once. * * An id map fits within 1 cache line on most architectures. * * On read nothing needs to be done unless you are on an * architecture with a crazy cache coherency model like alpha. * * There is a one time data dependency between reading the * count of the extents and the values of the extents. The * desired behavior is to see the values of the extents that * were written before the count of the extents. * * To achieve this smp_wmb() is used on guarantee the write * order and smp_rmb() is guaranteed that we don't have crazy * architectures returning stale data. */ mutex_lock(&userns_state_mutex); memset(&new_map, 0, sizeof(struct uid_gid_map)); ret = -EPERM; /* Only allow one successful write to the map */ if (map->nr_extents != 0) goto out; /* * Adjusting namespace settings requires capabilities on the target. */ if (cap_valid(cap_setid) && !file_ns_capable(file, map_ns, CAP_SYS_ADMIN)) goto out; /* Parse the user data */ ret = -EINVAL; pos = kbuf; for (; pos; pos = next_line) { /* Find the end of line and ensure I don't look past it */ next_line = strchr(pos, '\n'); if (next_line) { *next_line = '\0'; next_line++; if (*next_line == '\0') next_line = NULL; } pos = skip_spaces(pos); extent.first = simple_strtoul(pos, &pos, 10); if (!isspace(*pos)) goto out; pos = skip_spaces(pos); extent.lower_first = simple_strtoul(pos, &pos, 10); if (!isspace(*pos)) goto out; pos = skip_spaces(pos); extent.count = simple_strtoul(pos, &pos, 10); if (*pos && !isspace(*pos)) goto out; /* Verify there is not trailing junk on the line */ pos = skip_spaces(pos); if (*pos != '\0') goto out; /* Verify we have been given valid starting values */ if ((extent.first == (u32) -1) || (extent.lower_first == (u32) -1)) goto out; /* Verify count is not zero and does not cause the * extent to wrap */ if ((extent.first + extent.count) <= extent.first) goto out; if ((extent.lower_first + extent.count) <= extent.lower_first) goto out; /* Do the ranges in extent overlap any previous extents? */ if (mappings_overlap(&new_map, &extent)) goto out; if ((new_map.nr_extents + 1) == UID_GID_MAP_MAX_EXTENTS && (next_line != NULL)) goto out; ret = insert_extent(&new_map, &extent); if (ret < 0) goto out; ret = -EINVAL; } /* Be very certain the new map actually exists */ if (new_map.nr_extents == 0) goto out; ret = -EPERM; /* Validate the user is allowed to use user id's mapped to. */ if (!new_idmap_permitted(file, map_ns, cap_setid, &new_map)) goto out; ret = -EPERM; /* Map the lower ids from the parent user namespace to the * kernel global id space. */ for (idx = 0; idx < new_map.nr_extents; idx++) { struct uid_gid_extent *e; u32 lower_first; if (new_map.nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) e = &new_map.extent[idx]; else e = &new_map.forward[idx]; lower_first = map_id_range_down(parent_map, e->lower_first, e->count); /* Fail if we can not map the specified extent to * the kernel global id space. */ if (lower_first == (u32) -1) goto out; e->lower_first = lower_first; } /* * If we want to use binary search for lookup, this clones the extent * array and sorts both copies. */ ret = sort_idmaps(&new_map); if (ret < 0) goto out; /* Install the map */ if (new_map.nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) { memcpy(map->extent, new_map.extent, new_map.nr_extents * sizeof(new_map.extent[0])); } else { map->forward = new_map.forward; map->reverse = new_map.reverse; } smp_wmb(); map->nr_extents = new_map.nr_extents; *ppos = count; ret = count; out: if (ret < 0 && new_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) { kfree(new_map.forward); kfree(new_map.reverse); map->forward = NULL; map->reverse = NULL; map->nr_extents = 0; } mutex_unlock(&userns_state_mutex); kfree(kbuf); return ret; } ssize_t proc_uid_map_write(struct file *file, const char __user *buf, size_t size, loff_t *ppos) { struct seq_file *seq = file->private_data; struct user_namespace *ns = seq->private; struct user_namespace *seq_ns = seq_user_ns(seq); if (!ns->parent) return -EPERM; if ((seq_ns != ns) && (seq_ns != ns->parent)) return -EPERM; return map_write(file, buf, size, ppos, CAP_SETUID, &ns->uid_map, &ns->parent->uid_map); } ssize_t proc_gid_map_write(struct file *file, const char __user *buf, size_t size, loff_t *ppos) { struct seq_file *seq = file->private_data; struct user_namespace *ns = seq->private; struct user_namespace *seq_ns = seq_user_ns(seq); if (!ns->parent) return -EPERM; if ((seq_ns != ns) && (seq_ns != ns->parent)) return -EPERM; return map_write(file, buf, size, ppos, CAP_SETGID, &ns->gid_map, &ns->parent->gid_map); } ssize_t proc_projid_map_write(struct file *file, const char __user *buf, size_t size, loff_t *ppos) { struct seq_file *seq = file->private_data; struct user_namespace *ns = seq->private; struct user_namespace *seq_ns = seq_user_ns(seq); if (!ns->parent) return -EPERM; if ((seq_ns != ns) && (seq_ns != ns->parent)) return -EPERM; /* Anyone can set any valid project id no capability needed */ return map_write(file, buf, size, ppos, -1, &ns->projid_map, &ns->parent->projid_map); } static bool new_idmap_permitted(const struct file *file, struct user_namespace *ns, int cap_setid, struct uid_gid_map *new_map) { const struct cred *cred = file->f_cred; if (cap_setid == CAP_SETUID && !verify_root_map(file, ns, new_map)) return false; /* Don't allow mappings that would allow anything that wouldn't * be allowed without the establishment of unprivileged mappings. */ if ((new_map->nr_extents == 1) && (new_map->extent[0].count == 1) && uid_eq(ns->owner, cred->euid)) { u32 id = new_map->extent[0].lower_first; if (cap_setid == CAP_SETUID) { kuid_t uid = make_kuid(ns->parent, id); if (uid_eq(uid, cred->euid)) return true; } else if (cap_setid == CAP_SETGID) { kgid_t gid = make_kgid(ns->parent, id); if (!(ns->flags & USERNS_SETGROUPS_ALLOWED) && gid_eq(gid, cred->egid)) return true; } } /* Allow anyone to set a mapping that doesn't require privilege */ if (!cap_valid(cap_setid)) return true; /* Allow the specified ids if we have the appropriate capability * (CAP_SETUID or CAP_SETGID) over the parent user namespace. * And the opener of the id file also has the appropriate capability. */ if (ns_capable(ns->parent, cap_setid) && file_ns_capable(file, ns->parent, cap_setid)) return true; return false; } int proc_setgroups_show(struct seq_file *seq, void *v) { struct user_namespace *ns = seq->private; unsigned long userns_flags = READ_ONCE(ns->flags); seq_printf(seq, "%s\n", (userns_flags & USERNS_SETGROUPS_ALLOWED) ? "allow" : "deny"); return 0; } ssize_t proc_setgroups_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { struct seq_file *seq = file->private_data; struct user_namespace *ns = seq->private; char kbuf[8], *pos; bool setgroups_allowed; ssize_t ret; /* Only allow a very narrow range of strings to be written */ ret = -EINVAL; if ((*ppos != 0) || (count >= sizeof(kbuf))) goto out; /* What was written? */ ret = -EFAULT; if (copy_from_user(kbuf, buf, count)) goto out; kbuf[count] = '\0'; pos = kbuf; /* What is being requested? */ ret = -EINVAL; if (strncmp(pos, "allow", 5) == 0) { pos += 5; setgroups_allowed = true; } else if (strncmp(pos, "deny", 4) == 0) { pos += 4; setgroups_allowed = false; } else goto out; /* Verify there is not trailing junk on the line */ pos = skip_spaces(pos); if (*pos != '\0') goto out; ret = -EPERM; mutex_lock(&userns_state_mutex); if (setgroups_allowed) { /* Enabling setgroups after setgroups has been disabled * is not allowed. */ if (!(ns->flags & USERNS_SETGROUPS_ALLOWED)) goto out_unlock; } else { /* Permanently disabling setgroups after setgroups has * been enabled by writing the gid_map is not allowed. */ if (ns->gid_map.nr_extents != 0) goto out_unlock; ns->flags &= ~USERNS_SETGROUPS_ALLOWED; } mutex_unlock(&userns_state_mutex); /* Report a successful write */ *ppos = count; ret = count; out: return ret; out_unlock: mutex_unlock(&userns_state_mutex); goto out; } bool userns_may_setgroups(const struct user_namespace *ns) { bool allowed; mutex_lock(&userns_state_mutex); /* It is not safe to use setgroups until a gid mapping in * the user namespace has been established. */ allowed = ns->gid_map.nr_extents != 0; /* Is setgroups allowed? */ allowed = allowed && (ns->flags & USERNS_SETGROUPS_ALLOWED); mutex_unlock(&userns_state_mutex); return allowed; } /* * Returns true if @child is the same namespace or a descendant of * @ancestor. */ bool in_userns(const struct user_namespace *ancestor, const struct user_namespace *child) { const struct user_namespace *ns; for (ns = child; ns->level > ancestor->level; ns = ns->parent) ; return (ns == ancestor); } bool current_in_userns(const struct user_namespace *target_ns) { return in_userns(target_ns, current_user_ns()); } EXPORT_SYMBOL(current_in_userns); static inline struct user_namespace *to_user_ns(struct ns_common *ns) { return container_of(ns, struct user_namespace, ns); } static struct ns_common *userns_get(struct task_struct *task) { struct user_namespace *user_ns; rcu_read_lock(); user_ns = get_user_ns(__task_cred(task)->user_ns); rcu_read_unlock(); return user_ns ? &user_ns->ns : NULL; } static void userns_put(struct ns_common *ns) { put_user_ns(to_user_ns(ns)); } static int userns_install(struct nsset *nsset, struct ns_common *ns) { struct user_namespace *user_ns = to_user_ns(ns); struct cred *cred; /* Don't allow gaining capabilities by reentering * the same user namespace. */ if (user_ns == current_user_ns()) return -EINVAL; /* Tasks that share a thread group must share a user namespace */ if (!thread_group_empty(current)) return -EINVAL; if (current->fs->users != 1) return -EINVAL; if (!ns_capable(user_ns, CAP_SYS_ADMIN)) return -EPERM; cred = nsset_cred(nsset); if (!cred) return -EINVAL; put_user_ns(cred->user_ns); set_cred_user_ns(cred, get_user_ns(user_ns)); if (set_cred_ucounts(cred) < 0) return -EINVAL; return 0; } struct ns_common *ns_get_owner(struct ns_common *ns) { struct user_namespace *my_user_ns = current_user_ns(); struct user_namespace *owner, *p; /* See if the owner is in the current user namespace */ owner = p = ns->ops->owner(ns); for (;;) { if (!p) return ERR_PTR(-EPERM); if (p == my_user_ns) break; p = p->parent; } return &get_user_ns(owner)->ns; } static struct user_namespace *userns_owner(struct ns_common *ns) { return to_user_ns(ns)->parent; } const struct proc_ns_operations userns_operations = { .name = "user", .type = CLONE_NEWUSER, .get = userns_get, .put = userns_put, .install = userns_install, .owner = userns_owner, .get_parent = ns_get_owner, }; static __init int user_namespaces_init(void) { user_ns_cachep = KMEM_CACHE(user_namespace, SLAB_PANIC | SLAB_ACCOUNT); return 0; } subsys_initcall(user_namespaces_init); |
| 8 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 | /* * Copyright (c) 2007, 2017 Oracle and/or its affiliates. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. * */ #include <linux/slab.h> #include <linux/types.h> #include <linux/rbtree.h> #include <linux/bitops.h> #include <linux/export.h> #include "rds.h" /* * This file implements the receive side of the unconventional congestion * management in RDS. * * Messages waiting in the receive queue on the receiving socket are accounted * against the sockets SO_RCVBUF option value. Only the payload bytes in the * message are accounted for. If the number of bytes queued equals or exceeds * rcvbuf then the socket is congested. All sends attempted to this socket's * address should return block or return -EWOULDBLOCK. * * Applications are expected to be reasonably tuned such that this situation * very rarely occurs. An application encountering this "back-pressure" is * considered a bug. * * This is implemented by having each node maintain bitmaps which indicate * which ports on bound addresses are congested. As the bitmap changes it is * sent through all the connections which terminate in the local address of the * bitmap which changed. * * The bitmaps are allocated as connections are brought up. This avoids * allocation in the interrupt handling path which queues messages on sockets. * The dense bitmaps let transports send the entire bitmap on any bitmap change * reasonably efficiently. This is much easier to implement than some * finer-grained communication of per-port congestion. The sender does a very * inexpensive bit test to test if the port it's about to send to is congested * or not. */ /* * Interaction with poll is a tad tricky. We want all processes stuck in * poll to wake up and check whether a congested destination became uncongested. * The really sad thing is we have no idea which destinations the application * wants to send to - we don't even know which rds_connections are involved. * So until we implement a more flexible rds poll interface, we have to make * do with this: * We maintain a global counter that is incremented each time a congestion map * update is received. Each rds socket tracks this value, and if rds_poll * finds that the saved generation number is smaller than the global generation * number, it wakes up the process. */ static atomic_t rds_cong_generation = ATOMIC_INIT(0); /* * Congestion monitoring */ static LIST_HEAD(rds_cong_monitor); static DEFINE_RWLOCK(rds_cong_monitor_lock); /* * Yes, a global lock. It's used so infrequently that it's worth keeping it * global to simplify the locking. It's only used in the following * circumstances: * * - on connection buildup to associate a conn with its maps * - on map changes to inform conns of a new map to send * * It's sadly ordered under the socket callback lock and the connection lock. * Receive paths can mark ports congested from interrupt context so the * lock masks interrupts. */ static DEFINE_SPINLOCK(rds_cong_lock); static struct rb_root rds_cong_tree = RB_ROOT; static struct rds_cong_map *rds_cong_tree_walk(const struct in6_addr *addr, struct rds_cong_map *insert) { struct rb_node **p = &rds_cong_tree.rb_node; struct rb_node *parent = NULL; struct rds_cong_map *map; while (*p) { int diff; parent = *p; map = rb_entry(parent, struct rds_cong_map, m_rb_node); diff = rds_addr_cmp(addr, &map->m_addr); if (diff < 0) p = &(*p)->rb_left; else if (diff > 0) p = &(*p)->rb_right; else return map; } if (insert) { rb_link_node(&insert->m_rb_node, parent, p); rb_insert_color(&insert->m_rb_node, &rds_cong_tree); } return NULL; } /* * There is only ever one bitmap for any address. Connections try and allocate * these bitmaps in the process getting pointers to them. The bitmaps are only * ever freed as the module is removed after all connections have been freed. */ static struct rds_cong_map *rds_cong_from_addr(const struct in6_addr *addr) { struct rds_cong_map *map; struct rds_cong_map *ret = NULL; unsigned long zp; unsigned long i; unsigned long flags; map = kzalloc(sizeof(struct rds_cong_map), GFP_KERNEL); if (!map) return NULL; map->m_addr = *addr; init_waitqueue_head(&map->m_waitq); INIT_LIST_HEAD(&map->m_conn_list); for (i = 0; i < RDS_CONG_MAP_PAGES; i++) { zp = get_zeroed_page(GFP_KERNEL); if (zp == 0) goto out; map->m_page_addrs[i] = zp; } spin_lock_irqsave(&rds_cong_lock, flags); ret = rds_cong_tree_walk(addr, map); spin_unlock_irqrestore(&rds_cong_lock, flags); if (!ret) { ret = map; map = NULL; } out: if (map) { for (i = 0; i < RDS_CONG_MAP_PAGES && map->m_page_addrs[i]; i++) free_page(map->m_page_addrs[i]); kfree(map); } rdsdebug("map %p for addr %pI6c\n", ret, addr); return ret; } /* * Put the conn on its local map's list. This is called when the conn is * really added to the hash. It's nested under the rds_conn_lock, sadly. */ void rds_cong_add_conn(struct rds_connection *conn) { unsigned long flags; rdsdebug("conn %p now on map %p\n", conn, conn->c_lcong); spin_lock_irqsave(&rds_cong_lock, flags); list_add_tail(&conn->c_map_item, &conn->c_lcong->m_conn_list); spin_unlock_irqrestore(&rds_cong_lock, flags); } void rds_cong_remove_conn(struct rds_connection *conn) { unsigned long flags; rdsdebug("removing conn %p from map %p\n", conn, conn->c_lcong); spin_lock_irqsave(&rds_cong_lock, flags); list_del_init(&conn->c_map_item); spin_unlock_irqrestore(&rds_cong_lock, flags); } int rds_cong_get_maps(struct rds_connection *conn) { conn->c_lcong = rds_cong_from_addr(&conn->c_laddr); conn->c_fcong = rds_cong_from_addr(&conn->c_faddr); if (!(conn->c_lcong && conn->c_fcong)) return -ENOMEM; return 0; } void rds_cong_queue_updates(struct rds_cong_map *map) { struct rds_connection *conn; unsigned long flags; spin_lock_irqsave(&rds_cong_lock, flags); list_for_each_entry(conn, &map->m_conn_list, c_map_item) { struct rds_conn_path *cp = &conn->c_path[0]; rcu_read_lock(); if (!test_and_set_bit(0, &conn->c_map_queued) && !rds_destroy_pending(cp->cp_conn)) { rds_stats_inc(s_cong_update_queued); /* We cannot inline the call to rds_send_xmit() here * for two reasons (both pertaining to a TCP transport): * 1. When we get here from the receive path, we * are already holding the sock_lock (held by * tcp_v4_rcv()). So inlining calls to * tcp_setsockopt and/or tcp_sendmsg will deadlock * when it tries to get the sock_lock()) * 2. Interrupts are masked so that we can mark the * port congested from both send and recv paths. * (See comment around declaration of rdc_cong_lock). * An attempt to get the sock_lock() here will * therefore trigger warnings. * Defer the xmit to rds_send_worker() instead. */ queue_delayed_work(rds_wq, &cp->cp_send_w, 0); } rcu_read_unlock(); } spin_unlock_irqrestore(&rds_cong_lock, flags); } void rds_cong_map_updated(struct rds_cong_map *map, uint64_t portmask) { rdsdebug("waking map %p for %pI4\n", map, &map->m_addr); rds_stats_inc(s_cong_update_received); atomic_inc(&rds_cong_generation); if (waitqueue_active(&map->m_waitq)) wake_up(&map->m_waitq); if (waitqueue_active(&rds_poll_waitq)) wake_up_all(&rds_poll_waitq); if (portmask && !list_empty(&rds_cong_monitor)) { unsigned long flags; struct rds_sock *rs; read_lock_irqsave(&rds_cong_monitor_lock, flags); list_for_each_entry(rs, &rds_cong_monitor, rs_cong_list) { spin_lock(&rs->rs_lock); rs->rs_cong_notify |= (rs->rs_cong_mask & portmask); rs->rs_cong_mask &= ~portmask; spin_unlock(&rs->rs_lock); if (rs->rs_cong_notify) rds_wake_sk_sleep(rs); } read_unlock_irqrestore(&rds_cong_monitor_lock, flags); } } EXPORT_SYMBOL_GPL(rds_cong_map_updated); int rds_cong_updated_since(unsigned long *recent) { unsigned long gen = atomic_read(&rds_cong_generation); if (likely(*recent == gen)) return 0; *recent = gen; return 1; } /* * We're called under the locking that protects the sockets receive buffer * consumption. This makes it a lot easier for the caller to only call us * when it knows that an existing set bit needs to be cleared, and vice versa. * We can't block and we need to deal with concurrent sockets working against * the same per-address map. */ void rds_cong_set_bit(struct rds_cong_map *map, __be16 port) { unsigned long i; unsigned long off; rdsdebug("setting congestion for %pI4:%u in map %p\n", &map->m_addr, ntohs(port), map); i = be16_to_cpu(port) / RDS_CONG_MAP_PAGE_BITS; off = be16_to_cpu(port) % RDS_CONG_MAP_PAGE_BITS; set_bit_le(off, (void *)map->m_page_addrs[i]); } void rds_cong_clear_bit(struct rds_cong_map *map, __be16 port) { unsigned long i; unsigned long off; rdsdebug("clearing congestion for %pI4:%u in map %p\n", &map->m_addr, ntohs(port), map); i = be16_to_cpu(port) / RDS_CONG_MAP_PAGE_BITS; off = be16_to_cpu(port) % RDS_CONG_MAP_PAGE_BITS; clear_bit_le(off, (void *)map->m_page_addrs[i]); } static int rds_cong_test_bit(struct rds_cong_map *map, __be16 port) { unsigned long i; unsigned long off; i = be16_to_cpu(port) / RDS_CONG_MAP_PAGE_BITS; off = be16_to_cpu(port) % RDS_CONG_MAP_PAGE_BITS; return test_bit_le(off, (void *)map->m_page_addrs[i]); } void rds_cong_add_socket(struct rds_sock *rs) { unsigned long flags; write_lock_irqsave(&rds_cong_monitor_lock, flags); if (list_empty(&rs->rs_cong_list)) list_add(&rs->rs_cong_list, &rds_cong_monitor); write_unlock_irqrestore(&rds_cong_monitor_lock, flags); } void rds_cong_remove_socket(struct rds_sock *rs) { unsigned long flags; struct rds_cong_map *map; write_lock_irqsave(&rds_cong_monitor_lock, flags); list_del_init(&rs->rs_cong_list); write_unlock_irqrestore(&rds_cong_monitor_lock, flags); /* update congestion map for now-closed port */ spin_lock_irqsave(&rds_cong_lock, flags); map = rds_cong_tree_walk(&rs->rs_bound_addr, NULL); spin_unlock_irqrestore(&rds_cong_lock, flags); if (map && rds_cong_test_bit(map, rs->rs_bound_port)) { rds_cong_clear_bit(map, rs->rs_bound_port); rds_cong_queue_updates(map); } } int rds_cong_wait(struct rds_cong_map *map, __be16 port, int nonblock, struct rds_sock *rs) { if (!rds_cong_test_bit(map, port)) return 0; if (nonblock) { if (rs && rs->rs_cong_monitor) { unsigned long flags; /* It would have been nice to have an atomic set_bit on * a uint64_t. */ spin_lock_irqsave(&rs->rs_lock, flags); rs->rs_cong_mask |= RDS_CONG_MONITOR_MASK(ntohs(port)); spin_unlock_irqrestore(&rs->rs_lock, flags); /* Test again - a congestion update may have arrived in * the meantime. */ if (!rds_cong_test_bit(map, port)) return 0; } rds_stats_inc(s_cong_send_error); return -ENOBUFS; } rds_stats_inc(s_cong_send_blocked); rdsdebug("waiting on map %p for port %u\n", map, be16_to_cpu(port)); return wait_event_interruptible(map->m_waitq, !rds_cong_test_bit(map, port)); } void rds_cong_exit(void) { struct rb_node *node; struct rds_cong_map *map; unsigned long i; while ((node = rb_first(&rds_cong_tree))) { map = rb_entry(node, struct rds_cong_map, m_rb_node); rdsdebug("freeing map %p\n", map); rb_erase(&map->m_rb_node, &rds_cong_tree); for (i = 0; i < RDS_CONG_MAP_PAGES && map->m_page_addrs[i]; i++) free_page(map->m_page_addrs[i]); kfree(map); } } /* * Allocate a RDS message containing a congestion update. */ struct rds_message *rds_cong_update_alloc(struct rds_connection *conn) { struct rds_cong_map *map = conn->c_lcong; struct rds_message *rm; rm = rds_message_map_pages(map->m_page_addrs, RDS_CONG_MAP_BYTES); if (!IS_ERR(rm)) rm->m_inc.i_hdr.h_flags = RDS_FLAG_CONG_BITMAP; return rm; } |
| 1863 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 | // SPDX-License-Identifier: GPL-2.0-only /* * * Authors: * (C) 2015 Pengutronix, Alexander Aring <aar@pengutronix.de> */ #include <linux/if_arp.h> #include <linux/module.h> #include <net/6lowpan.h> #include <net/addrconf.h> #include "6lowpan_i.h" int lowpan_register_netdevice(struct net_device *dev, enum lowpan_lltypes lltype) { int i, ret; switch (lltype) { case LOWPAN_LLTYPE_IEEE802154: dev->addr_len = EUI64_ADDR_LEN; break; case LOWPAN_LLTYPE_BTLE: dev->addr_len = ETH_ALEN; break; } dev->type = ARPHRD_6LOWPAN; dev->mtu = IPV6_MIN_MTU; lowpan_dev(dev)->lltype = lltype; spin_lock_init(&lowpan_dev(dev)->ctx.lock); for (i = 0; i < LOWPAN_IPHC_CTX_TABLE_SIZE; i++) lowpan_dev(dev)->ctx.table[i].id = i; dev->ndisc_ops = &lowpan_ndisc_ops; ret = register_netdevice(dev); if (ret < 0) return ret; lowpan_dev_debugfs_init(dev); return ret; } EXPORT_SYMBOL(lowpan_register_netdevice); int lowpan_register_netdev(struct net_device *dev, enum lowpan_lltypes lltype) { int ret; rtnl_lock(); ret = lowpan_register_netdevice(dev, lltype); rtnl_unlock(); return ret; } EXPORT_SYMBOL(lowpan_register_netdev); void lowpan_unregister_netdevice(struct net_device *dev) { unregister_netdevice(dev); lowpan_dev_debugfs_exit(dev); } EXPORT_SYMBOL(lowpan_unregister_netdevice); void lowpan_unregister_netdev(struct net_device *dev) { rtnl_lock(); lowpan_unregister_netdevice(dev); rtnl_unlock(); } EXPORT_SYMBOL(lowpan_unregister_netdev); int addrconf_ifid_802154_6lowpan(u8 *eui, struct net_device *dev) { struct wpan_dev *wpan_dev = lowpan_802154_dev(dev)->wdev->ieee802154_ptr; /* Set short_addr autoconfiguration if short_addr is present only */ if (!lowpan_802154_is_valid_src_short_addr(wpan_dev->short_addr)) return -1; /* For either address format, all zero addresses MUST NOT be used */ if (wpan_dev->pan_id == cpu_to_le16(0x0000) && wpan_dev->short_addr == cpu_to_le16(0x0000)) return -1; /* Alternatively, if no PAN ID is known, 16 zero bits may be used */ if (wpan_dev->pan_id == cpu_to_le16(IEEE802154_PAN_ID_BROADCAST)) memset(eui, 0, 2); else ieee802154_le16_to_be16(eui, &wpan_dev->pan_id); /* The "Universal/Local" (U/L) bit shall be set to zero */ eui[0] &= ~2; eui[2] = 0; eui[3] = 0xFF; eui[4] = 0xFE; eui[5] = 0; ieee802154_le16_to_be16(&eui[6], &wpan_dev->short_addr); return 0; } static int lowpan_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct inet6_dev *idev; struct in6_addr addr; int i; if (dev->type != ARPHRD_6LOWPAN) return NOTIFY_DONE; idev = __in6_dev_get(dev); if (!idev) return NOTIFY_DONE; switch (event) { case NETDEV_UP: case NETDEV_CHANGE: /* (802.15.4 6LoWPAN short address slaac handling */ if (lowpan_is_ll(dev, LOWPAN_LLTYPE_IEEE802154) && addrconf_ifid_802154_6lowpan(addr.s6_addr + 8, dev) == 0) { __ipv6_addr_set_half(&addr.s6_addr32[0], htonl(0xFE800000), 0); addrconf_add_linklocal(idev, &addr, 0); } break; case NETDEV_DOWN: for (i = 0; i < LOWPAN_IPHC_CTX_TABLE_SIZE; i++) clear_bit(LOWPAN_IPHC_CTX_FLAG_ACTIVE, &lowpan_dev(dev)->ctx.table[i].flags); break; default: return NOTIFY_DONE; } return NOTIFY_OK; } static struct notifier_block lowpan_notifier = { .notifier_call = lowpan_event, }; static int __init lowpan_module_init(void) { int ret; lowpan_debugfs_init(); ret = register_netdevice_notifier(&lowpan_notifier); if (ret < 0) { lowpan_debugfs_exit(); return ret; } request_module_nowait("nhc_dest"); request_module_nowait("nhc_fragment"); request_module_nowait("nhc_hop"); request_module_nowait("nhc_ipv6"); request_module_nowait("nhc_mobility"); request_module_nowait("nhc_routing"); request_module_nowait("nhc_udp"); return 0; } static void __exit lowpan_module_exit(void) { lowpan_debugfs_exit(); unregister_netdevice_notifier(&lowpan_notifier); } module_init(lowpan_module_init); module_exit(lowpan_module_exit); MODULE_LICENSE("GPL"); |
| 2879 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 | /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM kmem #if !defined(_TRACE_KMEM_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_KMEM_H #include <linux/types.h> #include <linux/tracepoint.h> #include <trace/events/mmflags.h> TRACE_EVENT(kmem_cache_alloc, TP_PROTO(unsigned long call_site, const void *ptr, struct kmem_cache *s, gfp_t gfp_flags, int node), TP_ARGS(call_site, ptr, s, gfp_flags, node), TP_STRUCT__entry( __field( unsigned long, call_site ) __field( const void *, ptr ) __field( size_t, bytes_req ) __field( size_t, bytes_alloc ) __field( unsigned long, gfp_flags ) __field( int, node ) __field( bool, accounted ) ), TP_fast_assign( __entry->call_site = call_site; __entry->ptr = ptr; __entry->bytes_req = s->object_size; __entry->bytes_alloc = s->size; __entry->gfp_flags = (__force unsigned long)gfp_flags; __entry->node = node; __entry->accounted = IS_ENABLED(CONFIG_MEMCG_KMEM) ? ((gfp_flags & __GFP_ACCOUNT) || (s->flags & SLAB_ACCOUNT)) : false; ), TP_printk("call_site=%pS ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%s node=%d accounted=%s", (void *)__entry->call_site, __entry->ptr, __entry->bytes_req, __entry->bytes_alloc, show_gfp_flags(__entry->gfp_flags), __entry->node, __entry->accounted ? "true" : "false") ); TRACE_EVENT(kmalloc, TP_PROTO(unsigned long call_site, const void *ptr, size_t bytes_req, size_t bytes_alloc, gfp_t gfp_flags, int node), TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags, node), TP_STRUCT__entry( __field( unsigned long, call_site ) __field( const void *, ptr ) __field( size_t, bytes_req ) __field( size_t, bytes_alloc ) __field( unsigned long, gfp_flags ) __field( int, node ) ), TP_fast_assign( __entry->call_site = call_site; __entry->ptr = ptr; __entry->bytes_req = bytes_req; __entry->bytes_alloc = bytes_alloc; __entry->gfp_flags = (__force unsigned long)gfp_flags; __entry->node = node; ), TP_printk("call_site=%pS ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%s node=%d accounted=%s", (void *)__entry->call_site, __entry->ptr, __entry->bytes_req, __entry->bytes_alloc, show_gfp_flags(__entry->gfp_flags), __entry->node, (IS_ENABLED(CONFIG_MEMCG_KMEM) && (__entry->gfp_flags & (__force unsigned long)__GFP_ACCOUNT)) ? "true" : "false") ); TRACE_EVENT(kfree, TP_PROTO(unsigned long call_site, const void *ptr), TP_ARGS(call_site, ptr), TP_STRUCT__entry( __field( unsigned long, call_site ) __field( const void *, ptr ) ), TP_fast_assign( __entry->call_site = call_site; __entry->ptr = ptr; ), TP_printk("call_site=%pS ptr=%p", (void *)__entry->call_site, __entry->ptr) ); TRACE_EVENT(kmem_cache_free, TP_PROTO(unsigned long call_site, const void *ptr, const struct kmem_cache *s), TP_ARGS(call_site, ptr, s), TP_STRUCT__entry( __field( unsigned long, call_site ) __field( const void *, ptr ) __string( name, s->name ) ), TP_fast_assign( __entry->call_site = call_site; __entry->ptr = ptr; __assign_str(name, s->name); ), TP_printk("call_site=%pS ptr=%p name=%s", (void *)__entry->call_site, __entry->ptr, __get_str(name)) ); TRACE_EVENT(mm_page_free, TP_PROTO(struct page *page, unsigned int order), TP_ARGS(page, order), TP_STRUCT__entry( __field( unsigned long, pfn ) __field( unsigned int, order ) ), TP_fast_assign( __entry->pfn = page_to_pfn(page); __entry->order = order; ), TP_printk("page=%p pfn=0x%lx order=%d", pfn_to_page(__entry->pfn), __entry->pfn, __entry->order) ); TRACE_EVENT(mm_page_free_batched, TP_PROTO(struct page *page), TP_ARGS(page), TP_STRUCT__entry( __field( unsigned long, pfn ) ), TP_fast_assign( __entry->pfn = page_to_pfn(page); ), TP_printk("page=%p pfn=0x%lx order=0", pfn_to_page(__entry->pfn), __entry->pfn) ); TRACE_EVENT(mm_page_alloc, TP_PROTO(struct page *page, unsigned int order, gfp_t gfp_flags, int migratetype), TP_ARGS(page, order, gfp_flags, migratetype), TP_STRUCT__entry( __field( unsigned long, pfn ) __field( unsigned int, order ) __field( unsigned long, gfp_flags ) __field( int, migratetype ) ), TP_fast_assign( __entry->pfn = page ? page_to_pfn(page) : -1UL; __entry->order = order; __entry->gfp_flags = (__force unsigned long)gfp_flags; __entry->migratetype = migratetype; ), TP_printk("page=%p pfn=0x%lx order=%d migratetype=%d gfp_flags=%s", __entry->pfn != -1UL ? pfn_to_page(__entry->pfn) : NULL, __entry->pfn != -1UL ? __entry->pfn : 0, __entry->order, __entry->migratetype, show_gfp_flags(__entry->gfp_flags)) ); DECLARE_EVENT_CLASS(mm_page, TP_PROTO(struct page *page, unsigned int order, int migratetype, int percpu_refill), TP_ARGS(page, order, migratetype, percpu_refill), TP_STRUCT__entry( __field( unsigned long, pfn ) __field( unsigned int, order ) __field( int, migratetype ) __field( int, percpu_refill ) ), TP_fast_assign( __entry->pfn = page ? page_to_pfn(page) : -1UL; __entry->order = order; __entry->migratetype = migratetype; __entry->percpu_refill = percpu_refill; ), TP_printk("page=%p pfn=0x%lx order=%u migratetype=%d percpu_refill=%d", __entry->pfn != -1UL ? pfn_to_page(__entry->pfn) : NULL, __entry->pfn != -1UL ? __entry->pfn : 0, __entry->order, __entry->migratetype, __entry->percpu_refill) ); DEFINE_EVENT(mm_page, mm_page_alloc_zone_locked, TP_PROTO(struct page *page, unsigned int order, int migratetype, int percpu_refill), TP_ARGS(page, order, migratetype, percpu_refill) ); TRACE_EVENT(mm_page_pcpu_drain, TP_PROTO(struct page *page, unsigned int order, int migratetype), TP_ARGS(page, order, migratetype), TP_STRUCT__entry( __field( unsigned long, pfn ) __field( unsigned int, order ) __field( int, migratetype ) ), TP_fast_assign( __entry->pfn = page ? page_to_pfn(page) : -1UL; __entry->order = order; __entry->migratetype = migratetype; ), TP_printk("page=%p pfn=0x%lx order=%d migratetype=%d", pfn_to_page(__entry->pfn), __entry->pfn, __entry->order, __entry->migratetype) ); TRACE_EVENT(mm_page_alloc_extfrag, TP_PROTO(struct page *page, int alloc_order, int fallback_order, int alloc_migratetype, int fallback_migratetype), TP_ARGS(page, alloc_order, fallback_order, alloc_migratetype, fallback_migratetype), TP_STRUCT__entry( __field( unsigned long, pfn ) __field( int, alloc_order ) __field( int, fallback_order ) __field( int, alloc_migratetype ) __field( int, fallback_migratetype ) __field( int, change_ownership ) ), TP_fast_assign( __entry->pfn = page_to_pfn(page); __entry->alloc_order = alloc_order; __entry->fallback_order = fallback_order; __entry->alloc_migratetype = alloc_migratetype; __entry->fallback_migratetype = fallback_migratetype; __entry->change_ownership = (alloc_migratetype == get_pageblock_migratetype(page)); ), TP_printk("page=%p pfn=0x%lx alloc_order=%d fallback_order=%d pageblock_order=%d alloc_migratetype=%d fallback_migratetype=%d fragmenting=%d change_ownership=%d", pfn_to_page(__entry->pfn), __entry->pfn, __entry->alloc_order, __entry->fallback_order, pageblock_order, __entry->alloc_migratetype, __entry->fallback_migratetype, __entry->fallback_order < pageblock_order, __entry->change_ownership) ); /* * Required for uniquely and securely identifying mm in rss_stat tracepoint. */ #ifndef __PTR_TO_HASHVAL static unsigned int __maybe_unused mm_ptr_to_hash(const void *ptr) { int ret; unsigned long hashval; ret = ptr_to_hashval(ptr, &hashval); if (ret) return 0; /* The hashed value is only 32-bit */ return (unsigned int)hashval; } #define __PTR_TO_HASHVAL #endif #define TRACE_MM_PAGES \ EM(MM_FILEPAGES) \ EM(MM_ANONPAGES) \ EM(MM_SWAPENTS) \ EMe(MM_SHMEMPAGES) #undef EM #undef EMe #define EM(a) TRACE_DEFINE_ENUM(a); #define EMe(a) TRACE_DEFINE_ENUM(a); TRACE_MM_PAGES #undef EM #undef EMe #define EM(a) { a, #a }, #define EMe(a) { a, #a } TRACE_EVENT(rss_stat, TP_PROTO(struct mm_struct *mm, int member), TP_ARGS(mm, member), TP_STRUCT__entry( __field(unsigned int, mm_id) __field(unsigned int, curr) __field(int, member) __field(long, size) ), TP_fast_assign( __entry->mm_id = mm_ptr_to_hash(mm); __entry->curr = !!(current->mm == mm); __entry->member = member; __entry->size = (percpu_counter_sum_positive(&mm->rss_stat[member]) << PAGE_SHIFT); ), TP_printk("mm_id=%u curr=%d type=%s size=%ldB", __entry->mm_id, __entry->curr, __print_symbolic(__entry->member, TRACE_MM_PAGES), __entry->size) ); #endif /* _TRACE_KMEM_H */ /* This part must be outside protection */ #include <trace/define_trace.h> |
| 17 17 17 9 17 17 17 17 17 17 17 9 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 | // SPDX-License-Identifier: GPL-2.0 /* * Copyright 2019 ARM Ltd. * * Generic implementation of update_vsyscall and update_vsyscall_tz. * * Based on the x86 specific implementation. */ #include <linux/hrtimer.h> #include <linux/timekeeper_internal.h> #include <vdso/datapage.h> #include <vdso/helpers.h> #include <vdso/vsyscall.h> #include "timekeeping_internal.h" static inline void update_vdso_data(struct vdso_data *vdata, struct timekeeper *tk) { struct vdso_timestamp *vdso_ts; u64 nsec, sec; vdata[CS_HRES_COARSE].cycle_last = tk->tkr_mono.cycle_last; vdata[CS_HRES_COARSE].mask = tk->tkr_mono.mask; vdata[CS_HRES_COARSE].mult = tk->tkr_mono.mult; vdata[CS_HRES_COARSE].shift = tk->tkr_mono.shift; vdata[CS_RAW].cycle_last = tk->tkr_raw.cycle_last; vdata[CS_RAW].mask = tk->tkr_raw.mask; vdata[CS_RAW].mult = tk->tkr_raw.mult; vdata[CS_RAW].shift = tk->tkr_raw.shift; /* CLOCK_MONOTONIC */ vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_MONOTONIC]; vdso_ts->sec = tk->xtime_sec + tk->wall_to_monotonic.tv_sec; nsec = tk->tkr_mono.xtime_nsec; nsec += ((u64)tk->wall_to_monotonic.tv_nsec << tk->tkr_mono.shift); while (nsec >= (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift)) { nsec -= (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift); vdso_ts->sec++; } vdso_ts->nsec = nsec; /* Copy MONOTONIC time for BOOTTIME */ sec = vdso_ts->sec; /* Add the boot offset */ sec += tk->monotonic_to_boot.tv_sec; nsec += (u64)tk->monotonic_to_boot.tv_nsec << tk->tkr_mono.shift; /* CLOCK_BOOTTIME */ vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_BOOTTIME]; vdso_ts->sec = sec; while (nsec >= (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift)) { nsec -= (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift); vdso_ts->sec++; } vdso_ts->nsec = nsec; /* CLOCK_MONOTONIC_RAW */ vdso_ts = &vdata[CS_RAW].basetime[CLOCK_MONOTONIC_RAW]; vdso_ts->sec = tk->raw_sec; vdso_ts->nsec = tk->tkr_raw.xtime_nsec; /* CLOCK_TAI */ vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_TAI]; vdso_ts->sec = tk->xtime_sec + (s64)tk->tai_offset; vdso_ts->nsec = tk->tkr_mono.xtime_nsec; } void update_vsyscall(struct timekeeper *tk) { struct vdso_data *vdata = __arch_get_k_vdso_data(); struct vdso_timestamp *vdso_ts; s32 clock_mode; u64 nsec; /* copy vsyscall data */ vdso_write_begin(vdata); clock_mode = tk->tkr_mono.clock->vdso_clock_mode; vdata[CS_HRES_COARSE].clock_mode = clock_mode; vdata[CS_RAW].clock_mode = clock_mode; /* CLOCK_REALTIME also required for time() */ vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_REALTIME]; vdso_ts->sec = tk->xtime_sec; vdso_ts->nsec = tk->tkr_mono.xtime_nsec; /* CLOCK_REALTIME_COARSE */ vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_REALTIME_COARSE]; vdso_ts->sec = tk->xtime_sec; vdso_ts->nsec = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift; /* CLOCK_MONOTONIC_COARSE */ vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_MONOTONIC_COARSE]; vdso_ts->sec = tk->xtime_sec + tk->wall_to_monotonic.tv_sec; nsec = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift; nsec = nsec + tk->wall_to_monotonic.tv_nsec; vdso_ts->sec += __iter_div_u64_rem(nsec, NSEC_PER_SEC, &vdso_ts->nsec); /* * Read without the seqlock held by clock_getres(). * Note: No need to have a second copy. */ WRITE_ONCE(vdata[CS_HRES_COARSE].hrtimer_res, hrtimer_resolution); /* * If the current clocksource is not VDSO capable, then spare the * update of the high resolution parts. */ if (clock_mode != VDSO_CLOCKMODE_NONE) update_vdso_data(vdata, tk); __arch_update_vsyscall(vdata, tk); vdso_write_end(vdata); __arch_sync_vdso_data(vdata); } void update_vsyscall_tz(void) { struct vdso_data *vdata = __arch_get_k_vdso_data(); vdata[CS_HRES_COARSE].tz_minuteswest = sys_tz.tz_minuteswest; vdata[CS_HRES_COARSE].tz_dsttime = sys_tz.tz_dsttime; __arch_sync_vdso_data(vdata); } /** * vdso_update_begin - Start of a VDSO update section * * Allows architecture code to safely update the architecture specific VDSO * data. Disables interrupts, acquires timekeeper lock to serialize against * concurrent updates from timekeeping and invalidates the VDSO data * sequence counter to prevent concurrent readers from accessing * inconsistent data. * * Returns: Saved interrupt flags which need to be handed in to * vdso_update_end(). */ unsigned long vdso_update_begin(void) { struct vdso_data *vdata = __arch_get_k_vdso_data(); unsigned long flags; raw_spin_lock_irqsave(&timekeeper_lock, flags); vdso_write_begin(vdata); return flags; } /** * vdso_update_end - End of a VDSO update section * @flags: Interrupt flags as returned from vdso_update_begin() * * Pairs with vdso_update_begin(). Marks vdso data consistent, invokes data * synchronization if the architecture requires it, drops timekeeper lock * and restores interrupt flags. */ void vdso_update_end(unsigned long flags) { struct vdso_data *vdata = __arch_get_k_vdso_data(); vdso_write_end(vdata); __arch_sync_vdso_data(vdata); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); } |
| 9 3216 1 10602 26 26 26 530 1 14341 14337 198 199 1840 567 1872 5253 1710 2486 469 212 1380 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_FORTIFY_STRING_H_ #define _LINUX_FORTIFY_STRING_H_ #include <linux/bug.h> #include <linux/const.h> #include <linux/limits.h> #define __FORTIFY_INLINE extern __always_inline __gnu_inline __overloadable #define __RENAME(x) __asm__(#x) void fortify_panic(const char *name) __noreturn __cold; void __read_overflow(void) __compiletime_error("detected read beyond size of object (1st parameter)"); void __read_overflow2(void) __compiletime_error("detected read beyond size of object (2nd parameter)"); void __read_overflow2_field(size_t avail, size_t wanted) __compiletime_warning("detected read beyond size of field (2nd parameter); maybe use struct_group()?"); void __write_overflow(void) __compiletime_error("detected write beyond size of object (1st parameter)"); void __write_overflow_field(size_t avail, size_t wanted) __compiletime_warning("detected write beyond size of field (1st parameter); maybe use struct_group()?"); #define __compiletime_strlen(p) \ ({ \ char *__p = (char *)(p); \ size_t __ret = SIZE_MAX; \ const size_t __p_size = __member_size(p); \ if (__p_size != SIZE_MAX && \ __builtin_constant_p(*__p)) { \ size_t __p_len = __p_size - 1; \ if (__builtin_constant_p(__p[__p_len]) && \ __p[__p_len] == '\0') \ __ret = __builtin_strlen(__p); \ } \ __ret; \ }) #if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) extern void *__underlying_memchr(const void *p, int c, __kernel_size_t size) __RENAME(memchr); extern int __underlying_memcmp(const void *p, const void *q, __kernel_size_t size) __RENAME(memcmp); extern void *__underlying_memcpy(void *p, const void *q, __kernel_size_t size) __RENAME(memcpy); extern void *__underlying_memmove(void *p, const void *q, __kernel_size_t size) __RENAME(memmove); extern void *__underlying_memset(void *p, int c, __kernel_size_t size) __RENAME(memset); extern char *__underlying_strcat(char *p, const char *q) __RENAME(strcat); extern char *__underlying_strcpy(char *p, const char *q) __RENAME(strcpy); extern __kernel_size_t __underlying_strlen(const char *p) __RENAME(strlen); extern char *__underlying_strncat(char *p, const char *q, __kernel_size_t count) __RENAME(strncat); extern char *__underlying_strncpy(char *p, const char *q, __kernel_size_t size) __RENAME(strncpy); #else #if defined(__SANITIZE_MEMORY__) /* * For KMSAN builds all memcpy/memset/memmove calls should be replaced by the * corresponding __msan_XXX functions. */ #include <linux/kmsan_string.h> #define __underlying_memcpy __msan_memcpy #define __underlying_memmove __msan_memmove #define __underlying_memset __msan_memset #else #define __underlying_memcpy __builtin_memcpy #define __underlying_memmove __builtin_memmove #define __underlying_memset __builtin_memset #endif #define __underlying_memchr __builtin_memchr #define __underlying_memcmp __builtin_memcmp #define __underlying_strcat __builtin_strcat #define __underlying_strcpy __builtin_strcpy #define __underlying_strlen __builtin_strlen #define __underlying_strncat __builtin_strncat #define __underlying_strncpy __builtin_strncpy #endif /** * unsafe_memcpy - memcpy implementation with no FORTIFY bounds checking * * @dst: Destination memory address to write to * @src: Source memory address to read from * @bytes: How many bytes to write to @dst from @src * @justification: Free-form text or comment describing why the use is needed * * This should be used for corner cases where the compiler cannot do the * right thing, or during transitions between APIs, etc. It should be used * very rarely, and includes a place for justification detailing where bounds * checking has happened, and why existing solutions cannot be employed. */ #define unsafe_memcpy(dst, src, bytes, justification) \ __underlying_memcpy(dst, src, bytes) /* * Clang's use of __builtin_*object_size() within inlines needs hinting via * __pass_*object_size(). The preference is to only ever use type 1 (member * size, rather than struct size), but there remain some stragglers using * type 0 that will be converted in the future. */ #if __has_builtin(__builtin_dynamic_object_size) #define POS __pass_dynamic_object_size(1) #define POS0 __pass_dynamic_object_size(0) #else #define POS __pass_object_size(1) #define POS0 __pass_object_size(0) #endif #define __compiletime_lessthan(bounds, length) ( \ __builtin_constant_p((bounds) < (length)) && \ (bounds) < (length) \ ) /** * strncpy - Copy a string to memory with non-guaranteed NUL padding * * @p: pointer to destination of copy * @q: pointer to NUL-terminated source string to copy * @size: bytes to write at @p * * If strlen(@q) >= @size, the copy of @q will stop after @size bytes, * and @p will NOT be NUL-terminated * * If strlen(@q) < @size, following the copy of @q, trailing NUL bytes * will be written to @p until @size total bytes have been written. * * Do not use this function. While FORTIFY_SOURCE tries to avoid * over-reads of @q, it cannot defend against writing unterminated * results to @p. Using strncpy() remains ambiguous and fragile. * Instead, please choose an alternative, so that the expectation * of @p's contents is unambiguous: * * +--------------------+--------------------+------------+ * | **p** needs to be: | padded to **size** | not padded | * +====================+====================+============+ * | NUL-terminated | strscpy_pad() | strscpy() | * +--------------------+--------------------+------------+ * | not NUL-terminated | strtomem_pad() | strtomem() | * +--------------------+--------------------+------------+ * * Note strscpy*()'s differing return values for detecting truncation, * and strtomem*()'s expectation that the destination is marked with * __nonstring when it is a character array. * */ __FORTIFY_INLINE __diagnose_as(__builtin_strncpy, 1, 2, 3) char *strncpy(char * const POS p, const char *q, __kernel_size_t size) { const size_t p_size = __member_size(p); if (__compiletime_lessthan(p_size, size)) __write_overflow(); if (p_size < size) fortify_panic(__func__); return __underlying_strncpy(p, q, size); } extern __kernel_size_t __real_strnlen(const char *, __kernel_size_t) __RENAME(strnlen); /** * strnlen - Return bounded count of characters in a NUL-terminated string * * @p: pointer to NUL-terminated string to count. * @maxlen: maximum number of characters to count. * * Returns number of characters in @p (NOT including the final NUL), or * @maxlen, if no NUL has been found up to there. * */ __FORTIFY_INLINE __kernel_size_t strnlen(const char * const POS p, __kernel_size_t maxlen) { const size_t p_size = __member_size(p); const size_t p_len = __compiletime_strlen(p); size_t ret; /* We can take compile-time actions when maxlen is const. */ if (__builtin_constant_p(maxlen) && p_len != SIZE_MAX) { /* If p is const, we can use its compile-time-known len. */ if (maxlen >= p_size) return p_len; } /* Do not check characters beyond the end of p. */ ret = __real_strnlen(p, maxlen < p_size ? maxlen : p_size); if (p_size <= ret && maxlen != ret) fortify_panic(__func__); return ret; } /* * Defined after fortified strnlen to reuse it. However, it must still be * possible for strlen() to be used on compile-time strings for use in * static initializers (i.e. as a constant expression). */ /** * strlen - Return count of characters in a NUL-terminated string * * @p: pointer to NUL-terminated string to count. * * Do not use this function unless the string length is known at * compile-time. When @p is unterminated, this function may crash * or return unexpected counts that could lead to memory content * exposures. Prefer strnlen(). * * Returns number of characters in @p (NOT including the final NUL). * */ #define strlen(p) \ __builtin_choose_expr(__is_constexpr(__builtin_strlen(p)), \ __builtin_strlen(p), __fortify_strlen(p)) __FORTIFY_INLINE __diagnose_as(__builtin_strlen, 1) __kernel_size_t __fortify_strlen(const char * const POS p) { const size_t p_size = __member_size(p); __kernel_size_t ret; /* Give up if we don't know how large p is. */ if (p_size == SIZE_MAX) return __underlying_strlen(p); ret = strnlen(p, p_size); if (p_size <= ret) fortify_panic(__func__); return ret; } /* Defined after fortified strlen() to reuse it. */ extern size_t __real_strlcpy(char *, const char *, size_t) __RENAME(strlcpy); /** * strlcpy - Copy a string into another string buffer * * @p: pointer to destination of copy * @q: pointer to NUL-terminated source string to copy * @size: maximum number of bytes to write at @p * * If strlen(@q) >= @size, the copy of @q will be truncated at * @size - 1 bytes. @p will always be NUL-terminated. * * Do not use this function. While FORTIFY_SOURCE tries to avoid * over-reads when calculating strlen(@q), it is still possible. * Prefer strscpy(), though note its different return values for * detecting truncation. * * Returns total number of bytes written to @p, including terminating NUL. * */ __FORTIFY_INLINE size_t strlcpy(char * const POS p, const char * const POS q, size_t size) { const size_t p_size = __member_size(p); const size_t q_size = __member_size(q); size_t q_len; /* Full count of source string length. */ size_t len; /* Count of characters going into destination. */ if (p_size == SIZE_MAX && q_size == SIZE_MAX) return __real_strlcpy(p, q, size); q_len = strlen(q); len = (q_len >= size) ? size - 1 : q_len; if (__builtin_constant_p(size) && __builtin_constant_p(q_len) && size) { /* Write size is always larger than destination. */ if (len >= p_size) __write_overflow(); } if (size) { if (len >= p_size) fortify_panic(__func__); __underlying_memcpy(p, q, len); p[len] = '\0'; } return q_len; } /* Defined after fortified strnlen() to reuse it. */ extern ssize_t __real_strscpy(char *, const char *, size_t) __RENAME(strscpy); /** * strscpy - Copy a C-string into a sized buffer * * @p: Where to copy the string to * @q: Where to copy the string from * @size: Size of destination buffer * * Copy the source string @q, or as much of it as fits, into the destination * @p buffer. The behavior is undefined if the string buffers overlap. The * destination @p buffer is always NUL terminated, unless it's zero-sized. * * Preferred to strlcpy() since the API doesn't require reading memory * from the source @q string beyond the specified @size bytes, and since * the return value is easier to error-check than strlcpy()'s. * In addition, the implementation is robust to the string changing out * from underneath it, unlike the current strlcpy() implementation. * * Preferred to strncpy() since it always returns a valid string, and * doesn't unnecessarily force the tail of the destination buffer to be * zero padded. If padding is desired please use strscpy_pad(). * * Returns the number of characters copied in @p (not including the * trailing %NUL) or -E2BIG if @size is 0 or the copy of @q was truncated. */ __FORTIFY_INLINE ssize_t strscpy(char * const POS p, const char * const POS q, size_t size) { /* Use string size rather than possible enclosing struct size. */ const size_t p_size = __member_size(p); const size_t q_size = __member_size(q); size_t len; /* If we cannot get size of p and q default to call strscpy. */ if (p_size == SIZE_MAX && q_size == SIZE_MAX) return __real_strscpy(p, q, size); /* * If size can be known at compile time and is greater than * p_size, generate a compile time write overflow error. */ if (__compiletime_lessthan(p_size, size)) __write_overflow(); /* Short-circuit for compile-time known-safe lengths. */ if (__compiletime_lessthan(p_size, SIZE_MAX)) { len = __compiletime_strlen(q); if (len < SIZE_MAX && __compiletime_lessthan(len, size)) { __underlying_memcpy(p, q, len + 1); return len; } } /* * This call protects from read overflow, because len will default to q * length if it smaller than size. */ len = strnlen(q, size); /* * If len equals size, we will copy only size bytes which leads to * -E2BIG being returned. * Otherwise we will copy len + 1 because of the final '\O'. */ len = len == size ? size : len + 1; /* * Generate a runtime write overflow error if len is greater than * p_size. */ if (len > p_size) fortify_panic(__func__); /* * We can now safely call vanilla strscpy because we are protected from: * 1. Read overflow thanks to call to strnlen(). * 2. Write overflow thanks to above ifs. */ return __real_strscpy(p, q, len); } /* Defined after fortified strlen() to reuse it. */ extern size_t __real_strlcat(char *p, const char *q, size_t avail) __RENAME(strlcat); /** * strlcat - Append a string to an existing string * * @p: pointer to %NUL-terminated string to append to * @q: pointer to %NUL-terminated string to append from * @avail: Maximum bytes available in @p * * Appends %NUL-terminated string @q after the %NUL-terminated * string at @p, but will not write beyond @avail bytes total, * potentially truncating the copy from @q. @p will stay * %NUL-terminated only if a %NUL already existed within * the @avail bytes of @p. If so, the resulting number of * bytes copied from @q will be at most "@avail - strlen(@p) - 1". * * Do not use this function. While FORTIFY_SOURCE tries to avoid * read and write overflows, this is only possible when the sizes * of @p and @q are known to the compiler. Prefer building the * string with formatting, via scnprintf(), seq_buf, or similar. * * Returns total bytes that _would_ have been contained by @p * regardless of truncation, similar to snprintf(). If return * value is >= @avail, the string has been truncated. * */ __FORTIFY_INLINE size_t strlcat(char * const POS p, const char * const POS q, size_t avail) { const size_t p_size = __member_size(p); const size_t q_size = __member_size(q); size_t p_len, copy_len; size_t actual, wanted; /* Give up immediately if both buffer sizes are unknown. */ if (p_size == SIZE_MAX && q_size == SIZE_MAX) return __real_strlcat(p, q, avail); p_len = strnlen(p, avail); copy_len = strlen(q); wanted = actual = p_len + copy_len; /* Cannot append any more: report truncation. */ if (avail <= p_len) return wanted; /* Give up if string is already overflowed. */ if (p_size <= p_len) fortify_panic(__func__); if (actual >= avail) { copy_len = avail - p_len - 1; actual = p_len + copy_len; } /* Give up if copy will overflow. */ if (p_size <= actual) fortify_panic(__func__); __underlying_memcpy(p + p_len, q, copy_len); p[actual] = '\0'; return wanted; } /* Defined after fortified strlcat() to reuse it. */ /** * strcat - Append a string to an existing string * * @p: pointer to NUL-terminated string to append to * @q: pointer to NUL-terminated source string to append from * * Do not use this function. While FORTIFY_SOURCE tries to avoid * read and write overflows, this is only possible when the * destination buffer size is known to the compiler. Prefer * building the string with formatting, via scnprintf() or similar. * At the very least, use strncat(). * * Returns @p. * */ __FORTIFY_INLINE __diagnose_as(__builtin_strcat, 1, 2) char *strcat(char * const POS p, const char *q) { const size_t p_size = __member_size(p); if (strlcat(p, q, p_size) >= p_size) fortify_panic(__func__); return p; } /** * strncat - Append a string to an existing string * * @p: pointer to NUL-terminated string to append to * @q: pointer to source string to append from * @count: Maximum bytes to read from @q * * Appends at most @count bytes from @q (stopping at the first * NUL byte) after the NUL-terminated string at @p. @p will be * NUL-terminated. * * Do not use this function. While FORTIFY_SOURCE tries to avoid * read and write overflows, this is only possible when the sizes * of @p and @q are known to the compiler. Prefer building the * string with formatting, via scnprintf() or similar. * * Returns @p. * */ /* Defined after fortified strlen() and strnlen() to reuse them. */ __FORTIFY_INLINE __diagnose_as(__builtin_strncat, 1, 2, 3) char *strncat(char * const POS p, const char * const POS q, __kernel_size_t count) { const size_t p_size = __member_size(p); const size_t q_size = __member_size(q); size_t p_len, copy_len; if (p_size == SIZE_MAX && q_size == SIZE_MAX) return __underlying_strncat(p, q, count); p_len = strlen(p); copy_len = strnlen(q, count); if (p_size < p_len + copy_len + 1) fortify_panic(__func__); __underlying_memcpy(p + p_len, q, copy_len); p[p_len + copy_len] = '\0'; return p; } __FORTIFY_INLINE void fortify_memset_chk(__kernel_size_t size, const size_t p_size, const size_t p_size_field) { if (__builtin_constant_p(size)) { /* * Length argument is a constant expression, so we * can perform compile-time bounds checking where * buffer sizes are also known at compile time. */ /* Error when size is larger than enclosing struct. */ if (__compiletime_lessthan(p_size_field, p_size) && __compiletime_lessthan(p_size, size)) __write_overflow(); /* Warn when write size is larger than dest field. */ if (__compiletime_lessthan(p_size_field, size)) __write_overflow_field(p_size_field, size); } /* * At this point, length argument may not be a constant expression, * so run-time bounds checking can be done where buffer sizes are * known. (This is not an "else" because the above checks may only * be compile-time warnings, and we want to still warn for run-time * overflows.) */ /* * Always stop accesses beyond the struct that contains the * field, when the buffer's remaining size is known. * (The SIZE_MAX test is to optimize away checks where the buffer * lengths are unknown.) */ if (p_size != SIZE_MAX && p_size < size) fortify_panic("memset"); } #define __fortify_memset_chk(p, c, size, p_size, p_size_field) ({ \ size_t __fortify_size = (size_t)(size); \ fortify_memset_chk(__fortify_size, p_size, p_size_field), \ __underlying_memset(p, c, __fortify_size); \ }) /* * __struct_size() vs __member_size() must be captured here to avoid * evaluating argument side-effects further into the macro layers. */ #ifndef CONFIG_KMSAN #define memset(p, c, s) __fortify_memset_chk(p, c, s, \ __struct_size(p), __member_size(p)) #endif /* * To make sure the compiler can enforce protection against buffer overflows, * memcpy(), memmove(), and memset() must not be used beyond individual * struct members. If you need to copy across multiple members, please use * struct_group() to create a named mirror of an anonymous struct union. * (e.g. see struct sk_buff.) Read overflow checking is currently only * done when a write overflow is also present, or when building with W=1. * * Mitigation coverage matrix * Bounds checking at: * +-------+-------+-------+-------+ * | Compile time | Run time | * memcpy() argument sizes: | write | read | write | read | * dest source length +-------+-------+-------+-------+ * memcpy(known, known, constant) | y | y | n/a | n/a | * memcpy(known, unknown, constant) | y | n | n/a | V | * memcpy(known, known, dynamic) | n | n | B | B | * memcpy(known, unknown, dynamic) | n | n | B | V | * memcpy(unknown, known, constant) | n | y | V | n/a | * memcpy(unknown, unknown, constant) | n | n | V | V | * memcpy(unknown, known, dynamic) | n | n | V | B | * memcpy(unknown, unknown, dynamic) | n | n | V | V | * +-------+-------+-------+-------+ * * y = perform deterministic compile-time bounds checking * n = cannot perform deterministic compile-time bounds checking * n/a = no run-time bounds checking needed since compile-time deterministic * B = can perform run-time bounds checking (currently unimplemented) * V = vulnerable to run-time overflow (will need refactoring to solve) * */ __FORTIFY_INLINE bool fortify_memcpy_chk(__kernel_size_t size, const size_t p_size, const size_t q_size, const size_t p_size_field, const size_t q_size_field, const char *func) { if (__builtin_constant_p(size)) { /* * Length argument is a constant expression, so we * can perform compile-time bounds checking where * buffer sizes are also known at compile time. */ /* Error when size is larger than enclosing struct. */ if (__compiletime_lessthan(p_size_field, p_size) && __compiletime_lessthan(p_size, size)) __write_overflow(); if (__compiletime_lessthan(q_size_field, q_size) && __compiletime_lessthan(q_size, size)) __read_overflow2(); /* Warn when write size argument larger than dest field. */ if (__compiletime_lessthan(p_size_field, size)) __write_overflow_field(p_size_field, size); /* * Warn for source field over-read when building with W=1 * or when an over-write happened, so both can be fixed at * the same time. */ if ((IS_ENABLED(KBUILD_EXTRA_WARN1) || __compiletime_lessthan(p_size_field, size)) && __compiletime_lessthan(q_size_field, size)) __read_overflow2_field(q_size_field, size); } /* * At this point, length argument may not be a constant expression, * so run-time bounds checking can be done where buffer sizes are * known. (This is not an "else" because the above checks may only * be compile-time warnings, and we want to still warn for run-time * overflows.) */ /* * Always stop accesses beyond the struct that contains the * field, when the buffer's remaining size is known. * (The SIZE_MAX test is to optimize away checks where the buffer * lengths are unknown.) */ if ((p_size != SIZE_MAX && p_size < size) || (q_size != SIZE_MAX && q_size < size)) fortify_panic(func); /* * Warn when writing beyond destination field size. * * We must ignore p_size_field == 0 for existing 0-element * fake flexible arrays, until they are all converted to * proper flexible arrays. * * The implementation of __builtin_*object_size() behaves * like sizeof() when not directly referencing a flexible * array member, which means there will be many bounds checks * that will appear at run-time, without a way for them to be * detected at compile-time (as can be done when the destination * is specifically the flexible array member). * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=101832 */ if (p_size_field != 0 && p_size_field != SIZE_MAX && p_size != p_size_field && p_size_field < size) return true; return false; } #define __fortify_memcpy_chk(p, q, size, p_size, q_size, \ p_size_field, q_size_field, op) ({ \ const size_t __fortify_size = (size_t)(size); \ const size_t __p_size = (p_size); \ const size_t __q_size = (q_size); \ const size_t __p_size_field = (p_size_field); \ const size_t __q_size_field = (q_size_field); \ WARN_ONCE(fortify_memcpy_chk(__fortify_size, __p_size, \ __q_size, __p_size_field, \ __q_size_field, #op), \ #op ": detected field-spanning write (size %zu) of single %s (size %zu)\n", \ __fortify_size, \ "field \"" #p "\" at " FILE_LINE, \ __p_size_field); \ __underlying_##op(p, q, __fortify_size); \ }) /* * Notes about compile-time buffer size detection: * * With these types... * * struct middle { * u16 a; * u8 middle_buf[16]; * int b; * }; * struct end { * u16 a; * u8 end_buf[16]; * }; * struct flex { * int a; * u8 flex_buf[]; * }; * * void func(TYPE *ptr) { ... } * * Cases where destination size cannot be currently detected: * - the size of ptr's object (seemingly by design, gcc & clang fail): * __builtin_object_size(ptr, 1) == SIZE_MAX * - the size of flexible arrays in ptr's obj (by design, dynamic size): * __builtin_object_size(ptr->flex_buf, 1) == SIZE_MAX * - the size of ANY array at the end of ptr's obj (gcc and clang bug): * __builtin_object_size(ptr->end_buf, 1) == SIZE_MAX * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=101836 * * Cases where destination size is currently detected: * - the size of non-array members within ptr's object: * __builtin_object_size(ptr->a, 1) == 2 * - the size of non-flexible-array in the middle of ptr's obj: * __builtin_object_size(ptr->middle_buf, 1) == 16 * */ /* * __struct_size() vs __member_size() must be captured here to avoid * evaluating argument side-effects further into the macro layers. */ #define memcpy(p, q, s) __fortify_memcpy_chk(p, q, s, \ __struct_size(p), __struct_size(q), \ __member_size(p), __member_size(q), \ memcpy) #define memmove(p, q, s) __fortify_memcpy_chk(p, q, s, \ __struct_size(p), __struct_size(q), \ __member_size(p), __member_size(q), \ memmove) extern void *__real_memscan(void *, int, __kernel_size_t) __RENAME(memscan); __FORTIFY_INLINE void *memscan(void * const POS0 p, int c, __kernel_size_t size) { const size_t p_size = __struct_size(p); if (__compiletime_lessthan(p_size, size)) __read_overflow(); if (p_size < size) fortify_panic(__func__); return __real_memscan(p, c, size); } __FORTIFY_INLINE __diagnose_as(__builtin_memcmp, 1, 2, 3) int memcmp(const void * const POS0 p, const void * const POS0 q, __kernel_size_t size) { const size_t p_size = __struct_size(p); const size_t q_size = __struct_size(q); if (__builtin_constant_p(size)) { if (__compiletime_lessthan(p_size, size)) __read_overflow(); if (__compiletime_lessthan(q_size, size)) __read_overflow2(); } if (p_size < size || q_size < size) fortify_panic(__func__); return __underlying_memcmp(p, q, size); } __FORTIFY_INLINE __diagnose_as(__builtin_memchr, 1, 2, 3) void *memchr(const void * const POS0 p, int c, __kernel_size_t size) { const size_t p_size = __struct_size(p); if (__compiletime_lessthan(p_size, size)) __read_overflow(); if (p_size < size) fortify_panic(__func__); return __underlying_memchr(p, c, size); } void *__real_memchr_inv(const void *s, int c, size_t n) __RENAME(memchr_inv); __FORTIFY_INLINE void *memchr_inv(const void * const POS0 p, int c, size_t size) { const size_t p_size = __struct_size(p); if (__compiletime_lessthan(p_size, size)) __read_overflow(); if (p_size < size) fortify_panic(__func__); return __real_memchr_inv(p, c, size); } extern void *__real_kmemdup(const void *src, size_t len, gfp_t gfp) __RENAME(kmemdup) __realloc_size(2); __FORTIFY_INLINE void *kmemdup(const void * const POS0 p, size_t size, gfp_t gfp) { const size_t p_size = __struct_size(p); if (__compiletime_lessthan(p_size, size)) __read_overflow(); if (p_size < size) fortify_panic(__func__); return __real_kmemdup(p, size, gfp); } /** * strcpy - Copy a string into another string buffer * * @p: pointer to destination of copy * @q: pointer to NUL-terminated source string to copy * * Do not use this function. While FORTIFY_SOURCE tries to avoid * overflows, this is only possible when the sizes of @q and @p are * known to the compiler. Prefer strscpy(), though note its different * return values for detecting truncation. * * Returns @p. * */ /* Defined after fortified strlen to reuse it. */ __FORTIFY_INLINE __diagnose_as(__builtin_strcpy, 1, 2) char *strcpy(char * const POS p, const char * const POS q) { const size_t p_size = __member_size(p); const size_t q_size = __member_size(q); size_t size; /* If neither buffer size is known, immediately give up. */ if (__builtin_constant_p(p_size) && __builtin_constant_p(q_size) && p_size == SIZE_MAX && q_size == SIZE_MAX) return __underlying_strcpy(p, q); size = strlen(q) + 1; /* Compile-time check for const size overflow. */ if (__compiletime_lessthan(p_size, size)) __write_overflow(); /* Run-time check for dynamic size overflow. */ if (p_size < size) fortify_panic(__func__); __underlying_memcpy(p, q, size); return p; } /* Don't use these outside the FORITFY_SOURCE implementation */ #undef __underlying_memchr #undef __underlying_memcmp #undef __underlying_strcat #undef __underlying_strcpy #undef __underlying_strlen #undef __underlying_strncat #undef __underlying_strncpy #undef POS #undef POS0 #endif /* _LINUX_FORTIFY_STRING_H_ */ |
| 1081 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 | // SPDX-License-Identifier: GPL-2.0-only /* * Dynamic DMA mapping support. * * This implementation is a fallback for platforms that do not support * I/O TLBs (aka DMA address translation hardware). * Copyright (C) 2000 Asit Mallick <Asit.K.Mallick@intel.com> * Copyright (C) 2000 Goutham Rao <goutham.rao@intel.com> * Copyright (C) 2000, 2003 Hewlett-Packard Co * David Mosberger-Tang <davidm@hpl.hp.com> * * 03/05/07 davidm Switch from PCI-DMA to generic device DMA API. * 00/12/13 davidm Rename to swiotlb.c and add mark_clean() to avoid * unnecessary i-cache flushing. * 04/07/.. ak Better overflow handling. Assorted fixes. * 05/09/10 linville Add support for syncing ranges, support syncing for * DMA_BIDIRECTIONAL mappings, miscellaneous cleanup. * 08/12/11 beckyb Add highmem support */ #define pr_fmt(fmt) "software IO TLB: " fmt #include <linux/cache.h> #include <linux/cc_platform.h> #include <linux/ctype.h> #include <linux/debugfs.h> #include <linux/dma-direct.h> #include <linux/dma-map-ops.h> #include <linux/export.h> #include <linux/gfp.h> #include <linux/highmem.h> #include <linux/io.h> #include <linux/iommu-helper.h> #include <linux/init.h> #include <linux/memblock.h> #include <linux/mm.h> #include <linux/pfn.h> #include <linux/rculist.h> #include <linux/scatterlist.h> #include <linux/set_memory.h> #include <linux/spinlock.h> #include <linux/string.h> #include <linux/swiotlb.h> #include <linux/types.h> #ifdef CONFIG_DMA_RESTRICTED_POOL #include <linux/of.h> #include <linux/of_fdt.h> #include <linux/of_reserved_mem.h> #include <linux/slab.h> #endif #define CREATE_TRACE_POINTS #include <trace/events/swiotlb.h> #define SLABS_PER_PAGE (1 << (PAGE_SHIFT - IO_TLB_SHIFT)) /* * Minimum IO TLB size to bother booting with. Systems with mainly * 64bit capable cards will only lightly use the swiotlb. If we can't * allocate a contiguous 1MB, we're probably in trouble anyway. */ #define IO_TLB_MIN_SLABS ((1<<20) >> IO_TLB_SHIFT) #define INVALID_PHYS_ADDR (~(phys_addr_t)0) /** * struct io_tlb_slot - IO TLB slot descriptor * @orig_addr: The original address corresponding to a mapped entry. * @alloc_size: Size of the allocated buffer. * @list: The free list describing the number of free entries available * from each index. */ struct io_tlb_slot { phys_addr_t orig_addr; size_t alloc_size; unsigned int list; }; static bool swiotlb_force_bounce; static bool swiotlb_force_disable; #ifdef CONFIG_SWIOTLB_DYNAMIC static void swiotlb_dyn_alloc(struct work_struct *work); static struct io_tlb_mem io_tlb_default_mem = { .lock = __SPIN_LOCK_UNLOCKED(io_tlb_default_mem.lock), .pools = LIST_HEAD_INIT(io_tlb_default_mem.pools), .dyn_alloc = __WORK_INITIALIZER(io_tlb_default_mem.dyn_alloc, swiotlb_dyn_alloc), }; #else /* !CONFIG_SWIOTLB_DYNAMIC */ static struct io_tlb_mem io_tlb_default_mem; #endif /* CONFIG_SWIOTLB_DYNAMIC */ static unsigned long default_nslabs = IO_TLB_DEFAULT_SIZE >> IO_TLB_SHIFT; static unsigned long default_nareas; /** * struct io_tlb_area - IO TLB memory area descriptor * * This is a single area with a single lock. * * @used: The number of used IO TLB block. * @index: The slot index to start searching in this area for next round. * @lock: The lock to protect the above data structures in the map and * unmap calls. */ struct io_tlb_area { unsigned long used; unsigned int index; spinlock_t lock; }; /* * Round up number of slabs to the next power of 2. The last area is going * be smaller than the rest if default_nslabs is not power of two. * The number of slot in an area should be a multiple of IO_TLB_SEGSIZE, * otherwise a segment may span two or more areas. It conflicts with free * contiguous slots tracking: free slots are treated contiguous no matter * whether they cross an area boundary. * * Return true if default_nslabs is rounded up. */ static bool round_up_default_nslabs(void) { if (!default_nareas) return false; if (default_nslabs < IO_TLB_SEGSIZE * default_nareas) default_nslabs = IO_TLB_SEGSIZE * default_nareas; else if (is_power_of_2(default_nslabs)) return false; default_nslabs = roundup_pow_of_two(default_nslabs); return true; } /** * swiotlb_adjust_nareas() - adjust the number of areas and slots * @nareas: Desired number of areas. Zero is treated as 1. * * Adjust the default number of areas in a memory pool. * The default size of the memory pool may also change to meet minimum area * size requirements. */ static void swiotlb_adjust_nareas(unsigned int nareas) { if (!nareas) nareas = 1; else if (!is_power_of_2(nareas)) nareas = roundup_pow_of_two(nareas); default_nareas = nareas; pr_info("area num %d.\n", nareas); if (round_up_default_nslabs()) pr_info("SWIOTLB bounce buffer size roundup to %luMB", (default_nslabs << IO_TLB_SHIFT) >> 20); } /** * limit_nareas() - get the maximum number of areas for a given memory pool size * @nareas: Desired number of areas. * @nslots: Total number of slots in the memory pool. * * Limit the number of areas to the maximum possible number of areas in * a memory pool of the given size. * * Return: Maximum possible number of areas. */ static unsigned int limit_nareas(unsigned int nareas, unsigned long nslots) { if (nslots < nareas * IO_TLB_SEGSIZE) return nslots / IO_TLB_SEGSIZE; return nareas; } static int __init setup_io_tlb_npages(char *str) { if (isdigit(*str)) { /* avoid tail segment of size < IO_TLB_SEGSIZE */ default_nslabs = ALIGN(simple_strtoul(str, &str, 0), IO_TLB_SEGSIZE); } if (*str == ',') ++str; if (isdigit(*str)) swiotlb_adjust_nareas(simple_strtoul(str, &str, 0)); if (*str == ',') ++str; if (!strcmp(str, "force")) swiotlb_force_bounce = true; else if (!strcmp(str, "noforce")) swiotlb_force_disable = true; return 0; } early_param("swiotlb", setup_io_tlb_npages); unsigned long swiotlb_size_or_default(void) { return default_nslabs << IO_TLB_SHIFT; } void __init swiotlb_adjust_size(unsigned long size) { /* * If swiotlb parameter has not been specified, give a chance to * architectures such as those supporting memory encryption to * adjust/expand SWIOTLB size for their use. */ if (default_nslabs != IO_TLB_DEFAULT_SIZE >> IO_TLB_SHIFT) return; size = ALIGN(size, IO_TLB_SIZE); default_nslabs = ALIGN(size >> IO_TLB_SHIFT, IO_TLB_SEGSIZE); if (round_up_default_nslabs()) size = default_nslabs << IO_TLB_SHIFT; pr_info("SWIOTLB bounce buffer size adjusted to %luMB", size >> 20); } void swiotlb_print_info(void) { struct io_tlb_pool *mem = &io_tlb_default_mem.defpool; if (!mem->nslabs) { pr_warn("No low mem\n"); return; } pr_info("mapped [mem %pa-%pa] (%luMB)\n", &mem->start, &mem->end, (mem->nslabs << IO_TLB_SHIFT) >> 20); } static inline unsigned long io_tlb_offset(unsigned long val) { return val & (IO_TLB_SEGSIZE - 1); } static inline unsigned long nr_slots(u64 val) { return DIV_ROUND_UP(val, IO_TLB_SIZE); } /* * Early SWIOTLB allocation may be too early to allow an architecture to * perform the desired operations. This function allows the architecture to * call SWIOTLB when the operations are possible. It needs to be called * before the SWIOTLB memory is used. */ void __init swiotlb_update_mem_attributes(void) { struct io_tlb_pool *mem = &io_tlb_default_mem.defpool; unsigned long bytes; if (!mem->nslabs || mem->late_alloc) return; bytes = PAGE_ALIGN(mem->nslabs << IO_TLB_SHIFT); set_memory_decrypted((unsigned long)mem->vaddr, bytes >> PAGE_SHIFT); } static void swiotlb_init_io_tlb_pool(struct io_tlb_pool *mem, phys_addr_t start, unsigned long nslabs, bool late_alloc, unsigned int nareas) { void *vaddr = phys_to_virt(start); unsigned long bytes = nslabs << IO_TLB_SHIFT, i; mem->nslabs = nslabs; mem->start = start; mem->end = mem->start + bytes; mem->late_alloc = late_alloc; mem->nareas = nareas; mem->area_nslabs = nslabs / mem->nareas; for (i = 0; i < mem->nareas; i++) { spin_lock_init(&mem->areas[i].lock); mem->areas[i].index = 0; mem->areas[i].used = 0; } for (i = 0; i < mem->nslabs; i++) { mem->slots[i].list = min(IO_TLB_SEGSIZE - io_tlb_offset(i), mem->nslabs - i); mem->slots[i].orig_addr = INVALID_PHYS_ADDR; mem->slots[i].alloc_size = 0; } memset(vaddr, 0, bytes); mem->vaddr = vaddr; return; } /** * add_mem_pool() - add a memory pool to the allocator * @mem: Software IO TLB allocator. * @pool: Memory pool to be added. */ static void add_mem_pool(struct io_tlb_mem *mem, struct io_tlb_pool *pool) { #ifdef CONFIG_SWIOTLB_DYNAMIC spin_lock(&mem->lock); list_add_rcu(&pool->node, &mem->pools); mem->nslabs += pool->nslabs; spin_unlock(&mem->lock); #else mem->nslabs = pool->nslabs; #endif } static void __init *swiotlb_memblock_alloc(unsigned long nslabs, unsigned int flags, int (*remap)(void *tlb, unsigned long nslabs)) { size_t bytes = PAGE_ALIGN(nslabs << IO_TLB_SHIFT); void *tlb; /* * By default allocate the bounce buffer memory from low memory, but * allow to pick a location everywhere for hypervisors with guest * memory encryption. */ if (flags & SWIOTLB_ANY) tlb = memblock_alloc(bytes, PAGE_SIZE); else tlb = memblock_alloc_low(bytes, PAGE_SIZE); if (!tlb) { pr_warn("%s: Failed to allocate %zu bytes tlb structure\n", __func__, bytes); return NULL; } if (remap && remap(tlb, nslabs) < 0) { memblock_free(tlb, PAGE_ALIGN(bytes)); pr_warn("%s: Failed to remap %zu bytes\n", __func__, bytes); return NULL; } return tlb; } /* * Statically reserve bounce buffer space and initialize bounce buffer data * structures for the software IO TLB used to implement the DMA API. */ void __init swiotlb_init_remap(bool addressing_limit, unsigned int flags, int (*remap)(void *tlb, unsigned long nslabs)) { struct io_tlb_pool *mem = &io_tlb_default_mem.defpool; unsigned long nslabs; unsigned int nareas; size_t alloc_size; void *tlb; if (!addressing_limit && !swiotlb_force_bounce) return; if (swiotlb_force_disable) return; io_tlb_default_mem.force_bounce = swiotlb_force_bounce || (flags & SWIOTLB_FORCE); #ifdef CONFIG_SWIOTLB_DYNAMIC if (!remap) io_tlb_default_mem.can_grow = true; if (flags & SWIOTLB_ANY) io_tlb_default_mem.phys_limit = virt_to_phys(high_memory - 1); else io_tlb_default_mem.phys_limit = ARCH_LOW_ADDRESS_LIMIT; #endif if (!default_nareas) swiotlb_adjust_nareas(num_possible_cpus()); nslabs = default_nslabs; nareas = limit_nareas(default_nareas, nslabs); while ((tlb = swiotlb_memblock_alloc(nslabs, flags, remap)) == NULL) { if (nslabs <= IO_TLB_MIN_SLABS) return; nslabs = ALIGN(nslabs >> 1, IO_TLB_SEGSIZE); nareas = limit_nareas(nareas, nslabs); } if (default_nslabs != nslabs) { pr_info("SWIOTLB bounce buffer size adjusted %lu -> %lu slabs", default_nslabs, nslabs); default_nslabs = nslabs; } alloc_size = PAGE_ALIGN(array_size(sizeof(*mem->slots), nslabs)); mem->slots = memblock_alloc(alloc_size, PAGE_SIZE); if (!mem->slots) { pr_warn("%s: Failed to allocate %zu bytes align=0x%lx\n", __func__, alloc_size, PAGE_SIZE); return; } mem->areas = memblock_alloc(array_size(sizeof(struct io_tlb_area), nareas), SMP_CACHE_BYTES); if (!mem->areas) { pr_warn("%s: Failed to allocate mem->areas.\n", __func__); return; } swiotlb_init_io_tlb_pool(mem, __pa(tlb), nslabs, false, nareas); add_mem_pool(&io_tlb_default_mem, mem); if (flags & SWIOTLB_VERBOSE) swiotlb_print_info(); } void __init swiotlb_init(bool addressing_limit, unsigned int flags) { swiotlb_init_remap(addressing_limit, flags, NULL); } /* * Systems with larger DMA zones (those that don't support ISA) can * initialize the swiotlb later using the slab allocator if needed. * This should be just like above, but with some error catching. */ int swiotlb_init_late(size_t size, gfp_t gfp_mask, int (*remap)(void *tlb, unsigned long nslabs)) { struct io_tlb_pool *mem = &io_tlb_default_mem.defpool; unsigned long nslabs = ALIGN(size >> IO_TLB_SHIFT, IO_TLB_SEGSIZE); unsigned int nareas; unsigned char *vstart = NULL; unsigned int order, area_order; bool retried = false; int rc = 0; if (io_tlb_default_mem.nslabs) return 0; if (swiotlb_force_disable) return 0; io_tlb_default_mem.force_bounce = swiotlb_force_bounce; #ifdef CONFIG_SWIOTLB_DYNAMIC if (!remap) io_tlb_default_mem.can_grow = true; if (IS_ENABLED(CONFIG_ZONE_DMA) && (gfp_mask & __GFP_DMA)) io_tlb_default_mem.phys_limit = DMA_BIT_MASK(zone_dma_bits); else if (IS_ENABLED(CONFIG_ZONE_DMA32) && (gfp_mask & __GFP_DMA32)) io_tlb_default_mem.phys_limit = DMA_BIT_MASK(32); else io_tlb_default_mem.phys_limit = virt_to_phys(high_memory - 1); #endif if (!default_nareas) swiotlb_adjust_nareas(num_possible_cpus()); retry: order = get_order(nslabs << IO_TLB_SHIFT); nslabs = SLABS_PER_PAGE << order; while ((SLABS_PER_PAGE << order) > IO_TLB_MIN_SLABS) { vstart = (void *)__get_free_pages(gfp_mask | __GFP_NOWARN, order); if (vstart) break; order--; nslabs = SLABS_PER_PAGE << order; retried = true; } if (!vstart) return -ENOMEM; if (remap) rc = remap(vstart, nslabs); if (rc) { free_pages((unsigned long)vstart, order); nslabs = ALIGN(nslabs >> 1, IO_TLB_SEGSIZE); if (nslabs < IO_TLB_MIN_SLABS) return rc; retried = true; goto retry; } if (retried) { pr_warn("only able to allocate %ld MB\n", (PAGE_SIZE << order) >> 20); } nareas = limit_nareas(default_nareas, nslabs); area_order = get_order(array_size(sizeof(*mem->areas), nareas)); mem->areas = (struct io_tlb_area *) __get_free_pages(GFP_KERNEL | __GFP_ZERO, area_order); if (!mem->areas) goto error_area; mem->slots = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, get_order(array_size(sizeof(*mem->slots), nslabs))); if (!mem->slots) goto error_slots; set_memory_decrypted((unsigned long)vstart, (nslabs << IO_TLB_SHIFT) >> PAGE_SHIFT); swiotlb_init_io_tlb_pool(mem, virt_to_phys(vstart), nslabs, true, nareas); add_mem_pool(&io_tlb_default_mem, mem); swiotlb_print_info(); return 0; error_slots: free_pages((unsigned long)mem->areas, area_order); error_area: free_pages((unsigned long)vstart, order); return -ENOMEM; } void __init swiotlb_exit(void) { struct io_tlb_pool *mem = &io_tlb_default_mem.defpool; unsigned long tbl_vaddr; size_t tbl_size, slots_size; unsigned int area_order; if (swiotlb_force_bounce) return; if (!mem->nslabs) return; pr_info("tearing down default memory pool\n"); tbl_vaddr = (unsigned long)phys_to_virt(mem->start); tbl_size = PAGE_ALIGN(mem->end - mem->start); slots_size = PAGE_ALIGN(array_size(sizeof(*mem->slots), mem->nslabs)); set_memory_encrypted(tbl_vaddr, tbl_size >> PAGE_SHIFT); if (mem->late_alloc) { area_order = get_order(array_size(sizeof(*mem->areas), mem->nareas)); free_pages((unsigned long)mem->areas, area_order); free_pages(tbl_vaddr, get_order(tbl_size)); free_pages((unsigned long)mem->slots, get_order(slots_size)); } else { memblock_free_late(__pa(mem->areas), array_size(sizeof(*mem->areas), mem->nareas)); memblock_free_late(mem->start, tbl_size); memblock_free_late(__pa(mem->slots), slots_size); } memset(mem, 0, sizeof(*mem)); } #ifdef CONFIG_SWIOTLB_DYNAMIC /** * alloc_dma_pages() - allocate pages to be used for DMA * @gfp: GFP flags for the allocation. * @bytes: Size of the buffer. * @phys_limit: Maximum allowed physical address of the buffer. * * Allocate pages from the buddy allocator. If successful, make the allocated * pages decrypted that they can be used for DMA. * * Return: Decrypted pages, %NULL on allocation failure, or ERR_PTR(-EAGAIN) * if the allocated physical address was above @phys_limit. */ static struct page *alloc_dma_pages(gfp_t gfp, size_t bytes, u64 phys_limit) { unsigned int order = get_order(bytes); struct page *page; phys_addr_t paddr; void *vaddr; page = alloc_pages(gfp, order); if (!page) return NULL; paddr = page_to_phys(page); if (paddr + bytes - 1 > phys_limit) { __free_pages(page, order); return ERR_PTR(-EAGAIN); } vaddr = phys_to_virt(paddr); if (set_memory_decrypted((unsigned long)vaddr, PFN_UP(bytes))) goto error; return page; error: /* Intentional leak if pages cannot be encrypted again. */ if (!set_memory_encrypted((unsigned long)vaddr, PFN_UP(bytes))) __free_pages(page, order); return NULL; } /** * swiotlb_alloc_tlb() - allocate a dynamic IO TLB buffer * @dev: Device for which a memory pool is allocated. * @bytes: Size of the buffer. * @phys_limit: Maximum allowed physical address of the buffer. * @gfp: GFP flags for the allocation. * * Return: Allocated pages, or %NULL on allocation failure. */ static struct page *swiotlb_alloc_tlb(struct device *dev, size_t bytes, u64 phys_limit, gfp_t gfp) { struct page *page; /* * Allocate from the atomic pools if memory is encrypted and * the allocation is atomic, because decrypting may block. */ if (!gfpflags_allow_blocking(gfp) && dev && force_dma_unencrypted(dev)) { void *vaddr; if (!IS_ENABLED(CONFIG_DMA_COHERENT_POOL)) return NULL; return dma_alloc_from_pool(dev, bytes, &vaddr, gfp, dma_coherent_ok); } gfp &= ~GFP_ZONEMASK; if (phys_limit <= DMA_BIT_MASK(zone_dma_bits)) gfp |= __GFP_DMA; else if (phys_limit <= DMA_BIT_MASK(32)) gfp |= __GFP_DMA32; while (IS_ERR(page = alloc_dma_pages(gfp, bytes, phys_limit))) { if (IS_ENABLED(CONFIG_ZONE_DMA32) && phys_limit < DMA_BIT_MASK(64) && !(gfp & (__GFP_DMA32 | __GFP_DMA))) gfp |= __GFP_DMA32; else if (IS_ENABLED(CONFIG_ZONE_DMA) && !(gfp & __GFP_DMA)) gfp = (gfp & ~__GFP_DMA32) | __GFP_DMA; else return NULL; } return page; } /** * swiotlb_free_tlb() - free a dynamically allocated IO TLB buffer * @vaddr: Virtual address of the buffer. * @bytes: Size of the buffer. */ static void swiotlb_free_tlb(void *vaddr, size_t bytes) { if (IS_ENABLED(CONFIG_DMA_COHERENT_POOL) && dma_free_from_pool(NULL, vaddr, bytes)) return; /* Intentional leak if pages cannot be encrypted again. */ if (!set_memory_encrypted((unsigned long)vaddr, PFN_UP(bytes))) __free_pages(virt_to_page(vaddr), get_order(bytes)); } /** * swiotlb_alloc_pool() - allocate a new IO TLB memory pool * @dev: Device for which a memory pool is allocated. * @minslabs: Minimum number of slabs. * @nslabs: Desired (maximum) number of slabs. * @nareas: Number of areas. * @phys_limit: Maximum DMA buffer physical address. * @gfp: GFP flags for the allocations. * * Allocate and initialize a new IO TLB memory pool. The actual number of * slabs may be reduced if allocation of @nslabs fails. If even * @minslabs cannot be allocated, this function fails. * * Return: New memory pool, or %NULL on allocation failure. */ static struct io_tlb_pool *swiotlb_alloc_pool(struct device *dev, unsigned long minslabs, unsigned long nslabs, unsigned int nareas, u64 phys_limit, gfp_t gfp) { struct io_tlb_pool *pool; unsigned int slot_order; struct page *tlb; size_t pool_size; size_t tlb_size; if (nslabs > SLABS_PER_PAGE << MAX_PAGE_ORDER) { nslabs = SLABS_PER_PAGE << MAX_PAGE_ORDER; nareas = limit_nareas(nareas, nslabs); } pool_size = sizeof(*pool) + array_size(sizeof(*pool->areas), nareas); pool = kzalloc(pool_size, gfp); if (!pool) goto error; pool->areas = (void *)pool + sizeof(*pool); tlb_size = nslabs << IO_TLB_SHIFT; while (!(tlb = swiotlb_alloc_tlb(dev, tlb_size, phys_limit, gfp))) { if (nslabs <= minslabs) goto error_tlb; nslabs = ALIGN(nslabs >> 1, IO_TLB_SEGSIZE); nareas = limit_nareas(nareas, nslabs); tlb_size = nslabs << IO_TLB_SHIFT; } slot_order = get_order(array_size(sizeof(*pool->slots), nslabs)); pool->slots = (struct io_tlb_slot *) __get_free_pages(gfp, slot_order); if (!pool->slots) goto error_slots; swiotlb_init_io_tlb_pool(pool, page_to_phys(tlb), nslabs, true, nareas); return pool; error_slots: swiotlb_free_tlb(page_address(tlb), tlb_size); error_tlb: kfree(pool); error: return NULL; } /** * swiotlb_dyn_alloc() - dynamic memory pool allocation worker * @work: Pointer to dyn_alloc in struct io_tlb_mem. */ static void swiotlb_dyn_alloc(struct work_struct *work) { struct io_tlb_mem *mem = container_of(work, struct io_tlb_mem, dyn_alloc); struct io_tlb_pool *pool; pool = swiotlb_alloc_pool(NULL, IO_TLB_MIN_SLABS, default_nslabs, default_nareas, mem->phys_limit, GFP_KERNEL); if (!pool) { pr_warn_ratelimited("Failed to allocate new pool"); return; } add_mem_pool(mem, pool); } /** * swiotlb_dyn_free() - RCU callback to free a memory pool * @rcu: RCU head in the corresponding struct io_tlb_pool. */ static void swiotlb_dyn_free(struct rcu_head *rcu) { struct io_tlb_pool *pool = container_of(rcu, struct io_tlb_pool, rcu); size_t slots_size = array_size(sizeof(*pool->slots), pool->nslabs); size_t tlb_size = pool->end - pool->start; free_pages((unsigned long)pool->slots, get_order(slots_size)); swiotlb_free_tlb(pool->vaddr, tlb_size); kfree(pool); } /** * swiotlb_find_pool() - find the IO TLB pool for a physical address * @dev: Device which has mapped the DMA buffer. * @paddr: Physical address within the DMA buffer. * * Find the IO TLB memory pool descriptor which contains the given physical * address, if any. * * Return: Memory pool which contains @paddr, or %NULL if none. */ struct io_tlb_pool *swiotlb_find_pool(struct device *dev, phys_addr_t paddr) { struct io_tlb_mem *mem = dev->dma_io_tlb_mem; struct io_tlb_pool *pool; rcu_read_lock(); list_for_each_entry_rcu(pool, &mem->pools, node) { if (paddr >= pool->start && paddr < pool->end) goto out; } list_for_each_entry_rcu(pool, &dev->dma_io_tlb_pools, node) { if (paddr >= pool->start && paddr < pool->end) goto out; } pool = NULL; out: rcu_read_unlock(); return pool; } /** * swiotlb_del_pool() - remove an IO TLB pool from a device * @dev: Owning device. * @pool: Memory pool to be removed. */ static void swiotlb_del_pool(struct device *dev, struct io_tlb_pool *pool) { unsigned long flags; spin_lock_irqsave(&dev->dma_io_tlb_lock, flags); list_del_rcu(&pool->node); spin_unlock_irqrestore(&dev->dma_io_tlb_lock, flags); call_rcu(&pool->rcu, swiotlb_dyn_free); } #endif /* CONFIG_SWIOTLB_DYNAMIC */ /** * swiotlb_dev_init() - initialize swiotlb fields in &struct device * @dev: Device to be initialized. */ void swiotlb_dev_init(struct device *dev) { dev->dma_io_tlb_mem = &io_tlb_default_mem; #ifdef CONFIG_SWIOTLB_DYNAMIC INIT_LIST_HEAD(&dev->dma_io_tlb_pools); spin_lock_init(&dev->dma_io_tlb_lock); dev->dma_uses_io_tlb = false; #endif } /* * Return the offset into a iotlb slot required to keep the device happy. */ static unsigned int swiotlb_align_offset(struct device *dev, u64 addr) { return addr & dma_get_min_align_mask(dev) & (IO_TLB_SIZE - 1); } /* * Bounce: copy the swiotlb buffer from or back to the original dma location */ static void swiotlb_bounce(struct device *dev, phys_addr_t tlb_addr, size_t size, enum dma_data_direction dir) { struct io_tlb_pool *mem = swiotlb_find_pool(dev, tlb_addr); int index = (tlb_addr - mem->start) >> IO_TLB_SHIFT; phys_addr_t orig_addr = mem->slots[index].orig_addr; size_t alloc_size = mem->slots[index].alloc_size; unsigned long pfn = PFN_DOWN(orig_addr); unsigned char *vaddr = mem->vaddr + tlb_addr - mem->start; unsigned int tlb_offset, orig_addr_offset; if (orig_addr == INVALID_PHYS_ADDR) return; tlb_offset = tlb_addr & (IO_TLB_SIZE - 1); orig_addr_offset = swiotlb_align_offset(dev, orig_addr); if (tlb_offset < orig_addr_offset) { dev_WARN_ONCE(dev, 1, "Access before mapping start detected. orig offset %u, requested offset %u.\n", orig_addr_offset, tlb_offset); return; } tlb_offset -= orig_addr_offset; if (tlb_offset > alloc_size) { dev_WARN_ONCE(dev, 1, "Buffer overflow detected. Allocation size: %zu. Mapping size: %zu+%u.\n", alloc_size, size, tlb_offset); return; } orig_addr += tlb_offset; alloc_size -= tlb_offset; if (size > alloc_size) { dev_WARN_ONCE(dev, 1, "Buffer overflow detected. Allocation size: %zu. Mapping size: %zu.\n", alloc_size, size); size = alloc_size; } if (PageHighMem(pfn_to_page(pfn))) { unsigned int offset = orig_addr & ~PAGE_MASK; struct page *page; unsigned int sz = 0; unsigned long flags; while (size) { sz = min_t(size_t, PAGE_SIZE - offset, size); local_irq_save(flags); page = pfn_to_page(pfn); if (dir == DMA_TO_DEVICE) memcpy_from_page(vaddr, page, offset, sz); else memcpy_to_page(page, offset, vaddr, sz); local_irq_restore(flags); size -= sz; pfn++; vaddr += sz; offset = 0; } } else if (dir == DMA_TO_DEVICE) { memcpy(vaddr, phys_to_virt(orig_addr), size); } else { memcpy(phys_to_virt(orig_addr), vaddr, size); } } static inline phys_addr_t slot_addr(phys_addr_t start, phys_addr_t idx) { return start + (idx << IO_TLB_SHIFT); } /* * Carefully handle integer overflow which can occur when boundary_mask == ~0UL. */ static inline unsigned long get_max_slots(unsigned long boundary_mask) { return (boundary_mask >> IO_TLB_SHIFT) + 1; } static unsigned int wrap_area_index(struct io_tlb_pool *mem, unsigned int index) { if (index >= mem->area_nslabs) return 0; return index; } /* * Track the total used slots with a global atomic value in order to have * correct information to determine the high water mark. The mem_used() * function gives imprecise results because there's no locking across * multiple areas. */ #ifdef CONFIG_DEBUG_FS static void inc_used_and_hiwater(struct io_tlb_mem *mem, unsigned int nslots) { unsigned long old_hiwater, new_used; new_used = atomic_long_add_return(nslots, &mem->total_used); old_hiwater = atomic_long_read(&mem->used_hiwater); do { if (new_used <= old_hiwater) break; } while (!atomic_long_try_cmpxchg(&mem->used_hiwater, &old_hiwater, new_used)); } static void dec_used(struct io_tlb_mem *mem, unsigned int nslots) { atomic_long_sub(nslots, &mem->total_used); } #else /* !CONFIG_DEBUG_FS */ static void inc_used_and_hiwater(struct io_tlb_mem *mem, unsigned int nslots) { } static void dec_used(struct io_tlb_mem *mem, unsigned int nslots) { } #endif /* CONFIG_DEBUG_FS */ /** * swiotlb_area_find_slots() - search for slots in one IO TLB memory area * @dev: Device which maps the buffer. * @pool: Memory pool to be searched. * @area_index: Index of the IO TLB memory area to be searched. * @orig_addr: Original (non-bounced) IO buffer address. * @alloc_size: Total requested size of the bounce buffer, * including initial alignment padding. * @alloc_align_mask: Required alignment of the allocated buffer. * * Find a suitable sequence of IO TLB entries for the request and allocate * a buffer from the given IO TLB memory area. * This function takes care of locking. * * Return: Index of the first allocated slot, or -1 on error. */ static int swiotlb_area_find_slots(struct device *dev, struct io_tlb_pool *pool, int area_index, phys_addr_t orig_addr, size_t alloc_size, unsigned int alloc_align_mask) { struct io_tlb_area *area = pool->areas + area_index; unsigned long boundary_mask = dma_get_seg_boundary(dev); dma_addr_t tbl_dma_addr = phys_to_dma_unencrypted(dev, pool->start) & boundary_mask; unsigned long max_slots = get_max_slots(boundary_mask); unsigned int iotlb_align_mask = dma_get_min_align_mask(dev) | alloc_align_mask; unsigned int nslots = nr_slots(alloc_size), stride; unsigned int offset = swiotlb_align_offset(dev, orig_addr); unsigned int index, slots_checked, count = 0, i; unsigned long flags; unsigned int slot_base; unsigned int slot_index; BUG_ON(!nslots); BUG_ON(area_index >= pool->nareas); /* * For allocations of PAGE_SIZE or larger only look for page aligned * allocations. */ if (alloc_size >= PAGE_SIZE) iotlb_align_mask |= ~PAGE_MASK; iotlb_align_mask &= ~(IO_TLB_SIZE - 1); /* * For mappings with an alignment requirement don't bother looping to * unaligned slots once we found an aligned one. */ stride = (iotlb_align_mask >> IO_TLB_SHIFT) + 1; spin_lock_irqsave(&area->lock, flags); if (unlikely(nslots > pool->area_nslabs - area->used)) goto not_found; slot_base = area_index * pool->area_nslabs; index = area->index; for (slots_checked = 0; slots_checked < pool->area_nslabs; ) { slot_index = slot_base + index; if (orig_addr && (slot_addr(tbl_dma_addr, slot_index) & iotlb_align_mask) != (orig_addr & iotlb_align_mask)) { index = wrap_area_index(pool, index + 1); slots_checked++; continue; } if (!iommu_is_span_boundary(slot_index, nslots, nr_slots(tbl_dma_addr), max_slots)) { if (pool->slots[slot_index].list >= nslots) goto found; } index = wrap_area_index(pool, index + stride); slots_checked += stride; } not_found: spin_unlock_irqrestore(&area->lock, flags); return -1; found: /* * If we find a slot that indicates we have 'nslots' number of * contiguous buffers, we allocate the buffers from that slot onwards * and set the list of free entries to '0' indicating unavailable. */ for (i = slot_index; i < slot_index + nslots; i++) { pool->slots[i].list = 0; pool->slots[i].alloc_size = alloc_size - (offset + ((i - slot_index) << IO_TLB_SHIFT)); } for (i = slot_index - 1; io_tlb_offset(i) != IO_TLB_SEGSIZE - 1 && pool->slots[i].list; i--) pool->slots[i].list = ++count; /* * Update the indices to avoid searching in the next round. */ area->index = wrap_area_index(pool, index + nslots); area->used += nslots; spin_unlock_irqrestore(&area->lock, flags); inc_used_and_hiwater(dev->dma_io_tlb_mem, nslots); return slot_index; } /** * swiotlb_pool_find_slots() - search for slots in one memory pool * @dev: Device which maps the buffer. * @pool: Memory pool to be searched. * @orig_addr: Original (non-bounced) IO buffer address. * @alloc_size: Total requested size of the bounce buffer, * including initial alignment padding. * @alloc_align_mask: Required alignment of the allocated buffer. * * Search through one memory pool to find a sequence of slots that match the * allocation constraints. * * Return: Index of the first allocated slot, or -1 on error. */ static int swiotlb_pool_find_slots(struct device *dev, struct io_tlb_pool *pool, phys_addr_t orig_addr, size_t alloc_size, unsigned int alloc_align_mask) { int start = raw_smp_processor_id() & (pool->nareas - 1); int i = start, index; do { index = swiotlb_area_find_slots(dev, pool, i, orig_addr, alloc_size, alloc_align_mask); if (index >= 0) return index; if (++i >= pool->nareas) i = 0; } while (i != start); return -1; } #ifdef CONFIG_SWIOTLB_DYNAMIC /** * swiotlb_find_slots() - search for slots in the whole swiotlb * @dev: Device which maps the buffer. * @orig_addr: Original (non-bounced) IO buffer address. * @alloc_size: Total requested size of the bounce buffer, * including initial alignment padding. * @alloc_align_mask: Required alignment of the allocated buffer. * @retpool: Used memory pool, updated on return. * * Search through the whole software IO TLB to find a sequence of slots that * match the allocation constraints. * * Return: Index of the first allocated slot, or -1 on error. */ static int swiotlb_find_slots(struct device *dev, phys_addr_t orig_addr, size_t alloc_size, unsigned int alloc_align_mask, struct io_tlb_pool **retpool) { struct io_tlb_mem *mem = dev->dma_io_tlb_mem; struct io_tlb_pool *pool; unsigned long nslabs; unsigned long flags; u64 phys_limit; int index; rcu_read_lock(); list_for_each_entry_rcu(pool, &mem->pools, node) { index = swiotlb_pool_find_slots(dev, pool, orig_addr, alloc_size, alloc_align_mask); if (index >= 0) { rcu_read_unlock(); goto found; } } rcu_read_unlock(); if (!mem->can_grow) return -1; schedule_work(&mem->dyn_alloc); nslabs = nr_slots(alloc_size); phys_limit = min_not_zero(*dev->dma_mask, dev->bus_dma_limit); pool = swiotlb_alloc_pool(dev, nslabs, nslabs, 1, phys_limit, GFP_NOWAIT | __GFP_NOWARN); if (!pool) return -1; index = swiotlb_pool_find_slots(dev, pool, orig_addr, alloc_size, alloc_align_mask); if (index < 0) { swiotlb_dyn_free(&pool->rcu); return -1; } pool->transient = true; spin_lock_irqsave(&dev->dma_io_tlb_lock, flags); list_add_rcu(&pool->node, &dev->dma_io_tlb_pools); spin_unlock_irqrestore(&dev->dma_io_tlb_lock, flags); found: WRITE_ONCE(dev->dma_uses_io_tlb, true); /* * The general barrier orders reads and writes against a presumed store * of the SWIOTLB buffer address by a device driver (to a driver private * data structure). It serves two purposes. * * First, the store to dev->dma_uses_io_tlb must be ordered before the * presumed store. This guarantees that the returned buffer address * cannot be passed to another CPU before updating dev->dma_uses_io_tlb. * * Second, the load from mem->pools must be ordered before the same * presumed store. This guarantees that the returned buffer address * cannot be observed by another CPU before an update of the RCU list * that was made by swiotlb_dyn_alloc() on a third CPU (cf. multicopy * atomicity). * * See also the comment in is_swiotlb_buffer(). */ smp_mb(); *retpool = pool; return index; } #else /* !CONFIG_SWIOTLB_DYNAMIC */ static int swiotlb_find_slots(struct device *dev, phys_addr_t orig_addr, size_t alloc_size, unsigned int alloc_align_mask, struct io_tlb_pool **retpool) { *retpool = &dev->dma_io_tlb_mem->defpool; return swiotlb_pool_find_slots(dev, *retpool, orig_addr, alloc_size, alloc_align_mask); } #endif /* CONFIG_SWIOTLB_DYNAMIC */ #ifdef CONFIG_DEBUG_FS /** * mem_used() - get number of used slots in an allocator * @mem: Software IO TLB allocator. * * The result is accurate in this version of the function, because an atomic * counter is available if CONFIG_DEBUG_FS is set. * * Return: Number of used slots. */ static unsigned long mem_used(struct io_tlb_mem *mem) { return atomic_long_read(&mem->total_used); } #else /* !CONFIG_DEBUG_FS */ /** * mem_pool_used() - get number of used slots in a memory pool * @pool: Software IO TLB memory pool. * * The result is not accurate, see mem_used(). * * Return: Approximate number of used slots. */ static unsigned long mem_pool_used(struct io_tlb_pool *pool) { int i; unsigned long used = 0; for (i = 0; i < pool->nareas; i++) used += pool->areas[i].used; return used; } /** * mem_used() - get number of used slots in an allocator * @mem: Software IO TLB allocator. * * The result is not accurate, because there is no locking of individual * areas. * * Return: Approximate number of used slots. */ static unsigned long mem_used(struct io_tlb_mem *mem) { #ifdef CONFIG_SWIOTLB_DYNAMIC struct io_tlb_pool *pool; unsigned long used = 0; rcu_read_lock(); list_for_each_entry_rcu(pool, &mem->pools, node) used += mem_pool_used(pool); rcu_read_unlock(); return used; #else return mem_pool_used(&mem->defpool); #endif } #endif /* CONFIG_DEBUG_FS */ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr, size_t mapping_size, size_t alloc_size, unsigned int alloc_align_mask, enum dma_data_direction dir, unsigned long attrs) { struct io_tlb_mem *mem = dev->dma_io_tlb_mem; unsigned int offset = swiotlb_align_offset(dev, orig_addr); struct io_tlb_pool *pool; unsigned int i; int index; phys_addr_t tlb_addr; if (!mem || !mem->nslabs) { dev_warn_ratelimited(dev, "Can not allocate SWIOTLB buffer earlier and can't now provide you with the DMA bounce buffer"); return (phys_addr_t)DMA_MAPPING_ERROR; } if (cc_platform_has(CC_ATTR_MEM_ENCRYPT)) pr_warn_once("Memory encryption is active and system is using DMA bounce buffers\n"); if (mapping_size > alloc_size) { dev_warn_once(dev, "Invalid sizes (mapping: %zd bytes, alloc: %zd bytes)", mapping_size, alloc_size); return (phys_addr_t)DMA_MAPPING_ERROR; } index = swiotlb_find_slots(dev, orig_addr, alloc_size + offset, alloc_align_mask, &pool); if (index == -1) { if (!(attrs & DMA_ATTR_NO_WARN)) dev_warn_ratelimited(dev, "swiotlb buffer is full (sz: %zd bytes), total %lu (slots), used %lu (slots)\n", alloc_size, mem->nslabs, mem_used(mem)); return (phys_addr_t)DMA_MAPPING_ERROR; } /* * Save away the mapping from the original address to the DMA address. * This is needed when we sync the memory. Then we sync the buffer if * needed. */ for (i = 0; i < nr_slots(alloc_size + offset); i++) pool->slots[index + i].orig_addr = slot_addr(orig_addr, i); tlb_addr = slot_addr(pool->start, index) + offset; /* * When the device is writing memory, i.e. dir == DMA_FROM_DEVICE, copy * the original buffer to the TLB buffer before initiating DMA in order * to preserve the original's data if the device does a partial write, * i.e. if the device doesn't overwrite the entire buffer. Preserving * the original data, even if it's garbage, is necessary to match * hardware behavior. Use of swiotlb is supposed to be transparent, * i.e. swiotlb must not corrupt memory by clobbering unwritten bytes. */ swiotlb_bounce(dev, tlb_addr, mapping_size, DMA_TO_DEVICE); return tlb_addr; } static void swiotlb_release_slots(struct device *dev, phys_addr_t tlb_addr) { struct io_tlb_pool *mem = swiotlb_find_pool(dev, tlb_addr); unsigned long flags; unsigned int offset = swiotlb_align_offset(dev, tlb_addr); int index = (tlb_addr - offset - mem->start) >> IO_TLB_SHIFT; int nslots = nr_slots(mem->slots[index].alloc_size + offset); int aindex = index / mem->area_nslabs; struct io_tlb_area *area = &mem->areas[aindex]; int count, i; /* * Return the buffer to the free list by setting the corresponding * entries to indicate the number of contiguous entries available. * While returning the entries to the free list, we merge the entries * with slots below and above the pool being returned. */ BUG_ON(aindex >= mem->nareas); spin_lock_irqsave(&area->lock, flags); if (index + nslots < ALIGN(index + 1, IO_TLB_SEGSIZE)) count = mem->slots[index + nslots].list; else count = 0; /* * Step 1: return the slots to the free list, merging the slots with * superceeding slots */ for (i = index + nslots - 1; i >= index; i--) { mem->slots[i].list = ++count; mem->slots[i].orig_addr = INVALID_PHYS_ADDR; mem->slots[i].alloc_size = 0; } /* * Step 2: merge the returned slots with the preceding slots, if * available (non zero) */ for (i = index - 1; io_tlb_offset(i) != IO_TLB_SEGSIZE - 1 && mem->slots[i].list; i--) mem->slots[i].list = ++count; area->used -= nslots; spin_unlock_irqrestore(&area->lock, flags); dec_used(dev->dma_io_tlb_mem, nslots); } #ifdef CONFIG_SWIOTLB_DYNAMIC /** * swiotlb_del_transient() - delete a transient memory pool * @dev: Device which mapped the buffer. * @tlb_addr: Physical address within a bounce buffer. * * Check whether the address belongs to a transient SWIOTLB memory pool. * If yes, then delete the pool. * * Return: %true if @tlb_addr belonged to a transient pool that was released. */ static bool swiotlb_del_transient(struct device *dev, phys_addr_t tlb_addr) { struct io_tlb_pool *pool; pool = swiotlb_find_pool(dev, tlb_addr); if (!pool->transient) return false; dec_used(dev->dma_io_tlb_mem, pool->nslabs); swiotlb_del_pool(dev, pool); return true; } #else /* !CONFIG_SWIOTLB_DYNAMIC */ static inline bool swiotlb_del_transient(struct device *dev, phys_addr_t tlb_addr) { return false; } #endif /* CONFIG_SWIOTLB_DYNAMIC */ /* * tlb_addr is the physical address of the bounce buffer to unmap. */ void swiotlb_tbl_unmap_single(struct device *dev, phys_addr_t tlb_addr, size_t mapping_size, enum dma_data_direction dir, unsigned long attrs) { /* * First, sync the memory before unmapping the entry */ if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) && (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)) swiotlb_bounce(dev, tlb_addr, mapping_size, DMA_FROM_DEVICE); if (swiotlb_del_transient(dev, tlb_addr)) return; swiotlb_release_slots(dev, tlb_addr); } void swiotlb_sync_single_for_device(struct device *dev, phys_addr_t tlb_addr, size_t size, enum dma_data_direction dir) { if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL) swiotlb_bounce(dev, tlb_addr, size, DMA_TO_DEVICE); else BUG_ON(dir != DMA_FROM_DEVICE); } void swiotlb_sync_single_for_cpu(struct device *dev, phys_addr_t tlb_addr, size_t size, enum dma_data_direction dir) { if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL) swiotlb_bounce(dev, tlb_addr, size, DMA_FROM_DEVICE); else BUG_ON(dir != DMA_TO_DEVICE); } /* * Create a swiotlb mapping for the buffer at @paddr, and in case of DMAing * to the device copy the data into it as well. */ dma_addr_t swiotlb_map(struct device *dev, phys_addr_t paddr, size_t size, enum dma_data_direction dir, unsigned long attrs) { phys_addr_t swiotlb_addr; dma_addr_t dma_addr; trace_swiotlb_bounced(dev, phys_to_dma(dev, paddr), size); swiotlb_addr = swiotlb_tbl_map_single(dev, paddr, size, size, 0, dir, attrs); if (swiotlb_addr == (phys_addr_t)DMA_MAPPING_ERROR) return DMA_MAPPING_ERROR; /* Ensure that the address returned is DMA'ble */ dma_addr = phys_to_dma_unencrypted(dev, swiotlb_addr); if (unlikely(!dma_capable(dev, dma_addr, size, true))) { swiotlb_tbl_unmap_single(dev, swiotlb_addr, size, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC); dev_WARN_ONCE(dev, 1, "swiotlb addr %pad+%zu overflow (mask %llx, bus limit %llx).\n", &dma_addr, size, *dev->dma_mask, dev->bus_dma_limit); return DMA_MAPPING_ERROR; } if (!dev_is_dma_coherent(dev) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) arch_sync_dma_for_device(swiotlb_addr, size, dir); return dma_addr; } size_t swiotlb_max_mapping_size(struct device *dev) { int min_align_mask = dma_get_min_align_mask(dev); int min_align = 0; /* * swiotlb_find_slots() skips slots according to * min align mask. This affects max mapping size. * Take it into acount here. */ if (min_align_mask) min_align = roundup(min_align_mask, IO_TLB_SIZE); return ((size_t)IO_TLB_SIZE) * IO_TLB_SEGSIZE - min_align; } /** * is_swiotlb_allocated() - check if the default software IO TLB is initialized */ bool is_swiotlb_allocated(void) { return io_tlb_default_mem.nslabs; } bool is_swiotlb_active(struct device *dev) { struct io_tlb_mem *mem = dev->dma_io_tlb_mem; return mem && mem->nslabs; } /** * default_swiotlb_base() - get the base address of the default SWIOTLB * * Get the lowest physical address used by the default software IO TLB pool. */ phys_addr_t default_swiotlb_base(void) { #ifdef CONFIG_SWIOTLB_DYNAMIC io_tlb_default_mem.can_grow = false; #endif return io_tlb_default_mem.defpool.start; } /** * default_swiotlb_limit() - get the address limit of the default SWIOTLB * * Get the highest physical address used by the default software IO TLB pool. */ phys_addr_t default_swiotlb_limit(void) { #ifdef CONFIG_SWIOTLB_DYNAMIC return io_tlb_default_mem.phys_limit; #else return io_tlb_default_mem.defpool.end - 1; #endif } #ifdef CONFIG_DEBUG_FS static int io_tlb_used_get(void *data, u64 *val) { struct io_tlb_mem *mem = data; *val = mem_used(mem); return 0; } static int io_tlb_hiwater_get(void *data, u64 *val) { struct io_tlb_mem *mem = data; *val = atomic_long_read(&mem->used_hiwater); return 0; } static int io_tlb_hiwater_set(void *data, u64 val) { struct io_tlb_mem *mem = data; /* Only allow setting to zero */ if (val != 0) return -EINVAL; atomic_long_set(&mem->used_hiwater, val); return 0; } DEFINE_DEBUGFS_ATTRIBUTE(fops_io_tlb_used, io_tlb_used_get, NULL, "%llu\n"); DEFINE_DEBUGFS_ATTRIBUTE(fops_io_tlb_hiwater, io_tlb_hiwater_get, io_tlb_hiwater_set, "%llu\n"); static void swiotlb_create_debugfs_files(struct io_tlb_mem *mem, const char *dirname) { atomic_long_set(&mem->total_used, 0); atomic_long_set(&mem->used_hiwater, 0); mem->debugfs = debugfs_create_dir(dirname, io_tlb_default_mem.debugfs); if (!mem->nslabs) return; debugfs_create_ulong("io_tlb_nslabs", 0400, mem->debugfs, &mem->nslabs); debugfs_create_file("io_tlb_used", 0400, mem->debugfs, mem, &fops_io_tlb_used); debugfs_create_file("io_tlb_used_hiwater", 0600, mem->debugfs, mem, &fops_io_tlb_hiwater); } static int __init swiotlb_create_default_debugfs(void) { swiotlb_create_debugfs_files(&io_tlb_default_mem, "swiotlb"); return 0; } late_initcall(swiotlb_create_default_debugfs); #else /* !CONFIG_DEBUG_FS */ static inline void swiotlb_create_debugfs_files(struct io_tlb_mem *mem, const char *dirname) { } #endif /* CONFIG_DEBUG_FS */ #ifdef CONFIG_DMA_RESTRICTED_POOL struct page *swiotlb_alloc(struct device *dev, size_t size) { struct io_tlb_mem *mem = dev->dma_io_tlb_mem; struct io_tlb_pool *pool; phys_addr_t tlb_addr; int index; if (!mem) return NULL; index = swiotlb_find_slots(dev, 0, size, 0, &pool); if (index == -1) return NULL; tlb_addr = slot_addr(pool->start, index); return pfn_to_page(PFN_DOWN(tlb_addr)); } bool swiotlb_free(struct device *dev, struct page *page, size_t size) { phys_addr_t tlb_addr = page_to_phys(page); if (!is_swiotlb_buffer(dev, tlb_addr)) return false; swiotlb_release_slots(dev, tlb_addr); return true; } static int rmem_swiotlb_device_init(struct reserved_mem *rmem, struct device *dev) { struct io_tlb_mem *mem = rmem->priv; unsigned long nslabs = rmem->size >> IO_TLB_SHIFT; /* Set Per-device io tlb area to one */ unsigned int nareas = 1; if (PageHighMem(pfn_to_page(PHYS_PFN(rmem->base)))) { dev_err(dev, "Restricted DMA pool must be accessible within the linear mapping."); return -EINVAL; } /* * Since multiple devices can share the same pool, the private data, * io_tlb_mem struct, will be initialized by the first device attached * to it. */ if (!mem) { struct io_tlb_pool *pool; mem = kzalloc(sizeof(*mem), GFP_KERNEL); if (!mem) return -ENOMEM; pool = &mem->defpool; pool->slots = kcalloc(nslabs, sizeof(*pool->slots), GFP_KERNEL); if (!pool->slots) { kfree(mem); return -ENOMEM; } pool->areas = kcalloc(nareas, sizeof(*pool->areas), GFP_KERNEL); if (!pool->areas) { kfree(pool->slots); kfree(mem); return -ENOMEM; } set_memory_decrypted((unsigned long)phys_to_virt(rmem->base), rmem->size >> PAGE_SHIFT); swiotlb_init_io_tlb_pool(pool, rmem->base, nslabs, false, nareas); mem->force_bounce = true; mem->for_alloc = true; #ifdef CONFIG_SWIOTLB_DYNAMIC spin_lock_init(&mem->lock); #endif add_mem_pool(mem, pool); rmem->priv = mem; swiotlb_create_debugfs_files(mem, rmem->name); } dev->dma_io_tlb_mem = mem; return 0; } static void rmem_swiotlb_device_release(struct reserved_mem *rmem, struct device *dev) { dev->dma_io_tlb_mem = &io_tlb_default_mem; } static const struct reserved_mem_ops rmem_swiotlb_ops = { .device_init = rmem_swiotlb_device_init, .device_release = rmem_swiotlb_device_release, }; static int __init rmem_swiotlb_setup(struct reserved_mem *rmem) { unsigned long node = rmem->fdt_node; if (of_get_flat_dt_prop(node, "reusable", NULL) || of_get_flat_dt_prop(node, "linux,cma-default", NULL) || of_get_flat_dt_prop(node, "linux,dma-default", NULL) || of_get_flat_dt_prop(node, "no-map", NULL)) return -EINVAL; rmem->ops = &rmem_swiotlb_ops; pr_info("Reserved memory: created restricted DMA pool at %pa, size %ld MiB\n", &rmem->base, (unsigned long)rmem->size / SZ_1M); return 0; } RESERVEDMEM_OF_DECLARE(dma, "restricted-dma-pool", rmem_swiotlb_setup); #endif /* CONFIG_DMA_RESTRICTED_POOL */ |
| 3842 3840 170 170 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 | // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com> * Copyright (C) 2002 Andi Kleen * * This handles calls from both 32bit and 64bit mode. * * Lock order: * context.ldt_usr_sem * mmap_lock * context.lock */ #include <linux/errno.h> #include <linux/gfp.h> #include <linux/sched.h> #include <linux/string.h> #include <linux/mm.h> #include <linux/smp.h> #include <linux/syscalls.h> #include <linux/slab.h> #include <linux/vmalloc.h> #include <linux/uaccess.h> #include <asm/ldt.h> #include <asm/tlb.h> #include <asm/desc.h> #include <asm/mmu_context.h> #include <asm/pgtable_areas.h> #include <xen/xen.h> /* This is a multiple of PAGE_SIZE. */ #define LDT_SLOT_STRIDE (LDT_ENTRIES * LDT_ENTRY_SIZE) static inline void *ldt_slot_va(int slot) { return (void *)(LDT_BASE_ADDR + LDT_SLOT_STRIDE * slot); } void load_mm_ldt(struct mm_struct *mm) { struct ldt_struct *ldt; /* READ_ONCE synchronizes with smp_store_release */ ldt = READ_ONCE(mm->context.ldt); /* * Any change to mm->context.ldt is followed by an IPI to all * CPUs with the mm active. The LDT will not be freed until * after the IPI is handled by all such CPUs. This means that * if the ldt_struct changes before we return, the values we see * will be safe, and the new values will be loaded before we run * any user code. * * NB: don't try to convert this to use RCU without extreme care. * We would still need IRQs off, because we don't want to change * the local LDT after an IPI loaded a newer value than the one * that we can see. */ if (unlikely(ldt)) { if (static_cpu_has(X86_FEATURE_PTI)) { if (WARN_ON_ONCE((unsigned long)ldt->slot > 1)) { /* * Whoops -- either the new LDT isn't mapped * (if slot == -1) or is mapped into a bogus * slot (if slot > 1). */ clear_LDT(); return; } /* * If page table isolation is enabled, ldt->entries * will not be mapped in the userspace pagetables. * Tell the CPU to access the LDT through the alias * at ldt_slot_va(ldt->slot). */ set_ldt(ldt_slot_va(ldt->slot), ldt->nr_entries); } else { set_ldt(ldt->entries, ldt->nr_entries); } } else { clear_LDT(); } } void switch_ldt(struct mm_struct *prev, struct mm_struct *next) { /* * Load the LDT if either the old or new mm had an LDT. * * An mm will never go from having an LDT to not having an LDT. Two * mms never share an LDT, so we don't gain anything by checking to * see whether the LDT changed. There's also no guarantee that * prev->context.ldt actually matches LDTR, but, if LDTR is non-NULL, * then prev->context.ldt will also be non-NULL. * * If we really cared, we could optimize the case where prev == next * and we're exiting lazy mode. Most of the time, if this happens, * we don't actually need to reload LDTR, but modify_ldt() is mostly * used by legacy code and emulators where we don't need this level of * performance. * * This uses | instead of || because it generates better code. */ if (unlikely((unsigned long)prev->context.ldt | (unsigned long)next->context.ldt)) load_mm_ldt(next); DEBUG_LOCKS_WARN_ON(preemptible()); } static void refresh_ldt_segments(void) { #ifdef CONFIG_X86_64 unsigned short sel; /* * Make sure that the cached DS and ES descriptors match the updated * LDT. */ savesegment(ds, sel); if ((sel & SEGMENT_TI_MASK) == SEGMENT_LDT) loadsegment(ds, sel); savesegment(es, sel); if ((sel & SEGMENT_TI_MASK) == SEGMENT_LDT) loadsegment(es, sel); #endif } /* context.lock is held by the task which issued the smp function call */ static void flush_ldt(void *__mm) { struct mm_struct *mm = __mm; if (this_cpu_read(cpu_tlbstate.loaded_mm) != mm) return; load_mm_ldt(mm); refresh_ldt_segments(); } /* The caller must call finalize_ldt_struct on the result. LDT starts zeroed. */ static struct ldt_struct *alloc_ldt_struct(unsigned int num_entries) { struct ldt_struct *new_ldt; unsigned int alloc_size; if (num_entries > LDT_ENTRIES) return NULL; new_ldt = kmalloc(sizeof(struct ldt_struct), GFP_KERNEL_ACCOUNT); if (!new_ldt) return NULL; BUILD_BUG_ON(LDT_ENTRY_SIZE != sizeof(struct desc_struct)); alloc_size = num_entries * LDT_ENTRY_SIZE; /* * Xen is very picky: it requires a page-aligned LDT that has no * trailing nonzero bytes in any page that contains LDT descriptors. * Keep it simple: zero the whole allocation and never allocate less * than PAGE_SIZE. */ if (alloc_size > PAGE_SIZE) new_ldt->entries = __vmalloc(alloc_size, GFP_KERNEL_ACCOUNT | __GFP_ZERO); else new_ldt->entries = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT); if (!new_ldt->entries) { kfree(new_ldt); return NULL; } /* The new LDT isn't aliased for PTI yet. */ new_ldt->slot = -1; new_ldt->nr_entries = num_entries; return new_ldt; } #ifdef CONFIG_PAGE_TABLE_ISOLATION static void do_sanity_check(struct mm_struct *mm, bool had_kernel_mapping, bool had_user_mapping) { if (mm->context.ldt) { /* * We already had an LDT. The top-level entry should already * have been allocated and synchronized with the usermode * tables. */ WARN_ON(!had_kernel_mapping); if (boot_cpu_has(X86_FEATURE_PTI)) WARN_ON(!had_user_mapping); } else { /* * This is the first time we're mapping an LDT for this process. * Sync the pgd to the usermode tables. */ WARN_ON(had_kernel_mapping); if (boot_cpu_has(X86_FEATURE_PTI)) WARN_ON(had_user_mapping); } } #ifdef CONFIG_X86_PAE static pmd_t *pgd_to_pmd_walk(pgd_t *pgd, unsigned long va) { p4d_t *p4d; pud_t *pud; if (pgd->pgd == 0) return NULL; p4d = p4d_offset(pgd, va); if (p4d_none(*p4d)) return NULL; pud = pud_offset(p4d, va); if (pud_none(*pud)) return NULL; return pmd_offset(pud, va); } static void map_ldt_struct_to_user(struct mm_struct *mm) { pgd_t *k_pgd = pgd_offset(mm, LDT_BASE_ADDR); pgd_t *u_pgd = kernel_to_user_pgdp(k_pgd); pmd_t *k_pmd, *u_pmd; k_pmd = pgd_to_pmd_walk(k_pgd, LDT_BASE_ADDR); u_pmd = pgd_to_pmd_walk(u_pgd, LDT_BASE_ADDR); if (boot_cpu_has(X86_FEATURE_PTI) && !mm->context.ldt) set_pmd(u_pmd, *k_pmd); } static void sanity_check_ldt_mapping(struct mm_struct *mm) { pgd_t *k_pgd = pgd_offset(mm, LDT_BASE_ADDR); pgd_t *u_pgd = kernel_to_user_pgdp(k_pgd); bool had_kernel, had_user; pmd_t *k_pmd, *u_pmd; k_pmd = pgd_to_pmd_walk(k_pgd, LDT_BASE_ADDR); u_pmd = pgd_to_pmd_walk(u_pgd, LDT_BASE_ADDR); had_kernel = (k_pmd->pmd != 0); had_user = (u_pmd->pmd != 0); do_sanity_check(mm, had_kernel, had_user); } #else /* !CONFIG_X86_PAE */ static void map_ldt_struct_to_user(struct mm_struct *mm) { pgd_t *pgd = pgd_offset(mm, LDT_BASE_ADDR); if (boot_cpu_has(X86_FEATURE_PTI) && !mm->context.ldt) set_pgd(kernel_to_user_pgdp(pgd), *pgd); } static void sanity_check_ldt_mapping(struct mm_struct *mm) { pgd_t *pgd = pgd_offset(mm, LDT_BASE_ADDR); bool had_kernel = (pgd->pgd != 0); bool had_user = (kernel_to_user_pgdp(pgd)->pgd != 0); do_sanity_check(mm, had_kernel, had_user); } #endif /* CONFIG_X86_PAE */ /* * If PTI is enabled, this maps the LDT into the kernelmode and * usermode tables for the given mm. */ static int map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot) { unsigned long va; bool is_vmalloc; spinlock_t *ptl; int i, nr_pages; if (!boot_cpu_has(X86_FEATURE_PTI)) return 0; /* * Any given ldt_struct should have map_ldt_struct() called at most * once. */ WARN_ON(ldt->slot != -1); /* Check if the current mappings are sane */ sanity_check_ldt_mapping(mm); is_vmalloc = is_vmalloc_addr(ldt->entries); nr_pages = DIV_ROUND_UP(ldt->nr_entries * LDT_ENTRY_SIZE, PAGE_SIZE); for (i = 0; i < nr_pages; i++) { unsigned long offset = i << PAGE_SHIFT; const void *src = (char *)ldt->entries + offset; unsigned long pfn; pgprot_t pte_prot; pte_t pte, *ptep; va = (unsigned long)ldt_slot_va(slot) + offset; pfn = is_vmalloc ? vmalloc_to_pfn(src) : page_to_pfn(virt_to_page(src)); /* * Treat the PTI LDT range as a *userspace* range. * get_locked_pte() will allocate all needed pagetables * and account for them in this mm. */ ptep = get_locked_pte(mm, va, &ptl); if (!ptep) return -ENOMEM; /* * Map it RO so the easy to find address is not a primary * target via some kernel interface which misses a * permission check. */ pte_prot = __pgprot(__PAGE_KERNEL_RO & ~_PAGE_GLOBAL); /* Filter out unsuppored __PAGE_KERNEL* bits: */ pgprot_val(pte_prot) &= __supported_pte_mask; pte = pfn_pte(pfn, pte_prot); set_pte_at(mm, va, ptep, pte); pte_unmap_unlock(ptep, ptl); } /* Propagate LDT mapping to the user page-table */ map_ldt_struct_to_user(mm); ldt->slot = slot; return 0; } static void unmap_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt) { unsigned long va; int i, nr_pages; if (!ldt) return; /* LDT map/unmap is only required for PTI */ if (!boot_cpu_has(X86_FEATURE_PTI)) return; nr_pages = DIV_ROUND_UP(ldt->nr_entries * LDT_ENTRY_SIZE, PAGE_SIZE); for (i = 0; i < nr_pages; i++) { unsigned long offset = i << PAGE_SHIFT; spinlock_t *ptl; pte_t *ptep; va = (unsigned long)ldt_slot_va(ldt->slot) + offset; ptep = get_locked_pte(mm, va, &ptl); if (!WARN_ON_ONCE(!ptep)) { pte_clear(mm, va, ptep); pte_unmap_unlock(ptep, ptl); } } va = (unsigned long)ldt_slot_va(ldt->slot); flush_tlb_mm_range(mm, va, va + nr_pages * PAGE_SIZE, PAGE_SHIFT, false); } #else /* !CONFIG_PAGE_TABLE_ISOLATION */ static int map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot) { return 0; } static void unmap_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt) { } #endif /* CONFIG_PAGE_TABLE_ISOLATION */ static void free_ldt_pgtables(struct mm_struct *mm) { #ifdef CONFIG_PAGE_TABLE_ISOLATION struct mmu_gather tlb; unsigned long start = LDT_BASE_ADDR; unsigned long end = LDT_END_ADDR; if (!boot_cpu_has(X86_FEATURE_PTI)) return; /* * Although free_pgd_range() is intended for freeing user * page-tables, it also works out for kernel mappings on x86. * We use tlb_gather_mmu_fullmm() to avoid confusing the * range-tracking logic in __tlb_adjust_range(). */ tlb_gather_mmu_fullmm(&tlb, mm); free_pgd_range(&tlb, start, end, start, end); tlb_finish_mmu(&tlb); #endif } /* After calling this, the LDT is immutable. */ static void finalize_ldt_struct(struct ldt_struct *ldt) { paravirt_alloc_ldt(ldt->entries, ldt->nr_entries); } static void install_ldt(struct mm_struct *mm, struct ldt_struct *ldt) { mutex_lock(&mm->context.lock); /* Synchronizes with READ_ONCE in load_mm_ldt. */ smp_store_release(&mm->context.ldt, ldt); /* Activate the LDT for all CPUs using currents mm. */ on_each_cpu_mask(mm_cpumask(mm), flush_ldt, mm, true); mutex_unlock(&mm->context.lock); } static void free_ldt_struct(struct ldt_struct *ldt) { if (likely(!ldt)) return; paravirt_free_ldt(ldt->entries, ldt->nr_entries); if (ldt->nr_entries * LDT_ENTRY_SIZE > PAGE_SIZE) vfree_atomic(ldt->entries); else free_page((unsigned long)ldt->entries); kfree(ldt); } /* * Called on fork from arch_dup_mmap(). Just copy the current LDT state, * the new task is not running, so nothing can be installed. */ int ldt_dup_context(struct mm_struct *old_mm, struct mm_struct *mm) { struct ldt_struct *new_ldt; int retval = 0; if (!old_mm) return 0; mutex_lock(&old_mm->context.lock); if (!old_mm->context.ldt) goto out_unlock; new_ldt = alloc_ldt_struct(old_mm->context.ldt->nr_entries); if (!new_ldt) { retval = -ENOMEM; goto out_unlock; } memcpy(new_ldt->entries, old_mm->context.ldt->entries, new_ldt->nr_entries * LDT_ENTRY_SIZE); finalize_ldt_struct(new_ldt); retval = map_ldt_struct(mm, new_ldt, 0); if (retval) { free_ldt_pgtables(mm); free_ldt_struct(new_ldt); goto out_unlock; } mm->context.ldt = new_ldt; out_unlock: mutex_unlock(&old_mm->context.lock); return retval; } /* * No need to lock the MM as we are the last user * * 64bit: Don't touch the LDT register - we're already in the next thread. */ void destroy_context_ldt(struct mm_struct *mm) { free_ldt_struct(mm->context.ldt); mm->context.ldt = NULL; } void ldt_arch_exit_mmap(struct mm_struct *mm) { free_ldt_pgtables(mm); } static int read_ldt(void __user *ptr, unsigned long bytecount) { struct mm_struct *mm = current->mm; unsigned long entries_size; int retval; down_read(&mm->context.ldt_usr_sem); if (!mm->context.ldt) { retval = 0; goto out_unlock; } if (bytecount > LDT_ENTRY_SIZE * LDT_ENTRIES) bytecount = LDT_ENTRY_SIZE * LDT_ENTRIES; entries_size = mm->context.ldt->nr_entries * LDT_ENTRY_SIZE; if (entries_size > bytecount) entries_size = bytecount; if (copy_to_user(ptr, mm->context.ldt->entries, entries_size)) { retval = -EFAULT; goto out_unlock; } if (entries_size != bytecount) { /* Zero-fill the rest and pretend we read bytecount bytes. */ if (clear_user(ptr + entries_size, bytecount - entries_size)) { retval = -EFAULT; goto out_unlock; } } retval = bytecount; out_unlock: up_read(&mm->context.ldt_usr_sem); return retval; } static int read_default_ldt(void __user *ptr, unsigned long bytecount) { /* CHECKME: Can we use _one_ random number ? */ #ifdef CONFIG_X86_32 unsigned long size = 5 * sizeof(struct desc_struct); #else unsigned long size = 128; #endif if (bytecount > size) bytecount = size; if (clear_user(ptr, bytecount)) return -EFAULT; return bytecount; } static bool allow_16bit_segments(void) { if (!IS_ENABLED(CONFIG_X86_16BIT)) return false; #ifdef CONFIG_XEN_PV /* * Xen PV does not implement ESPFIX64, which means that 16-bit * segments will not work correctly. Until either Xen PV implements * ESPFIX64 and can signal this fact to the guest or unless someone * provides compelling evidence that allowing broken 16-bit segments * is worthwhile, disallow 16-bit segments under Xen PV. */ if (xen_pv_domain()) { pr_info_once("Warning: 16-bit segments do not work correctly in a Xen PV guest\n"); return false; } #endif return true; } static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode) { struct mm_struct *mm = current->mm; struct ldt_struct *new_ldt, *old_ldt; unsigned int old_nr_entries, new_nr_entries; struct user_desc ldt_info; struct desc_struct ldt; int error; error = -EINVAL; if (bytecount != sizeof(ldt_info)) goto out; error = -EFAULT; if (copy_from_user(&ldt_info, ptr, sizeof(ldt_info))) goto out; error = -EINVAL; if (ldt_info.entry_number >= LDT_ENTRIES) goto out; if (ldt_info.contents == 3) { if (oldmode) goto out; if (ldt_info.seg_not_present == 0) goto out; } if ((oldmode && !ldt_info.base_addr && !ldt_info.limit) || LDT_empty(&ldt_info)) { /* The user wants to clear the entry. */ memset(&ldt, 0, sizeof(ldt)); } else { if (!ldt_info.seg_32bit && !allow_16bit_segments()) { error = -EINVAL; goto out; } fill_ldt(&ldt, &ldt_info); if (oldmode) ldt.avl = 0; } if (down_write_killable(&mm->context.ldt_usr_sem)) return -EINTR; old_ldt = mm->context.ldt; old_nr_entries = old_ldt ? old_ldt->nr_entries : 0; new_nr_entries = max(ldt_info.entry_number + 1, old_nr_entries); error = -ENOMEM; new_ldt = alloc_ldt_struct(new_nr_entries); if (!new_ldt) goto out_unlock; if (old_ldt) memcpy(new_ldt->entries, old_ldt->entries, old_nr_entries * LDT_ENTRY_SIZE); new_ldt->entries[ldt_info.entry_number] = ldt; finalize_ldt_struct(new_ldt); /* * If we are using PTI, map the new LDT into the userspace pagetables. * If there is already an LDT, use the other slot so that other CPUs * will continue to use the old LDT until install_ldt() switches * them over to the new LDT. */ error = map_ldt_struct(mm, new_ldt, old_ldt ? !old_ldt->slot : 0); if (error) { /* * This only can fail for the first LDT setup. If an LDT is * already installed then the PTE page is already * populated. Mop up a half populated page table. */ if (!WARN_ON_ONCE(old_ldt)) free_ldt_pgtables(mm); free_ldt_struct(new_ldt); goto out_unlock; } install_ldt(mm, new_ldt); unmap_ldt_struct(mm, old_ldt); free_ldt_struct(old_ldt); error = 0; out_unlock: up_write(&mm->context.ldt_usr_sem); out: return error; } SYSCALL_DEFINE3(modify_ldt, int , func , void __user * , ptr , unsigned long , bytecount) { int ret = -ENOSYS; switch (func) { case 0: ret = read_ldt(ptr, bytecount); break; case 1: ret = write_ldt(ptr, bytecount, 1); break; case 2: ret = read_default_ldt(ptr, bytecount); break; case 0x11: ret = write_ldt(ptr, bytecount, 0); break; } /* * The SYSCALL_DEFINE() macros give us an 'unsigned long' * return type, but the ABI for sys_modify_ldt() expects * 'int'. This cast gives us an int-sized value in %rax * for the return code. The 'unsigned' is necessary so * the compiler does not try to sign-extend the negative * return codes i |