/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Prevent the compiler from merging or refetching reads or writes. The
 * compiler is also forbidden from reordering successive instances of
 * READ_ONCE and WRITE_ONCE, but only when the compiler is aware of some
 * particular ordering. One way to make the compiler aware of ordering is to
 * put the two invocations of READ_ONCE or WRITE_ONCE in different C
 * statements.
 *
 * These two macros will also work on aggregate data types like structs or
 * unions.
 *
 * Their two major use cases are: (1) Mediating communication between
 * process-level code and irq/NMI handlers, all running on the same CPU,
 * and (2) Ensuring that the compiler does not fold, spindle, or otherwise
 * mutilate accesses that either do not require ordering or that interact
 * with an explicit memory barrier or atomic instruction that provides the
 * required ordering.
 */
#ifndef __ASM_GENERIC_RWONCE_H
#define __ASM_GENERIC_RWONCE_H

#ifndef __ASSEMBLY__

#include <linux/compiler_types.h>
#include <linux/kasan-checks.h>
#include <linux/kcsan-checks.h>

/*
 * Yes, this permits 64-bit accesses on 32-bit architectures. These will
 * actually be atomic in some cases (namely Armv7 + LPAE), but for others we
 * rely on the access being split into 2x32-bit accesses for a 32-bit quantity
 * (e.g. a virtual address) and a strong prevailing wind.
 */
#define compiletime_assert_rwonce_type(t)					\
	compiletime_assert(__native_word(t) || sizeof(t) == sizeof(long long),	\
		"Unsupported access size for {READ,WRITE}_ONCE().")

/*
 * Use __READ_ONCE() instead of READ_ONCE() if you do not require any
 * atomicity. Note that this may result in tears!
 */
#ifndef __READ_ONCE
#define __READ_ONCE(x)	(*(const volatile __unqual_scalar_typeof(x) *)&(x))
#endif

#define READ_ONCE(x)							\
({									\
	compiletime_assert_rwonce_type(x);				\
	__READ_ONCE(x);							\
})

#define __WRITE_ONCE(x, val)						\
do {									\
	*(volatile typeof(x) *)&(x) = (val);				\
} while (0)

#define WRITE_ONCE(x, val)						\
do {									\
	compiletime_assert_rwonce_type(x);				\
	__WRITE_ONCE(x, val);						\
} while (0)

static __no_sanitize_or_inline
unsigned long __read_once_word_nocheck(const void *addr)
{
	return __READ_ONCE(*(unsigned long *)addr);
}

/*
 * Use READ_ONCE_NOCHECK() instead of READ_ONCE() if you need to load a
 * word from memory atomically but without telling KASAN/KCSAN. This is
 * usually used by unwinding code when walking the stack of a running process.
 */
#define READ_ONCE_NOCHECK(x)						\
({									\
	compiletime_assert(sizeof(x) == sizeof(unsigned long),		\
		"Unsupported access size for READ_ONCE_NOCHECK().");	\
	(typeof(x))__read_once_word_nocheck(&(x));			\
})

static __no_kasan_or_inline
unsigned long read_word_at_a_time(const void *addr)
{
	kasan_check_read(addr, 1);
	return *(unsigned long *)addr;
}

#endif /* __ASSEMBLY__ */
#endif /* __ASM_GENERIC_RWONCE_H */
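/*
 * Editor's note: a minimal illustrative sketch (not part of the header above)
 * of use case (1) -- process-level code communicating with an irq handler on
 * the same CPU.  Every name below (example_irq_update, example_poll,
 * data_ready, irq_payload, consume) is hypothetical.
 */
#if 0
static int data_ready;
static int irq_payload;

/* Called from the device's irq handler. */
static void example_irq_update(void)
{
	WRITE_ONCE(irq_payload, 42);
	/* Separate C statement, so the compiler keeps the two stores ordered. */
	WRITE_ONCE(data_ready, 1);
}

/* Called from process-level code on the same CPU. */
static void example_poll(void)
{
	/* READ_ONCE() forces a fresh load on every iteration. */
	while (!READ_ONCE(data_ready))
		cpu_relax();
	consume(READ_ONCE(irq_payload));
}
#endif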
// SPDX-License-Identifier: GPL-2.0
/*
 * Common Block IO controller cgroup interface
 *
 * Based on ideas and code from CFQ, CFS and BFQ:
 * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
 *
 * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
 *                    Paolo Valente <paolo.valente@unimore.it>
 *
 * Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com>
 *                    Nauman Rafique <nauman@google.com>
 *
 * For policy-specific per-blkcg data:
 * Copyright (C) 2015 Paolo Valente <paolo.valente@unimore.it>
 *                    Arianna Avanzini <avanzini.arianna@gmail.com>
 */
#include <linux/ioprio.h>
#include <linux/kdev_t.h>
#include <linux/module.h>
#include <linux/sched/signal.h>
#include <linux/err.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/slab.h>
#include <linux/delay.h>
#include <linux/atomic.h>
#include <linux/ctype.h>
#include <linux/resume_user_mode.h>
#include <linux/psi.h>
#include <linux/part_stat.h>
#include "blk.h"
#include "blk-cgroup.h"
#include "blk-ioprio.h"
#include "blk-throttle.h"

static void __blkcg_rstat_flush(struct blkcg *blkcg, int cpu);

/*
 * blkcg_pol_mutex protects blkcg_policy[] and policy [de]activation.
 * blkcg_pol_register_mutex nests outside of it and synchronizes entire
 * policy [un]register operations including cgroup file additions /
 * removals.  Putting cgroup file registration outside blkcg_pol_mutex
 * allows grabbing it from cgroup callbacks.
 */
static DEFINE_MUTEX(blkcg_pol_register_mutex);
static DEFINE_MUTEX(blkcg_pol_mutex);

struct blkcg blkcg_root;
EXPORT_SYMBOL_GPL(blkcg_root);

struct cgroup_subsys_state * const blkcg_root_css = &blkcg_root.css;
EXPORT_SYMBOL_GPL(blkcg_root_css);

static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS];

static LIST_HEAD(all_blkcgs);		/* protected by blkcg_pol_mutex */

bool blkcg_debug_stats = false;

static DEFINE_RAW_SPINLOCK(blkg_stat_lock);

#define BLKG_DESTROY_BATCH_SIZE  64

/*
 * Lockless lists for tracking IO stats update
 *
 * New IO stats are stored in the percpu iostat_cpu within blkcg_gq (blkg).
 * There are multiple blkg's (one for each block device) attached to each
 * blkcg.  The rstat code keeps track of which cpu has IO stats updated,
 * but it doesn't know which blkg has the updated stats.  If there are many
 * block devices in a system, the cost of iterating all the blkg's to flush
 * out the IO stats can be high.  To reduce such overhead, a set of percpu
 * lockless lists (lhead) per blkcg are used to track the set of recently
 * updated iostat_cpu's since the last flush.  An iostat_cpu will be put
 * onto the lockless list on the update side [blk_cgroup_bio_start()] if
 * not there yet and then removed when being flushed [blkcg_rstat_flush()].
 * References to blkg are gotten and then put back in the process to
 * protect against blkg removal.
 *
 * Return: 0 if successful or -ENOMEM if allocation fails.
 */
static int init_blkcg_llists(struct blkcg *blkcg)
{
	int cpu;

	blkcg->lhead = alloc_percpu_gfp(struct llist_head, GFP_KERNEL);
	if (!blkcg->lhead)
		return -ENOMEM;

	for_each_possible_cpu(cpu)
		init_llist_head(per_cpu_ptr(blkcg->lhead, cpu));
	return 0;
}

/**
 * blkcg_css - find the current css
 *
 * Find the css associated with either the kthread or the current task.
 * This may return a dying css, so it is up to the caller to use tryget logic
 * to confirm it is alive and well.
 */
static struct cgroup_subsys_state *blkcg_css(void)
{
	struct cgroup_subsys_state *css;

	css = kthread_blkcg();
	if (css)
		return css;
	return task_css(current, io_cgrp_id);
}

static bool blkcg_policy_enabled(struct request_queue *q,
				 const struct blkcg_policy *pol)
{
	return pol && test_bit(pol->plid, q->blkcg_pols);
}

static void blkg_free_workfn(struct work_struct *work)
{
	struct blkcg_gq *blkg = container_of(work, struct blkcg_gq,
					     free_work);
	struct request_queue *q = blkg->q;
	int i;

	/*
	 * pd_free_fn() can also be called from blkcg_deactivate_policy(),
	 * in order to make sure pd_free_fn() is called in order, the deletion
	 * of the list blkg->q_node is delayed to here from blkg_destroy(), and
	 * blkcg_mutex is used to synchronize blkg_free_workfn() and
	 * blkcg_deactivate_policy().
	 */
	mutex_lock(&q->blkcg_mutex);
	for (i = 0; i < BLKCG_MAX_POLS; i++)
		if (blkg->pd[i])
			blkcg_policy[i]->pd_free_fn(blkg->pd[i]);
	if (blkg->parent)
		blkg_put(blkg->parent);
	spin_lock_irq(&q->queue_lock);
	list_del_init(&blkg->q_node);
	spin_unlock_irq(&q->queue_lock);
	mutex_unlock(&q->blkcg_mutex);

	blk_put_queue(q);
	free_percpu(blkg->iostat_cpu);
	percpu_ref_exit(&blkg->refcnt);
	kfree(blkg);
}

/**
 * blkg_free - free a blkg
 * @blkg: blkg to free
 *
 * Free @blkg which may be partially allocated.
 */
static void blkg_free(struct blkcg_gq *blkg)
{
	if (!blkg)
		return;

	/*
	 * Both ->pd_free_fn() and request queue's release handler may
	 * sleep, so free us by scheduling one work func
	 */
	INIT_WORK(&blkg->free_work, blkg_free_workfn);
	schedule_work(&blkg->free_work);
}

static void __blkg_release(struct rcu_head *rcu)
{
	struct blkcg_gq *blkg = container_of(rcu, struct blkcg_gq, rcu_head);
	struct blkcg *blkcg = blkg->blkcg;
	int cpu;

#ifdef CONFIG_BLK_CGROUP_PUNT_BIO
	WARN_ON(!bio_list_empty(&blkg->async_bios));
#endif
	/*
	 * Flush all the non-empty percpu lockless lists before releasing
	 * us, given these stats belong to us.
	 *
	 * blkg_stat_lock is for serializing blkg stat update
	 */
	for_each_possible_cpu(cpu)
		__blkcg_rstat_flush(blkcg, cpu);

	/* release the blkcg and parent blkg refs this blkg has been holding */
	css_put(&blkg->blkcg->css);

	blkg_free(blkg);
}

/*
 * A group is RCU protected, but having an rcu lock does not mean that one
 * can access all the fields of blkg and assume these are valid.  For
 * example, don't try to follow throtl_data and request queue links.
 *
 * Having a reference to blkg under an rcu allows accesses to only values
 * local to groups like group stats and group rate limits.
 */
static void blkg_release(struct percpu_ref *ref)
{
	struct blkcg_gq *blkg = container_of(ref, struct blkcg_gq, refcnt);

	call_rcu(&blkg->rcu_head, __blkg_release);
}

#ifdef CONFIG_BLK_CGROUP_PUNT_BIO
static struct workqueue_struct *blkcg_punt_bio_wq;

static void blkg_async_bio_workfn(struct work_struct *work)
{
	struct blkcg_gq *blkg = container_of(work, struct blkcg_gq,
					     async_bio_work);
	struct bio_list bios = BIO_EMPTY_LIST;
	struct bio *bio;
	struct blk_plug plug;
	bool need_plug = false;

	/* as long as there are pending bios, @blkg can't go away */
	spin_lock(&blkg->async_bio_lock);
	bio_list_merge_init(&bios, &blkg->async_bios);
	spin_unlock(&blkg->async_bio_lock);

	/* start plug only when bio_list contains at least 2 bios */
	if (bios.head && bios.head->bi_next) {
		need_plug = true;
		blk_start_plug(&plug);
	}

	while ((bio = bio_list_pop(&bios)))
		submit_bio(bio);
	if (need_plug)
		blk_finish_plug(&plug);
}

/*
 * When a shared kthread issues a bio for a cgroup, doing so synchronously can
 * lead to priority inversions as the kthread can be trapped waiting for that
 * cgroup.  Use this helper instead of submit_bio to punt the actual issuing to
 * a dedicated per-blkcg work item to avoid such priority inversions.
*/ void blkcg_punt_bio_submit(struct bio *bio) { struct blkcg_gq *blkg = bio->bi_blkg; if (blkg->parent) { spin_lock(&blkg->async_bio_lock); bio_list_add(&blkg->async_bios, bio); spin_unlock(&blkg->async_bio_lock); queue_work(blkcg_punt_bio_wq, &blkg->async_bio_work); } else { /* never bounce for the root cgroup */ submit_bio(bio); } } EXPORT_SYMBOL_GPL(blkcg_punt_bio_submit); static int __init blkcg_punt_bio_init(void) { blkcg_punt_bio_wq = alloc_workqueue("blkcg_punt_bio", WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND | WQ_SYSFS, 0); if (!blkcg_punt_bio_wq) return -ENOMEM; return 0; } subsys_initcall(blkcg_punt_bio_init); #endif /* CONFIG_BLK_CGROUP_PUNT_BIO */ /** * bio_blkcg_css - return the blkcg CSS associated with a bio * @bio: target bio * * This returns the CSS for the blkcg associated with a bio, or %NULL if not * associated. Callers are expected to either handle %NULL or know association * has been done prior to calling this. */ struct cgroup_subsys_state *bio_blkcg_css(struct bio *bio) { if (!bio || !bio->bi_blkg) return NULL; return &bio->bi_blkg->blkcg->css; } EXPORT_SYMBOL_GPL(bio_blkcg_css); /** * blkcg_parent - get the parent of a blkcg * @blkcg: blkcg of interest * * Return the parent blkcg of @blkcg. Can be called anytime. */ static inline struct blkcg *blkcg_parent(struct blkcg *blkcg) { return css_to_blkcg(blkcg->css.parent); } /** * blkg_alloc - allocate a blkg * @blkcg: block cgroup the new blkg is associated with * @disk: gendisk the new blkg is associated with * @gfp_mask: allocation mask to use * * Allocate a new blkg associating @blkcg and @disk. */ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct gendisk *disk, gfp_t gfp_mask) { struct blkcg_gq *blkg; int i, cpu; /* alloc and init base part */ blkg = kzalloc_node(sizeof(*blkg), gfp_mask, disk->queue->node); if (!blkg) return NULL; if (percpu_ref_init(&blkg->refcnt, blkg_release, 0, gfp_mask)) goto out_free_blkg; blkg->iostat_cpu = alloc_percpu_gfp(struct blkg_iostat_set, gfp_mask); if (!blkg->iostat_cpu) goto out_exit_refcnt; if (!blk_get_queue(disk->queue)) goto out_free_iostat; blkg->q = disk->queue; INIT_LIST_HEAD(&blkg->q_node); blkg->blkcg = blkcg; blkg->iostat.blkg = blkg; #ifdef CONFIG_BLK_CGROUP_PUNT_BIO spin_lock_init(&blkg->async_bio_lock); bio_list_init(&blkg->async_bios); INIT_WORK(&blkg->async_bio_work, blkg_async_bio_workfn); #endif u64_stats_init(&blkg->iostat.sync); for_each_possible_cpu(cpu) { u64_stats_init(&per_cpu_ptr(blkg->iostat_cpu, cpu)->sync); per_cpu_ptr(blkg->iostat_cpu, cpu)->blkg = blkg; } for (i = 0; i < BLKCG_MAX_POLS; i++) { struct blkcg_policy *pol = blkcg_policy[i]; struct blkg_policy_data *pd; if (!blkcg_policy_enabled(disk->queue, pol)) continue; /* alloc per-policy data and attach it to blkg */ pd = pol->pd_alloc_fn(disk, blkcg, gfp_mask); if (!pd) goto out_free_pds; blkg->pd[i] = pd; pd->blkg = blkg; pd->plid = i; pd->online = false; } return blkg; out_free_pds: while (--i >= 0) if (blkg->pd[i]) blkcg_policy[i]->pd_free_fn(blkg->pd[i]); blk_put_queue(disk->queue); out_free_iostat: free_percpu(blkg->iostat_cpu); out_exit_refcnt: percpu_ref_exit(&blkg->refcnt); out_free_blkg: kfree(blkg); return NULL; } /* * If @new_blkg is %NULL, this function tries to allocate a new one as * necessary using %GFP_NOWAIT. @new_blkg is always consumed on return. 
*/ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, struct gendisk *disk, struct blkcg_gq *new_blkg) { struct blkcg_gq *blkg; int i, ret; lockdep_assert_held(&disk->queue->queue_lock); /* request_queue is dying, do not create/recreate a blkg */ if (blk_queue_dying(disk->queue)) { ret = -ENODEV; goto err_free_blkg; } /* blkg holds a reference to blkcg */ if (!css_tryget_online(&blkcg->css)) { ret = -ENODEV; goto err_free_blkg; } /* allocate */ if (!new_blkg) { new_blkg = blkg_alloc(blkcg, disk, GFP_NOWAIT | __GFP_NOWARN); if (unlikely(!new_blkg)) { ret = -ENOMEM; goto err_put_css; } } blkg = new_blkg; /* link parent */ if (blkcg_parent(blkcg)) { blkg->parent = blkg_lookup(blkcg_parent(blkcg), disk->queue); if (WARN_ON_ONCE(!blkg->parent)) { ret = -ENODEV; goto err_put_css; } blkg_get(blkg->parent); } /* invoke per-policy init */ for (i = 0; i < BLKCG_MAX_POLS; i++) { struct blkcg_policy *pol = blkcg_policy[i]; if (blkg->pd[i] && pol->pd_init_fn) pol->pd_init_fn(blkg->pd[i]); } /* insert */ spin_lock(&blkcg->lock); ret = radix_tree_insert(&blkcg->blkg_tree, disk->queue->id, blkg); if (likely(!ret)) { hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list); list_add(&blkg->q_node, &disk->queue->blkg_list); for (i = 0; i < BLKCG_MAX_POLS; i++) { struct blkcg_policy *pol = blkcg_policy[i]; if (blkg->pd[i]) { if (pol->pd_online_fn) pol->pd_online_fn(blkg->pd[i]); blkg->pd[i]->online = true; } } } blkg->online = true; spin_unlock(&blkcg->lock); if (!ret) return blkg; /* @blkg failed fully initialized, use the usual release path */ blkg_put(blkg); return ERR_PTR(ret); err_put_css: css_put(&blkcg->css); err_free_blkg: if (new_blkg) blkg_free(new_blkg); return ERR_PTR(ret); } /** * blkg_lookup_create - lookup blkg, try to create one if not there * @blkcg: blkcg of interest * @disk: gendisk of interest * * Lookup blkg for the @blkcg - @disk pair. If it doesn't exist, try to * create one. blkg creation is performed recursively from blkcg_root such * that all non-root blkg's have access to the parent blkg. This function * should be called under RCU read lock and takes @disk->queue->queue_lock. * * Returns the blkg or the closest blkg if blkg_create() fails as it walks * down from root. */ static struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, struct gendisk *disk) { struct request_queue *q = disk->queue; struct blkcg_gq *blkg; unsigned long flags; WARN_ON_ONCE(!rcu_read_lock_held()); blkg = blkg_lookup(blkcg, q); if (blkg) return blkg; spin_lock_irqsave(&q->queue_lock, flags); blkg = blkg_lookup(blkcg, q); if (blkg) { if (blkcg != &blkcg_root && blkg != rcu_dereference(blkcg->blkg_hint)) rcu_assign_pointer(blkcg->blkg_hint, blkg); goto found; } /* * Create blkgs walking down from blkcg_root to @blkcg, so that all * non-root blkgs have access to their parents. Returns the closest * blkg to the intended blkg should blkg_create() fail. 
*/ while (true) { struct blkcg *pos = blkcg; struct blkcg *parent = blkcg_parent(blkcg); struct blkcg_gq *ret_blkg = q->root_blkg; while (parent) { blkg = blkg_lookup(parent, q); if (blkg) { /* remember closest blkg */ ret_blkg = blkg; break; } pos = parent; parent = blkcg_parent(parent); } blkg = blkg_create(pos, disk, NULL); if (IS_ERR(blkg)) { blkg = ret_blkg; break; } if (pos == blkcg) break; } found: spin_unlock_irqrestore(&q->queue_lock, flags); return blkg; } static void blkg_destroy(struct blkcg_gq *blkg) { struct blkcg *blkcg = blkg->blkcg; int i; lockdep_assert_held(&blkg->q->queue_lock); lockdep_assert_held(&blkcg->lock); /* * blkg stays on the queue list until blkg_free_workfn(), see details in * blkg_free_workfn(), hence this function can be called from * blkcg_destroy_blkgs() first and again from blkg_destroy_all() before * blkg_free_workfn(). */ if (hlist_unhashed(&blkg->blkcg_node)) return; for (i = 0; i < BLKCG_MAX_POLS; i++) { struct blkcg_policy *pol = blkcg_policy[i]; if (blkg->pd[i] && blkg->pd[i]->online) { blkg->pd[i]->online = false; if (pol->pd_offline_fn) pol->pd_offline_fn(blkg->pd[i]); } } blkg->online = false; radix_tree_delete(&blkcg->blkg_tree, blkg->q->id); hlist_del_init_rcu(&blkg->blkcg_node); /* * Both setting lookup hint to and clearing it from @blkg are done * under queue_lock. If it's not pointing to @blkg now, it never * will. Hint assignment itself can race safely. */ if (rcu_access_pointer(blkcg->blkg_hint) == blkg) rcu_assign_pointer(blkcg->blkg_hint, NULL); /* * Put the reference taken at the time of creation so that when all * queues are gone, group can be destroyed. */ percpu_ref_kill(&blkg->refcnt); } static void blkg_destroy_all(struct gendisk *disk) { struct request_queue *q = disk->queue; struct blkcg_gq *blkg; int count = BLKG_DESTROY_BATCH_SIZE; int i; restart: spin_lock_irq(&q->queue_lock); list_for_each_entry(blkg, &q->blkg_list, q_node) { struct blkcg *blkcg = blkg->blkcg; if (hlist_unhashed(&blkg->blkcg_node)) continue; spin_lock(&blkcg->lock); blkg_destroy(blkg); spin_unlock(&blkcg->lock); /* * in order to avoid holding the spin lock for too long, release * it when a batch of blkgs are destroyed. 
*/ if (!(--count)) { count = BLKG_DESTROY_BATCH_SIZE; spin_unlock_irq(&q->queue_lock); cond_resched(); goto restart; } } /* * Mark policy deactivated since policy offline has been done, and * the free is scheduled, so future blkcg_deactivate_policy() can * be bypassed */ for (i = 0; i < BLKCG_MAX_POLS; i++) { struct blkcg_policy *pol = blkcg_policy[i]; if (pol) __clear_bit(pol->plid, q->blkcg_pols); } q->root_blkg = NULL; spin_unlock_irq(&q->queue_lock); } static void blkg_iostat_set(struct blkg_iostat *dst, struct blkg_iostat *src) { int i; for (i = 0; i < BLKG_IOSTAT_NR; i++) { dst->bytes[i] = src->bytes[i]; dst->ios[i] = src->ios[i]; } } static void __blkg_clear_stat(struct blkg_iostat_set *bis) { struct blkg_iostat cur = {0}; unsigned long flags; flags = u64_stats_update_begin_irqsave(&bis->sync); blkg_iostat_set(&bis->cur, &cur); blkg_iostat_set(&bis->last, &cur); u64_stats_update_end_irqrestore(&bis->sync, flags); } static void blkg_clear_stat(struct blkcg_gq *blkg) { int cpu; for_each_possible_cpu(cpu) { struct blkg_iostat_set *s = per_cpu_ptr(blkg->iostat_cpu, cpu); __blkg_clear_stat(s); } __blkg_clear_stat(&blkg->iostat); } static int blkcg_reset_stats(struct cgroup_subsys_state *css, struct cftype *cftype, u64 val) { struct blkcg *blkcg = css_to_blkcg(css); struct blkcg_gq *blkg; int i; mutex_lock(&blkcg_pol_mutex); spin_lock_irq(&blkcg->lock); /* * Note that stat reset is racy - it doesn't synchronize against * stat updates. This is a debug feature which shouldn't exist * anyway. If you get hit by a race, retry. */ hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) { blkg_clear_stat(blkg); for (i = 0; i < BLKCG_MAX_POLS; i++) { struct blkcg_policy *pol = blkcg_policy[i]; if (blkg->pd[i] && pol->pd_reset_stats_fn) pol->pd_reset_stats_fn(blkg->pd[i]); } } spin_unlock_irq(&blkcg->lock); mutex_unlock(&blkcg_pol_mutex); return 0; } const char *blkg_dev_name(struct blkcg_gq *blkg) { if (!blkg->q->disk) return NULL; return bdi_dev_name(blkg->q->disk->bdi); } /** * blkcg_print_blkgs - helper for printing per-blkg data * @sf: seq_file to print to * @blkcg: blkcg of interest * @prfill: fill function to print out a blkg * @pol: policy in question * @data: data to be passed to @prfill * @show_total: to print out sum of prfill return values or not * * This function invokes @prfill on each blkg of @blkcg if pd for the * policy specified by @pol exists. @prfill is invoked with @sf, the * policy data and @data and the matching queue lock held. If @show_total * is %true, the sum of the return values from @prfill is printed with * "Total" label at the end. * * This is to be used to construct print functions for * cftype->read_seq_string method. */ void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg, u64 (*prfill)(struct seq_file *, struct blkg_policy_data *, int), const struct blkcg_policy *pol, int data, bool show_total) { struct blkcg_gq *blkg; u64 total = 0; rcu_read_lock(); hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) { spin_lock_irq(&blkg->q->queue_lock); if (blkcg_policy_enabled(blkg->q, pol)) total += prfill(sf, blkg->pd[pol->plid], data); spin_unlock_irq(&blkg->q->queue_lock); } rcu_read_unlock(); if (show_total) seq_printf(sf, "Total %llu\n", (unsigned long long)total); } EXPORT_SYMBOL_GPL(blkcg_print_blkgs); /** * __blkg_prfill_u64 - prfill helper for a single u64 value * @sf: seq_file to print to * @pd: policy private data of interest * @v: value to print * * Print @v to @sf for the device associated with @pd. 
*/ u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v) { const char *dname = blkg_dev_name(pd->blkg); if (!dname) return 0; seq_printf(sf, "%s %llu\n", dname, (unsigned long long)v); return v; } EXPORT_SYMBOL_GPL(__blkg_prfill_u64); /** * blkg_conf_init - initialize a blkg_conf_ctx * @ctx: blkg_conf_ctx to initialize * @input: input string * * Initialize @ctx which can be used to parse blkg config input string @input. * Once initialized, @ctx can be used with blkg_conf_open_bdev() and * blkg_conf_prep(), and must be cleaned up with blkg_conf_exit(). */ void blkg_conf_init(struct blkg_conf_ctx *ctx, char *input) { *ctx = (struct blkg_conf_ctx){ .input = input }; } EXPORT_SYMBOL_GPL(blkg_conf_init); /** * blkg_conf_open_bdev - parse and open bdev for per-blkg config update * @ctx: blkg_conf_ctx initialized with blkg_conf_init() * * Parse the device node prefix part, MAJ:MIN, of per-blkg config update from * @ctx->input and get and store the matching bdev in @ctx->bdev. @ctx->body is * set to point past the device node prefix. * * This function may be called multiple times on @ctx and the extra calls become * NOOPs. blkg_conf_prep() implicitly calls this function. Use this function * explicitly if bdev access is needed without resolving the blkcg / policy part * of @ctx->input. Returns -errno on error. */ int blkg_conf_open_bdev(struct blkg_conf_ctx *ctx) { char *input = ctx->input; unsigned int major, minor; struct block_device *bdev; int key_len; if (ctx->bdev) return 0; if (sscanf(input, "%u:%u%n", &major, &minor, &key_len) != 2) return -EINVAL; input += key_len; if (!isspace(*input)) return -EINVAL; input = skip_spaces(input); bdev = blkdev_get_no_open(MKDEV(major, minor)); if (!bdev) return -ENODEV; if (bdev_is_partition(bdev)) { blkdev_put_no_open(bdev); return -ENODEV; } mutex_lock(&bdev->bd_queue->rq_qos_mutex); if (!disk_live(bdev->bd_disk)) { blkdev_put_no_open(bdev); mutex_unlock(&bdev->bd_queue->rq_qos_mutex); return -ENODEV; } ctx->body = input; ctx->bdev = bdev; return 0; } /** * blkg_conf_prep - parse and prepare for per-blkg config update * @blkcg: target block cgroup * @pol: target policy * @ctx: blkg_conf_ctx initialized with blkg_conf_init() * * Parse per-blkg config update from @ctx->input and initialize @ctx * accordingly. On success, @ctx->body points to the part of @ctx->input * following MAJ:MIN, @ctx->bdev points to the target block device and * @ctx->blkg to the blkg being configured. * * blkg_conf_open_bdev() may be called on @ctx beforehand. On success, this * function returns with queue lock held and must be followed by * blkg_conf_exit(). */ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, struct blkg_conf_ctx *ctx) __acquires(&bdev->bd_queue->queue_lock) { struct gendisk *disk; struct request_queue *q; struct blkcg_gq *blkg; int ret; ret = blkg_conf_open_bdev(ctx); if (ret) return ret; disk = ctx->bdev->bd_disk; q = disk->queue; /* * blkcg_deactivate_policy() requires queue to be frozen, we can grab * q_usage_counter to prevent concurrent with blkcg_deactivate_policy(). */ ret = blk_queue_enter(q, 0); if (ret) goto fail; spin_lock_irq(&q->queue_lock); if (!blkcg_policy_enabled(q, pol)) { ret = -EOPNOTSUPP; goto fail_unlock; } blkg = blkg_lookup(blkcg, q); if (blkg) goto success; /* * Create blkgs walking down from blkcg_root to @blkcg, so that all * non-root blkgs have access to their parents. 
*/ while (true) { struct blkcg *pos = blkcg; struct blkcg *parent; struct blkcg_gq *new_blkg; parent = blkcg_parent(blkcg); while (parent && !blkg_lookup(parent, q)) { pos = parent; parent = blkcg_parent(parent); } /* Drop locks to do new blkg allocation with GFP_KERNEL. */ spin_unlock_irq(&q->queue_lock); new_blkg = blkg_alloc(pos, disk, GFP_KERNEL); if (unlikely(!new_blkg)) { ret = -ENOMEM; goto fail_exit_queue; } if (radix_tree_preload(GFP_KERNEL)) { blkg_free(new_blkg); ret = -ENOMEM; goto fail_exit_queue; } spin_lock_irq(&q->queue_lock); if (!blkcg_policy_enabled(q, pol)) { blkg_free(new_blkg); ret = -EOPNOTSUPP; goto fail_preloaded; } blkg = blkg_lookup(pos, q); if (blkg) { blkg_free(new_blkg); } else { blkg = blkg_create(pos, disk, new_blkg); if (IS_ERR(blkg)) { ret = PTR_ERR(blkg); goto fail_preloaded; } } radix_tree_preload_end(); if (pos == blkcg) goto success; } success: blk_queue_exit(q); ctx->blkg = blkg; return 0; fail_preloaded: radix_tree_preload_end(); fail_unlock: spin_unlock_irq(&q->queue_lock); fail_exit_queue: blk_queue_exit(q); fail: /* * If queue was bypassing, we should retry. Do so after a * short msleep(). It isn't strictly necessary but queue * can be bypassing for some time and it's always nice to * avoid busy looping. */ if (ret == -EBUSY) { msleep(10); ret = restart_syscall(); } return ret; } EXPORT_SYMBOL_GPL(blkg_conf_prep); /** * blkg_conf_exit - clean up per-blkg config update * @ctx: blkg_conf_ctx initialized with blkg_conf_init() * * Clean up after per-blkg config update. This function must be called on all * blkg_conf_ctx's initialized with blkg_conf_init(). */ void blkg_conf_exit(struct blkg_conf_ctx *ctx) __releases(&ctx->bdev->bd_queue->queue_lock) __releases(&ctx->bdev->bd_queue->rq_qos_mutex) { if (ctx->blkg) { spin_unlock_irq(&bdev_get_queue(ctx->bdev)->queue_lock); ctx->blkg = NULL; } if (ctx->bdev) { mutex_unlock(&ctx->bdev->bd_queue->rq_qos_mutex); blkdev_put_no_open(ctx->bdev); ctx->body = NULL; ctx->bdev = NULL; } } EXPORT_SYMBOL_GPL(blkg_conf_exit); static void blkg_iostat_add(struct blkg_iostat *dst, struct blkg_iostat *src) { int i; for (i = 0; i < BLKG_IOSTAT_NR; i++) { dst->bytes[i] += src->bytes[i]; dst->ios[i] += src->ios[i]; } } static void blkg_iostat_sub(struct blkg_iostat *dst, struct blkg_iostat *src) { int i; for (i = 0; i < BLKG_IOSTAT_NR; i++) { dst->bytes[i] -= src->bytes[i]; dst->ios[i] -= src->ios[i]; } } static void blkcg_iostat_update(struct blkcg_gq *blkg, struct blkg_iostat *cur, struct blkg_iostat *last) { struct blkg_iostat delta; unsigned long flags; /* propagate percpu delta to global */ flags = u64_stats_update_begin_irqsave(&blkg->iostat.sync); blkg_iostat_set(&delta, cur); blkg_iostat_sub(&delta, last); blkg_iostat_add(&blkg->iostat.cur, &delta); blkg_iostat_add(last, &delta); u64_stats_update_end_irqrestore(&blkg->iostat.sync, flags); } static void __blkcg_rstat_flush(struct blkcg *blkcg, int cpu) { struct llist_head *lhead = per_cpu_ptr(blkcg->lhead, cpu); struct llist_node *lnode; struct blkg_iostat_set *bisc, *next_bisc; unsigned long flags; rcu_read_lock(); lnode = llist_del_all(lhead); if (!lnode) goto out; /* * For covering concurrent parent blkg update from blkg_release(). * * When flushing from cgroup, cgroup_rstat_lock is always held, so * this lock won't cause contention most of time. */ raw_spin_lock_irqsave(&blkg_stat_lock, flags); /* * Iterate only the iostat_cpu's queued in the lockless list. 
*/ llist_for_each_entry_safe(bisc, next_bisc, lnode, lnode) { struct blkcg_gq *blkg = bisc->blkg; struct blkcg_gq *parent = blkg->parent; struct blkg_iostat cur; unsigned int seq; /* * Order assignment of `next_bisc` from `bisc->lnode.next` in * llist_for_each_entry_safe and clearing `bisc->lqueued` for * avoiding to assign `next_bisc` with new next pointer added * in blk_cgroup_bio_start() in case of re-ordering. * * The pair barrier is implied in llist_add() in blk_cgroup_bio_start(). */ smp_mb(); WRITE_ONCE(bisc->lqueued, false); if (bisc == &blkg->iostat) goto propagate_up; /* propagate up to parent only */ /* fetch the current per-cpu values */ do { seq = u64_stats_fetch_begin(&bisc->sync); blkg_iostat_set(&cur, &bisc->cur); } while (u64_stats_fetch_retry(&bisc->sync, seq)); blkcg_iostat_update(blkg, &cur, &bisc->last); propagate_up: /* propagate global delta to parent (unless that's root) */ if (parent && parent->parent) { blkcg_iostat_update(parent, &blkg->iostat.cur, &blkg->iostat.last); /* * Queue parent->iostat to its blkcg's lockless * list to propagate up to the grandparent if the * iostat hasn't been queued yet. */ if (!parent->iostat.lqueued) { struct llist_head *plhead; plhead = per_cpu_ptr(parent->blkcg->lhead, cpu); llist_add(&parent->iostat.lnode, plhead); parent->iostat.lqueued = true; } } } raw_spin_unlock_irqrestore(&blkg_stat_lock, flags); out: rcu_read_unlock(); } static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu) { /* Root-level stats are sourced from system-wide IO stats */ if (cgroup_parent(css->cgroup)) __blkcg_rstat_flush(css_to_blkcg(css), cpu); } /* * We source root cgroup stats from the system-wide stats to avoid * tracking the same information twice and incurring overhead when no * cgroups are defined. For that reason, cgroup_rstat_flush in * blkcg_print_stat does not actually fill out the iostat in the root * cgroup's blkcg_gq. * * However, we would like to re-use the printing code between the root and * non-root cgroups to the extent possible. For that reason, we simulate * flushing the root cgroup's stats by explicitly filling in the iostat * with disk level statistics. 
*/ static void blkcg_fill_root_iostats(void) { struct class_dev_iter iter; struct device *dev; class_dev_iter_init(&iter, &block_class, NULL, &disk_type); while ((dev = class_dev_iter_next(&iter))) { struct block_device *bdev = dev_to_bdev(dev); struct blkcg_gq *blkg = bdev->bd_disk->queue->root_blkg; struct blkg_iostat tmp; int cpu; unsigned long flags; memset(&tmp, 0, sizeof(tmp)); for_each_possible_cpu(cpu) { struct disk_stats *cpu_dkstats; cpu_dkstats = per_cpu_ptr(bdev->bd_stats, cpu); tmp.ios[BLKG_IOSTAT_READ] += cpu_dkstats->ios[STAT_READ]; tmp.ios[BLKG_IOSTAT_WRITE] += cpu_dkstats->ios[STAT_WRITE]; tmp.ios[BLKG_IOSTAT_DISCARD] += cpu_dkstats->ios[STAT_DISCARD]; // convert sectors to bytes tmp.bytes[BLKG_IOSTAT_READ] += cpu_dkstats->sectors[STAT_READ] << 9; tmp.bytes[BLKG_IOSTAT_WRITE] += cpu_dkstats->sectors[STAT_WRITE] << 9; tmp.bytes[BLKG_IOSTAT_DISCARD] += cpu_dkstats->sectors[STAT_DISCARD] << 9; } flags = u64_stats_update_begin_irqsave(&blkg->iostat.sync); blkg_iostat_set(&blkg->iostat.cur, &tmp); u64_stats_update_end_irqrestore(&blkg->iostat.sync, flags); } class_dev_iter_exit(&iter); } static void blkcg_print_one_stat(struct blkcg_gq *blkg, struct seq_file *s) { struct blkg_iostat_set *bis = &blkg->iostat; u64 rbytes, wbytes, rios, wios, dbytes, dios; const char *dname; unsigned seq; int i; if (!blkg->online) return; dname = blkg_dev_name(blkg); if (!dname) return; seq_printf(s, "%s ", dname); do { seq = u64_stats_fetch_begin(&bis->sync); rbytes = bis->cur.bytes[BLKG_IOSTAT_READ]; wbytes = bis->cur.bytes[BLKG_IOSTAT_WRITE]; dbytes = bis->cur.bytes[BLKG_IOSTAT_DISCARD]; rios = bis->cur.ios[BLKG_IOSTAT_READ]; wios = bis->cur.ios[BLKG_IOSTAT_WRITE]; dios = bis->cur.ios[BLKG_IOSTAT_DISCARD]; } while (u64_stats_fetch_retry(&bis->sync, seq)); if (rbytes || wbytes || rios || wios) { seq_printf(s, "rbytes=%llu wbytes=%llu rios=%llu wios=%llu dbytes=%llu dios=%llu", rbytes, wbytes, rios, wios, dbytes, dios); } if (blkcg_debug_stats && atomic_read(&blkg->use_delay)) { seq_printf(s, " use_delay=%d delay_nsec=%llu", atomic_read(&blkg->use_delay), atomic64_read(&blkg->delay_nsec)); } for (i = 0; i < BLKCG_MAX_POLS; i++) { struct blkcg_policy *pol = blkcg_policy[i]; if (!blkg->pd[i] || !pol->pd_stat_fn) continue; pol->pd_stat_fn(blkg->pd[i], s); } seq_puts(s, "\n"); } static int blkcg_print_stat(struct seq_file *sf, void *v) { struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); struct blkcg_gq *blkg; if (!seq_css(sf)->parent) blkcg_fill_root_iostats(); else cgroup_rstat_flush(blkcg->css.cgroup); rcu_read_lock(); hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) { spin_lock_irq(&blkg->q->queue_lock); blkcg_print_one_stat(blkg, sf); spin_unlock_irq(&blkg->q->queue_lock); } rcu_read_unlock(); return 0; } static struct cftype blkcg_files[] = { { .name = "stat", .seq_show = blkcg_print_stat, }, { } /* terminate */ }; static struct cftype blkcg_legacy_files[] = { { .name = "reset_stats", .write_u64 = blkcg_reset_stats, }, { } /* terminate */ }; #ifdef CONFIG_CGROUP_WRITEBACK struct list_head *blkcg_get_cgwb_list(struct cgroup_subsys_state *css) { return &css_to_blkcg(css)->cgwb_list; } #endif /* * blkcg destruction is a three-stage process. * * 1. Destruction starts. The blkcg_css_offline() callback is invoked * which offlines writeback. Here we tie the next stage of blkg destruction * to the completion of writeback associated with the blkcg. This lets us * avoid punting potentially large amounts of outstanding writeback to root * while maintaining any ongoing policies. 
The next stage is triggered when * the nr_cgwbs count goes to zero. * * 2. When the nr_cgwbs count goes to zero, blkcg_destroy_blkgs() is called * and handles the destruction of blkgs. Here the css reference held by * the blkg is put back eventually allowing blkcg_css_free() to be called. * This work may occur in cgwb_release_workfn() on the cgwb_release * workqueue. Any submitted ios that fail to get the blkg ref will be * punted to the root_blkg. * * 3. Once the blkcg ref count goes to zero, blkcg_css_free() is called. * This finally frees the blkcg. */ /** * blkcg_destroy_blkgs - responsible for shooting down blkgs * @blkcg: blkcg of interest * * blkgs should be removed while holding both q and blkcg locks. As blkcg lock * is nested inside q lock, this function performs reverse double lock dancing. * Destroying the blkgs releases the reference held on the blkcg's css allowing * blkcg_css_free to eventually be called. * * This is the blkcg counterpart of ioc_release_fn(). */ static void blkcg_destroy_blkgs(struct blkcg *blkcg) { might_sleep(); spin_lock_irq(&blkcg->lock); while (!hlist_empty(&blkcg->blkg_list)) { struct blkcg_gq *blkg = hlist_entry(blkcg->blkg_list.first, struct blkcg_gq, blkcg_node); struct request_queue *q = blkg->q; if (need_resched() || !spin_trylock(&q->queue_lock)) { /* * Given that the system can accumulate a huge number * of blkgs in pathological cases, check to see if we * need to rescheduling to avoid softlockup. */ spin_unlock_irq(&blkcg->lock); cond_resched(); spin_lock_irq(&blkcg->lock); continue; } blkg_destroy(blkg); spin_unlock(&q->queue_lock); } spin_unlock_irq(&blkcg->lock); } /** * blkcg_pin_online - pin online state * @blkcg_css: blkcg of interest * * While pinned, a blkcg is kept online. This is primarily used to * impedance-match blkg and cgwb lifetimes so that blkg doesn't go offline * while an associated cgwb is still active. */ void blkcg_pin_online(struct cgroup_subsys_state *blkcg_css) { refcount_inc(&css_to_blkcg(blkcg_css)->online_pin); } /** * blkcg_unpin_online - unpin online state * @blkcg_css: blkcg of interest * * This is primarily used to impedance-match blkg and cgwb lifetimes so * that blkg doesn't go offline while an associated cgwb is still active. * When this count goes to zero, all active cgwbs have finished so the * blkcg can continue destruction by calling blkcg_destroy_blkgs(). */ void blkcg_unpin_online(struct cgroup_subsys_state *blkcg_css) { struct blkcg *blkcg = css_to_blkcg(blkcg_css); do { struct blkcg *parent; if (!refcount_dec_and_test(&blkcg->online_pin)) break; parent = blkcg_parent(blkcg); blkcg_destroy_blkgs(blkcg); blkcg = parent; } while (blkcg); } /** * blkcg_css_offline - cgroup css_offline callback * @css: css of interest * * This function is called when @css is about to go away. Here the cgwbs are * offlined first and only once writeback associated with the blkcg has * finished do we start step 2 (see above). 
*/ static void blkcg_css_offline(struct cgroup_subsys_state *css) { /* this prevents anyone from attaching or migrating to this blkcg */ wb_blkcg_offline(css); /* put the base online pin allowing step 2 to be triggered */ blkcg_unpin_online(css); } static void blkcg_css_free(struct cgroup_subsys_state *css) { struct blkcg *blkcg = css_to_blkcg(css); int i; mutex_lock(&blkcg_pol_mutex); list_del(&blkcg->all_blkcgs_node); for (i = 0; i < BLKCG_MAX_POLS; i++) if (blkcg->cpd[i]) blkcg_policy[i]->cpd_free_fn(blkcg->cpd[i]); mutex_unlock(&blkcg_pol_mutex); free_percpu(blkcg->lhead); kfree(blkcg); } static struct cgroup_subsys_state * blkcg_css_alloc(struct cgroup_subsys_state *parent_css) { struct blkcg *blkcg; int i; mutex_lock(&blkcg_pol_mutex); if (!parent_css) { blkcg = &blkcg_root; } else { blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL); if (!blkcg) goto unlock; } if (init_blkcg_llists(blkcg)) goto free_blkcg; for (i = 0; i < BLKCG_MAX_POLS ; i++) { struct blkcg_policy *pol = blkcg_policy[i]; struct blkcg_policy_data *cpd; /* * If the policy hasn't been attached yet, wait for it * to be attached before doing anything else. Otherwise, * check if the policy requires any specific per-cgroup * data: if it does, allocate and initialize it. */ if (!pol || !pol->cpd_alloc_fn) continue; cpd = pol->cpd_alloc_fn(GFP_KERNEL); if (!cpd) goto free_pd_blkcg; blkcg->cpd[i] = cpd; cpd->blkcg = blkcg; cpd->plid = i; } spin_lock_init(&blkcg->lock); refcount_set(&blkcg->online_pin, 1); INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_NOWAIT | __GFP_NOWARN); INIT_HLIST_HEAD(&blkcg->blkg_list); #ifdef CONFIG_CGROUP_WRITEBACK INIT_LIST_HEAD(&blkcg->cgwb_list); #endif list_add_tail(&blkcg->all_blkcgs_node, &all_blkcgs); mutex_unlock(&blkcg_pol_mutex); return &blkcg->css; free_pd_blkcg: for (i--; i >= 0; i--) if (blkcg->cpd[i]) blkcg_policy[i]->cpd_free_fn(blkcg->cpd[i]); free_percpu(blkcg->lhead); free_blkcg: if (blkcg != &blkcg_root) kfree(blkcg); unlock: mutex_unlock(&blkcg_pol_mutex); return ERR_PTR(-ENOMEM); } static int blkcg_css_online(struct cgroup_subsys_state *css) { struct blkcg *parent = blkcg_parent(css_to_blkcg(css)); /* * blkcg_pin_online() is used to delay blkcg offline so that blkgs * don't go offline while cgwbs are still active on them. Pin the * parent so that offline always happens towards the root. */ if (parent) blkcg_pin_online(&parent->css); return 0; } void blkg_init_queue(struct request_queue *q) { INIT_LIST_HEAD(&q->blkg_list); mutex_init(&q->blkcg_mutex); } int blkcg_init_disk(struct gendisk *disk) { struct request_queue *q = disk->queue; struct blkcg_gq *new_blkg, *blkg; bool preloaded; new_blkg = blkg_alloc(&blkcg_root, disk, GFP_KERNEL); if (!new_blkg) return -ENOMEM; preloaded = !radix_tree_preload(GFP_KERNEL); /* Make sure the root blkg exists. */ /* spin_lock_irq can serve as RCU read-side critical section. 
*/ spin_lock_irq(&q->queue_lock); blkg = blkg_create(&blkcg_root, disk, new_blkg); if (IS_ERR(blkg)) goto err_unlock; q->root_blkg = blkg; spin_unlock_irq(&q->queue_lock); if (preloaded) radix_tree_preload_end(); return 0; err_unlock: spin_unlock_irq(&q->queue_lock); if (preloaded) radix_tree_preload_end(); return PTR_ERR(blkg); } void blkcg_exit_disk(struct gendisk *disk) { blkg_destroy_all(disk); blk_throtl_exit(disk); } static void blkcg_exit(struct task_struct *tsk) { if (tsk->throttle_disk) put_disk(tsk->throttle_disk); tsk->throttle_disk = NULL; } struct cgroup_subsys io_cgrp_subsys = { .css_alloc = blkcg_css_alloc, .css_online = blkcg_css_online, .css_offline = blkcg_css_offline, .css_free = blkcg_css_free, .css_rstat_flush = blkcg_rstat_flush, .dfl_cftypes = blkcg_files, .legacy_cftypes = blkcg_legacy_files, .legacy_name = "blkio", .exit = blkcg_exit, #ifdef CONFIG_MEMCG /* * This ensures that, if available, memcg is automatically enabled * together on the default hierarchy so that the owner cgroup can * be retrieved from writeback pages. */ .depends_on = 1 << memory_cgrp_id, #endif }; EXPORT_SYMBOL_GPL(io_cgrp_subsys); /** * blkcg_activate_policy - activate a blkcg policy on a gendisk * @disk: gendisk of interest * @pol: blkcg policy to activate * * Activate @pol on @disk. Requires %GFP_KERNEL context. @disk goes through * bypass mode to populate its blkgs with policy_data for @pol. * * Activation happens with @disk bypassed, so nobody would be accessing blkgs * from IO path. Update of each blkg is protected by both queue and blkcg * locks so that holding either lock and testing blkcg_policy_enabled() is * always enough for dereferencing policy data. * * The caller is responsible for synchronizing [de]activations and policy * [un]registerations. Returns 0 on success, -errno on failure. */ int blkcg_activate_policy(struct gendisk *disk, const struct blkcg_policy *pol) { struct request_queue *q = disk->queue; struct blkg_policy_data *pd_prealloc = NULL; struct blkcg_gq *blkg, *pinned_blkg = NULL; unsigned int memflags; int ret; if (blkcg_policy_enabled(q, pol)) return 0; /* * Policy is allowed to be registered without pd_alloc_fn/pd_free_fn, * for example, ioprio. Such policy will work on blkcg level, not disk * level, and don't need to be activated. */ if (WARN_ON_ONCE(!pol->pd_alloc_fn || !pol->pd_free_fn)) return -EINVAL; if (queue_is_mq(q)) memflags = blk_mq_freeze_queue(q); retry: spin_lock_irq(&q->queue_lock); /* blkg_list is pushed at the head, reverse walk to initialize parents first */ list_for_each_entry_reverse(blkg, &q->blkg_list, q_node) { struct blkg_policy_data *pd; if (blkg->pd[pol->plid]) continue; /* If prealloc matches, use it; otherwise try GFP_NOWAIT */ if (blkg == pinned_blkg) { pd = pd_prealloc; pd_prealloc = NULL; } else { pd = pol->pd_alloc_fn(disk, blkg->blkcg, GFP_NOWAIT | __GFP_NOWARN); } if (!pd) { /* * GFP_NOWAIT failed. Free the existing one and * prealloc for @blkg w/ GFP_KERNEL. 
*/ if (pinned_blkg) blkg_put(pinned_blkg); blkg_get(blkg); pinned_blkg = blkg; spin_unlock_irq(&q->queue_lock); if (pd_prealloc) pol->pd_free_fn(pd_prealloc); pd_prealloc = pol->pd_alloc_fn(disk, blkg->blkcg, GFP_KERNEL); if (pd_prealloc) goto retry; else goto enomem; } spin_lock(&blkg->blkcg->lock); pd->blkg = blkg; pd->plid = pol->plid; blkg->pd[pol->plid] = pd; if (pol->pd_init_fn) pol->pd_init_fn(pd); if (pol->pd_online_fn) pol->pd_online_fn(pd); pd->online = true; spin_unlock(&blkg->blkcg->lock); } __set_bit(pol->plid, q->blkcg_pols); ret = 0; spin_unlock_irq(&q->queue_lock); out: if (queue_is_mq(q)) blk_mq_unfreeze_queue(q, memflags); if (pinned_blkg) blkg_put(pinned_blkg); if (pd_prealloc) pol->pd_free_fn(pd_prealloc); return ret; enomem: /* alloc failed, take down everything */ spin_lock_irq(&q->queue_lock); list_for_each_entry(blkg, &q->blkg_list, q_node) { struct blkcg *blkcg = blkg->blkcg; struct blkg_policy_data *pd; spin_lock(&blkcg->lock); pd = blkg->pd[pol->plid]; if (pd) { if (pd->online && pol->pd_offline_fn) pol->pd_offline_fn(pd); pd->online = false; pol->pd_free_fn(pd); blkg->pd[pol->plid] = NULL; } spin_unlock(&blkcg->lock); } spin_unlock_irq(&q->queue_lock); ret = -ENOMEM; goto out; } EXPORT_SYMBOL_GPL(blkcg_activate_policy); /** * blkcg_deactivate_policy - deactivate a blkcg policy on a gendisk * @disk: gendisk of interest * @pol: blkcg policy to deactivate * * Deactivate @pol on @disk. Follows the same synchronization rules as * blkcg_activate_policy(). */ void blkcg_deactivate_policy(struct gendisk *disk, const struct blkcg_policy *pol) { struct request_queue *q = disk->queue; struct blkcg_gq *blkg; unsigned int memflags; if (!blkcg_policy_enabled(q, pol)) return; if (queue_is_mq(q)) memflags = blk_mq_freeze_queue(q); mutex_lock(&q->blkcg_mutex); spin_lock_irq(&q->queue_lock); __clear_bit(pol->plid, q->blkcg_pols); list_for_each_entry(blkg, &q->blkg_list, q_node) { struct blkcg *blkcg = blkg->blkcg; spin_lock(&blkcg->lock); if (blkg->pd[pol->plid]) { if (blkg->pd[pol->plid]->online && pol->pd_offline_fn) pol->pd_offline_fn(blkg->pd[pol->plid]); pol->pd_free_fn(blkg->pd[pol->plid]); blkg->pd[pol->plid] = NULL; } spin_unlock(&blkcg->lock); } spin_unlock_irq(&q->queue_lock); mutex_unlock(&q->blkcg_mutex); if (queue_is_mq(q)) blk_mq_unfreeze_queue(q, memflags); } EXPORT_SYMBOL_GPL(blkcg_deactivate_policy); static void blkcg_free_all_cpd(struct blkcg_policy *pol) { struct blkcg *blkcg; list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) { if (blkcg->cpd[pol->plid]) { pol->cpd_free_fn(blkcg->cpd[pol->plid]); blkcg->cpd[pol->plid] = NULL; } } } /** * blkcg_policy_register - register a blkcg policy * @pol: blkcg policy to register * * Register @pol with blkcg core. Might sleep and @pol may be modified on * successful registration. Returns 0 on success and -errno on failure. */ int blkcg_policy_register(struct blkcg_policy *pol) { struct blkcg *blkcg; int i, ret; mutex_lock(&blkcg_pol_register_mutex); mutex_lock(&blkcg_pol_mutex); /* find an empty slot */ ret = -ENOSPC; for (i = 0; i < BLKCG_MAX_POLS; i++) if (!blkcg_policy[i]) break; if (i >= BLKCG_MAX_POLS) { pr_warn("blkcg_policy_register: BLKCG_MAX_POLS too small\n"); goto err_unlock; } /* * Make sure cpd/pd_alloc_fn and cpd/pd_free_fn in pairs, and policy * without pd_alloc_fn/pd_free_fn can't be activated. 
*/ if ((!pol->cpd_alloc_fn ^ !pol->cpd_free_fn) || (!pol->pd_alloc_fn ^ !pol->pd_free_fn)) goto err_unlock; /* register @pol */ pol->plid = i; blkcg_policy[pol->plid] = pol; /* allocate and install cpd's */ if (pol->cpd_alloc_fn) { list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) { struct blkcg_policy_data *cpd; cpd = pol->cpd_alloc_fn(GFP_KERNEL); if (!cpd) goto err_free_cpds; blkcg->cpd[pol->plid] = cpd; cpd->blkcg = blkcg; cpd->plid = pol->plid; } } mutex_unlock(&blkcg_pol_mutex); /* everything is in place, add intf files for the new policy */ if (pol->dfl_cftypes) WARN_ON(cgroup_add_dfl_cftypes(&io_cgrp_subsys, pol->dfl_cftypes)); if (pol->legacy_cftypes) WARN_ON(cgroup_add_legacy_cftypes(&io_cgrp_subsys, pol->legacy_cftypes)); mutex_unlock(&blkcg_pol_register_mutex); return 0; err_free_cpds: if (pol->cpd_free_fn) blkcg_free_all_cpd(pol); blkcg_policy[pol->plid] = NULL; err_unlock: mutex_unlock(&blkcg_pol_mutex); mutex_unlock(&blkcg_pol_register_mutex); return ret; } EXPORT_SYMBOL_GPL(blkcg_policy_register); /** * blkcg_policy_unregister - unregister a blkcg policy * @pol: blkcg policy to unregister * * Undo blkcg_policy_register(@pol). Might sleep. */ void blkcg_policy_unregister(struct blkcg_policy *pol) { mutex_lock(&blkcg_pol_register_mutex); if (WARN_ON(blkcg_policy[pol->plid] != pol)) goto out_unlock; /* kill the intf files first */ if (pol->dfl_cftypes) cgroup_rm_cftypes(pol->dfl_cftypes); if (pol->legacy_cftypes) cgroup_rm_cftypes(pol->legacy_cftypes); /* remove cpds and unregister */ mutex_lock(&blkcg_pol_mutex); if (pol->cpd_free_fn) blkcg_free_all_cpd(pol); blkcg_policy[pol->plid] = NULL; mutex_unlock(&blkcg_pol_mutex); out_unlock: mutex_unlock(&blkcg_pol_register_mutex); } EXPORT_SYMBOL_GPL(blkcg_policy_unregister); /* * Scale the accumulated delay based on how long it has been since we updated * the delay. We only call this when we are adding delay, in case it's been a * while since we added delay, and when we are checking to see if we need to * delay a task, to account for any delays that may have occurred. */ static void blkcg_scale_delay(struct blkcg_gq *blkg, u64 now) { u64 old = atomic64_read(&blkg->delay_start); /* negative use_delay means no scaling, see blkcg_set_delay() */ if (atomic_read(&blkg->use_delay) < 0) return; /* * We only want to scale down every second. The idea here is that we * want to delay people for min(delay_nsec, NSEC_PER_SEC) in a certain * time window. We only want to throttle tasks for recent delay that * has occurred, in 1 second time windows since that's the maximum * things can be throttled. We save the current delay window in * blkg->last_delay so we know what amount is still left to be charged * to the blkg from this point onward. blkg->last_use keeps track of * the use_delay counter. The idea is if we're unthrottling the blkg we * are ok with whatever is happening now, and we can take away more of * the accumulated delay as we've already throttled enough that * everybody is happy with their IO latencies. */ if (time_before64(old + NSEC_PER_SEC, now) && atomic64_try_cmpxchg(&blkg->delay_start, &old, now)) { u64 cur = atomic64_read(&blkg->delay_nsec); u64 sub = min_t(u64, blkg->last_delay, now - old); int cur_use = atomic_read(&blkg->use_delay); /* * We've been unthrottled, subtract a larger chunk of our * accumulated delay. */ if (cur_use < blkg->last_use) sub = max_t(u64, sub, blkg->last_delay >> 1); /* * This shouldn't happen, but handle it anyway. 
Our delay_nsec * should only ever be growing except here where we subtract out * min(last_delay, 1 second), but lord knows bugs happen and I'd * rather not end up with negative numbers. */ if (unlikely(cur < sub)) { atomic64_set(&blkg->delay_nsec, 0); blkg->last_delay = 0; } else { atomic64_sub(sub, &blkg->delay_nsec); blkg->last_delay = cur - sub; } blkg->last_use = cur_use; } } /* * This is called when we want to actually walk up the hierarchy and check to * see if we need to throttle, and then actually throttle if there is some * accumulated delay. This should only be called upon return to user space so * we're not holding some lock that would induce a priority inversion. */ static void blkcg_maybe_throttle_blkg(struct blkcg_gq *blkg, bool use_memdelay) { unsigned long pflags; bool clamp; u64 now = blk_time_get_ns(); u64 exp; u64 delay_nsec = 0; int tok; while (blkg->parent) { int use_delay = atomic_read(&blkg->use_delay); if (use_delay) { u64 this_delay; blkcg_scale_delay(blkg, now); this_delay = atomic64_read(&blkg->delay_nsec); if (this_delay > delay_nsec) { delay_nsec = this_delay; clamp = use_delay > 0; } } blkg = blkg->parent; } if (!delay_nsec) return; /* * Let's not sleep for all eternity if we've amassed a huge delay. * Swapping or metadata IO can accumulate 10's of seconds worth of * delay, and we want userspace to be able to do _something_ so cap the * delays at 0.25s. If there's 10's of seconds worth of delay then the * tasks will be delayed for 0.25 second for every syscall. If * blkcg_set_delay() was used as indicated by negative use_delay, the * caller is responsible for regulating the range. */ if (clamp) delay_nsec = min_t(u64, delay_nsec, 250 * NSEC_PER_MSEC); if (use_memdelay) psi_memstall_enter(&pflags); exp = ktime_add_ns(now, delay_nsec); tok = io_schedule_prepare(); do { __set_current_state(TASK_KILLABLE); if (!schedule_hrtimeout(&exp, HRTIMER_MODE_ABS)) break; } while (!fatal_signal_pending(current)); io_schedule_finish(tok); if (use_memdelay) psi_memstall_leave(&pflags); } /** * blkcg_maybe_throttle_current - throttle the current task if it has been marked * * This is only called if we've been marked with set_notify_resume(). Obviously * we can be set_notify_resume() for reasons other than blkcg throttling, so we * check to see if current->throttle_disk is set and if not this doesn't do * anything. This should only ever be called by the resume code, it's not meant * to be called by people willy-nilly as it will actually do the work to * throttle the task if it is setup for throttling. */ void blkcg_maybe_throttle_current(void) { struct gendisk *disk = current->throttle_disk; struct blkcg *blkcg; struct blkcg_gq *blkg; bool use_memdelay = current->use_memdelay; if (!disk) return; current->throttle_disk = NULL; current->use_memdelay = false; rcu_read_lock(); blkcg = css_to_blkcg(blkcg_css()); if (!blkcg) goto out; blkg = blkg_lookup(blkcg, disk->queue); if (!blkg) goto out; if (!blkg_tryget(blkg)) goto out; rcu_read_unlock(); blkcg_maybe_throttle_blkg(blkg, use_memdelay); blkg_put(blkg); put_disk(disk); return; out: rcu_read_unlock(); } /** * blkcg_schedule_throttle - this task needs to check for throttling * @disk: disk to throttle * @use_memdelay: do we charge this to memory delay for PSI * * This is called by the IO controller when we know there's delay accumulated * for the blkg for this task. 
We do not pass the blkg because there are places * we call this that may not have that information, the swapping code for * instance will only have a block_device at that point. This set's the * notify_resume for the task to check and see if it requires throttling before * returning to user space. * * We will only schedule once per syscall. You can call this over and over * again and it will only do the check once upon return to user space, and only * throttle once. If the task needs to be throttled again it'll need to be * re-set at the next time we see the task. */ void blkcg_schedule_throttle(struct gendisk *disk, bool use_memdelay) { if (unlikely(current->flags & PF_KTHREAD)) return; if (current->throttle_disk != disk) { if (test_bit(GD_DEAD, &disk->state)) return; get_device(disk_to_dev(disk)); if (current->throttle_disk) put_disk(current->throttle_disk); current->throttle_disk = disk; } if (use_memdelay) current->use_memdelay = use_memdelay; set_notify_resume(current); } /** * blkcg_add_delay - add delay to this blkg * @blkg: blkg of interest * @now: the current time in nanoseconds * @delta: how many nanoseconds of delay to add * * Charge @delta to the blkg's current delay accumulation. This is used to * throttle tasks if an IO controller thinks we need more throttling. */ void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta) { if (WARN_ON_ONCE(atomic_read(&blkg->use_delay) < 0)) return; blkcg_scale_delay(blkg, now); atomic64_add(delta, &blkg->delay_nsec); } /** * blkg_tryget_closest - try and get a blkg ref on the closet blkg * @bio: target bio * @css: target css * * As the failure mode here is to walk up the blkg tree, this ensure that the * blkg->parent pointers are always valid. This returns the blkg that it ended * up taking a reference on or %NULL if no reference was taken. */ static inline struct blkcg_gq *blkg_tryget_closest(struct bio *bio, struct cgroup_subsys_state *css) { struct blkcg_gq *blkg, *ret_blkg = NULL; rcu_read_lock(); blkg = blkg_lookup_create(css_to_blkcg(css), bio->bi_bdev->bd_disk); while (blkg) { if (blkg_tryget(blkg)) { ret_blkg = blkg; break; } blkg = blkg->parent; } rcu_read_unlock(); return ret_blkg; } /** * bio_associate_blkg_from_css - associate a bio with a specified css * @bio: target bio * @css: target css * * Associate @bio with the blkg found by combining the css's blkg and the * request_queue of the @bio. An association failure is handled by walking up * the blkg tree. Therefore, the blkg associated can be anything between @blkg * and q->root_blkg. This situation only happens when a cgroup is dying and * then the remaining bios will spill to the closest alive blkg. * * A reference will be taken on the blkg and will be released when @bio is * freed. */ void bio_associate_blkg_from_css(struct bio *bio, struct cgroup_subsys_state *css) { if (bio->bi_blkg) blkg_put(bio->bi_blkg); if (css && css->parent) { bio->bi_blkg = blkg_tryget_closest(bio, css); } else { blkg_get(bdev_get_queue(bio->bi_bdev)->root_blkg); bio->bi_blkg = bdev_get_queue(bio->bi_bdev)->root_blkg; } } EXPORT_SYMBOL_GPL(bio_associate_blkg_from_css); /** * bio_associate_blkg - associate a bio with a blkg * @bio: target bio * * Associate @bio with the blkg found from the bio's css and request_queue. * If one is not found, bio_lookup_blkg() creates the blkg. If a blkg is * already associated, the css is reused and association redone as the * request_queue may have changed. 
*/ void bio_associate_blkg(struct bio *bio) { struct cgroup_subsys_state *css; if (blk_op_is_passthrough(bio->bi_opf)) return; rcu_read_lock(); if (bio->bi_blkg) css = bio_blkcg_css(bio); else css = blkcg_css(); bio_associate_blkg_from_css(bio, css); rcu_read_unlock(); } EXPORT_SYMBOL_GPL(bio_associate_blkg); /** * bio_clone_blkg_association - clone blkg association from src to dst bio * @dst: destination bio * @src: source bio */ void bio_clone_blkg_association(struct bio *dst, struct bio *src) { if (src->bi_blkg) bio_associate_blkg_from_css(dst, bio_blkcg_css(src)); } EXPORT_SYMBOL_GPL(bio_clone_blkg_association); static int blk_cgroup_io_type(struct bio *bio) { if (op_is_discard(bio->bi_opf)) return BLKG_IOSTAT_DISCARD; if (op_is_write(bio->bi_opf)) return BLKG_IOSTAT_WRITE; return BLKG_IOSTAT_READ; } void blk_cgroup_bio_start(struct bio *bio) { struct blkcg *blkcg = bio->bi_blkg->blkcg; int rwd = blk_cgroup_io_type(bio), cpu; struct blkg_iostat_set *bis; unsigned long flags; if (!cgroup_subsys_on_dfl(io_cgrp_subsys)) return; /* Root-level stats are sourced from system-wide IO stats */ if (!cgroup_parent(blkcg->css.cgroup)) return; cpu = get_cpu(); bis = per_cpu_ptr(bio->bi_blkg->iostat_cpu, cpu); flags = u64_stats_update_begin_irqsave(&bis->sync); /* * If the bio is flagged with BIO_CGROUP_ACCT it means this is a split * bio and we would have already accounted for the size of the bio. */ if (!bio_flagged(bio, BIO_CGROUP_ACCT)) { bio_set_flag(bio, BIO_CGROUP_ACCT); bis->cur.bytes[rwd] += bio->bi_iter.bi_size; } bis->cur.ios[rwd]++; /* * If the iostat_cpu isn't in a lockless list, put it into the * list to indicate that a stat update is pending. */ if (!READ_ONCE(bis->lqueued)) { struct llist_head *lhead = this_cpu_ptr(blkcg->lhead); llist_add(&bis->lnode, lhead); WRITE_ONCE(bis->lqueued, true); } u64_stats_update_end_irqrestore(&bis->sync, flags); cgroup_rstat_updated(blkcg->css.cgroup, cpu); put_cpu(); } bool blk_cgroup_congested(void) { struct blkcg *blkcg; bool ret = false; rcu_read_lock(); for (blkcg = css_to_blkcg(blkcg_css()); blkcg; blkcg = blkcg_parent(blkcg)) { if (atomic_read(&blkcg->congestion_count)) { ret = true; break; } } rcu_read_unlock(); return ret; } module_param(blkcg_debug_stats, bool, 0644); MODULE_PARM_DESC(blkcg_debug_stats, "True if you want debug stats, false if not"); |
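/*
 * Example (not part of blk-cgroup.c): a minimal sketch of how a blkcg policy
 * might hook into the registration/activation paths above. The callback
 * signatures follow the calls made by blkcg_activate_policy() and
 * blkcg_policy_register(); all "demo_" names are hypothetical and error
 * handling is reduced to the essentials. It also illustrates the rule that
 * pd_alloc_fn/pd_free_fn must be supplied as a pair, otherwise
 * blkcg_policy_register() rejects the policy. Assumes the usual
 * blk-cgroup.h/slab.h includes.
 */
struct demo_pd {
	struct blkg_policy_data pd;	/* embedded; passed back to callbacks */
	u64 ios_seen;
};

static struct blkg_policy_data *demo_pd_alloc(struct gendisk *disk,
					      struct blkcg *blkcg, gfp_t gfp)
{
	struct demo_pd *dpd = kzalloc(sizeof(*dpd), gfp);

	return dpd ? &dpd->pd : NULL;
}

static void demo_pd_free(struct blkg_policy_data *pd)
{
	kfree(container_of(pd, struct demo_pd, pd));
}

static struct blkcg_policy demo_policy = {
	.pd_alloc_fn	= demo_pd_alloc,
	.pd_free_fn	= demo_pd_free,
};

/* Registration makes the policy known; activation enables it per disk. */
static int demo_attach(struct gendisk *disk)
{
	int ret = blkcg_policy_register(&demo_policy);

	if (ret)
		return ret;

	ret = blkcg_activate_policy(disk, &demo_policy);
	if (ret)
		blkcg_policy_unregister(&demo_policy);
	return ret;
}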
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Definitions of the Internet Protocol.
 *
 * Version:	@(#)in.h	1.0.1	04/21/93
 *
 * Authors:	Original taken from the GNU Project <netinet/in.h> file.
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 */
#ifndef _LINUX_IN_H
#define _LINUX_IN_H

#include <linux/errno.h>
#include <uapi/linux/in.h>

static inline int proto_ports_offset(int proto)
{
	switch (proto) {
	case IPPROTO_TCP:
	case IPPROTO_UDP:
	case IPPROTO_DCCP:
	case IPPROTO_ESP:	/* SPI */
	case IPPROTO_SCTP:
	case IPPROTO_UDPLITE:
		return 0;
	case IPPROTO_AH:	/* SPI */
		return 4;
	default:
		return -EINVAL;
	}
}

static inline bool ipv4_is_loopback(__be32 addr)
{
	return (addr & htonl(0xff000000)) == htonl(0x7f000000);
}

static inline bool ipv4_is_multicast(__be32 addr)
{
	return (addr & htonl(0xf0000000)) == htonl(0xe0000000);
}

static inline bool ipv4_is_local_multicast(__be32 addr)
{
	return (addr & htonl(0xffffff00)) == htonl(0xe0000000);
}

static inline bool ipv4_is_lbcast(__be32 addr)
{
	/* limited broadcast */
	return addr == htonl(INADDR_BROADCAST);
}

static inline bool ipv4_is_all_snoopers(__be32 addr)
{
	return addr == htonl(INADDR_ALLSNOOPERS_GROUP);
}

static inline bool ipv4_is_zeronet(__be32 addr)
{
	return (addr == 0);
}

/* Special-Use IPv4 Addresses (RFC3330) */

static inline bool ipv4_is_private_10(__be32 addr)
{
	return (addr & htonl(0xff000000)) == htonl(0x0a000000);
}

static inline bool ipv4_is_private_172(__be32 addr)
{
	return (addr & htonl(0xfff00000)) == htonl(0xac100000);
}

static inline bool ipv4_is_private_192(__be32 addr)
{
	return (addr & htonl(0xffff0000)) == htonl(0xc0a80000);
}

static inline bool ipv4_is_linklocal_169(__be32 addr)
{
	return (addr & htonl(0xffff0000)) == htonl(0xa9fe0000);
}

static inline bool ipv4_is_anycast_6to4(__be32 addr)
{
	return (addr & htonl(0xffffff00)) == htonl(0xc0586300);
}

static inline bool ipv4_is_test_192(__be32 addr)
{
	return (addr & htonl(0xffffff00)) == htonl(0xc0000200);
}

static inline bool ipv4_is_test_198(__be32 addr)
{
	return (addr & htonl(0xfffe0000)) == htonl(0xc6120000);
}
#endif	/* _LINUX_IN_H */
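/*
 * Example (not part of this header): a hypothetical helper built from the
 * predicates above, showing how a caller might test for any RFC 1918
 * private address in one call. The name ipv4_is_rfc1918() is illustrative
 * only and does not exist in linux/in.h.
 */
static inline bool ipv4_is_rfc1918(__be32 addr)
{
	return ipv4_is_private_10(addr) ||
	       ipv4_is_private_172(addr) ||
	       ipv4_is_private_192(addr);
}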
/* SPDX-License-Identifier: GPL-2.0 */
/* Copyright (C) 2018 - Arm Ltd */

#ifndef __ARM64_KVM_RAS_H__
#define __ARM64_KVM_RAS_H__

#include <linux/acpi.h>
#include <linux/errno.h>
#include <linux/types.h>

#include <asm/acpi.h>

/*
 * Was this synchronous external abort a RAS notification?
 * Returns '0' for errors handled by some RAS subsystem, or -ENOENT.
 */
static inline int kvm_handle_guest_sea(phys_addr_t addr, u64 esr)
{
	/* apei_claim_sea(NULL) expects to mask interrupts itself */
	lockdep_assert_irqs_enabled();

	return apei_claim_sea(NULL);
}

#endif /* __ARM64_KVM_RAS_H__ */
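/*
 * Example (hypothetical caller, not part of this header): how abort-handling
 * code might consume the return value. A zero return means a RAS subsystem
 * (APEI) claimed the error and nothing needs to be forwarded to the guest;
 * -ENOENT means the abort was not a RAS notification, so the caller handles
 * it itself. kvm_inject_vabt() is the existing arm64 KVM helper for
 * reporting an abort back to the guest, but this exact flow is illustrative
 * and assumes the usual KVM headers.
 */
static int demo_handle_guest_sea(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
				 u64 esr)
{
	if (kvm_handle_guest_sea(fault_ipa, esr) == 0)
		return 1;		/* error consumed by the host */

	kvm_inject_vabt(vcpu);		/* not RAS: report it to the guest */
	return 1;
}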
2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 | // SPDX-License-Identifier: GPL-2.0 #include <linux/kernel.h> #include <linux/irqflags.h> #include <linux/string.h> #include <linux/errno.h> #include <linux/bug.h> #include "printk_ringbuffer.h" #include "internal.h" /** * DOC: printk_ringbuffer overview * * Data Structure * -------------- * The printk_ringbuffer is made up of 3 internal ringbuffers: * * desc_ring * A ring of descriptors and their meta data (such as sequence number, * timestamp, loglevel, etc.) as well as internal state information about * the record and logical positions specifying where in the other * ringbuffer the text strings are located. * * text_data_ring * A ring of data blocks. A data block consists of an unsigned long * integer (ID) that maps to a desc_ring index followed by the text * string of the record. * * The internal state information of a descriptor is the key element to allow * readers and writers to locklessly synchronize access to the data. * * Implementation * -------------- * * Descriptor Ring * ~~~~~~~~~~~~~~~ * The descriptor ring is an array of descriptors. A descriptor contains * essential meta data to track the data of a printk record using * blk_lpos structs pointing to associated text data blocks (see * "Data Rings" below). Each descriptor is assigned an ID that maps * directly to index values of the descriptor array and has a state. The ID * and the state are bitwise combined into a single descriptor field named * @state_var, allowing ID and state to be synchronously and atomically * updated. * * Descriptors have four states: * * reserved * A writer is modifying the record. * * committed * The record and all its data are written. A writer can reopen the * descriptor (transitioning it back to reserved), but in the committed * state the data is consistent. * * finalized * The record and all its data are complete and available for reading. A * writer cannot reopen the descriptor. * * reusable * The record exists, but its text and/or meta data may no longer be * available. * * Querying the @state_var of a record requires providing the ID of the * descriptor to query. This can yield a possible fifth (pseudo) state: * * miss * The descriptor being queried has an unexpected ID. * * The descriptor ring has a @tail_id that contains the ID of the oldest * descriptor and @head_id that contains the ID of the newest descriptor. * * When a new descriptor should be created (and the ring is full), the tail * descriptor is invalidated by first transitioning to the reusable state and * then invalidating all tail data blocks up to and including the data blocks * associated with the tail descriptor (for the text ring). Then * @tail_id is advanced, followed by advancing @head_id. And finally the * @state_var of the new descriptor is initialized to the new ID and reserved * state. * * The @tail_id can only be advanced if the new @tail_id would be in the * committed or reusable queried state. This makes it possible that a valid * sequence number of the tail is always available. * * Descriptor Finalization * ~~~~~~~~~~~~~~~~~~~~~~~ * When a writer calls the commit function prb_commit(), record data is * fully stored and is consistent within the ringbuffer. However, a writer can * reopen that record, claiming exclusive access (as with prb_reserve()), and * modify that record. When finished, the writer must again commit the record. 
* * In order for a record to be made available to readers (and also become * recyclable for writers), it must be finalized. A finalized record cannot be * reopened and can never become "unfinalized". Record finalization can occur * in three different scenarios: * * 1) A writer can simultaneously commit and finalize its record by calling * prb_final_commit() instead of prb_commit(). * * 2) When a new record is reserved and the previous record has been * committed via prb_commit(), that previous record is automatically * finalized. * * 3) When a record is committed via prb_commit() and a newer record * already exists, the record being committed is automatically finalized. * * Data Ring * ~~~~~~~~~ * The text data ring is a byte array composed of data blocks. Data blocks are * referenced by blk_lpos structs that point to the logical position of the * beginning of a data block and the beginning of the next adjacent data * block. Logical positions are mapped directly to index values of the byte * array ringbuffer. * * Each data block consists of an ID followed by the writer data. The ID is * the identifier of a descriptor that is associated with the data block. A * given data block is considered valid if all of the following conditions * are met: * * 1) The descriptor associated with the data block is in the committed * or finalized queried state. * * 2) The blk_lpos struct within the descriptor associated with the data * block references back to the same data block. * * 3) The data block is within the head/tail logical position range. * * If the writer data of a data block would extend beyond the end of the * byte array, only the ID of the data block is stored at the logical * position and the full data block (ID and writer data) is stored at the * beginning of the byte array. The referencing blk_lpos will point to the * ID before the wrap and the next data block will be at the logical * position adjacent the full data block after the wrap. * * Data rings have a @tail_lpos that points to the beginning of the oldest * data block and a @head_lpos that points to the logical position of the * next (not yet existing) data block. * * When a new data block should be created (and the ring is full), tail data * blocks will first be invalidated by putting their associated descriptors * into the reusable state and then pushing the @tail_lpos forward beyond * them. Then the @head_lpos is pushed forward and is associated with a new * descriptor. If a data block is not valid, the @tail_lpos cannot be * advanced beyond it. * * Info Array * ~~~~~~~~~~ * The general meta data of printk records are stored in printk_info structs, * stored in an array with the same number of elements as the descriptor ring. * Each info corresponds to the descriptor of the same index in the * descriptor ring. Info validity is confirmed by evaluating the corresponding * descriptor before and after loading the info. * * Usage * ----- * Here are some simple examples demonstrating writers and readers. For the * examples a global ringbuffer (test_rb) is available (which is not the * actual ringbuffer used by printk):: * * DEFINE_PRINTKRB(test_rb, 15, 5); * * This ringbuffer allows up to 32768 records (2 ^ 15) and has a size of * 1 MiB (2 ^ (15 + 5)) for text data. 
* * Sample writer code:: * * const char *textstr = "message text"; * struct prb_reserved_entry e; * struct printk_record r; * * // specify how much to allocate * prb_rec_init_wr(&r, strlen(textstr) + 1); * * if (prb_reserve(&e, &test_rb, &r)) { * snprintf(r.text_buf, r.text_buf_size, "%s", textstr); * * r.info->text_len = strlen(textstr); * r.info->ts_nsec = local_clock(); * r.info->caller_id = printk_caller_id(); * * // commit and finalize the record * prb_final_commit(&e); * } * * Note that additional writer functions are available to extend a record * after it has been committed but not yet finalized. This can be done as * long as no new records have been reserved and the caller is the same. * * Sample writer code (record extending):: * * // alternate rest of previous example * * r.info->text_len = strlen(textstr); * r.info->ts_nsec = local_clock(); * r.info->caller_id = printk_caller_id(); * * // commit the record (but do not finalize yet) * prb_commit(&e); * } * * ... * * // specify additional 5 bytes text space to extend * prb_rec_init_wr(&r, 5); * * // try to extend, but only if it does not exceed 32 bytes * if (prb_reserve_in_last(&e, &test_rb, &r, printk_caller_id(), 32)) { * snprintf(&r.text_buf[r.info->text_len], * r.text_buf_size - r.info->text_len, "hello"); * * r.info->text_len += 5; * * // commit and finalize the record * prb_final_commit(&e); * } * * Sample reader code:: * * struct printk_info info; * struct printk_record r; * char text_buf[32]; * u64 seq; * * prb_rec_init_rd(&r, &info, &text_buf[0], sizeof(text_buf)); * * prb_for_each_record(0, &test_rb, &seq, &r) { * if (info.seq != seq) * pr_warn("lost %llu records\n", info.seq - seq); * * if (info.text_len > r.text_buf_size) { * pr_warn("record %llu text truncated\n", info.seq); * text_buf[r.text_buf_size - 1] = 0; * } * * pr_info("%llu: %llu: %s\n", info.seq, info.ts_nsec, * &text_buf[0]); * } * * Note that additional less convenient reader functions are available to * allow complex record access. * * ABA Issues * ~~~~~~~~~~ * To help avoid ABA issues, descriptors are referenced by IDs (array index * values combined with tagged bits counting array wraps) and data blocks are * referenced by logical positions (array index values combined with tagged * bits counting array wraps). However, on 32-bit systems the number of * tagged bits is relatively small such that an ABA incident is (at least * theoretically) possible. For example, if 4 million maximally sized (1KiB) * printk messages were to occur in NMI context on a 32-bit system, the * interrupted context would not be able to recognize that the 32-bit integer * completely wrapped and thus represents a different data block than the one * the interrupted context expects. * * To help combat this possibility, additional state checking is performed * (such as using cmpxchg() even though set() would suffice). These extra * checks are commented as such and will hopefully catch any ABA issue that * a 32-bit system might experience. * * Memory Barriers * ~~~~~~~~~~~~~~~ * Multiple memory barriers are used. To simplify proving correctness and * generating litmus tests, lines of code related to memory barriers * (loads, stores, and the associated memory barriers) are labeled:: * * LMM(function:letter) * * Comments reference the labels using only the "function:letter" part. 
* * The memory barrier pairs and their ordering are: * * desc_reserve:D / desc_reserve:B * push descriptor tail (id), then push descriptor head (id) * * desc_reserve:D / data_push_tail:B * push data tail (lpos), then set new descriptor reserved (state) * * desc_reserve:D / desc_push_tail:C * push descriptor tail (id), then set new descriptor reserved (state) * * desc_reserve:D / prb_first_seq:C * push descriptor tail (id), then set new descriptor reserved (state) * * desc_reserve:F / desc_read:D * set new descriptor id and reserved (state), then allow writer changes * * data_alloc:A (or data_realloc:A) / desc_read:D * set old descriptor reusable (state), then modify new data block area * * data_alloc:A (or data_realloc:A) / data_push_tail:B * push data tail (lpos), then modify new data block area * * _prb_commit:B / desc_read:B * store writer changes, then set new descriptor committed (state) * * desc_reopen_last:A / _prb_commit:B * set descriptor reserved (state), then read descriptor data * * _prb_commit:B / desc_reserve:D * set new descriptor committed (state), then check descriptor head (id) * * data_push_tail:D / data_push_tail:A * set descriptor reusable (state), then push data tail (lpos) * * desc_push_tail:B / desc_reserve:D * set descriptor reusable (state), then push descriptor tail (id) * * desc_update_last_finalized:A / desc_last_finalized_seq:A * store finalized record, then set new highest finalized sequence number */ #define DATA_SIZE(data_ring) _DATA_SIZE((data_ring)->size_bits) #define DATA_SIZE_MASK(data_ring) (DATA_SIZE(data_ring) - 1) #define DESCS_COUNT(desc_ring) _DESCS_COUNT((desc_ring)->count_bits) #define DESCS_COUNT_MASK(desc_ring) (DESCS_COUNT(desc_ring) - 1) /* Determine the data array index from a logical position. */ #define DATA_INDEX(data_ring, lpos) ((lpos) & DATA_SIZE_MASK(data_ring)) /* Determine the desc array index from an ID or sequence number. */ #define DESC_INDEX(desc_ring, n) ((n) & DESCS_COUNT_MASK(desc_ring)) /* Determine how many times the data array has wrapped. */ #define DATA_WRAPS(data_ring, lpos) ((lpos) >> (data_ring)->size_bits) /* Determine if a logical position refers to a data-less block. */ #define LPOS_DATALESS(lpos) ((lpos) & 1UL) #define BLK_DATALESS(blk) (LPOS_DATALESS((blk)->begin) && \ LPOS_DATALESS((blk)->next)) /* Get the logical position at index 0 of the current wrap. */ #define DATA_THIS_WRAP_START_LPOS(data_ring, lpos) \ ((lpos) & ~DATA_SIZE_MASK(data_ring)) /* Get the ID for the same index of the previous wrap as the given ID. */ #define DESC_ID_PREV_WRAP(desc_ring, id) \ DESC_ID((id) - DESCS_COUNT(desc_ring)) /* * A data block: mapped directly to the beginning of the data block area * specified as a logical position within the data ring. * * @id: the ID of the associated descriptor * @data: the writer data * * Note that the size of a data block is only known by its associated * descriptor. */ struct prb_data_block { unsigned long id; char data[]; }; /* * Return the descriptor associated with @n. @n can be either a * descriptor ID or a sequence number. */ static struct prb_desc *to_desc(struct prb_desc_ring *desc_ring, u64 n) { return &desc_ring->descs[DESC_INDEX(desc_ring, n)]; } /* * Return the printk_info associated with @n. @n can be either a * descriptor ID or a sequence number. 
*/ static struct printk_info *to_info(struct prb_desc_ring *desc_ring, u64 n) { return &desc_ring->infos[DESC_INDEX(desc_ring, n)]; } static struct prb_data_block *to_block(struct prb_data_ring *data_ring, unsigned long begin_lpos) { return (void *)&data_ring->data[DATA_INDEX(data_ring, begin_lpos)]; } /* * Increase the data size to account for data block meta data plus any * padding so that the adjacent data block is aligned on the ID size. */ static unsigned int to_blk_size(unsigned int size) { struct prb_data_block *db = NULL; size += sizeof(*db); size = ALIGN(size, sizeof(db->id)); return size; } /* * Sanity checker for reserve size. The ringbuffer code assumes that a data * block does not exceed the maximum possible size that could fit within the * ringbuffer. This function provides that basic size check so that the * assumption is safe. */ static bool data_check_size(struct prb_data_ring *data_ring, unsigned int size) { struct prb_data_block *db = NULL; if (size == 0) return true; /* * Ensure the alignment padded size could possibly fit in the data * array. The largest possible data block must still leave room for * at least the ID of the next block. */ size = to_blk_size(size); if (size > DATA_SIZE(data_ring) - sizeof(db->id)) return false; return true; } /* Query the state of a descriptor. */ static enum desc_state get_desc_state(unsigned long id, unsigned long state_val) { if (id != DESC_ID(state_val)) return desc_miss; return DESC_STATE(state_val); } /* * Get a copy of a specified descriptor and return its queried state. If the * descriptor is in an inconsistent state (miss or reserved), the caller can * only expect the descriptor's @state_var field to be valid. * * The sequence number and caller_id can be optionally retrieved. Like all * non-state_var data, they are only valid if the descriptor is in a * consistent state. */ static enum desc_state desc_read(struct prb_desc_ring *desc_ring, unsigned long id, struct prb_desc *desc_out, u64 *seq_out, u32 *caller_id_out) { struct printk_info *info = to_info(desc_ring, id); struct prb_desc *desc = to_desc(desc_ring, id); atomic_long_t *state_var = &desc->state_var; enum desc_state d_state; unsigned long state_val; /* Check the descriptor state. */ state_val = atomic_long_read(state_var); /* LMM(desc_read:A) */ d_state = get_desc_state(id, state_val); if (d_state == desc_miss || d_state == desc_reserved) { /* * The descriptor is in an inconsistent state. Set at least * @state_var so that the caller can see the details of * the inconsistent state. */ goto out; } /* * Guarantee the state is loaded before copying the descriptor * content. This avoids copying obsolete descriptor content that might * not apply to the descriptor state. This pairs with _prb_commit:B. * * Memory barrier involvement: * * If desc_read:A reads from _prb_commit:B, then desc_read:C reads * from _prb_commit:A. * * Relies on: * * WMB from _prb_commit:A to _prb_commit:B * matching * RMB from desc_read:A to desc_read:C */ smp_rmb(); /* LMM(desc_read:B) */ /* * Copy the descriptor data. The data is not valid until the * state has been re-checked. A memcpy() for all of @desc * cannot be used because of the atomic_t @state_var field. */ if (desc_out) { memcpy(&desc_out->text_blk_lpos, &desc->text_blk_lpos, sizeof(desc_out->text_blk_lpos)); /* LMM(desc_read:C) */ } if (seq_out) *seq_out = info->seq; /* also part of desc_read:C */ if (caller_id_out) *caller_id_out = info->caller_id; /* also part of desc_read:C */ /* * 1. 
Guarantee the descriptor content is loaded before re-checking * the state. This avoids reading an obsolete descriptor state * that may not apply to the copied content. This pairs with * desc_reserve:F. * * Memory barrier involvement: * * If desc_read:C reads from desc_reserve:G, then desc_read:E * reads from desc_reserve:F. * * Relies on: * * WMB from desc_reserve:F to desc_reserve:G * matching * RMB from desc_read:C to desc_read:E * * 2. Guarantee the record data is loaded before re-checking the * state. This avoids reading an obsolete descriptor state that may * not apply to the copied data. This pairs with data_alloc:A and * data_realloc:A. * * Memory barrier involvement: * * If copy_data:A reads from data_alloc:B, then desc_read:E * reads from desc_make_reusable:A. * * Relies on: * * MB from desc_make_reusable:A to data_alloc:B * matching * RMB from desc_read:C to desc_read:E * * Note: desc_make_reusable:A and data_alloc:B can be different * CPUs. However, the data_alloc:B CPU (which performs the * full memory barrier) must have previously seen * desc_make_reusable:A. */ smp_rmb(); /* LMM(desc_read:D) */ /* * The data has been copied. Return the current descriptor state, * which may have changed since the load above. */ state_val = atomic_long_read(state_var); /* LMM(desc_read:E) */ d_state = get_desc_state(id, state_val); out: if (desc_out) atomic_long_set(&desc_out->state_var, state_val); return d_state; } /* * Take a specified descriptor out of the finalized state by attempting * the transition from finalized to reusable. Either this context or some * other context will have been successful. */ static void desc_make_reusable(struct prb_desc_ring *desc_ring, unsigned long id) { unsigned long val_finalized = DESC_SV(id, desc_finalized); unsigned long val_reusable = DESC_SV(id, desc_reusable); struct prb_desc *desc = to_desc(desc_ring, id); atomic_long_t *state_var = &desc->state_var; atomic_long_cmpxchg_relaxed(state_var, val_finalized, val_reusable); /* LMM(desc_make_reusable:A) */ } /* * Given the text data ring, put the associated descriptor of each * data block from @lpos_begin until @lpos_end into the reusable state. * * If there is any problem making the associated descriptor reusable, either * the descriptor has not yet been finalized or another writer context has * already pushed the tail lpos past the problematic data block. Regardless, * on error the caller can re-load the tail lpos to determine the situation. */ static bool data_make_reusable(struct printk_ringbuffer *rb, unsigned long lpos_begin, unsigned long lpos_end, unsigned long *lpos_out) { struct prb_data_ring *data_ring = &rb->text_data_ring; struct prb_desc_ring *desc_ring = &rb->desc_ring; struct prb_data_block *blk; enum desc_state d_state; struct prb_desc desc; struct prb_data_blk_lpos *blk_lpos = &desc.text_blk_lpos; unsigned long id; /* Loop until @lpos_begin has advanced to or beyond @lpos_end. */ while ((lpos_end - lpos_begin) - 1 < DATA_SIZE(data_ring)) { blk = to_block(data_ring, lpos_begin); /* * Load the block ID from the data block. This is a data race * against a writer that may have newly reserved this data * area. If the loaded value matches a valid descriptor ID, * the blk_lpos of that descriptor will be checked to make * sure it points back to this data block. If the check fails, * the data area has been recycled by another writer. 
*/ id = blk->id; /* LMM(data_make_reusable:A) */ d_state = desc_read(desc_ring, id, &desc, NULL, NULL); /* LMM(data_make_reusable:B) */ switch (d_state) { case desc_miss: case desc_reserved: case desc_committed: return false; case desc_finalized: /* * This data block is invalid if the descriptor * does not point back to it. */ if (blk_lpos->begin != lpos_begin) return false; desc_make_reusable(desc_ring, id); break; case desc_reusable: /* * This data block is invalid if the descriptor * does not point back to it. */ if (blk_lpos->begin != lpos_begin) return false; break; } /* Advance @lpos_begin to the next data block. */ lpos_begin = blk_lpos->next; } *lpos_out = lpos_begin; return true; } /* * Advance the data ring tail to at least @lpos. This function puts * descriptors into the reusable state if the tail is pushed beyond * their associated data block. */ static bool data_push_tail(struct printk_ringbuffer *rb, unsigned long lpos) { struct prb_data_ring *data_ring = &rb->text_data_ring; unsigned long tail_lpos_new; unsigned long tail_lpos; unsigned long next_lpos; /* If @lpos is from a data-less block, there is nothing to do. */ if (LPOS_DATALESS(lpos)) return true; /* * Any descriptor states that have transitioned to reusable due to the * data tail being pushed to this loaded value will be visible to this * CPU. This pairs with data_push_tail:D. * * Memory barrier involvement: * * If data_push_tail:A reads from data_push_tail:D, then this CPU can * see desc_make_reusable:A. * * Relies on: * * MB from desc_make_reusable:A to data_push_tail:D * matches * READFROM from data_push_tail:D to data_push_tail:A * thus * READFROM from desc_make_reusable:A to this CPU */ tail_lpos = atomic_long_read(&data_ring->tail_lpos); /* LMM(data_push_tail:A) */ /* * Loop until the tail lpos is at or beyond @lpos. This condition * may already be satisfied, resulting in no full memory barrier * from data_push_tail:D being performed. However, since this CPU * sees the new tail lpos, any descriptor states that transitioned to * the reusable state must already be visible. */ while ((lpos - tail_lpos) - 1 < DATA_SIZE(data_ring)) { /* * Make all descriptors reusable that are associated with * data blocks before @lpos. */ if (!data_make_reusable(rb, tail_lpos, lpos, &next_lpos)) { /* * 1. Guarantee the block ID loaded in * data_make_reusable() is performed before * reloading the tail lpos. The failed * data_make_reusable() may be due to a newly * recycled data area causing the tail lpos to * have been previously pushed. This pairs with * data_alloc:A and data_realloc:A. * * Memory barrier involvement: * * If data_make_reusable:A reads from data_alloc:B, * then data_push_tail:C reads from * data_push_tail:D. * * Relies on: * * MB from data_push_tail:D to data_alloc:B * matching * RMB from data_make_reusable:A to * data_push_tail:C * * Note: data_push_tail:D and data_alloc:B can be * different CPUs. However, the data_alloc:B * CPU (which performs the full memory * barrier) must have previously seen * data_push_tail:D. * * 2. Guarantee the descriptor state loaded in * data_make_reusable() is performed before * reloading the tail lpos. The failed * data_make_reusable() may be due to a newly * recycled descriptor causing the tail lpos to * have been previously pushed. This pairs with * desc_reserve:D. * * Memory barrier involvement: * * If data_make_reusable:B reads from * desc_reserve:F, then data_push_tail:C reads * from data_push_tail:D. 
* * Relies on: * * MB from data_push_tail:D to desc_reserve:F * matching * RMB from data_make_reusable:B to * data_push_tail:C * * Note: data_push_tail:D and desc_reserve:F can * be different CPUs. However, the * desc_reserve:F CPU (which performs the * full memory barrier) must have previously * seen data_push_tail:D. */ smp_rmb(); /* LMM(data_push_tail:B) */ tail_lpos_new = atomic_long_read(&data_ring->tail_lpos ); /* LMM(data_push_tail:C) */ if (tail_lpos_new == tail_lpos) return false; /* Another CPU pushed the tail. Try again. */ tail_lpos = tail_lpos_new; continue; } /* * Guarantee any descriptor states that have transitioned to * reusable are stored before pushing the tail lpos. A full * memory barrier is needed since other CPUs may have made * the descriptor states reusable. This pairs with * data_push_tail:A. */ if (atomic_long_try_cmpxchg(&data_ring->tail_lpos, &tail_lpos, next_lpos)) { /* LMM(data_push_tail:D) */ break; } } return true; } /* * Advance the desc ring tail. This function advances the tail by one * descriptor, thus invalidating the oldest descriptor. Before advancing * the tail, the tail descriptor is made reusable and all data blocks up to * and including the descriptor's data block are invalidated (i.e. the data * ring tail is pushed past the data block of the descriptor being made * reusable). */ static bool desc_push_tail(struct printk_ringbuffer *rb, unsigned long tail_id) { struct prb_desc_ring *desc_ring = &rb->desc_ring; enum desc_state d_state; struct prb_desc desc; d_state = desc_read(desc_ring, tail_id, &desc, NULL, NULL); switch (d_state) { case desc_miss: /* * If the ID is exactly 1 wrap behind the expected, it is * in the process of being reserved by another writer and * must be considered reserved. */ if (DESC_ID(atomic_long_read(&desc.state_var)) == DESC_ID_PREV_WRAP(desc_ring, tail_id)) { return false; } /* * The ID has changed. Another writer must have pushed the * tail and recycled the descriptor already. Success is * returned because the caller is only interested in the * specified tail being pushed, which it was. */ return true; case desc_reserved: case desc_committed: return false; case desc_finalized: desc_make_reusable(desc_ring, tail_id); break; case desc_reusable: break; } /* * Data blocks must be invalidated before their associated * descriptor can be made available for recycling. Invalidating * them later is not possible because there is no way to trust * data blocks once their associated descriptor is gone. */ if (!data_push_tail(rb, desc.text_blk_lpos.next)) return false; /* * Check the next descriptor after @tail_id before pushing the tail * to it because the tail must always be in a finalized or reusable * state. The implementation of prb_first_seq() relies on this. * * A successful read implies that the next descriptor is less than or * equal to @head_id so there is no risk of pushing the tail past the * head. */ d_state = desc_read(desc_ring, DESC_ID(tail_id + 1), &desc, NULL, NULL); /* LMM(desc_push_tail:A) */ if (d_state == desc_finalized || d_state == desc_reusable) { /* * Guarantee any descriptor states that have transitioned to * reusable are stored before pushing the tail ID. This allows * verifying the recycled descriptor state. A full memory * barrier is needed since other CPUs may have made the * descriptor states reusable. This pairs with desc_reserve:D. 
*/ atomic_long_cmpxchg(&desc_ring->tail_id, tail_id, DESC_ID(tail_id + 1)); /* LMM(desc_push_tail:B) */ } else { /* * Guarantee the last state load from desc_read() is before * reloading @tail_id in order to see a new tail ID in the * case that the descriptor has been recycled. This pairs * with desc_reserve:D. * * Memory barrier involvement: * * If desc_push_tail:A reads from desc_reserve:F, then * desc_push_tail:D reads from desc_push_tail:B. * * Relies on: * * MB from desc_push_tail:B to desc_reserve:F * matching * RMB from desc_push_tail:A to desc_push_tail:D * * Note: desc_push_tail:B and desc_reserve:F can be different * CPUs. However, the desc_reserve:F CPU (which performs * the full memory barrier) must have previously seen * desc_push_tail:B. */ smp_rmb(); /* LMM(desc_push_tail:C) */ /* * Re-check the tail ID. The descriptor following @tail_id is * not in an allowed tail state. But if the tail has since * been moved by another CPU, then it does not matter. */ if (atomic_long_read(&desc_ring->tail_id) == tail_id) /* LMM(desc_push_tail:D) */ return false; } return true; } /* Reserve a new descriptor, invalidating the oldest if necessary. */ static bool desc_reserve(struct printk_ringbuffer *rb, unsigned long *id_out) { struct prb_desc_ring *desc_ring = &rb->desc_ring; unsigned long prev_state_val; unsigned long id_prev_wrap; struct prb_desc *desc; unsigned long head_id; unsigned long id; head_id = atomic_long_read(&desc_ring->head_id); /* LMM(desc_reserve:A) */ do { id = DESC_ID(head_id + 1); id_prev_wrap = DESC_ID_PREV_WRAP(desc_ring, id); /* * Guarantee the head ID is read before reading the tail ID. * Since the tail ID is updated before the head ID, this * guarantees that @id_prev_wrap is never ahead of the tail * ID. This pairs with desc_reserve:D. * * Memory barrier involvement: * * If desc_reserve:A reads from desc_reserve:D, then * desc_reserve:C reads from desc_push_tail:B. * * Relies on: * * MB from desc_push_tail:B to desc_reserve:D * matching * RMB from desc_reserve:A to desc_reserve:C * * Note: desc_push_tail:B and desc_reserve:D can be different * CPUs. However, the desc_reserve:D CPU (which performs * the full memory barrier) must have previously seen * desc_push_tail:B. */ smp_rmb(); /* LMM(desc_reserve:B) */ if (id_prev_wrap == atomic_long_read(&desc_ring->tail_id )) { /* LMM(desc_reserve:C) */ /* * Make space for the new descriptor by * advancing the tail. */ if (!desc_push_tail(rb, id_prev_wrap)) return false; } /* * 1. Guarantee the tail ID is read before validating the * recycled descriptor state. A read memory barrier is * sufficient for this. This pairs with desc_push_tail:B. * * Memory barrier involvement: * * If desc_reserve:C reads from desc_push_tail:B, then * desc_reserve:E reads from desc_make_reusable:A. * * Relies on: * * MB from desc_make_reusable:A to desc_push_tail:B * matching * RMB from desc_reserve:C to desc_reserve:E * * Note: desc_make_reusable:A and desc_push_tail:B can be * different CPUs. However, the desc_push_tail:B CPU * (which performs the full memory barrier) must have * previously seen desc_make_reusable:A. * * 2. Guarantee the tail ID is stored before storing the head * ID. This pairs with desc_reserve:B. * * 3. Guarantee any data ring tail changes are stored before * recycling the descriptor. Data ring tail changes can * happen via desc_push_tail()->data_push_tail(). A full * memory barrier is needed since another CPU may have * pushed the data ring tails. This pairs with * data_push_tail:B. * * 4. 
Guarantee a new tail ID is stored before recycling the * descriptor. A full memory barrier is needed since * another CPU may have pushed the tail ID. This pairs * with desc_push_tail:C and this also pairs with * prb_first_seq:C. * * 5. Guarantee the head ID is stored before trying to * finalize the previous descriptor. This pairs with * _prb_commit:B. */ } while (!atomic_long_try_cmpxchg(&desc_ring->head_id, &head_id, id)); /* LMM(desc_reserve:D) */ desc = to_desc(desc_ring, id); /* * If the descriptor has been recycled, verify the old state val. * See "ABA Issues" about why this verification is performed. */ prev_state_val = atomic_long_read(&desc->state_var); /* LMM(desc_reserve:E) */ if (prev_state_val && get_desc_state(id_prev_wrap, prev_state_val) != desc_reusable) { WARN_ON_ONCE(1); return false; } /* * Assign the descriptor a new ID and set its state to reserved. * See "ABA Issues" about why cmpxchg() instead of set() is used. * * Guarantee the new descriptor ID and state is stored before making * any other changes. A write memory barrier is sufficient for this. * This pairs with desc_read:D. */ if (!atomic_long_try_cmpxchg(&desc->state_var, &prev_state_val, DESC_SV(id, desc_reserved))) { /* LMM(desc_reserve:F) */ WARN_ON_ONCE(1); return false; } /* Now data in @desc can be modified: LMM(desc_reserve:G) */ *id_out = id; return true; } /* Determine the end of a data block. */ static unsigned long get_next_lpos(struct prb_data_ring *data_ring, unsigned long lpos, unsigned int size) { unsigned long begin_lpos; unsigned long next_lpos; begin_lpos = lpos; next_lpos = lpos + size; /* First check if the data block does not wrap. */ if (DATA_WRAPS(data_ring, begin_lpos) == DATA_WRAPS(data_ring, next_lpos)) return next_lpos; /* Wrapping data blocks store their data at the beginning. */ return (DATA_THIS_WRAP_START_LPOS(data_ring, next_lpos) + size); } /* * Allocate a new data block, invalidating the oldest data block(s) * if necessary. This function also associates the data block with * a specified descriptor. */ static char *data_alloc(struct printk_ringbuffer *rb, unsigned int size, struct prb_data_blk_lpos *blk_lpos, unsigned long id) { struct prb_data_ring *data_ring = &rb->text_data_ring; struct prb_data_block *blk; unsigned long begin_lpos; unsigned long next_lpos; if (size == 0) { /* * Data blocks are not created for empty lines. Instead, the * reader will recognize these special lpos values and handle * it appropriately. */ blk_lpos->begin = EMPTY_LINE_LPOS; blk_lpos->next = EMPTY_LINE_LPOS; return NULL; } size = to_blk_size(size); begin_lpos = atomic_long_read(&data_ring->head_lpos); do { next_lpos = get_next_lpos(data_ring, begin_lpos, size); if (!data_push_tail(rb, next_lpos - DATA_SIZE(data_ring))) { /* Failed to allocate, specify a data-less block. */ blk_lpos->begin = FAILED_LPOS; blk_lpos->next = FAILED_LPOS; return NULL; } /* * 1. Guarantee any descriptor states that have transitioned * to reusable are stored before modifying the newly * allocated data area. A full memory barrier is needed * since other CPUs may have made the descriptor states * reusable. See data_push_tail:A about why the reusable * states are visible. This pairs with desc_read:D. * * 2. Guarantee any updated tail lpos is stored before * modifying the newly allocated data area. Another CPU may * be in data_make_reusable() and is reading a block ID * from this area. data_make_reusable() can handle reading * a garbage block ID value, but then it must be able to * load a new tail lpos. 
A full memory barrier is needed * since other CPUs may have updated the tail lpos. This * pairs with data_push_tail:B. */ } while (!atomic_long_try_cmpxchg(&data_ring->head_lpos, &begin_lpos, next_lpos)); /* LMM(data_alloc:A) */ blk = to_block(data_ring, begin_lpos); blk->id = id; /* LMM(data_alloc:B) */ if (DATA_WRAPS(data_ring, begin_lpos) != DATA_WRAPS(data_ring, next_lpos)) { /* Wrapping data blocks store their data at the beginning. */ blk = to_block(data_ring, 0); /* * Store the ID on the wrapped block for consistency. * The printk_ringbuffer does not actually use it. */ blk->id = id; } blk_lpos->begin = begin_lpos; blk_lpos->next = next_lpos; return &blk->data[0]; } /* * Try to resize an existing data block associated with the descriptor * specified by @id. If the resized data block should become wrapped, it * copies the old data to the new data block. If @size yields a data block * with the same or less size, the data block is left as is. * * Fail if this is not the last allocated data block or if there is not * enough space or it is not possible make enough space. * * Return a pointer to the beginning of the entire data buffer or NULL on * failure. */ static char *data_realloc(struct printk_ringbuffer *rb, unsigned int size, struct prb_data_blk_lpos *blk_lpos, unsigned long id) { struct prb_data_ring *data_ring = &rb->text_data_ring; struct prb_data_block *blk; unsigned long head_lpos; unsigned long next_lpos; bool wrapped; /* Reallocation only works if @blk_lpos is the newest data block. */ head_lpos = atomic_long_read(&data_ring->head_lpos); if (head_lpos != blk_lpos->next) return NULL; /* Keep track if @blk_lpos was a wrapping data block. */ wrapped = (DATA_WRAPS(data_ring, blk_lpos->begin) != DATA_WRAPS(data_ring, blk_lpos->next)); size = to_blk_size(size); next_lpos = get_next_lpos(data_ring, blk_lpos->begin, size); /* If the data block does not increase, there is nothing to do. */ if (head_lpos - next_lpos < DATA_SIZE(data_ring)) { if (wrapped) blk = to_block(data_ring, 0); else blk = to_block(data_ring, blk_lpos->begin); return &blk->data[0]; } if (!data_push_tail(rb, next_lpos - DATA_SIZE(data_ring))) return NULL; /* The memory barrier involvement is the same as data_alloc:A. */ if (!atomic_long_try_cmpxchg(&data_ring->head_lpos, &head_lpos, next_lpos)) { /* LMM(data_realloc:A) */ return NULL; } blk = to_block(data_ring, blk_lpos->begin); if (DATA_WRAPS(data_ring, blk_lpos->begin) != DATA_WRAPS(data_ring, next_lpos)) { struct prb_data_block *old_blk = blk; /* Wrapping data blocks store their data at the beginning. */ blk = to_block(data_ring, 0); /* * Store the ID on the wrapped block for consistency. * The printk_ringbuffer does not actually use it. */ blk->id = id; if (!wrapped) { /* * Since the allocated space is now in the newly * created wrapping data block, copy the content * from the old data block. */ memcpy(&blk->data[0], &old_blk->data[0], (blk_lpos->next - blk_lpos->begin) - sizeof(blk->id)); } } blk_lpos->next = next_lpos; return &blk->data[0]; } /* Return the number of bytes used by a data block. */ static unsigned int space_used(struct prb_data_ring *data_ring, struct prb_data_blk_lpos *blk_lpos) { /* Data-less blocks take no space. */ if (BLK_DATALESS(blk_lpos)) return 0; if (DATA_WRAPS(data_ring, blk_lpos->begin) == DATA_WRAPS(data_ring, blk_lpos->next)) { /* Data block does not wrap. */ return (DATA_INDEX(data_ring, blk_lpos->next) - DATA_INDEX(data_ring, blk_lpos->begin)); } /* * For wrapping data blocks, the trailing (wasted) space is * also counted. 
*/ return (DATA_INDEX(data_ring, blk_lpos->next) + DATA_SIZE(data_ring) - DATA_INDEX(data_ring, blk_lpos->begin)); } /* * Given @blk_lpos, return a pointer to the writer data from the data block * and calculate the size of the data part. A NULL pointer is returned if * @blk_lpos specifies values that could never be legal. * * This function (used by readers) performs strict validation on the lpos * values to possibly detect bugs in the writer code. A WARN_ON_ONCE() is * triggered if an internal error is detected. */ static const char *get_data(struct prb_data_ring *data_ring, struct prb_data_blk_lpos *blk_lpos, unsigned int *data_size) { struct prb_data_block *db; /* Data-less data block description. */ if (BLK_DATALESS(blk_lpos)) { /* * Records that are just empty lines are also valid, even * though they do not have a data block. For such records * explicitly return empty string data to signify success. */ if (blk_lpos->begin == EMPTY_LINE_LPOS && blk_lpos->next == EMPTY_LINE_LPOS) { *data_size = 0; return ""; } /* Data lost, invalid, or otherwise unavailable. */ return NULL; } /* Regular data block: @begin less than @next and in same wrap. */ if (DATA_WRAPS(data_ring, blk_lpos->begin) == DATA_WRAPS(data_ring, blk_lpos->next) && blk_lpos->begin < blk_lpos->next) { db = to_block(data_ring, blk_lpos->begin); *data_size = blk_lpos->next - blk_lpos->begin; /* Wrapping data block: @begin is one wrap behind @next. */ } else if (DATA_WRAPS(data_ring, blk_lpos->begin + DATA_SIZE(data_ring)) == DATA_WRAPS(data_ring, blk_lpos->next)) { db = to_block(data_ring, 0); *data_size = DATA_INDEX(data_ring, blk_lpos->next); /* Illegal block description. */ } else { WARN_ON_ONCE(1); return NULL; } /* A valid data block will always be aligned to the ID size. */ if (WARN_ON_ONCE(blk_lpos->begin != ALIGN(blk_lpos->begin, sizeof(db->id))) || WARN_ON_ONCE(blk_lpos->next != ALIGN(blk_lpos->next, sizeof(db->id)))) { return NULL; } /* A valid data block will always have at least an ID. */ if (WARN_ON_ONCE(*data_size < sizeof(db->id))) return NULL; /* Subtract block ID space from size to reflect data size. */ *data_size -= sizeof(db->id); return &db->data[0]; } /* * Attempt to transition the newest descriptor from committed back to reserved * so that the record can be modified by a writer again. This is only possible * if the descriptor is not yet finalized and the provided @caller_id matches. */ static struct prb_desc *desc_reopen_last(struct prb_desc_ring *desc_ring, u32 caller_id, unsigned long *id_out) { unsigned long prev_state_val; enum desc_state d_state; struct prb_desc desc; struct prb_desc *d; unsigned long id; u32 cid; id = atomic_long_read(&desc_ring->head_id); /* * To reduce unnecessarily reopening, first check if the descriptor * state and caller ID are correct. */ d_state = desc_read(desc_ring, id, &desc, NULL, &cid); if (d_state != desc_committed || cid != caller_id) return NULL; d = to_desc(desc_ring, id); prev_state_val = DESC_SV(id, desc_committed); /* * Guarantee the reserved state is stored before reading any * record data. A full memory barrier is needed because @state_var * modification is followed by reading. This pairs with _prb_commit:B. * * Memory barrier involvement: * * If desc_reopen_last:A reads from _prb_commit:B, then * prb_reserve_in_last:A reads from _prb_commit:A. 
* * Relies on: * * WMB from _prb_commit:A to _prb_commit:B * matching * MB If desc_reopen_last:A to prb_reserve_in_last:A */ if (!atomic_long_try_cmpxchg(&d->state_var, &prev_state_val, DESC_SV(id, desc_reserved))) { /* LMM(desc_reopen_last:A) */ return NULL; } *id_out = id; return d; } /** * prb_reserve_in_last() - Re-reserve and extend the space in the ringbuffer * used by the newest record. * * @e: The entry structure to setup. * @rb: The ringbuffer to re-reserve and extend data in. * @r: The record structure to allocate buffers for. * @caller_id: The caller ID of the caller (reserving writer). * @max_size: Fail if the extended size would be greater than this. * * This is the public function available to writers to re-reserve and extend * data. * * The writer specifies the text size to extend (not the new total size) by * setting the @text_buf_size field of @r. To ensure proper initialization * of @r, prb_rec_init_wr() should be used. * * This function will fail if @caller_id does not match the caller ID of the * newest record. In that case the caller must reserve new data using * prb_reserve(). * * Context: Any context. Disables local interrupts on success. * Return: true if text data could be extended, otherwise false. * * On success: * * - @r->text_buf points to the beginning of the entire text buffer. * * - @r->text_buf_size is set to the new total size of the buffer. * * - @r->info is not touched so that @r->info->text_len could be used * to append the text. * * - prb_record_text_space() can be used on @e to query the new * actually used space. * * Important: All @r->info fields will already be set with the current values * for the record. I.e. @r->info->text_len will be less than * @text_buf_size. Writers can use @r->info->text_len to know * where concatenation begins and writers should update * @r->info->text_len after concatenating. */ bool prb_reserve_in_last(struct prb_reserved_entry *e, struct printk_ringbuffer *rb, struct printk_record *r, u32 caller_id, unsigned int max_size) { struct prb_desc_ring *desc_ring = &rb->desc_ring; struct printk_info *info; unsigned int data_size; struct prb_desc *d; unsigned long id; local_irq_save(e->irqflags); /* Transition the newest descriptor back to the reserved state. */ d = desc_reopen_last(desc_ring, caller_id, &id); if (!d) { local_irq_restore(e->irqflags); goto fail_reopen; } /* Now the writer has exclusive access: LMM(prb_reserve_in_last:A) */ info = to_info(desc_ring, id); /* * Set the @e fields here so that prb_commit() can be used if * anything fails from now on. */ e->rb = rb; e->id = id; /* * desc_reopen_last() checked the caller_id, but there was no * exclusive access at that point. The descriptor may have * changed since then. */ if (caller_id != info->caller_id) goto fail; if (BLK_DATALESS(&d->text_blk_lpos)) { if (WARN_ON_ONCE(info->text_len != 0)) { pr_warn_once("wrong text_len value (%hu, expecting 0)\n", info->text_len); info->text_len = 0; } if (!data_check_size(&rb->text_data_ring, r->text_buf_size)) goto fail; if (r->text_buf_size > max_size) goto fail; r->text_buf = data_alloc(rb, r->text_buf_size, &d->text_blk_lpos, id); } else { if (!get_data(&rb->text_data_ring, &d->text_blk_lpos, &data_size)) goto fail; /* * Increase the buffer size to include the original size. If * the meta data (@text_len) is not sane, use the full data * block size. 
*/ if (WARN_ON_ONCE(info->text_len > data_size)) { pr_warn_once("wrong text_len value (%hu, expecting <=%u)\n", info->text_len, data_size); info->text_len = data_size; } r->text_buf_size += info->text_len; if (!data_check_size(&rb->text_data_ring, r->text_buf_size)) goto fail; if (r->text_buf_size > max_size) goto fail; r->text_buf = data_realloc(rb, r->text_buf_size, &d->text_blk_lpos, id); } if (r->text_buf_size && !r->text_buf) goto fail; r->info = info; e->text_space = space_used(&rb->text_data_ring, &d->text_blk_lpos); return true; fail: prb_commit(e); /* prb_commit() re-enabled interrupts. */ fail_reopen: /* Make it clear to the caller that the re-reserve failed. */ memset(r, 0, sizeof(*r)); return false; } /* * @last_finalized_seq value guarantees that all records up to and including * this sequence number are finalized and can be read. The only exception are * too old records which have already been overwritten. * * It is also guaranteed that @last_finalized_seq only increases. * * Be aware that finalized records following non-finalized records are not * reported because they are not yet available to the reader. For example, * a new record stored via printk() will not be available to a printer if * it follows a record that has not been finalized yet. However, once that * non-finalized record becomes finalized, @last_finalized_seq will be * appropriately updated and the full set of finalized records will be * available to the printer. And since each printk() caller will either * directly print or trigger deferred printing of all available unprinted * records, all printk() messages will get printed. */ static u64 desc_last_finalized_seq(struct printk_ringbuffer *rb) { struct prb_desc_ring *desc_ring = &rb->desc_ring; unsigned long ulseq; /* * Guarantee the sequence number is loaded before loading the * associated record in order to guarantee that the record can be * seen by this CPU. This pairs with desc_update_last_finalized:A. */ ulseq = atomic_long_read_acquire(&desc_ring->last_finalized_seq ); /* LMM(desc_last_finalized_seq:A) */ return __ulseq_to_u64seq(rb, ulseq); } static bool _prb_read_valid(struct printk_ringbuffer *rb, u64 *seq, struct printk_record *r, unsigned int *line_count); /* * Check if there are records directly following @last_finalized_seq that are * finalized. If so, update @last_finalized_seq to the latest of these * records. It is not allowed to skip over records that are not yet finalized. */ static void desc_update_last_finalized(struct printk_ringbuffer *rb) { struct prb_desc_ring *desc_ring = &rb->desc_ring; u64 old_seq = desc_last_finalized_seq(rb); unsigned long oldval; unsigned long newval; u64 finalized_seq; u64 try_seq; try_again: finalized_seq = old_seq; try_seq = finalized_seq + 1; /* Try to find later finalized records. */ while (_prb_read_valid(rb, &try_seq, NULL, NULL)) { finalized_seq = try_seq; try_seq++; } /* No update needed if no later finalized record was found. */ if (finalized_seq == old_seq) return; oldval = __u64seq_to_ulseq(old_seq); newval = __u64seq_to_ulseq(finalized_seq); /* * Set the sequence number of a later finalized record that has been * seen. * * Guarantee the record data is visible to other CPUs before storing * its sequence number. This pairs with desc_last_finalized_seq:A. * * Memory barrier involvement: * * If desc_last_finalized_seq:A reads from * desc_update_last_finalized:A, then desc_read:A reads from * _prb_commit:B. 
* * Relies on: * * RELEASE from _prb_commit:B to desc_update_last_finalized:A * matching * ACQUIRE from desc_last_finalized_seq:A to desc_read:A * * Note: _prb_commit:B and desc_update_last_finalized:A can be * different CPUs. However, the desc_update_last_finalized:A * CPU (which performs the release) must have previously seen * _prb_commit:B. */ if (!atomic_long_try_cmpxchg_release(&desc_ring->last_finalized_seq, &oldval, newval)) { /* LMM(desc_update_last_finalized:A) */ old_seq = __ulseq_to_u64seq(rb, oldval); goto try_again; } } /* * Attempt to finalize a specified descriptor. If this fails, the descriptor * is either already final or it will finalize itself when the writer commits. */ static void desc_make_final(struct printk_ringbuffer *rb, unsigned long id) { struct prb_desc_ring *desc_ring = &rb->desc_ring; unsigned long prev_state_val = DESC_SV(id, desc_committed); struct prb_desc *d = to_desc(desc_ring, id); if (atomic_long_try_cmpxchg_relaxed(&d->state_var, &prev_state_val, DESC_SV(id, desc_finalized))) { /* LMM(desc_make_final:A) */ desc_update_last_finalized(rb); } } /** * prb_reserve() - Reserve space in the ringbuffer. * * @e: The entry structure to setup. * @rb: The ringbuffer to reserve data in. * @r: The record structure to allocate buffers for. * * This is the public function available to writers to reserve data. * * The writer specifies the text size to reserve by setting the * @text_buf_size field of @r. To ensure proper initialization of @r, * prb_rec_init_wr() should be used. * * Context: Any context. Disables local interrupts on success. * Return: true if at least text data could be allocated, otherwise false. * * On success, the fields @info and @text_buf of @r will be set by this * function and should be filled in by the writer before committing. Also * on success, prb_record_text_space() can be used on @e to query the actual * space used for the text data block. * * Important: @info->text_len needs to be set correctly by the writer in * order for data to be readable and/or extended. Its value * is initialized to 0. */ bool prb_reserve(struct prb_reserved_entry *e, struct printk_ringbuffer *rb, struct printk_record *r) { struct prb_desc_ring *desc_ring = &rb->desc_ring; struct printk_info *info; struct prb_desc *d; unsigned long id; u64 seq; if (!data_check_size(&rb->text_data_ring, r->text_buf_size)) goto fail; /* * Descriptors in the reserved state act as blockers to all further * reservations once the desc_ring has fully wrapped. Disable * interrupts during the reserve/commit window in order to minimize * the likelihood of this happening. */ local_irq_save(e->irqflags); if (!desc_reserve(rb, &id)) { /* Descriptor reservation failures are tracked. */ atomic_long_inc(&rb->fail); local_irq_restore(e->irqflags); goto fail; } d = to_desc(desc_ring, id); info = to_info(desc_ring, id); /* * All @info fields (except @seq) are cleared and must be filled in * by the writer. Save @seq before clearing because it is used to * determine the new sequence number. */ seq = info->seq; memset(info, 0, sizeof(*info)); /* * Set the @e fields here so that prb_commit() can be used if * text data allocation fails. */ e->rb = rb; e->id = id; /* * Initialize the sequence number if it has "never been set". * Otherwise just increment it by a full wrap. * * @seq is considered "never been set" if it has a value of 0, * _except_ for @infos[0], which was specially setup by the ringbuffer * initializer and therefore is always considered as set. 
* * See the "Bootstrap" comment block in printk_ringbuffer.h for * details about how the initializer bootstraps the descriptors. */ if (seq == 0 && DESC_INDEX(desc_ring, id) != 0) info->seq = DESC_INDEX(desc_ring, id); else info->seq = seq + DESCS_COUNT(desc_ring); /* * New data is about to be reserved. Once that happens, previous * descriptors are no longer able to be extended. Finalize the * previous descriptor now so that it can be made available to * readers. (For seq==0 there is no previous descriptor.) */ if (info->seq > 0) desc_make_final(rb, DESC_ID(id - 1)); r->text_buf = data_alloc(rb, r->text_buf_size, &d->text_blk_lpos, id); /* If text data allocation fails, a data-less record is committed. */ if (r->text_buf_size && !r->text_buf) { prb_commit(e); /* prb_commit() re-enabled interrupts. */ goto fail; } r->info = info; /* Record full text space used by record. */ e->text_space = space_used(&rb->text_data_ring, &d->text_blk_lpos); return true; fail: /* Make it clear to the caller that the reserve failed. */ memset(r, 0, sizeof(*r)); return false; } /* Commit the data (possibly finalizing it) and restore interrupts. */ static void _prb_commit(struct prb_reserved_entry *e, unsigned long state_val) { struct prb_desc_ring *desc_ring = &e->rb->desc_ring; struct prb_desc *d = to_desc(desc_ring, e->id); unsigned long prev_state_val = DESC_SV(e->id, desc_reserved); /* Now the writer has finished all writing: LMM(_prb_commit:A) */ /* * Set the descriptor as committed. See "ABA Issues" about why * cmpxchg() instead of set() is used. * * 1 Guarantee all record data is stored before the descriptor state * is stored as committed. A write memory barrier is sufficient * for this. This pairs with desc_read:B and desc_reopen_last:A. * * 2. Guarantee the descriptor state is stored as committed before * re-checking the head ID in order to possibly finalize this * descriptor. This pairs with desc_reserve:D. * * Memory barrier involvement: * * If prb_commit:A reads from desc_reserve:D, then * desc_make_final:A reads from _prb_commit:B. * * Relies on: * * MB _prb_commit:B to prb_commit:A * matching * MB desc_reserve:D to desc_make_final:A */ if (!atomic_long_try_cmpxchg(&d->state_var, &prev_state_val, DESC_SV(e->id, state_val))) { /* LMM(_prb_commit:B) */ WARN_ON_ONCE(1); } /* Restore interrupts, the reserve/commit window is finished. */ local_irq_restore(e->irqflags); } /** * prb_commit() - Commit (previously reserved) data to the ringbuffer. * * @e: The entry containing the reserved data information. * * This is the public function available to writers to commit data. * * Note that the data is not yet available to readers until it is finalized. * Finalizing happens automatically when space for the next record is * reserved. * * See prb_final_commit() for a version of this function that finalizes * immediately. * * Context: Any context. Enables local interrupts. */ void prb_commit(struct prb_reserved_entry *e) { struct prb_desc_ring *desc_ring = &e->rb->desc_ring; unsigned long head_id; _prb_commit(e, desc_committed); /* * If this descriptor is no longer the head (i.e. a new record has * been allocated), extending the data for this record is no longer * allowed and therefore it must be finalized. */ head_id = atomic_long_read(&desc_ring->head_id); /* LMM(prb_commit:A) */ if (head_id != e->id) desc_make_final(e->rb, e->id); } /** * prb_final_commit() - Commit and finalize (previously reserved) data to * the ringbuffer. * * @e: The entry containing the reserved data information. 
* * This is the public function available to writers to commit+finalize data. * * By finalizing, the data is made immediately available to readers. * * This function should only be used if there are no intentions of extending * this data using prb_reserve_in_last(). * * Context: Any context. Enables local interrupts. */ void prb_final_commit(struct prb_reserved_entry *e) { _prb_commit(e, desc_finalized); desc_update_last_finalized(e->rb); } /* * Count the number of lines in provided text. All text has at least 1 line * (even if @text_size is 0). Each '\n' processed is counted as an additional * line. */ static unsigned int count_lines(const char *text, unsigned int text_size) { unsigned int next_size = text_size; unsigned int line_count = 1; const char *next = text; while (next_size) { next = memchr(next, '\n', next_size); if (!next) break; line_count++; next++; next_size = text_size - (next - text); } return line_count; } /* * Given @blk_lpos, copy an expected @len of data into the provided buffer. * If @line_count is provided, count the number of lines in the data. * * This function (used by readers) performs strict validation on the data * size to possibly detect bugs in the writer code. A WARN_ON_ONCE() is * triggered if an internal error is detected. */ static bool copy_data(struct prb_data_ring *data_ring, struct prb_data_blk_lpos *blk_lpos, u16 len, char *buf, unsigned int buf_size, unsigned int *line_count) { unsigned int data_size; const char *data; /* Caller might not want any data. */ if ((!buf || !buf_size) && !line_count) return true; data = get_data(data_ring, blk_lpos, &data_size); if (!data) return false; /* * Actual cannot be less than expected. It can be more than expected * because of the trailing alignment padding. * * Note that invalid @len values can occur because the caller loads * the value during an allowed data race. */ if (data_size < (unsigned int)len) return false; /* Caller interested in the line count? */ if (line_count) *line_count = count_lines(data, len); /* Caller interested in the data content? */ if (!buf || !buf_size) return true; data_size = min_t(unsigned int, buf_size, len); memcpy(&buf[0], data, data_size); /* LMM(copy_data:A) */ return true; } /* * This is an extended version of desc_read(). It gets a copy of a specified * descriptor. However, it also verifies that the record is finalized and has * the sequence number @seq. On success, 0 is returned. * * Error return values: * -EINVAL: A finalized record with sequence number @seq does not exist. * -ENOENT: A finalized record with sequence number @seq exists, but its data * is not available. This is a valid record, so readers should * continue with the next record. */ static int desc_read_finalized_seq(struct prb_desc_ring *desc_ring, unsigned long id, u64 seq, struct prb_desc *desc_out) { struct prb_data_blk_lpos *blk_lpos = &desc_out->text_blk_lpos; enum desc_state d_state; u64 s; d_state = desc_read(desc_ring, id, desc_out, &s, NULL); /* * An unexpected @id (desc_miss) or @seq mismatch means the record * does not exist. A descriptor in the reserved or committed state * means the record does not yet exist for the reader. */ if (d_state == desc_miss || d_state == desc_reserved || d_state == desc_committed || s != seq) { return -EINVAL; } /* * A descriptor in the reusable state may no longer have its data * available; report it as existing but with lost data. Or the record * may actually be a record with lost data. 
*/ if (d_state == desc_reusable || (blk_lpos->begin == FAILED_LPOS && blk_lpos->next == FAILED_LPOS)) { return -ENOENT; } return 0; } /* * Copy the ringbuffer data from the record with @seq to the provided * @r buffer. On success, 0 is returned. * * See desc_read_finalized_seq() for error return values. */ static int prb_read(struct printk_ringbuffer *rb, u64 seq, struct printk_record *r, unsigned int *line_count) { struct prb_desc_ring *desc_ring = &rb->desc_ring; struct printk_info *info = to_info(desc_ring, seq); struct prb_desc *rdesc = to_desc(desc_ring, seq); atomic_long_t *state_var = &rdesc->state_var; struct prb_desc desc; unsigned long id; int err; /* Extract the ID, used to specify the descriptor to read. */ id = DESC_ID(atomic_long_read(state_var)); /* Get a local copy of the correct descriptor (if available). */ err = desc_read_finalized_seq(desc_ring, id, seq, &desc); /* * If @r is NULL, the caller is only interested in the availability * of the record. */ if (err || !r) return err; /* If requested, copy meta data. */ if (r->info) memcpy(r->info, info, sizeof(*(r->info))); /* Copy text data. If it fails, this is a data-less record. */ if (!copy_data(&rb->text_data_ring, &desc.text_blk_lpos, info->text_len, r->text_buf, r->text_buf_size, line_count)) { return -ENOENT; } /* Ensure the record is still finalized and has the same @seq. */ return desc_read_finalized_seq(desc_ring, id, seq, &desc); } /* Get the sequence number of the tail descriptor. */ u64 prb_first_seq(struct printk_ringbuffer *rb) { struct prb_desc_ring *desc_ring = &rb->desc_ring; enum desc_state d_state; struct prb_desc desc; unsigned long id; u64 seq; for (;;) { id = atomic_long_read(&rb->desc_ring.tail_id); /* LMM(prb_first_seq:A) */ d_state = desc_read(desc_ring, id, &desc, &seq, NULL); /* LMM(prb_first_seq:B) */ /* * This loop will not be infinite because the tail is * _always_ in the finalized or reusable state. */ if (d_state == desc_finalized || d_state == desc_reusable) break; /* * Guarantee the last state load from desc_read() is before * reloading @tail_id in order to see a new tail in the case * that the descriptor has been recycled. This pairs with * desc_reserve:D. * * Memory barrier involvement: * * If prb_first_seq:B reads from desc_reserve:F, then * prb_first_seq:A reads from desc_push_tail:B. * * Relies on: * * MB from desc_push_tail:B to desc_reserve:F * matching * RMB prb_first_seq:B to prb_first_seq:A */ smp_rmb(); /* LMM(prb_first_seq:C) */ } return seq; } /** * prb_next_reserve_seq() - Get the sequence number after the most recently * reserved record. * * @rb: The ringbuffer to get the sequence number from. * * This is the public function available to readers to see what sequence * number will be assigned to the next reserved record. * * Note that depending on the situation, this value can be equal to or * higher than the sequence number returned by prb_next_seq(). * * Context: Any context. * Return: The sequence number that will be assigned to the next record * reserved. */ u64 prb_next_reserve_seq(struct printk_ringbuffer *rb) { struct prb_desc_ring *desc_ring = &rb->desc_ring; unsigned long last_finalized_id; atomic_long_t *state_var; u64 last_finalized_seq; unsigned long head_id; struct prb_desc desc; unsigned long diff; struct prb_desc *d; int err; /* * It may not be possible to read a sequence number for @head_id. * So the ID of @last_finalized_seq is used to calculate what the * sequence number of @head_id will be. 
*/ try_again: last_finalized_seq = desc_last_finalized_seq(rb); /* * @head_id is loaded after @last_finalized_seq to ensure that * it points to the record with @last_finalized_seq or newer. * * Memory barrier involvement: * * If desc_last_finalized_seq:A reads from * desc_update_last_finalized:A, then * prb_next_reserve_seq:A reads from desc_reserve:D. * * Relies on: * * RELEASE from desc_reserve:D to desc_update_last_finalized:A * matching * ACQUIRE from desc_last_finalized_seq:A to prb_next_reserve_seq:A * * Note: desc_reserve:D and desc_update_last_finalized:A can be * different CPUs. However, the desc_update_last_finalized:A CPU * (which performs the release) must have previously seen * desc_read:C, which implies desc_reserve:D can be seen. */ head_id = atomic_long_read(&desc_ring->head_id); /* LMM(prb_next_reserve_seq:A) */ d = to_desc(desc_ring, last_finalized_seq); state_var = &d->state_var; /* Extract the ID, used to specify the descriptor to read. */ last_finalized_id = DESC_ID(atomic_long_read(state_var)); /* Ensure @last_finalized_id is correct. */ err = desc_read_finalized_seq(desc_ring, last_finalized_id, last_finalized_seq, &desc); if (err == -EINVAL) { if (last_finalized_seq == 0) { /* * No record has been finalized or even reserved yet. * * The @head_id is initialized such that the first * increment will yield the first record (seq=0). * Handle it separately to avoid a negative @diff * below. */ if (head_id == DESC0_ID(desc_ring->count_bits)) return 0; /* * One or more descriptors are already reserved. Use * the descriptor ID of the first one (@seq=0) for * the @diff below. */ last_finalized_id = DESC0_ID(desc_ring->count_bits) + 1; } else { /* Record must have been overwritten. Try again. */ goto try_again; } } /* Diff of known descriptor IDs to compute related sequence numbers. */ diff = head_id - last_finalized_id; /* * @head_id points to the most recently reserved record, but this * function returns the sequence number that will be assigned to the * next (not yet reserved) record. Thus +1 is needed. */ return (last_finalized_seq + diff + 1); } /* * Non-blocking read of a record. * * On success @seq is updated to the record that was read and (if provided) * @r and @line_count will contain the read/calculated data. * * On failure @seq is updated to a record that is not yet available to the * reader, but it will be the next record available to the reader. * * Note: When the current CPU is in panic, this function will skip over any * non-existent/non-finalized records in order to allow the panic CPU * to print any and all records that have been finalized. */ static bool _prb_read_valid(struct printk_ringbuffer *rb, u64 *seq, struct printk_record *r, unsigned int *line_count) { u64 tail_seq; int err; while ((err = prb_read(rb, *seq, r, line_count))) { tail_seq = prb_first_seq(rb); if (*seq < tail_seq) { /* * Behind the tail. Catch up and try again. This * can happen for -ENOENT and -EINVAL cases. */ *seq = tail_seq; } else if (err == -ENOENT) { /* Record exists, but the data was lost. Skip. */ (*seq)++; } else { /* * Non-existent/non-finalized record. Must stop. * * For panic situations it cannot be expected that * non-finalized records will become finalized. But * there may be other finalized records beyond that * need to be printed for a panic situation. If this * is the panic CPU, skip this * non-existent/non-finalized record unless it is * at or beyond the head, in which case it is not * possible to continue. 
* * Note that new messages printed on panic CPU are * finalized when we are here. The only exception * might be the last message without trailing newline. * But it would have the sequence number returned * by "prb_next_reserve_seq() - 1". */ if (this_cpu_in_panic() && ((*seq + 1) < prb_next_reserve_seq(rb))) (*seq)++; else return false; } } return true; } /** * prb_read_valid() - Non-blocking read of a requested record or (if gone) * the next available record. * * @rb: The ringbuffer to read from. * @seq: The sequence number of the record to read. * @r: A record data buffer to store the read record to. * * This is the public function available to readers to read a record. * * The reader provides the @info and @text_buf buffers of @r to be * filled in. Any of the buffer pointers can be set to NULL if the reader * is not interested in that data. To ensure proper initialization of @r, * prb_rec_init_rd() should be used. * * Context: Any context. * Return: true if a record was read, otherwise false. * * On success, the reader must check r->info.seq to see which record was * actually read. This allows the reader to detect dropped records. * * Failure means @seq refers to a record not yet available to the reader. */ bool prb_read_valid(struct printk_ringbuffer *rb, u64 seq, struct printk_record *r) { return _prb_read_valid(rb, &seq, r, NULL); } /** * prb_read_valid_info() - Non-blocking read of meta data for a requested * record or (if gone) the next available record. * * @rb: The ringbuffer to read from. * @seq: The sequence number of the record to read. * @info: A buffer to store the read record meta data to. * @line_count: A buffer to store the number of lines in the record text. * * This is the public function available to readers to read only the * meta data of a record. * * The reader provides the @info, @line_count buffers to be filled in. * Either of the buffer pointers can be set to NULL if the reader is not * interested in that data. * * Context: Any context. * Return: true if a record's meta data was read, otherwise false. * * On success, the reader must check info->seq to see which record meta data * was actually read. This allows the reader to detect dropped records. * * Failure means @seq refers to a record not yet available to the reader. */ bool prb_read_valid_info(struct printk_ringbuffer *rb, u64 seq, struct printk_info *info, unsigned int *line_count) { struct printk_record r; prb_rec_init_rd(&r, info, NULL, 0); return _prb_read_valid(rb, &seq, &r, line_count); } /** * prb_first_valid_seq() - Get the sequence number of the oldest available * record. * * @rb: The ringbuffer to get the sequence number from. * * This is the public function available to readers to see what the * first/oldest valid sequence number is. * * This provides readers a starting point to begin iterating the ringbuffer. * * Context: Any context. * Return: The sequence number of the first/oldest record or, if the * ringbuffer is empty, 0 is returned. */ u64 prb_first_valid_seq(struct printk_ringbuffer *rb) { u64 seq = 0; if (!_prb_read_valid(rb, &seq, NULL, NULL)) return 0; return seq; } /** * prb_next_seq() - Get the sequence number after the last available record. * * @rb: The ringbuffer to get the sequence number from. * * This is the public function available to readers to see what the next * newest sequence number available to readers will be. * * This provides readers a sequence number to jump to if all currently * available records should be skipped. 
It is guaranteed that all records * previous to the returned value have been finalized and are (or were) * available to the reader. * * Context: Any context. * Return: The sequence number of the next newest (not yet available) record * for readers. */ u64 prb_next_seq(struct printk_ringbuffer *rb) { u64 seq; seq = desc_last_finalized_seq(rb); /* * Begin searching after the last finalized record. * * On 0, the search must begin at 0 because, due to hack#2 * of the bootstrapping phase, it is not known whether a * record at index 0 exists. */ if (seq != 0) seq++; /* * The information about the last finalized @seq might be inaccurate. * Search forward to find the current one. */ while (_prb_read_valid(rb, &seq, NULL, NULL)) seq++; return seq; } /** * prb_init() - Initialize a ringbuffer to use provided external buffers. * * @rb: The ringbuffer to initialize. * @text_buf: The data buffer for text data. * @textbits: The size of @text_buf as a power-of-2 value. * @descs: The descriptor buffer for ringbuffer records. * @descbits: The count of @descs items as a power-of-2 value. * @infos: The printk_info buffer for ringbuffer records. * * This is the public function available to writers to setup a ringbuffer * during runtime using provided buffers. * * This must match the initialization of DEFINE_PRINTKRB(). * * Context: Any context. */ void prb_init(struct printk_ringbuffer *rb, char *text_buf, unsigned int textbits, struct prb_desc *descs, unsigned int descbits, struct printk_info *infos) { memset(descs, 0, _DESCS_COUNT(descbits) * sizeof(descs[0])); memset(infos, 0, _DESCS_COUNT(descbits) * sizeof(infos[0])); rb->desc_ring.count_bits = descbits; rb->desc_ring.descs = descs; rb->desc_ring.infos = infos; atomic_long_set(&rb->desc_ring.head_id, DESC0_ID(descbits)); atomic_long_set(&rb->desc_ring.tail_id, DESC0_ID(descbits)); atomic_long_set(&rb->desc_ring.last_finalized_seq, 0); rb->text_data_ring.size_bits = textbits; rb->text_data_ring.data = text_buf; atomic_long_set(&rb->text_data_ring.head_lpos, BLK0_LPOS(textbits)); atomic_long_set(&rb->text_data_ring.tail_lpos, BLK0_LPOS(textbits)); atomic_long_set(&rb->fail, 0); atomic_long_set(&(descs[_DESCS_COUNT(descbits) - 1].state_var), DESC0_SV(descbits)); descs[_DESCS_COUNT(descbits) - 1].text_blk_lpos.begin = FAILED_LPOS; descs[_DESCS_COUNT(descbits) - 1].text_blk_lpos.next = FAILED_LPOS; infos[0].seq = -(u64)_DESCS_COUNT(descbits); infos[_DESCS_COUNT(descbits) - 1].seq = 0; } /** * prb_record_text_space() - Query the full actual used ringbuffer space for * the text data of a reserved entry. * * @e: The successfully reserved entry to query. * * This is the public function available to writers to see how much actual * space is used in the ringbuffer to store the text data of the specified * entry. * * This function is only valid if @e has been successfully reserved using * prb_reserve(). * * Context: Any context. * Return: The size in bytes used by the text data of the associated record. */ unsigned int prb_record_text_space(struct prb_reserved_entry *e) { return e->text_space; }
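/*
 * Example (illustrative sketch, not part of printk_ringbuffer.c): how the
 * writer and reader API above fits together. The ringbuffer pointer "rb",
 * the caller ID value and the max_size limit are assumptions for
 * illustration only; prb_rec_init_wr()/prb_rec_init_rd() come from
 * printk_ringbuffer.h. The first fragment is committed without finalizing
 * so the record can still be extended via prb_reserve_in_last(); the
 * extension is then finalized so readers can see the combined text.
 */
static void prb_example_write_and_extend(struct printk_ringbuffer *rb)
{
        struct prb_reserved_entry e;
        struct printk_record r;
        static const char part1[] = "hello";
        static const char part2[] = " world";
        u32 caller_id = 1;      /* assumption: any stable per-context ID */

        /* Reserve space for the first text fragment. */
        prb_rec_init_wr(&r, sizeof(part1));
        if (!prb_reserve(&e, rb, &r))
                return;
        memcpy(r.text_buf, part1, sizeof(part1) - 1);
        r.info->text_len = sizeof(part1) - 1;
        r.info->caller_id = caller_id;
        /* Commit but do not finalize: the record stays extendable. */
        prb_commit(&e);

        /* Re-open the newest record and append the second fragment. */
        prb_rec_init_wr(&r, sizeof(part2));
        if (prb_reserve_in_last(&e, rb, &r, caller_id, 128)) {
                memcpy(&r.text_buf[r.info->text_len], part2, sizeof(part2) - 1);
                r.info->text_len += sizeof(part2) - 1;
                /* Finalize so the combined record becomes readable. */
                prb_final_commit(&e);
        }
        /* On failure a real writer would fall back to prb_reserve(). */
}

static void prb_example_read_all(struct printk_ringbuffer *rb)
{
        struct printk_info info;
        struct printk_record r;
        char text[64];
        u64 seq;
        int len;

        prb_rec_init_rd(&r, &info, text, sizeof(text));

        /*
         * r.info->seq reports which record was actually read, so dropped
         * records can be detected by comparing it against @seq.
         */
        for (seq = prb_first_valid_seq(rb); prb_read_valid(rb, seq, &r);
             seq = r.info->seq + 1) {
                len = min_t(unsigned int, r.info->text_len, sizeof(text));
                pr_info("%llu: %.*s\n", r.info->seq, len, text);
        }
}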
// SPDX-License-Identifier: GPL-2.0-only /* * VFIO core * * Copyright (C) 2012 Red Hat, Inc. All rights reserved. * Author: Alex Williamson <alex.williamson@redhat.com> * * Derived from original vfio: * Copyright 2010 Cisco Systems, Inc. All rights reserved. * Author: Tom Lyon, pugs@cisco.com */ #include <linux/vfio.h> #include <linux/iommufd.h> #include <linux/anon_inodes.h> #include "vfio.h" static struct vfio { struct class *class; struct list_head group_list; struct mutex group_lock; /* locks group_list */ struct ida group_ida; dev_t group_devt; } vfio; static struct vfio_device *vfio_device_get_from_name(struct vfio_group *group, char *buf) { struct vfio_device *it, *device = ERR_PTR(-ENODEV); mutex_lock(&group->device_lock); list_for_each_entry(it, &group->device_list, group_next) { int ret; if (it->ops->match) { ret = it->ops->match(it, buf); if (ret < 0) { device = ERR_PTR(ret); break; } } else { ret = !strcmp(dev_name(it->dev), buf); } if (ret && vfio_device_try_get_registration(it)) { device = it; break; } } mutex_unlock(&group->device_lock); return device; } /* * VFIO Group fd, /dev/vfio/$GROUP */ static bool vfio_group_has_iommu(struct vfio_group *group) { lockdep_assert_held(&group->group_lock); /* * There can only be users if there is a container, and if there is a * container there must be users. */ WARN_ON(!group->container != !group->container_users); return group->container || group->iommufd; } /* * VFIO_GROUP_UNSET_CONTAINER should fail if there are other users or * if there was no container to unset. Since the ioctl is called on * the group, we know that still exists, therefore the only valid * transition here is 1->0. */ static int vfio_group_ioctl_unset_container(struct vfio_group *group) { int ret = 0; mutex_lock(&group->group_lock); if (!vfio_group_has_iommu(group)) { ret = -EINVAL; goto out_unlock; } if (group->container) { if (group->container_users != 1) { ret = -EBUSY; goto out_unlock; } vfio_group_detach_container(group); } if (group->iommufd) { iommufd_ctx_put(group->iommufd); group->iommufd = NULL; } out_unlock: mutex_unlock(&group->group_lock); return ret; } static int vfio_group_ioctl_set_container(struct vfio_group *group, int __user *arg) { struct vfio_container *container; struct iommufd_ctx *iommufd; int ret; int fd; if (get_user(fd, arg)) return -EFAULT; CLASS(fd, f)(fd); if (fd_empty(f)) return -EBADF; mutex_lock(&group->group_lock); if (vfio_group_has_iommu(group)) { ret = -EINVAL; goto out_unlock; } if (!group->iommu_group) { ret = -ENODEV; goto out_unlock; } container = vfio_container_from_file(fd_file(f)); if (container) { ret = vfio_container_attach_group(container, group); goto out_unlock; } iommufd = iommufd_ctx_from_file(fd_file(f)); if (!IS_ERR(iommufd)) { if (IS_ENABLED(CONFIG_VFIO_NOIOMMU) && group->type == VFIO_NO_IOMMU) ret = iommufd_vfio_compat_set_no_iommu(iommufd); else ret = iommufd_vfio_compat_ioas_create(iommufd); if (ret) { iommufd_ctx_put(iommufd); goto out_unlock; } group->iommufd = iommufd; goto out_unlock; } /* The FD passed is not recognized. 
*/ ret = -EBADFD; out_unlock: mutex_unlock(&group->group_lock); return ret; } static void vfio_device_group_get_kvm_safe(struct vfio_device *device) { spin_lock(&device->group->kvm_ref_lock); vfio_device_get_kvm_safe(device, device->group->kvm); spin_unlock(&device->group->kvm_ref_lock); } static int vfio_df_group_open(struct vfio_device_file *df) { struct vfio_device *device = df->device; int ret; mutex_lock(&device->group->group_lock); if (!vfio_group_has_iommu(device->group)) { ret = -EINVAL; goto out_unlock; } mutex_lock(&device->dev_set->lock); /* * Before the first device open, get the KVM pointer currently * associated with the group (if there is one) and obtain a reference * now that will be held until the open_count reaches 0 again. Save * the pointer in the device for use by drivers. */ if (device->open_count == 0) vfio_device_group_get_kvm_safe(device); df->iommufd = device->group->iommufd; if (df->iommufd && vfio_device_is_noiommu(device) && device->open_count == 0) { /* * Require no compat ioas to be assigned to proceed. The basic * statement is that the user cannot have done something that * implies they expected translation to exist */ if (!capable(CAP_SYS_RAWIO) || vfio_iommufd_device_has_compat_ioas(device, df->iommufd)) ret = -EPERM; else ret = 0; goto out_put_kvm; } ret = vfio_df_open(df); if (ret) goto out_put_kvm; if (df->iommufd && device->open_count == 1) { ret = vfio_iommufd_compat_attach_ioas(device, df->iommufd); if (ret) goto out_close_device; } /* * Paired with smp_load_acquire() in vfio_device_fops::ioctl/ * read/write/mmap and vfio_file_has_device_access() */ smp_store_release(&df->access_granted, true); mutex_unlock(&device->dev_set->lock); mutex_unlock(&device->group->group_lock); return 0; out_close_device: vfio_df_close(df); out_put_kvm: df->iommufd = NULL; if (device->open_count == 0) vfio_device_put_kvm(device); mutex_unlock(&device->dev_set->lock); out_unlock: mutex_unlock(&device->group->group_lock); return ret; } void vfio_df_group_close(struct vfio_device_file *df) { struct vfio_device *device = df->device; mutex_lock(&device->group->group_lock); mutex_lock(&device->dev_set->lock); vfio_df_close(df); df->iommufd = NULL; if (device->open_count == 0) vfio_device_put_kvm(device); mutex_unlock(&device->dev_set->lock); mutex_unlock(&device->group->group_lock); } static struct file *vfio_device_open_file(struct vfio_device *device) { struct vfio_device_file *df; struct file *filep; int ret; df = vfio_allocate_device_file(device); if (IS_ERR(df)) { ret = PTR_ERR(df); goto err_out; } df->group = device->group; ret = vfio_df_group_open(df); if (ret) goto err_free; /* * We can't use anon_inode_getfd() because we need to modify * the f_mode flags directly to allow more than just ioctls */ filep = anon_inode_getfile("[vfio-device]", &vfio_device_fops, df, O_RDWR); if (IS_ERR(filep)) { ret = PTR_ERR(filep); goto err_close_device; } /* * TODO: add an anon_inode interface to do this. * Appears to be missing by lack of need rather than * explicitly prevented. Now there's need. */ filep->f_mode |= (FMODE_PREAD | FMODE_PWRITE); /* * Use the pseudo fs inode on the device to link all mmaps * to the same address space, allowing us to unmap all vmas * associated to this device using unmap_mapping_range(). 
*/ filep->f_mapping = device->inode->i_mapping; if (device->group->type == VFIO_NO_IOMMU) dev_warn(device->dev, "vfio-noiommu device opened by user " "(%s:%d)\n", current->comm, task_pid_nr(current)); /* * On success the ref of device is moved to the file and * put in vfio_device_fops_release() */ return filep; err_close_device: vfio_df_group_close(df); err_free: kfree(df); err_out: return ERR_PTR(ret); } static int vfio_group_ioctl_get_device_fd(struct vfio_group *group, char __user *arg) { struct vfio_device *device; struct file *filep; char *buf; int fdno; int ret; buf = strndup_user(arg, PAGE_SIZE); if (IS_ERR(buf)) return PTR_ERR(buf); device = vfio_device_get_from_name(group, buf); kfree(buf); if (IS_ERR(device)) return PTR_ERR(device); fdno = get_unused_fd_flags(O_CLOEXEC); if (fdno < 0) { ret = fdno; goto err_put_device; } filep = vfio_device_open_file(device); if (IS_ERR(filep)) { ret = PTR_ERR(filep); goto err_put_fdno; } fd_install(fdno, filep); return fdno; err_put_fdno: put_unused_fd(fdno); err_put_device: vfio_device_put_registration(device); return ret; } static int vfio_group_ioctl_get_status(struct vfio_group *group, struct vfio_group_status __user *arg) { unsigned long minsz = offsetofend(struct vfio_group_status, flags); struct vfio_group_status status; if (copy_from_user(&status, arg, minsz)) return -EFAULT; if (status.argsz < minsz) return -EINVAL; status.flags = 0; mutex_lock(&group->group_lock); if (!group->iommu_group) { mutex_unlock(&group->group_lock); return -ENODEV; } /* * With the container FD the iommu_group_claim_dma_owner() is done * during SET_CONTAINER but for IOMMUFD this is done during * VFIO_GROUP_GET_DEVICE_FD. Meaning that with iommufd * VFIO_GROUP_FLAGS_VIABLE could be set but GET_DEVICE_FD will fail due * to the group not actually being viable. 
*/ if (vfio_group_has_iommu(group)) status.flags |= VFIO_GROUP_FLAGS_CONTAINER_SET | VFIO_GROUP_FLAGS_VIABLE; else if (!iommu_group_dma_owner_claimed(group->iommu_group)) status.flags |= VFIO_GROUP_FLAGS_VIABLE; mutex_unlock(&group->group_lock); if (copy_to_user(arg, &status, minsz)) return -EFAULT; return 0; } static long vfio_group_fops_unl_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) { struct vfio_group *group = filep->private_data; void __user *uarg = (void __user *)arg; switch (cmd) { case VFIO_GROUP_GET_DEVICE_FD: return vfio_group_ioctl_get_device_fd(group, uarg); case VFIO_GROUP_GET_STATUS: return vfio_group_ioctl_get_status(group, uarg); case VFIO_GROUP_SET_CONTAINER: return vfio_group_ioctl_set_container(group, uarg); case VFIO_GROUP_UNSET_CONTAINER: return vfio_group_ioctl_unset_container(group); default: return -ENOTTY; } } int vfio_device_block_group(struct vfio_device *device) { struct vfio_group *group = device->group; int ret = 0; mutex_lock(&group->group_lock); if (group->opened_file) { ret = -EBUSY; goto out_unlock; } group->cdev_device_open_cnt++; out_unlock: mutex_unlock(&group->group_lock); return ret; } void vfio_device_unblock_group(struct vfio_device *device) { struct vfio_group *group = device->group; mutex_lock(&group->group_lock); group->cdev_device_open_cnt--; mutex_unlock(&group->group_lock); } static int vfio_group_fops_open(struct inode *inode, struct file *filep) { struct vfio_group *group = container_of(inode->i_cdev, struct vfio_group, cdev); int ret; mutex_lock(&group->group_lock); /* * drivers can be zero if this races with vfio_device_remove_group(), it * will be stable at 0 under the group rwsem */ if (refcount_read(&group->drivers) == 0) { ret = -ENODEV; goto out_unlock; } if (group->type == VFIO_NO_IOMMU && !capable(CAP_SYS_RAWIO)) { ret = -EPERM; goto out_unlock; } if (group->cdev_device_open_cnt) { ret = -EBUSY; goto out_unlock; } /* * Do we need multiple instances of the group open? Seems not. */ if (group->opened_file) { ret = -EBUSY; goto out_unlock; } group->opened_file = filep; filep->private_data = group; ret = 0; out_unlock: mutex_unlock(&group->group_lock); return ret; } static int vfio_group_fops_release(struct inode *inode, struct file *filep) { struct vfio_group *group = filep->private_data; filep->private_data = NULL; mutex_lock(&group->group_lock); /* * Device FDs hold a group file reference, therefore the group release * is only called when there are no open devices. */ WARN_ON(group->notifier.head); if (group->container) vfio_group_detach_container(group); if (group->iommufd) { iommufd_ctx_put(group->iommufd); group->iommufd = NULL; } group->opened_file = NULL; mutex_unlock(&group->group_lock); return 0; } static const struct file_operations vfio_group_fops = { .owner = THIS_MODULE, .unlocked_ioctl = vfio_group_fops_unl_ioctl, .compat_ioctl = compat_ptr_ioctl, .open = vfio_group_fops_open, .release = vfio_group_fops_release, }; /* * Group objects - create, release, get, put, search */ static struct vfio_group * vfio_group_find_from_iommu(struct iommu_group *iommu_group) { struct vfio_group *group; lockdep_assert_held(&vfio.group_lock); /* * group->iommu_group from the vfio.group_list cannot be NULL * under the vfio.group_lock. 
*/ list_for_each_entry(group, &vfio.group_list, vfio_next) { if (group->iommu_group == iommu_group) return group; } return NULL; } static void vfio_group_release(struct device *dev) { struct vfio_group *group = container_of(dev, struct vfio_group, dev); mutex_destroy(&group->device_lock); mutex_destroy(&group->group_lock); WARN_ON(group->iommu_group); WARN_ON(group->cdev_device_open_cnt); ida_free(&vfio.group_ida, MINOR(group->dev.devt)); kfree(group); } static struct vfio_group *vfio_group_alloc(struct iommu_group *iommu_group, enum vfio_group_type type) { struct vfio_group *group; int minor; group = kzalloc(sizeof(*group), GFP_KERNEL); if (!group) return ERR_PTR(-ENOMEM); minor = ida_alloc_max(&vfio.group_ida, MINORMASK, GFP_KERNEL); if (minor < 0) { kfree(group); return ERR_PTR(minor); } device_initialize(&group->dev); group->dev.devt = MKDEV(MAJOR(vfio.group_devt), minor); group->dev.class = vfio.class; group->dev.release = vfio_group_release; cdev_init(&group->cdev, &vfio_group_fops); group->cdev.owner = THIS_MODULE; refcount_set(&group->drivers, 1); mutex_init(&group->group_lock); spin_lock_init(&group->kvm_ref_lock); INIT_LIST_HEAD(&group->device_list); mutex_init(&group->device_lock); group->iommu_group = iommu_group; /* put in vfio_group_release() */ iommu_group_ref_get(iommu_group); group->type = type; BLOCKING_INIT_NOTIFIER_HEAD(&group->notifier); return group; } static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group, enum vfio_group_type type) { struct vfio_group *group; struct vfio_group *ret; int err; lockdep_assert_held(&vfio.group_lock); group = vfio_group_alloc(iommu_group, type); if (IS_ERR(group)) return group; err = dev_set_name(&group->dev, "%s%d", group->type == VFIO_NO_IOMMU ? "noiommu-" : "", iommu_group_id(iommu_group)); if (err) { ret = ERR_PTR(err); goto err_put; } err = cdev_device_add(&group->cdev, &group->dev); if (err) { ret = ERR_PTR(err); goto err_put; } list_add(&group->vfio_next, &vfio.group_list); return group; err_put: put_device(&group->dev); return ret; } static struct vfio_group *vfio_noiommu_group_alloc(struct device *dev, enum vfio_group_type type) { struct iommu_group *iommu_group; struct vfio_group *group; int ret; iommu_group = iommu_group_alloc(); if (IS_ERR(iommu_group)) return ERR_CAST(iommu_group); ret = iommu_group_set_name(iommu_group, "vfio-noiommu"); if (ret) goto out_put_group; ret = iommu_group_add_device(iommu_group, dev); if (ret) goto out_put_group; mutex_lock(&vfio.group_lock); group = vfio_create_group(iommu_group, type); mutex_unlock(&vfio.group_lock); if (IS_ERR(group)) { ret = PTR_ERR(group); goto out_remove_device; } iommu_group_put(iommu_group); return group; out_remove_device: iommu_group_remove_device(dev); out_put_group: iommu_group_put(iommu_group); return ERR_PTR(ret); } static bool vfio_group_has_device(struct vfio_group *group, struct device *dev) { struct vfio_device *device; mutex_lock(&group->device_lock); list_for_each_entry(device, &group->device_list, group_next) { if (device->dev == dev) { mutex_unlock(&group->device_lock); return true; } } mutex_unlock(&group->device_lock); return false; } static struct vfio_group *vfio_group_find_or_alloc(struct device *dev) { struct iommu_group *iommu_group; struct vfio_group *group; iommu_group = iommu_group_get(dev); if (!iommu_group && vfio_noiommu) { /* * With noiommu enabled, create an IOMMU group for devices that * don't already have one, implying no IOMMU hardware/driver * exists. 
Taint the kernel because we're about to give a DMA * capable device to a user without IOMMU protection. */ group = vfio_noiommu_group_alloc(dev, VFIO_NO_IOMMU); if (!IS_ERR(group)) { add_taint(TAINT_USER, LOCKDEP_STILL_OK); dev_warn(dev, "Adding kernel taint for vfio-noiommu group on device\n"); } return group; } if (!iommu_group) return ERR_PTR(-EINVAL); mutex_lock(&vfio.group_lock); group = vfio_group_find_from_iommu(iommu_group); if (group) { if (WARN_ON(vfio_group_has_device(group, dev))) group = ERR_PTR(-EINVAL); else refcount_inc(&group->drivers); } else { group = vfio_create_group(iommu_group, VFIO_IOMMU); } mutex_unlock(&vfio.group_lock); /* The vfio_group holds a reference to the iommu_group */ iommu_group_put(iommu_group); return group; } int vfio_device_set_group(struct vfio_device *device, enum vfio_group_type type) { struct vfio_group *group; if (type == VFIO_IOMMU) group = vfio_group_find_or_alloc(device->dev); else group = vfio_noiommu_group_alloc(device->dev, type); if (IS_ERR(group)) return PTR_ERR(group); /* Our reference on group is moved to the device */ device->group = group; return 0; } void vfio_device_remove_group(struct vfio_device *device) { struct vfio_group *group = device->group; struct iommu_group *iommu_group; if (group->type == VFIO_NO_IOMMU || group->type == VFIO_EMULATED_IOMMU) iommu_group_remove_device(device->dev); /* Pairs with vfio_create_group() / vfio_group_find_from_iommu() */ if (!refcount_dec_and_mutex_lock(&group->drivers, &vfio.group_lock)) return; list_del(&group->vfio_next); /* * We could concurrently probe another driver in the group that might * race vfio_device_remove_group() with vfio_group_find_or_alloc(), so * we have to ensure that the sysfs is all cleaned up under lock, * otherwise the cdev_device_add() will fail due to the name already * existing. */ cdev_device_del(&group->cdev, &group->dev); mutex_lock(&group->group_lock); /* * These data structures all have paired operations that can only be * undone when the caller holds a live reference on the device. Since * all pairs must be undone these WARN_ON's indicate some caller did not * properly hold the group reference. */ WARN_ON(!list_empty(&group->device_list)); WARN_ON(group->notifier.head); /* * Revoke all users of group->iommu_group. At this point we know there * are no devices active because we are unplugging the last one. Setting * iommu_group to NULL blocks all new users. 
	 */
	if (group->container)
		vfio_group_detach_container(group);
	iommu_group = group->iommu_group;
	group->iommu_group = NULL;
	mutex_unlock(&group->group_lock);
	mutex_unlock(&vfio.group_lock);

	iommu_group_put(iommu_group);
	put_device(&group->dev);
}

void vfio_device_group_register(struct vfio_device *device)
{
	mutex_lock(&device->group->device_lock);
	list_add(&device->group_next, &device->group->device_list);
	mutex_unlock(&device->group->device_lock);
}

void vfio_device_group_unregister(struct vfio_device *device)
{
	mutex_lock(&device->group->device_lock);
	list_del(&device->group_next);
	mutex_unlock(&device->group->device_lock);
}

int vfio_device_group_use_iommu(struct vfio_device *device)
{
	struct vfio_group *group = device->group;
	int ret = 0;

	lockdep_assert_held(&group->group_lock);

	if (WARN_ON(!group->container))
		return -EINVAL;

	ret = vfio_group_use_container(group);
	if (ret)
		return ret;
	vfio_device_container_register(device);
	return 0;
}

void vfio_device_group_unuse_iommu(struct vfio_device *device)
{
	struct vfio_group *group = device->group;

	lockdep_assert_held(&group->group_lock);

	if (WARN_ON(!group->container))
		return;

	vfio_device_container_unregister(device);
	vfio_group_unuse_container(group);
}

bool vfio_device_has_container(struct vfio_device *device)
{
	return device->group->container;
}

struct vfio_group *vfio_group_from_file(struct file *file)
{
	struct vfio_group *group = file->private_data;

	if (file->f_op != &vfio_group_fops)
		return NULL;
	return group;
}

/**
 * vfio_file_iommu_group - Return the struct iommu_group for the vfio group file
 * @file: VFIO group file
 *
 * The returned iommu_group is valid as long as a ref is held on the file. This
 * returns a reference on the group. This function is deprecated; only the SPAPR
 * path in kvm should call it.
 */
struct iommu_group *vfio_file_iommu_group(struct file *file)
{
	struct vfio_group *group = vfio_group_from_file(file);
	struct iommu_group *iommu_group = NULL;

	if (!IS_ENABLED(CONFIG_SPAPR_TCE_IOMMU))
		return NULL;

	if (!group)
		return NULL;

	mutex_lock(&group->group_lock);
	if (group->iommu_group) {
		iommu_group = group->iommu_group;
		iommu_group_ref_get(iommu_group);
	}
	mutex_unlock(&group->group_lock);
	return iommu_group;
}
EXPORT_SYMBOL_GPL(vfio_file_iommu_group);

/**
 * vfio_file_is_group - True if the file is a vfio group file
 * @file: VFIO group file
 */
bool vfio_file_is_group(struct file *file)
{
	return vfio_group_from_file(file);
}
EXPORT_SYMBOL_GPL(vfio_file_is_group);

bool vfio_group_enforced_coherent(struct vfio_group *group)
{
	struct vfio_device *device;
	bool ret = true;

	/*
	 * If the device does not have IOMMU_CAP_ENFORCE_CACHE_COHERENCY then
	 * any domain later attached to it will also not support it. If the cap
	 * is set then the iommu_domain eventually attached to the device/group
	 * must use a domain with enforce_cache_coherency().
	 */
	mutex_lock(&group->device_lock);
	list_for_each_entry(device, &group->device_list, group_next) {
		if (!device_iommu_capable(device->dev,
					  IOMMU_CAP_ENFORCE_CACHE_COHERENCY)) {
			ret = false;
			break;
		}
	}
	mutex_unlock(&group->device_lock);
	return ret;
}

void vfio_group_set_kvm(struct vfio_group *group, struct kvm *kvm)
{
	spin_lock(&group->kvm_ref_lock);
	group->kvm = kvm;
	spin_unlock(&group->kvm_ref_lock);
}

/**
 * vfio_file_has_dev - True if the VFIO file is a handle for device
 * @file: VFIO file to check
 * @device: Device that must be part of the file
 *
 * Returns true if given file has permission to manipulate the given device.
 */
bool vfio_file_has_dev(struct file *file, struct vfio_device *device)
{
	struct vfio_group *group = vfio_group_from_file(file);

	if (!group)
		return false;

	return group == device->group;
}
EXPORT_SYMBOL_GPL(vfio_file_has_dev);

static char *vfio_devnode(const struct device *dev, umode_t *mode)
{
	return kasprintf(GFP_KERNEL, "vfio/%s", dev_name(dev));
}

int __init vfio_group_init(void)
{
	int ret;

	ida_init(&vfio.group_ida);
	mutex_init(&vfio.group_lock);
	INIT_LIST_HEAD(&vfio.group_list);

	ret = vfio_container_init();
	if (ret)
		return ret;

	/* /dev/vfio/$GROUP */
	vfio.class = class_create("vfio");
	if (IS_ERR(vfio.class)) {
		ret = PTR_ERR(vfio.class);
		goto err_group_class;
	}

	vfio.class->devnode = vfio_devnode;

	ret = alloc_chrdev_region(&vfio.group_devt, 0, MINORMASK + 1, "vfio");
	if (ret)
		goto err_alloc_chrdev;

	return 0;

err_alloc_chrdev:
	class_destroy(vfio.class);
	vfio.class = NULL;
err_group_class:
	vfio_container_cleanup();
	return ret;
}

void vfio_group_cleanup(void)
{
	WARN_ON(!list_empty(&vfio.group_list));
	ida_destroy(&vfio.group_ida);
	unregister_chrdev_region(vfio.group_devt, MINORMASK + 1);
	class_destroy(vfio.class);
	vfio.class = NULL;
	vfio_container_cleanup();
}
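The functions above back the /dev/vfio/$GROUP character devices and their attachment to a container. As a point of reference only, here is a minimal, hedged userspace sketch of how such a group node is typically consumed through the legacy group/container interface; the group number (26), the PCI device address string, the helper name, and the omitted error handling are illustrative assumptions rather than anything taken from the code above.

#include <fcntl.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

/* Sketch only: open a container, attach one group, fetch one device fd. */
static int vfio_group_usage_sketch(void)
{
	struct vfio_group_status status = { .argsz = sizeof(status) };
	int container = open("/dev/vfio/vfio", O_RDWR);
	int group = open("/dev/vfio/26", O_RDWR);	/* hypothetical group */

	/* The group is only usable if every device in it is bound to vfio. */
	ioctl(group, VFIO_GROUP_GET_STATUS, &status);
	if (!(status.flags & VFIO_GROUP_FLAGS_VIABLE))
		return -1;

	/* Attach to the container, then select the Type1 IOMMU backend. */
	ioctl(group, VFIO_GROUP_SET_CONTAINER, &container);
	ioctl(container, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU);

	/* Device fds are handed out per device name within the group. */
	return ioctl(group, VFIO_GROUP_GET_DEVICE_FD, "0000:06:0d.0");
}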
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_GENERIC_BITOPS_BUILTIN___FFS_H_
#define _ASM_GENERIC_BITOPS_BUILTIN___FFS_H_

/**
 * __ffs - find first bit in word.
 * @word: The word to search
 *
 * Undefined if no bit exists, so code should check against 0 first.
 */
static __always_inline unsigned int __ffs(unsigned long word)
{
	return __builtin_ctzl(word);
}

#endif
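Because __ffs() is undefined for a zero word, callers are expected to test the word before calling it. A small hedged usage sketch follows; the wrapper name is hypothetical and not part of the header above.

/* Hedged usage sketch: guard the zero case before calling __ffs(). */
static inline unsigned int first_set_bit_or_zero(unsigned long mask)
{
	if (!mask)			/* __ffs(0) is undefined */
		return 0;
	return __ffs(mask);		/* e.g. __ffs(0x18UL) == 3 */
}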
11880 11881 11882 11883 11884 11885 11886 11887 11888 11889 11890 11891 11892 11893 11894 11895 11896 11897 11898 11899 11900 11901 11902 11903 11904 11905 11906 11907 11908 11909 11910 11911 11912 11913 11914 11915 11916 11917 11918 11919 11920 11921 11922 11923 11924 11925 11926 11927 11928 11929 11930 11931 11932 11933 11934 11935 11936 11937 11938 11939 11940 11941 11942 11943 11944 11945 11946 11947 11948 11949 11950 11951 11952 11953 11954 11955 11956 11957 11958 11959 11960 11961 11962 11963 11964 11965 11966 11967 11968 11969 11970 11971 11972 11973 11974 11975 11976 11977 11978 11979 11980 11981 11982 11983 11984 11985 11986 11987 11988 11989 11990 11991 11992 11993 11994 11995 11996 11997 11998 11999 12000 12001 12002 12003 12004 12005 12006 12007 12008 12009 12010 12011 12012 12013 12014 12015 12016 12017 12018 12019 12020 12021 12022 12023 12024 12025 12026 12027 12028 12029 12030 12031 12032 12033 12034 12035 12036 12037 12038 12039 12040 12041 12042 12043 12044 12045 12046 12047 12048 12049 12050 12051 12052 12053 12054 12055 12056 12057 12058 12059 12060 12061 12062 12063 12064 12065 12066 12067 12068 12069 12070 12071 12072 12073 12074 12075 12076 12077 12078 12079 12080 12081 12082 12083 12084 12085 12086 12087 12088 12089 12090 12091 12092 12093 12094 12095 12096 12097 12098 12099 12100 12101 12102 12103 12104 12105 12106 12107 12108 12109 12110 12111 12112 12113 12114 12115 12116 12117 12118 12119 12120 12121 12122 12123 12124 12125 12126 12127 12128 12129 12130 12131 12132 12133 12134 12135 12136 12137 12138 12139 12140 12141 12142 12143 12144 12145 12146 12147 12148 12149 12150 12151 12152 12153 12154 12155 12156 12157 12158 12159 12160 12161 12162 12163 12164 12165 12166 12167 12168 12169 12170 12171 12172 12173 12174 12175 12176 12177 12178 12179 12180 12181 12182 12183 12184 12185 12186 12187 12188 12189 12190 12191 12192 12193 12194 12195 12196 12197 12198 12199 12200 12201 12202 12203 12204 12205 12206 12207 12208 12209 12210 12211 12212 12213 12214 12215 12216 12217 12218 12219 12220 12221 12222 12223 12224 12225 12226 12227 12228 12229 12230 12231 12232 12233 12234 12235 12236 12237 12238 12239 12240 12241 12242 12243 12244 12245 12246 12247 12248 12249 12250 12251 12252 12253 12254 12255 12256 12257 12258 12259 12260 12261 12262 12263 12264 12265 12266 12267 12268 12269 12270 12271 12272 12273 12274 12275 12276 12277 12278 12279 12280 12281 12282 12283 12284 12285 12286 12287 12288 12289 12290 12291 12292 12293 12294 12295 12296 12297 12298 12299 12300 12301 12302 12303 12304 12305 12306 12307 12308 12309 12310 12311 12312 12313 12314 12315 12316 12317 12318 12319 12320 12321 12322 12323 12324 12325 12326 12327 12328 12329 12330 12331 12332 12333 12334 12335 12336 12337 12338 12339 12340 12341 12342 12343 12344 12345 12346 12347 12348 12349 12350 12351 12352 12353 12354 12355 12356 12357 12358 12359 12360 12361 12362 12363 12364 12365 12366 12367 12368 12369 12370 12371 12372 12373 12374 12375 12376 12377 12378 12379 12380 12381 12382 12383 12384 12385 12386 12387 12388 12389 12390 12391 12392 12393 12394 12395 12396 12397 12398 12399 12400 12401 12402 12403 12404 12405 12406 12407 12408 12409 12410 12411 12412 12413 12414 12415 12416 12417 12418 12419 12420 12421 12422 12423 12424 12425 12426 12427 12428 12429 12430 12431 12432 12433 12434 12435 12436 12437 12438 12439 12440 12441 12442 12443 12444 12445 12446 12447 12448 12449 12450 12451 12452 12453 12454 12455 12456 12457 12458 12459 12460 12461 12462 12463 12464 12465 12466 12467 12468 12469 12470 12471 
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * NET3 Protocol independent device support routines.
 *
 * Derived from the non IP parts of dev.c 1.0.19
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Mark Evans, <evansmp@uhura.aston.ac.uk>
 *
 * Additional Authors:
 *	Florian la Roche <rzsfl@rz.uni-sb.de>
 *	Alan Cox <gw4pts@gw4pts.ampr.org>
 *	David Hinds <dahinds@users.sourceforge.net>
 *	Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
 *	Adam Sulmicki <adam@cfar.umd.edu>
 *	Pekka Riikonen <priikone@poesidon.pspt.fi>
 *
 * Changes:
 *	D.J. Barrow	:	Fixed bug where dev->refcnt gets set
 *				to 2 if register_netdev gets called
 *				before net_dev_init & also removed a
 *				few lines of code in the process.
 *	Alan Cox	:	device private ioctl copies fields back.
 *	Alan Cox	:	Transmit queue code does relevant
 *				stunts to keep the queue safe.
 *	Alan Cox	:	Fixed double lock.
 *	Alan Cox	:	Fixed promisc NULL pointer trap
 *	????????	:	Support the full private ioctl range
 *	Alan Cox	:	Moved ioctl permission check into
 *				drivers
 *	Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI
 *	Alan Cox	:	100 backlog just doesn't cut it when
 *				you start doing multicast video 8)
 *	Alan Cox	:	Rewrote net_bh and list manager.
 *	Alan Cox	:	Fix ETH_P_ALL echoback lengths.
 *	Alan Cox	:	Took out transmit every packet pass
 *				Saved a few bytes in the ioctl handler
 *	Alan Cox	:	Network driver sets packet type before
 *				calling netif_rx. Saves a function
 *				call a packet.
 *	Alan Cox	:	Hashed net_bh()
 *	Richard Kooijman:	Timestamp fixes.
 *	Alan Cox	:	Wrong field in SIOCGIFDSTADDR
 *	Alan Cox	:	Device lock protection.
 *	Alan Cox	:	Fixed nasty side effect of device close
 *				changes.
 *	Rudi Cilibrasi	:	Pass the right thing to
 *				set_mac_address()
 *	Dave Miller	:	32bit quantity for the device lock to
 *				make it work out on a Sparc.
 *	Bjorn Ekwall	:	Added KERNELD hack.
 *	Alan Cox	:	Cleaned up the backlog initialise.
 *	Craig Metz	:	SIOCGIFCONF fix if space for under
 *				1 device.
 *	Thomas Bogendoerfer :	Return ENODEV for dev_open, if there
 *				is no device open function.
 *	Andi Kleen	:	Fix error reporting for SIOCGIFCONF
 *	Michael Chastain :	Fix signed/unsigned for SIOCGIFCONF
 *	Cyrus Durgin	:	Cleaned for KMOD
 *	Adam Sulmicki	:	Bug Fix : Network Device Unload
 *				A network device unload needs to purge
 *				the backlog queue.
* Paul Rusty Russell : SIOCSIFNAME * Pekka Riikonen : Netdev boot-time settings code * Andrew Morton : Make unregister_netdevice wait * indefinitely on dev->refcnt * J Hadi Salim : - Backlog queue sampling * - netif_rx() feedback */ #include <linux/uaccess.h> #include <linux/bitmap.h> #include <linux/capability.h> #include <linux/cpu.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/hash.h> #include <linux/slab.h> #include <linux/sched.h> #include <linux/sched/isolation.h> #include <linux/sched/mm.h> #include <linux/smpboot.h> #include <linux/mutex.h> #include <linux/rwsem.h> #include <linux/string.h> #include <linux/mm.h> #include <linux/socket.h> #include <linux/sockios.h> #include <linux/errno.h> #include <linux/interrupt.h> #include <linux/if_ether.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/ethtool.h> #include <linux/ethtool_netlink.h> #include <linux/skbuff.h> #include <linux/kthread.h> #include <linux/bpf.h> #include <linux/bpf_trace.h> #include <net/net_namespace.h> #include <net/sock.h> #include <net/busy_poll.h> #include <linux/rtnetlink.h> #include <linux/stat.h> #include <net/dsa.h> #include <net/dst.h> #include <net/dst_metadata.h> #include <net/gro.h> #include <net/netdev_queues.h> #include <net/pkt_sched.h> #include <net/pkt_cls.h> #include <net/checksum.h> #include <net/xfrm.h> #include <net/tcx.h> #include <linux/highmem.h> #include <linux/init.h> #include <linux/module.h> #include <linux/netpoll.h> #include <linux/rcupdate.h> #include <linux/delay.h> #include <net/iw_handler.h> #include <asm/current.h> #include <linux/audit.h> #include <linux/dmaengine.h> #include <linux/err.h> #include <linux/ctype.h> #include <linux/if_arp.h> #include <linux/if_vlan.h> #include <linux/ip.h> #include <net/ip.h> #include <net/mpls.h> #include <linux/ipv6.h> #include <linux/in.h> #include <linux/jhash.h> #include <linux/random.h> #include <trace/events/napi.h> #include <trace/events/net.h> #include <trace/events/skb.h> #include <trace/events/qdisc.h> #include <trace/events/xdp.h> #include <linux/inetdevice.h> #include <linux/cpu_rmap.h> #include <linux/static_key.h> #include <linux/hashtable.h> #include <linux/vmalloc.h> #include <linux/if_macvlan.h> #include <linux/errqueue.h> #include <linux/hrtimer.h> #include <linux/netfilter_netdev.h> #include <linux/crash_dump.h> #include <linux/sctp.h> #include <net/udp_tunnel.h> #include <linux/net_namespace.h> #include <linux/indirect_call_wrapper.h> #include <net/devlink.h> #include <linux/pm_runtime.h> #include <linux/prandom.h> #include <linux/once_lite.h> #include <net/netdev_rx_queue.h> #include <net/page_pool/types.h> #include <net/page_pool/helpers.h> #include <net/rps.h> #include <linux/phy_link_topology.h> #include "dev.h" #include "devmem.h" #include "net-sysfs.h" static DEFINE_SPINLOCK(ptype_lock); struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly; static int netif_rx_internal(struct sk_buff *skb); static int call_netdevice_notifiers_extack(unsigned long val, struct net_device *dev, struct netlink_ext_ack *extack); static DEFINE_MUTEX(ifalias_mutex); /* protects napi_hash addition/deletion and napi_gen_id */ static DEFINE_SPINLOCK(napi_hash_lock); static unsigned int napi_gen_id = NR_CPUS; static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8); static inline void dev_base_seq_inc(struct net *net) { unsigned int val = net->dev_base_seq + 1; WRITE_ONCE(net->dev_base_seq, val ?: 1); } static inline struct hlist_head *dev_name_hash(struct net *net, const char *name) { unsigned int 
hash = full_name_hash(net, name, strnlen(name, IFNAMSIZ)); return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)]; } static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex) { return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)]; } #ifndef CONFIG_PREEMPT_RT static DEFINE_STATIC_KEY_FALSE(use_backlog_threads_key); static int __init setup_backlog_napi_threads(char *arg) { static_branch_enable(&use_backlog_threads_key); return 0; } early_param("thread_backlog_napi", setup_backlog_napi_threads); static bool use_backlog_threads(void) { return static_branch_unlikely(&use_backlog_threads_key); } #else static bool use_backlog_threads(void) { return true; } #endif static inline void backlog_lock_irq_save(struct softnet_data *sd, unsigned long *flags) { if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads()) spin_lock_irqsave(&sd->input_pkt_queue.lock, *flags); else local_irq_save(*flags); } static inline void backlog_lock_irq_disable(struct softnet_data *sd) { if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads()) spin_lock_irq(&sd->input_pkt_queue.lock); else local_irq_disable(); } static inline void backlog_unlock_irq_restore(struct softnet_data *sd, unsigned long *flags) { if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads()) spin_unlock_irqrestore(&sd->input_pkt_queue.lock, *flags); else local_irq_restore(*flags); } static inline void backlog_unlock_irq_enable(struct softnet_data *sd) { if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads()) spin_unlock_irq(&sd->input_pkt_queue.lock); else local_irq_enable(); } static struct netdev_name_node *netdev_name_node_alloc(struct net_device *dev, const char *name) { struct netdev_name_node *name_node; name_node = kmalloc(sizeof(*name_node), GFP_KERNEL); if (!name_node) return NULL; INIT_HLIST_NODE(&name_node->hlist); name_node->dev = dev; name_node->name = name; return name_node; } static struct netdev_name_node * netdev_name_node_head_alloc(struct net_device *dev) { struct netdev_name_node *name_node; name_node = netdev_name_node_alloc(dev, dev->name); if (!name_node) return NULL; INIT_LIST_HEAD(&name_node->list); return name_node; } static void netdev_name_node_free(struct netdev_name_node *name_node) { kfree(name_node); } static void netdev_name_node_add(struct net *net, struct netdev_name_node *name_node) { hlist_add_head_rcu(&name_node->hlist, dev_name_hash(net, name_node->name)); } static void netdev_name_node_del(struct netdev_name_node *name_node) { hlist_del_rcu(&name_node->hlist); } static struct netdev_name_node *netdev_name_node_lookup(struct net *net, const char *name) { struct hlist_head *head = dev_name_hash(net, name); struct netdev_name_node *name_node; hlist_for_each_entry(name_node, head, hlist) if (!strcmp(name_node->name, name)) return name_node; return NULL; } static struct netdev_name_node *netdev_name_node_lookup_rcu(struct net *net, const char *name) { struct hlist_head *head = dev_name_hash(net, name); struct netdev_name_node *name_node; hlist_for_each_entry_rcu(name_node, head, hlist) if (!strcmp(name_node->name, name)) return name_node; return NULL; } bool netdev_name_in_use(struct net *net, const char *name) { return netdev_name_node_lookup(net, name); } EXPORT_SYMBOL(netdev_name_in_use); int netdev_name_node_alt_create(struct net_device *dev, const char *name) { struct netdev_name_node *name_node; struct net *net = dev_net(dev); name_node = netdev_name_node_lookup(net, name); if (name_node) return -EEXIST; name_node = netdev_name_node_alloc(dev, name); if (!name_node) return -ENOMEM; 
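	/*
	 * Illustrative aside (not part of the original source): the name-node
	 * hash above is what makes netdev_name_in_use() work. A hypothetical
	 * caller that wants to reserve a fixed name could check availability
	 * under RTNL roughly like this (example_name_is_free is made up):
	 *
	 *	static bool example_name_is_free(struct net *net, const char *name)
	 *	{
	 *		ASSERT_RTNL();
	 *		return !netdev_name_in_use(net, name);
	 *	}
	 *
	 * The lookup takes no reference and the answer is only stable while
	 * RTNL is held.
	 */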
netdev_name_node_add(net, name_node); /* The node that holds dev->name acts as a head of per-device list. */ list_add_tail_rcu(&name_node->list, &dev->name_node->list); return 0; } static void netdev_name_node_alt_free(struct rcu_head *head) { struct netdev_name_node *name_node = container_of(head, struct netdev_name_node, rcu); kfree(name_node->name); netdev_name_node_free(name_node); } static void __netdev_name_node_alt_destroy(struct netdev_name_node *name_node) { netdev_name_node_del(name_node); list_del(&name_node->list); call_rcu(&name_node->rcu, netdev_name_node_alt_free); } int netdev_name_node_alt_destroy(struct net_device *dev, const char *name) { struct netdev_name_node *name_node; struct net *net = dev_net(dev); name_node = netdev_name_node_lookup(net, name); if (!name_node) return -ENOENT; /* lookup might have found our primary name or a name belonging * to another device. */ if (name_node == dev->name_node || name_node->dev != dev) return -EINVAL; __netdev_name_node_alt_destroy(name_node); return 0; } static void netdev_name_node_alt_flush(struct net_device *dev) { struct netdev_name_node *name_node, *tmp; list_for_each_entry_safe(name_node, tmp, &dev->name_node->list, list) { list_del(&name_node->list); netdev_name_node_alt_free(&name_node->rcu); } } /* Device list insertion */ static void list_netdevice(struct net_device *dev) { struct netdev_name_node *name_node; struct net *net = dev_net(dev); ASSERT_RTNL(); list_add_tail_rcu(&dev->dev_list, &net->dev_base_head); netdev_name_node_add(net, dev->name_node); hlist_add_head_rcu(&dev->index_hlist, dev_index_hash(net, dev->ifindex)); netdev_for_each_altname(dev, name_node) netdev_name_node_add(net, name_node); /* We reserved the ifindex, this can't fail */ WARN_ON(xa_store(&net->dev_by_index, dev->ifindex, dev, GFP_KERNEL)); dev_base_seq_inc(net); } /* Device list removal * caller must respect a RCU grace period before freeing/reusing dev */ static void unlist_netdevice(struct net_device *dev) { struct netdev_name_node *name_node; struct net *net = dev_net(dev); ASSERT_RTNL(); xa_erase(&net->dev_by_index, dev->ifindex); netdev_for_each_altname(dev, name_node) netdev_name_node_del(name_node); /* Unlink dev from the device chain */ list_del_rcu(&dev->dev_list); netdev_name_node_del(dev->name_node); hlist_del_rcu(&dev->index_hlist); dev_base_seq_inc(dev_net(dev)); } /* * Our notifier list */ static RAW_NOTIFIER_HEAD(netdev_chain); /* * Device drivers call our routines to queue packets here. We empty the * queue in the local softnet handler. */ DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data) = { .process_queue_bh_lock = INIT_LOCAL_LOCK(process_queue_bh_lock), }; EXPORT_PER_CPU_SYMBOL(softnet_data); /* Page_pool has a lockless array/stack to alloc/recycle pages. * PP consumers must pay attention to run APIs in the appropriate context * (e.g. NAPI context). 
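 *
 * Illustrative sketch (not in the original source): a consumer running in
 * NAPI/softirq context could borrow the per-CPU pool roughly like this,
 * assuming the helpers from <net/page_pool/helpers.h>:
 *
 *	struct page_pool *pp = this_cpu_read(system_page_pool);
 *	struct page *page = page_pool_dev_alloc_pages(pp);
 *
 *	if (page)
 *		page_pool_put_full_page(pp, page, false);
 *
 * Both the allocation and the recycle must happen in the appropriate
 * (BH-disabled) context, as noted above.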
*/ DEFINE_PER_CPU(struct page_pool *, system_page_pool); #ifdef CONFIG_LOCKDEP /* * register_netdevice() inits txq->_xmit_lock and sets lockdep class * according to dev->type */ static const unsigned short netdev_lock_type[] = { ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25, ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET, ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM, ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP, ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD, ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25, ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP, ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD, ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI, ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE, ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET, ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL, ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM, ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE, ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE}; static const char *const netdev_lock_name[] = { "_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25", "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET", "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM", "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP", "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD", "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25", "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP", "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD", "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI", "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE", "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET", "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL", "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM", "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE", "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"}; static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)]; static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)]; static inline unsigned short netdev_lock_pos(unsigned short dev_type) { int i; for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++) if (netdev_lock_type[i] == dev_type) return i; /* the last key is used by default */ return ARRAY_SIZE(netdev_lock_type) - 1; } static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock, unsigned short dev_type) { int i; i = netdev_lock_pos(dev_type); lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i], netdev_lock_name[i]); } static inline void netdev_set_addr_lockdep_class(struct net_device *dev) { int i; i = netdev_lock_pos(dev->type); lockdep_set_class_and_name(&dev->addr_list_lock, &netdev_addr_lock_key[i], netdev_lock_name[i]); } #else static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock, unsigned short dev_type) { } static inline void netdev_set_addr_lockdep_class(struct net_device *dev) { } #endif /******************************************************************************* * * Protocol management and registration routines * *******************************************************************************/ /* * Add a protocol ID to the list. Now that the input handler is * smarter we can dispense with all the messy stuff that used to be * here. * * BEWARE!!! 
Protocol handlers, mangling input packets, * MUST BE last in hash buckets and checking protocol handlers * MUST start from promiscuous ptype_all chain in net_bh. * It is true now, do not change it. * Explanation follows: if protocol handler, mangling packet, will * be the first on list, it is not able to sense, that packet * is cloned and should be copied-on-write, so that it will * change it and subsequent readers will get broken packet. * --ANK (980803) */ static inline struct list_head *ptype_head(const struct packet_type *pt) { if (pt->type == htons(ETH_P_ALL)) return pt->dev ? &pt->dev->ptype_all : &net_hotdata.ptype_all; else return pt->dev ? &pt->dev->ptype_specific : &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK]; } /** * dev_add_pack - add packet handler * @pt: packet type declaration * * Add a protocol handler to the networking stack. The passed &packet_type * is linked into kernel lists and may not be freed until it has been * removed from the kernel lists. * * This call does not sleep therefore it can not * guarantee all CPU's that are in middle of receiving packets * will see the new packet type (until the next received packet). */ void dev_add_pack(struct packet_type *pt) { struct list_head *head = ptype_head(pt); spin_lock(&ptype_lock); list_add_rcu(&pt->list, head); spin_unlock(&ptype_lock); } EXPORT_SYMBOL(dev_add_pack); /** * __dev_remove_pack - remove packet handler * @pt: packet type declaration * * Remove a protocol handler that was previously added to the kernel * protocol handlers by dev_add_pack(). The passed &packet_type is removed * from the kernel lists and can be freed or reused once this function * returns. * * The packet type might still be in use by receivers * and must not be freed until after all the CPU's have gone * through a quiescent state. */ void __dev_remove_pack(struct packet_type *pt) { struct list_head *head = ptype_head(pt); struct packet_type *pt1; spin_lock(&ptype_lock); list_for_each_entry(pt1, head, list) { if (pt == pt1) { list_del_rcu(&pt->list); goto out; } } pr_warn("dev_remove_pack: %p not found\n", pt); out: spin_unlock(&ptype_lock); } EXPORT_SYMBOL(__dev_remove_pack); /** * dev_remove_pack - remove packet handler * @pt: packet type declaration * * Remove a protocol handler that was previously added to the kernel * protocol handlers by dev_add_pack(). The passed &packet_type is removed * from the kernel lists and can be freed or reused once this function * returns. * * This call sleeps to guarantee that no CPU is looking at the packet * type after return. */ void dev_remove_pack(struct packet_type *pt) { __dev_remove_pack(pt); synchronize_net(); } EXPORT_SYMBOL(dev_remove_pack); /******************************************************************************* * * Device Interface Subroutines * *******************************************************************************/ /** * dev_get_iflink - get 'iflink' value of a interface * @dev: targeted interface * * Indicates the ifindex the interface is linked to. * Physical interfaces have the same 'ifindex' and 'iflink' values. */ int dev_get_iflink(const struct net_device *dev) { if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink) return dev->netdev_ops->ndo_get_iflink(dev); return READ_ONCE(dev->ifindex); } EXPORT_SYMBOL(dev_get_iflink); /** * dev_fill_metadata_dst - Retrieve tunnel egress information. * @dev: targeted interface * @skb: The packet. * * For better visibility of tunnel traffic OVS needs to retrieve * egress tunnel information for a packet. 
Following API allows * user to get this info. */ int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb) { struct ip_tunnel_info *info; if (!dev->netdev_ops || !dev->netdev_ops->ndo_fill_metadata_dst) return -EINVAL; info = skb_tunnel_info_unclone(skb); if (!info) return -ENOMEM; if (unlikely(!(info->mode & IP_TUNNEL_INFO_TX))) return -EINVAL; return dev->netdev_ops->ndo_fill_metadata_dst(dev, skb); } EXPORT_SYMBOL_GPL(dev_fill_metadata_dst); static struct net_device_path *dev_fwd_path(struct net_device_path_stack *stack) { int k = stack->num_paths++; if (WARN_ON_ONCE(k >= NET_DEVICE_PATH_STACK_MAX)) return NULL; return &stack->path[k]; } int dev_fill_forward_path(const struct net_device *dev, const u8 *daddr, struct net_device_path_stack *stack) { const struct net_device *last_dev; struct net_device_path_ctx ctx = { .dev = dev, }; struct net_device_path *path; int ret = 0; memcpy(ctx.daddr, daddr, sizeof(ctx.daddr)); stack->num_paths = 0; while (ctx.dev && ctx.dev->netdev_ops->ndo_fill_forward_path) { last_dev = ctx.dev; path = dev_fwd_path(stack); if (!path) return -1; memset(path, 0, sizeof(struct net_device_path)); ret = ctx.dev->netdev_ops->ndo_fill_forward_path(&ctx, path); if (ret < 0) return -1; if (WARN_ON_ONCE(last_dev == ctx.dev)) return -1; } if (!ctx.dev) return ret; path = dev_fwd_path(stack); if (!path) return -1; path->type = DEV_PATH_ETHERNET; path->dev = ctx.dev; return ret; } EXPORT_SYMBOL_GPL(dev_fill_forward_path); /* must be called under rcu_read_lock(), as we dont take a reference */ static struct napi_struct *napi_by_id(unsigned int napi_id) { unsigned int hash = napi_id % HASH_SIZE(napi_hash); struct napi_struct *napi; hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node) if (napi->napi_id == napi_id) return napi; return NULL; } /* must be called under rcu_read_lock(), as we dont take a reference */ static struct napi_struct * netdev_napi_by_id(struct net *net, unsigned int napi_id) { struct napi_struct *napi; napi = napi_by_id(napi_id); if (!napi) return NULL; if (WARN_ON_ONCE(!napi->dev)) return NULL; if (!net_eq(net, dev_net(napi->dev))) return NULL; return napi; } /** * netdev_napi_by_id_lock() - find a device by NAPI ID and lock it * @net: the applicable net namespace * @napi_id: ID of a NAPI of a target device * * Find a NAPI instance with @napi_id. Lock its device. * The device must be in %NETREG_REGISTERED state for lookup to succeed. * netdev_unlock() must be called to release it. * * Return: pointer to NAPI, its device with lock held, NULL if not found. */ struct napi_struct * netdev_napi_by_id_lock(struct net *net, unsigned int napi_id) { struct napi_struct *napi; struct net_device *dev; rcu_read_lock(); napi = netdev_napi_by_id(net, napi_id); if (!napi || READ_ONCE(napi->dev->reg_state) != NETREG_REGISTERED) { rcu_read_unlock(); return NULL; } dev = napi->dev; dev_hold(dev); rcu_read_unlock(); dev = __netdev_put_lock(dev); if (!dev) return NULL; rcu_read_lock(); napi = netdev_napi_by_id(net, napi_id); if (napi && napi->dev != dev) napi = NULL; rcu_read_unlock(); if (!napi) netdev_unlock(dev); return napi; } /** * __dev_get_by_name - find a device by its name * @net: the applicable net namespace * @name: name to find * * Find an interface by name. Must be called under RTNL semaphore. * If the name is found a pointer to the device is returned. * If the name is not found then %NULL is returned. The * reference counters are not incremented so the caller must be * careful with locks. 
*/ struct net_device *__dev_get_by_name(struct net *net, const char *name) { struct netdev_name_node *node_name; node_name = netdev_name_node_lookup(net, name); return node_name ? node_name->dev : NULL; } EXPORT_SYMBOL(__dev_get_by_name); /** * dev_get_by_name_rcu - find a device by its name * @net: the applicable net namespace * @name: name to find * * Find an interface by name. * If the name is found a pointer to the device is returned. * If the name is not found then %NULL is returned. * The reference counters are not incremented so the caller must be * careful with locks. The caller must hold RCU lock. */ struct net_device *dev_get_by_name_rcu(struct net *net, const char *name) { struct netdev_name_node *node_name; node_name = netdev_name_node_lookup_rcu(net, name); return node_name ? node_name->dev : NULL; } EXPORT_SYMBOL(dev_get_by_name_rcu); /* Deprecated for new users, call netdev_get_by_name() instead */ struct net_device *dev_get_by_name(struct net *net, const char *name) { struct net_device *dev; rcu_read_lock(); dev = dev_get_by_name_rcu(net, name); dev_hold(dev); rcu_read_unlock(); return dev; } EXPORT_SYMBOL(dev_get_by_name); /** * netdev_get_by_name() - find a device by its name * @net: the applicable net namespace * @name: name to find * @tracker: tracking object for the acquired reference * @gfp: allocation flags for the tracker * * Find an interface by name. This can be called from any * context and does its own locking. The returned handle has * the usage count incremented and the caller must use netdev_put() to * release it when it is no longer needed. %NULL is returned if no * matching device is found. */ struct net_device *netdev_get_by_name(struct net *net, const char *name, netdevice_tracker *tracker, gfp_t gfp) { struct net_device *dev; dev = dev_get_by_name(net, name); if (dev) netdev_tracker_alloc(dev, tracker, gfp); return dev; } EXPORT_SYMBOL(netdev_get_by_name); /** * __dev_get_by_index - find a device by its ifindex * @net: the applicable net namespace * @ifindex: index of device * * Search for an interface by index. Returns %NULL if the device * is not found or a pointer to the device. The device has not * had its reference counter increased so the caller must be careful * about locking. The caller must hold the RTNL semaphore. */ struct net_device *__dev_get_by_index(struct net *net, int ifindex) { struct net_device *dev; struct hlist_head *head = dev_index_hash(net, ifindex); hlist_for_each_entry(dev, head, index_hlist) if (dev->ifindex == ifindex) return dev; return NULL; } EXPORT_SYMBOL(__dev_get_by_index); /** * dev_get_by_index_rcu - find a device by its ifindex * @net: the applicable net namespace * @ifindex: index of device * * Search for an interface by index. Returns %NULL if the device * is not found or a pointer to the device. The device has not * had its reference counter increased so the caller must be careful * about locking. The caller must hold RCU lock. 
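 *
 * Illustrative usage (not in the original source):
 *
 *	rcu_read_lock();
 *	dev = dev_get_by_index_rcu(net, ifindex);
 *	if (dev)
 *		... use dev; no reference is held ...
 *	rcu_read_unlock();
 *
 * If the device must outlive the RCU section, take a reference (e.g. with
 * dev_hold()) before dropping the lock, or use netdev_get_by_index() instead.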
*/ struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex) { struct net_device *dev; struct hlist_head *head = dev_index_hash(net, ifindex); hlist_for_each_entry_rcu(dev, head, index_hlist) if (dev->ifindex == ifindex) return dev; return NULL; } EXPORT_SYMBOL(dev_get_by_index_rcu); /* Deprecated for new users, call netdev_get_by_index() instead */ struct net_device *dev_get_by_index(struct net *net, int ifindex) { struct net_device *dev; rcu_read_lock(); dev = dev_get_by_index_rcu(net, ifindex); dev_hold(dev); rcu_read_unlock(); return dev; } EXPORT_SYMBOL(dev_get_by_index); /** * netdev_get_by_index() - find a device by its ifindex * @net: the applicable net namespace * @ifindex: index of device * @tracker: tracking object for the acquired reference * @gfp: allocation flags for the tracker * * Search for an interface by index. Returns NULL if the device * is not found or a pointer to the device. The device returned has * had a reference added and the pointer is safe until the user calls * netdev_put() to indicate they have finished with it. */ struct net_device *netdev_get_by_index(struct net *net, int ifindex, netdevice_tracker *tracker, gfp_t gfp) { struct net_device *dev; dev = dev_get_by_index(net, ifindex); if (dev) netdev_tracker_alloc(dev, tracker, gfp); return dev; } EXPORT_SYMBOL(netdev_get_by_index); /** * dev_get_by_napi_id - find a device by napi_id * @napi_id: ID of the NAPI struct * * Search for an interface by NAPI ID. Returns %NULL if the device * is not found or a pointer to the device. The device has not had * its reference counter increased so the caller must be careful * about locking. The caller must hold RCU lock. */ struct net_device *dev_get_by_napi_id(unsigned int napi_id) { struct napi_struct *napi; WARN_ON_ONCE(!rcu_read_lock_held()); if (napi_id < MIN_NAPI_ID) return NULL; napi = napi_by_id(napi_id); return napi ? napi->dev : NULL; } /* Release the held reference on the net_device, and if the net_device * is still registered try to lock the instance lock. If device is being * unregistered NULL will be returned (but the reference has been released, * either way!) * * This helper is intended for locking net_device after it has been looked up * using a lockless lookup helper. Lock prevents the instance from going away. */ struct net_device *__netdev_put_lock(struct net_device *dev) { netdev_lock(dev); if (dev->reg_state > NETREG_REGISTERED) { netdev_unlock(dev); dev_put(dev); return NULL; } dev_put(dev); return dev; } /** * netdev_get_by_index_lock() - find a device by its ifindex * @net: the applicable net namespace * @ifindex: index of device * * Search for an interface by index. If a valid device * with @ifindex is found it will be returned with netdev->lock held. * netdev_unlock() must be called to release it. * * Return: pointer to a device with lock held, NULL if not found. 
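 *
 * Illustrative usage (not in the original source):
 *
 *	dev = netdev_get_by_index_lock(net, ifindex);
 *	if (!dev)
 *		return -ENODEV;
 *	... the device cannot complete unregistration while the lock is held ...
 *	netdev_unlock(dev);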
*/ struct net_device *netdev_get_by_index_lock(struct net *net, int ifindex) { struct net_device *dev; dev = dev_get_by_index(net, ifindex); if (!dev) return NULL; return __netdev_put_lock(dev); } struct net_device * netdev_xa_find_lock(struct net *net, struct net_device *dev, unsigned long *index) { if (dev) netdev_unlock(dev); do { rcu_read_lock(); dev = xa_find(&net->dev_by_index, index, ULONG_MAX, XA_PRESENT); if (!dev) { rcu_read_unlock(); return NULL; } dev_hold(dev); rcu_read_unlock(); dev = __netdev_put_lock(dev); if (dev) return dev; (*index)++; } while (true); } static DEFINE_SEQLOCK(netdev_rename_lock); void netdev_copy_name(struct net_device *dev, char *name) { unsigned int seq; do { seq = read_seqbegin(&netdev_rename_lock); strscpy(name, dev->name, IFNAMSIZ); } while (read_seqretry(&netdev_rename_lock, seq)); } /** * netdev_get_name - get a netdevice name, knowing its ifindex. * @net: network namespace * @name: a pointer to the buffer where the name will be stored. * @ifindex: the ifindex of the interface to get the name from. */ int netdev_get_name(struct net *net, char *name, int ifindex) { struct net_device *dev; int ret; rcu_read_lock(); dev = dev_get_by_index_rcu(net, ifindex); if (!dev) { ret = -ENODEV; goto out; } netdev_copy_name(dev, name); ret = 0; out: rcu_read_unlock(); return ret; } /** * dev_getbyhwaddr_rcu - find a device by its hardware address * @net: the applicable net namespace * @type: media type of device * @ha: hardware address * * Search for an interface by MAC address. Returns NULL if the device * is not found or a pointer to the device. * The caller must hold RCU or RTNL. * The returned device has not had its ref count increased * and the caller must therefore be careful about locking * */ struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type, const char *ha) { struct net_device *dev; for_each_netdev_rcu(net, dev) if (dev->type == type && !memcmp(dev->dev_addr, ha, dev->addr_len)) return dev; return NULL; } EXPORT_SYMBOL(dev_getbyhwaddr_rcu); struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type) { struct net_device *dev, *ret = NULL; rcu_read_lock(); for_each_netdev_rcu(net, dev) if (dev->type == type) { dev_hold(dev); ret = dev; break; } rcu_read_unlock(); return ret; } EXPORT_SYMBOL(dev_getfirstbyhwtype); /** * __dev_get_by_flags - find any device with given flags * @net: the applicable net namespace * @if_flags: IFF_* values * @mask: bitmask of bits in if_flags to check * * Search for any interface with the given flags. Returns NULL if a device * is not found or a pointer to the device. Must be called inside * rtnl_lock(), and result refcount is unchanged. */ struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags, unsigned short mask) { struct net_device *dev, *ret; ASSERT_RTNL(); ret = NULL; for_each_netdev(net, dev) { if (((dev->flags ^ if_flags) & mask) == 0) { ret = dev; break; } } return ret; } EXPORT_SYMBOL(__dev_get_by_flags); /** * dev_valid_name - check if name is okay for network device * @name: name string * * Network device names need to be valid file names to * allow sysfs to work. We also disallow any kind of * whitespace. 
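 *
 * For example (illustrative, not in the original source), "eth0" and
 * "wan-uplink" are accepted, while "", ".", "..", "a/b", "a:b", names
 * containing whitespace and names of IFNAMSIZ characters or more are all
 * rejected:
 *
 *	if (!dev_valid_name(newname))
 *		return -EINVAL;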
*/ bool dev_valid_name(const char *name) { if (*name == '\0') return false; if (strnlen(name, IFNAMSIZ) == IFNAMSIZ) return false; if (!strcmp(name, ".") || !strcmp(name, "..")) return false; while (*name) { if (*name == '/' || *name == ':' || isspace(*name)) return false; name++; } return true; } EXPORT_SYMBOL(dev_valid_name); /** * __dev_alloc_name - allocate a name for a device * @net: network namespace to allocate the device name in * @name: name format string * @res: result name string * * Passed a format string - eg "lt%d" it will try and find a suitable * id. It scans list of devices to build up a free map, then chooses * the first empty slot. The caller must hold the dev_base or rtnl lock * while allocating the name and adding the device in order to avoid * duplicates. * Limited to bits_per_byte * page size devices (ie 32K on most platforms). * Returns the number of the unit assigned or a negative errno code. */ static int __dev_alloc_name(struct net *net, const char *name, char *res) { int i = 0; const char *p; const int max_netdevices = 8*PAGE_SIZE; unsigned long *inuse; struct net_device *d; char buf[IFNAMSIZ]; /* Verify the string as this thing may have come from the user. * There must be one "%d" and no other "%" characters. */ p = strchr(name, '%'); if (!p || p[1] != 'd' || strchr(p + 2, '%')) return -EINVAL; /* Use one page as a bit array of possible slots */ inuse = bitmap_zalloc(max_netdevices, GFP_ATOMIC); if (!inuse) return -ENOMEM; for_each_netdev(net, d) { struct netdev_name_node *name_node; netdev_for_each_altname(d, name_node) { if (!sscanf(name_node->name, name, &i)) continue; if (i < 0 || i >= max_netdevices) continue; /* avoid cases where sscanf is not exact inverse of printf */ snprintf(buf, IFNAMSIZ, name, i); if (!strncmp(buf, name_node->name, IFNAMSIZ)) __set_bit(i, inuse); } if (!sscanf(d->name, name, &i)) continue; if (i < 0 || i >= max_netdevices) continue; /* avoid cases where sscanf is not exact inverse of printf */ snprintf(buf, IFNAMSIZ, name, i); if (!strncmp(buf, d->name, IFNAMSIZ)) __set_bit(i, inuse); } i = find_first_zero_bit(inuse, max_netdevices); bitmap_free(inuse); if (i == max_netdevices) return -ENFILE; /* 'res' and 'name' could overlap, use 'buf' as an intermediate buffer */ strscpy(buf, name, IFNAMSIZ); snprintf(res, IFNAMSIZ, buf, i); return i; } /* Returns negative errno or allocated unit id (see __dev_alloc_name()) */ static int dev_prep_valid_name(struct net *net, struct net_device *dev, const char *want_name, char *out_name, int dup_errno) { if (!dev_valid_name(want_name)) return -EINVAL; if (strchr(want_name, '%')) return __dev_alloc_name(net, want_name, out_name); if (netdev_name_in_use(net, want_name)) return -dup_errno; if (out_name != want_name) strscpy(out_name, want_name, IFNAMSIZ); return 0; } /** * dev_alloc_name - allocate a name for a device * @dev: device * @name: name format string * * Passed a format string - eg "lt%d" it will try and find a suitable * id. It scans list of devices to build up a free map, then chooses * the first empty slot. The caller must hold the dev_base or rtnl lock * while allocating the name and adding the device in order to avoid * duplicates. * Limited to bits_per_byte * page size devices (ie 32K on most platforms). * Returns the number of the unit assigned or a negative errno code. 
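 *
 * Illustrative usage (not in the original source): a driver that registers
 * devices named "foo0", "foo1", ... could do
 *
 *	err = dev_alloc_name(dev, "foo%d");
 *	if (err < 0)
 *		return err;
 *
 * On success dev->name holds e.g. "foo0" and err is the assigned unit number.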
*/ int dev_alloc_name(struct net_device *dev, const char *name) { return dev_prep_valid_name(dev_net(dev), dev, name, dev->name, ENFILE); } EXPORT_SYMBOL(dev_alloc_name); static int dev_get_valid_name(struct net *net, struct net_device *dev, const char *name) { int ret; ret = dev_prep_valid_name(net, dev, name, dev->name, EEXIST); return ret < 0 ? ret : 0; } /** * dev_change_name - change name of a device * @dev: device * @newname: name (or format string) must be at least IFNAMSIZ * * Change name of a device, can pass format strings "eth%d". * for wildcarding. */ int dev_change_name(struct net_device *dev, const char *newname) { struct net *net = dev_net(dev); unsigned char old_assign_type; char oldname[IFNAMSIZ]; int err = 0; int ret; ASSERT_RTNL_NET(net); if (!strncmp(newname, dev->name, IFNAMSIZ)) return 0; memcpy(oldname, dev->name, IFNAMSIZ); write_seqlock_bh(&netdev_rename_lock); err = dev_get_valid_name(net, dev, newname); write_sequnlock_bh(&netdev_rename_lock); if (err < 0) return err; if (oldname[0] && !strchr(oldname, '%')) netdev_info(dev, "renamed from %s%s\n", oldname, dev->flags & IFF_UP ? " (while UP)" : ""); old_assign_type = dev->name_assign_type; WRITE_ONCE(dev->name_assign_type, NET_NAME_RENAMED); rollback: ret = device_rename(&dev->dev, dev->name); if (ret) { write_seqlock_bh(&netdev_rename_lock); memcpy(dev->name, oldname, IFNAMSIZ); write_sequnlock_bh(&netdev_rename_lock); WRITE_ONCE(dev->name_assign_type, old_assign_type); return ret; } netdev_adjacent_rename_links(dev, oldname); netdev_name_node_del(dev->name_node); synchronize_net(); netdev_name_node_add(net, dev->name_node); ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev); ret = notifier_to_errno(ret); if (ret) { /* err >= 0 after dev_alloc_name() or stores the first errno */ if (err >= 0) { err = ret; write_seqlock_bh(&netdev_rename_lock); memcpy(dev->name, oldname, IFNAMSIZ); write_sequnlock_bh(&netdev_rename_lock); memcpy(oldname, newname, IFNAMSIZ); WRITE_ONCE(dev->name_assign_type, old_assign_type); old_assign_type = NET_NAME_RENAMED; goto rollback; } else { netdev_err(dev, "name change rollback failed: %d\n", ret); } } return err; } /** * dev_set_alias - change ifalias of a device * @dev: device * @alias: name up to IFALIASZ * @len: limit of bytes to copy from info * * Set ifalias for a device, */ int dev_set_alias(struct net_device *dev, const char *alias, size_t len) { struct dev_ifalias *new_alias = NULL; if (len >= IFALIASZ) return -EINVAL; if (len) { new_alias = kmalloc(sizeof(*new_alias) + len + 1, GFP_KERNEL); if (!new_alias) return -ENOMEM; memcpy(new_alias->ifalias, alias, len); new_alias->ifalias[len] = 0; } mutex_lock(&ifalias_mutex); new_alias = rcu_replace_pointer(dev->ifalias, new_alias, mutex_is_locked(&ifalias_mutex)); mutex_unlock(&ifalias_mutex); if (new_alias) kfree_rcu(new_alias, rcuhead); return len; } EXPORT_SYMBOL(dev_set_alias); /** * dev_get_alias - get ifalias of a device * @dev: device * @name: buffer to store name of ifalias * @len: size of buffer * * get ifalias for a device. Caller must make sure dev cannot go * away, e.g. rcu read lock or own a reference count to device. 
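 *
 * Illustrative usage (not in the original source), paired with dev_set_alias():
 *
 *	char buf[IFALIASZ];
 *
 *	rcu_read_lock();
 *	if (dev_get_alias(dev, buf, sizeof(buf)))
 *		pr_info("%s alias: %s\n", dev->name, buf);
 *	rcu_read_unlock();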
*/ int dev_get_alias(const struct net_device *dev, char *name, size_t len) { const struct dev_ifalias *alias; int ret = 0; rcu_read_lock(); alias = rcu_dereference(dev->ifalias); if (alias) ret = snprintf(name, len, "%s", alias->ifalias); rcu_read_unlock(); return ret; } /** * netdev_features_change - device changes features * @dev: device to cause notification * * Called to indicate a device has changed features. */ void netdev_features_change(struct net_device *dev) { call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev); } EXPORT_SYMBOL(netdev_features_change); /** * netdev_state_change - device changes state * @dev: device to cause notification * * Called to indicate a device has changed state. This function calls * the notifier chains for netdev_chain and sends a NEWLINK message * to the routing socket. */ void netdev_state_change(struct net_device *dev) { if (dev->flags & IFF_UP) { struct netdev_notifier_change_info change_info = { .info.dev = dev, }; call_netdevice_notifiers_info(NETDEV_CHANGE, &change_info.info); rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL, 0, NULL); } } EXPORT_SYMBOL(netdev_state_change); /** * __netdev_notify_peers - notify network peers about existence of @dev, * to be called when rtnl lock is already held. * @dev: network device * * Generate traffic such that interested network peers are aware of * @dev, such as by generating a gratuitous ARP. This may be used when * a device wants to inform the rest of the network about some sort of * reconfiguration such as a failover event or virtual machine * migration. */ void __netdev_notify_peers(struct net_device *dev) { ASSERT_RTNL(); call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev); call_netdevice_notifiers(NETDEV_RESEND_IGMP, dev); } EXPORT_SYMBOL(__netdev_notify_peers); /** * netdev_notify_peers - notify network peers about existence of @dev * @dev: network device * * Generate traffic such that interested network peers are aware of * @dev, such as by generating a gratuitous ARP. This may be used when * a device wants to inform the rest of the network about some sort of * reconfiguration such as a failover event or virtual machine * migration. */ void netdev_notify_peers(struct net_device *dev) { rtnl_lock(); __netdev_notify_peers(dev); rtnl_unlock(); } EXPORT_SYMBOL(netdev_notify_peers); static int napi_threaded_poll(void *data); static int napi_kthread_create(struct napi_struct *n) { int err = 0; /* Create and wake up the kthread once to put it in * TASK_INTERRUPTIBLE mode to avoid the blocked task * warning and work with loadavg. */ n->thread = kthread_run(napi_threaded_poll, n, "napi/%s-%d", n->dev->name, n->napi_id); if (IS_ERR(n->thread)) { err = PTR_ERR(n->thread); pr_err("kthread_run failed with err %d\n", err); n->thread = NULL; } return err; } static int __dev_open(struct net_device *dev, struct netlink_ext_ack *extack) { const struct net_device_ops *ops = dev->netdev_ops; int ret; ASSERT_RTNL(); dev_addr_check(dev); if (!netif_device_present(dev)) { /* may be detached because parent is runtime-suspended */ if (dev->dev.parent) pm_runtime_resume(dev->dev.parent); if (!netif_device_present(dev)) return -ENODEV; } /* Block netpoll from trying to do any rx path servicing. 
* If we don't do this there is a chance ndo_poll_controller * or ndo_poll may be running while we open the device */ netpoll_poll_disable(dev); ret = call_netdevice_notifiers_extack(NETDEV_PRE_UP, dev, extack); ret = notifier_to_errno(ret); if (ret) return ret; set_bit(__LINK_STATE_START, &dev->state); if (ops->ndo_validate_addr) ret = ops->ndo_validate_addr(dev); if (!ret && ops->ndo_open) ret = ops->ndo_open(dev); netpoll_poll_enable(dev); if (ret) clear_bit(__LINK_STATE_START, &dev->state); else { netif_set_up(dev, true); dev_set_rx_mode(dev); dev_activate(dev); add_device_randomness(dev->dev_addr, dev->addr_len); } return ret; } /** * dev_open - prepare an interface for use. * @dev: device to open * @extack: netlink extended ack * * Takes a device from down to up state. The device's private open * function is invoked and then the multicast lists are loaded. Finally * the device is moved into the up state and a %NETDEV_UP message is * sent to the netdev notifier chain. * * Calling this function on an active interface is a nop. On a failure * a negative errno code is returned. */ int dev_open(struct net_device *dev, struct netlink_ext_ack *extack) { int ret; if (dev->flags & IFF_UP) return 0; ret = __dev_open(dev, extack); if (ret < 0) return ret; rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP | IFF_RUNNING, GFP_KERNEL, 0, NULL); call_netdevice_notifiers(NETDEV_UP, dev); return ret; } EXPORT_SYMBOL(dev_open); static void __dev_close_many(struct list_head *head) { struct net_device *dev; ASSERT_RTNL(); might_sleep(); list_for_each_entry(dev, head, close_list) { /* Temporarily disable netpoll until the interface is down */ netpoll_poll_disable(dev); call_netdevice_notifiers(NETDEV_GOING_DOWN, dev); clear_bit(__LINK_STATE_START, &dev->state); /* Synchronize to scheduled poll. We cannot touch poll list, it * can be even on different cpu. So just clear netif_running(). * * dev->stop() will invoke napi_disable() on all of it's * napi_struct instances on this device. */ smp_mb__after_atomic(); /* Commit netif_running(). */ } dev_deactivate_many(head); list_for_each_entry(dev, head, close_list) { const struct net_device_ops *ops = dev->netdev_ops; /* * Call the device specific close. This cannot fail. * Only if device is UP * * We allow it to be called even after a DETACH hot-plug * event. */ if (ops->ndo_stop) ops->ndo_stop(dev); netif_set_up(dev, false); netpoll_poll_enable(dev); } } static void __dev_close(struct net_device *dev) { LIST_HEAD(single); list_add(&dev->close_list, &single); __dev_close_many(&single); list_del(&single); } void dev_close_many(struct list_head *head, bool unlink) { struct net_device *dev, *tmp; /* Remove the devices that don't need to be closed */ list_for_each_entry_safe(dev, tmp, head, close_list) if (!(dev->flags & IFF_UP)) list_del_init(&dev->close_list); __dev_close_many(head); list_for_each_entry_safe(dev, tmp, head, close_list) { rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP | IFF_RUNNING, GFP_KERNEL, 0, NULL); call_netdevice_notifiers(NETDEV_DOWN, dev); if (unlink) list_del_init(&dev->close_list); } } EXPORT_SYMBOL(dev_close_many); /** * dev_close - shutdown an interface. * @dev: device to shutdown * * This function moves an active device into down state. A * %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier * chain. 
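 *
 * Illustrative usage (not in the original source): like dev_open(), this
 * expects the caller to hold RTNL:
 *
 *	rtnl_lock();
 *	dev_close(dev);
 *	rtnl_unlock();
 *
 * Calling it on an interface that is already down is a no-op.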
*/ void dev_close(struct net_device *dev) { if (dev->flags & IFF_UP) { LIST_HEAD(single); list_add(&dev->close_list, &single); dev_close_many(&single, true); list_del(&single); } } EXPORT_SYMBOL(dev_close); /** * dev_disable_lro - disable Large Receive Offload on a device * @dev: device * * Disable Large Receive Offload (LRO) on a net device. Must be * called under RTNL. This is needed if received packets may be * forwarded to another interface. */ void dev_disable_lro(struct net_device *dev) { struct net_device *lower_dev; struct list_head *iter; dev->wanted_features &= ~NETIF_F_LRO; netdev_update_features(dev); if (unlikely(dev->features & NETIF_F_LRO)) netdev_WARN(dev, "failed to disable LRO!\n"); netdev_for_each_lower_dev(dev, lower_dev, iter) dev_disable_lro(lower_dev); } EXPORT_SYMBOL(dev_disable_lro); /** * dev_disable_gro_hw - disable HW Generic Receive Offload on a device * @dev: device * * Disable HW Generic Receive Offload (GRO_HW) on a net device. Must be * called under RTNL. This is needed if Generic XDP is installed on * the device. */ static void dev_disable_gro_hw(struct net_device *dev) { dev->wanted_features &= ~NETIF_F_GRO_HW; netdev_update_features(dev); if (unlikely(dev->features & NETIF_F_GRO_HW)) netdev_WARN(dev, "failed to disable GRO_HW!\n"); } const char *netdev_cmd_to_name(enum netdev_cmd cmd) { #define N(val) \ case NETDEV_##val: \ return "NETDEV_" __stringify(val); switch (cmd) { N(UP) N(DOWN) N(REBOOT) N(CHANGE) N(REGISTER) N(UNREGISTER) N(CHANGEMTU) N(CHANGEADDR) N(GOING_DOWN) N(CHANGENAME) N(FEAT_CHANGE) N(BONDING_FAILOVER) N(PRE_UP) N(PRE_TYPE_CHANGE) N(POST_TYPE_CHANGE) N(POST_INIT) N(PRE_UNINIT) N(RELEASE) N(NOTIFY_PEERS) N(JOIN) N(CHANGEUPPER) N(RESEND_IGMP) N(PRECHANGEMTU) N(CHANGEINFODATA) N(BONDING_INFO) N(PRECHANGEUPPER) N(CHANGELOWERSTATE) N(UDP_TUNNEL_PUSH_INFO) N(UDP_TUNNEL_DROP_INFO) N(CHANGE_TX_QUEUE_LEN) N(CVLAN_FILTER_PUSH_INFO) N(CVLAN_FILTER_DROP_INFO) N(SVLAN_FILTER_PUSH_INFO) N(SVLAN_FILTER_DROP_INFO) N(PRE_CHANGEADDR) N(OFFLOAD_XSTATS_ENABLE) N(OFFLOAD_XSTATS_DISABLE) N(OFFLOAD_XSTATS_REPORT_USED) N(OFFLOAD_XSTATS_REPORT_DELTA) N(XDP_FEAT_CHANGE) } #undef N return "UNKNOWN_NETDEV_EVENT"; } EXPORT_SYMBOL_GPL(netdev_cmd_to_name); static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val, struct net_device *dev) { struct netdev_notifier_info info = { .dev = dev, }; return nb->notifier_call(nb, val, &info); } static int call_netdevice_register_notifiers(struct notifier_block *nb, struct net_device *dev) { int err; err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev); err = notifier_to_errno(err); if (err) return err; if (!(dev->flags & IFF_UP)) return 0; call_netdevice_notifier(nb, NETDEV_UP, dev); return 0; } static void call_netdevice_unregister_notifiers(struct notifier_block *nb, struct net_device *dev) { if (dev->flags & IFF_UP) { call_netdevice_notifier(nb, NETDEV_GOING_DOWN, dev); call_netdevice_notifier(nb, NETDEV_DOWN, dev); } call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev); } static int call_netdevice_register_net_notifiers(struct notifier_block *nb, struct net *net) { struct net_device *dev; int err; for_each_netdev(net, dev) { err = call_netdevice_register_notifiers(nb, dev); if (err) goto rollback; } return 0; rollback: for_each_netdev_continue_reverse(net, dev) call_netdevice_unregister_notifiers(nb, dev); return err; } static void call_netdevice_unregister_net_notifiers(struct notifier_block *nb, struct net *net) { struct net_device *dev; for_each_netdev(net, dev) 
call_netdevice_unregister_notifiers(nb, dev); } static int dev_boot_phase = 1; /** * register_netdevice_notifier - register a network notifier block * @nb: notifier * * Register a notifier to be called when network device events occur. * The notifier passed is linked into the kernel structures and must * not be reused until it has been unregistered. A negative errno code * is returned on a failure. * * When registered all registration and up events are replayed * to the new notifier to allow device to have a race free * view of the network device list. */ int register_netdevice_notifier(struct notifier_block *nb) { struct net *net; int err; /* Close race with setup_net() and cleanup_net() */ down_write(&pernet_ops_rwsem); /* When RTNL is removed, we need protection for netdev_chain. */ rtnl_lock(); err = raw_notifier_chain_register(&netdev_chain, nb); if (err) goto unlock; if (dev_boot_phase) goto unlock; for_each_net(net) { __rtnl_net_lock(net); err = call_netdevice_register_net_notifiers(nb, net); __rtnl_net_unlock(net); if (err) goto rollback; } unlock: rtnl_unlock(); up_write(&pernet_ops_rwsem); return err; rollback: for_each_net_continue_reverse(net) { __rtnl_net_lock(net); call_netdevice_unregister_net_notifiers(nb, net); __rtnl_net_unlock(net); } raw_notifier_chain_unregister(&netdev_chain, nb); goto unlock; } EXPORT_SYMBOL(register_netdevice_notifier); /** * unregister_netdevice_notifier - unregister a network notifier block * @nb: notifier * * Unregister a notifier previously registered by * register_netdevice_notifier(). The notifier is unlinked into the * kernel structures and may then be reused. A negative errno code * is returned on a failure. * * After unregistering unregister and down device events are synthesized * for all devices on the device list to the removed notifier to remove * the need for special case cleanup code. */ int unregister_netdevice_notifier(struct notifier_block *nb) { struct net *net; int err; /* Close race with setup_net() and cleanup_net() */ down_write(&pernet_ops_rwsem); rtnl_lock(); err = raw_notifier_chain_unregister(&netdev_chain, nb); if (err) goto unlock; for_each_net(net) { __rtnl_net_lock(net); call_netdevice_unregister_net_notifiers(nb, net); __rtnl_net_unlock(net); } unlock: rtnl_unlock(); up_write(&pernet_ops_rwsem); return err; } EXPORT_SYMBOL(unregister_netdevice_notifier); static int __register_netdevice_notifier_net(struct net *net, struct notifier_block *nb, bool ignore_call_fail) { int err; err = raw_notifier_chain_register(&net->netdev_chain, nb); if (err) return err; if (dev_boot_phase) return 0; err = call_netdevice_register_net_notifiers(nb, net); if (err && !ignore_call_fail) goto chain_unregister; return 0; chain_unregister: raw_notifier_chain_unregister(&net->netdev_chain, nb); return err; } static int __unregister_netdevice_notifier_net(struct net *net, struct notifier_block *nb) { int err; err = raw_notifier_chain_unregister(&net->netdev_chain, nb); if (err) return err; call_netdevice_unregister_net_notifiers(nb, net); return 0; } /** * register_netdevice_notifier_net - register a per-netns network notifier block * @net: network namespace * @nb: notifier * * Register a notifier to be called when network device events occur. * The notifier passed is linked into the kernel structures and must * not be reused until it has been unregistered. A negative errno code * is returned on a failure. 
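 *
 * A sketch of a hypothetical user watching &init_net (my_netdev_event and
 * my_nb are illustrative names, error handling elided):
 *
 *	static int my_netdev_event(struct notifier_block *nb,
 *				   unsigned long event, void *ptr)
 *	{
 *		struct net_device *dev = netdev_notifier_info_to_dev(ptr);
 *
 *		if (event == NETDEV_UP)
 *			pr_info("%s is up\n", dev->name);
 *		return NOTIFY_DONE;
 *	}
 *
 *	static struct notifier_block my_nb = { .notifier_call = my_netdev_event };
 *
 *	err = register_netdevice_notifier_net(&init_net, &my_nb);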
* * When registered all registration and up events are replayed * to the new notifier to allow device to have a race free * view of the network device list. */ int register_netdevice_notifier_net(struct net *net, struct notifier_block *nb) { int err; rtnl_net_lock(net); err = __register_netdevice_notifier_net(net, nb, false); rtnl_net_unlock(net); return err; } EXPORT_SYMBOL(register_netdevice_notifier_net); /** * unregister_netdevice_notifier_net - unregister a per-netns * network notifier block * @net: network namespace * @nb: notifier * * Unregister a notifier previously registered by * register_netdevice_notifier_net(). The notifier is unlinked from the * kernel structures and may then be reused. A negative errno code * is returned on a failure. * * After unregistering unregister and down device events are synthesized * for all devices on the device list to the removed notifier to remove * the need for special case cleanup code. */ int unregister_netdevice_notifier_net(struct net *net, struct notifier_block *nb) { int err; rtnl_net_lock(net); err = __unregister_netdevice_notifier_net(net, nb); rtnl_net_unlock(net); return err; } EXPORT_SYMBOL(unregister_netdevice_notifier_net); static void __move_netdevice_notifier_net(struct net *src_net, struct net *dst_net, struct notifier_block *nb) { __unregister_netdevice_notifier_net(src_net, nb); __register_netdevice_notifier_net(dst_net, nb, true); } int register_netdevice_notifier_dev_net(struct net_device *dev, struct notifier_block *nb, struct netdev_net_notifier *nn) { struct net *net = dev_net(dev); int err; rtnl_net_lock(net); err = __register_netdevice_notifier_net(net, nb, false); if (!err) { nn->nb = nb; list_add(&nn->list, &dev->net_notifier_list); } rtnl_net_unlock(net); return err; } EXPORT_SYMBOL(register_netdevice_notifier_dev_net); int unregister_netdevice_notifier_dev_net(struct net_device *dev, struct notifier_block *nb, struct netdev_net_notifier *nn) { struct net *net = dev_net(dev); int err; rtnl_net_lock(net); list_del(&nn->list); err = __unregister_netdevice_notifier_net(net, nb); rtnl_net_unlock(net); return err; } EXPORT_SYMBOL(unregister_netdevice_notifier_dev_net); static void move_netdevice_notifiers_dev_net(struct net_device *dev, struct net *net) { struct netdev_net_notifier *nn; list_for_each_entry(nn, &dev->net_notifier_list, list) __move_netdevice_notifier_net(dev_net(dev), net, nn->nb); } /** * call_netdevice_notifiers_info - call all network notifier blocks * @val: value passed unmodified to notifier function * @info: notifier information data * * Call all network notifier blocks. Parameters and return value * are as for raw_notifier_call_chain(). */ int call_netdevice_notifiers_info(unsigned long val, struct netdev_notifier_info *info) { struct net *net = dev_net(info->dev); int ret; ASSERT_RTNL(); /* Run per-netns notifier block chain first, then run the global one. * Hopefully, one day, the global one is going to be removed after * all notifier block registrators get converted to be per-netns. 
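	 *
	 * Note that if the per-netns chain stops the walk (a notifier returned
	 * a value with NOTIFY_STOP_MASK set, e.g. NOTIFY_BAD), the global
	 * chain below is intentionally not run.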
*/ ret = raw_notifier_call_chain(&net->netdev_chain, val, info); if (ret & NOTIFY_STOP_MASK) return ret; return raw_notifier_call_chain(&netdev_chain, val, info); } /** * call_netdevice_notifiers_info_robust - call per-netns notifier blocks * for and rollback on error * @val_up: value passed unmodified to notifier function * @val_down: value passed unmodified to the notifier function when * recovering from an error on @val_up * @info: notifier information data * * Call all per-netns network notifier blocks, but not notifier blocks on * the global notifier chain. Parameters and return value are as for * raw_notifier_call_chain_robust(). */ static int call_netdevice_notifiers_info_robust(unsigned long val_up, unsigned long val_down, struct netdev_notifier_info *info) { struct net *net = dev_net(info->dev); ASSERT_RTNL(); return raw_notifier_call_chain_robust(&net->netdev_chain, val_up, val_down, info); } static int call_netdevice_notifiers_extack(unsigned long val, struct net_device *dev, struct netlink_ext_ack *extack) { struct netdev_notifier_info info = { .dev = dev, .extack = extack, }; return call_netdevice_notifiers_info(val, &info); } /** * call_netdevice_notifiers - call all network notifier blocks * @val: value passed unmodified to notifier function * @dev: net_device pointer passed unmodified to notifier function * * Call all network notifier blocks. Parameters and return value * are as for raw_notifier_call_chain(). */ int call_netdevice_notifiers(unsigned long val, struct net_device *dev) { return call_netdevice_notifiers_extack(val, dev, NULL); } EXPORT_SYMBOL(call_netdevice_notifiers); /** * call_netdevice_notifiers_mtu - call all network notifier blocks * @val: value passed unmodified to notifier function * @dev: net_device pointer passed unmodified to notifier function * @arg: additional u32 argument passed to the notifier function * * Call all network notifier blocks. Parameters and return value * are as for raw_notifier_call_chain(). 
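 *
 * A typical invocation pattern (sketch) for an MTU change, where @arg carries
 * the MTU value notifiers should compare against and new_mtu/orig_mtu are the
 * caller's locals:
 *
 *	call_netdevice_notifiers_mtu(NETDEV_PRECHANGEMTU, dev, new_mtu);
 *	...
 *	call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev, orig_mtu);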
*/ static int call_netdevice_notifiers_mtu(unsigned long val, struct net_device *dev, u32 arg) { struct netdev_notifier_info_ext info = { .info.dev = dev, .ext.mtu = arg, }; BUILD_BUG_ON(offsetof(struct netdev_notifier_info_ext, info) != 0); return call_netdevice_notifiers_info(val, &info.info); } #ifdef CONFIG_NET_INGRESS static DEFINE_STATIC_KEY_FALSE(ingress_needed_key); void net_inc_ingress_queue(void) { static_branch_inc(&ingress_needed_key); } EXPORT_SYMBOL_GPL(net_inc_ingress_queue); void net_dec_ingress_queue(void) { static_branch_dec(&ingress_needed_key); } EXPORT_SYMBOL_GPL(net_dec_ingress_queue); #endif #ifdef CONFIG_NET_EGRESS static DEFINE_STATIC_KEY_FALSE(egress_needed_key); void net_inc_egress_queue(void) { static_branch_inc(&egress_needed_key); } EXPORT_SYMBOL_GPL(net_inc_egress_queue); void net_dec_egress_queue(void) { static_branch_dec(&egress_needed_key); } EXPORT_SYMBOL_GPL(net_dec_egress_queue); #endif #ifdef CONFIG_NET_CLS_ACT DEFINE_STATIC_KEY_FALSE(tcf_sw_enabled_key); EXPORT_SYMBOL(tcf_sw_enabled_key); #endif DEFINE_STATIC_KEY_FALSE(netstamp_needed_key); EXPORT_SYMBOL(netstamp_needed_key); #ifdef CONFIG_JUMP_LABEL static atomic_t netstamp_needed_deferred; static atomic_t netstamp_wanted; static void netstamp_clear(struct work_struct *work) { int deferred = atomic_xchg(&netstamp_needed_deferred, 0); int wanted; wanted = atomic_add_return(deferred, &netstamp_wanted); if (wanted > 0) static_branch_enable(&netstamp_needed_key); else static_branch_disable(&netstamp_needed_key); } static DECLARE_WORK(netstamp_work, netstamp_clear); #endif void net_enable_timestamp(void) { #ifdef CONFIG_JUMP_LABEL int wanted = atomic_read(&netstamp_wanted); while (wanted > 0) { if (atomic_try_cmpxchg(&netstamp_wanted, &wanted, wanted + 1)) return; } atomic_inc(&netstamp_needed_deferred); schedule_work(&netstamp_work); #else static_branch_inc(&netstamp_needed_key); #endif } EXPORT_SYMBOL(net_enable_timestamp); void net_disable_timestamp(void) { #ifdef CONFIG_JUMP_LABEL int wanted = atomic_read(&netstamp_wanted); while (wanted > 1) { if (atomic_try_cmpxchg(&netstamp_wanted, &wanted, wanted - 1)) return; } atomic_dec(&netstamp_needed_deferred); schedule_work(&netstamp_work); #else static_branch_dec(&netstamp_needed_key); #endif } EXPORT_SYMBOL(net_disable_timestamp); static inline void net_timestamp_set(struct sk_buff *skb) { skb->tstamp = 0; skb->tstamp_type = SKB_CLOCK_REALTIME; if (static_branch_unlikely(&netstamp_needed_key)) skb->tstamp = ktime_get_real(); } #define net_timestamp_check(COND, SKB) \ if (static_branch_unlikely(&netstamp_needed_key)) { \ if ((COND) && !(SKB)->tstamp) \ (SKB)->tstamp = ktime_get_real(); \ } \ bool is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb) { return __is_skb_forwardable(dev, skb, true); } EXPORT_SYMBOL_GPL(is_skb_forwardable); static int __dev_forward_skb2(struct net_device *dev, struct sk_buff *skb, bool check_mtu) { int ret = ____dev_forward_skb(dev, skb, check_mtu); if (likely(!ret)) { skb->protocol = eth_type_trans(skb, dev); skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); } return ret; } int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb) { return __dev_forward_skb2(dev, skb, true); } EXPORT_SYMBOL_GPL(__dev_forward_skb); /** * dev_forward_skb - loopback an skb to another netif * * @dev: destination network device * @skb: buffer to forward * * return values: * NET_RX_SUCCESS (no congestion) * NET_RX_DROP (packet was dropped, but freed) * * dev_forward_skb can be used for injecting an skb from the * 
start_xmit function of one device into the receive queue * of another device. * * The receiving device may be in another namespace, so * we have to clear all information in the skb that could * impact namespace isolation. */ int dev_forward_skb(struct net_device *dev, struct sk_buff *skb) { return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb); } EXPORT_SYMBOL_GPL(dev_forward_skb); int dev_forward_skb_nomtu(struct net_device *dev, struct sk_buff *skb) { return __dev_forward_skb2(dev, skb, false) ?: netif_rx_internal(skb); } static inline int deliver_skb(struct sk_buff *skb, struct packet_type *pt_prev, struct net_device *orig_dev) { if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC))) return -ENOMEM; refcount_inc(&skb->users); return pt_prev->func(skb, skb->dev, pt_prev, orig_dev); } static inline void deliver_ptype_list_skb(struct sk_buff *skb, struct packet_type **pt, struct net_device *orig_dev, __be16 type, struct list_head *ptype_list) { struct packet_type *ptype, *pt_prev = *pt; list_for_each_entry_rcu(ptype, ptype_list, list) { if (ptype->type != type) continue; if (pt_prev) deliver_skb(skb, pt_prev, orig_dev); pt_prev = ptype; } *pt = pt_prev; } static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb) { if (!ptype->af_packet_priv || !skb->sk) return false; if (ptype->id_match) return ptype->id_match(ptype, skb->sk); else if ((struct sock *)ptype->af_packet_priv == skb->sk) return true; return false; } /** * dev_nit_active - return true if any network interface taps are in use * * @dev: network device to check for the presence of taps */ bool dev_nit_active(struct net_device *dev) { return !list_empty(&net_hotdata.ptype_all) || !list_empty(&dev->ptype_all); } EXPORT_SYMBOL_GPL(dev_nit_active); /* * Support routine. Sends outgoing frames to any network * taps currently in use. */ void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev) { struct list_head *ptype_list = &net_hotdata.ptype_all; struct packet_type *ptype, *pt_prev = NULL; struct sk_buff *skb2 = NULL; rcu_read_lock(); again: list_for_each_entry_rcu(ptype, ptype_list, list) { if (READ_ONCE(ptype->ignore_outgoing)) continue; /* Never send packets back to the socket * they originated from - MvS (miquels@drinkel.ow.org) */ if (skb_loop_sk(ptype, skb)) continue; if (pt_prev) { deliver_skb(skb2, pt_prev, skb->dev); pt_prev = ptype; continue; } /* need to clone skb, done only once */ skb2 = skb_clone(skb, GFP_ATOMIC); if (!skb2) goto out_unlock; net_timestamp_set(skb2); /* skb->nh should be correctly * set by sender, so that the second statement is * just protection against buggy protocols. */ skb_reset_mac_header(skb2); if (skb_network_header(skb2) < skb2->data || skb_network_header(skb2) > skb_tail_pointer(skb2)) { net_crit_ratelimited("protocol %04x is buggy, dev %s\n", ntohs(skb2->protocol), dev->name); skb_reset_network_header(skb2); } skb2->transport_header = skb2->network_header; skb2->pkt_type = PACKET_OUTGOING; pt_prev = ptype; } if (ptype_list == &net_hotdata.ptype_all) { ptype_list = &dev->ptype_all; goto again; } out_unlock: if (pt_prev) { if (!skb_orphan_frags_rx(skb2, GFP_ATOMIC)) pt_prev->func(skb2, skb->dev, pt_prev, skb->dev); else kfree_skb(skb2); } rcu_read_unlock(); } EXPORT_SYMBOL_GPL(dev_queue_xmit_nit); /** * netif_setup_tc - Handle tc mappings on real_num_tx_queues change * @dev: Network device * @txq: number of queues available * * If real_num_tx_queues is changed the tc mappings may no longer be * valid. 
To resolve this verify the tc mapping remains valid and if * not NULL the mapping. With no priorities mapping to this * offset/count pair it will no longer be used. In the worst case TC0 * is invalid nothing can be done so disable priority mappings. If is * expected that drivers will fix this mapping if they can before * calling netif_set_real_num_tx_queues. */ static void netif_setup_tc(struct net_device *dev, unsigned int txq) { int i; struct netdev_tc_txq *tc = &dev->tc_to_txq[0]; /* If TC0 is invalidated disable TC mapping */ if (tc->offset + tc->count > txq) { netdev_warn(dev, "Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n"); dev->num_tc = 0; return; } /* Invalidated prio to tc mappings set to TC0 */ for (i = 1; i < TC_BITMASK + 1; i++) { int q = netdev_get_prio_tc_map(dev, i); tc = &dev->tc_to_txq[q]; if (tc->offset + tc->count > txq) { netdev_warn(dev, "Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n", i, q); netdev_set_prio_tc_map(dev, i, 0); } } } int netdev_txq_to_tc(struct net_device *dev, unsigned int txq) { if (dev->num_tc) { struct netdev_tc_txq *tc = &dev->tc_to_txq[0]; int i; /* walk through the TCs and see if it falls into any of them */ for (i = 0; i < TC_MAX_QUEUE; i++, tc++) { if ((txq - tc->offset) < tc->count) return i; } /* didn't find it, just return -1 to indicate no match */ return -1; } return 0; } EXPORT_SYMBOL(netdev_txq_to_tc); #ifdef CONFIG_XPS static struct static_key xps_needed __read_mostly; static struct static_key xps_rxqs_needed __read_mostly; static DEFINE_MUTEX(xps_map_mutex); #define xmap_dereference(P) \ rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex)) static bool remove_xps_queue(struct xps_dev_maps *dev_maps, struct xps_dev_maps *old_maps, int tci, u16 index) { struct xps_map *map = NULL; int pos; map = xmap_dereference(dev_maps->attr_map[tci]); if (!map) return false; for (pos = map->len; pos--;) { if (map->queues[pos] != index) continue; if (map->len > 1) { map->queues[pos] = map->queues[--map->len]; break; } if (old_maps) RCU_INIT_POINTER(old_maps->attr_map[tci], NULL); RCU_INIT_POINTER(dev_maps->attr_map[tci], NULL); kfree_rcu(map, rcu); return false; } return true; } static bool remove_xps_queue_cpu(struct net_device *dev, struct xps_dev_maps *dev_maps, int cpu, u16 offset, u16 count) { int num_tc = dev_maps->num_tc; bool active = false; int tci; for (tci = cpu * num_tc; num_tc--; tci++) { int i, j; for (i = count, j = offset; i--; j++) { if (!remove_xps_queue(dev_maps, NULL, tci, j)) break; } active |= i < 0; } return active; } static void reset_xps_maps(struct net_device *dev, struct xps_dev_maps *dev_maps, enum xps_map_type type) { static_key_slow_dec_cpuslocked(&xps_needed); if (type == XPS_RXQS) static_key_slow_dec_cpuslocked(&xps_rxqs_needed); RCU_INIT_POINTER(dev->xps_maps[type], NULL); kfree_rcu(dev_maps, rcu); } static void clean_xps_maps(struct net_device *dev, enum xps_map_type type, u16 offset, u16 count) { struct xps_dev_maps *dev_maps; bool active = false; int i, j; dev_maps = xmap_dereference(dev->xps_maps[type]); if (!dev_maps) return; for (j = 0; j < dev_maps->nr_ids; j++) active |= remove_xps_queue_cpu(dev, dev_maps, j, offset, count); if (!active) reset_xps_maps(dev, dev_maps, type); if (type == XPS_CPUS) { for (i = offset + (count - 1); count--; i--) netdev_queue_numa_node_write( netdev_get_tx_queue(dev, i), NUMA_NO_NODE); } } static void netif_reset_xps_queues(struct net_device *dev, u16 
offset, u16 count) { if (!static_key_false(&xps_needed)) return; cpus_read_lock(); mutex_lock(&xps_map_mutex); if (static_key_false(&xps_rxqs_needed)) clean_xps_maps(dev, XPS_RXQS, offset, count); clean_xps_maps(dev, XPS_CPUS, offset, count); mutex_unlock(&xps_map_mutex); cpus_read_unlock(); } static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index) { netif_reset_xps_queues(dev, index, dev->num_tx_queues - index); } static struct xps_map *expand_xps_map(struct xps_map *map, int attr_index, u16 index, bool is_rxqs_map) { struct xps_map *new_map; int alloc_len = XPS_MIN_MAP_ALLOC; int i, pos; for (pos = 0; map && pos < map->len; pos++) { if (map->queues[pos] != index) continue; return map; } /* Need to add tx-queue to this CPU's/rx-queue's existing map */ if (map) { if (pos < map->alloc_len) return map; alloc_len = map->alloc_len * 2; } /* Need to allocate new map to store tx-queue on this CPU's/rx-queue's * map */ if (is_rxqs_map) new_map = kzalloc(XPS_MAP_SIZE(alloc_len), GFP_KERNEL); else new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL, cpu_to_node(attr_index)); if (!new_map) return NULL; for (i = 0; i < pos; i++) new_map->queues[i] = map->queues[i]; new_map->alloc_len = alloc_len; new_map->len = pos; return new_map; } /* Copy xps maps at a given index */ static void xps_copy_dev_maps(struct xps_dev_maps *dev_maps, struct xps_dev_maps *new_dev_maps, int index, int tc, bool skip_tc) { int i, tci = index * dev_maps->num_tc; struct xps_map *map; /* copy maps belonging to foreign traffic classes */ for (i = 0; i < dev_maps->num_tc; i++, tci++) { if (i == tc && skip_tc) continue; /* fill in the new device map from the old device map */ map = xmap_dereference(dev_maps->attr_map[tci]); RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map); } } /* Must be called under cpus_read_lock */ int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask, u16 index, enum xps_map_type type) { struct xps_dev_maps *dev_maps, *new_dev_maps = NULL, *old_dev_maps = NULL; const unsigned long *online_mask = NULL; bool active = false, copy = false; int i, j, tci, numa_node_id = -2; int maps_sz, num_tc = 1, tc = 0; struct xps_map *map, *new_map; unsigned int nr_ids; WARN_ON_ONCE(index >= dev->num_tx_queues); if (dev->num_tc) { /* Do not allow XPS on subordinate device directly */ num_tc = dev->num_tc; if (num_tc < 0) return -EINVAL; /* If queue belongs to subordinate dev use its map */ dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev; tc = netdev_txq_to_tc(dev, index); if (tc < 0) return -EINVAL; } mutex_lock(&xps_map_mutex); dev_maps = xmap_dereference(dev->xps_maps[type]); if (type == XPS_RXQS) { maps_sz = XPS_RXQ_DEV_MAPS_SIZE(num_tc, dev->num_rx_queues); nr_ids = dev->num_rx_queues; } else { maps_sz = XPS_CPU_DEV_MAPS_SIZE(num_tc); if (num_possible_cpus() > 1) online_mask = cpumask_bits(cpu_online_mask); nr_ids = nr_cpu_ids; } if (maps_sz < L1_CACHE_BYTES) maps_sz = L1_CACHE_BYTES; /* The old dev_maps could be larger or smaller than the one we're * setting up now, as dev->num_tc or nr_ids could have been updated in * between. We could try to be smart, but let's be safe instead and only * copy foreign traffic classes if the two map sizes match. 
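	 * For instance, dev->num_tc may have been changed via
	 * netdev_set_num_tc() between two calls for the same device.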
*/ if (dev_maps && dev_maps->num_tc == num_tc && dev_maps->nr_ids == nr_ids) copy = true; /* allocate memory for queue storage */ for (j = -1; j = netif_attrmask_next_and(j, online_mask, mask, nr_ids), j < nr_ids;) { if (!new_dev_maps) { new_dev_maps = kzalloc(maps_sz, GFP_KERNEL); if (!new_dev_maps) { mutex_unlock(&xps_map_mutex); return -ENOMEM; } new_dev_maps->nr_ids = nr_ids; new_dev_maps->num_tc = num_tc; } tci = j * num_tc + tc; map = copy ? xmap_dereference(dev_maps->attr_map[tci]) : NULL; map = expand_xps_map(map, j, index, type == XPS_RXQS); if (!map) goto error; RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map); } if (!new_dev_maps) goto out_no_new_maps; if (!dev_maps) { /* Increment static keys at most once per type */ static_key_slow_inc_cpuslocked(&xps_needed); if (type == XPS_RXQS) static_key_slow_inc_cpuslocked(&xps_rxqs_needed); } for (j = 0; j < nr_ids; j++) { bool skip_tc = false; tci = j * num_tc + tc; if (netif_attr_test_mask(j, mask, nr_ids) && netif_attr_test_online(j, online_mask, nr_ids)) { /* add tx-queue to CPU/rx-queue maps */ int pos = 0; skip_tc = true; map = xmap_dereference(new_dev_maps->attr_map[tci]); while ((pos < map->len) && (map->queues[pos] != index)) pos++; if (pos == map->len) map->queues[map->len++] = index; #ifdef CONFIG_NUMA if (type == XPS_CPUS) { if (numa_node_id == -2) numa_node_id = cpu_to_node(j); else if (numa_node_id != cpu_to_node(j)) numa_node_id = -1; } #endif } if (copy) xps_copy_dev_maps(dev_maps, new_dev_maps, j, tc, skip_tc); } rcu_assign_pointer(dev->xps_maps[type], new_dev_maps); /* Cleanup old maps */ if (!dev_maps) goto out_no_old_maps; for (j = 0; j < dev_maps->nr_ids; j++) { for (i = num_tc, tci = j * dev_maps->num_tc; i--; tci++) { map = xmap_dereference(dev_maps->attr_map[tci]); if (!map) continue; if (copy) { new_map = xmap_dereference(new_dev_maps->attr_map[tci]); if (map == new_map) continue; } RCU_INIT_POINTER(dev_maps->attr_map[tci], NULL); kfree_rcu(map, rcu); } } old_dev_maps = dev_maps; out_no_old_maps: dev_maps = new_dev_maps; active = true; out_no_new_maps: if (type == XPS_CPUS) /* update Tx queue numa node */ netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index), (numa_node_id >= 0) ? numa_node_id : NUMA_NO_NODE); if (!dev_maps) goto out_no_maps; /* removes tx-queue from unused CPUs/rx-queues */ for (j = 0; j < dev_maps->nr_ids; j++) { tci = j * dev_maps->num_tc; for (i = 0; i < dev_maps->num_tc; i++, tci++) { if (i == tc && netif_attr_test_mask(j, mask, dev_maps->nr_ids) && netif_attr_test_online(j, online_mask, dev_maps->nr_ids)) continue; active |= remove_xps_queue(dev_maps, copy ? old_dev_maps : NULL, tci, index); } } if (old_dev_maps) kfree_rcu(old_dev_maps, rcu); /* free map if not active */ if (!active) reset_xps_maps(dev, dev_maps, type); out_no_maps: mutex_unlock(&xps_map_mutex); return 0; error: /* remove any maps that we added */ for (j = 0; j < nr_ids; j++) { for (i = num_tc, tci = j * num_tc; i--; tci++) { new_map = xmap_dereference(new_dev_maps->attr_map[tci]); map = copy ? 
xmap_dereference(dev_maps->attr_map[tci]) : NULL; if (new_map && new_map != map) kfree(new_map); } } mutex_unlock(&xps_map_mutex); kfree(new_dev_maps); return -ENOMEM; } EXPORT_SYMBOL_GPL(__netif_set_xps_queue); int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask, u16 index) { int ret; cpus_read_lock(); ret = __netif_set_xps_queue(dev, cpumask_bits(mask), index, XPS_CPUS); cpus_read_unlock(); return ret; } EXPORT_SYMBOL(netif_set_xps_queue); #endif static void netdev_unbind_all_sb_channels(struct net_device *dev) { struct netdev_queue *txq = &dev->_tx[dev->num_tx_queues]; /* Unbind any subordinate channels */ while (txq-- != &dev->_tx[0]) { if (txq->sb_dev) netdev_unbind_sb_channel(dev, txq->sb_dev); } } void netdev_reset_tc(struct net_device *dev) { #ifdef CONFIG_XPS netif_reset_xps_queues_gt(dev, 0); #endif netdev_unbind_all_sb_channels(dev); /* Reset TC configuration of device */ dev->num_tc = 0; memset(dev->tc_to_txq, 0, sizeof(dev->tc_to_txq)); memset(dev->prio_tc_map, 0, sizeof(dev->prio_tc_map)); } EXPORT_SYMBOL(netdev_reset_tc); int netdev_set_tc_queue(struct net_device *dev, u8 tc, u16 count, u16 offset) { if (tc >= dev->num_tc) return -EINVAL; #ifdef CONFIG_XPS netif_reset_xps_queues(dev, offset, count); #endif dev->tc_to_txq[tc].count = count; dev->tc_to_txq[tc].offset = offset; return 0; } EXPORT_SYMBOL(netdev_set_tc_queue); int netdev_set_num_tc(struct net_device *dev, u8 num_tc) { if (num_tc > TC_MAX_QUEUE) return -EINVAL; #ifdef CONFIG_XPS netif_reset_xps_queues_gt(dev, 0); #endif netdev_unbind_all_sb_channels(dev); dev->num_tc = num_tc; return 0; } EXPORT_SYMBOL(netdev_set_num_tc); void netdev_unbind_sb_channel(struct net_device *dev, struct net_device *sb_dev) { struct netdev_queue *txq = &dev->_tx[dev->num_tx_queues]; #ifdef CONFIG_XPS netif_reset_xps_queues_gt(sb_dev, 0); #endif memset(sb_dev->tc_to_txq, 0, sizeof(sb_dev->tc_to_txq)); memset(sb_dev->prio_tc_map, 0, sizeof(sb_dev->prio_tc_map)); while (txq-- != &dev->_tx[0]) { if (txq->sb_dev == sb_dev) txq->sb_dev = NULL; } } EXPORT_SYMBOL(netdev_unbind_sb_channel); int netdev_bind_sb_channel_queue(struct net_device *dev, struct net_device *sb_dev, u8 tc, u16 count, u16 offset) { /* Make certain the sb_dev and dev are already configured */ if (sb_dev->num_tc >= 0 || tc >= dev->num_tc) return -EINVAL; /* We cannot hand out queues we don't have */ if ((offset + count) > dev->real_num_tx_queues) return -EINVAL; /* Record the mapping */ sb_dev->tc_to_txq[tc].count = count; sb_dev->tc_to_txq[tc].offset = offset; /* Provide a way for Tx queue to find the tc_to_txq map or * XPS map for itself. */ while (count--) netdev_get_tx_queue(dev, count + offset)->sb_dev = sb_dev; return 0; } EXPORT_SYMBOL(netdev_bind_sb_channel_queue); int netdev_set_sb_channel(struct net_device *dev, u16 channel) { /* Do not use a multiqueue device to represent a subordinate channel */ if (netif_is_multiqueue(dev)) return -ENODEV; /* We allow channels 1 - 32767 to be used for subordinate channels. * Channel 0 is meant to be "native" mode and used only to represent * the main root device. We allow writing 0 to reset the device back * to normal mode after being used as a subordinate channel. */ if (channel > S16_MAX) return -EINVAL; dev->num_tc = -channel; return 0; } EXPORT_SYMBOL(netdev_set_sb_channel); /* * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues * greater than real_num_tx_queues stale skbs on the qdisc must be flushed. 
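 *
 * A sketch of a hypothetical driver shrinking its TX queue set after an
 * ethtool channel change (new_txqs is the driver's new channel count; RTNL
 * is already held on that path):
 *
 *	err = netif_set_real_num_tx_queues(dev, new_txqs);
 *	if (err)
 *		return err;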
*/ int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq) { bool disabling; int rc; disabling = txq < dev->real_num_tx_queues; if (txq < 1 || txq > dev->num_tx_queues) return -EINVAL; if (dev->reg_state == NETREG_REGISTERED || dev->reg_state == NETREG_UNREGISTERING) { ASSERT_RTNL(); rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues, txq); if (rc) return rc; if (dev->num_tc) netif_setup_tc(dev, txq); net_shaper_set_real_num_tx_queues(dev, txq); dev_qdisc_change_real_num_tx(dev, txq); dev->real_num_tx_queues = txq; if (disabling) { synchronize_net(); qdisc_reset_all_tx_gt(dev, txq); #ifdef CONFIG_XPS netif_reset_xps_queues_gt(dev, txq); #endif } } else { dev->real_num_tx_queues = txq; } return 0; } EXPORT_SYMBOL(netif_set_real_num_tx_queues); #ifdef CONFIG_SYSFS /** * netif_set_real_num_rx_queues - set actual number of RX queues used * @dev: Network device * @rxq: Actual number of RX queues * * This must be called either with the rtnl_lock held or before * registration of the net device. Returns 0 on success, or a * negative error code. If called before registration, it always * succeeds. */ int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq) { int rc; if (rxq < 1 || rxq > dev->num_rx_queues) return -EINVAL; if (dev->reg_state == NETREG_REGISTERED) { ASSERT_RTNL(); rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues, rxq); if (rc) return rc; } dev->real_num_rx_queues = rxq; return 0; } EXPORT_SYMBOL(netif_set_real_num_rx_queues); #endif /** * netif_set_real_num_queues - set actual number of RX and TX queues used * @dev: Network device * @txq: Actual number of TX queues * @rxq: Actual number of RX queues * * Set the real number of both TX and RX queues. * Does nothing if the number of queues is already correct. */ int netif_set_real_num_queues(struct net_device *dev, unsigned int txq, unsigned int rxq) { unsigned int old_rxq = dev->real_num_rx_queues; int err; if (txq < 1 || txq > dev->num_tx_queues || rxq < 1 || rxq > dev->num_rx_queues) return -EINVAL; /* Start from increases, so the error path only does decreases - * decreases can't fail. */ if (rxq > dev->real_num_rx_queues) { err = netif_set_real_num_rx_queues(dev, rxq); if (err) return err; } if (txq > dev->real_num_tx_queues) { err = netif_set_real_num_tx_queues(dev, txq); if (err) goto undo_rx; } if (rxq < dev->real_num_rx_queues) WARN_ON(netif_set_real_num_rx_queues(dev, rxq)); if (txq < dev->real_num_tx_queues) WARN_ON(netif_set_real_num_tx_queues(dev, txq)); return 0; undo_rx: WARN_ON(netif_set_real_num_rx_queues(dev, old_rxq)); return err; } EXPORT_SYMBOL(netif_set_real_num_queues); /** * netif_set_tso_max_size() - set the max size of TSO frames supported * @dev: netdev to update * @size: max skb->len of a TSO frame * * Set the limit on the size of TSO super-frames the device can handle. * Unless explicitly set the stack will assume the value of * %GSO_LEGACY_MAX_SIZE. */ void netif_set_tso_max_size(struct net_device *dev, unsigned int size) { dev->tso_max_size = min(GSO_MAX_SIZE, size); if (size < READ_ONCE(dev->gso_max_size)) netif_set_gso_max_size(dev, size); if (size < READ_ONCE(dev->gso_ipv4_max_size)) netif_set_gso_ipv4_max_size(dev, size); } EXPORT_SYMBOL(netif_set_tso_max_size); /** * netif_set_tso_max_segs() - set the max number of segs supported for TSO * @dev: netdev to update * @segs: max number of TCP segments * * Set the limit on the number of TCP segments the device can generate from * a single TSO super-frame. 
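 *
 * For example, a hypothetical device limited to 32 descriptors per packet
 * might call netif_set_tso_max_segs(dev, 32) at probe time.
 *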
* Unless explicitly set the stack will assume the value of %GSO_MAX_SEGS. */ void netif_set_tso_max_segs(struct net_device *dev, unsigned int segs) { dev->tso_max_segs = segs; if (segs < READ_ONCE(dev->gso_max_segs)) netif_set_gso_max_segs(dev, segs); } EXPORT_SYMBOL(netif_set_tso_max_segs); /** * netif_inherit_tso_max() - copy all TSO limits from a lower device to an upper * @to: netdev to update * @from: netdev from which to copy the limits */ void netif_inherit_tso_max(struct net_device *to, const struct net_device *from) { netif_set_tso_max_size(to, from->tso_max_size); netif_set_tso_max_segs(to, from->tso_max_segs); } EXPORT_SYMBOL(netif_inherit_tso_max); /** * netif_get_num_default_rss_queues - default number of RSS queues * * Default value is the number of physical cores if there are only 1 or 2, or * divided by 2 if there are more. */ int netif_get_num_default_rss_queues(void) { cpumask_var_t cpus; int cpu, count = 0; if (unlikely(is_kdump_kernel() || !zalloc_cpumask_var(&cpus, GFP_KERNEL))) return 1; cpumask_copy(cpus, cpu_online_mask); for_each_cpu(cpu, cpus) { ++count; cpumask_andnot(cpus, cpus, topology_sibling_cpumask(cpu)); } free_cpumask_var(cpus); return count > 2 ? DIV_ROUND_UP(count, 2) : count; } EXPORT_SYMBOL(netif_get_num_default_rss_queues); static void __netif_reschedule(struct Qdisc *q) { struct softnet_data *sd; unsigned long flags; local_irq_save(flags); sd = this_cpu_ptr(&softnet_data); q->next_sched = NULL; *sd->output_queue_tailp = q; sd->output_queue_tailp = &q->next_sched; raise_softirq_irqoff(NET_TX_SOFTIRQ); local_irq_restore(flags); } void __netif_schedule(struct Qdisc *q) { if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state)) __netif_reschedule(q); } EXPORT_SYMBOL(__netif_schedule); struct dev_kfree_skb_cb { enum skb_drop_reason reason; }; static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb) { return (struct dev_kfree_skb_cb *)skb->cb; } void netif_schedule_queue(struct netdev_queue *txq) { rcu_read_lock(); if (!netif_xmit_stopped(txq)) { struct Qdisc *q = rcu_dereference(txq->qdisc); __netif_schedule(q); } rcu_read_unlock(); } EXPORT_SYMBOL(netif_schedule_queue); void netif_tx_wake_queue(struct netdev_queue *dev_queue) { if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) { struct Qdisc *q; rcu_read_lock(); q = rcu_dereference(dev_queue->qdisc); __netif_schedule(q); rcu_read_unlock(); } } EXPORT_SYMBOL(netif_tx_wake_queue); void dev_kfree_skb_irq_reason(struct sk_buff *skb, enum skb_drop_reason reason) { unsigned long flags; if (unlikely(!skb)) return; if (likely(refcount_read(&skb->users) == 1)) { smp_rmb(); refcount_set(&skb->users, 0); } else if (likely(!refcount_dec_and_test(&skb->users))) { return; } get_kfree_skb_cb(skb)->reason = reason; local_irq_save(flags); skb->next = __this_cpu_read(softnet_data.completion_queue); __this_cpu_write(softnet_data.completion_queue, skb); raise_softirq_irqoff(NET_TX_SOFTIRQ); local_irq_restore(flags); } EXPORT_SYMBOL(dev_kfree_skb_irq_reason); void dev_kfree_skb_any_reason(struct sk_buff *skb, enum skb_drop_reason reason) { if (in_hardirq() || irqs_disabled()) dev_kfree_skb_irq_reason(skb, reason); else kfree_skb_reason(skb, reason); } EXPORT_SYMBOL(dev_kfree_skb_any_reason); /** * netif_device_detach - mark device as removed * @dev: network device * * Mark device as removed from system and therefore no longer available. 
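 *
 * A common pattern (sketch, hypothetical driver): call this from the
 * driver's suspend handler before powering the hardware down, and call
 * netif_device_attach() from the resume handler once the device is
 * functional again.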
*/ void netif_device_detach(struct net_device *dev) { if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) && netif_running(dev)) { netif_tx_stop_all_queues(dev); } } EXPORT_SYMBOL(netif_device_detach); /** * netif_device_attach - mark device as attached * @dev: network device * * Mark device as attached from system and restart if needed. */ void netif_device_attach(struct net_device *dev) { if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) && netif_running(dev)) { netif_tx_wake_all_queues(dev); netdev_watchdog_up(dev); } } EXPORT_SYMBOL(netif_device_attach); /* * Returns a Tx hash based on the given packet descriptor a Tx queues' number * to be used as a distribution range. */ static u16 skb_tx_hash(const struct net_device *dev, const struct net_device *sb_dev, struct sk_buff *skb) { u32 hash; u16 qoffset = 0; u16 qcount = dev->real_num_tx_queues; if (dev->num_tc) { u8 tc = netdev_get_prio_tc_map(dev, skb->priority); qoffset = sb_dev->tc_to_txq[tc].offset; qcount = sb_dev->tc_to_txq[tc].count; if (unlikely(!qcount)) { net_warn_ratelimited("%s: invalid qcount, qoffset %u for tc %u\n", sb_dev->name, qoffset, tc); qoffset = 0; qcount = dev->real_num_tx_queues; } } if (skb_rx_queue_recorded(skb)) { DEBUG_NET_WARN_ON_ONCE(qcount == 0); hash = skb_get_rx_queue(skb); if (hash >= qoffset) hash -= qoffset; while (unlikely(hash >= qcount)) hash -= qcount; return hash + qoffset; } return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset; } void skb_warn_bad_offload(const struct sk_buff *skb) { static const netdev_features_t null_features; struct net_device *dev = skb->dev; const char *name = ""; if (!net_ratelimit()) return; if (dev) { if (dev->dev.parent) name = dev_driver_string(dev->dev.parent); else name = netdev_name(dev); } skb_dump(KERN_WARNING, skb, false); WARN(1, "%s: caps=(%pNF, %pNF)\n", name, dev ? &dev->features : &null_features, skb->sk ? &skb->sk->sk_route_caps : &null_features); } /* * Invalidate hardware checksum when packet is to be mangled, and * complete checksum manually on outgoing path. */ int skb_checksum_help(struct sk_buff *skb) { __wsum csum; int ret = 0, offset; if (skb->ip_summed == CHECKSUM_COMPLETE) goto out_set_summed; if (unlikely(skb_is_gso(skb))) { skb_warn_bad_offload(skb); return -EINVAL; } if (!skb_frags_readable(skb)) { return -EFAULT; } /* Before computing a checksum, we should make sure no frag could * be modified by an external entity : checksum could be wrong. 
*/ if (skb_has_shared_frag(skb)) { ret = __skb_linearize(skb); if (ret) goto out; } offset = skb_checksum_start_offset(skb); ret = -EINVAL; if (unlikely(offset >= skb_headlen(skb))) { DO_ONCE_LITE(skb_dump, KERN_ERR, skb, false); WARN_ONCE(true, "offset (%d) >= skb_headlen() (%u)\n", offset, skb_headlen(skb)); goto out; } csum = skb_checksum(skb, offset, skb->len - offset, 0); offset += skb->csum_offset; if (unlikely(offset + sizeof(__sum16) > skb_headlen(skb))) { DO_ONCE_LITE(skb_dump, KERN_ERR, skb, false); WARN_ONCE(true, "offset+2 (%zu) > skb_headlen() (%u)\n", offset + sizeof(__sum16), skb_headlen(skb)); goto out; } ret = skb_ensure_writable(skb, offset + sizeof(__sum16)); if (ret) goto out; *(__sum16 *)(skb->data + offset) = csum_fold(csum) ?: CSUM_MANGLED_0; out_set_summed: skb->ip_summed = CHECKSUM_NONE; out: return ret; } EXPORT_SYMBOL(skb_checksum_help); int skb_crc32c_csum_help(struct sk_buff *skb) { __le32 crc32c_csum; int ret = 0, offset, start; if (skb->ip_summed != CHECKSUM_PARTIAL) goto out; if (unlikely(skb_is_gso(skb))) goto out; /* Before computing a checksum, we should make sure no frag could * be modified by an external entity : checksum could be wrong. */ if (unlikely(skb_has_shared_frag(skb))) { ret = __skb_linearize(skb); if (ret) goto out; } start = skb_checksum_start_offset(skb); offset = start + offsetof(struct sctphdr, checksum); if (WARN_ON_ONCE(offset >= skb_headlen(skb))) { ret = -EINVAL; goto out; } ret = skb_ensure_writable(skb, offset + sizeof(__le32)); if (ret) goto out; crc32c_csum = cpu_to_le32(~__skb_checksum(skb, start, skb->len - start, ~(__u32)0, crc32c_csum_stub)); *(__le32 *)(skb->data + offset) = crc32c_csum; skb_reset_csum_not_inet(skb); out: return ret; } EXPORT_SYMBOL(skb_crc32c_csum_help); __be16 skb_network_protocol(struct sk_buff *skb, int *depth) { __be16 type = skb->protocol; /* Tunnel gso handlers can set protocol to ethernet. */ if (type == htons(ETH_P_TEB)) { struct ethhdr *eth; if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr)))) return 0; eth = (struct ethhdr *)skb->data; type = eth->h_proto; } return vlan_get_protocol_and_depth(skb, type, depth); } /* Take action when hardware reception checksum errors are detected. */ #ifdef CONFIG_BUG static void do_netdev_rx_csum_fault(struct net_device *dev, struct sk_buff *skb) { netdev_err(dev, "hw csum failure\n"); skb_dump(KERN_ERR, skb, true); dump_stack(); } void netdev_rx_csum_fault(struct net_device *dev, struct sk_buff *skb) { DO_ONCE_LITE(do_netdev_rx_csum_fault, dev, skb); } EXPORT_SYMBOL(netdev_rx_csum_fault); #endif /* XXX: check that highmem exists at all on the given machine. */ static int illegal_highdma(struct net_device *dev, struct sk_buff *skb) { #ifdef CONFIG_HIGHMEM int i; if (!(dev->features & NETIF_F_HIGHDMA)) { for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; struct page *page = skb_frag_page(frag); if (page && PageHighMem(page)) return 1; } } #endif return 0; } /* If MPLS offload request, verify we are testing hardware MPLS features * instead of standard features for the netdev. 
*/ #if IS_ENABLED(CONFIG_NET_MPLS_GSO) static netdev_features_t net_mpls_features(struct sk_buff *skb, netdev_features_t features, __be16 type) { if (eth_p_mpls(type)) features &= skb->dev->mpls_features; return features; } #else static netdev_features_t net_mpls_features(struct sk_buff *skb, netdev_features_t features, __be16 type) { return features; } #endif static netdev_features_t harmonize_features(struct sk_buff *skb, netdev_features_t features) { __be16 type; type = skb_network_protocol(skb, NULL); features = net_mpls_features(skb, features, type); if (skb->ip_summed != CHECKSUM_NONE && !can_checksum_protocol(features, type)) { features &= ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK); } if (illegal_highdma(skb->dev, skb)) features &= ~NETIF_F_SG; return features; } netdev_features_t passthru_features_check(struct sk_buff *skb, struct net_device *dev, netdev_features_t features) { return features; } EXPORT_SYMBOL(passthru_features_check); static netdev_features_t dflt_features_check(struct sk_buff *skb, struct net_device *dev, netdev_features_t features) { return vlan_features_check(skb, features); } static netdev_features_t gso_features_check(const struct sk_buff *skb, struct net_device *dev, netdev_features_t features) { u16 gso_segs = skb_shinfo(skb)->gso_segs; if (gso_segs > READ_ONCE(dev->gso_max_segs)) return features & ~NETIF_F_GSO_MASK; if (unlikely(skb->len >= netif_get_gso_max_size(dev, skb))) return features & ~NETIF_F_GSO_MASK; if (!skb_shinfo(skb)->gso_type) { skb_warn_bad_offload(skb); return features & ~NETIF_F_GSO_MASK; } /* Support for GSO partial features requires software * intervention before we can actually process the packets * so we need to strip support for any partial features now * and we can pull them back in after we have partially * segmented the frame. */ if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL)) features &= ~dev->gso_partial_features; /* Make sure to clear the IPv4 ID mangling feature if the * IPv4 header has the potential to be fragmented. */ if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) { struct iphdr *iph = skb->encapsulation ? 
inner_ip_hdr(skb) : ip_hdr(skb); if (!(iph->frag_off & htons(IP_DF))) features &= ~NETIF_F_TSO_MANGLEID; } return features; } netdev_features_t netif_skb_features(struct sk_buff *skb) { struct net_device *dev = skb->dev; netdev_features_t features = dev->features; if (skb_is_gso(skb)) features = gso_features_check(skb, dev, features); /* If encapsulation offload request, verify we are testing * hardware encapsulation features instead of standard * features for the netdev */ if (skb->encapsulation) features &= dev->hw_enc_features; if (skb_vlan_tagged(skb)) features = netdev_intersect_features(features, dev->vlan_features | NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX); if (dev->netdev_ops->ndo_features_check) features &= dev->netdev_ops->ndo_features_check(skb, dev, features); else features &= dflt_features_check(skb, dev, features); return harmonize_features(skb, features); } EXPORT_SYMBOL(netif_skb_features); static int xmit_one(struct sk_buff *skb, struct net_device *dev, struct netdev_queue *txq, bool more) { unsigned int len; int rc; if (dev_nit_active(dev)) dev_queue_xmit_nit(skb, dev); len = skb->len; trace_net_dev_start_xmit(skb, dev); rc = netdev_start_xmit(skb, dev, txq, more); trace_net_dev_xmit(skb, rc, dev, len); return rc; } struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev, struct netdev_queue *txq, int *ret) { struct sk_buff *skb = first; int rc = NETDEV_TX_OK; while (skb) { struct sk_buff *next = skb->next; skb_mark_not_on_list(skb); rc = xmit_one(skb, dev, txq, next != NULL); if (unlikely(!dev_xmit_complete(rc))) { skb->next = next; goto out; } skb = next; if (netif_tx_queue_stopped(txq) && skb) { rc = NETDEV_TX_BUSY; break; } } out: *ret = rc; return skb; } static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb, netdev_features_t features) { if (skb_vlan_tag_present(skb) && !vlan_hw_offload_capable(features, skb->vlan_proto)) skb = __vlan_hwaccel_push_inside(skb); return skb; } int skb_csum_hwoffload_help(struct sk_buff *skb, const netdev_features_t features) { if (unlikely(skb_csum_is_sctp(skb))) return !!(features & NETIF_F_SCTP_CRC) ? 0 : skb_crc32c_csum_help(skb); if (features & NETIF_F_HW_CSUM) return 0; if (features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) { if (vlan_get_protocol(skb) == htons(ETH_P_IPV6) && skb_network_header_len(skb) != sizeof(struct ipv6hdr) && !ipv6_has_hopopt_jumbo(skb)) goto sw_checksum; switch (skb->csum_offset) { case offsetof(struct tcphdr, check): case offsetof(struct udphdr, check): return 0; } } sw_checksum: return skb_checksum_help(skb); } EXPORT_SYMBOL(skb_csum_hwoffload_help); static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev, bool *again) { netdev_features_t features; features = netif_skb_features(skb); skb = validate_xmit_vlan(skb, features); if (unlikely(!skb)) goto out_null; skb = sk_validate_xmit_skb(skb, dev); if (unlikely(!skb)) goto out_null; if (netif_needs_gso(skb, features)) { struct sk_buff *segs; segs = skb_gso_segment(skb, features); if (IS_ERR(segs)) { goto out_kfree_skb; } else if (segs) { consume_skb(skb); skb = segs; } } else { if (skb_needs_linearize(skb, features) && __skb_linearize(skb)) goto out_kfree_skb; /* If packet is not checksummed and device does not * support checksumming for this protocol, complete * checksumming here. 
*/ if (skb->ip_summed == CHECKSUM_PARTIAL) { if (skb->encapsulation) skb_set_inner_transport_header(skb, skb_checksum_start_offset(skb)); else skb_set_transport_header(skb, skb_checksum_start_offset(skb)); if (skb_csum_hwoffload_help(skb, features)) goto out_kfree_skb; } } skb = validate_xmit_xfrm(skb, features, again); return skb; out_kfree_skb: kfree_skb(skb); out_null: dev_core_stats_tx_dropped_inc(dev); return NULL; } struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev, bool *again) { struct sk_buff *next, *head = NULL, *tail; for (; skb != NULL; skb = next) { next = skb->next; skb_mark_not_on_list(skb); /* in case skb won't be segmented, point to itself */ skb->prev = skb; skb = validate_xmit_skb(skb, dev, again); if (!skb) continue; if (!head) head = skb; else tail->next = skb; /* If skb was segmented, skb->prev points to * the last segment. If not, it still contains skb. */ tail = skb->prev; } return head; } EXPORT_SYMBOL_GPL(validate_xmit_skb_list); static void qdisc_pkt_len_init(struct sk_buff *skb) { const struct skb_shared_info *shinfo = skb_shinfo(skb); qdisc_skb_cb(skb)->pkt_len = skb->len; /* To get more precise estimation of bytes sent on wire, * we add to pkt_len the headers size of all segments */ if (shinfo->gso_size && skb_transport_header_was_set(skb)) { u16 gso_segs = shinfo->gso_segs; unsigned int hdr_len; /* mac layer + network layer */ hdr_len = skb_transport_offset(skb); /* + transport layer */ if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) { const struct tcphdr *th; struct tcphdr _tcphdr; th = skb_header_pointer(skb, hdr_len, sizeof(_tcphdr), &_tcphdr); if (likely(th)) hdr_len += __tcp_hdrlen(th); } else if (shinfo->gso_type & SKB_GSO_UDP_L4) { struct udphdr _udphdr; if (skb_header_pointer(skb, hdr_len, sizeof(_udphdr), &_udphdr)) hdr_len += sizeof(struct udphdr); } if (unlikely(shinfo->gso_type & SKB_GSO_DODGY)) { int payload = skb->len - hdr_len; /* Malicious packet. */ if (payload <= 0) return; gso_segs = DIV_ROUND_UP(payload, shinfo->gso_size); } qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len; } } static int dev_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *q, struct sk_buff **to_free, struct netdev_queue *txq) { int rc; rc = q->enqueue(skb, q, to_free) & NET_XMIT_MASK; if (rc == NET_XMIT_SUCCESS) trace_qdisc_enqueue(q, txq, skb); return rc; } static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, struct net_device *dev, struct netdev_queue *txq) { spinlock_t *root_lock = qdisc_lock(q); struct sk_buff *to_free = NULL; bool contended; int rc; qdisc_calculate_pkt_len(skb, q); tcf_set_drop_reason(skb, SKB_DROP_REASON_QDISC_DROP); if (q->flags & TCQ_F_NOLOCK) { if (q->flags & TCQ_F_CAN_BYPASS && nolock_qdisc_is_empty(q) && qdisc_run_begin(q)) { /* Retest nolock_qdisc_is_empty() within the protection * of q->seqlock to protect from racing with requeuing. 
*/ if (unlikely(!nolock_qdisc_is_empty(q))) { rc = dev_qdisc_enqueue(skb, q, &to_free, txq); __qdisc_run(q); qdisc_run_end(q); goto no_lock_out; } qdisc_bstats_cpu_update(q, skb); if (sch_direct_xmit(skb, q, dev, txq, NULL, true) && !nolock_qdisc_is_empty(q)) __qdisc_run(q); qdisc_run_end(q); return NET_XMIT_SUCCESS; } rc = dev_qdisc_enqueue(skb, q, &to_free, txq); qdisc_run(q); no_lock_out: if (unlikely(to_free)) kfree_skb_list_reason(to_free, tcf_get_drop_reason(to_free)); return rc; } if (unlikely(READ_ONCE(q->owner) == smp_processor_id())) { kfree_skb_reason(skb, SKB_DROP_REASON_TC_RECLASSIFY_LOOP); return NET_XMIT_DROP; } /* * Heuristic to force contended enqueues to serialize on a * separate lock before trying to get qdisc main lock. * This permits qdisc->running owner to get the lock more * often and dequeue packets faster. * On PREEMPT_RT it is possible to preempt the qdisc owner during xmit * and then other tasks will only enqueue packets. The packets will be * sent after the qdisc owner is scheduled again. To prevent this * scenario the task always serialize on the lock. */ contended = qdisc_is_running(q) || IS_ENABLED(CONFIG_PREEMPT_RT); if (unlikely(contended)) spin_lock(&q->busylock); spin_lock(root_lock); if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) { __qdisc_drop(skb, &to_free); rc = NET_XMIT_DROP; } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) && qdisc_run_begin(q)) { /* * This is a work-conserving queue; there are no old skbs * waiting to be sent out; and the qdisc is not running - * xmit the skb directly. */ qdisc_bstats_update(q, skb); if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) { if (unlikely(contended)) { spin_unlock(&q->busylock); contended = false; } __qdisc_run(q); } qdisc_run_end(q); rc = NET_XMIT_SUCCESS; } else { WRITE_ONCE(q->owner, smp_processor_id()); rc = dev_qdisc_enqueue(skb, q, &to_free, txq); WRITE_ONCE(q->owner, -1); if (qdisc_run_begin(q)) { if (unlikely(contended)) { spin_unlock(&q->busylock); contended = false; } __qdisc_run(q); qdisc_run_end(q); } } spin_unlock(root_lock); if (unlikely(to_free)) kfree_skb_list_reason(to_free, tcf_get_drop_reason(to_free)); if (unlikely(contended)) spin_unlock(&q->busylock); return rc; } #if IS_ENABLED(CONFIG_CGROUP_NET_PRIO) static void skb_update_prio(struct sk_buff *skb) { const struct netprio_map *map; const struct sock *sk; unsigned int prioidx; if (skb->priority) return; map = rcu_dereference_bh(skb->dev->priomap); if (!map) return; sk = skb_to_full_sk(skb); if (!sk) return; prioidx = sock_cgroup_prioidx(&sk->sk_cgrp_data); if (prioidx < map->priomap_len) skb->priority = map->priomap[prioidx]; } #else #define skb_update_prio(skb) #endif /** * dev_loopback_xmit - loop back @skb * @net: network namespace this loopback is happening in * @sk: sk needed to be a netfilter okfn * @skb: buffer to transmit */ int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb) { skb_reset_mac_header(skb); __skb_pull(skb, skb_network_offset(skb)); skb->pkt_type = PACKET_LOOPBACK; if (skb->ip_summed == CHECKSUM_NONE) skb->ip_summed = CHECKSUM_UNNECESSARY; DEBUG_NET_WARN_ON_ONCE(!skb_dst(skb)); skb_dst_force(skb); netif_rx(skb); return 0; } EXPORT_SYMBOL(dev_loopback_xmit); #ifdef CONFIG_NET_EGRESS static struct netdev_queue * netdev_tx_queue_mapping(struct net_device *dev, struct sk_buff *skb) { int qm = skb_get_queue_mapping(skb); return netdev_get_tx_queue(dev, netdev_cap_txqueue(dev, qm)); } #ifndef CONFIG_PREEMPT_RT static bool netdev_xmit_txqueue_skipped(void) { return 
__this_cpu_read(softnet_data.xmit.skip_txqueue); } void netdev_xmit_skip_txqueue(bool skip) { __this_cpu_write(softnet_data.xmit.skip_txqueue, skip); } EXPORT_SYMBOL_GPL(netdev_xmit_skip_txqueue); #else static bool netdev_xmit_txqueue_skipped(void) { return current->net_xmit.skip_txqueue; } void netdev_xmit_skip_txqueue(bool skip) { current->net_xmit.skip_txqueue = skip; } EXPORT_SYMBOL_GPL(netdev_xmit_skip_txqueue); #endif #endif /* CONFIG_NET_EGRESS */ #ifdef CONFIG_NET_XGRESS static int tc_run(struct tcx_entry *entry, struct sk_buff *skb, enum skb_drop_reason *drop_reason) { int ret = TC_ACT_UNSPEC; #ifdef CONFIG_NET_CLS_ACT struct mini_Qdisc *miniq = rcu_dereference_bh(entry->miniq); struct tcf_result res; if (!miniq) return ret; /* Global bypass */ if (!static_branch_likely(&tcf_sw_enabled_key)) return ret; /* Block-wise bypass */ if (tcf_block_bypass_sw(miniq->block)) return ret; tc_skb_cb(skb)->mru = 0; tc_skb_cb(skb)->post_ct = false; tcf_set_drop_reason(skb, *drop_reason); mini_qdisc_bstats_cpu_update(miniq, skb); ret = tcf_classify(skb, miniq->block, miniq->filter_list, &res, false); /* Only tcf related quirks below. */ switch (ret) { case TC_ACT_SHOT: *drop_reason = tcf_get_drop_reason(skb); mini_qdisc_qstats_cpu_drop(miniq); break; case TC_ACT_OK: case TC_ACT_RECLASSIFY: skb->tc_index = TC_H_MIN(res.classid); break; } #endif /* CONFIG_NET_CLS_ACT */ return ret; } static DEFINE_STATIC_KEY_FALSE(tcx_needed_key); void tcx_inc(void) { static_branch_inc(&tcx_needed_key); } void tcx_dec(void) { static_branch_dec(&tcx_needed_key); } static __always_inline enum tcx_action_base tcx_run(const struct bpf_mprog_entry *entry, struct sk_buff *skb, const bool needs_mac) { const struct bpf_mprog_fp *fp; const struct bpf_prog *prog; int ret = TCX_NEXT; if (needs_mac) __skb_push(skb, skb->mac_len); bpf_mprog_foreach_prog(entry, fp, prog) { bpf_compute_data_pointers(skb); ret = bpf_prog_run(prog, skb); if (ret != TCX_NEXT) break; } if (needs_mac) __skb_pull(skb, skb->mac_len); return tcx_action_code(skb, ret); } static __always_inline struct sk_buff * sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, struct net_device *orig_dev, bool *another) { struct bpf_mprog_entry *entry = rcu_dereference_bh(skb->dev->tcx_ingress); enum skb_drop_reason drop_reason = SKB_DROP_REASON_TC_INGRESS; struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx; int sch_ret; if (!entry) return skb; bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx); if (*pt_prev) { *ret = deliver_skb(skb, *pt_prev, orig_dev); *pt_prev = NULL; } qdisc_skb_cb(skb)->pkt_len = skb->len; tcx_set_ingress(skb, true); if (static_branch_unlikely(&tcx_needed_key)) { sch_ret = tcx_run(entry, skb, true); if (sch_ret != TC_ACT_UNSPEC) goto ingress_verdict; } sch_ret = tc_run(tcx_entry(entry), skb, &drop_reason); ingress_verdict: switch (sch_ret) { case TC_ACT_REDIRECT: /* skb_mac_header check was done by BPF, so we can safely * push the L2 header back before redirecting to another * netdev. 
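		 *
		 * If skb_do_redirect() returns -EAGAIN the skb has not been
		 * consumed and is handed back via *another for one more
		 * ingress pass; for any other return value skb_do_redirect()
		 * has consumed the skb (forwarded or freed it).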
*/ __skb_push(skb, skb->mac_len); if (skb_do_redirect(skb) == -EAGAIN) { __skb_pull(skb, skb->mac_len); *another = true; break; } *ret = NET_RX_SUCCESS; bpf_net_ctx_clear(bpf_net_ctx); return NULL; case TC_ACT_SHOT: kfree_skb_reason(skb, drop_reason); *ret = NET_RX_DROP; bpf_net_ctx_clear(bpf_net_ctx); return NULL; /* used by tc_run */ case TC_ACT_STOLEN: case TC_ACT_QUEUED: case TC_ACT_TRAP: consume_skb(skb); fallthrough; case TC_ACT_CONSUMED: *ret = NET_RX_SUCCESS; bpf_net_ctx_clear(bpf_net_ctx); return NULL; } bpf_net_ctx_clear(bpf_net_ctx); return skb; } static __always_inline struct sk_buff * sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev) { struct bpf_mprog_entry *entry = rcu_dereference_bh(dev->tcx_egress); enum skb_drop_reason drop_reason = SKB_DROP_REASON_TC_EGRESS; struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx; int sch_ret; if (!entry) return skb; bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx); /* qdisc_skb_cb(skb)->pkt_len & tcx_set_ingress() was * already set by the caller. */ if (static_branch_unlikely(&tcx_needed_key)) { sch_ret = tcx_run(entry, skb, false); if (sch_ret != TC_ACT_UNSPEC) goto egress_verdict; } sch_ret = tc_run(tcx_entry(entry), skb, &drop_reason); egress_verdict: switch (sch_ret) { case TC_ACT_REDIRECT: /* No need to push/pop skb's mac_header here on egress! */ skb_do_redirect(skb); *ret = NET_XMIT_SUCCESS; bpf_net_ctx_clear(bpf_net_ctx); return NULL; case TC_ACT_SHOT: kfree_skb_reason(skb, drop_reason); *ret = NET_XMIT_DROP; bpf_net_ctx_clear(bpf_net_ctx); return NULL; /* used by tc_run */ case TC_ACT_STOLEN: case TC_ACT_QUEUED: case TC_ACT_TRAP: consume_skb(skb); fallthrough; case TC_ACT_CONSUMED: *ret = NET_XMIT_SUCCESS; bpf_net_ctx_clear(bpf_net_ctx); return NULL; } bpf_net_ctx_clear(bpf_net_ctx); return skb; } #else static __always_inline struct sk_buff * sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, struct net_device *orig_dev, bool *another) { return skb; } static __always_inline struct sk_buff * sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev) { return skb; } #endif /* CONFIG_NET_XGRESS */ #ifdef CONFIG_XPS static int __get_xps_queue_idx(struct net_device *dev, struct sk_buff *skb, struct xps_dev_maps *dev_maps, unsigned int tci) { int tc = netdev_get_prio_tc_map(dev, skb->priority); struct xps_map *map; int queue_index = -1; if (tc >= dev_maps->num_tc || tci >= dev_maps->nr_ids) return queue_index; tci *= dev_maps->num_tc; tci += tc; map = rcu_dereference(dev_maps->attr_map[tci]); if (map) { if (map->len == 1) queue_index = map->queues[0]; else queue_index = map->queues[reciprocal_scale( skb_get_hash(skb), map->len)]; if (unlikely(queue_index >= dev->real_num_tx_queues)) queue_index = -1; } return queue_index; } #endif static int get_xps_queue(struct net_device *dev, struct net_device *sb_dev, struct sk_buff *skb) { #ifdef CONFIG_XPS struct xps_dev_maps *dev_maps; struct sock *sk = skb->sk; int queue_index = -1; if (!static_key_false(&xps_needed)) return -1; rcu_read_lock(); if (!static_key_false(&xps_rxqs_needed)) goto get_cpus_map; dev_maps = rcu_dereference(sb_dev->xps_maps[XPS_RXQS]); if (dev_maps) { int tci = sk_rx_queue_get(sk); if (tci >= 0) queue_index = __get_xps_queue_idx(dev, skb, dev_maps, tci); } get_cpus_map: if (queue_index < 0) { dev_maps = rcu_dereference(sb_dev->xps_maps[XPS_CPUS]); if (dev_maps) { unsigned int tci = skb->sender_cpu - 1; queue_index = __get_xps_queue_idx(dev, skb, dev_maps, tci); } } rcu_read_unlock(); return queue_index; #else 
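	/* Built without CONFIG_XPS: tell the caller (netdev_pick_tx()) to fall
	 * back to skb_tx_hash().
	 */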
return -1; #endif } u16 dev_pick_tx_zero(struct net_device *dev, struct sk_buff *skb, struct net_device *sb_dev) { return 0; } EXPORT_SYMBOL(dev_pick_tx_zero); u16 netdev_pick_tx(struct net_device *dev, struct sk_buff *skb, struct net_device *sb_dev) { struct sock *sk = skb->sk; int queue_index = sk_tx_queue_get(sk); sb_dev = sb_dev ? : dev; if (queue_index < 0 || skb->ooo_okay || queue_index >= dev->real_num_tx_queues) { int new_index = get_xps_queue(dev, sb_dev, skb); if (new_index < 0) new_index = skb_tx_hash(dev, sb_dev, skb); if (queue_index != new_index && sk && sk_fullsock(sk) && rcu_access_pointer(sk->sk_dst_cache)) sk_tx_queue_set(sk, new_index); queue_index = new_index; } return queue_index; } EXPORT_SYMBOL(netdev_pick_tx); struct netdev_queue *netdev_core_pick_tx(struct net_device *dev, struct sk_buff *skb, struct net_device *sb_dev) { int queue_index = 0; #ifdef CONFIG_XPS u32 sender_cpu = skb->sender_cpu - 1; if (sender_cpu >= (u32)NR_CPUS) skb->sender_cpu = raw_smp_processor_id() + 1; #endif if (dev->real_num_tx_queues != 1) { const struct net_device_ops *ops = dev->netdev_ops; if (ops->ndo_select_queue) queue_index = ops->ndo_select_queue(dev, skb, sb_dev); else queue_index = netdev_pick_tx(dev, skb, sb_dev); queue_index = netdev_cap_txqueue(dev, queue_index); } skb_set_queue_mapping(skb, queue_index); return netdev_get_tx_queue(dev, queue_index); } /** * __dev_queue_xmit() - transmit a buffer * @skb: buffer to transmit * @sb_dev: subordinate device used for L2 forwarding offload * * Queue a buffer for transmission to a network device. The caller must * have set the device and priority and built the buffer before calling * this function. The function can be called from an interrupt. * * When calling this method, interrupts MUST be enabled. This is because * the BH enable code must have IRQs enabled so that it will not deadlock. * * Regardless of the return value, the skb is consumed, so it is currently * difficult to retry a send to this method. (You can bump the ref count * before sending to hold a reference for retry if you are careful.) * * Return: * * 0 - buffer successfully transmitted * * positive qdisc return code - NET_XMIT_DROP etc. * * negative errno - other errors */ int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev) { struct net_device *dev = skb->dev; struct netdev_queue *txq = NULL; struct Qdisc *q; int rc = -ENOMEM; bool again = false; skb_reset_mac_header(skb); skb_assert_len(skb); if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP)) __skb_tstamp_tx(skb, NULL, NULL, skb->sk, SCM_TSTAMP_SCHED); /* Disable soft irqs for various locks below. Also * stops preemption for RCU. */ rcu_read_lock_bh(); skb_update_prio(skb); qdisc_pkt_len_init(skb); tcx_set_ingress(skb, false); #ifdef CONFIG_NET_EGRESS if (static_branch_unlikely(&egress_needed_key)) { if (nf_hook_egress_active()) { skb = nf_hook_egress(skb, &rc, dev); if (!skb) goto out; } netdev_xmit_skip_txqueue(false); nf_skip_egress(skb, true); skb = sch_handle_egress(skb, &rc, dev); if (!skb) goto out; nf_skip_egress(skb, false); if (netdev_xmit_txqueue_skipped()) txq = netdev_tx_queue_mapping(dev, skb); } #endif /* If device/qdisc don't need skb->dst, release it right now while * it's hot in this cpu cache.
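(As an aside on the flag semantics: most physical NICs keep IFF_XMIT_DST_RELEASE set, so the dst is dropped here; devices that still need the route after xmit, e.g. tunnels that call netif_keep_dst(), clear the flag and skb_dst_force() pins the reference for them instead.)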
*/ if (dev->priv_flags & IFF_XMIT_DST_RELEASE) skb_dst_drop(skb); else skb_dst_force(skb); if (!txq) txq = netdev_core_pick_tx(dev, skb, sb_dev); q = rcu_dereference_bh(txq->qdisc); trace_net_dev_queue(skb); if (q->enqueue) { rc = __dev_xmit_skb(skb, q, dev, txq); goto out; } /* The device has no queue. Common case for software devices: * loopback, all the sorts of tunnels... * Really, it is unlikely that netif_tx_lock protection is necessary * here. (f.e. loopback and IP tunnels are clean ignoring statistics * counters.) * However, it is possible, that they rely on protection * made by us here. * Check this and shot the lock. It is not prone from deadlocks. *Either shot noqueue qdisc, it is even simpler 8) */ if (dev->flags & IFF_UP) { int cpu = smp_processor_id(); /* ok because BHs are off */ /* Other cpus might concurrently change txq->xmit_lock_owner * to -1 or to their cpu id, but not to our id. */ if (READ_ONCE(txq->xmit_lock_owner) != cpu) { if (dev_xmit_recursion()) goto recursion_alert; skb = validate_xmit_skb(skb, dev, &again); if (!skb) goto out; HARD_TX_LOCK(dev, txq, cpu); if (!netif_xmit_stopped(txq)) { dev_xmit_recursion_inc(); skb = dev_hard_start_xmit(skb, dev, txq, &rc); dev_xmit_recursion_dec(); if (dev_xmit_complete(rc)) { HARD_TX_UNLOCK(dev, txq); goto out; } } HARD_TX_UNLOCK(dev, txq); net_crit_ratelimited("Virtual device %s asks to queue packet!\n", dev->name); } else { /* Recursion is detected! It is possible, * unfortunately */ recursion_alert: net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n", dev->name); } } rc = -ENETDOWN; rcu_read_unlock_bh(); dev_core_stats_tx_dropped_inc(dev); kfree_skb_list(skb); return rc; out: rcu_read_unlock_bh(); return rc; } EXPORT_SYMBOL(__dev_queue_xmit); int __dev_direct_xmit(struct sk_buff *skb, u16 queue_id) { struct net_device *dev = skb->dev; struct sk_buff *orig_skb = skb; struct netdev_queue *txq; int ret = NETDEV_TX_BUSY; bool again = false; if (unlikely(!netif_running(dev) || !netif_carrier_ok(dev))) goto drop; skb = validate_xmit_skb_list(skb, dev, &again); if (skb != orig_skb) goto drop; skb_set_queue_mapping(skb, queue_id); txq = skb_get_tx_queue(dev, skb); local_bh_disable(); dev_xmit_recursion_inc(); HARD_TX_LOCK(dev, txq, smp_processor_id()); if (!netif_xmit_frozen_or_drv_stopped(txq)) ret = netdev_start_xmit(skb, dev, txq, false); HARD_TX_UNLOCK(dev, txq); dev_xmit_recursion_dec(); local_bh_enable(); return ret; drop: dev_core_stats_tx_dropped_inc(dev); kfree_skb_list(skb); return NET_XMIT_DROP; } EXPORT_SYMBOL(__dev_direct_xmit); /************************************************************************* * Receiver routines *************************************************************************/ static DEFINE_PER_CPU(struct task_struct *, backlog_napi); int weight_p __read_mostly = 64; /* old backlog weight */ int dev_weight_rx_bias __read_mostly = 1; /* bias for backlog weight */ int dev_weight_tx_bias __read_mostly = 1; /* bias for output_queue quota */ /* Called with irq disabled */ static inline void ____napi_schedule(struct softnet_data *sd, struct napi_struct *napi) { struct task_struct *thread; lockdep_assert_irqs_disabled(); if (test_bit(NAPI_STATE_THREADED, &napi->state)) { /* Paired with smp_mb__before_atomic() in * napi_enable()/dev_set_threaded(). * Use READ_ONCE() to guarantee a complete * read on napi->thread. Only call * wake_up_process() when it's not NULL. 
*/ thread = READ_ONCE(napi->thread); if (thread) { if (use_backlog_threads() && thread == raw_cpu_read(backlog_napi)) goto use_local_napi; set_bit(NAPI_STATE_SCHED_THREADED, &napi->state); wake_up_process(thread); return; } } use_local_napi: list_add_tail(&napi->poll_list, &sd->poll_list); WRITE_ONCE(napi->list_owner, smp_processor_id()); /* If not called from net_rx_action() * we have to raise NET_RX_SOFTIRQ. */ if (!sd->in_net_rx_action) __raise_softirq_irqoff(NET_RX_SOFTIRQ); } #ifdef CONFIG_RPS struct static_key_false rps_needed __read_mostly; EXPORT_SYMBOL(rps_needed); struct static_key_false rfs_needed __read_mostly; EXPORT_SYMBOL(rfs_needed); static struct rps_dev_flow * set_rps_cpu(struct net_device *dev, struct sk_buff *skb, struct rps_dev_flow *rflow, u16 next_cpu) { if (next_cpu < nr_cpu_ids) { u32 head; #ifdef CONFIG_RFS_ACCEL struct netdev_rx_queue *rxqueue; struct rps_dev_flow_table *flow_table; struct rps_dev_flow *old_rflow; u16 rxq_index; u32 flow_id; int rc; /* Should we steer this flow to a different hardware queue? */ if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap || !(dev->features & NETIF_F_NTUPLE)) goto out; rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu); if (rxq_index == skb_get_rx_queue(skb)) goto out; rxqueue = dev->_rx + rxq_index; flow_table = rcu_dereference(rxqueue->rps_flow_table); if (!flow_table) goto out; flow_id = skb_get_hash(skb) & flow_table->mask; rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb, rxq_index, flow_id); if (rc < 0) goto out; old_rflow = rflow; rflow = &flow_table->flows[flow_id]; WRITE_ONCE(rflow->filter, rc); if (old_rflow->filter == rc) WRITE_ONCE(old_rflow->filter, RPS_NO_FILTER); out: #endif head = READ_ONCE(per_cpu(softnet_data, next_cpu).input_queue_head); rps_input_queue_tail_save(&rflow->last_qtail, head); } WRITE_ONCE(rflow->cpu, next_cpu); return rflow; } /* * get_rps_cpu is called from netif_receive_skb and returns the target * CPU from the RPS map of the receiving queue for a given skb. * rcu_read_lock must be held on entry. */ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, struct rps_dev_flow **rflowp) { const struct rps_sock_flow_table *sock_flow_table; struct netdev_rx_queue *rxqueue = dev->_rx; struct rps_dev_flow_table *flow_table; struct rps_map *map; int cpu = -1; u32 tcpu; u32 hash; if (skb_rx_queue_recorded(skb)) { u16 index = skb_get_rx_queue(skb); if (unlikely(index >= dev->real_num_rx_queues)) { WARN_ONCE(dev->real_num_rx_queues > 1, "%s received packet on queue %u, but number " "of RX queues is %u\n", dev->name, index, dev->real_num_rx_queues); goto done; } rxqueue += index; } /* Avoid computing hash if RFS/RPS is not active for this rxqueue */ flow_table = rcu_dereference(rxqueue->rps_flow_table); map = rcu_dereference(rxqueue->rps_map); if (!flow_table && !map) goto done; skb_reset_network_header(skb); hash = skb_get_hash(skb); if (!hash) goto done; sock_flow_table = rcu_dereference(net_hotdata.rps_sock_flow_table); if (flow_table && sock_flow_table) { struct rps_dev_flow *rflow; u32 next_cpu; u32 ident; /* First check into global flow table if there is a match. * This READ_ONCE() pairs with WRITE_ONCE() from rps_record_sock_flow(). 
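Worked example, assuming the upstream rps_record_sock_flow() encoding (flow hash in the upper bits, desired CPU in the low rps_cpu_mask bits): with hash 0xabcd1234, CPU 5 and rps_cpu_mask 0xff the table entry is 0xabcd1205, so (ident ^ hash) & ~rps_cpu_mask == 0 confirms the hash matches and next_cpu = ident & rps_cpu_mask == 5 below.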
*/ ident = READ_ONCE(sock_flow_table->ents[hash & sock_flow_table->mask]); if ((ident ^ hash) & ~net_hotdata.rps_cpu_mask) goto try_rps; next_cpu = ident & net_hotdata.rps_cpu_mask; /* OK, now we know there is a match, * we can look at the local (per receive queue) flow table */ rflow = &flow_table->flows[hash & flow_table->mask]; tcpu = rflow->cpu; /* * If the desired CPU (where last recvmsg was done) is * different from current CPU (one in the rx-queue flow * table entry), switch if one of the following holds: * - Current CPU is unset (>= nr_cpu_ids). * - Current CPU is offline. * - The current CPU's queue tail has advanced beyond the * last packet that was enqueued using this table entry. * This guarantees that all previous packets for the flow * have been dequeued, thus preserving in order delivery. */ if (unlikely(tcpu != next_cpu) && (tcpu >= nr_cpu_ids || !cpu_online(tcpu) || ((int)(READ_ONCE(per_cpu(softnet_data, tcpu).input_queue_head) - rflow->last_qtail)) >= 0)) { tcpu = next_cpu; rflow = set_rps_cpu(dev, skb, rflow, next_cpu); } if (tcpu < nr_cpu_ids && cpu_online(tcpu)) { *rflowp = rflow; cpu = tcpu; goto done; } } try_rps: if (map) { tcpu = map->cpus[reciprocal_scale(hash, map->len)]; if (cpu_online(tcpu)) { cpu = tcpu; goto done; } } done: return cpu; } #ifdef CONFIG_RFS_ACCEL /** * rps_may_expire_flow - check whether an RFS hardware filter may be removed * @dev: Device on which the filter was set * @rxq_index: RX queue index * @flow_id: Flow ID passed to ndo_rx_flow_steer() * @filter_id: Filter ID returned by ndo_rx_flow_steer() * * Drivers that implement ndo_rx_flow_steer() should periodically call * this function for each installed filter and remove the filters for * which it returns %true. */ bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index, u32 flow_id, u16 filter_id) { struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index; struct rps_dev_flow_table *flow_table; struct rps_dev_flow *rflow; bool expire = true; unsigned int cpu; rcu_read_lock(); flow_table = rcu_dereference(rxqueue->rps_flow_table); if (flow_table && flow_id <= flow_table->mask) { rflow = &flow_table->flows[flow_id]; cpu = READ_ONCE(rflow->cpu); if (READ_ONCE(rflow->filter) == filter_id && cpu < nr_cpu_ids && ((int)(READ_ONCE(per_cpu(softnet_data, cpu).input_queue_head) - READ_ONCE(rflow->last_qtail)) < (int)(10 * flow_table->mask))) expire = false; } rcu_read_unlock(); return expire; } EXPORT_SYMBOL(rps_may_expire_flow); #endif /* CONFIG_RFS_ACCEL */ /* Called from hardirq (IPI) context */ static void rps_trigger_softirq(void *data) { struct softnet_data *sd = data; ____napi_schedule(sd, &sd->backlog); sd->received_rps++; } #endif /* CONFIG_RPS */ /* Called from hardirq (IPI) context */ static void trigger_rx_softirq(void *data) { struct softnet_data *sd = data; __raise_softirq_irqoff(NET_RX_SOFTIRQ); smp_store_release(&sd->defer_ipi_scheduled, 0); } /* * After we queued a packet into sd->input_pkt_queue, * we need to make sure this queue is serviced soon. * * - If this is another cpu queue, link it to our rps_ipi_list, * and make sure we will process rps_ipi_list from net_rx_action(). * * - If this is our own queue, NAPI schedule our backlog. * Note that this also raises NET_RX_SOFTIRQ. 
*/ static void napi_schedule_rps(struct softnet_data *sd) { struct softnet_data *mysd = this_cpu_ptr(&softnet_data); #ifdef CONFIG_RPS if (sd != mysd) { if (use_backlog_threads()) { __napi_schedule_irqoff(&sd->backlog); return; } sd->rps_ipi_next = mysd->rps_ipi_list; mysd->rps_ipi_list = sd; /* If not called from net_rx_action() or napi_threaded_poll() * we have to raise NET_RX_SOFTIRQ. */ if (!mysd->in_net_rx_action && !mysd->in_napi_threaded_poll) __raise_softirq_irqoff(NET_RX_SOFTIRQ); return; } #endif /* CONFIG_RPS */ __napi_schedule_irqoff(&mysd->backlog); } void kick_defer_list_purge(struct softnet_data *sd, unsigned int cpu) { unsigned long flags; if (use_backlog_threads()) { backlog_lock_irq_save(sd, &flags); if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) __napi_schedule_irqoff(&sd->backlog); backlog_unlock_irq_restore(sd, &flags); } else if (!cmpxchg(&sd->defer_ipi_scheduled, 0, 1)) { smp_call_function_single_async(cpu, &sd->defer_csd); } } #ifdef CONFIG_NET_FLOW_LIMIT int netdev_flow_limit_table_len __read_mostly = (1 << 12); #endif static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen) { #ifdef CONFIG_NET_FLOW_LIMIT struct sd_flow_limit *fl; struct softnet_data *sd; unsigned int old_flow, new_flow; if (qlen < (READ_ONCE(net_hotdata.max_backlog) >> 1)) return false; sd = this_cpu_ptr(&softnet_data); rcu_read_lock(); fl = rcu_dereference(sd->flow_limit); if (fl) { new_flow = skb_get_hash(skb) & (fl->num_buckets - 1); old_flow = fl->history[fl->history_head]; fl->history[fl->history_head] = new_flow; fl->history_head++; fl->history_head &= FLOW_LIMIT_HISTORY - 1; if (likely(fl->buckets[old_flow])) fl->buckets[old_flow]--; if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) { fl->count++; rcu_read_unlock(); return true; } } rcu_read_unlock(); #endif return false; } /* * enqueue_to_backlog is called to queue an skb to a per CPU backlog * queue (may be a remote CPU queue). */ static int enqueue_to_backlog(struct sk_buff *skb, int cpu, unsigned int *qtail) { enum skb_drop_reason reason; struct softnet_data *sd; unsigned long flags; unsigned int qlen; int max_backlog; u32 tail; reason = SKB_DROP_REASON_DEV_READY; if (!netif_running(skb->dev)) goto bad_dev; reason = SKB_DROP_REASON_CPU_BACKLOG; sd = &per_cpu(softnet_data, cpu); qlen = skb_queue_len_lockless(&sd->input_pkt_queue); max_backlog = READ_ONCE(net_hotdata.max_backlog); if (unlikely(qlen > max_backlog)) goto cpu_backlog_drop; backlog_lock_irq_save(sd, &flags); qlen = skb_queue_len(&sd->input_pkt_queue); if (qlen <= max_backlog && !skb_flow_limit(skb, qlen)) { if (!qlen) { /* Schedule NAPI for backlog device. We can use * non atomic operation as we own the queue lock. 
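As a worked example of this enqueue path (illustrative numbers, default net.core.netdev_max_backlog of 1000): a packet targeted at a remote CPU whose input_pkt_queue holds 300 skbs passes the length check, may still be rejected by skb_flow_limit() if its flow occupies more than half of the recent-history buckets, and is otherwise queued; when the queue was empty we mark the backlog NAPI scheduled here so that napi_schedule_rps() either schedules our own backlog or links the remote softnet_data onto rps_ipi_list for an IPI.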
*/ if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) napi_schedule_rps(sd); } __skb_queue_tail(&sd->input_pkt_queue, skb); tail = rps_input_queue_tail_incr(sd); backlog_unlock_irq_restore(sd, &flags); /* save the tail outside of the critical section */ rps_input_queue_tail_save(qtail, tail); return NET_RX_SUCCESS; } backlog_unlock_irq_restore(sd, &flags); cpu_backlog_drop: atomic_inc(&sd->dropped); bad_dev: dev_core_stats_rx_dropped_inc(skb->dev); kfree_skb_reason(skb, reason); return NET_RX_DROP; } static struct netdev_rx_queue *netif_get_rxqueue(struct sk_buff *skb) { struct net_device *dev = skb->dev; struct netdev_rx_queue *rxqueue; rxqueue = dev->_rx; if (skb_rx_queue_recorded(skb)) { u16 index = skb_get_rx_queue(skb); if (unlikely(index >= dev->real_num_rx_queues)) { WARN_ONCE(dev->real_num_rx_queues > 1, "%s received packet on queue %u, but number " "of RX queues is %u\n", dev->name, index, dev->real_num_rx_queues); return rxqueue; /* Return first rxqueue */ } rxqueue += index; } return rxqueue; } u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp, const struct bpf_prog *xdp_prog) { void *orig_data, *orig_data_end, *hard_start; struct netdev_rx_queue *rxqueue; bool orig_bcast, orig_host; u32 mac_len, frame_sz; __be16 orig_eth_type; struct ethhdr *eth; u32 metalen, act; int off; /* The XDP program wants to see the packet starting at the MAC * header. */ mac_len = skb->data - skb_mac_header(skb); hard_start = skb->data - skb_headroom(skb); /* SKB "head" area always have tailroom for skb_shared_info */ frame_sz = (void *)skb_end_pointer(skb) - hard_start; frame_sz += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); rxqueue = netif_get_rxqueue(skb); xdp_init_buff(xdp, frame_sz, &rxqueue->xdp_rxq); xdp_prepare_buff(xdp, hard_start, skb_headroom(skb) - mac_len, skb_headlen(skb) + mac_len, true); if (skb_is_nonlinear(skb)) { skb_shinfo(skb)->xdp_frags_size = skb->data_len; xdp_buff_set_frags_flag(xdp); } else { xdp_buff_clear_frags_flag(xdp); } orig_data_end = xdp->data_end; orig_data = xdp->data; eth = (struct ethhdr *)xdp->data; orig_host = ether_addr_equal_64bits(eth->h_dest, skb->dev->dev_addr); orig_bcast = is_multicast_ether_addr_64bits(eth->h_dest); orig_eth_type = eth->h_proto; act = bpf_prog_run_xdp(xdp_prog, xdp); /* check if bpf_xdp_adjust_head was used */ off = xdp->data - orig_data; if (off) { if (off > 0) __skb_pull(skb, off); else if (off < 0) __skb_push(skb, -off); skb->mac_header += off; skb_reset_network_header(skb); } /* check if bpf_xdp_adjust_tail was used */ off = xdp->data_end - orig_data_end; if (off != 0) { skb_set_tail_pointer(skb, xdp->data_end - xdp->data); skb->len += off; /* positive on grow, negative on shrink */ } /* XDP frag metadata (e.g. nr_frags) are updated in eBPF helpers * (e.g. bpf_xdp_adjust_tail), we need to update data_len here. */ if (xdp_buff_has_frags(xdp)) skb->data_len = skb_shinfo(skb)->xdp_frags_size; else skb->data_len = 0; /* check if XDP changed eth hdr such SKB needs update */ eth = (struct ethhdr *)xdp->data; if ((orig_eth_type != eth->h_proto) || (orig_host != ether_addr_equal_64bits(eth->h_dest, skb->dev->dev_addr)) || (orig_bcast != is_multicast_ether_addr_64bits(eth->h_dest))) { __skb_push(skb, ETH_HLEN); skb->pkt_type = PACKET_HOST; skb->protocol = eth_type_trans(skb, skb->dev); } /* Redirect/Tx gives L2 packet, code that will reuse skb must __skb_pull * before calling us again on redirect path. We do not call do_redirect * as we leave that up to the caller. 
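Worked example of the fixups above: if the program called bpf_xdp_adjust_head(xdp, -16), xdp->data now sits 16 bytes before orig_data, off == -16 and the skb was __skb_push()ed by 16 with mac_header shifted to match; a bpf_xdp_adjust_tail(xdp, -32) makes off == -32 in the second check, shrinking the tail pointer and reducing skb->len by 32.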
* * Caller is responsible for managing lifetime of skb (i.e. calling * kfree_skb in response to actions it cannot handle/XDP_DROP). */ switch (act) { case XDP_REDIRECT: case XDP_TX: __skb_push(skb, mac_len); break; case XDP_PASS: metalen = xdp->data - xdp->data_meta; if (metalen) skb_metadata_set(skb, metalen); break; } return act; } static int netif_skb_check_for_xdp(struct sk_buff **pskb, const struct bpf_prog *prog) { struct sk_buff *skb = *pskb; int err, hroom, troom; if (!skb_cow_data_for_xdp(this_cpu_read(system_page_pool), pskb, prog)) return 0; /* In case we have to go down the path and also linearize, * then lets do the pskb_expand_head() work just once here. */ hroom = XDP_PACKET_HEADROOM - skb_headroom(skb); troom = skb->tail + skb->data_len - skb->end; err = pskb_expand_head(skb, hroom > 0 ? ALIGN(hroom, NET_SKB_PAD) : 0, troom > 0 ? troom + 128 : 0, GFP_ATOMIC); if (err) return err; return skb_linearize(skb); } static u32 netif_receive_generic_xdp(struct sk_buff **pskb, struct xdp_buff *xdp, const struct bpf_prog *xdp_prog) { struct sk_buff *skb = *pskb; u32 mac_len, act = XDP_DROP; /* Reinjected packets coming from act_mirred or similar should * not get XDP generic processing. */ if (skb_is_redirected(skb)) return XDP_PASS; /* XDP packets must have sufficient headroom of XDP_PACKET_HEADROOM * bytes. This is the guarantee that also native XDP provides, * thus we need to do it here as well. */ mac_len = skb->data - skb_mac_header(skb); __skb_push(skb, mac_len); if (skb_cloned(skb) || skb_is_nonlinear(skb) || skb_headroom(skb) < XDP_PACKET_HEADROOM) { if (netif_skb_check_for_xdp(pskb, xdp_prog)) goto do_drop; } __skb_pull(*pskb, mac_len); act = bpf_prog_run_generic_xdp(*pskb, xdp, xdp_prog); switch (act) { case XDP_REDIRECT: case XDP_TX: case XDP_PASS: break; default: bpf_warn_invalid_xdp_action((*pskb)->dev, xdp_prog, act); fallthrough; case XDP_ABORTED: trace_xdp_exception((*pskb)->dev, xdp_prog, act); fallthrough; case XDP_DROP: do_drop: kfree_skb(*pskb); break; } return act; } /* When doing generic XDP we have to bypass the qdisc layer and the * network taps in order to match in-driver-XDP behavior. This also means * that XDP packets are able to starve other packets going through a qdisc, * and DDOS attacks will be more effective. In-driver-XDP use dedicated TX * queues, so they do not have this starvation issue. 
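As a hedged aside (command syntax from memory, not from this file): this generic path is what runs when a program is attached in SKB mode, e.g. with iproute2 via "ip link set dev eth0 xdpgeneric obj prog.o sec xdp"; drivers with native XDP support would instead run the program in their poll loop before any skb is built.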
*/ void generic_xdp_tx(struct sk_buff *skb, const struct bpf_prog *xdp_prog) { struct net_device *dev = skb->dev; struct netdev_queue *txq; bool free_skb = true; int cpu, rc; txq = netdev_core_pick_tx(dev, skb, NULL); cpu = smp_processor_id(); HARD_TX_LOCK(dev, txq, cpu); if (!netif_xmit_frozen_or_drv_stopped(txq)) { rc = netdev_start_xmit(skb, dev, txq, 0); if (dev_xmit_complete(rc)) free_skb = false; } HARD_TX_UNLOCK(dev, txq); if (free_skb) { trace_xdp_exception(dev, xdp_prog, XDP_TX); dev_core_stats_tx_dropped_inc(dev); kfree_skb(skb); } } static DEFINE_STATIC_KEY_FALSE(generic_xdp_needed_key); int do_xdp_generic(const struct bpf_prog *xdp_prog, struct sk_buff **pskb) { struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx; if (xdp_prog) { struct xdp_buff xdp; u32 act; int err; bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx); act = netif_receive_generic_xdp(pskb, &xdp, xdp_prog); if (act != XDP_PASS) { switch (act) { case XDP_REDIRECT: err = xdp_do_generic_redirect((*pskb)->dev, *pskb, &xdp, xdp_prog); if (err) goto out_redir; break; case XDP_TX: generic_xdp_tx(*pskb, xdp_prog); break; } bpf_net_ctx_clear(bpf_net_ctx); return XDP_DROP; } bpf_net_ctx_clear(bpf_net_ctx); } return XDP_PASS; out_redir: bpf_net_ctx_clear(bpf_net_ctx); kfree_skb_reason(*pskb, SKB_DROP_REASON_XDP); return XDP_DROP; } EXPORT_SYMBOL_GPL(do_xdp_generic); static int netif_rx_internal(struct sk_buff *skb) { int ret; net_timestamp_check(READ_ONCE(net_hotdata.tstamp_prequeue), skb); trace_netif_rx(skb); #ifdef CONFIG_RPS if (static_branch_unlikely(&rps_needed)) { struct rps_dev_flow voidflow, *rflow = &voidflow; int cpu; rcu_read_lock(); cpu = get_rps_cpu(skb->dev, skb, &rflow); if (cpu < 0) cpu = smp_processor_id(); ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail); rcu_read_unlock(); } else #endif { unsigned int qtail; ret = enqueue_to_backlog(skb, smp_processor_id(), &qtail); } return ret; } /** * __netif_rx - Slightly optimized version of netif_rx * @skb: buffer to post * * This behaves as netif_rx except that it does not disable bottom halves. * As a result this function may only be invoked from the interrupt context * (either hard or soft interrupt). */ int __netif_rx(struct sk_buff *skb) { int ret; lockdep_assert_once(hardirq_count() | softirq_count()); trace_netif_rx_entry(skb); ret = netif_rx_internal(skb); trace_netif_rx_exit(ret); return ret; } EXPORT_SYMBOL(__netif_rx); /** * netif_rx - post buffer to the network code * @skb: buffer to post * * This function receives a packet from a device driver and queues it for * the upper (protocol) levels to process via the backlog NAPI device. It * always succeeds. The buffer may be dropped during processing for * congestion control or by the protocol layers. * The network buffer is passed via the backlog NAPI device. Modern NIC * drivers should use NAPI and GRO. * This function can be used from interrupt and from process context. The * caller from process context must not disable interrupts before invoking * this function.
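A hedged driver-side sketch of the contract above (names illustrative): a non-NAPI driver builds the skb, sets skb->protocol = eth_type_trans(skb, dev); and hands the buffer over with netif_rx(skb); code that is certain it runs in hard-irq or softirq context can call the slightly cheaper __netif_rx() above instead.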
* * return values: * NET_RX_SUCCESS (no congestion) * NET_RX_DROP (packet was dropped) * */ int netif_rx(struct sk_buff *skb) { bool need_bh_off = !(hardirq_count() | softirq_count()); int ret; if (need_bh_off) local_bh_disable(); trace_netif_rx_entry(skb); ret = netif_rx_internal(skb); trace_netif_rx_exit(ret); if (need_bh_off) local_bh_enable(); return ret; } EXPORT_SYMBOL(netif_rx); static __latent_entropy void net_tx_action(void) { struct softnet_data *sd = this_cpu_ptr(&softnet_data); if (sd->completion_queue) { struct sk_buff *clist; local_irq_disable(); clist = sd->completion_queue; sd->completion_queue = NULL; local_irq_enable(); while (clist) { struct sk_buff *skb = clist; clist = clist->next; WARN_ON(refcount_read(&skb->users)); if (likely(get_kfree_skb_cb(skb)->reason == SKB_CONSUMED)) trace_consume_skb(skb, net_tx_action); else trace_kfree_skb(skb, net_tx_action, get_kfree_skb_cb(skb)->reason, NULL); if (skb->fclone != SKB_FCLONE_UNAVAILABLE) __kfree_skb(skb); else __napi_kfree_skb(skb, get_kfree_skb_cb(skb)->reason); } } if (sd->output_queue) { struct Qdisc *head; local_irq_disable(); head = sd->output_queue; sd->output_queue = NULL; sd->output_queue_tailp = &sd->output_queue; local_irq_enable(); rcu_read_lock(); while (head) { struct Qdisc *q = head; spinlock_t *root_lock = NULL; head = head->next_sched; /* We need to make sure head->next_sched is read * before clearing __QDISC_STATE_SCHED */ smp_mb__before_atomic(); if (!(q->flags & TCQ_F_NOLOCK)) { root_lock = qdisc_lock(q); spin_lock(root_lock); } else if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) { /* There is a synchronize_net() between * STATE_DEACTIVATED flag being set and * qdisc_reset()/some_qdisc_is_busy() in * dev_deactivate(), so we can safely bail out * early here to avoid data race between * qdisc_deactivate() and some_qdisc_is_busy() * for lockless qdisc. */ clear_bit(__QDISC_STATE_SCHED, &q->state); continue; } clear_bit(__QDISC_STATE_SCHED, &q->state); qdisc_run(q); if (root_lock) spin_unlock(root_lock); } rcu_read_unlock(); } xfrm_dev_backlog(sd); } #if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_ATM_LANE) /* This hook is defined here for ATM LANE */ int (*br_fdb_test_addr_hook)(struct net_device *dev, unsigned char *addr) __read_mostly; EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook); #endif /** * netdev_is_rx_handler_busy - check if receive handler is registered * @dev: device to check * * Check if a receive handler is already registered for a given device. * Return true if there one. * * The caller must hold the rtnl_mutex. */ bool netdev_is_rx_handler_busy(struct net_device *dev) { ASSERT_RTNL(); return dev && rtnl_dereference(dev->rx_handler); } EXPORT_SYMBOL_GPL(netdev_is_rx_handler_busy); /** * netdev_rx_handler_register - register receive handler * @dev: device to register a handler for * @rx_handler: receive handler to register * @rx_handler_data: data pointer that is used by rx handler * * Register a receive handler for a device. This handler will then be * called from __netif_receive_skb. A negative errno code is returned * on a failure. * * The caller must hold the rtnl_mutex. * * For a general description of rx_handler, see enum rx_handler_result. 
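A hedged usage sketch in the spirit of bridge/macvlan/team (names illustrative): under rtnl_lock(), err = netdev_rx_handler_register(port_dev, my_handle_frame, my_port_priv); where my_handle_frame() returns RX_HANDLER_CONSUMED after taking ownership of the skb, or RX_HANDLER_PASS to let normal delivery continue; teardown calls netdev_rx_handler_unregister(port_dev) before freeing my_port_priv.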
*/ int netdev_rx_handler_register(struct net_device *dev, rx_handler_func_t *rx_handler, void *rx_handler_data) { if (netdev_is_rx_handler_busy(dev)) return -EBUSY; if (dev->priv_flags & IFF_NO_RX_HANDLER) return -EINVAL; /* Note: rx_handler_data must be set before rx_handler */ rcu_assign_pointer(dev->rx_handler_data, rx_handler_data); rcu_assign_pointer(dev->rx_handler, rx_handler); return 0; } EXPORT_SYMBOL_GPL(netdev_rx_handler_register); /** * netdev_rx_handler_unregister - unregister receive handler * @dev: device to unregister a handler from * * Unregister a receive handler from a device. * * The caller must hold the rtnl_mutex. */ void netdev_rx_handler_unregister(struct net_device *dev) { ASSERT_RTNL(); RCU_INIT_POINTER(dev->rx_handler, NULL); /* a reader seeing a non NULL rx_handler in a rcu_read_lock() * section has a guarantee to see a non NULL rx_handler_data * as well. */ synchronize_net(); RCU_INIT_POINTER(dev->rx_handler_data, NULL); } EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister); /* * Limit the use of PFMEMALLOC reserves to those protocols that implement * the special handling of PFMEMALLOC skbs. */ static bool skb_pfmemalloc_protocol(struct sk_buff *skb) { switch (skb->protocol) { case htons(ETH_P_ARP): case htons(ETH_P_IP): case htons(ETH_P_IPV6): case htons(ETH_P_8021Q): case htons(ETH_P_8021AD): return true; default: return false; } } static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, struct net_device *orig_dev) { if (nf_hook_ingress_active(skb)) { int ingress_retval; if (*pt_prev) { *ret = deliver_skb(skb, *pt_prev, orig_dev); *pt_prev = NULL; } rcu_read_lock(); ingress_retval = nf_hook_ingress(skb); rcu_read_unlock(); return ingress_retval; } return 0; } static int __netif_receive_skb_core(struct sk_buff **pskb, bool pfmemalloc, struct packet_type **ppt_prev) { struct packet_type *ptype, *pt_prev; rx_handler_func_t *rx_handler; struct sk_buff *skb = *pskb; struct net_device *orig_dev; bool deliver_exact = false; int ret = NET_RX_DROP; __be16 type; net_timestamp_check(!READ_ONCE(net_hotdata.tstamp_prequeue), skb); trace_netif_receive_skb(skb); orig_dev = skb->dev; skb_reset_network_header(skb); #if !defined(CONFIG_DEBUG_NET) /* We plan to no longer reset the transport header here. * Give some time to fuzzers and dev build to catch bugs * in network stacks. 
*/ if (!skb_transport_header_was_set(skb)) skb_reset_transport_header(skb); #endif skb_reset_mac_len(skb); pt_prev = NULL; another_round: skb->skb_iif = skb->dev->ifindex; __this_cpu_inc(softnet_data.processed); if (static_branch_unlikely(&generic_xdp_needed_key)) { int ret2; migrate_disable(); ret2 = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog), &skb); migrate_enable(); if (ret2 != XDP_PASS) { ret = NET_RX_DROP; goto out; } } if (eth_type_vlan(skb->protocol)) { skb = skb_vlan_untag(skb); if (unlikely(!skb)) goto out; } if (skb_skip_tc_classify(skb)) goto skip_classify; if (pfmemalloc) goto skip_taps; list_for_each_entry_rcu(ptype, &net_hotdata.ptype_all, list) { if (pt_prev) ret = deliver_skb(skb, pt_prev, orig_dev); pt_prev = ptype; } list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) { if (pt_prev) ret = deliver_skb(skb, pt_prev, orig_dev); pt_prev = ptype; } skip_taps: #ifdef CONFIG_NET_INGRESS if (static_branch_unlikely(&ingress_needed_key)) { bool another = false; nf_skip_egress(skb, true); skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev, &another); if (another) goto another_round; if (!skb) goto out; nf_skip_egress(skb, false); if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0) goto out; } #endif skb_reset_redirect(skb); skip_classify: if (pfmemalloc && !skb_pfmemalloc_protocol(skb)) goto drop; if (skb_vlan_tag_present(skb)) { if (pt_prev) { ret = deliver_skb(skb, pt_prev, orig_dev); pt_prev = NULL; } if (vlan_do_receive(&skb)) goto another_round; else if (unlikely(!skb)) goto out; } rx_handler = rcu_dereference(skb->dev->rx_handler); if (rx_handler) { if (pt_prev) { ret = deliver_skb(skb, pt_prev, orig_dev); pt_prev = NULL; } switch (rx_handler(&skb)) { case RX_HANDLER_CONSUMED: ret = NET_RX_SUCCESS; goto out; case RX_HANDLER_ANOTHER: goto another_round; case RX_HANDLER_EXACT: deliver_exact = true; break; case RX_HANDLER_PASS: break; default: BUG(); } } if (unlikely(skb_vlan_tag_present(skb)) && !netdev_uses_dsa(skb->dev)) { check_vlan_id: if (skb_vlan_tag_get_id(skb)) { /* Vlan id is non 0 and vlan_do_receive() above couldn't * find vlan device. */ skb->pkt_type = PACKET_OTHERHOST; } else if (eth_type_vlan(skb->protocol)) { /* Outer header is 802.1P with vlan 0, inner header is * 802.1Q or 802.1AD and vlan_do_receive() above could * not find vlan dev for vlan id 0. */ __vlan_hwaccel_clear_tag(skb); skb = skb_vlan_untag(skb); if (unlikely(!skb)) goto out; if (vlan_do_receive(&skb)) /* After stripping off 802.1P header with vlan 0 * vlan dev is found for inner header. */ goto another_round; else if (unlikely(!skb)) goto out; else /* We have stripped outer 802.1P vlan 0 header. * But could not find vlan dev. * check again for vlan id to set OTHERHOST. 
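Worked example of this fallback: for a frame whose hardware-accelerated tag is the priority tag 0x8100/VID 0 with an inner 0x8100/VID 10 header still in the payload, the VID-0 tag is cleared, skb_vlan_untag() lifts the inner tag out, and vlan_do_receive() looks up VLAN 10; on success we take another_round, otherwise the non-zero VID sends us back to check_vlan_id and the packet is marked PACKET_OTHERHOST.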
*/ goto check_vlan_id; } /* Note: we might in the future use prio bits * and set skb->priority like in vlan_do_receive() * For the time being, just ignore Priority Code Point */ __vlan_hwaccel_clear_tag(skb); } type = skb->protocol; /* deliver only exact match when indicated */ if (likely(!deliver_exact)) { deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type, &ptype_base[ntohs(type) & PTYPE_HASH_MASK]); } deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type, &orig_dev->ptype_specific); if (unlikely(skb->dev != orig_dev)) { deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type, &skb->dev->ptype_specific); } if (pt_prev) { if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC))) goto drop; *ppt_prev = pt_prev; } else { drop: if (!deliver_exact) dev_core_stats_rx_dropped_inc(skb->dev); else dev_core_stats_rx_nohandler_inc(skb->dev); kfree_skb_reason(skb, SKB_DROP_REASON_UNHANDLED_PROTO); /* Jamal, now you will not able to escape explaining * me how you were going to use this. :-) */ ret = NET_RX_DROP; } out: /* The invariant here is that if *ppt_prev is not NULL * then skb should also be non-NULL. * * Apparently *ppt_prev assignment above holds this invariant due to * skb dereferencing near it. */ *pskb = skb; return ret; } static int __netif_receive_skb_one_core(struct sk_buff *skb, bool pfmemalloc) { struct net_device *orig_dev = skb->dev; struct packet_type *pt_prev = NULL; int ret; ret = __netif_receive_skb_core(&skb, pfmemalloc, &pt_prev); if (pt_prev) ret = INDIRECT_CALL_INET(pt_prev->func, ipv6_rcv, ip_rcv, skb, skb->dev, pt_prev, orig_dev); return ret; } /** * netif_receive_skb_core - special purpose version of netif_receive_skb * @skb: buffer to process * * More direct receive version of netif_receive_skb(). It should * only be used by callers that have a need to skip RPS and Generic XDP. * Caller must also take care of handling if ``(page_is_)pfmemalloc``. * * This function may only be called from softirq context and interrupts * should be enabled. * * Return values (usually ignored): * NET_RX_SUCCESS: no congestion * NET_RX_DROP: packet was dropped */ int netif_receive_skb_core(struct sk_buff *skb) { int ret; rcu_read_lock(); ret = __netif_receive_skb_one_core(skb, false); rcu_read_unlock(); return ret; } EXPORT_SYMBOL(netif_receive_skb_core); static inline void __netif_receive_skb_list_ptype(struct list_head *head, struct packet_type *pt_prev, struct net_device *orig_dev) { struct sk_buff *skb, *next; if (!pt_prev) return; if (list_empty(head)) return; if (pt_prev->list_func != NULL) INDIRECT_CALL_INET(pt_prev->list_func, ipv6_list_rcv, ip_list_rcv, head, pt_prev, orig_dev); else list_for_each_entry_safe(skb, next, head, list) { skb_list_del_init(skb); pt_prev->func(skb, skb->dev, pt_prev, orig_dev); } } static void __netif_receive_skb_list_core(struct list_head *head, bool pfmemalloc) { /* Fast-path assumptions: * - There is no RX handler. * - Only one packet_type matches. * If either of these fails, we will end up doing some per-packet * processing in-line, then handling the 'last ptype' for the whole * sublist. This can't cause out-of-order delivery to any single ptype, * because the 'last ptype' must be constant across the sublist, and all * other ptypes are handled per-packet. 
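Concrete example: for a burst P1..P5 where P1-P3 resolve to ip_rcv() on eth0 and P4-P5 to ipv6_rcv() on the same device, P1-P3 are accumulated into one sublist and dispatched through ip_list_rcv(), then a fresh sublist collects P4-P5 for ipv6_list_rcv(); per-packet work (taps, rx_handler, VLAN) already happened individually in __netif_receive_skb_core().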
*/ /* Current (common) ptype of sublist */ struct packet_type *pt_curr = NULL; /* Current (common) orig_dev of sublist */ struct net_device *od_curr = NULL; struct sk_buff *skb, *next; LIST_HEAD(sublist); list_for_each_entry_safe(skb, next, head, list) { struct net_device *orig_dev = skb->dev; struct packet_type *pt_prev = NULL; skb_list_del_init(skb); __netif_receive_skb_core(&skb, pfmemalloc, &pt_prev); if (!pt_prev) continue; if (pt_curr != pt_prev || od_curr != orig_dev) { /* dispatch old sublist */ __netif_receive_skb_list_ptype(&sublist, pt_curr, od_curr); /* start new sublist */ INIT_LIST_HEAD(&sublist); pt_curr = pt_prev; od_curr = orig_dev; } list_add_tail(&skb->list, &sublist); } /* dispatch final sublist */ __netif_receive_skb_list_ptype(&sublist, pt_curr, od_curr); } static int __netif_receive_skb(struct sk_buff *skb) { int ret; if (sk_memalloc_socks() && skb_pfmemalloc(skb)) { unsigned int noreclaim_flag; /* * PFMEMALLOC skbs are special, they should * - be delivered to SOCK_MEMALLOC sockets only * - stay away from userspace * - have bounded memory usage * * Use PF_MEMALLOC as this saves us from propagating the allocation * context down to all allocation sites. */ noreclaim_flag = memalloc_noreclaim_save(); ret = __netif_receive_skb_one_core(skb, true); memalloc_noreclaim_restore(noreclaim_flag); } else ret = __netif_receive_skb_one_core(skb, false); return ret; } static void __netif_receive_skb_list(struct list_head *head) { unsigned long noreclaim_flag = 0; struct sk_buff *skb, *next; bool pfmemalloc = false; /* Is current sublist PF_MEMALLOC? */ list_for_each_entry_safe(skb, next, head, list) { if ((sk_memalloc_socks() && skb_pfmemalloc(skb)) != pfmemalloc) { struct list_head sublist; /* Handle the previous sublist */ list_cut_before(&sublist, head, &skb->list); if (!list_empty(&sublist)) __netif_receive_skb_list_core(&sublist, pfmemalloc); pfmemalloc = !pfmemalloc; /* See comments in __netif_receive_skb */ if (pfmemalloc) noreclaim_flag = memalloc_noreclaim_save(); else memalloc_noreclaim_restore(noreclaim_flag); } } /* Handle the remaining sublist */ if (!list_empty(head)) __netif_receive_skb_list_core(head, pfmemalloc); /* Restore pflags */ if (pfmemalloc) memalloc_noreclaim_restore(noreclaim_flag); } static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp) { struct bpf_prog *old = rtnl_dereference(dev->xdp_prog); struct bpf_prog *new = xdp->prog; int ret = 0; switch (xdp->command) { case XDP_SETUP_PROG: rcu_assign_pointer(dev->xdp_prog, new); if (old) bpf_prog_put(old); if (old && !new) { static_branch_dec(&generic_xdp_needed_key); } else if (new && !old) { static_branch_inc(&generic_xdp_needed_key); dev_disable_lro(dev); dev_disable_gro_hw(dev); } break; default: ret = -EINVAL; break; } return ret; } static int netif_receive_skb_internal(struct sk_buff *skb) { int ret; net_timestamp_check(READ_ONCE(net_hotdata.tstamp_prequeue), skb); if (skb_defer_rx_timestamp(skb)) return NET_RX_SUCCESS; rcu_read_lock(); #ifdef CONFIG_RPS if (static_branch_unlikely(&rps_needed)) { struct rps_dev_flow voidflow, *rflow = &voidflow; int cpu = get_rps_cpu(skb->dev, skb, &rflow); if (cpu >= 0) { ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail); rcu_read_unlock(); return ret; } } #endif ret = __netif_receive_skb(skb); rcu_read_unlock(); return ret; } void netif_receive_skb_list_internal(struct list_head *head) { struct sk_buff *skb, *next; LIST_HEAD(sublist); list_for_each_entry_safe(skb, next, head, list) { 
net_timestamp_check(READ_ONCE(net_hotdata.tstamp_prequeue), skb); skb_list_del_init(skb); if (!skb_defer_rx_timestamp(skb)) list_add_tail(&skb->list, &sublist); } list_splice_init(&sublist, head); rcu_read_lock(); #ifdef CONFIG_RPS if (static_branch_unlikely(&rps_needed)) { list_for_each_entry_safe(skb, next, head, list) { struct rps_dev_flow voidflow, *rflow = &voidflow; int cpu = get_rps_cpu(skb->dev, skb, &rflow); if (cpu >= 0) { /* Will be handled, remove from list */ skb_list_del_init(skb); enqueue_to_backlog(skb, cpu, &rflow->last_qtail); } } } #endif __netif_receive_skb_list(head); rcu_read_unlock(); } /** * netif_receive_skb - process receive buffer from network * @skb: buffer to process * * netif_receive_skb() is the main receive data processing function. * It always succeeds. The buffer may be dropped during processing * for congestion control or by the protocol layers. * * This function may only be called from softirq context and interrupts * should be enabled. * * Return values (usually ignored): * NET_RX_SUCCESS: no congestion * NET_RX_DROP: packet was dropped */ int netif_receive_skb(struct sk_buff *skb) { int ret; trace_netif_receive_skb_entry(skb); ret = netif_receive_skb_internal(skb); trace_netif_receive_skb_exit(ret); return ret; } EXPORT_SYMBOL(netif_receive_skb); /** * netif_receive_skb_list - process many receive buffers from network * @head: list of skbs to process. * * Since return value of netif_receive_skb() is normally ignored, and * wouldn't be meaningful for a list, this function returns void. * * This function may only be called from softirq context and interrupts * should be enabled. */ void netif_receive_skb_list(struct list_head *head) { struct sk_buff *skb; if (list_empty(head)) return; if (trace_netif_receive_skb_list_entry_enabled()) { list_for_each_entry(skb, head, list) trace_netif_receive_skb_list_entry(skb); } netif_receive_skb_list_internal(head); trace_netif_receive_skb_list_exit(0); } EXPORT_SYMBOL(netif_receive_skb_list); /* Network device is going away, flush any packets still pending */ static void flush_backlog(struct work_struct *work) { struct sk_buff *skb, *tmp; struct softnet_data *sd; local_bh_disable(); sd = this_cpu_ptr(&softnet_data); backlog_lock_irq_disable(sd); skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) { if (skb->dev->reg_state == NETREG_UNREGISTERING) { __skb_unlink(skb, &sd->input_pkt_queue); dev_kfree_skb_irq(skb); rps_input_queue_head_incr(sd); } } backlog_unlock_irq_enable(sd); local_lock_nested_bh(&softnet_data.process_queue_bh_lock); skb_queue_walk_safe(&sd->process_queue, skb, tmp) { if (skb->dev->reg_state == NETREG_UNREGISTERING) { __skb_unlink(skb, &sd->process_queue); kfree_skb(skb); rps_input_queue_head_incr(sd); } } local_unlock_nested_bh(&softnet_data.process_queue_bh_lock); local_bh_enable(); } static bool flush_required(int cpu) { #if IS_ENABLED(CONFIG_RPS) struct softnet_data *sd = &per_cpu(softnet_data, cpu); bool do_flush; backlog_lock_irq_disable(sd); /* as insertion into process_queue happens with the rps lock held, * process_queue access may race only with dequeue */ do_flush = !skb_queue_empty(&sd->input_pkt_queue) || !skb_queue_empty_lockless(&sd->process_queue); backlog_unlock_irq_enable(sd); return do_flush; #endif /* without RPS we can't safely check input_pkt_queue: during a * concurrent remote skb_queue_splice() we can detect as empty both * input_pkt_queue and process_queue even if the latter could end-up * containing a lot of packets. 
*/ return true; } struct flush_backlogs { cpumask_t flush_cpus; struct work_struct w[]; }; static struct flush_backlogs *flush_backlogs_alloc(void) { return kmalloc(struct_size_t(struct flush_backlogs, w, nr_cpu_ids), GFP_KERNEL); } static struct flush_backlogs *flush_backlogs_fallback; static DEFINE_MUTEX(flush_backlogs_mutex); static void flush_all_backlogs(void) { struct flush_backlogs *ptr = flush_backlogs_alloc(); unsigned int cpu; if (!ptr) { mutex_lock(&flush_backlogs_mutex); ptr = flush_backlogs_fallback; } cpumask_clear(&ptr->flush_cpus); cpus_read_lock(); for_each_online_cpu(cpu) { if (flush_required(cpu)) { INIT_WORK(&ptr->w[cpu], flush_backlog); queue_work_on(cpu, system_highpri_wq, &ptr->w[cpu]); __cpumask_set_cpu(cpu, &ptr->flush_cpus); } } /* we can have in flight packet[s] on the cpus we are not flushing, * synchronize_net() in unregister_netdevice_many() will take care of * them. */ for_each_cpu(cpu, &ptr->flush_cpus) flush_work(&ptr->w[cpu]); cpus_read_unlock(); if (ptr != flush_backlogs_fallback) kfree(ptr); else mutex_unlock(&flush_backlogs_mutex); } static void net_rps_send_ipi(struct softnet_data *remsd) { #ifdef CONFIG_RPS while (remsd) { struct softnet_data *next = remsd->rps_ipi_next; if (cpu_online(remsd->cpu)) smp_call_function_single_async(remsd->cpu, &remsd->csd); remsd = next; } #endif } /* * net_rps_action_and_irq_enable sends any pending IPI's for rps. * Note: called with local irq disabled, but exits with local irq enabled. */ static void net_rps_action_and_irq_enable(struct softnet_data *sd) { #ifdef CONFIG_RPS struct softnet_data *remsd = sd->rps_ipi_list; if (!use_backlog_threads() && remsd) { sd->rps_ipi_list = NULL; local_irq_enable(); /* Send pending IPI's to kick RPS processing on remote cpus. */ net_rps_send_ipi(remsd); } else #endif local_irq_enable(); } static bool sd_has_rps_ipi_waiting(struct softnet_data *sd) { #ifdef CONFIG_RPS return !use_backlog_threads() && sd->rps_ipi_list; #else return false; #endif } static int process_backlog(struct napi_struct *napi, int quota) { struct softnet_data *sd = container_of(napi, struct softnet_data, backlog); bool again = true; int work = 0; /* Check if we have pending ipi, its better to send them now, * not waiting net_rx_action() end. */ if (sd_has_rps_ipi_waiting(sd)) { local_irq_disable(); net_rps_action_and_irq_enable(sd); } napi->weight = READ_ONCE(net_hotdata.dev_rx_weight); while (again) { struct sk_buff *skb; local_lock_nested_bh(&softnet_data.process_queue_bh_lock); while ((skb = __skb_dequeue(&sd->process_queue))) { local_unlock_nested_bh(&softnet_data.process_queue_bh_lock); rcu_read_lock(); __netif_receive_skb(skb); rcu_read_unlock(); if (++work >= quota) { rps_input_queue_head_add(sd, work); return work; } local_lock_nested_bh(&softnet_data.process_queue_bh_lock); } local_unlock_nested_bh(&softnet_data.process_queue_bh_lock); backlog_lock_irq_disable(sd); if (skb_queue_empty(&sd->input_pkt_queue)) { /* * Inline a custom version of __napi_complete(). * only current cpu owns and manipulates this napi, * and NAPI_STATE_SCHED is the only possible flag set * on backlog. * We can use a plain write instead of clear_bit(), * and we dont need an smp_mb() memory barrier. 
*/ napi->state &= NAPIF_STATE_THREADED; again = false; } else { local_lock_nested_bh(&softnet_data.process_queue_bh_lock); skb_queue_splice_tail_init(&sd->input_pkt_queue, &sd->process_queue); local_unlock_nested_bh(&softnet_data.process_queue_bh_lock); } backlog_unlock_irq_enable(sd); } if (work) rps_input_queue_head_add(sd, work); return work; } /** * __napi_schedule - schedule for receive * @n: entry to schedule * * The entry's receive function will be scheduled to run. * Consider using __napi_schedule_irqoff() if hard irqs are masked. */ void __napi_schedule(struct napi_struct *n) { unsigned long flags; local_irq_save(flags); ____napi_schedule(this_cpu_ptr(&softnet_data), n); local_irq_restore(flags); } EXPORT_SYMBOL(__napi_schedule); /** * napi_schedule_prep - check if napi can be scheduled * @n: napi context * * Test if NAPI routine is already running, and if not mark * it as running. This is used as a condition variable to * insure only one NAPI poll instance runs. We also make * sure there is no pending NAPI disable. */ bool napi_schedule_prep(struct napi_struct *n) { unsigned long new, val = READ_ONCE(n->state); do { if (unlikely(val & NAPIF_STATE_DISABLE)) return false; new = val | NAPIF_STATE_SCHED; /* Sets STATE_MISSED bit if STATE_SCHED was already set * This was suggested by Alexander Duyck, as compiler * emits better code than : * if (val & NAPIF_STATE_SCHED) * new |= NAPIF_STATE_MISSED; */ new |= (val & NAPIF_STATE_SCHED) / NAPIF_STATE_SCHED * NAPIF_STATE_MISSED; } while (!try_cmpxchg(&n->state, &val, new)); return !(val & NAPIF_STATE_SCHED); } EXPORT_SYMBOL(napi_schedule_prep); /** * __napi_schedule_irqoff - schedule for receive * @n: entry to schedule * * Variant of __napi_schedule() assuming hard irqs are masked. * * On PREEMPT_RT enabled kernels this maps to __napi_schedule() * because the interrupt disabled assumption might not be true * due to force-threaded interrupts and spinlock substitution. */ void __napi_schedule_irqoff(struct napi_struct *n) { if (!IS_ENABLED(CONFIG_PREEMPT_RT)) ____napi_schedule(this_cpu_ptr(&softnet_data), n); else __napi_schedule(n); } EXPORT_SYMBOL(__napi_schedule_irqoff); bool napi_complete_done(struct napi_struct *n, int work_done) { unsigned long flags, val, new, timeout = 0; bool ret = true; /* * 1) Don't let napi dequeue from the cpu poll list * just in case its running on a different cpu. * 2) If we are busy polling, do nothing here, we have * the guarantee we will be called later. 
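A note on the branchless pattern used in napi_schedule_prep() above and again below in this function: (val & NAPIF_STATE_SCHED) / NAPIF_STATE_SCHED evaluates to 1 when the bit is set and 0 otherwise, so multiplying by NAPIF_STATE_MISSED sets MISSED conditionally without a branch; napi_complete_done() applies the same trick in reverse, turning a set MISSED bit into a retained SCHED bit.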
*/ if (unlikely(n->state & (NAPIF_STATE_NPSVC | NAPIF_STATE_IN_BUSY_POLL))) return false; if (work_done) { if (n->gro_bitmask) timeout = napi_get_gro_flush_timeout(n); n->defer_hard_irqs_count = napi_get_defer_hard_irqs(n); } if (n->defer_hard_irqs_count > 0) { n->defer_hard_irqs_count--; timeout = napi_get_gro_flush_timeout(n); if (timeout) ret = false; } if (n->gro_bitmask) { /* When the NAPI instance uses a timeout and keeps postponing * it, we need to bound somehow the time packets are kept in * the GRO layer */ napi_gro_flush(n, !!timeout); } gro_normal_list(n); if (unlikely(!list_empty(&n->poll_list))) { /* If n->poll_list is not empty, we need to mask irqs */ local_irq_save(flags); list_del_init(&n->poll_list); local_irq_restore(flags); } WRITE_ONCE(n->list_owner, -1); val = READ_ONCE(n->state); do { WARN_ON_ONCE(!(val & NAPIF_STATE_SCHED)); new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED | NAPIF_STATE_SCHED_THREADED | NAPIF_STATE_PREFER_BUSY_POLL); /* If STATE_MISSED was set, leave STATE_SCHED set, * because we will call napi->poll() one more time. * This C code was suggested by Alexander Duyck to help gcc. */ new |= (val & NAPIF_STATE_MISSED) / NAPIF_STATE_MISSED * NAPIF_STATE_SCHED; } while (!try_cmpxchg(&n->state, &val, new)); if (unlikely(val & NAPIF_STATE_MISSED)) { __napi_schedule(n); return false; } if (timeout) hrtimer_start(&n->timer, ns_to_ktime(timeout), HRTIMER_MODE_REL_PINNED); return ret; } EXPORT_SYMBOL(napi_complete_done); static void skb_defer_free_flush(struct softnet_data *sd) { struct sk_buff *skb, *next; /* Paired with WRITE_ONCE() in skb_attempt_defer_free() */ if (!READ_ONCE(sd->defer_list)) return; spin_lock(&sd->defer_lock); skb = sd->defer_list; sd->defer_list = NULL; sd->defer_count = 0; spin_unlock(&sd->defer_lock); while (skb != NULL) { next = skb->next; napi_consume_skb(skb, 1); skb = next; } } #if defined(CONFIG_NET_RX_BUSY_POLL) static void __busy_poll_stop(struct napi_struct *napi, bool skip_schedule) { if (!skip_schedule) { gro_normal_list(napi); __napi_schedule(napi); return; } if (napi->gro_bitmask) { /* flush too old packets * If HZ < 1000, flush all packets. */ napi_gro_flush(napi, HZ >= 1000); } gro_normal_list(napi); clear_bit(NAPI_STATE_SCHED, &napi->state); } enum { NAPI_F_PREFER_BUSY_POLL = 1, NAPI_F_END_ON_RESCHED = 2, }; static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock, unsigned flags, u16 budget) { struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx; bool skip_schedule = false; unsigned long timeout; int rc; /* Busy polling means there is a high chance device driver hard irq * could not grab NAPI_STATE_SCHED, and that NAPI_STATE_MISSED was * set in napi_schedule_prep(). * Since we are about to call napi->poll() once more, we can safely * clear NAPI_STATE_MISSED. * * Note: x86 could use a single "lock and ..." instruction * to perform these two clear_bit() */ clear_bit(NAPI_STATE_MISSED, &napi->state); clear_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state); local_bh_disable(); bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx); if (flags & NAPI_F_PREFER_BUSY_POLL) { napi->defer_hard_irqs_count = napi_get_defer_hard_irqs(napi); timeout = napi_get_gro_flush_timeout(napi); if (napi->defer_hard_irqs_count && timeout) { hrtimer_start(&napi->timer, ns_to_ktime(timeout), HRTIMER_MODE_REL_PINNED); skip_schedule = true; } } /* All we really want here is to re-enable device interrupts. * Ideally, a new ndo_busy_poll_stop() could avoid another round. 
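As a hedged aside, this path is normally reached from the socket layer: an application opts in per socket, e.g. int usec = 50; setsockopt(fd, SOL_SOCKET, SO_BUSY_POLL, &usec, sizeof(usec)); which lets blocking receives spin in the __napi_busy_loop() below for up to that many microseconds before sleeping.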
*/ rc = napi->poll(napi, budget); /* We can't gro_normal_list() here, because napi->poll() might have * rearmed the napi (napi_complete_done()) in which case it could * already be running on another CPU. */ trace_napi_poll(napi, rc, budget); netpoll_poll_unlock(have_poll_lock); if (rc == budget) __busy_poll_stop(napi, skip_schedule); bpf_net_ctx_clear(bpf_net_ctx); local_bh_enable(); } static void __napi_busy_loop(unsigned int napi_id, bool (*loop_end)(void *, unsigned long), void *loop_end_arg, unsigned flags, u16 budget) { unsigned long start_time = loop_end ? busy_loop_current_time() : 0; int (*napi_poll)(struct napi_struct *napi, int budget); struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx; void *have_poll_lock = NULL; struct napi_struct *napi; WARN_ON_ONCE(!rcu_read_lock_held()); restart: napi_poll = NULL; napi = napi_by_id(napi_id); if (!napi) return; if (!IS_ENABLED(CONFIG_PREEMPT_RT)) preempt_disable(); for (;;) { int work = 0; local_bh_disable(); bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx); if (!napi_poll) { unsigned long val = READ_ONCE(napi->state); /* If multiple threads are competing for this napi, * we avoid dirtying napi->state as much as we can. */ if (val & (NAPIF_STATE_DISABLE | NAPIF_STATE_SCHED | NAPIF_STATE_IN_BUSY_POLL)) { if (flags & NAPI_F_PREFER_BUSY_POLL) set_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state); goto count; } if (cmpxchg(&napi->state, val, val | NAPIF_STATE_IN_BUSY_POLL | NAPIF_STATE_SCHED) != val) { if (flags & NAPI_F_PREFER_BUSY_POLL) set_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state); goto count; } have_poll_lock = netpoll_poll_lock(napi); napi_poll = napi->poll; } work = napi_poll(napi, budget); trace_napi_poll(napi, work, budget); gro_normal_list(napi); count: if (work > 0) __NET_ADD_STATS(dev_net(napi->dev), LINUX_MIB_BUSYPOLLRXPACKETS, work); skb_defer_free_flush(this_cpu_ptr(&softnet_data)); bpf_net_ctx_clear(bpf_net_ctx); local_bh_enable(); if (!loop_end || loop_end(loop_end_arg, start_time)) break; if (unlikely(need_resched())) { if (flags & NAPI_F_END_ON_RESCHED) break; if (napi_poll) busy_poll_stop(napi, have_poll_lock, flags, budget); if (!IS_ENABLED(CONFIG_PREEMPT_RT)) preempt_enable(); rcu_read_unlock(); cond_resched(); rcu_read_lock(); if (loop_end(loop_end_arg, start_time)) return; goto restart; } cpu_relax(); } if (napi_poll) busy_poll_stop(napi, have_poll_lock, flags, budget); if (!IS_ENABLED(CONFIG_PREEMPT_RT)) preempt_enable(); } void napi_busy_loop_rcu(unsigned int napi_id, bool (*loop_end)(void *, unsigned long), void *loop_end_arg, bool prefer_busy_poll, u16 budget) { unsigned flags = NAPI_F_END_ON_RESCHED; if (prefer_busy_poll) flags |= NAPI_F_PREFER_BUSY_POLL; __napi_busy_loop(napi_id, loop_end, loop_end_arg, flags, budget); } void napi_busy_loop(unsigned int napi_id, bool (*loop_end)(void *, unsigned long), void *loop_end_arg, bool prefer_busy_poll, u16 budget) { unsigned flags = prefer_busy_poll ? 
NAPI_F_PREFER_BUSY_POLL : 0; rcu_read_lock(); __napi_busy_loop(napi_id, loop_end, loop_end_arg, flags, budget); rcu_read_unlock(); } EXPORT_SYMBOL(napi_busy_loop); void napi_suspend_irqs(unsigned int napi_id) { struct napi_struct *napi; rcu_read_lock(); napi = napi_by_id(napi_id); if (napi) { unsigned long timeout = napi_get_irq_suspend_timeout(napi); if (timeout) hrtimer_start(&napi->timer, ns_to_ktime(timeout), HRTIMER_MODE_REL_PINNED); } rcu_read_unlock(); } void napi_resume_irqs(unsigned int napi_id) { struct napi_struct *napi; rcu_read_lock(); napi = napi_by_id(napi_id); if (napi) { /* If irq_suspend_timeout is set to 0 between the call to * napi_suspend_irqs and now, the original value still * determines the safety timeout as intended and napi_watchdog * will resume irq processing. */ if (napi_get_irq_suspend_timeout(napi)) { local_bh_disable(); napi_schedule(napi); local_bh_enable(); } } rcu_read_unlock(); } #endif /* CONFIG_NET_RX_BUSY_POLL */ static void __napi_hash_add_with_id(struct napi_struct *napi, unsigned int napi_id) { WRITE_ONCE(napi->napi_id, napi_id); hlist_add_head_rcu(&napi->napi_hash_node, &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]); } static void napi_hash_add_with_id(struct napi_struct *napi, unsigned int napi_id) { unsigned long flags; spin_lock_irqsave(&napi_hash_lock, flags); WARN_ON_ONCE(napi_by_id(napi_id)); __napi_hash_add_with_id(napi, napi_id); spin_unlock_irqrestore(&napi_hash_lock, flags); } static void napi_hash_add(struct napi_struct *napi) { unsigned long flags; if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state)) return; spin_lock_irqsave(&napi_hash_lock, flags); /* 0..NR_CPUS range is reserved for sender_cpu use */ do { if (unlikely(++napi_gen_id < MIN_NAPI_ID)) napi_gen_id = MIN_NAPI_ID; } while (napi_by_id(napi_gen_id)); __napi_hash_add_with_id(napi, napi_gen_id); spin_unlock_irqrestore(&napi_hash_lock, flags); } /* Warning : caller is responsible to make sure rcu grace period * is respected before freeing memory containing @napi */ static void napi_hash_del(struct napi_struct *napi) { unsigned long flags; spin_lock_irqsave(&napi_hash_lock, flags); hlist_del_init_rcu(&napi->napi_hash_node); spin_unlock_irqrestore(&napi_hash_lock, flags); } static enum hrtimer_restart napi_watchdog(struct hrtimer *timer) { struct napi_struct *napi; napi = container_of(timer, struct napi_struct, timer); /* Note : we use a relaxed variant of napi_schedule_prep() not setting * NAPI_STATE_MISSED, since we do not react to a device IRQ. */ if (!napi_disable_pending(napi) && !test_and_set_bit(NAPI_STATE_SCHED, &napi->state)) { clear_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state); __napi_schedule_irqoff(napi); } return HRTIMER_NORESTART; } static void init_gro_hash(struct napi_struct *napi) { int i; for (i = 0; i < GRO_HASH_BUCKETS; i++) { INIT_LIST_HEAD(&napi->gro_hash[i].list); napi->gro_hash[i].count = 0; } napi->gro_bitmask = 0; } int dev_set_threaded(struct net_device *dev, bool threaded) { struct napi_struct *napi; int err = 0; netdev_assert_locked_or_invisible(dev); if (dev->threaded == threaded) return 0; if (threaded) { list_for_each_entry(napi, &dev->napi_list, dev_list) { if (!napi->thread) { err = napi_kthread_create(napi); if (err) { threaded = false; break; } } } } WRITE_ONCE(dev->threaded, threaded); /* Make sure kthread is created before THREADED bit * is set. */ smp_mb__before_atomic(); /* Setting/unsetting threaded mode on a napi might not immediately * take effect, if the current napi instance is actively being * polled. 
In this case, the switch between threaded mode and * softirq mode will happen in the next round of napi_schedule(). * This should not cause hiccups/stalls to the live traffic. */ list_for_each_entry(napi, &dev->napi_list, dev_list) assign_bit(NAPI_STATE_THREADED, &napi->state, threaded); return err; } EXPORT_SYMBOL(dev_set_threaded); /** * netif_queue_set_napi - Associate queue with the napi * @dev: device to which NAPI and queue belong * @queue_index: Index of queue * @type: queue type as RX or TX * @napi: NAPI context, pass NULL to clear previously set NAPI * * Set queue with its corresponding napi context. This should be done after * registering the NAPI handler for the queue-vector and the queues have been * mapped to the corresponding interrupt vector. */ void netif_queue_set_napi(struct net_device *dev, unsigned int queue_index, enum netdev_queue_type type, struct napi_struct *napi) { struct netdev_rx_queue *rxq; struct netdev_queue *txq; if (WARN_ON_ONCE(napi && !napi->dev)) return; if (dev->reg_state >= NETREG_REGISTERED) ASSERT_RTNL(); switch (type) { case NETDEV_QUEUE_TYPE_RX: rxq = __netif_get_rx_queue(dev, queue_index); rxq->napi = napi; return; case NETDEV_QUEUE_TYPE_TX: txq = netdev_get_tx_queue(dev, queue_index); txq->napi = napi; return; default: return; } } EXPORT_SYMBOL(netif_queue_set_napi); static void napi_restore_config(struct napi_struct *n) { n->defer_hard_irqs = n->config->defer_hard_irqs; n->gro_flush_timeout = n->config->gro_flush_timeout; n->irq_suspend_timeout = n->config->irq_suspend_timeout; /* a NAPI ID might be stored in the config, if so use it. if not, use * napi_hash_add to generate one for us. */ if (n->config->napi_id) { napi_hash_add_with_id(n, n->config->napi_id); } else { napi_hash_add(n); n->config->napi_id = n->napi_id; } } static void napi_save_config(struct napi_struct *n) { n->config->defer_hard_irqs = n->defer_hard_irqs; n->config->gro_flush_timeout = n->gro_flush_timeout; n->config->irq_suspend_timeout = n->irq_suspend_timeout; napi_hash_del(n); } /* Netlink wants the NAPI list to be sorted by ID, if adding a NAPI which will * inherit an existing ID try to insert it at the right position. 
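 * A NAPI that has no preassigned config ID is treated as UINT_MAX below,
 * so it is inserted at the front of the list; walking dev->napi_list from
 * the head therefore sees IDs in descending order.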
*/ static void netif_napi_dev_list_add(struct net_device *dev, struct napi_struct *napi) { unsigned int new_id, pos_id; struct list_head *higher; struct napi_struct *pos; new_id = UINT_MAX; if (napi->config && napi->config->napi_id) new_id = napi->config->napi_id; higher = &dev->napi_list; list_for_each_entry(pos, &dev->napi_list, dev_list) { if (pos->napi_id >= MIN_NAPI_ID) pos_id = pos->napi_id; else if (pos->config) pos_id = pos->config->napi_id; else pos_id = UINT_MAX; if (pos_id <= new_id) break; higher = &pos->dev_list; } list_add_rcu(&napi->dev_list, higher); /* adds after higher */ } void netif_napi_add_weight_locked(struct net_device *dev, struct napi_struct *napi, int (*poll)(struct napi_struct *, int), int weight) { netdev_assert_locked(dev); if (WARN_ON(test_and_set_bit(NAPI_STATE_LISTED, &napi->state))) return; INIT_LIST_HEAD(&napi->poll_list); INIT_HLIST_NODE(&napi->napi_hash_node); hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED); napi->timer.function = napi_watchdog; init_gro_hash(napi); napi->skb = NULL; INIT_LIST_HEAD(&napi->rx_list); napi->rx_count = 0; napi->poll = poll; if (weight > NAPI_POLL_WEIGHT) netdev_err_once(dev, "%s() called with weight %d\n", __func__, weight); napi->weight = weight; napi->dev = dev; #ifdef CONFIG_NETPOLL napi->poll_owner = -1; #endif napi->list_owner = -1; set_bit(NAPI_STATE_SCHED, &napi->state); set_bit(NAPI_STATE_NPSVC, &napi->state); netif_napi_dev_list_add(dev, napi); /* default settings from sysfs are applied to all NAPIs. any per-NAPI * configuration will be loaded in napi_enable */ napi_set_defer_hard_irqs(napi, READ_ONCE(dev->napi_defer_hard_irqs)); napi_set_gro_flush_timeout(napi, READ_ONCE(dev->gro_flush_timeout)); napi_get_frags_check(napi); /* Create kthread for this napi if dev->threaded is set. * Clear dev->threaded if kthread creation failed so that * threaded mode will not be enabled in napi_enable(). */ if (dev->threaded && napi_kthread_create(napi)) dev->threaded = false; netif_napi_set_irq_locked(napi, -1); } EXPORT_SYMBOL(netif_napi_add_weight_locked); void napi_disable_locked(struct napi_struct *n) { unsigned long val, new; might_sleep(); netdev_assert_locked(n->dev); set_bit(NAPI_STATE_DISABLE, &n->state); val = READ_ONCE(n->state); do { while (val & (NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC)) { usleep_range(20, 200); val = READ_ONCE(n->state); } new = val | NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC; new &= ~(NAPIF_STATE_THREADED | NAPIF_STATE_PREFER_BUSY_POLL); } while (!try_cmpxchg(&n->state, &val, new)); hrtimer_cancel(&n->timer); if (n->config) napi_save_config(n); else napi_hash_del(n); clear_bit(NAPI_STATE_DISABLE, &n->state); } EXPORT_SYMBOL(napi_disable_locked); /** * napi_disable() - prevent NAPI from scheduling * @n: NAPI context * * Stop NAPI from being scheduled on this context. * Waits till any outstanding processing completes. * Takes netdev_lock() for associated net_device. 
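 *
 * Illustrative pairing in a driver teardown path (priv is a hypothetical
 * driver-private structure holding the NAPI instance):
 *
 *	napi_disable(&priv->napi);
 *	netif_napi_del(&priv->napi);
 *
 * Because napi_disable() only returns once any in-flight poll has
 * finished, the instance can then be deleted or reconfigured safely.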
*/ void napi_disable(struct napi_struct *n) { netdev_lock(n->dev); napi_disable_locked(n); netdev_unlock(n->dev); } EXPORT_SYMBOL(napi_disable); void napi_enable_locked(struct napi_struct *n) { unsigned long new, val = READ_ONCE(n->state); if (n->config) napi_restore_config(n); else napi_hash_add(n); do { BUG_ON(!test_bit(NAPI_STATE_SCHED, &val)); new = val & ~(NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC); if (n->dev->threaded && n->thread) new |= NAPIF_STATE_THREADED; } while (!try_cmpxchg(&n->state, &val, new)); } EXPORT_SYMBOL(napi_enable_locked); /** * napi_enable() - enable NAPI scheduling * @n: NAPI context * * Enable scheduling of a NAPI instance. * Must be paired with napi_disable(). * Takes netdev_lock() for associated net_device. */ void napi_enable(struct napi_struct *n) { netdev_lock(n->dev); napi_enable_locked(n); netdev_unlock(n->dev); } EXPORT_SYMBOL(napi_enable); static void flush_gro_hash(struct napi_struct *napi) { int i; for (i = 0; i < GRO_HASH_BUCKETS; i++) { struct sk_buff *skb, *n; list_for_each_entry_safe(skb, n, &napi->gro_hash[i].list, list) kfree_skb(skb); napi->gro_hash[i].count = 0; } } /* Must be called in process context */ void __netif_napi_del_locked(struct napi_struct *napi) { netdev_assert_locked(napi->dev); if (!test_and_clear_bit(NAPI_STATE_LISTED, &napi->state)) return; if (napi->config) { napi->index = -1; napi->config = NULL; } list_del_rcu(&napi->dev_list); napi_free_frags(napi); flush_gro_hash(napi); napi->gro_bitmask = 0; if (napi->thread) { kthread_stop(napi->thread); napi->thread = NULL; } } EXPORT_SYMBOL(__netif_napi_del_locked); static int __napi_poll(struct napi_struct *n, bool *repoll) { int work, weight; weight = n->weight; /* This NAPI_STATE_SCHED test is for avoiding a race * with netpoll's poll_napi(). Only the entity which * obtains the lock and sees NAPI_STATE_SCHED set will * actually make the ->poll() call. Therefore we avoid * accidentally calling ->poll() when NAPI is not scheduled. */ work = 0; if (napi_is_scheduled(n)) { work = n->poll(n, weight); trace_napi_poll(n, work, weight); xdp_do_check_flushed(n); } if (unlikely(work > weight)) netdev_err_once(n->dev, "NAPI poll function %pS returned %d, exceeding its budget of %d.\n", n->poll, work, weight); if (likely(work < weight)) return work; /* Drivers must not modify the NAPI state if they * consume the entire weight. In such cases this code * still "owns" the NAPI instance and therefore can * move the instance around on the list at-will. */ if (unlikely(napi_disable_pending(n))) { napi_complete(n); return work; } /* The NAPI context has more processing work, but busy-polling * is preferred. Exit early. */ if (napi_prefer_busy_poll(n)) { if (napi_complete_done(n, work)) { /* If timeout is not set, we need to make sure * that the NAPI is re-scheduled. */ napi_schedule(n); } return work; } if (n->gro_bitmask) { /* flush too old packets * If HZ < 1000, flush all packets. */ napi_gro_flush(n, HZ >= 1000); } gro_normal_list(n); /* Some drivers may have called napi_schedule * prior to exhausting their budget. */ if (unlikely(!list_empty(&n->poll_list))) { pr_warn_once("%s: Budget exhausted after napi rescheduled\n", n->dev ? 
n->dev->name : "backlog"); return work; } *repoll = true; return work; } static int napi_poll(struct napi_struct *n, struct list_head *repoll) { bool do_repoll = false; void *have; int work; list_del_init(&n->poll_list); have = netpoll_poll_lock(n); work = __napi_poll(n, &do_repoll); if (do_repoll) list_add_tail(&n->poll_list, repoll); netpoll_poll_unlock(have); return work; } static int napi_thread_wait(struct napi_struct *napi) { set_current_state(TASK_INTERRUPTIBLE); while (!kthread_should_stop()) { /* Testing SCHED_THREADED bit here to make sure the current * kthread owns this napi and could poll on this napi. * Testing SCHED bit is not enough because SCHED bit might be * set by some other busy poll thread or by napi_disable(). */ if (test_bit(NAPI_STATE_SCHED_THREADED, &napi->state)) { WARN_ON(!list_empty(&napi->poll_list)); __set_current_state(TASK_RUNNING); return 0; } schedule(); set_current_state(TASK_INTERRUPTIBLE); } __set_current_state(TASK_RUNNING); return -1; } static void napi_threaded_poll_loop(struct napi_struct *napi) { struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx; struct softnet_data *sd; unsigned long last_qs = jiffies; for (;;) { bool repoll = false; void *have; local_bh_disable(); bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx); sd = this_cpu_ptr(&softnet_data); sd->in_napi_threaded_poll = true; have = netpoll_poll_lock(napi); __napi_poll(napi, &repoll); netpoll_poll_unlock(have); sd->in_napi_threaded_poll = false; barrier(); if (sd_has_rps_ipi_waiting(sd)) { local_irq_disable(); net_rps_action_and_irq_enable(sd); } skb_defer_free_flush(sd); bpf_net_ctx_clear(bpf_net_ctx); local_bh_enable(); if (!repoll) break; rcu_softirq_qs_periodic(last_qs); cond_resched(); } } static int napi_threaded_poll(void *data) { struct napi_struct *napi = data; while (!napi_thread_wait(napi)) napi_threaded_poll_loop(napi); return 0; } static __latent_entropy void net_rx_action(void) { struct softnet_data *sd = this_cpu_ptr(&softnet_data); unsigned long time_limit = jiffies + usecs_to_jiffies(READ_ONCE(net_hotdata.netdev_budget_usecs)); struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx; int budget = READ_ONCE(net_hotdata.netdev_budget); LIST_HEAD(list); LIST_HEAD(repoll); bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx); start: sd->in_net_rx_action = true; local_irq_disable(); list_splice_init(&sd->poll_list, &list); local_irq_enable(); for (;;) { struct napi_struct *n; skb_defer_free_flush(sd); if (list_empty(&list)) { if (list_empty(&repoll)) { sd->in_net_rx_action = false; barrier(); /* We need to check if ____napi_schedule() * had refilled poll_list while * sd->in_net_rx_action was true. */ if (!list_empty(&sd->poll_list)) goto start; if (!sd_has_rps_ipi_waiting(sd)) goto end; } break; } n = list_first_entry(&list, struct napi_struct, poll_list); budget -= napi_poll(n, &repoll); /* If softirq window is exhausted then punt. * Allow this to run for 2 jiffies since which will allow * an average latency of 1.5/HZ. 
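 * In practice the window is bounded by the two values read at the top of
 * net_rx_action(): the packet budget (net.core.netdev_budget) and the
 * time limit derived from net.core.netdev_budget_usecs, whose default
 * corresponds to roughly two jiffies.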
*/ if (unlikely(budget <= 0 || time_after_eq(jiffies, time_limit))) { sd->time_squeeze++; break; } } local_irq_disable(); list_splice_tail_init(&sd->poll_list, &list); list_splice_tail(&repoll, &list); list_splice(&list, &sd->poll_list); if (!list_empty(&sd->poll_list)) __raise_softirq_irqoff(NET_RX_SOFTIRQ); else sd->in_net_rx_action = false; net_rps_action_and_irq_enable(sd); end: bpf_net_ctx_clear(bpf_net_ctx); } struct netdev_adjacent { struct net_device *dev; netdevice_tracker dev_tracker; /* upper master flag, there can only be one master device per list */ bool master; /* lookup ignore flag */ bool ignore; /* counter for the number of times this device was added to us */ u16 ref_nr; /* private field for the users */ void *private; struct list_head list; struct rcu_head rcu; }; static struct netdev_adjacent *__netdev_find_adj(struct net_device *adj_dev, struct list_head *adj_list) { struct netdev_adjacent *adj; list_for_each_entry(adj, adj_list, list) { if (adj->dev == adj_dev) return adj; } return NULL; } static int ____netdev_has_upper_dev(struct net_device *upper_dev, struct netdev_nested_priv *priv) { struct net_device *dev = (struct net_device *)priv->data; return upper_dev == dev; } /** * netdev_has_upper_dev - Check if device is linked to an upper device * @dev: device * @upper_dev: upper device to check * * Find out if a device is linked to specified upper device and return true * in case it is. Note that this checks only immediate upper device, * not through a complete stack of devices. The caller must hold the RTNL lock. */ bool netdev_has_upper_dev(struct net_device *dev, struct net_device *upper_dev) { struct netdev_nested_priv priv = { .data = (void *)upper_dev, }; ASSERT_RTNL(); return netdev_walk_all_upper_dev_rcu(dev, ____netdev_has_upper_dev, &priv); } EXPORT_SYMBOL(netdev_has_upper_dev); /** * netdev_has_upper_dev_all_rcu - Check if device is linked to an upper device * @dev: device * @upper_dev: upper device to check * * Find out if a device is linked to specified upper device and return true * in case it is. Note that this checks the entire upper device chain. * The caller must hold rcu lock. */ bool netdev_has_upper_dev_all_rcu(struct net_device *dev, struct net_device *upper_dev) { struct netdev_nested_priv priv = { .data = (void *)upper_dev, }; return !!netdev_walk_all_upper_dev_rcu(dev, ____netdev_has_upper_dev, &priv); } EXPORT_SYMBOL(netdev_has_upper_dev_all_rcu); /** * netdev_has_any_upper_dev - Check if device is linked to some device * @dev: device * * Find out if a device is linked to an upper device and return true in case * it is. The caller must hold the RTNL lock. */ bool netdev_has_any_upper_dev(struct net_device *dev) { ASSERT_RTNL(); return !list_empty(&dev->adj_list.upper); } EXPORT_SYMBOL(netdev_has_any_upper_dev); /** * netdev_master_upper_dev_get - Get master upper device * @dev: device * * Find a master upper device and return pointer to it or NULL in case * it's not there. The caller must hold the RTNL lock. 
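 *
 * Illustrative use (slave_dev is a hypothetical lower device):
 *
 *	ASSERT_RTNL();
 *	master = netdev_master_upper_dev_get(slave_dev);
 *	if (master)
 *		netdev_info(slave_dev, "controlled by %s\n", master->name);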
*/ struct net_device *netdev_master_upper_dev_get(struct net_device *dev) { struct netdev_adjacent *upper; ASSERT_RTNL(); if (list_empty(&dev->adj_list.upper)) return NULL; upper = list_first_entry(&dev->adj_list.upper, struct netdev_adjacent, list); if (likely(upper->master)) return upper->dev; return NULL; } EXPORT_SYMBOL(netdev_master_upper_dev_get); static struct net_device *__netdev_master_upper_dev_get(struct net_device *dev) { struct netdev_adjacent *upper; ASSERT_RTNL(); if (list_empty(&dev->adj_list.upper)) return NULL; upper = list_first_entry(&dev->adj_list.upper, struct netdev_adjacent, list); if (likely(upper->master) && !upper->ignore) return upper->dev; return NULL; } /** * netdev_has_any_lower_dev - Check if device is linked to some device * @dev: device * * Find out if a device is linked to a lower device and return true in case * it is. The caller must hold the RTNL lock. */ static bool netdev_has_any_lower_dev(struct net_device *dev) { ASSERT_RTNL(); return !list_empty(&dev->adj_list.lower); } void *netdev_adjacent_get_private(struct list_head *adj_list) { struct netdev_adjacent *adj; adj = list_entry(adj_list, struct netdev_adjacent, list); return adj->private; } EXPORT_SYMBOL(netdev_adjacent_get_private); /** * netdev_upper_get_next_dev_rcu - Get the next dev from upper list * @dev: device * @iter: list_head ** of the current position * * Gets the next device from the dev's upper list, starting from iter * position. The caller must hold RCU read lock. */ struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev, struct list_head **iter) { struct netdev_adjacent *upper; WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held()); upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list); if (&upper->list == &dev->adj_list.upper) return NULL; *iter = &upper->list; return upper->dev; } EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu); static struct net_device *__netdev_next_upper_dev(struct net_device *dev, struct list_head **iter, bool *ignore) { struct netdev_adjacent *upper; upper = list_entry((*iter)->next, struct netdev_adjacent, list); if (&upper->list == &dev->adj_list.upper) return NULL; *iter = &upper->list; *ignore = upper->ignore; return upper->dev; } static struct net_device *netdev_next_upper_dev_rcu(struct net_device *dev, struct list_head **iter) { struct netdev_adjacent *upper; WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held()); upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list); if (&upper->list == &dev->adj_list.upper) return NULL; *iter = &upper->list; return upper->dev; } static int __netdev_walk_all_upper_dev(struct net_device *dev, int (*fn)(struct net_device *dev, struct netdev_nested_priv *priv), struct netdev_nested_priv *priv) { struct net_device *udev, *next, *now, *dev_stack[MAX_NEST_DEV + 1]; struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1]; int ret, cur = 0; bool ignore; now = dev; iter = &dev->adj_list.upper; while (1) { if (now != dev) { ret = fn(now, priv); if (ret) return ret; } next = NULL; while (1) { udev = __netdev_next_upper_dev(now, &iter, &ignore); if (!udev) break; if (ignore) continue; next = udev; niter = &udev->adj_list.upper; dev_stack[cur] = now; iter_stack[cur++] = iter; break; } if (!next) { if (!cur) return 0; next = dev_stack[--cur]; niter = iter_stack[cur]; } now = next; iter = niter; } return 0; } int netdev_walk_all_upper_dev_rcu(struct net_device *dev, int (*fn)(struct net_device *dev, struct netdev_nested_priv *priv), struct netdev_nested_priv *priv) { 
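	/* Iterative depth-first walk of the upper-device graph: dev_stack[]
	 * and iter_stack[] form an explicit recursion stack bounded by
	 * MAX_NEST_DEV, fn() is invoked for every upper device except dev
	 * itself, and a non-zero return from fn() aborts the walk.
	 */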
struct net_device *udev, *next, *now, *dev_stack[MAX_NEST_DEV + 1]; struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1]; int ret, cur = 0; now = dev; iter = &dev->adj_list.upper; while (1) { if (now != dev) { ret = fn(now, priv); if (ret) return ret; } next = NULL; while (1) { udev = netdev_next_upper_dev_rcu(now, &iter); if (!udev) break; next = udev; niter = &udev->adj_list.upper; dev_stack[cur] = now; iter_stack[cur++] = iter; break; } if (!next) { if (!cur) return 0; next = dev_stack[--cur]; niter = iter_stack[cur]; } now = next; iter = niter; } return 0; } EXPORT_SYMBOL_GPL(netdev_walk_all_upper_dev_rcu); static bool __netdev_has_upper_dev(struct net_device *dev, struct net_device *upper_dev) { struct netdev_nested_priv priv = { .flags = 0, .data = (void *)upper_dev, }; ASSERT_RTNL(); return __netdev_walk_all_upper_dev(dev, ____netdev_has_upper_dev, &priv); } /** * netdev_lower_get_next_private - Get the next ->private from the * lower neighbour list * @dev: device * @iter: list_head ** of the current position * * Gets the next netdev_adjacent->private from the dev's lower neighbour * list, starting from iter position. The caller must hold either hold the * RTNL lock or its own locking that guarantees that the neighbour lower * list will remain unchanged. */ void *netdev_lower_get_next_private(struct net_device *dev, struct list_head **iter) { struct netdev_adjacent *lower; lower = list_entry(*iter, struct netdev_adjacent, list); if (&lower->list == &dev->adj_list.lower) return NULL; *iter = lower->list.next; return lower->private; } EXPORT_SYMBOL(netdev_lower_get_next_private); /** * netdev_lower_get_next_private_rcu - Get the next ->private from the * lower neighbour list, RCU * variant * @dev: device * @iter: list_head ** of the current position * * Gets the next netdev_adjacent->private from the dev's lower neighbour * list, starting from iter position. The caller must hold RCU read lock. */ void *netdev_lower_get_next_private_rcu(struct net_device *dev, struct list_head **iter) { struct netdev_adjacent *lower; WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_bh_held()); lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list); if (&lower->list == &dev->adj_list.lower) return NULL; *iter = &lower->list; return lower->private; } EXPORT_SYMBOL(netdev_lower_get_next_private_rcu); /** * netdev_lower_get_next - Get the next device from the lower neighbour * list * @dev: device * @iter: list_head ** of the current position * * Gets the next netdev_adjacent from the dev's lower neighbour * list, starting from iter position. The caller must hold RTNL lock or * its own locking that guarantees that the neighbour lower * list will remain unchanged. 
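 *
 * Normally used through the netdev_for_each_lower_dev() iterator, e.g.
 * (illustrative):
 *
 *	struct net_device *ldev;
 *	struct list_head *iter;
 *
 *	netdev_for_each_lower_dev(dev, ldev, iter)
 *		netdev_info(dev, "lower device: %s\n", ldev->name);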
*/ void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter) { struct netdev_adjacent *lower; lower = list_entry(*iter, struct netdev_adjacent, list); if (&lower->list == &dev->adj_list.lower) return NULL; *iter = lower->list.next; return lower->dev; } EXPORT_SYMBOL(netdev_lower_get_next); static struct net_device *netdev_next_lower_dev(struct net_device *dev, struct list_head **iter) { struct netdev_adjacent *lower; lower = list_entry((*iter)->next, struct netdev_adjacent, list); if (&lower->list == &dev->adj_list.lower) return NULL; *iter = &lower->list; return lower->dev; } static struct net_device *__netdev_next_lower_dev(struct net_device *dev, struct list_head **iter, bool *ignore) { struct netdev_adjacent *lower; lower = list_entry((*iter)->next, struct netdev_adjacent, list); if (&lower->list == &dev->adj_list.lower) return NULL; *iter = &lower->list; *ignore = lower->ignore; return lower->dev; } int netdev_walk_all_lower_dev(struct net_device *dev, int (*fn)(struct net_device *dev, struct netdev_nested_priv *priv), struct netdev_nested_priv *priv) { struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1]; struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1]; int ret, cur = 0; now = dev; iter = &dev->adj_list.lower; while (1) { if (now != dev) { ret = fn(now, priv); if (ret) return ret; } next = NULL; while (1) { ldev = netdev_next_lower_dev(now, &iter); if (!ldev) break; next = ldev; niter = &ldev->adj_list.lower; dev_stack[cur] = now; iter_stack[cur++] = iter; break; } if (!next) { if (!cur) return 0; next = dev_stack[--cur]; niter = iter_stack[cur]; } now = next; iter = niter; } return 0; } EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev); static int __netdev_walk_all_lower_dev(struct net_device *dev, int (*fn)(struct net_device *dev, struct netdev_nested_priv *priv), struct netdev_nested_priv *priv) { struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1]; struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1]; int ret, cur = 0; bool ignore; now = dev; iter = &dev->adj_list.lower; while (1) { if (now != dev) { ret = fn(now, priv); if (ret) return ret; } next = NULL; while (1) { ldev = __netdev_next_lower_dev(now, &iter, &ignore); if (!ldev) break; if (ignore) continue; next = ldev; niter = &ldev->adj_list.lower; dev_stack[cur] = now; iter_stack[cur++] = iter; break; } if (!next) { if (!cur) return 0; next = dev_stack[--cur]; niter = iter_stack[cur]; } now = next; iter = niter; } return 0; } struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev, struct list_head **iter) { struct netdev_adjacent *lower; lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list); if (&lower->list == &dev->adj_list.lower) return NULL; *iter = &lower->list; return lower->dev; } EXPORT_SYMBOL(netdev_next_lower_dev_rcu); static u8 __netdev_upper_depth(struct net_device *dev) { struct net_device *udev; struct list_head *iter; u8 max_depth = 0; bool ignore; for (iter = &dev->adj_list.upper, udev = __netdev_next_upper_dev(dev, &iter, &ignore); udev; udev = __netdev_next_upper_dev(dev, &iter, &ignore)) { if (ignore) continue; if (max_depth < udev->upper_level) max_depth = udev->upper_level; } return max_depth; } static u8 __netdev_lower_depth(struct net_device *dev) { struct net_device *ldev; struct list_head *iter; u8 max_depth = 0; bool ignore; for (iter = &dev->adj_list.lower, ldev = __netdev_next_lower_dev(dev, &iter, &ignore); ldev; ldev = __netdev_next_lower_dev(dev, &iter, &ignore)) { if (ignore) continue; if (max_depth < 
ldev->lower_level) max_depth = ldev->lower_level; } return max_depth; } static int __netdev_update_upper_level(struct net_device *dev, struct netdev_nested_priv *__unused) { dev->upper_level = __netdev_upper_depth(dev) + 1; return 0; } #ifdef CONFIG_LOCKDEP static LIST_HEAD(net_unlink_list); static void net_unlink_todo(struct net_device *dev) { if (list_empty(&dev->unlink_list)) list_add_tail(&dev->unlink_list, &net_unlink_list); } #endif static int __netdev_update_lower_level(struct net_device *dev, struct netdev_nested_priv *priv) { dev->lower_level = __netdev_lower_depth(dev) + 1; #ifdef CONFIG_LOCKDEP if (!priv) return 0; if (priv->flags & NESTED_SYNC_IMM) dev->nested_level = dev->lower_level - 1; if (priv->flags & NESTED_SYNC_TODO) net_unlink_todo(dev); #endif return 0; } int netdev_walk_all_lower_dev_rcu(struct net_device *dev, int (*fn)(struct net_device *dev, struct netdev_nested_priv *priv), struct netdev_nested_priv *priv) { struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1]; struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1]; int ret, cur = 0; now = dev; iter = &dev->adj_list.lower; while (1) { if (now != dev) { ret = fn(now, priv); if (ret) return ret; } next = NULL; while (1) { ldev = netdev_next_lower_dev_rcu(now, &iter); if (!ldev) break; next = ldev; niter = &ldev->adj_list.lower; dev_stack[cur] = now; iter_stack[cur++] = iter; break; } if (!next) { if (!cur) return 0; next = dev_stack[--cur]; niter = iter_stack[cur]; } now = next; iter = niter; } return 0; } EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev_rcu); /** * netdev_lower_get_first_private_rcu - Get the first ->private from the * lower neighbour list, RCU * variant * @dev: device * * Gets the first netdev_adjacent->private from the dev's lower neighbour * list. The caller must hold RCU read lock. */ void *netdev_lower_get_first_private_rcu(struct net_device *dev) { struct netdev_adjacent *lower; lower = list_first_or_null_rcu(&dev->adj_list.lower, struct netdev_adjacent, list); if (lower) return lower->private; return NULL; } EXPORT_SYMBOL(netdev_lower_get_first_private_rcu); /** * netdev_master_upper_dev_get_rcu - Get master upper device * @dev: device * * Find a master upper device and return pointer to it or NULL in case * it's not there. The caller must hold the RCU read lock. */ struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev) { struct netdev_adjacent *upper; upper = list_first_or_null_rcu(&dev->adj_list.upper, struct netdev_adjacent, list); if (upper && likely(upper->master)) return upper->dev; return NULL; } EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu); static int netdev_adjacent_sysfs_add(struct net_device *dev, struct net_device *adj_dev, struct list_head *dev_list) { char linkname[IFNAMSIZ+7]; sprintf(linkname, dev_list == &dev->adj_list.upper ? "upper_%s" : "lower_%s", adj_dev->name); return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj), linkname); } static void netdev_adjacent_sysfs_del(struct net_device *dev, char *name, struct list_head *dev_list) { char linkname[IFNAMSIZ+7]; sprintf(linkname, dev_list == &dev->adj_list.upper ? 
"upper_%s" : "lower_%s", name); sysfs_remove_link(&(dev->dev.kobj), linkname); } static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev, struct net_device *adj_dev, struct list_head *dev_list) { return (dev_list == &dev->adj_list.upper || dev_list == &dev->adj_list.lower) && net_eq(dev_net(dev), dev_net(adj_dev)); } static int __netdev_adjacent_dev_insert(struct net_device *dev, struct net_device *adj_dev, struct list_head *dev_list, void *private, bool master) { struct netdev_adjacent *adj; int ret; adj = __netdev_find_adj(adj_dev, dev_list); if (adj) { adj->ref_nr += 1; pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d\n", dev->name, adj_dev->name, adj->ref_nr); return 0; } adj = kmalloc(sizeof(*adj), GFP_KERNEL); if (!adj) return -ENOMEM; adj->dev = adj_dev; adj->master = master; adj->ref_nr = 1; adj->private = private; adj->ignore = false; netdev_hold(adj_dev, &adj->dev_tracker, GFP_KERNEL); pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d; dev_hold on %s\n", dev->name, adj_dev->name, adj->ref_nr, adj_dev->name); if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) { ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list); if (ret) goto free_adj; } /* Ensure that master link is always the first item in list. */ if (master) { ret = sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj), "master"); if (ret) goto remove_symlinks; list_add_rcu(&adj->list, dev_list); } else { list_add_tail_rcu(&adj->list, dev_list); } return 0; remove_symlinks: if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list); free_adj: netdev_put(adj_dev, &adj->dev_tracker); kfree(adj); return ret; } static void __netdev_adjacent_dev_remove(struct net_device *dev, struct net_device *adj_dev, u16 ref_nr, struct list_head *dev_list) { struct netdev_adjacent *adj; pr_debug("Remove adjacency: dev %s adj_dev %s ref_nr %d\n", dev->name, adj_dev->name, ref_nr); adj = __netdev_find_adj(adj_dev, dev_list); if (!adj) { pr_err("Adjacency does not exist for device %s from %s\n", dev->name, adj_dev->name); WARN_ON(1); return; } if (adj->ref_nr > ref_nr) { pr_debug("adjacency: %s to %s ref_nr - %d = %d\n", dev->name, adj_dev->name, ref_nr, adj->ref_nr - ref_nr); adj->ref_nr -= ref_nr; return; } if (adj->master) sysfs_remove_link(&(dev->dev.kobj), "master"); if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list); list_del_rcu(&adj->list); pr_debug("adjacency: dev_put for %s, because link removed from %s to %s\n", adj_dev->name, dev->name, adj_dev->name); netdev_put(adj_dev, &adj->dev_tracker); kfree_rcu(adj, rcu); } static int __netdev_adjacent_dev_link_lists(struct net_device *dev, struct net_device *upper_dev, struct list_head *up_list, struct list_head *down_list, void *private, bool master) { int ret; ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list, private, master); if (ret) return ret; ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list, private, false); if (ret) { __netdev_adjacent_dev_remove(dev, upper_dev, 1, up_list); return ret; } return 0; } static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev, struct net_device *upper_dev, u16 ref_nr, struct list_head *up_list, struct list_head *down_list) { __netdev_adjacent_dev_remove(dev, upper_dev, ref_nr, up_list); __netdev_adjacent_dev_remove(upper_dev, dev, ref_nr, down_list); } static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev, struct net_device 
*upper_dev, void *private, bool master) { return __netdev_adjacent_dev_link_lists(dev, upper_dev, &dev->adj_list.upper, &upper_dev->adj_list.lower, private, master); } static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev, struct net_device *upper_dev) { __netdev_adjacent_dev_unlink_lists(dev, upper_dev, 1, &dev->adj_list.upper, &upper_dev->adj_list.lower); } static int __netdev_upper_dev_link(struct net_device *dev, struct net_device *upper_dev, bool master, void *upper_priv, void *upper_info, struct netdev_nested_priv *priv, struct netlink_ext_ack *extack) { struct netdev_notifier_changeupper_info changeupper_info = { .info = { .dev = dev, .extack = extack, }, .upper_dev = upper_dev, .master = master, .linking = true, .upper_info = upper_info, }; struct net_device *master_dev; int ret = 0; ASSERT_RTNL(); if (dev == upper_dev) return -EBUSY; /* To prevent loops, check if dev is not upper device to upper_dev. */ if (__netdev_has_upper_dev(upper_dev, dev)) return -EBUSY; if ((dev->lower_level + upper_dev->upper_level) > MAX_NEST_DEV) return -EMLINK; if (!master) { if (__netdev_has_upper_dev(dev, upper_dev)) return -EEXIST; } else { master_dev = __netdev_master_upper_dev_get(dev); if (master_dev) return master_dev == upper_dev ? -EEXIST : -EBUSY; } ret = call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, &changeupper_info.info); ret = notifier_to_errno(ret); if (ret) return ret; ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, upper_priv, master); if (ret) return ret; ret = call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, &changeupper_info.info); ret = notifier_to_errno(ret); if (ret) goto rollback; __netdev_update_upper_level(dev, NULL); __netdev_walk_all_lower_dev(dev, __netdev_update_upper_level, NULL); __netdev_update_lower_level(upper_dev, priv); __netdev_walk_all_upper_dev(upper_dev, __netdev_update_lower_level, priv); return 0; rollback: __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev); return ret; } /** * netdev_upper_dev_link - Add a link to the upper device * @dev: device * @upper_dev: new upper device * @extack: netlink extended ack * * Adds a link to device which is upper to this one. The caller must hold * the RTNL lock. On a failure a negative errno code is returned. * On success the reference counts are adjusted and the function * returns zero. */ int netdev_upper_dev_link(struct net_device *dev, struct net_device *upper_dev, struct netlink_ext_ack *extack) { struct netdev_nested_priv priv = { .flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO, .data = NULL, }; return __netdev_upper_dev_link(dev, upper_dev, false, NULL, NULL, &priv, extack); } EXPORT_SYMBOL(netdev_upper_dev_link); /** * netdev_master_upper_dev_link - Add a master link to the upper device * @dev: device * @upper_dev: new upper device * @upper_priv: upper device private * @upper_info: upper info to be passed down via notifier * @extack: netlink extended ack * * Adds a link to device which is upper to this one. In this case, only * one master upper device can be linked, although other non-master devices * might be linked as well. The caller must hold the RTNL lock. * On a failure a negative errno code is returned. On success the reference * counts are adjusted and the function returns zero. 
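 *
 * Illustrative enslave-style call (bond_dev, slave_dev, slave and
 * lag_upper_info are hypothetical here; the lower device comes first):
 *
 *	err = netdev_master_upper_dev_link(slave_dev, bond_dev, slave,
 *					   &lag_upper_info, extack);
 *	if (err)
 *		return err;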
*/ int netdev_master_upper_dev_link(struct net_device *dev, struct net_device *upper_dev, void *upper_priv, void *upper_info, struct netlink_ext_ack *extack) { struct netdev_nested_priv priv = { .flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO, .data = NULL, }; return __netdev_upper_dev_link(dev, upper_dev, true, upper_priv, upper_info, &priv, extack); } EXPORT_SYMBOL(netdev_master_upper_dev_link); static void __netdev_upper_dev_unlink(struct net_device *dev, struct net_device *upper_dev, struct netdev_nested_priv *priv) { struct netdev_notifier_changeupper_info changeupper_info = { .info = { .dev = dev, }, .upper_dev = upper_dev, .linking = false, }; ASSERT_RTNL(); changeupper_info.master = netdev_master_upper_dev_get(dev) == upper_dev; call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, &changeupper_info.info); __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev); call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, &changeupper_info.info); __netdev_update_upper_level(dev, NULL); __netdev_walk_all_lower_dev(dev, __netdev_update_upper_level, NULL); __netdev_update_lower_level(upper_dev, priv); __netdev_walk_all_upper_dev(upper_dev, __netdev_update_lower_level, priv); } /** * netdev_upper_dev_unlink - Removes a link to upper device * @dev: device * @upper_dev: new upper device * * Removes a link to device which is upper to this one. The caller must hold * the RTNL lock. */ void netdev_upper_dev_unlink(struct net_device *dev, struct net_device *upper_dev) { struct netdev_nested_priv priv = { .flags = NESTED_SYNC_TODO, .data = NULL, }; __netdev_upper_dev_unlink(dev, upper_dev, &priv); } EXPORT_SYMBOL(netdev_upper_dev_unlink); static void __netdev_adjacent_dev_set(struct net_device *upper_dev, struct net_device *lower_dev, bool val) { struct netdev_adjacent *adj; adj = __netdev_find_adj(lower_dev, &upper_dev->adj_list.lower); if (adj) adj->ignore = val; adj = __netdev_find_adj(upper_dev, &lower_dev->adj_list.upper); if (adj) adj->ignore = val; } static void netdev_adjacent_dev_disable(struct net_device *upper_dev, struct net_device *lower_dev) { __netdev_adjacent_dev_set(upper_dev, lower_dev, true); } static void netdev_adjacent_dev_enable(struct net_device *upper_dev, struct net_device *lower_dev) { __netdev_adjacent_dev_set(upper_dev, lower_dev, false); } int netdev_adjacent_change_prepare(struct net_device *old_dev, struct net_device *new_dev, struct net_device *dev, struct netlink_ext_ack *extack) { struct netdev_nested_priv priv = { .flags = 0, .data = NULL, }; int err; if (!new_dev) return 0; if (old_dev && new_dev != old_dev) netdev_adjacent_dev_disable(dev, old_dev); err = __netdev_upper_dev_link(new_dev, dev, false, NULL, NULL, &priv, extack); if (err) { if (old_dev && new_dev != old_dev) netdev_adjacent_dev_enable(dev, old_dev); return err; } return 0; } EXPORT_SYMBOL(netdev_adjacent_change_prepare); void netdev_adjacent_change_commit(struct net_device *old_dev, struct net_device *new_dev, struct net_device *dev) { struct netdev_nested_priv priv = { .flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO, .data = NULL, }; if (!new_dev || !old_dev) return; if (new_dev == old_dev) return; netdev_adjacent_dev_enable(dev, old_dev); __netdev_upper_dev_unlink(old_dev, dev, &priv); } EXPORT_SYMBOL(netdev_adjacent_change_commit); void netdev_adjacent_change_abort(struct net_device *old_dev, struct net_device *new_dev, struct net_device *dev) { struct netdev_nested_priv priv = { .flags = 0, .data = NULL, }; if (!new_dev) return; if (old_dev && new_dev != old_dev) netdev_adjacent_dev_enable(dev, old_dev); 
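	/* Mirror of netdev_adjacent_change_prepare(): the old adjacency was
	 * re-enabled above (when applicable); now drop the link that was
	 * tentatively created between the new device and dev.
	 */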
__netdev_upper_dev_unlink(new_dev, dev, &priv); } EXPORT_SYMBOL(netdev_adjacent_change_abort); /** * netdev_bonding_info_change - Dispatch event about slave change * @dev: device * @bonding_info: info to dispatch * * Send NETDEV_BONDING_INFO to netdev notifiers with info. * The caller must hold the RTNL lock. */ void netdev_bonding_info_change(struct net_device *dev, struct netdev_bonding_info *bonding_info) { struct netdev_notifier_bonding_info info = { .info.dev = dev, }; memcpy(&info.bonding_info, bonding_info, sizeof(struct netdev_bonding_info)); call_netdevice_notifiers_info(NETDEV_BONDING_INFO, &info.info); } EXPORT_SYMBOL(netdev_bonding_info_change); static int netdev_offload_xstats_enable_l3(struct net_device *dev, struct netlink_ext_ack *extack) { struct netdev_notifier_offload_xstats_info info = { .info.dev = dev, .info.extack = extack, .type = NETDEV_OFFLOAD_XSTATS_TYPE_L3, }; int err; int rc; dev->offload_xstats_l3 = kzalloc(sizeof(*dev->offload_xstats_l3), GFP_KERNEL); if (!dev->offload_xstats_l3) return -ENOMEM; rc = call_netdevice_notifiers_info_robust(NETDEV_OFFLOAD_XSTATS_ENABLE, NETDEV_OFFLOAD_XSTATS_DISABLE, &info.info); err = notifier_to_errno(rc); if (err) goto free_stats; return 0; free_stats: kfree(dev->offload_xstats_l3); dev->offload_xstats_l3 = NULL; return err; } int netdev_offload_xstats_enable(struct net_device *dev, enum netdev_offload_xstats_type type, struct netlink_ext_ack *extack) { ASSERT_RTNL(); if (netdev_offload_xstats_enabled(dev, type)) return -EALREADY; switch (type) { case NETDEV_OFFLOAD_XSTATS_TYPE_L3: return netdev_offload_xstats_enable_l3(dev, extack); } WARN_ON(1); return -EINVAL; } EXPORT_SYMBOL(netdev_offload_xstats_enable); static void netdev_offload_xstats_disable_l3(struct net_device *dev) { struct netdev_notifier_offload_xstats_info info = { .info.dev = dev, .type = NETDEV_OFFLOAD_XSTATS_TYPE_L3, }; call_netdevice_notifiers_info(NETDEV_OFFLOAD_XSTATS_DISABLE, &info.info); kfree(dev->offload_xstats_l3); dev->offload_xstats_l3 = NULL; } int netdev_offload_xstats_disable(struct net_device *dev, enum netdev_offload_xstats_type type) { ASSERT_RTNL(); if (!netdev_offload_xstats_enabled(dev, type)) return -EALREADY; switch (type) { case NETDEV_OFFLOAD_XSTATS_TYPE_L3: netdev_offload_xstats_disable_l3(dev); return 0; } WARN_ON(1); return -EINVAL; } EXPORT_SYMBOL(netdev_offload_xstats_disable); static void netdev_offload_xstats_disable_all(struct net_device *dev) { netdev_offload_xstats_disable(dev, NETDEV_OFFLOAD_XSTATS_TYPE_L3); } static struct rtnl_hw_stats64 * netdev_offload_xstats_get_ptr(const struct net_device *dev, enum netdev_offload_xstats_type type) { switch (type) { case NETDEV_OFFLOAD_XSTATS_TYPE_L3: return dev->offload_xstats_l3; } WARN_ON(1); return NULL; } bool netdev_offload_xstats_enabled(const struct net_device *dev, enum netdev_offload_xstats_type type) { ASSERT_RTNL(); return netdev_offload_xstats_get_ptr(dev, type); } EXPORT_SYMBOL(netdev_offload_xstats_enabled); struct netdev_notifier_offload_xstats_ru { bool used; }; struct netdev_notifier_offload_xstats_rd { struct rtnl_hw_stats64 stats; bool used; }; static void netdev_hw_stats64_add(struct rtnl_hw_stats64 *dest, const struct rtnl_hw_stats64 *src) { dest->rx_packets += src->rx_packets; dest->tx_packets += src->tx_packets; dest->rx_bytes += src->rx_bytes; dest->tx_bytes += src->tx_bytes; dest->rx_errors += src->rx_errors; dest->tx_errors += src->tx_errors; dest->rx_dropped += src->rx_dropped; dest->tx_dropped += src->tx_dropped; dest->multicast += src->multicast; } static 
int netdev_offload_xstats_get_used(struct net_device *dev, enum netdev_offload_xstats_type type, bool *p_used, struct netlink_ext_ack *extack) { struct netdev_notifier_offload_xstats_ru report_used = {}; struct netdev_notifier_offload_xstats_info info = { .info.dev = dev, .info.extack = extack, .type = type, .report_used = &report_used, }; int rc; WARN_ON(!netdev_offload_xstats_enabled(dev, type)); rc = call_netdevice_notifiers_info(NETDEV_OFFLOAD_XSTATS_REPORT_USED, &info.info); *p_used = report_used.used; return notifier_to_errno(rc); } static int netdev_offload_xstats_get_stats(struct net_device *dev, enum netdev_offload_xstats_type type, struct rtnl_hw_stats64 *p_stats, bool *p_used, struct netlink_ext_ack *extack) { struct netdev_notifier_offload_xstats_rd report_delta = {}; struct netdev_notifier_offload_xstats_info info = { .info.dev = dev, .info.extack = extack, .type = type, .report_delta = &report_delta, }; struct rtnl_hw_stats64 *stats; int rc; stats = netdev_offload_xstats_get_ptr(dev, type); if (WARN_ON(!stats)) return -EINVAL; rc = call_netdevice_notifiers_info(NETDEV_OFFLOAD_XSTATS_REPORT_DELTA, &info.info); /* Cache whatever we got, even if there was an error, otherwise the * successful stats retrievals would get lost. */ netdev_hw_stats64_add(stats, &report_delta.stats); if (p_stats) *p_stats = *stats; *p_used = report_delta.used; return notifier_to_errno(rc); } int netdev_offload_xstats_get(struct net_device *dev, enum netdev_offload_xstats_type type, struct rtnl_hw_stats64 *p_stats, bool *p_used, struct netlink_ext_ack *extack) { ASSERT_RTNL(); if (p_stats) return netdev_offload_xstats_get_stats(dev, type, p_stats, p_used, extack); else return netdev_offload_xstats_get_used(dev, type, p_used, extack); } EXPORT_SYMBOL(netdev_offload_xstats_get); void netdev_offload_xstats_report_delta(struct netdev_notifier_offload_xstats_rd *report_delta, const struct rtnl_hw_stats64 *stats) { report_delta->used = true; netdev_hw_stats64_add(&report_delta->stats, stats); } EXPORT_SYMBOL(netdev_offload_xstats_report_delta); void netdev_offload_xstats_report_used(struct netdev_notifier_offload_xstats_ru *report_used) { report_used->used = true; } EXPORT_SYMBOL(netdev_offload_xstats_report_used); void netdev_offload_xstats_push_delta(struct net_device *dev, enum netdev_offload_xstats_type type, const struct rtnl_hw_stats64 *p_stats) { struct rtnl_hw_stats64 *stats; ASSERT_RTNL(); stats = netdev_offload_xstats_get_ptr(dev, type); if (WARN_ON(!stats)) return; netdev_hw_stats64_add(stats, p_stats); } EXPORT_SYMBOL(netdev_offload_xstats_push_delta); /** * netdev_get_xmit_slave - Get the xmit slave of master device * @dev: device * @skb: The packet * @all_slaves: assume all the slaves are active * * The reference counters are not incremented so the caller must be * careful with locks. The caller must hold RCU lock. * %NULL is returned if no slave is found. 
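 *
 * Illustrative use under RCU (bond_dev is a hypothetical LAG master):
 *
 *	rcu_read_lock();
 *	slave = netdev_get_xmit_slave(bond_dev, skb, false);
 *	if (slave)
 *		netdev_dbg(bond_dev, "would transmit via %s\n", slave->name);
 *	rcu_read_unlock();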
*/ struct net_device *netdev_get_xmit_slave(struct net_device *dev, struct sk_buff *skb, bool all_slaves) { const struct net_device_ops *ops = dev->netdev_ops; if (!ops->ndo_get_xmit_slave) return NULL; return ops->ndo_get_xmit_slave(dev, skb, all_slaves); } EXPORT_SYMBOL(netdev_get_xmit_slave); static struct net_device *netdev_sk_get_lower_dev(struct net_device *dev, struct sock *sk) { const struct net_device_ops *ops = dev->netdev_ops; if (!ops->ndo_sk_get_lower_dev) return NULL; return ops->ndo_sk_get_lower_dev(dev, sk); } /** * netdev_sk_get_lowest_dev - Get the lowest device in chain given device and socket * @dev: device * @sk: the socket * * %NULL is returned if no lower device is found. */ struct net_device *netdev_sk_get_lowest_dev(struct net_device *dev, struct sock *sk) { struct net_device *lower; lower = netdev_sk_get_lower_dev(dev, sk); while (lower) { dev = lower; lower = netdev_sk_get_lower_dev(dev, sk); } return dev; } EXPORT_SYMBOL(netdev_sk_get_lowest_dev); static void netdev_adjacent_add_links(struct net_device *dev) { struct netdev_adjacent *iter; struct net *net = dev_net(dev); list_for_each_entry(iter, &dev->adj_list.upper, list) { if (!net_eq(net, dev_net(iter->dev))) continue; netdev_adjacent_sysfs_add(iter->dev, dev, &iter->dev->adj_list.lower); netdev_adjacent_sysfs_add(dev, iter->dev, &dev->adj_list.upper); } list_for_each_entry(iter, &dev->adj_list.lower, list) { if (!net_eq(net, dev_net(iter->dev))) continue; netdev_adjacent_sysfs_add(iter->dev, dev, &iter->dev->adj_list.upper); netdev_adjacent_sysfs_add(dev, iter->dev, &dev->adj_list.lower); } } static void netdev_adjacent_del_links(struct net_device *dev) { struct netdev_adjacent *iter; struct net *net = dev_net(dev); list_for_each_entry(iter, &dev->adj_list.upper, list) { if (!net_eq(net, dev_net(iter->dev))) continue; netdev_adjacent_sysfs_del(iter->dev, dev->name, &iter->dev->adj_list.lower); netdev_adjacent_sysfs_del(dev, iter->dev->name, &dev->adj_list.upper); } list_for_each_entry(iter, &dev->adj_list.lower, list) { if (!net_eq(net, dev_net(iter->dev))) continue; netdev_adjacent_sysfs_del(iter->dev, dev->name, &iter->dev->adj_list.upper); netdev_adjacent_sysfs_del(dev, iter->dev->name, &dev->adj_list.lower); } } void netdev_adjacent_rename_links(struct net_device *dev, char *oldname) { struct netdev_adjacent *iter; struct net *net = dev_net(dev); list_for_each_entry(iter, &dev->adj_list.upper, list) { if (!net_eq(net, dev_net(iter->dev))) continue; netdev_adjacent_sysfs_del(iter->dev, oldname, &iter->dev->adj_list.lower); netdev_adjacent_sysfs_add(iter->dev, dev, &iter->dev->adj_list.lower); } list_for_each_entry(iter, &dev->adj_list.lower, list) { if (!net_eq(net, dev_net(iter->dev))) continue; netdev_adjacent_sysfs_del(iter->dev, oldname, &iter->dev->adj_list.upper); netdev_adjacent_sysfs_add(iter->dev, dev, &iter->dev->adj_list.upper); } } void *netdev_lower_dev_get_private(struct net_device *dev, struct net_device *lower_dev) { struct netdev_adjacent *lower; if (!lower_dev) return NULL; lower = __netdev_find_adj(lower_dev, &dev->adj_list.lower); if (!lower) return NULL; return lower->private; } EXPORT_SYMBOL(netdev_lower_dev_get_private); /** * netdev_lower_state_changed - Dispatch event about lower device state change * @lower_dev: device * @lower_state_info: state to dispatch * * Send NETDEV_CHANGELOWERSTATE to netdev notifiers with info. * The caller must hold the RTNL lock. 
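 *
 * Illustrative call from a LAG master when one of its ports changes state
 * (port_dev is hypothetical; the info type is the one LAG drivers pass):
 *
 *	struct netdev_lag_lower_state_info info = {
 *		.link_up = true,
 *		.tx_enabled = true,
 *	};
 *
 *	netdev_lower_state_changed(port_dev, &info);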
*/ void netdev_lower_state_changed(struct net_device *lower_dev, void *lower_state_info) { struct netdev_notifier_changelowerstate_info changelowerstate_info = { .info.dev = lower_dev, }; ASSERT_RTNL(); changelowerstate_info.lower_state_info = lower_state_info; call_netdevice_notifiers_info(NETDEV_CHANGELOWERSTATE, &changelowerstate_info.info); } EXPORT_SYMBOL(netdev_lower_state_changed); static void dev_change_rx_flags(struct net_device *dev, int flags) { const struct net_device_ops *ops = dev->netdev_ops; if (ops->ndo_change_rx_flags) ops->ndo_change_rx_flags(dev, flags); } static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify) { unsigned int old_flags = dev->flags; unsigned int promiscuity, flags; kuid_t uid; kgid_t gid; ASSERT_RTNL(); promiscuity = dev->promiscuity + inc; if (promiscuity == 0) { /* * Avoid overflow. * If inc causes overflow, untouch promisc and return error. */ if (unlikely(inc > 0)) { netdev_warn(dev, "promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n"); return -EOVERFLOW; } flags = old_flags & ~IFF_PROMISC; } else { flags = old_flags | IFF_PROMISC; } WRITE_ONCE(dev->promiscuity, promiscuity); if (flags != old_flags) { WRITE_ONCE(dev->flags, flags); netdev_info(dev, "%s promiscuous mode\n", dev->flags & IFF_PROMISC ? "entered" : "left"); if (audit_enabled) { current_uid_gid(&uid, &gid); audit_log(audit_context(), GFP_ATOMIC, AUDIT_ANOM_PROMISCUOUS, "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u", dev->name, (dev->flags & IFF_PROMISC), (old_flags & IFF_PROMISC), from_kuid(&init_user_ns, audit_get_loginuid(current)), from_kuid(&init_user_ns, uid), from_kgid(&init_user_ns, gid), audit_get_sessionid(current)); } dev_change_rx_flags(dev, IFF_PROMISC); } if (notify) __dev_notify_flags(dev, old_flags, IFF_PROMISC, 0, NULL); return 0; } /** * dev_set_promiscuity - update promiscuity count on a device * @dev: device * @inc: modifier * * Add or remove promiscuity from a device. While the count in the device * remains above zero the interface remains promiscuous. Once it hits zero * the device reverts back to normal filtering operation. A negative inc * value is used to drop promiscuity on the device. * Return 0 if successful or a negative errno code on error. */ int dev_set_promiscuity(struct net_device *dev, int inc) { unsigned int old_flags = dev->flags; int err; err = __dev_set_promiscuity(dev, inc, true); if (err < 0) return err; if (dev->flags != old_flags) dev_set_rx_mode(dev); return err; } EXPORT_SYMBOL(dev_set_promiscuity); static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify) { unsigned int old_flags = dev->flags, old_gflags = dev->gflags; unsigned int allmulti, flags; ASSERT_RTNL(); allmulti = dev->allmulti + inc; if (allmulti == 0) { /* * Avoid overflow. * If inc causes overflow, untouch allmulti and return error. */ if (unlikely(inc > 0)) { netdev_warn(dev, "allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n"); return -EOVERFLOW; } flags = old_flags & ~IFF_ALLMULTI; } else { flags = old_flags | IFF_ALLMULTI; } WRITE_ONCE(dev->allmulti, allmulti); if (flags != old_flags) { WRITE_ONCE(dev->flags, flags); netdev_info(dev, "%s allmulticast mode\n", dev->flags & IFF_ALLMULTI ? 
"entered" : "left"); dev_change_rx_flags(dev, IFF_ALLMULTI); dev_set_rx_mode(dev); if (notify) __dev_notify_flags(dev, old_flags, dev->gflags ^ old_gflags, 0, NULL); } return 0; } /** * dev_set_allmulti - update allmulti count on a device * @dev: device * @inc: modifier * * Add or remove reception of all multicast frames to a device. While the * count in the device remains above zero the interface remains listening * to all interfaces. Once it hits zero the device reverts back to normal * filtering operation. A negative @inc value is used to drop the counter * when releasing a resource needing all multicasts. * Return 0 if successful or a negative errno code on error. */ int dev_set_allmulti(struct net_device *dev, int inc) { return __dev_set_allmulti(dev, inc, true); } EXPORT_SYMBOL(dev_set_allmulti); /* * Upload unicast and multicast address lists to device and * configure RX filtering. When the device doesn't support unicast * filtering it is put in promiscuous mode while unicast addresses * are present. */ void __dev_set_rx_mode(struct net_device *dev) { const struct net_device_ops *ops = dev->netdev_ops; /* dev_open will call this function so the list will stay sane. */ if (!(dev->flags&IFF_UP)) return; if (!netif_device_present(dev)) return; if (!(dev->priv_flags & IFF_UNICAST_FLT)) { /* Unicast addresses changes may only happen under the rtnl, * therefore calling __dev_set_promiscuity here is safe. */ if (!netdev_uc_empty(dev) && !dev->uc_promisc) { __dev_set_promiscuity(dev, 1, false); dev->uc_promisc = true; } else if (netdev_uc_empty(dev) && dev->uc_promisc) { __dev_set_promiscuity(dev, -1, false); dev->uc_promisc = false; } } if (ops->ndo_set_rx_mode) ops->ndo_set_rx_mode(dev); } void dev_set_rx_mode(struct net_device *dev) { netif_addr_lock_bh(dev); __dev_set_rx_mode(dev); netif_addr_unlock_bh(dev); } /** * dev_get_flags - get flags reported to userspace * @dev: device * * Get the combination of flag bits exported through APIs to userspace. */ unsigned int dev_get_flags(const struct net_device *dev) { unsigned int flags; flags = (READ_ONCE(dev->flags) & ~(IFF_PROMISC | IFF_ALLMULTI | IFF_RUNNING | IFF_LOWER_UP | IFF_DORMANT)) | (READ_ONCE(dev->gflags) & (IFF_PROMISC | IFF_ALLMULTI)); if (netif_running(dev)) { if (netif_oper_up(dev)) flags |= IFF_RUNNING; if (netif_carrier_ok(dev)) flags |= IFF_LOWER_UP; if (netif_dormant(dev)) flags |= IFF_DORMANT; } return flags; } EXPORT_SYMBOL(dev_get_flags); int __dev_change_flags(struct net_device *dev, unsigned int flags, struct netlink_ext_ack *extack) { unsigned int old_flags = dev->flags; int ret; ASSERT_RTNL(); /* * Set the flags on our device. */ dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP | IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL | IFF_AUTOMEDIA)) | (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC | IFF_ALLMULTI)); /* * Load in the correct multicast list now the flags have changed. */ if ((old_flags ^ flags) & IFF_MULTICAST) dev_change_rx_flags(dev, IFF_MULTICAST); dev_set_rx_mode(dev); /* * Have we downed the interface. We handle IFF_UP ourselves * according to user attempts to set it, rather than blindly * setting it. */ ret = 0; if ((old_flags ^ flags) & IFF_UP) { if (old_flags & IFF_UP) __dev_close(dev); else ret = __dev_open(dev, extack); } if ((flags ^ dev->gflags) & IFF_PROMISC) { int inc = (flags & IFF_PROMISC) ? 
1 : -1; unsigned int old_flags = dev->flags; dev->gflags ^= IFF_PROMISC; if (__dev_set_promiscuity(dev, inc, false) >= 0) if (dev->flags != old_flags) dev_set_rx_mode(dev); } /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI * is important. Some (broken) drivers set IFF_PROMISC, when * IFF_ALLMULTI is requested not asking us and not reporting. */ if ((flags ^ dev->gflags) & IFF_ALLMULTI) { int inc = (flags & IFF_ALLMULTI) ? 1 : -1; dev->gflags ^= IFF_ALLMULTI; __dev_set_allmulti(dev, inc, false); } return ret; } void __dev_notify_flags(struct net_device *dev, unsigned int old_flags, unsigned int gchanges, u32 portid, const struct nlmsghdr *nlh) { unsigned int changes = dev->flags ^ old_flags; if (gchanges) rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC, portid, nlh); if (changes & IFF_UP) { if (dev->flags & IFF_UP) call_netdevice_notifiers(NETDEV_UP, dev); else call_netdevice_notifiers(NETDEV_DOWN, dev); } if (dev->flags & IFF_UP && (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) { struct netdev_notifier_change_info change_info = { .info = { .dev = dev, }, .flags_changed = changes, }; call_netdevice_notifiers_info(NETDEV_CHANGE, &change_info.info); } } /** * dev_change_flags - change device settings * @dev: device * @flags: device state flags * @extack: netlink extended ack * * Change settings on device based state flags. The flags are * in the userspace exported format. */ int dev_change_flags(struct net_device *dev, unsigned int flags, struct netlink_ext_ack *extack) { int ret; unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags; ret = __dev_change_flags(dev, flags, extack); if (ret < 0) return ret; changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags); __dev_notify_flags(dev, old_flags, changes, 0, NULL); return ret; } EXPORT_SYMBOL(dev_change_flags); int __dev_set_mtu(struct net_device *dev, int new_mtu) { const struct net_device_ops *ops = dev->netdev_ops; if (ops->ndo_change_mtu) return ops->ndo_change_mtu(dev, new_mtu); /* Pairs with all the lockless reads of dev->mtu in the stack */ WRITE_ONCE(dev->mtu, new_mtu); return 0; } EXPORT_SYMBOL(__dev_set_mtu); int dev_validate_mtu(struct net_device *dev, int new_mtu, struct netlink_ext_ack *extack) { /* MTU must be positive, and in range */ if (new_mtu < 0 || new_mtu < dev->min_mtu) { NL_SET_ERR_MSG(extack, "mtu less than device minimum"); return -EINVAL; } if (dev->max_mtu > 0 && new_mtu > dev->max_mtu) { NL_SET_ERR_MSG(extack, "mtu greater than device maximum"); return -EINVAL; } return 0; } /** * dev_set_mtu_ext - Change maximum transfer unit * @dev: device * @new_mtu: new transfer unit * @extack: netlink extended ack * * Change the maximum transfer size of the network device. */ int dev_set_mtu_ext(struct net_device *dev, int new_mtu, struct netlink_ext_ack *extack) { int err, orig_mtu; if (new_mtu == dev->mtu) return 0; err = dev_validate_mtu(dev, new_mtu, extack); if (err) return err; if (!netif_device_present(dev)) return -ENODEV; err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev); err = notifier_to_errno(err); if (err) return err; orig_mtu = dev->mtu; err = __dev_set_mtu(dev, new_mtu); if (!err) { err = call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev, orig_mtu); err = notifier_to_errno(err); if (err) { /* setting mtu back and notifying everyone again, * so that they have a chance to revert changes. 
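		 * Note that the revert notification below reports new_mtu as
		 * the "old" value, since that is what the device was briefly
		 * running with before being switched back to orig_mtu.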
*/ __dev_set_mtu(dev, orig_mtu); call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev, new_mtu); } } return err; } int dev_set_mtu(struct net_device *dev, int new_mtu) { struct netlink_ext_ack extack; int err; memset(&extack, 0, sizeof(extack)); err = dev_set_mtu_ext(dev, new_mtu, &extack); if (err && extack._msg) net_err_ratelimited("%s: %s\n", dev->name, extack._msg); return err; } EXPORT_SYMBOL(dev_set_mtu); /** * dev_change_tx_queue_len - Change TX queue length of a netdevice * @dev: device * @new_len: new tx queue length */ int dev_change_tx_queue_len(struct net_device *dev, unsigned long new_len) { unsigned int orig_len = dev->tx_queue_len; int res; if (new_len != (unsigned int)new_len) return -ERANGE; if (new_len != orig_len) { WRITE_ONCE(dev->tx_queue_len, new_len); res = call_netdevice_notifiers(NETDEV_CHANGE_TX_QUEUE_LEN, dev); res = notifier_to_errno(res); if (res) goto err_rollback; res = dev_qdisc_change_tx_queue_len(dev); if (res) goto err_rollback; } return 0; err_rollback: netdev_err(dev, "refused to change device tx_queue_len\n"); WRITE_ONCE(dev->tx_queue_len, orig_len); return res; } /** * dev_set_group - Change group this device belongs to * @dev: device * @new_group: group this device should belong to */ void dev_set_group(struct net_device *dev, int new_group) { dev->group = new_group; } /** * dev_pre_changeaddr_notify - Call NETDEV_PRE_CHANGEADDR. * @dev: device * @addr: new address * @extack: netlink extended ack */ int dev_pre_changeaddr_notify(struct net_device *dev, const char *addr, struct netlink_ext_ack *extack) { struct netdev_notifier_pre_changeaddr_info info = { .info.dev = dev, .info.extack = extack, .dev_addr = addr, }; int rc; rc = call_netdevice_notifiers_info(NETDEV_PRE_CHANGEADDR, &info.info); return notifier_to_errno(rc); } EXPORT_SYMBOL(dev_pre_changeaddr_notify); /** * dev_set_mac_address - Change Media Access Control Address * @dev: device * @sa: new address * @extack: netlink extended ack * * Change the hardware (MAC) address of the device */ int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa, struct netlink_ext_ack *extack) { const struct net_device_ops *ops = dev->netdev_ops; int err; if (!ops->ndo_set_mac_address) return -EOPNOTSUPP; if (sa->sa_family != dev->type) return -EINVAL; if (!netif_device_present(dev)) return -ENODEV; err = dev_pre_changeaddr_notify(dev, sa->sa_data, extack); if (err) return err; if (memcmp(dev->dev_addr, sa->sa_data, dev->addr_len)) { err = ops->ndo_set_mac_address(dev, sa); if (err) return err; } dev->addr_assign_type = NET_ADDR_SET; call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); add_device_randomness(dev->dev_addr, dev->addr_len); return 0; } EXPORT_SYMBOL(dev_set_mac_address); DECLARE_RWSEM(dev_addr_sem); int dev_set_mac_address_user(struct net_device *dev, struct sockaddr *sa, struct netlink_ext_ack *extack) { int ret; down_write(&dev_addr_sem); ret = dev_set_mac_address(dev, sa, extack); up_write(&dev_addr_sem); return ret; } EXPORT_SYMBOL(dev_set_mac_address_user); int dev_get_mac_address(struct sockaddr *sa, struct net *net, char *dev_name) { size_t size = sizeof(sa->sa_data_min); struct net_device *dev; int ret = 0; down_read(&dev_addr_sem); rcu_read_lock(); dev = dev_get_by_name_rcu(net, dev_name); if (!dev) { ret = -ENODEV; goto unlock; } if (!dev->addr_len) memset(sa->sa_data, 0, size); else memcpy(sa->sa_data, dev->dev_addr, min_t(size_t, size, dev->addr_len)); sa->sa_family = dev->type; unlock: rcu_read_unlock(); up_read(&dev_addr_sem); return ret; } EXPORT_SYMBOL(dev_get_mac_address); 
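/*
 * Editor's illustrative sketch (not part of the original file): a minimal,
 * hypothetical in-kernel caller that reconfigures a device's MTU and MAC
 * address using the helpers above. It assumes process context and that the
 * caller takes the RTNL lock itself, as dev_set_mtu() and
 * dev_set_mac_address_user() expect; the function name and flow are made up
 * for illustration only and are kept out of the build with #if 0.
 */
#if 0
static int example_reconfigure(struct net_device *dev,
			       struct sockaddr *new_addr, int new_mtu)
{
	struct netlink_ext_ack extack = {};
	int err;

	rtnl_lock();

	/* dev_set_mtu() validates the range and notifies NETDEV_CHANGEMTU. */
	err = dev_set_mtu(dev, new_mtu);
	if (!err) {
		/* Serializes against concurrent readers via dev_addr_sem. */
		err = dev_set_mac_address_user(dev, new_addr, &extack);
	}

	rtnl_unlock();

	if (err && extack._msg)
		netdev_err(dev, "reconfigure failed: %s\n", extack._msg);

	return err;
}
#endif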
/** * dev_change_carrier - Change device carrier * @dev: device * @new_carrier: new value * * Change device carrier */ int dev_change_carrier(struct net_device *dev, bool new_carrier) { const struct net_device_ops *ops = dev->netdev_ops; if (!ops->ndo_change_carrier) return -EOPNOTSUPP; if (!netif_device_present(dev)) return -ENODEV; return ops->ndo_change_carrier(dev, new_carrier); } /** * dev_get_phys_port_id - Get device physical port ID * @dev: device * @ppid: port ID * * Get device physical port ID */ int dev_get_phys_port_id(struct net_device *dev, struct netdev_phys_item_id *ppid) { const struct net_device_ops *ops = dev->netdev_ops; if (!ops->ndo_get_phys_port_id) return -EOPNOTSUPP; return ops->ndo_get_phys_port_id(dev, ppid); } /** * dev_get_phys_port_name - Get device physical port name * @dev: device * @name: port name * @len: limit of bytes to copy to name * * Get device physical port name */ int dev_get_phys_port_name(struct net_device *dev, char *name, size_t len) { const struct net_device_ops *ops = dev->netdev_ops; int err; if (ops->ndo_get_phys_port_name) { err = ops->ndo_get_phys_port_name(dev, name, len); if (err != -EOPNOTSUPP) return err; } return devlink_compat_phys_port_name_get(dev, name, len); } /** * dev_get_port_parent_id - Get the device's port parent identifier * @dev: network device * @ppid: pointer to a storage for the port's parent identifier * @recurse: allow/disallow recursion to lower devices * * Get the devices's port parent identifier */ int dev_get_port_parent_id(struct net_device *dev, struct netdev_phys_item_id *ppid, bool recurse) { const struct net_device_ops *ops = dev->netdev_ops; struct netdev_phys_item_id first = { }; struct net_device *lower_dev; struct list_head *iter; int err; if (ops->ndo_get_port_parent_id) { err = ops->ndo_get_port_parent_id(dev, ppid); if (err != -EOPNOTSUPP) return err; } err = devlink_compat_switch_id_get(dev, ppid); if (!recurse || err != -EOPNOTSUPP) return err; netdev_for_each_lower_dev(dev, lower_dev, iter) { err = dev_get_port_parent_id(lower_dev, ppid, true); if (err) break; if (!first.id_len) first = *ppid; else if (memcmp(&first, ppid, sizeof(*ppid))) return -EOPNOTSUPP; } return err; } EXPORT_SYMBOL(dev_get_port_parent_id); /** * netdev_port_same_parent_id - Indicate if two network devices have * the same port parent identifier * @a: first network device * @b: second network device */ bool netdev_port_same_parent_id(struct net_device *a, struct net_device *b) { struct netdev_phys_item_id a_id = { }; struct netdev_phys_item_id b_id = { }; if (dev_get_port_parent_id(a, &a_id, true) || dev_get_port_parent_id(b, &b_id, true)) return false; return netdev_phys_item_id_same(&a_id, &b_id); } EXPORT_SYMBOL(netdev_port_same_parent_id); /** * dev_change_proto_down - set carrier according to proto_down. 
* * @dev: device * @proto_down: new value */ int dev_change_proto_down(struct net_device *dev, bool proto_down) { if (!dev->change_proto_down) return -EOPNOTSUPP; if (!netif_device_present(dev)) return -ENODEV; if (proto_down) netif_carrier_off(dev); else netif_carrier_on(dev); WRITE_ONCE(dev->proto_down, proto_down); return 0; } /** * dev_change_proto_down_reason - proto down reason * * @dev: device * @mask: proto down mask * @value: proto down value */ void dev_change_proto_down_reason(struct net_device *dev, unsigned long mask, u32 value) { u32 proto_down_reason; int b; if (!mask) { proto_down_reason = value; } else { proto_down_reason = dev->proto_down_reason; for_each_set_bit(b, &mask, 32) { if (value & (1 << b)) proto_down_reason |= BIT(b); else proto_down_reason &= ~BIT(b); } } WRITE_ONCE(dev->proto_down_reason, proto_down_reason); } struct bpf_xdp_link { struct bpf_link link; struct net_device *dev; /* protected by rtnl_lock, no refcnt held */ int flags; }; static enum bpf_xdp_mode dev_xdp_mode(struct net_device *dev, u32 flags) { if (flags & XDP_FLAGS_HW_MODE) return XDP_MODE_HW; if (flags & XDP_FLAGS_DRV_MODE) return XDP_MODE_DRV; if (flags & XDP_FLAGS_SKB_MODE) return XDP_MODE_SKB; return dev->netdev_ops->ndo_bpf ? XDP_MODE_DRV : XDP_MODE_SKB; } static bpf_op_t dev_xdp_bpf_op(struct net_device *dev, enum bpf_xdp_mode mode) { switch (mode) { case XDP_MODE_SKB: return generic_xdp_install; case XDP_MODE_DRV: case XDP_MODE_HW: return dev->netdev_ops->ndo_bpf; default: return NULL; } } static struct bpf_xdp_link *dev_xdp_link(struct net_device *dev, enum bpf_xdp_mode mode) { return dev->xdp_state[mode].link; } static struct bpf_prog *dev_xdp_prog(struct net_device *dev, enum bpf_xdp_mode mode) { struct bpf_xdp_link *link = dev_xdp_link(dev, mode); if (link) return link->link.prog; return dev->xdp_state[mode].prog; } u8 dev_xdp_prog_count(struct net_device *dev) { u8 count = 0; int i; for (i = 0; i < __MAX_XDP_MODE; i++) if (dev->xdp_state[i].prog || dev->xdp_state[i].link) count++; return count; } EXPORT_SYMBOL_GPL(dev_xdp_prog_count); u8 dev_xdp_sb_prog_count(struct net_device *dev) { u8 count = 0; int i; for (i = 0; i < __MAX_XDP_MODE; i++) if (dev->xdp_state[i].prog && !dev->xdp_state[i].prog->aux->xdp_has_frags) count++; return count; } int dev_xdp_propagate(struct net_device *dev, struct netdev_bpf *bpf) { if (!dev->netdev_ops->ndo_bpf) return -EOPNOTSUPP; if (dev->cfg->hds_config == ETHTOOL_TCP_DATA_SPLIT_ENABLED && bpf->command == XDP_SETUP_PROG && bpf->prog && !bpf->prog->aux->xdp_has_frags) { NL_SET_ERR_MSG(bpf->extack, "unable to propagate XDP to device using tcp-data-split"); return -EBUSY; } if (dev_get_min_mp_channel_count(dev)) { NL_SET_ERR_MSG(bpf->extack, "unable to propagate XDP to device using memory provider"); return -EBUSY; } return dev->netdev_ops->ndo_bpf(dev, bpf); } EXPORT_SYMBOL_GPL(dev_xdp_propagate); u32 dev_xdp_prog_id(struct net_device *dev, enum bpf_xdp_mode mode) { struct bpf_prog *prog = dev_xdp_prog(dev, mode); return prog ? 
prog->aux->id : 0; } static void dev_xdp_set_link(struct net_device *dev, enum bpf_xdp_mode mode, struct bpf_xdp_link *link) { dev->xdp_state[mode].link = link; dev->xdp_state[mode].prog = NULL; } static void dev_xdp_set_prog(struct net_device *dev, enum bpf_xdp_mode mode, struct bpf_prog *prog) { dev->xdp_state[mode].link = NULL; dev->xdp_state[mode].prog = prog; } static int dev_xdp_install(struct net_device *dev, enum bpf_xdp_mode mode, bpf_op_t bpf_op, struct netlink_ext_ack *extack, u32 flags, struct bpf_prog *prog) { struct netdev_bpf xdp; int err; if (dev->cfg->hds_config == ETHTOOL_TCP_DATA_SPLIT_ENABLED && prog && !prog->aux->xdp_has_frags) { NL_SET_ERR_MSG(extack, "unable to install XDP to device using tcp-data-split"); return -EBUSY; } if (dev_get_min_mp_channel_count(dev)) { NL_SET_ERR_MSG(extack, "unable to install XDP to device using memory provider"); return -EBUSY; } memset(&xdp, 0, sizeof(xdp)); xdp.command = mode == XDP_MODE_HW ? XDP_SETUP_PROG_HW : XDP_SETUP_PROG; xdp.extack = extack; xdp.flags = flags; xdp.prog = prog; /* Drivers assume refcnt is already incremented (i.e, prog pointer is * "moved" into driver), so they don't increment it on their own, but * they do decrement refcnt when program is detached or replaced. * Given net_device also owns link/prog, we need to bump refcnt here * to prevent drivers from underflowing it. */ if (prog) bpf_prog_inc(prog); err = bpf_op(dev, &xdp); if (err) { if (prog) bpf_prog_put(prog); return err; } if (mode != XDP_MODE_HW) bpf_prog_change_xdp(dev_xdp_prog(dev, mode), prog); return 0; } static void dev_xdp_uninstall(struct net_device *dev) { struct bpf_xdp_link *link; struct bpf_prog *prog; enum bpf_xdp_mode mode; bpf_op_t bpf_op; ASSERT_RTNL(); for (mode = XDP_MODE_SKB; mode < __MAX_XDP_MODE; mode++) { prog = dev_xdp_prog(dev, mode); if (!prog) continue; bpf_op = dev_xdp_bpf_op(dev, mode); if (!bpf_op) continue; WARN_ON(dev_xdp_install(dev, mode, bpf_op, NULL, 0, NULL)); /* auto-detach link from net device */ link = dev_xdp_link(dev, mode); if (link) link->dev = NULL; else bpf_prog_put(prog); dev_xdp_set_link(dev, mode, NULL); } } static int dev_xdp_attach(struct net_device *dev, struct netlink_ext_ack *extack, struct bpf_xdp_link *link, struct bpf_prog *new_prog, struct bpf_prog *old_prog, u32 flags) { unsigned int num_modes = hweight32(flags & XDP_FLAGS_MODES); struct bpf_prog *cur_prog; struct net_device *upper; struct list_head *iter; enum bpf_xdp_mode mode; bpf_op_t bpf_op; int err; ASSERT_RTNL(); /* either link or prog attachment, never both */ if (link && (new_prog || old_prog)) return -EINVAL; /* link supports only XDP mode flags */ if (link && (flags & ~XDP_FLAGS_MODES)) { NL_SET_ERR_MSG(extack, "Invalid XDP flags for BPF link attachment"); return -EINVAL; } /* just one XDP mode bit should be set, zero defaults to drv/skb mode */ if (num_modes > 1) { NL_SET_ERR_MSG(extack, "Only one XDP mode flag can be set"); return -EINVAL; } /* avoid ambiguity if offload + drv/skb mode progs are both loaded */ if (!num_modes && dev_xdp_prog_count(dev) > 1) { NL_SET_ERR_MSG(extack, "More than one program loaded, unset mode is ambiguous"); return -EINVAL; } /* old_prog != NULL implies XDP_FLAGS_REPLACE is set */ if (old_prog && !(flags & XDP_FLAGS_REPLACE)) { NL_SET_ERR_MSG(extack, "XDP_FLAGS_REPLACE is not specified"); return -EINVAL; } mode = dev_xdp_mode(dev, flags); /* can't replace attached link */ if (dev_xdp_link(dev, mode)) { NL_SET_ERR_MSG(extack, "Can't replace active BPF XDP link"); return -EBUSY; } /* don't allow if an upper 
device already has a program */ netdev_for_each_upper_dev_rcu(dev, upper, iter) { if (dev_xdp_prog_count(upper) > 0) { NL_SET_ERR_MSG(extack, "Cannot attach when an upper device already has a program"); return -EEXIST; } } cur_prog = dev_xdp_prog(dev, mode); /* can't replace attached prog with link */ if (link && cur_prog) { NL_SET_ERR_MSG(extack, "Can't replace active XDP program with BPF link"); return -EBUSY; } if ((flags & XDP_FLAGS_REPLACE) && cur_prog != old_prog) { NL_SET_ERR_MSG(extack, "Active program does not match expected"); return -EEXIST; } /* put effective new program into new_prog */ if (link) new_prog = link->link.prog; if (new_prog) { bool offload = mode == XDP_MODE_HW; enum bpf_xdp_mode other_mode = mode == XDP_MODE_SKB ? XDP_MODE_DRV : XDP_MODE_SKB; if ((flags & XDP_FLAGS_UPDATE_IF_NOEXIST) && cur_prog) { NL_SET_ERR_MSG(extack, "XDP program already attached"); return -EBUSY; } if (!offload && dev_xdp_prog(dev, other_mode)) { NL_SET_ERR_MSG(extack, "Native and generic XDP can't be active at the same time"); return -EEXIST; } if (!offload && bpf_prog_is_offloaded(new_prog->aux)) { NL_SET_ERR_MSG(extack, "Using offloaded program without HW_MODE flag is not supported"); return -EINVAL; } if (bpf_prog_is_dev_bound(new_prog->aux) && !bpf_offload_dev_match(new_prog, dev)) { NL_SET_ERR_MSG(extack, "Program bound to different device"); return -EINVAL; } if (bpf_prog_is_dev_bound(new_prog->aux) && mode == XDP_MODE_SKB) { NL_SET_ERR_MSG(extack, "Can't attach device-bound programs in generic mode"); return -EINVAL; } if (new_prog->expected_attach_type == BPF_XDP_DEVMAP) { NL_SET_ERR_MSG(extack, "BPF_XDP_DEVMAP programs can not be attached to a device"); return -EINVAL; } if (new_prog->expected_attach_type == BPF_XDP_CPUMAP) { NL_SET_ERR_MSG(extack, "BPF_XDP_CPUMAP programs can not be attached to a device"); return -EINVAL; } } /* don't call drivers if the effective program didn't change */ if (new_prog != cur_prog) { bpf_op = dev_xdp_bpf_op(dev, mode); if (!bpf_op) { NL_SET_ERR_MSG(extack, "Underlying driver does not support XDP in native mode"); return -EOPNOTSUPP; } err = dev_xdp_install(dev, mode, bpf_op, extack, flags, new_prog); if (err) return err; } if (link) dev_xdp_set_link(dev, mode, link); else dev_xdp_set_prog(dev, mode, new_prog); if (cur_prog) bpf_prog_put(cur_prog); return 0; } static int dev_xdp_attach_link(struct net_device *dev, struct netlink_ext_ack *extack, struct bpf_xdp_link *link) { return dev_xdp_attach(dev, extack, link, NULL, NULL, link->flags); } static int dev_xdp_detach_link(struct net_device *dev, struct netlink_ext_ack *extack, struct bpf_xdp_link *link) { enum bpf_xdp_mode mode; bpf_op_t bpf_op; ASSERT_RTNL(); mode = dev_xdp_mode(dev, link->flags); if (dev_xdp_link(dev, mode) != link) return -EINVAL; bpf_op = dev_xdp_bpf_op(dev, mode); WARN_ON(dev_xdp_install(dev, mode, bpf_op, NULL, 0, NULL)); dev_xdp_set_link(dev, mode, NULL); return 0; } static void bpf_xdp_link_release(struct bpf_link *link) { struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link); rtnl_lock(); /* if racing with net_device's tear down, xdp_link->dev might be * already NULL, in which case link was already auto-detached */ if (xdp_link->dev) { WARN_ON(dev_xdp_detach_link(xdp_link->dev, NULL, xdp_link)); xdp_link->dev = NULL; } rtnl_unlock(); } static int bpf_xdp_link_detach(struct bpf_link *link) { bpf_xdp_link_release(link); return 0; } static void bpf_xdp_link_dealloc(struct bpf_link *link) { struct bpf_xdp_link *xdp_link = container_of(link, struct 
bpf_xdp_link, link); kfree(xdp_link); } static void bpf_xdp_link_show_fdinfo(const struct bpf_link *link, struct seq_file *seq) { struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link); u32 ifindex = 0; rtnl_lock(); if (xdp_link->dev) ifindex = xdp_link->dev->ifindex; rtnl_unlock(); seq_printf(seq, "ifindex:\t%u\n", ifindex); } static int bpf_xdp_link_fill_link_info(const struct bpf_link *link, struct bpf_link_info *info) { struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link); u32 ifindex = 0; rtnl_lock(); if (xdp_link->dev) ifindex = xdp_link->dev->ifindex; rtnl_unlock(); info->xdp.ifindex = ifindex; return 0; } static int bpf_xdp_link_update(struct bpf_link *link, struct bpf_prog *new_prog, struct bpf_prog *old_prog) { struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link); enum bpf_xdp_mode mode; bpf_op_t bpf_op; int err = 0; rtnl_lock(); /* link might have been auto-released already, so fail */ if (!xdp_link->dev) { err = -ENOLINK; goto out_unlock; } if (old_prog && link->prog != old_prog) { err = -EPERM; goto out_unlock; } old_prog = link->prog; if (old_prog->type != new_prog->type || old_prog->expected_attach_type != new_prog->expected_attach_type) { err = -EINVAL; goto out_unlock; } if (old_prog == new_prog) { /* no-op, don't disturb drivers */ bpf_prog_put(new_prog); goto out_unlock; } mode = dev_xdp_mode(xdp_link->dev, xdp_link->flags); bpf_op = dev_xdp_bpf_op(xdp_link->dev, mode); err = dev_xdp_install(xdp_link->dev, mode, bpf_op, NULL, xdp_link->flags, new_prog); if (err) goto out_unlock; old_prog = xchg(&link->prog, new_prog); bpf_prog_put(old_prog); out_unlock: rtnl_unlock(); return err; } static const struct bpf_link_ops bpf_xdp_link_lops = { .release = bpf_xdp_link_release, .dealloc = bpf_xdp_link_dealloc, .detach = bpf_xdp_link_detach, .show_fdinfo = bpf_xdp_link_show_fdinfo, .fill_link_info = bpf_xdp_link_fill_link_info, .update_prog = bpf_xdp_link_update, }; int bpf_xdp_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) { struct net *net = current->nsproxy->net_ns; struct bpf_link_primer link_primer; struct netlink_ext_ack extack = {}; struct bpf_xdp_link *link; struct net_device *dev; int err, fd; rtnl_lock(); dev = dev_get_by_index(net, attr->link_create.target_ifindex); if (!dev) { rtnl_unlock(); return -EINVAL; } link = kzalloc(sizeof(*link), GFP_USER); if (!link) { err = -ENOMEM; goto unlock; } bpf_link_init(&link->link, BPF_LINK_TYPE_XDP, &bpf_xdp_link_lops, prog); link->dev = dev; link->flags = attr->link_create.flags; err = bpf_link_prime(&link->link, &link_primer); if (err) { kfree(link); goto unlock; } err = dev_xdp_attach_link(dev, &extack, link); rtnl_unlock(); if (err) { link->dev = NULL; bpf_link_cleanup(&link_primer); trace_bpf_xdp_link_attach_failed(extack._msg); goto out_put_dev; } fd = bpf_link_settle(&link_primer); /* link itself doesn't hold dev's refcnt to not complicate shutdown */ dev_put(dev); return fd; unlock: rtnl_unlock(); out_put_dev: dev_put(dev); return err; } /** * dev_change_xdp_fd - set or clear a bpf program for a device rx path * @dev: device * @extack: netlink extended ack * @fd: new program fd or negative value to clear * @expected_fd: old program fd that userspace expects to replace or clear * @flags: xdp-related flags * * Set or clear a bpf program for a device */ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, int fd, int expected_fd, u32 flags) { enum bpf_xdp_mode mode = dev_xdp_mode(dev, flags); struct bpf_prog 
*new_prog = NULL, *old_prog = NULL; int err; ASSERT_RTNL(); if (fd >= 0) { new_prog = bpf_prog_get_type_dev(fd, BPF_PROG_TYPE_XDP, mode != XDP_MODE_SKB); if (IS_ERR(new_prog)) return PTR_ERR(new_prog); } if (expected_fd >= 0) { old_prog = bpf_prog_get_type_dev(expected_fd, BPF_PROG_TYPE_XDP, mode != XDP_MODE_SKB); if (IS_ERR(old_prog)) { err = PTR_ERR(old_prog); old_prog = NULL; goto err_out; } } err = dev_xdp_attach(dev, extack, NULL, new_prog, old_prog, flags); err_out: if (err && new_prog) bpf_prog_put(new_prog); if (old_prog) bpf_prog_put(old_prog); return err; } u32 dev_get_min_mp_channel_count(const struct net_device *dev) { int i; ASSERT_RTNL(); for (i = dev->real_num_rx_queues - 1; i >= 0; i--) if (dev->_rx[i].mp_params.mp_priv) /* The channel count is the idx plus 1. */ return i + 1; return 0; } /** * dev_index_reserve() - allocate an ifindex in a namespace * @net: the applicable net namespace * @ifindex: requested ifindex, pass %0 to get one allocated * * Allocate a ifindex for a new device. Caller must either use the ifindex * to store the device (via list_netdevice()) or call dev_index_release() * to give the index up. * * Return: a suitable unique value for a new device interface number or -errno. */ static int dev_index_reserve(struct net *net, u32 ifindex) { int err; if (ifindex > INT_MAX) { DEBUG_NET_WARN_ON_ONCE(1); return -EINVAL; } if (!ifindex) err = xa_alloc_cyclic(&net->dev_by_index, &ifindex, NULL, xa_limit_31b, &net->ifindex, GFP_KERNEL); else err = xa_insert(&net->dev_by_index, ifindex, NULL, GFP_KERNEL); if (err < 0) return err; return ifindex; } static void dev_index_release(struct net *net, int ifindex) { /* Expect only unused indexes, unlist_netdevice() removes the used */ WARN_ON(xa_erase(&net->dev_by_index, ifindex)); } static bool from_cleanup_net(void) { #ifdef CONFIG_NET_NS return current == cleanup_net_task; #else return false; #endif } /* Delayed registration/unregisteration */ LIST_HEAD(net_todo_list); DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq); atomic_t dev_unreg_count = ATOMIC_INIT(0); static void net_set_todo(struct net_device *dev) { list_add_tail(&dev->todo_list, &net_todo_list); } static netdev_features_t netdev_sync_upper_features(struct net_device *lower, struct net_device *upper, netdev_features_t features) { netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES; netdev_features_t feature; int feature_bit; for_each_netdev_feature(upper_disables, feature_bit) { feature = __NETIF_F_BIT(feature_bit); if (!(upper->wanted_features & feature) && (features & feature)) { netdev_dbg(lower, "Dropping feature %pNF, upper dev %s has it off.\n", &feature, upper->name); features &= ~feature; } } return features; } static void netdev_sync_lower_features(struct net_device *upper, struct net_device *lower, netdev_features_t features) { netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES; netdev_features_t feature; int feature_bit; for_each_netdev_feature(upper_disables, feature_bit) { feature = __NETIF_F_BIT(feature_bit); if (!(features & feature) && (lower->features & feature)) { netdev_dbg(upper, "Disabling feature %pNF on lower dev %s.\n", &feature, lower->name); lower->wanted_features &= ~feature; __netdev_update_features(lower); if (unlikely(lower->features & feature)) netdev_WARN(upper, "failed to disable %pNF on %s!\n", &feature, lower->name); else netdev_features_change(lower); } } } static bool netdev_has_ip_or_hw_csum(netdev_features_t features) { netdev_features_t ip_csum_mask = NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM; bool ip_csum = 
(features & ip_csum_mask) == ip_csum_mask; bool hw_csum = features & NETIF_F_HW_CSUM; return ip_csum || hw_csum; } static netdev_features_t netdev_fix_features(struct net_device *dev, netdev_features_t features) { /* Fix illegal checksum combinations */ if ((features & NETIF_F_HW_CSUM) && (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) { netdev_warn(dev, "mixed HW and IP checksum settings.\n"); features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM); } /* TSO requires that SG is present as well. */ if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) { netdev_dbg(dev, "Dropping TSO features since no SG feature.\n"); features &= ~NETIF_F_ALL_TSO; } if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) && !(features & NETIF_F_IP_CSUM)) { netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n"); features &= ~NETIF_F_TSO; features &= ~NETIF_F_TSO_ECN; } if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) && !(features & NETIF_F_IPV6_CSUM)) { netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n"); features &= ~NETIF_F_TSO6; } /* TSO with IPv4 ID mangling requires IPv4 TSO be enabled */ if ((features & NETIF_F_TSO_MANGLEID) && !(features & NETIF_F_TSO)) features &= ~NETIF_F_TSO_MANGLEID; /* TSO ECN requires that TSO is present as well. */ if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN) features &= ~NETIF_F_TSO_ECN; /* Software GSO depends on SG. */ if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) { netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n"); features &= ~NETIF_F_GSO; } /* GSO partial features require GSO partial be set */ if ((features & dev->gso_partial_features) && !(features & NETIF_F_GSO_PARTIAL)) { netdev_dbg(dev, "Dropping partially supported GSO features since no GSO partial.\n"); features &= ~dev->gso_partial_features; } if (!(features & NETIF_F_RXCSUM)) { /* NETIF_F_GRO_HW implies doing RXCSUM since every packet * successfully merged by hardware must also have the * checksum verified by hardware. If the user does not * want to enable RXCSUM, logically, we should disable GRO_HW. 
*/ if (features & NETIF_F_GRO_HW) { netdev_dbg(dev, "Dropping NETIF_F_GRO_HW since no RXCSUM feature.\n"); features &= ~NETIF_F_GRO_HW; } } /* LRO/HW-GRO features cannot be combined with RX-FCS */ if (features & NETIF_F_RXFCS) { if (features & NETIF_F_LRO) { netdev_dbg(dev, "Dropping LRO feature since RX-FCS is requested.\n"); features &= ~NETIF_F_LRO; } if (features & NETIF_F_GRO_HW) { netdev_dbg(dev, "Dropping HW-GRO feature since RX-FCS is requested.\n"); features &= ~NETIF_F_GRO_HW; } } if ((features & NETIF_F_GRO_HW) && (features & NETIF_F_LRO)) { netdev_dbg(dev, "Dropping LRO feature since HW-GRO is requested.\n"); features &= ~NETIF_F_LRO; } if ((features & NETIF_F_HW_TLS_TX) && !netdev_has_ip_or_hw_csum(features)) { netdev_dbg(dev, "Dropping TLS TX HW offload feature since no CSUM feature.\n"); features &= ~NETIF_F_HW_TLS_TX; } if ((features & NETIF_F_HW_TLS_RX) && !(features & NETIF_F_RXCSUM)) { netdev_dbg(dev, "Dropping TLS RX HW offload feature since no RXCSUM feature.\n"); features &= ~NETIF_F_HW_TLS_RX; } if ((features & NETIF_F_GSO_UDP_L4) && !netdev_has_ip_or_hw_csum(features)) { netdev_dbg(dev, "Dropping USO feature since no CSUM feature.\n"); features &= ~NETIF_F_GSO_UDP_L4; } return features; } int __netdev_update_features(struct net_device *dev) { struct net_device *upper, *lower; netdev_features_t features; struct list_head *iter; int err = -1; ASSERT_RTNL(); features = netdev_get_wanted_features(dev); if (dev->netdev_ops->ndo_fix_features) features = dev->netdev_ops->ndo_fix_features(dev, features); /* driver might be less strict about feature dependencies */ features = netdev_fix_features(dev, features); /* some features can't be enabled if they're off on an upper device */ netdev_for_each_upper_dev_rcu(dev, upper, iter) features = netdev_sync_upper_features(dev, upper, features); if (dev->features == features) goto sync_lower; netdev_dbg(dev, "Features changed: %pNF -> %pNF\n", &dev->features, &features); if (dev->netdev_ops->ndo_set_features) err = dev->netdev_ops->ndo_set_features(dev, features); else err = 0; if (unlikely(err < 0)) { netdev_err(dev, "set_features() failed (%d); wanted %pNF, left %pNF\n", err, &features, &dev->features); /* return non-0 since some features might have changed and * it's better to fire a spurious notification than miss it */ return -1; } sync_lower: /* some features must be disabled on lower devices when disabled * on an upper device (think: bonding master or bridge) */ netdev_for_each_lower_dev(dev, lower, iter) netdev_sync_lower_features(dev, lower, features); if (!err) { netdev_features_t diff = features ^ dev->features; if (diff & NETIF_F_RX_UDP_TUNNEL_PORT) { /* udp_tunnel_{get,drop}_rx_info both need * NETIF_F_RX_UDP_TUNNEL_PORT enabled on the * device, or they won't do anything. * Thus we need to update dev->features * *before* calling udp_tunnel_get_rx_info, * but *after* calling udp_tunnel_drop_rx_info. */ if (features & NETIF_F_RX_UDP_TUNNEL_PORT) { dev->features = features; udp_tunnel_get_rx_info(dev); } else { udp_tunnel_drop_rx_info(dev); } } if (diff & NETIF_F_HW_VLAN_CTAG_FILTER) { if (features & NETIF_F_HW_VLAN_CTAG_FILTER) { dev->features = features; err |= vlan_get_rx_ctag_filter_info(dev); } else { vlan_drop_rx_ctag_filter_info(dev); } } if (diff & NETIF_F_HW_VLAN_STAG_FILTER) { if (features & NETIF_F_HW_VLAN_STAG_FILTER) { dev->features = features; err |= vlan_get_rx_stag_filter_info(dev); } else { vlan_drop_rx_stag_filter_info(dev); } } dev->features = features; } return err < 0 ? 
0 : 1; } /** * netdev_update_features - recalculate device features * @dev: the device to check * * Recalculate dev->features set and send notifications if it * has changed. Should be called after driver or hardware dependent * conditions might have changed that influence the features. */ void netdev_update_features(struct net_device *dev) { if (__netdev_update_features(dev)) netdev_features_change(dev); } EXPORT_SYMBOL(netdev_update_features); /** * netdev_change_features - recalculate device features * @dev: the device to check * * Recalculate dev->features set and send notifications even * if they have not changed. Should be called instead of * netdev_update_features() if also dev->vlan_features might * have changed to allow the changes to be propagated to stacked * VLAN devices. */ void netdev_change_features(struct net_device *dev) { __netdev_update_features(dev); netdev_features_change(dev); } EXPORT_SYMBOL(netdev_change_features); /** * netif_stacked_transfer_operstate - transfer operstate * @rootdev: the root or lower level device to transfer state from * @dev: the device to transfer operstate to * * Transfer operational state from root to device. This is normally * called when a stacking relationship exists between the root * device and the device(a leaf device). */ void netif_stacked_transfer_operstate(const struct net_device *rootdev, struct net_device *dev) { if (rootdev->operstate == IF_OPER_DORMANT) netif_dormant_on(dev); else netif_dormant_off(dev); if (rootdev->operstate == IF_OPER_TESTING) netif_testing_on(dev); else netif_testing_off(dev); if (netif_carrier_ok(rootdev)) netif_carrier_on(dev); else netif_carrier_off(dev); } EXPORT_SYMBOL(netif_stacked_transfer_operstate); static int netif_alloc_rx_queues(struct net_device *dev) { unsigned int i, count = dev->num_rx_queues; struct netdev_rx_queue *rx; size_t sz = count * sizeof(*rx); int err = 0; BUG_ON(count < 1); rx = kvzalloc(sz, GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL); if (!rx) return -ENOMEM; dev->_rx = rx; for (i = 0; i < count; i++) { rx[i].dev = dev; /* XDP RX-queue setup */ err = xdp_rxq_info_reg(&rx[i].xdp_rxq, dev, i, 0); if (err < 0) goto err_rxq_info; } return 0; err_rxq_info: /* Rollback successful reg's and free other resources */ while (i--) xdp_rxq_info_unreg(&rx[i].xdp_rxq); kvfree(dev->_rx); dev->_rx = NULL; return err; } static void netif_free_rx_queues(struct net_device *dev) { unsigned int i, count = dev->num_rx_queues; /* netif_alloc_rx_queues alloc failed, resources have been unreg'ed */ if (!dev->_rx) return; for (i = 0; i < count; i++) xdp_rxq_info_unreg(&dev->_rx[i].xdp_rxq); kvfree(dev->_rx); } static void netdev_init_one_queue(struct net_device *dev, struct netdev_queue *queue, void *_unused) { /* Initialize queue lock */ spin_lock_init(&queue->_xmit_lock); netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type); queue->xmit_lock_owner = -1; netdev_queue_numa_node_write(queue, NUMA_NO_NODE); queue->dev = dev; #ifdef CONFIG_BQL dql_init(&queue->dql, HZ); #endif } static void netif_free_tx_queues(struct net_device *dev) { kvfree(dev->_tx); } static int netif_alloc_netdev_queues(struct net_device *dev) { unsigned int count = dev->num_tx_queues; struct netdev_queue *tx; size_t sz = count * sizeof(*tx); if (count < 1 || count > 0xffff) return -EINVAL; tx = kvzalloc(sz, GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL); if (!tx) return -ENOMEM; dev->_tx = tx; netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL); spin_lock_init(&dev->tx_global_lock); return 0; } void 
netif_tx_stop_all_queues(struct net_device *dev) { unsigned int i; for (i = 0; i < dev->num_tx_queues; i++) { struct netdev_queue *txq = netdev_get_tx_queue(dev, i); netif_tx_stop_queue(txq); } } EXPORT_SYMBOL(netif_tx_stop_all_queues); static int netdev_do_alloc_pcpu_stats(struct net_device *dev) { void __percpu *v; /* Drivers implementing ndo_get_peer_dev must support tstat * accounting, so that skb_do_redirect() can bump the dev's * RX stats upon network namespace switch. */ if (dev->netdev_ops->ndo_get_peer_dev && dev->pcpu_stat_type != NETDEV_PCPU_STAT_TSTATS) return -EOPNOTSUPP; switch (dev->pcpu_stat_type) { case NETDEV_PCPU_STAT_NONE: return 0; case NETDEV_PCPU_STAT_LSTATS: v = dev->lstats = netdev_alloc_pcpu_stats(struct pcpu_lstats); break; case NETDEV_PCPU_STAT_TSTATS: v = dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); break; case NETDEV_PCPU_STAT_DSTATS: v = dev->dstats = netdev_alloc_pcpu_stats(struct pcpu_dstats); break; default: return -EINVAL; } return v ? 0 : -ENOMEM; } static void netdev_do_free_pcpu_stats(struct net_device *dev) { switch (dev->pcpu_stat_type) { case NETDEV_PCPU_STAT_NONE: return; case NETDEV_PCPU_STAT_LSTATS: free_percpu(dev->lstats); break; case NETDEV_PCPU_STAT_TSTATS: free_percpu(dev->tstats); break; case NETDEV_PCPU_STAT_DSTATS: free_percpu(dev->dstats); break; } } static void netdev_free_phy_link_topology(struct net_device *dev) { struct phy_link_topology *topo = dev->link_topo; if (IS_ENABLED(CONFIG_PHYLIB) && topo) { xa_destroy(&topo->phys); kfree(topo); dev->link_topo = NULL; } } /** * register_netdevice() - register a network device * @dev: device to register * * Take a prepared network device structure and make it externally accessible. * A %NETDEV_REGISTER message is sent to the netdev notifier chain. * Callers must hold the rtnl lock - you may want register_netdev() * instead of this. */ int register_netdevice(struct net_device *dev) { int ret; struct net *net = dev_net(dev); BUILD_BUG_ON(sizeof(netdev_features_t) * BITS_PER_BYTE < NETDEV_FEATURE_COUNT); BUG_ON(dev_boot_phase); ASSERT_RTNL(); might_sleep(); /* When net_device's are persistent, this will be fatal. */ BUG_ON(dev->reg_state != NETREG_UNINITIALIZED); BUG_ON(!net); ret = ethtool_check_ops(dev->ethtool_ops); if (ret) return ret; /* rss ctx ID 0 is reserved for the default context, start from 1 */ xa_init_flags(&dev->ethtool->rss_ctx, XA_FLAGS_ALLOC1); mutex_init(&dev->ethtool->rss_lock); spin_lock_init(&dev->addr_list_lock); netdev_set_addr_lockdep_class(dev); ret = dev_get_valid_name(net, dev, dev->name); if (ret < 0) goto out; ret = -ENOMEM; dev->name_node = netdev_name_node_head_alloc(dev); if (!dev->name_node) goto out; /* Init, if this function is available */ if (dev->netdev_ops->ndo_init) { ret = dev->netdev_ops->ndo_init(dev); if (ret) { if (ret > 0) ret = -EIO; goto err_free_name; } } if (((dev->hw_features | dev->features) & NETIF_F_HW_VLAN_CTAG_FILTER) && (!dev->netdev_ops->ndo_vlan_rx_add_vid || !dev->netdev_ops->ndo_vlan_rx_kill_vid)) { netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n"); ret = -EINVAL; goto err_uninit; } ret = netdev_do_alloc_pcpu_stats(dev); if (ret) goto err_uninit; ret = dev_index_reserve(net, dev->ifindex); if (ret < 0) goto err_free_pcpu; dev->ifindex = ret; /* Transfer changeable features to wanted_features and enable * software offloads (GSO and GRO). 
*/ dev->hw_features |= (NETIF_F_SOFT_FEATURES | NETIF_F_SOFT_FEATURES_OFF); dev->features |= NETIF_F_SOFT_FEATURES; if (dev->udp_tunnel_nic_info) { dev->features |= NETIF_F_RX_UDP_TUNNEL_PORT; dev->hw_features |= NETIF_F_RX_UDP_TUNNEL_PORT; } dev->wanted_features = dev->features & dev->hw_features; if (!(dev->flags & IFF_LOOPBACK)) dev->hw_features |= NETIF_F_NOCACHE_COPY; /* If IPv4 TCP segmentation offload is supported we should also * allow the device to enable segmenting the frame with the option * of ignoring a static IP ID value. This doesn't enable the * feature itself but allows the user to enable it later. */ if (dev->hw_features & NETIF_F_TSO) dev->hw_features |= NETIF_F_TSO_MANGLEID; if (dev->vlan_features & NETIF_F_TSO) dev->vlan_features |= NETIF_F_TSO_MANGLEID; if (dev->mpls_features & NETIF_F_TSO) dev->mpls_features |= NETIF_F_TSO_MANGLEID; if (dev->hw_enc_features & NETIF_F_TSO) dev->hw_enc_features |= NETIF_F_TSO_MANGLEID; /* Make NETIF_F_HIGHDMA inheritable to VLAN devices. */ dev->vlan_features |= NETIF_F_HIGHDMA; /* Make NETIF_F_SG inheritable to tunnel devices. */ dev->hw_enc_features |= NETIF_F_SG | NETIF_F_GSO_PARTIAL; /* Make NETIF_F_SG inheritable to MPLS. */ dev->mpls_features |= NETIF_F_SG; ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev); ret = notifier_to_errno(ret); if (ret) goto err_ifindex_release; ret = netdev_register_kobject(dev); netdev_lock(dev); WRITE_ONCE(dev->reg_state, ret ? NETREG_UNREGISTERED : NETREG_REGISTERED); netdev_unlock(dev); if (ret) goto err_uninit_notify; __netdev_update_features(dev); /* * Default initial state at registry is that the * device is present. */ set_bit(__LINK_STATE_PRESENT, &dev->state); linkwatch_init_dev(dev); dev_init_scheduler(dev); netdev_hold(dev, &dev->dev_registered_tracker, GFP_KERNEL); list_netdevice(dev); add_device_randomness(dev->dev_addr, dev->addr_len); /* If the device has permanent device address, driver should * set dev_addr and also addr_assign_type should be set to * NET_ADDR_PERM (default value). */ if (dev->addr_assign_type == NET_ADDR_PERM) memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len); /* Notify protocols, that a new device appeared. */ ret = call_netdevice_notifiers(NETDEV_REGISTER, dev); ret = notifier_to_errno(ret); if (ret) { /* Expect explicit free_netdev() on failure */ dev->needs_free_netdev = false; unregister_netdevice_queue(dev, NULL); goto out; } /* * Prevent userspace races by waiting until the network * device is fully setup before sending notifications. */ if (!dev->rtnl_link_ops || dev->rtnl_link_state == RTNL_LINK_INITIALIZED) rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL, 0, NULL); out: return ret; err_uninit_notify: call_netdevice_notifiers(NETDEV_PRE_UNINIT, dev); err_ifindex_release: dev_index_release(net, dev->ifindex); err_free_pcpu: netdev_do_free_pcpu_stats(dev); err_uninit: if (dev->netdev_ops->ndo_uninit) dev->netdev_ops->ndo_uninit(dev); if (dev->priv_destructor) dev->priv_destructor(dev); err_free_name: netdev_name_node_free(dev->name_node); goto out; } EXPORT_SYMBOL(register_netdevice); /* Initialize the core of a dummy net device. * The setup steps dummy netdevs need which normal netdevs get by going * through register_netdevice(). 
*/ static void init_dummy_netdev(struct net_device *dev) { /* make sure we BUG if trying to hit standard * register/unregister code path */ dev->reg_state = NETREG_DUMMY; /* a dummy interface is started by default */ set_bit(__LINK_STATE_PRESENT, &dev->state); set_bit(__LINK_STATE_START, &dev->state); /* Note : We dont allocate pcpu_refcnt for dummy devices, * because users of this 'device' dont need to change * its refcount. */ } /** * register_netdev - register a network device * @dev: device to register * * Take a completed network device structure and add it to the kernel * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier * chain. 0 is returned on success. A negative errno code is returned * on a failure to set up the device, or if the name is a duplicate. * * This is a wrapper around register_netdevice that takes the rtnl semaphore * and expands the device name if you passed a format string to * alloc_netdev. */ int register_netdev(struct net_device *dev) { struct net *net = dev_net(dev); int err; if (rtnl_net_lock_killable(net)) return -EINTR; err = register_netdevice(dev); rtnl_net_unlock(net); return err; } EXPORT_SYMBOL(register_netdev); int netdev_refcnt_read(const struct net_device *dev) { #ifdef CONFIG_PCPU_DEV_REFCNT int i, refcnt = 0; for_each_possible_cpu(i) refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i); return refcnt; #else return refcount_read(&dev->dev_refcnt); #endif } EXPORT_SYMBOL(netdev_refcnt_read); int netdev_unregister_timeout_secs __read_mostly = 10; #define WAIT_REFS_MIN_MSECS 1 #define WAIT_REFS_MAX_MSECS 250 /** * netdev_wait_allrefs_any - wait until all references are gone. * @list: list of net_devices to wait on * * This is called when unregistering network devices. * * Any protocol or device that holds a reference should register * for netdevice notification, and cleanup and put back the * reference if they receive an UNREGISTER event. * We can get stuck here if buggy protocols don't correctly * call dev_put. */ static struct net_device *netdev_wait_allrefs_any(struct list_head *list) { unsigned long rebroadcast_time, warning_time; struct net_device *dev; int wait = 0; rebroadcast_time = warning_time = jiffies; list_for_each_entry(dev, list, todo_list) if (netdev_refcnt_read(dev) == 1) return dev; while (true) { if (time_after(jiffies, rebroadcast_time + 1 * HZ)) { rtnl_lock(); /* Rebroadcast unregister notification */ list_for_each_entry(dev, list, todo_list) call_netdevice_notifiers(NETDEV_UNREGISTER, dev); __rtnl_unlock(); rcu_barrier(); rtnl_lock(); list_for_each_entry(dev, list, todo_list) if (test_bit(__LINK_STATE_LINKWATCH_PENDING, &dev->state)) { /* We must not have linkwatch events * pending on unregister. If this * happens, we simply run the queue * unscheduled, resulting in a noop * for this device. */ linkwatch_run_queue(); break; } __rtnl_unlock(); rebroadcast_time = jiffies; } rcu_barrier(); if (!wait) { wait = WAIT_REFS_MIN_MSECS; } else { msleep(wait); wait = min(wait << 1, WAIT_REFS_MAX_MSECS); } list_for_each_entry(dev, list, todo_list) if (netdev_refcnt_read(dev) == 1) return dev; if (time_after(jiffies, warning_time + READ_ONCE(netdev_unregister_timeout_secs) * HZ)) { list_for_each_entry(dev, list, todo_list) { pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n", dev->name, netdev_refcnt_read(dev)); ref_tracker_dir_print(&dev->refcnt_tracker, 10); } warning_time = jiffies; } } } /* The sequence is: * * rtnl_lock(); * ... * register_netdevice(x1); * register_netdevice(x2); * ... 
* unregister_netdevice(y1); * unregister_netdevice(y2); * ... * rtnl_unlock(); * free_netdev(y1); * free_netdev(y2); * * We are invoked by rtnl_unlock(). * This allows us to deal with problems: * 1) We can delete sysfs objects which invoke hotplug * without deadlocking with linkwatch via keventd. * 2) Since we run with the RTNL semaphore not held, we can sleep * safely in order to wait for the netdev refcnt to drop to zero. * * We must not return until all unregister events added during * the interval the lock was held have been completed. */ void netdev_run_todo(void) { struct net_device *dev, *tmp; struct list_head list; int cnt; #ifdef CONFIG_LOCKDEP struct list_head unlink_list; list_replace_init(&net_unlink_list, &unlink_list); while (!list_empty(&unlink_list)) { struct net_device *dev = list_first_entry(&unlink_list, struct net_device, unlink_list); list_del_init(&dev->unlink_list); dev->nested_level = dev->lower_level - 1; } #endif /* Snapshot list, allow later requests */ list_replace_init(&net_todo_list, &list); __rtnl_unlock(); /* Wait for rcu callbacks to finish before next phase */ if (!list_empty(&list)) rcu_barrier(); list_for_each_entry_safe(dev, tmp, &list, todo_list) { if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) { netdev_WARN(dev, "run_todo but not unregistering\n"); list_del(&dev->todo_list); continue; } netdev_lock(dev); WRITE_ONCE(dev->reg_state, NETREG_UNREGISTERED); netdev_unlock(dev); linkwatch_sync_dev(dev); } cnt = 0; while (!list_empty(&list)) { dev = netdev_wait_allrefs_any(&list); list_del(&dev->todo_list); /* paranoia */ BUG_ON(netdev_refcnt_read(dev) != 1); BUG_ON(!list_empty(&dev->ptype_all)); BUG_ON(!list_empty(&dev->ptype_specific)); WARN_ON(rcu_access_pointer(dev->ip_ptr)); WARN_ON(rcu_access_pointer(dev->ip6_ptr)); netdev_do_free_pcpu_stats(dev); if (dev->priv_destructor) dev->priv_destructor(dev); if (dev->needs_free_netdev) free_netdev(dev); cnt++; /* Free network device */ kobject_put(&dev->dev.kobj); } if (cnt && atomic_sub_and_test(cnt, &dev_unreg_count)) wake_up(&netdev_unregistering_wq); } /* Collate per-cpu network dstats statistics * * Read per-cpu network statistics from dev->dstats and populate the related * fields in @s. */ static void dev_fetch_dstats(struct rtnl_link_stats64 *s, const struct pcpu_dstats __percpu *dstats) { int cpu; for_each_possible_cpu(cpu) { u64 rx_packets, rx_bytes, rx_drops; u64 tx_packets, tx_bytes, tx_drops; const struct pcpu_dstats *stats; unsigned int start; stats = per_cpu_ptr(dstats, cpu); do { start = u64_stats_fetch_begin(&stats->syncp); rx_packets = u64_stats_read(&stats->rx_packets); rx_bytes = u64_stats_read(&stats->rx_bytes); rx_drops = u64_stats_read(&stats->rx_drops); tx_packets = u64_stats_read(&stats->tx_packets); tx_bytes = u64_stats_read(&stats->tx_bytes); tx_drops = u64_stats_read(&stats->tx_drops); } while (u64_stats_fetch_retry(&stats->syncp, start)); s->rx_packets += rx_packets; s->rx_bytes += rx_bytes; s->rx_dropped += rx_drops; s->tx_packets += tx_packets; s->tx_bytes += tx_bytes; s->tx_dropped += tx_drops; } } /* ndo_get_stats64 implementation for dtstats-based accounting. * * Populate @s from dev->stats and dev->dstats. This is used internally by the * core for NETDEV_PCPU_STAT_DSTAT-type stats collection. */ static void dev_get_dstats64(const struct net_device *dev, struct rtnl_link_stats64 *s) { netdev_stats_to_stats64(s, &dev->stats); dev_fetch_dstats(s, dev->dstats); } /* Convert net_device_stats to rtnl_link_stats64. 
rtnl_link_stats64 has * all the same fields in the same order as net_device_stats, with only * the type differing, but rtnl_link_stats64 may have additional fields * at the end for newer counters. */ void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64, const struct net_device_stats *netdev_stats) { size_t i, n = sizeof(*netdev_stats) / sizeof(atomic_long_t); const atomic_long_t *src = (atomic_long_t *)netdev_stats; u64 *dst = (u64 *)stats64; BUILD_BUG_ON(n > sizeof(*stats64) / sizeof(u64)); for (i = 0; i < n; i++) dst[i] = (unsigned long)atomic_long_read(&src[i]); /* zero out counters that only exist in rtnl_link_stats64 */ memset((char *)stats64 + n * sizeof(u64), 0, sizeof(*stats64) - n * sizeof(u64)); } EXPORT_SYMBOL(netdev_stats_to_stats64); static __cold struct net_device_core_stats __percpu *netdev_core_stats_alloc( struct net_device *dev) { struct net_device_core_stats __percpu *p; p = alloc_percpu_gfp(struct net_device_core_stats, GFP_ATOMIC | __GFP_NOWARN); if (p && cmpxchg(&dev->core_stats, NULL, p)) free_percpu(p); /* This READ_ONCE() pairs with the cmpxchg() above */ return READ_ONCE(dev->core_stats); } noinline void netdev_core_stats_inc(struct net_device *dev, u32 offset) { /* This READ_ONCE() pairs with the write in netdev_core_stats_alloc() */ struct net_device_core_stats __percpu *p = READ_ONCE(dev->core_stats); unsigned long __percpu *field; if (unlikely(!p)) { p = netdev_core_stats_alloc(dev); if (!p) return; } field = (unsigned long __percpu *)((void __percpu *)p + offset); this_cpu_inc(*field); } EXPORT_SYMBOL_GPL(netdev_core_stats_inc); /** * dev_get_stats - get network device statistics * @dev: device to get statistics from * @storage: place to store stats * * Get network statistics from device. Return @storage. * The device driver may provide its own method by setting * dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats; * otherwise the internal statistics structure is used. */ struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev, struct rtnl_link_stats64 *storage) { const struct net_device_ops *ops = dev->netdev_ops; const struct net_device_core_stats __percpu *p; /* * IPv{4,6} and udp tunnels share common stat helpers and use * different stat type (NETDEV_PCPU_STAT_TSTATS vs * NETDEV_PCPU_STAT_DSTATS). Ensure the accounting is consistent. 
*/ BUILD_BUG_ON(offsetof(struct pcpu_sw_netstats, rx_bytes) != offsetof(struct pcpu_dstats, rx_bytes)); BUILD_BUG_ON(offsetof(struct pcpu_sw_netstats, rx_packets) != offsetof(struct pcpu_dstats, rx_packets)); BUILD_BUG_ON(offsetof(struct pcpu_sw_netstats, tx_bytes) != offsetof(struct pcpu_dstats, tx_bytes)); BUILD_BUG_ON(offsetof(struct pcpu_sw_netstats, tx_packets) != offsetof(struct pcpu_dstats, tx_packets)); if (ops->ndo_get_stats64) { memset(storage, 0, sizeof(*storage)); ops->ndo_get_stats64(dev, storage); } else if (ops->ndo_get_stats) { netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev)); } else if (dev->pcpu_stat_type == NETDEV_PCPU_STAT_TSTATS) { dev_get_tstats64(dev, storage); } else if (dev->pcpu_stat_type == NETDEV_PCPU_STAT_DSTATS) { dev_get_dstats64(dev, storage); } else { netdev_stats_to_stats64(storage, &dev->stats); } /* This READ_ONCE() pairs with the write in netdev_core_stats_alloc() */ p = READ_ONCE(dev->core_stats); if (p) { const struct net_device_core_stats *core_stats; int i; for_each_possible_cpu(i) { core_stats = per_cpu_ptr(p, i); storage->rx_dropped += READ_ONCE(core_stats->rx_dropped); storage->tx_dropped += READ_ONCE(core_stats->tx_dropped); storage->rx_nohandler += READ_ONCE(core_stats->rx_nohandler); storage->rx_otherhost_dropped += READ_ONCE(core_stats->rx_otherhost_dropped); } } return storage; } EXPORT_SYMBOL(dev_get_stats); /** * dev_fetch_sw_netstats - get per-cpu network device statistics * @s: place to store stats * @netstats: per-cpu network stats to read from * * Read per-cpu network statistics and populate the related fields in @s. */ void dev_fetch_sw_netstats(struct rtnl_link_stats64 *s, const struct pcpu_sw_netstats __percpu *netstats) { int cpu; for_each_possible_cpu(cpu) { u64 rx_packets, rx_bytes, tx_packets, tx_bytes; const struct pcpu_sw_netstats *stats; unsigned int start; stats = per_cpu_ptr(netstats, cpu); do { start = u64_stats_fetch_begin(&stats->syncp); rx_packets = u64_stats_read(&stats->rx_packets); rx_bytes = u64_stats_read(&stats->rx_bytes); tx_packets = u64_stats_read(&stats->tx_packets); tx_bytes = u64_stats_read(&stats->tx_bytes); } while (u64_stats_fetch_retry(&stats->syncp, start)); s->rx_packets += rx_packets; s->rx_bytes += rx_bytes; s->tx_packets += tx_packets; s->tx_bytes += tx_bytes; } } EXPORT_SYMBOL_GPL(dev_fetch_sw_netstats); /** * dev_get_tstats64 - ndo_get_stats64 implementation * @dev: device to get statistics from * @s: place to store stats * * Populate @s from dev->stats and dev->tstats. Can be used as * ndo_get_stats64() callback. 
*/ void dev_get_tstats64(struct net_device *dev, struct rtnl_link_stats64 *s) { netdev_stats_to_stats64(s, &dev->stats); dev_fetch_sw_netstats(s, dev->tstats); } EXPORT_SYMBOL_GPL(dev_get_tstats64); struct netdev_queue *dev_ingress_queue_create(struct net_device *dev) { struct netdev_queue *queue = dev_ingress_queue(dev); #ifdef CONFIG_NET_CLS_ACT if (queue) return queue; queue = kzalloc(sizeof(*queue), GFP_KERNEL); if (!queue) return NULL; netdev_init_one_queue(dev, queue, NULL); RCU_INIT_POINTER(queue->qdisc, &noop_qdisc); RCU_INIT_POINTER(queue->qdisc_sleeping, &noop_qdisc); rcu_assign_pointer(dev->ingress_queue, queue); #endif return queue; } static const struct ethtool_ops default_ethtool_ops; void netdev_set_default_ethtool_ops(struct net_device *dev, const struct ethtool_ops *ops) { if (dev->ethtool_ops == &default_ethtool_ops) dev->ethtool_ops = ops; } EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops); /** * netdev_sw_irq_coalesce_default_on() - enable SW IRQ coalescing by default * @dev: netdev to enable the IRQ coalescing on * * Sets a conservative default for SW IRQ coalescing. Users can use * sysfs attributes to override the default values. */ void netdev_sw_irq_coalesce_default_on(struct net_device *dev) { WARN_ON(dev->reg_state == NETREG_REGISTERED); if (!IS_ENABLED(CONFIG_PREEMPT_RT)) { netdev_set_gro_flush_timeout(dev, 20000); netdev_set_defer_hard_irqs(dev, 1); } } EXPORT_SYMBOL_GPL(netdev_sw_irq_coalesce_default_on); /** * alloc_netdev_mqs - allocate network device * @sizeof_priv: size of private data to allocate space for * @name: device name format string * @name_assign_type: origin of device name * @setup: callback to initialize device * @txqs: the number of TX subqueues to allocate * @rxqs: the number of RX subqueues to allocate * * Allocates a struct net_device with private data area for driver use * and performs basic initialization. Also allocates subqueue structs * for each queue on the device. 
*/ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, unsigned char name_assign_type, void (*setup)(struct net_device *), unsigned int txqs, unsigned int rxqs) { struct net_device *dev; size_t napi_config_sz; unsigned int maxqs; BUG_ON(strlen(name) >= sizeof(dev->name)); if (txqs < 1) { pr_err("alloc_netdev: Unable to allocate device with zero queues\n"); return NULL; } if (rxqs < 1) { pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n"); return NULL; } maxqs = max(txqs, rxqs); dev = kvzalloc(struct_size(dev, priv, sizeof_priv), GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL); if (!dev) return NULL; dev->priv_len = sizeof_priv; ref_tracker_dir_init(&dev->refcnt_tracker, 128, name); #ifdef CONFIG_PCPU_DEV_REFCNT dev->pcpu_refcnt = alloc_percpu(int); if (!dev->pcpu_refcnt) goto free_dev; __dev_hold(dev); #else refcount_set(&dev->dev_refcnt, 1); #endif if (dev_addr_init(dev)) goto free_pcpu; dev_mc_init(dev); dev_uc_init(dev); dev_net_set(dev, &init_net); dev->gso_max_size = GSO_LEGACY_MAX_SIZE; dev->xdp_zc_max_segs = 1; dev->gso_max_segs = GSO_MAX_SEGS; dev->gro_max_size = GRO_LEGACY_MAX_SIZE; dev->gso_ipv4_max_size = GSO_LEGACY_MAX_SIZE; dev->gro_ipv4_max_size = GRO_LEGACY_MAX_SIZE; dev->tso_max_size = TSO_LEGACY_MAX_SIZE; dev->tso_max_segs = TSO_MAX_SEGS; dev->upper_level = 1; dev->lower_level = 1; #ifdef CONFIG_LOCKDEP dev->nested_level = 0; INIT_LIST_HEAD(&dev->unlink_list); #endif INIT_LIST_HEAD(&dev->napi_list); INIT_LIST_HEAD(&dev->unreg_list); INIT_LIST_HEAD(&dev->close_list); INIT_LIST_HEAD(&dev->link_watch_list); INIT_LIST_HEAD(&dev->adj_list.upper); INIT_LIST_HEAD(&dev->adj_list.lower); INIT_LIST_HEAD(&dev->ptype_all); INIT_LIST_HEAD(&dev->ptype_specific); INIT_LIST_HEAD(&dev->net_notifier_list); #ifdef CONFIG_NET_SCHED hash_init(dev->qdisc_hash); #endif mutex_init(&dev->lock); dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM; setup(dev); if (!dev->tx_queue_len) { dev->priv_flags |= IFF_NO_QUEUE; dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN; } dev->num_tx_queues = txqs; dev->real_num_tx_queues = txqs; if (netif_alloc_netdev_queues(dev)) goto free_all; dev->num_rx_queues = rxqs; dev->real_num_rx_queues = rxqs; if (netif_alloc_rx_queues(dev)) goto free_all; dev->ethtool = kzalloc(sizeof(*dev->ethtool), GFP_KERNEL_ACCOUNT); if (!dev->ethtool) goto free_all; dev->cfg = kzalloc(sizeof(*dev->cfg), GFP_KERNEL_ACCOUNT); if (!dev->cfg) goto free_all; dev->cfg_pending = dev->cfg; napi_config_sz = array_size(maxqs, sizeof(*dev->napi_config)); dev->napi_config = kvzalloc(napi_config_sz, GFP_KERNEL_ACCOUNT); if (!dev->napi_config) goto free_all; strscpy(dev->name, name); dev->name_assign_type = name_assign_type; dev->group = INIT_NETDEV_GROUP; if (!dev->ethtool_ops) dev->ethtool_ops = &default_ethtool_ops; nf_hook_netdev_init(dev); return dev; free_all: free_netdev(dev); return NULL; free_pcpu: #ifdef CONFIG_PCPU_DEV_REFCNT free_percpu(dev->pcpu_refcnt); free_dev: #endif kvfree(dev); return NULL; } EXPORT_SYMBOL(alloc_netdev_mqs); static void netdev_napi_exit(struct net_device *dev) { if (!list_empty(&dev->napi_list)) { struct napi_struct *p, *n; netdev_lock(dev); list_for_each_entry_safe(p, n, &dev->napi_list, dev_list) __netif_napi_del_locked(p); netdev_unlock(dev); synchronize_net(); } kvfree(dev->napi_config); } /** * free_netdev - free network device * @dev: device * * This function does the last stage of destroying an allocated device * interface. The reference to the device object is released. 
If this
 * is the last reference then it will be freed. Must be called in process
 * context.
 */
void free_netdev(struct net_device *dev)
{
	might_sleep();

	/* When called immediately after register_netdevice() failed the unwind
	 * handling may still be dismantling the device. Handle that case by
	 * deferring the free.
	 */
	if (dev->reg_state == NETREG_UNREGISTERING) {
		ASSERT_RTNL();
		dev->needs_free_netdev = true;
		return;
	}

	WARN_ON(dev->cfg != dev->cfg_pending);
	kfree(dev->cfg);
	kfree(dev->ethtool);
	netif_free_tx_queues(dev);
	netif_free_rx_queues(dev);

	kfree(rcu_dereference_protected(dev->ingress_queue, 1));

	/* Flush device addresses */
	dev_addr_flush(dev);

	netdev_napi_exit(dev);

	ref_tracker_dir_exit(&dev->refcnt_tracker);
#ifdef CONFIG_PCPU_DEV_REFCNT
	free_percpu(dev->pcpu_refcnt);
	dev->pcpu_refcnt = NULL;
#endif
	free_percpu(dev->core_stats);
	dev->core_stats = NULL;
	free_percpu(dev->xdp_bulkq);
	dev->xdp_bulkq = NULL;

	netdev_free_phy_link_topology(dev);

	mutex_destroy(&dev->lock);

	/* Compatibility with error handling in drivers */
	if (dev->reg_state == NETREG_UNINITIALIZED ||
	    dev->reg_state == NETREG_DUMMY) {
		kvfree(dev);
		return;
	}

	BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
	WRITE_ONCE(dev->reg_state, NETREG_RELEASED);

	/* will free via device release */
	put_device(&dev->dev);
}
EXPORT_SYMBOL(free_netdev);

/**
 * alloc_netdev_dummy - Allocate and initialize a dummy net device.
 * @sizeof_priv: size of private data to allocate space for
 *
 * Return: the allocated net_device on success, NULL otherwise
 */
struct net_device *alloc_netdev_dummy(int sizeof_priv)
{
	return alloc_netdev(sizeof_priv, "dummy#", NET_NAME_UNKNOWN,
			    init_dummy_netdev);
}
EXPORT_SYMBOL_GPL(alloc_netdev_dummy);

/**
 * synchronize_net - Synchronize with packet receive processing
 *
 * Wait for packets currently being received to be done.
 * Does not block later packets from starting.
 */
void synchronize_net(void)
{
	might_sleep();
	if (from_cleanup_net() || rtnl_is_locked())
		synchronize_rcu_expedited();
	else
		synchronize_rcu();
}
EXPORT_SYMBOL(synchronize_net);

static void netdev_rss_contexts_free(struct net_device *dev)
{
	struct ethtool_rxfh_context *ctx;
	unsigned long context;

	mutex_lock(&dev->ethtool->rss_lock);
	xa_for_each(&dev->ethtool->rss_ctx, context, ctx) {
		struct ethtool_rxfh_param rxfh;

		rxfh.indir = ethtool_rxfh_context_indir(ctx);
		rxfh.key = ethtool_rxfh_context_key(ctx);
		rxfh.hfunc = ctx->hfunc;
		rxfh.input_xfrm = ctx->input_xfrm;
		rxfh.rss_context = context;
		rxfh.rss_delete = true;

		xa_erase(&dev->ethtool->rss_ctx, context);
		if (dev->ethtool_ops->create_rxfh_context)
			dev->ethtool_ops->remove_rxfh_context(dev, ctx,
							      context, NULL);
		else
			dev->ethtool_ops->set_rxfh(dev, &rxfh, NULL);
		kfree(ctx);
	}
	xa_destroy(&dev->ethtool->rss_ctx);
	mutex_unlock(&dev->ethtool->rss_lock);
}

/**
 * unregister_netdevice_queue - remove device from the kernel
 * @dev: device
 * @head: list
 *
 * This function shuts down a device interface and removes it
 * from the kernel tables.
 * If @head is not NULL, the device is queued to be unregistered later.
 *
 * Callers must hold the rtnl semaphore. You may want
 * unregister_netdev() instead of this.
*/ void unregister_netdevice_queue(struct net_device *dev, struct list_head *head) { ASSERT_RTNL(); if (head) { list_move_tail(&dev->unreg_list, head); } else { LIST_HEAD(single); list_add(&dev->unreg_list, &single); unregister_netdevice_many(&single); } } EXPORT_SYMBOL(unregister_netdevice_queue); void unregister_netdevice_many_notify(struct list_head *head, u32 portid, const struct nlmsghdr *nlh) { struct net_device *dev, *tmp; LIST_HEAD(close_head); int cnt = 0; BUG_ON(dev_boot_phase); ASSERT_RTNL(); if (list_empty(head)) return; list_for_each_entry_safe(dev, tmp, head, unreg_list) { /* Some devices call without registering * for initialization unwind. Remove those * devices and proceed with the remaining. */ if (dev->reg_state == NETREG_UNINITIALIZED) { pr_debug("unregister_netdevice: device %s/%p never was registered\n", dev->name, dev); WARN_ON(1); list_del(&dev->unreg_list); continue; } dev->dismantle = true; BUG_ON(dev->reg_state != NETREG_REGISTERED); } /* If device is running, close it first. */ list_for_each_entry(dev, head, unreg_list) list_add_tail(&dev->close_list, &close_head); dev_close_many(&close_head, true); list_for_each_entry(dev, head, unreg_list) { /* And unlink it from device chain. */ unlist_netdevice(dev); netdev_lock(dev); WRITE_ONCE(dev->reg_state, NETREG_UNREGISTERING); netdev_unlock(dev); } flush_all_backlogs(); synchronize_net(); list_for_each_entry(dev, head, unreg_list) { struct sk_buff *skb = NULL; /* Shutdown queueing discipline. */ dev_shutdown(dev); dev_tcx_uninstall(dev); dev_xdp_uninstall(dev); bpf_dev_bound_netdev_unregister(dev); dev_dmabuf_uninstall(dev); netdev_offload_xstats_disable_all(dev); /* Notify protocols, that we are about to destroy * this device. They should clean all the things. */ call_netdevice_notifiers(NETDEV_UNREGISTER, dev); if (!dev->rtnl_link_ops || dev->rtnl_link_state == RTNL_LINK_INITIALIZED) skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, 0, GFP_KERNEL, NULL, 0, portid, nlh); /* * Flush the unicast and multicast chains */ dev_uc_flush(dev); dev_mc_flush(dev); netdev_name_node_alt_flush(dev); netdev_name_node_free(dev->name_node); netdev_rss_contexts_free(dev); call_netdevice_notifiers(NETDEV_PRE_UNINIT, dev); if (dev->netdev_ops->ndo_uninit) dev->netdev_ops->ndo_uninit(dev); mutex_destroy(&dev->ethtool->rss_lock); net_shaper_flush_netdev(dev); if (skb) rtmsg_ifinfo_send(skb, dev, GFP_KERNEL, portid, nlh); /* Notifier chain MUST detach us all upper devices. */ WARN_ON(netdev_has_any_upper_dev(dev)); WARN_ON(netdev_has_any_lower_dev(dev)); /* Remove entries from kobject tree */ netdev_unregister_kobject(dev); #ifdef CONFIG_XPS /* Remove XPS queueing entries */ netif_reset_xps_queues_gt(dev, 0); #endif } synchronize_net(); list_for_each_entry(dev, head, unreg_list) { netdev_put(dev, &dev->dev_registered_tracker); net_set_todo(dev); cnt++; } atomic_add(cnt, &dev_unreg_count); list_del(head); } /** * unregister_netdevice_many - unregister many devices * @head: list of devices * * Note: As most callers use a stack allocated list_head, * we force a list_del() to make sure stack won't be corrupted later. */ void unregister_netdevice_many(struct list_head *head) { unregister_netdevice_many_notify(head, 0, NULL); } EXPORT_SYMBOL(unregister_netdevice_many); /** * unregister_netdev - remove device from the kernel * @dev: device * * This function shuts down a device interface and removes it * from the kernel tables. * * This is just a wrapper for unregister_netdevice that takes * the rtnl semaphore. 
In general you want to use this and not * unregister_netdevice. */ void unregister_netdev(struct net_device *dev) { struct net *net = dev_net(dev); rtnl_net_lock(net); unregister_netdevice(dev); rtnl_net_unlock(net); } EXPORT_SYMBOL(unregister_netdev); /** * __dev_change_net_namespace - move device to different nethost namespace * @dev: device * @net: network namespace * @pat: If not NULL name pattern to try if the current device name * is already taken in the destination network namespace. * @new_ifindex: If not zero, specifies device index in the target * namespace. * * This function shuts down a device interface and moves it * to a new network namespace. On success 0 is returned, on * a failure a netagive errno code is returned. * * Callers must hold the rtnl semaphore. */ int __dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat, int new_ifindex) { struct netdev_name_node *name_node; struct net *net_old = dev_net(dev); char new_name[IFNAMSIZ] = {}; int err, new_nsid; ASSERT_RTNL(); /* Don't allow namespace local devices to be moved. */ err = -EINVAL; if (dev->netns_local) goto out; /* Ensure the device has been registered */ if (dev->reg_state != NETREG_REGISTERED) goto out; /* Get out if there is nothing todo */ err = 0; if (net_eq(net_old, net)) goto out; /* Pick the destination device name, and ensure * we can use it in the destination network namespace. */ err = -EEXIST; if (netdev_name_in_use(net, dev->name)) { /* We get here if we can't use the current device name */ if (!pat) goto out; err = dev_prep_valid_name(net, dev, pat, new_name, EEXIST); if (err < 0) goto out; } /* Check that none of the altnames conflicts. */ err = -EEXIST; netdev_for_each_altname(dev, name_node) if (netdev_name_in_use(net, name_node->name)) goto out; /* Check that new_ifindex isn't used yet. */ if (new_ifindex) { err = dev_index_reserve(net, new_ifindex); if (err < 0) goto out; } else { /* If there is an ifindex conflict assign a new one */ err = dev_index_reserve(net, dev->ifindex); if (err == -EBUSY) err = dev_index_reserve(net, 0); if (err < 0) goto out; new_ifindex = err; } /* * And now a mini version of register_netdevice unregister_netdevice. */ /* If device is running close it first. */ dev_close(dev); /* And unlink it from device chain */ unlist_netdevice(dev); synchronize_net(); /* Shutdown queueing discipline. */ dev_shutdown(dev); /* Notify protocols, that we are about to destroy * this device. They should clean all the things. * * Note that dev->reg_state stays at NETREG_REGISTERED. * This is wanted because this way 8021q and macvlan know * the device is just moving and can keep their slaves up. 
*/ call_netdevice_notifiers(NETDEV_UNREGISTER, dev); rcu_barrier(); new_nsid = peernet2id_alloc(dev_net(dev), net, GFP_KERNEL); rtmsg_ifinfo_newnet(RTM_DELLINK, dev, ~0U, GFP_KERNEL, &new_nsid, new_ifindex); /* * Flush the unicast and multicast chains */ dev_uc_flush(dev); dev_mc_flush(dev); /* Send a netdev-removed uevent to the old namespace */ kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE); netdev_adjacent_del_links(dev); /* Move per-net netdevice notifiers that are following the netdevice */ move_netdevice_notifiers_dev_net(dev, net); /* Actually switch the network namespace */ dev_net_set(dev, net); dev->ifindex = new_ifindex; if (new_name[0]) { /* Rename the netdev to prepared name */ write_seqlock_bh(&netdev_rename_lock); strscpy(dev->name, new_name, IFNAMSIZ); write_sequnlock_bh(&netdev_rename_lock); } /* Fixup kobjects */ dev_set_uevent_suppress(&dev->dev, 1); err = device_rename(&dev->dev, dev->name); dev_set_uevent_suppress(&dev->dev, 0); WARN_ON(err); /* Send a netdev-add uevent to the new namespace */ kobject_uevent(&dev->dev.kobj, KOBJ_ADD); netdev_adjacent_add_links(dev); /* Adapt owner in case owning user namespace of target network * namespace is different from the original one. */ err = netdev_change_owner(dev, net_old, net); WARN_ON(err); /* Add the device back in the hashes */ list_netdevice(dev); /* Notify protocols, that a new device appeared. */ call_netdevice_notifiers(NETDEV_REGISTER, dev); /* * Prevent userspace races by waiting until the network * device is fully setup before sending notifications. */ rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL, 0, NULL); synchronize_net(); err = 0; out: return err; } EXPORT_SYMBOL_GPL(__dev_change_net_namespace); static int dev_cpu_dead(unsigned int oldcpu) { struct sk_buff **list_skb; struct sk_buff *skb; unsigned int cpu; struct softnet_data *sd, *oldsd, *remsd = NULL; local_irq_disable(); cpu = smp_processor_id(); sd = &per_cpu(softnet_data, cpu); oldsd = &per_cpu(softnet_data, oldcpu); /* Find end of our completion_queue. */ list_skb = &sd->completion_queue; while (*list_skb) list_skb = &(*list_skb)->next; /* Append completion queue from offline CPU. */ *list_skb = oldsd->completion_queue; oldsd->completion_queue = NULL; /* Append output queue from offline CPU. */ if (oldsd->output_queue) { *sd->output_queue_tailp = oldsd->output_queue; sd->output_queue_tailp = oldsd->output_queue_tailp; oldsd->output_queue = NULL; oldsd->output_queue_tailp = &oldsd->output_queue; } /* Append NAPI poll list from offline CPU, with one exception : * process_backlog() must be called by cpu owning percpu backlog. * We properly handle process_queue & input_pkt_queue later. 
*/ while (!list_empty(&oldsd->poll_list)) { struct napi_struct *napi = list_first_entry(&oldsd->poll_list, struct napi_struct, poll_list); list_del_init(&napi->poll_list); if (napi->poll == process_backlog) napi->state &= NAPIF_STATE_THREADED; else ____napi_schedule(sd, napi); } raise_softirq_irqoff(NET_TX_SOFTIRQ); local_irq_enable(); if (!use_backlog_threads()) { #ifdef CONFIG_RPS remsd = oldsd->rps_ipi_list; oldsd->rps_ipi_list = NULL; #endif /* send out pending IPI's on offline CPU */ net_rps_send_ipi(remsd); } /* Process offline CPU's input_pkt_queue */ while ((skb = __skb_dequeue(&oldsd->process_queue))) { netif_rx(skb); rps_input_queue_head_incr(oldsd); } while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) { netif_rx(skb); rps_input_queue_head_incr(oldsd); } return 0; } /** * netdev_increment_features - increment feature set by one * @all: current feature set * @one: new feature set * @mask: mask feature set * * Computes a new feature set after adding a device with feature set * @one to the master device with current feature set @all. Will not * enable anything that is off in @mask. Returns the new feature set. */ netdev_features_t netdev_increment_features(netdev_features_t all, netdev_features_t one, netdev_features_t mask) { if (mask & NETIF_F_HW_CSUM) mask |= NETIF_F_CSUM_MASK; mask |= NETIF_F_VLAN_CHALLENGED; all |= one & (NETIF_F_ONE_FOR_ALL | NETIF_F_CSUM_MASK) & mask; all &= one | ~NETIF_F_ALL_FOR_ALL; /* If one device supports hw checksumming, set for all. */ if (all & NETIF_F_HW_CSUM) all &= ~(NETIF_F_CSUM_MASK & ~NETIF_F_HW_CSUM); return all; } EXPORT_SYMBOL(netdev_increment_features); static struct hlist_head * __net_init netdev_create_hash(void) { int i; struct hlist_head *hash; hash = kmalloc_array(NETDEV_HASHENTRIES, sizeof(*hash), GFP_KERNEL); if (hash != NULL) for (i = 0; i < NETDEV_HASHENTRIES; i++) INIT_HLIST_HEAD(&hash[i]); return hash; } /* Initialize per network namespace state */ static int __net_init netdev_init(struct net *net) { BUILD_BUG_ON(GRO_HASH_BUCKETS > 8 * sizeof_field(struct napi_struct, gro_bitmask)); INIT_LIST_HEAD(&net->dev_base_head); net->dev_name_head = netdev_create_hash(); if (net->dev_name_head == NULL) goto err_name; net->dev_index_head = netdev_create_hash(); if (net->dev_index_head == NULL) goto err_idx; xa_init_flags(&net->dev_by_index, XA_FLAGS_ALLOC1); RAW_INIT_NOTIFIER_HEAD(&net->netdev_chain); return 0; err_idx: kfree(net->dev_name_head); err_name: return -ENOMEM; } /** * netdev_drivername - network driver for the device * @dev: network device * * Determine network driver for device. */ const char *netdev_drivername(const struct net_device *dev) { const struct device_driver *driver; const struct device *parent; const char *empty = ""; parent = dev->dev.parent; if (!parent) return empty; driver = parent->driver; if (driver && driver->name) return driver->name; return empty; } static void __netdev_printk(const char *level, const struct net_device *dev, struct va_format *vaf) { if (dev && dev->dev.parent) { dev_printk_emit(level[1] - '0', dev->dev.parent, "%s %s %s%s: %pV", dev_driver_string(dev->dev.parent), dev_name(dev->dev.parent), netdev_name(dev), netdev_reg_state(dev), vaf); } else if (dev) { printk("%s%s%s: %pV", level, netdev_name(dev), netdev_reg_state(dev), vaf); } else { printk("%s(NULL net_device): %pV", level, vaf); } } void netdev_printk(const char *level, const struct net_device *dev, const char *format, ...) 
{ struct va_format vaf; va_list args; va_start(args, format); vaf.fmt = format; vaf.va = &args; __netdev_printk(level, dev, &vaf); va_end(args); } EXPORT_SYMBOL(netdev_printk); #define define_netdev_printk_level(func, level) \ void func(const struct net_device *dev, const char *fmt, ...) \ { \ struct va_format vaf; \ va_list args; \ \ va_start(args, fmt); \ \ vaf.fmt = fmt; \ vaf.va = &args; \ \ __netdev_printk(level, dev, &vaf); \ \ va_end(args); \ } \ EXPORT_SYMBOL(func); define_netdev_printk_level(netdev_emerg, KERN_EMERG); define_netdev_printk_level(netdev_alert, KERN_ALERT); define_netdev_printk_level(netdev_crit, KERN_CRIT); define_netdev_printk_level(netdev_err, KERN_ERR); define_netdev_printk_level(netdev_warn, KERN_WARNING); define_netdev_printk_level(netdev_notice, KERN_NOTICE); define_netdev_printk_level(netdev_info, KERN_INFO); static void __net_exit netdev_exit(struct net *net) { kfree(net->dev_name_head); kfree(net->dev_index_head); xa_destroy(&net->dev_by_index); if (net != &init_net) WARN_ON_ONCE(!list_empty(&net->dev_base_head)); } static struct pernet_operations __net_initdata netdev_net_ops = { .init = netdev_init, .exit = netdev_exit, }; static void __net_exit default_device_exit_net(struct net *net) { struct netdev_name_node *name_node, *tmp; struct net_device *dev, *aux; /* * Push all migratable network devices back to the * initial network namespace */ ASSERT_RTNL(); for_each_netdev_safe(net, dev, aux) { int err; char fb_name[IFNAMSIZ]; /* Ignore unmoveable devices (i.e. loopback) */ if (dev->netns_local) continue; /* Leave virtual devices for the generic cleanup */ if (dev->rtnl_link_ops && !dev->rtnl_link_ops->netns_refund) continue; /* Push remaining network devices to init_net */ snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex); if (netdev_name_in_use(&init_net, fb_name)) snprintf(fb_name, IFNAMSIZ, "dev%%d"); netdev_for_each_altname_safe(dev, name_node, tmp) if (netdev_name_in_use(&init_net, name_node->name)) __netdev_name_node_alt_destroy(name_node); err = dev_change_net_namespace(dev, &init_net, fb_name); if (err) { pr_emerg("%s: failed to move %s to init_net: %d\n", __func__, dev->name, err); BUG(); } } } static void __net_exit default_device_exit_batch(struct list_head *net_list) { /* At exit all network devices most be removed from a network * namespace. Do this in the reverse order of registration. * Do this across as many network namespaces as possible to * improve batching efficiency. 
*/ struct net_device *dev; struct net *net; LIST_HEAD(dev_kill_list); rtnl_lock(); list_for_each_entry(net, net_list, exit_list) { default_device_exit_net(net); cond_resched(); } list_for_each_entry(net, net_list, exit_list) { for_each_netdev_reverse(net, dev) { if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink) dev->rtnl_link_ops->dellink(dev, &dev_kill_list); else unregister_netdevice_queue(dev, &dev_kill_list); } } unregister_netdevice_many(&dev_kill_list); rtnl_unlock(); } static struct pernet_operations __net_initdata default_device_ops = { .exit_batch = default_device_exit_batch, }; static void __init net_dev_struct_check(void) { /* TX read-mostly hotpath */ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, priv_flags_fast); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, netdev_ops); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, header_ops); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, _tx); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, real_num_tx_queues); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, gso_max_size); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, gso_ipv4_max_size); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, gso_max_segs); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, gso_partial_features); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, num_tc); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, mtu); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, needed_headroom); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, tc_to_txq); #ifdef CONFIG_XPS CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, xps_maps); #endif #ifdef CONFIG_NETFILTER_EGRESS CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, nf_hooks_egress); #endif #ifdef CONFIG_NET_XGRESS CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, tcx_egress); #endif CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_tx, 160); /* TXRX read-mostly hotpath */ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, lstats); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, state); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, flags); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, hard_header_len); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, features); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, ip6_ptr); CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_txrx, 46); /* RX read-mostly hotpath */ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, ptype_specific); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, ifindex); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, real_num_rx_queues); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, _rx); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, gro_max_size); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, gro_ipv4_max_size); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, rx_handler); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, rx_handler_data); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, nd_net); 
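/* The remaining members of the RX read-mostly group (npinfo, tcx_ingress)
 * only exist when netpoll / xgress support is compiled in, so their
 * assertions sit under the matching #ifdef blocks before the final
 * group-size check.
 */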
#ifdef CONFIG_NETPOLL CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, npinfo); #endif #ifdef CONFIG_NET_XGRESS CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, tcx_ingress); #endif CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_rx, 92); } /* * Initialize the DEV module. At boot time this walks the device list and * unhooks any devices that fail to initialise (normally hardware not * present) and leaves us with a valid list of present and active devices. * */ /* We allocate 256 pages for each CPU if PAGE_SHIFT is 12 */ #define SYSTEM_PERCPU_PAGE_POOL_SIZE ((1 << 20) / PAGE_SIZE) static int net_page_pool_create(int cpuid) { #if IS_ENABLED(CONFIG_PAGE_POOL) struct page_pool_params page_pool_params = { .pool_size = SYSTEM_PERCPU_PAGE_POOL_SIZE, .flags = PP_FLAG_SYSTEM_POOL, .nid = cpu_to_mem(cpuid), }; struct page_pool *pp_ptr; int err; pp_ptr = page_pool_create_percpu(&page_pool_params, cpuid); if (IS_ERR(pp_ptr)) return -ENOMEM; err = xdp_reg_page_pool(pp_ptr); if (err) { page_pool_destroy(pp_ptr); return err; } per_cpu(system_page_pool, cpuid) = pp_ptr; #endif return 0; } static int backlog_napi_should_run(unsigned int cpu) { struct softnet_data *sd = per_cpu_ptr(&softnet_data, cpu); struct napi_struct *napi = &sd->backlog; return test_bit(NAPI_STATE_SCHED_THREADED, &napi->state); } static void run_backlog_napi(unsigned int cpu) { struct softnet_data *sd = per_cpu_ptr(&softnet_data, cpu); napi_threaded_poll_loop(&sd->backlog); } static void backlog_napi_setup(unsigned int cpu) { struct softnet_data *sd = per_cpu_ptr(&softnet_data, cpu); struct napi_struct *napi = &sd->backlog; napi->thread = this_cpu_read(backlog_napi); set_bit(NAPI_STATE_THREADED, &napi->state); } static struct smp_hotplug_thread backlog_threads = { .store = &backlog_napi, .thread_should_run = backlog_napi_should_run, .thread_fn = run_backlog_napi, .thread_comm = "backlog_napi/%u", .setup = backlog_napi_setup, }; /* * This is called single threaded during boot, so no need * to take the rtnl semaphore. */ static int __init net_dev_init(void) { int i, rc = -ENOMEM; BUG_ON(!dev_boot_phase); net_dev_struct_check(); if (dev_proc_init()) goto out; if (netdev_kobject_init()) goto out; for (i = 0; i < PTYPE_HASH_SIZE; i++) INIT_LIST_HEAD(&ptype_base[i]); if (register_pernet_subsys(&netdev_net_ops)) goto out; /* * Initialise the packet receive queues. */ flush_backlogs_fallback = flush_backlogs_alloc(); if (!flush_backlogs_fallback) goto out; for_each_possible_cpu(i) { struct softnet_data *sd = &per_cpu(softnet_data, i); skb_queue_head_init(&sd->input_pkt_queue); skb_queue_head_init(&sd->process_queue); #ifdef CONFIG_XFRM_OFFLOAD skb_queue_head_init(&sd->xfrm_backlog); #endif INIT_LIST_HEAD(&sd->poll_list); sd->output_queue_tailp = &sd->output_queue; #ifdef CONFIG_RPS INIT_CSD(&sd->csd, rps_trigger_softirq, sd); sd->cpu = i; #endif INIT_CSD(&sd->defer_csd, trigger_rx_softirq, sd); spin_lock_init(&sd->defer_lock); init_gro_hash(&sd->backlog); sd->backlog.poll = process_backlog; sd->backlog.weight = weight_p; INIT_LIST_HEAD(&sd->backlog.poll_list); if (net_page_pool_create(i)) goto out; } if (use_backlog_threads()) smpboot_register_percpu_thread(&backlog_threads); dev_boot_phase = 0; /* The loopback device is special if any other network devices * is present in a network namespace the loopback device must * be present. 
Since we now dynamically allocate and free the * loopback device ensure this invariant is maintained by * keeping the loopback device as the first device on the * list of network devices. Ensuring the loopback devices * is the first device that appears and the last network device * that disappears. */ if (register_pernet_device(&loopback_net_ops)) goto out; if (register_pernet_device(&default_device_ops)) goto out; open_softirq(NET_TX_SOFTIRQ, net_tx_action); open_softirq(NET_RX_SOFTIRQ, net_rx_action); rc = cpuhp_setup_state_nocalls(CPUHP_NET_DEV_DEAD, "net/dev:dead", NULL, dev_cpu_dead); WARN_ON(rc < 0); rc = 0; /* avoid static key IPIs to isolated CPUs */ if (housekeeping_enabled(HK_TYPE_MISC)) net_enable_timestamp(); out: if (rc < 0) { for_each_possible_cpu(i) { struct page_pool *pp_ptr; pp_ptr = per_cpu(system_page_pool, i); if (!pp_ptr) continue; xdp_unreg_page_pool(pp_ptr); page_pool_destroy(pp_ptr); per_cpu(system_page_pool, i) = NULL; } } return rc; } subsys_initcall(net_dev_init); |
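/*
 * Illustration (not part of the kernel sources): how a driver typically
 * drives the allocation/registration/teardown API implemented above.
 * The "myeth" name, struct myeth_priv and both helpers are hypothetical;
 * alloc_netdev_mqs(), netdev_priv(), register_netdev(),
 * unregister_netdev() and free_netdev() are the real entry points.
 */
#include <linux/etherdevice.h>
#include <linux/netdevice.h>

struct myeth_priv {
	int link_state;			/* hypothetical driver state */
};

static struct net_device *myeth_create(unsigned int txqs, unsigned int rxqs)
{
	struct net_device *dev;
	struct myeth_priv *priv;

	/* Allocates the net_device plus sizeof(*priv) bytes of private
	 * space and applies ether_setup() as the setup callback, i.e. the
	 * path through alloc_netdev_mqs() shown above.
	 */
	dev = alloc_netdev_mqs(sizeof(*priv), "myeth%d", NET_NAME_ENUM,
			       ether_setup, txqs, rxqs);
	if (!dev)
		return NULL;

	priv = netdev_priv(dev);	/* points into the allocation above */
	priv->link_state = 0;

	if (register_netdev(dev)) {	/* takes the rtnl lock internally */
		free_netdev(dev);	/* caller still owns dev on failure */
		return NULL;
	}
	return dev;
}

static void myeth_destroy(struct net_device *dev)
{
	unregister_netdev(dev);	/* rtnl-taking wrapper, as documented above */
	free_netdev(dev);	/* last stage of destruction, see free_netdev() */
}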
2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 | /* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) */ /* Copyright (c) 2002-2007 Volkswagen Group Electronic Research * Copyright (c) 2017 Pengutronix, Marc Kleine-Budde <kernel@pengutronix.de> * * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of Volkswagen nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * Alternatively, provided that this notice is retained in full, this * software may be distributed under the terms of the GNU General * Public License ("GPL") version 2, in which case the provisions of the * GPL apply INSTEAD OF those given above. * * The provided data structures and external interfaces from this code * are not restricted to be used by modules with a GPL compatible license. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH * DAMAGE. * */ #ifndef CAN_ML_H #define CAN_ML_H #include <linux/can.h> #include <linux/list.h> #include <linux/netdevice.h> #define CAN_SFF_RCV_ARRAY_SZ (1 << CAN_SFF_ID_BITS) #define CAN_EFF_RCV_HASH_BITS 10 #define CAN_EFF_RCV_ARRAY_SZ (1 << CAN_EFF_RCV_HASH_BITS) enum { RX_ERR, RX_ALL, RX_FIL, RX_INV, RX_MAX }; struct can_dev_rcv_lists { struct hlist_head rx[RX_MAX]; struct hlist_head rx_sff[CAN_SFF_RCV_ARRAY_SZ]; struct hlist_head rx_eff[CAN_EFF_RCV_ARRAY_SZ]; int entries; }; struct can_ml_priv { struct can_dev_rcv_lists dev_rcv_lists; #ifdef CAN_J1939 struct j1939_priv *j1939_priv; #endif }; static inline struct can_ml_priv *can_get_ml_priv(struct net_device *dev) { return netdev_get_ml_priv(dev, ML_PRIV_CAN); } static inline void can_set_ml_priv(struct net_device *dev, struct can_ml_priv *ml_priv) { netdev_set_ml_priv(dev, ml_priv, ML_PRIV_CAN); } #endif /* CAN_ML_H */ |
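/*
 * Usage sketch (hypothetical, not taken from net/can): reaching the
 * per-device receive lists declared above from a CAN netdevice.  The
 * helper name is made up; can_get_ml_priv(), struct can_ml_priv and
 * CAN_SFF_MASK are the real pieces.
 */
#include <linux/can.h>
#include <linux/netdevice.h>
#include "can-ml.h"		/* the header above; include path is an assumption */

static struct hlist_head *example_sff_rcv_list(struct net_device *dev,
					       canid_t can_id)
{
	struct can_ml_priv *ml_priv = can_get_ml_priv(dev);

	if (!ml_priv)
		return NULL;	/* dev is not a CAN ML device */

	/* Standard-frame IDs (11 bits) index rx_sff[] directly; masking
	 * with CAN_SFF_MASK keeps the index below CAN_SFF_RCV_ARRAY_SZ.
	 */
	return &ml_priv->dev_rcv_lists.rx_sff[can_id & CAN_SFF_MASK];
}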
81 536 398 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 | /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM maple_tree #if !defined(_TRACE_MM_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_MM_H #include <linux/tracepoint.h> struct ma_state; TRACE_EVENT(ma_op, TP_PROTO(const char *fn, struct ma_state *mas), TP_ARGS(fn, mas), TP_STRUCT__entry( __field(const char *, fn) __field(unsigned long, min) __field(unsigned long, max) __field(unsigned long, index) __field(unsigned long, last) __field(void *, node) ), TP_fast_assign( __entry->fn = fn; __entry->min = mas->min; __entry->max = mas->max; __entry->index = mas->index; __entry->last = mas->last; __entry->node = mas->node; ), TP_printk("%s\tNode: %p (%lu %lu) range: %lu-%lu", __entry->fn, (void *) __entry->node, (unsigned long) __entry->min, (unsigned long) __entry->max, (unsigned long) __entry->index, (unsigned long) __entry->last ) ) TRACE_EVENT(ma_read, TP_PROTO(const char *fn, struct ma_state *mas), TP_ARGS(fn, mas), TP_STRUCT__entry( __field(const char *, fn) __field(unsigned long, min) __field(unsigned long, max) __field(unsigned long, index) __field(unsigned long, last) __field(void *, node) ), TP_fast_assign( __entry->fn = fn; __entry->min = mas->min; __entry->max = mas->max; __entry->index = mas->index; __entry->last = mas->last; __entry->node = mas->node; ), TP_printk("%s\tNode: %p (%lu %lu) range: %lu-%lu", __entry->fn, (void *) __entry->node, (unsigned long) __entry->min, (unsigned long) __entry->max, (unsigned long) __entry->index, (unsigned long) __entry->last ) ) TRACE_EVENT(ma_write, TP_PROTO(const char *fn, struct ma_state *mas, unsigned long piv, void *val), TP_ARGS(fn, mas, piv, val), TP_STRUCT__entry( __field(const char *, fn) __field(unsigned long, min) __field(unsigned long, max) __field(unsigned long, index) __field(unsigned long, last) __field(unsigned long, piv) __field(void *, val) __field(void *, node) ), TP_fast_assign( __entry->fn = fn; __entry->min = mas->min; __entry->max = mas->max; __entry->index = mas->index; __entry->last = mas->last; __entry->piv = piv; __entry->val = val; __entry->node = mas->node; ), TP_printk("%s\tNode %p (%lu %lu) range:%lu-%lu piv (%lu) val %p", __entry->fn, (void *) __entry->node, (unsigned long) __entry->min, (unsigned long) __entry->max, (unsigned long) __entry->index, (unsigned long) __entry->last, (unsigned long) __entry->piv, (void *) __entry->val ) ) #endif /* _TRACE_MM_H */ /* This part must be outside protection */ #include <trace/define_trace.h> |
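/*
 * Emission sketch (hypothetical callers): exactly one translation unit
 * defines CREATE_TRACE_POINTS before including this header so that the
 * tracepoint bodies are instantiated; every other user includes it
 * plainly and calls the generated trace_ma_*() helpers.  The install
 * path <trace/events/maple_tree.h> is assumed here.
 */
#include <linux/maple_tree.h>		/* full definition of struct ma_state */

#define CREATE_TRACE_POINTS
#include <trace/events/maple_tree.h>

static void example_read_path(struct ma_state *mas)
{
	/* Matches TP_PROTO(const char *fn, struct ma_state *mas) above. */
	trace_ma_read(__func__, mas);
}

static void example_write_path(struct ma_state *mas, unsigned long piv,
			       void *val)
{
	/* Matches TP_PROTO(fn, mas, piv, val) of the ma_write event. */
	trace_ma_write(__func__, mas, piv, val);
}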
1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 | // SPDX-License-Identifier: GPL-2.0 // Generated by scripts/atomic/gen-atomic-long.sh // DO NOT MODIFY THIS FILE DIRECTLY #ifndef _LINUX_ATOMIC_LONG_H #define _LINUX_ATOMIC_LONG_H #include <linux/compiler.h> #include <asm/types.h> #ifdef CONFIG_64BIT typedef atomic64_t atomic_long_t; #define ATOMIC_LONG_INIT(i) ATOMIC64_INIT(i) #define atomic_long_cond_read_acquire atomic64_cond_read_acquire #define atomic_long_cond_read_relaxed atomic64_cond_read_relaxed #else typedef atomic_t atomic_long_t; #define ATOMIC_LONG_INIT(i) ATOMIC_INIT(i) #define atomic_long_cond_read_acquire atomic_cond_read_acquire #define atomic_long_cond_read_relaxed atomic_cond_read_relaxed #endif /** * raw_atomic_long_read() - atomic load with relaxed ordering * @v: pointer to atomic_long_t * * Atomically loads the value of @v with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_long_read() elsewhere. * * Return: The value loaded from @v. */ static __always_inline long raw_atomic_long_read(const atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_read(v); #else return raw_atomic_read(v); #endif } /** * raw_atomic_long_read_acquire() - atomic load with acquire ordering * @v: pointer to atomic_long_t * * Atomically loads the value of @v with acquire ordering. * * Safe to use in noinstr code; prefer atomic_long_read_acquire() elsewhere. * * Return: The value loaded from @v. */ static __always_inline long raw_atomic_long_read_acquire(const atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_read_acquire(v); #else return raw_atomic_read_acquire(v); #endif } /** * raw_atomic_long_set() - atomic set with relaxed ordering * @v: pointer to atomic_long_t * @i: long value to assign * * Atomically sets @v to @i with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_long_set() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic_long_set(atomic_long_t *v, long i) { #ifdef CONFIG_64BIT raw_atomic64_set(v, i); #else raw_atomic_set(v, i); #endif } /** * raw_atomic_long_set_release() - atomic set with release ordering * @v: pointer to atomic_long_t * @i: long value to assign * * Atomically sets @v to @i with release ordering. * * Safe to use in noinstr code; prefer atomic_long_set_release() elsewhere. * * Return: Nothing. 
*/ static __always_inline void raw_atomic_long_set_release(atomic_long_t *v, long i) { #ifdef CONFIG_64BIT raw_atomic64_set_release(v, i); #else raw_atomic_set_release(v, i); #endif } /** * raw_atomic_long_add() - atomic add with relaxed ordering * @i: long value to add * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_long_add() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic_long_add(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT raw_atomic64_add(i, v); #else raw_atomic_add(i, v); #endif } /** * raw_atomic_long_add_return() - atomic add with full ordering * @i: long value to add * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + @i) with full ordering. * * Safe to use in noinstr code; prefer atomic_long_add_return() elsewhere. * * Return: The updated value of @v. */ static __always_inline long raw_atomic_long_add_return(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_add_return(i, v); #else return raw_atomic_add_return(i, v); #endif } /** * raw_atomic_long_add_return_acquire() - atomic add with acquire ordering * @i: long value to add * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + @i) with acquire ordering. * * Safe to use in noinstr code; prefer atomic_long_add_return_acquire() elsewhere. * * Return: The updated value of @v. */ static __always_inline long raw_atomic_long_add_return_acquire(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_add_return_acquire(i, v); #else return raw_atomic_add_return_acquire(i, v); #endif } /** * raw_atomic_long_add_return_release() - atomic add with release ordering * @i: long value to add * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + @i) with release ordering. * * Safe to use in noinstr code; prefer atomic_long_add_return_release() elsewhere. * * Return: The updated value of @v. */ static __always_inline long raw_atomic_long_add_return_release(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_add_return_release(i, v); #else return raw_atomic_add_return_release(i, v); #endif } /** * raw_atomic_long_add_return_relaxed() - atomic add with relaxed ordering * @i: long value to add * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_long_add_return_relaxed() elsewhere. * * Return: The updated value of @v. */ static __always_inline long raw_atomic_long_add_return_relaxed(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_add_return_relaxed(i, v); #else return raw_atomic_add_return_relaxed(i, v); #endif } /** * raw_atomic_long_fetch_add() - atomic add with full ordering * @i: long value to add * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + @i) with full ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_add() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_add(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_add(i, v); #else return raw_atomic_fetch_add(i, v); #endif } /** * raw_atomic_long_fetch_add_acquire() - atomic add with acquire ordering * @i: long value to add * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + @i) with acquire ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_add_acquire() elsewhere. * * Return: The original value of @v. 
*/ static __always_inline long raw_atomic_long_fetch_add_acquire(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_add_acquire(i, v); #else return raw_atomic_fetch_add_acquire(i, v); #endif } /** * raw_atomic_long_fetch_add_release() - atomic add with release ordering * @i: long value to add * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + @i) with release ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_add_release() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_add_release(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_add_release(i, v); #else return raw_atomic_fetch_add_release(i, v); #endif } /** * raw_atomic_long_fetch_add_relaxed() - atomic add with relaxed ordering * @i: long value to add * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_add_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_add_relaxed(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_add_relaxed(i, v); #else return raw_atomic_fetch_add_relaxed(i, v); #endif } /** * raw_atomic_long_sub() - atomic subtract with relaxed ordering * @i: long value to subtract * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_long_sub() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic_long_sub(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT raw_atomic64_sub(i, v); #else raw_atomic_sub(i, v); #endif } /** * raw_atomic_long_sub_return() - atomic subtract with full ordering * @i: long value to subtract * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - @i) with full ordering. * * Safe to use in noinstr code; prefer atomic_long_sub_return() elsewhere. * * Return: The updated value of @v. */ static __always_inline long raw_atomic_long_sub_return(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_sub_return(i, v); #else return raw_atomic_sub_return(i, v); #endif } /** * raw_atomic_long_sub_return_acquire() - atomic subtract with acquire ordering * @i: long value to subtract * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - @i) with acquire ordering. * * Safe to use in noinstr code; prefer atomic_long_sub_return_acquire() elsewhere. * * Return: The updated value of @v. */ static __always_inline long raw_atomic_long_sub_return_acquire(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_sub_return_acquire(i, v); #else return raw_atomic_sub_return_acquire(i, v); #endif } /** * raw_atomic_long_sub_return_release() - atomic subtract with release ordering * @i: long value to subtract * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - @i) with release ordering. * * Safe to use in noinstr code; prefer atomic_long_sub_return_release() elsewhere. * * Return: The updated value of @v. */ static __always_inline long raw_atomic_long_sub_return_release(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_sub_return_release(i, v); #else return raw_atomic_sub_return_release(i, v); #endif } /** * raw_atomic_long_sub_return_relaxed() - atomic subtract with relaxed ordering * @i: long value to subtract * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - @i) with relaxed ordering. 
* * Safe to use in noinstr code; prefer atomic_long_sub_return_relaxed() elsewhere. * * Return: The updated value of @v. */ static __always_inline long raw_atomic_long_sub_return_relaxed(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_sub_return_relaxed(i, v); #else return raw_atomic_sub_return_relaxed(i, v); #endif } /** * raw_atomic_long_fetch_sub() - atomic subtract with full ordering * @i: long value to subtract * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - @i) with full ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_sub() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_sub(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_sub(i, v); #else return raw_atomic_fetch_sub(i, v); #endif } /** * raw_atomic_long_fetch_sub_acquire() - atomic subtract with acquire ordering * @i: long value to subtract * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - @i) with acquire ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_sub_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_sub_acquire(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_sub_acquire(i, v); #else return raw_atomic_fetch_sub_acquire(i, v); #endif } /** * raw_atomic_long_fetch_sub_release() - atomic subtract with release ordering * @i: long value to subtract * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - @i) with release ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_sub_release() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_sub_release(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_sub_release(i, v); #else return raw_atomic_fetch_sub_release(i, v); #endif } /** * raw_atomic_long_fetch_sub_relaxed() - atomic subtract with relaxed ordering * @i: long value to subtract * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_sub_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_sub_relaxed(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_sub_relaxed(i, v); #else return raw_atomic_fetch_sub_relaxed(i, v); #endif } /** * raw_atomic_long_inc() - atomic increment with relaxed ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + 1) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_long_inc() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic_long_inc(atomic_long_t *v) { #ifdef CONFIG_64BIT raw_atomic64_inc(v); #else raw_atomic_inc(v); #endif } /** * raw_atomic_long_inc_return() - atomic increment with full ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + 1) with full ordering. * * Safe to use in noinstr code; prefer atomic_long_inc_return() elsewhere. * * Return: The updated value of @v. */ static __always_inline long raw_atomic_long_inc_return(atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_inc_return(v); #else return raw_atomic_inc_return(v); #endif } /** * raw_atomic_long_inc_return_acquire() - atomic increment with acquire ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + 1) with acquire ordering. 
* * Safe to use in noinstr code; prefer atomic_long_inc_return_acquire() elsewhere. * * Return: The updated value of @v. */ static __always_inline long raw_atomic_long_inc_return_acquire(atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_inc_return_acquire(v); #else return raw_atomic_inc_return_acquire(v); #endif } /** * raw_atomic_long_inc_return_release() - atomic increment with release ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + 1) with release ordering. * * Safe to use in noinstr code; prefer atomic_long_inc_return_release() elsewhere. * * Return: The updated value of @v. */ static __always_inline long raw_atomic_long_inc_return_release(atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_inc_return_release(v); #else return raw_atomic_inc_return_release(v); #endif } /** * raw_atomic_long_inc_return_relaxed() - atomic increment with relaxed ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + 1) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_long_inc_return_relaxed() elsewhere. * * Return: The updated value of @v. */ static __always_inline long raw_atomic_long_inc_return_relaxed(atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_inc_return_relaxed(v); #else return raw_atomic_inc_return_relaxed(v); #endif } /** * raw_atomic_long_fetch_inc() - atomic increment with full ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + 1) with full ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_inc() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_inc(atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_inc(v); #else return raw_atomic_fetch_inc(v); #endif } /** * raw_atomic_long_fetch_inc_acquire() - atomic increment with acquire ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + 1) with acquire ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_inc_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_inc_acquire(atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_inc_acquire(v); #else return raw_atomic_fetch_inc_acquire(v); #endif } /** * raw_atomic_long_fetch_inc_release() - atomic increment with release ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + 1) with release ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_inc_release() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_inc_release(atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_inc_release(v); #else return raw_atomic_fetch_inc_release(v); #endif } /** * raw_atomic_long_fetch_inc_relaxed() - atomic increment with relaxed ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + 1) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_inc_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_inc_relaxed(atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_inc_relaxed(v); #else return raw_atomic_fetch_inc_relaxed(v); #endif } /** * raw_atomic_long_dec() - atomic decrement with relaxed ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - 1) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_long_dec() elsewhere. * * Return: Nothing. 
*/ static __always_inline void raw_atomic_long_dec(atomic_long_t *v) { #ifdef CONFIG_64BIT raw_atomic64_dec(v); #else raw_atomic_dec(v); #endif } /** * raw_atomic_long_dec_return() - atomic decrement with full ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - 1) with full ordering. * * Safe to use in noinstr code; prefer atomic_long_dec_return() elsewhere. * * Return: The updated value of @v. */ static __always_inline long raw_atomic_long_dec_return(atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_dec_return(v); #else return raw_atomic_dec_return(v); #endif } /** * raw_atomic_long_dec_return_acquire() - atomic decrement with acquire ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - 1) with acquire ordering. * * Safe to use in noinstr code; prefer atomic_long_dec_return_acquire() elsewhere. * * Return: The updated value of @v. */ static __always_inline long raw_atomic_long_dec_return_acquire(atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_dec_return_acquire(v); #else return raw_atomic_dec_return_acquire(v); #endif } /** * raw_atomic_long_dec_return_release() - atomic decrement with release ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - 1) with release ordering. * * Safe to use in noinstr code; prefer atomic_long_dec_return_release() elsewhere. * * Return: The updated value of @v. */ static __always_inline long raw_atomic_long_dec_return_release(atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_dec_return_release(v); #else return raw_atomic_dec_return_release(v); #endif } /** * raw_atomic_long_dec_return_relaxed() - atomic decrement with relaxed ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - 1) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_long_dec_return_relaxed() elsewhere. * * Return: The updated value of @v. */ static __always_inline long raw_atomic_long_dec_return_relaxed(atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_dec_return_relaxed(v); #else return raw_atomic_dec_return_relaxed(v); #endif } /** * raw_atomic_long_fetch_dec() - atomic decrement with full ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - 1) with full ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_dec() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_dec(atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_dec(v); #else return raw_atomic_fetch_dec(v); #endif } /** * raw_atomic_long_fetch_dec_acquire() - atomic decrement with acquire ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - 1) with acquire ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_dec_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_dec_acquire(atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_dec_acquire(v); #else return raw_atomic_fetch_dec_acquire(v); #endif } /** * raw_atomic_long_fetch_dec_release() - atomic decrement with release ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - 1) with release ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_dec_release() elsewhere. * * Return: The original value of @v. 
*/ static __always_inline long raw_atomic_long_fetch_dec_release(atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_dec_release(v); #else return raw_atomic_fetch_dec_release(v); #endif } /** * raw_atomic_long_fetch_dec_relaxed() - atomic decrement with relaxed ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - 1) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_dec_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_dec_relaxed(atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_dec_relaxed(v); #else return raw_atomic_fetch_dec_relaxed(v); #endif } /** * raw_atomic_long_and() - atomic bitwise AND with relaxed ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v & @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_long_and() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic_long_and(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT raw_atomic64_and(i, v); #else raw_atomic_and(i, v); #endif } /** * raw_atomic_long_fetch_and() - atomic bitwise AND with full ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v & @i) with full ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_and() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_and(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_and(i, v); #else return raw_atomic_fetch_and(i, v); #endif } /** * raw_atomic_long_fetch_and_acquire() - atomic bitwise AND with acquire ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v & @i) with acquire ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_and_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_and_acquire(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_and_acquire(i, v); #else return raw_atomic_fetch_and_acquire(i, v); #endif } /** * raw_atomic_long_fetch_and_release() - atomic bitwise AND with release ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v & @i) with release ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_and_release() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_and_release(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_and_release(i, v); #else return raw_atomic_fetch_and_release(i, v); #endif } /** * raw_atomic_long_fetch_and_relaxed() - atomic bitwise AND with relaxed ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v & @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_and_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_and_relaxed(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_and_relaxed(i, v); #else return raw_atomic_fetch_and_relaxed(i, v); #endif } /** * raw_atomic_long_andnot() - atomic bitwise AND NOT with relaxed ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v & ~@i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_long_andnot() elsewhere. * * Return: Nothing. 
*/ static __always_inline void raw_atomic_long_andnot(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT raw_atomic64_andnot(i, v); #else raw_atomic_andnot(i, v); #endif } /** * raw_atomic_long_fetch_andnot() - atomic bitwise AND NOT with full ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v & ~@i) with full ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_andnot() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_andnot(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_andnot(i, v); #else return raw_atomic_fetch_andnot(i, v); #endif } /** * raw_atomic_long_fetch_andnot_acquire() - atomic bitwise AND NOT with acquire ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v & ~@i) with acquire ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_andnot_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_andnot_acquire(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_andnot_acquire(i, v); #else return raw_atomic_fetch_andnot_acquire(i, v); #endif } /** * raw_atomic_long_fetch_andnot_release() - atomic bitwise AND NOT with release ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v & ~@i) with release ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_andnot_release() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_andnot_release(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_andnot_release(i, v); #else return raw_atomic_fetch_andnot_release(i, v); #endif } /** * raw_atomic_long_fetch_andnot_relaxed() - atomic bitwise AND NOT with relaxed ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v & ~@i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_andnot_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_andnot_relaxed(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_andnot_relaxed(i, v); #else return raw_atomic_fetch_andnot_relaxed(i, v); #endif } /** * raw_atomic_long_or() - atomic bitwise OR with relaxed ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v | @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_long_or() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic_long_or(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT raw_atomic64_or(i, v); #else raw_atomic_or(i, v); #endif } /** * raw_atomic_long_fetch_or() - atomic bitwise OR with full ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v | @i) with full ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_or() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_or(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_or(i, v); #else return raw_atomic_fetch_or(i, v); #endif } /** * raw_atomic_long_fetch_or_acquire() - atomic bitwise OR with acquire ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v | @i) with acquire ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_or_acquire() elsewhere. * * Return: The original value of @v. 
*/ static __always_inline long raw_atomic_long_fetch_or_acquire(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_or_acquire(i, v); #else return raw_atomic_fetch_or_acquire(i, v); #endif } /** * raw_atomic_long_fetch_or_release() - atomic bitwise OR with release ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v | @i) with release ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_or_release() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_or_release(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_or_release(i, v); #else return raw_atomic_fetch_or_release(i, v); #endif } /** * raw_atomic_long_fetch_or_relaxed() - atomic bitwise OR with relaxed ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v | @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_or_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_or_relaxed(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_or_relaxed(i, v); #else return raw_atomic_fetch_or_relaxed(i, v); #endif } /** * raw_atomic_long_xor() - atomic bitwise XOR with relaxed ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v ^ @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_long_xor() elsewhere. * * Return: Nothing. */ static __always_inline void raw_atomic_long_xor(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT raw_atomic64_xor(i, v); #else raw_atomic_xor(i, v); #endif } /** * raw_atomic_long_fetch_xor() - atomic bitwise XOR with full ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v ^ @i) with full ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_xor() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_xor(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_xor(i, v); #else return raw_atomic_fetch_xor(i, v); #endif } /** * raw_atomic_long_fetch_xor_acquire() - atomic bitwise XOR with acquire ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v ^ @i) with acquire ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_xor_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_xor_acquire(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_xor_acquire(i, v); #else return raw_atomic_fetch_xor_acquire(i, v); #endif } /** * raw_atomic_long_fetch_xor_release() - atomic bitwise XOR with release ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v ^ @i) with release ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_xor_release() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_xor_release(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_xor_release(i, v); #else return raw_atomic_fetch_xor_release(i, v); #endif } /** * raw_atomic_long_fetch_xor_relaxed() - atomic bitwise XOR with relaxed ordering * @i: long value * @v: pointer to atomic_long_t * * Atomically updates @v to (@v ^ @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_long_fetch_xor_relaxed() elsewhere. 
* * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_xor_relaxed(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_xor_relaxed(i, v); #else return raw_atomic_fetch_xor_relaxed(i, v); #endif } /** * raw_atomic_long_xchg() - atomic exchange with full ordering * @v: pointer to atomic_long_t * @new: long value to assign * * Atomically updates @v to @new with full ordering. * * Safe to use in noinstr code; prefer atomic_long_xchg() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_xchg(atomic_long_t *v, long new) { #ifdef CONFIG_64BIT return raw_atomic64_xchg(v, new); #else return raw_atomic_xchg(v, new); #endif } /** * raw_atomic_long_xchg_acquire() - atomic exchange with acquire ordering * @v: pointer to atomic_long_t * @new: long value to assign * * Atomically updates @v to @new with acquire ordering. * * Safe to use in noinstr code; prefer atomic_long_xchg_acquire() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_xchg_acquire(atomic_long_t *v, long new) { #ifdef CONFIG_64BIT return raw_atomic64_xchg_acquire(v, new); #else return raw_atomic_xchg_acquire(v, new); #endif } /** * raw_atomic_long_xchg_release() - atomic exchange with release ordering * @v: pointer to atomic_long_t * @new: long value to assign * * Atomically updates @v to @new with release ordering. * * Safe to use in noinstr code; prefer atomic_long_xchg_release() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_xchg_release(atomic_long_t *v, long new) { #ifdef CONFIG_64BIT return raw_atomic64_xchg_release(v, new); #else return raw_atomic_xchg_release(v, new); #endif } /** * raw_atomic_long_xchg_relaxed() - atomic exchange with relaxed ordering * @v: pointer to atomic_long_t * @new: long value to assign * * Atomically updates @v to @new with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_long_xchg_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_xchg_relaxed(atomic_long_t *v, long new) { #ifdef CONFIG_64BIT return raw_atomic64_xchg_relaxed(v, new); #else return raw_atomic_xchg_relaxed(v, new); #endif } /** * raw_atomic_long_cmpxchg() - atomic compare and exchange with full ordering * @v: pointer to atomic_long_t * @old: long value to compare with * @new: long value to assign * * If (@v == @old), atomically updates @v to @new with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_long_cmpxchg() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_cmpxchg(atomic_long_t *v, long old, long new) { #ifdef CONFIG_64BIT return raw_atomic64_cmpxchg(v, old, new); #else return raw_atomic_cmpxchg(v, old, new); #endif } /** * raw_atomic_long_cmpxchg_acquire() - atomic compare and exchange with acquire ordering * @v: pointer to atomic_long_t * @old: long value to compare with * @new: long value to assign * * If (@v == @old), atomically updates @v to @new with acquire ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_long_cmpxchg_acquire() elsewhere. * * Return: The original value of @v. 
*/ static __always_inline long raw_atomic_long_cmpxchg_acquire(atomic_long_t *v, long old, long new) { #ifdef CONFIG_64BIT return raw_atomic64_cmpxchg_acquire(v, old, new); #else return raw_atomic_cmpxchg_acquire(v, old, new); #endif } /** * raw_atomic_long_cmpxchg_release() - atomic compare and exchange with release ordering * @v: pointer to atomic_long_t * @old: long value to compare with * @new: long value to assign * * If (@v == @old), atomically updates @v to @new with release ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_long_cmpxchg_release() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_cmpxchg_release(atomic_long_t *v, long old, long new) { #ifdef CONFIG_64BIT return raw_atomic64_cmpxchg_release(v, old, new); #else return raw_atomic_cmpxchg_release(v, old, new); #endif } /** * raw_atomic_long_cmpxchg_relaxed() - atomic compare and exchange with relaxed ordering * @v: pointer to atomic_long_t * @old: long value to compare with * @new: long value to assign * * If (@v == @old), atomically updates @v to @new with relaxed ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_long_cmpxchg_relaxed() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_cmpxchg_relaxed(atomic_long_t *v, long old, long new) { #ifdef CONFIG_64BIT return raw_atomic64_cmpxchg_relaxed(v, old, new); #else return raw_atomic_cmpxchg_relaxed(v, old, new); #endif } /** * raw_atomic_long_try_cmpxchg() - atomic compare and exchange with full ordering * @v: pointer to atomic_long_t * @old: pointer to long value to compare with * @new: long value to assign * * If (@v == @old), atomically updates @v to @new with full ordering. * Otherwise, @v is not modified, @old is updated to the current value of @v, * and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_long_try_cmpxchg() elsewhere. * * Return: @true if the exchange occured, @false otherwise. */ static __always_inline bool raw_atomic_long_try_cmpxchg(atomic_long_t *v, long *old, long new) { #ifdef CONFIG_64BIT return raw_atomic64_try_cmpxchg(v, (s64 *)old, new); #else return raw_atomic_try_cmpxchg(v, (int *)old, new); #endif } /** * raw_atomic_long_try_cmpxchg_acquire() - atomic compare and exchange with acquire ordering * @v: pointer to atomic_long_t * @old: pointer to long value to compare with * @new: long value to assign * * If (@v == @old), atomically updates @v to @new with acquire ordering. * Otherwise, @v is not modified, @old is updated to the current value of @v, * and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_long_try_cmpxchg_acquire() elsewhere. * * Return: @true if the exchange occured, @false otherwise. */ static __always_inline bool raw_atomic_long_try_cmpxchg_acquire(atomic_long_t *v, long *old, long new) { #ifdef CONFIG_64BIT return raw_atomic64_try_cmpxchg_acquire(v, (s64 *)old, new); #else return raw_atomic_try_cmpxchg_acquire(v, (int *)old, new); #endif } /** * raw_atomic_long_try_cmpxchg_release() - atomic compare and exchange with release ordering * @v: pointer to atomic_long_t * @old: pointer to long value to compare with * @new: long value to assign * * If (@v == @old), atomically updates @v to @new with release ordering. * Otherwise, @v is not modified, @old is updated to the current value of @v, * and relaxed ordering is provided. 
* * Safe to use in noinstr code; prefer atomic_long_try_cmpxchg_release() elsewhere. * * Return: @true if the exchange occured, @false otherwise. */ static __always_inline bool raw_atomic_long_try_cmpxchg_release(atomic_long_t *v, long *old, long new) { #ifdef CONFIG_64BIT return raw_atomic64_try_cmpxchg_release(v, (s64 *)old, new); #else return raw_atomic_try_cmpxchg_release(v, (int *)old, new); #endif } /** * raw_atomic_long_try_cmpxchg_relaxed() - atomic compare and exchange with relaxed ordering * @v: pointer to atomic_long_t * @old: pointer to long value to compare with * @new: long value to assign * * If (@v == @old), atomically updates @v to @new with relaxed ordering. * Otherwise, @v is not modified, @old is updated to the current value of @v, * and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_long_try_cmpxchg_relaxed() elsewhere. * * Return: @true if the exchange occured, @false otherwise. */ static __always_inline bool raw_atomic_long_try_cmpxchg_relaxed(atomic_long_t *v, long *old, long new) { #ifdef CONFIG_64BIT return raw_atomic64_try_cmpxchg_relaxed(v, (s64 *)old, new); #else return raw_atomic_try_cmpxchg_relaxed(v, (int *)old, new); #endif } /** * raw_atomic_long_sub_and_test() - atomic subtract and test if zero with full ordering * @i: long value to subtract * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - @i) with full ordering. * * Safe to use in noinstr code; prefer atomic_long_sub_and_test() elsewhere. * * Return: @true if the resulting value of @v is zero, @false otherwise. */ static __always_inline bool raw_atomic_long_sub_and_test(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_sub_and_test(i, v); #else return raw_atomic_sub_and_test(i, v); #endif } /** * raw_atomic_long_dec_and_test() - atomic decrement and test if zero with full ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - 1) with full ordering. * * Safe to use in noinstr code; prefer atomic_long_dec_and_test() elsewhere. * * Return: @true if the resulting value of @v is zero, @false otherwise. */ static __always_inline bool raw_atomic_long_dec_and_test(atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_dec_and_test(v); #else return raw_atomic_dec_and_test(v); #endif } /** * raw_atomic_long_inc_and_test() - atomic increment and test if zero with full ordering * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + 1) with full ordering. * * Safe to use in noinstr code; prefer atomic_long_inc_and_test() elsewhere. * * Return: @true if the resulting value of @v is zero, @false otherwise. */ static __always_inline bool raw_atomic_long_inc_and_test(atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_inc_and_test(v); #else return raw_atomic_inc_and_test(v); #endif } /** * raw_atomic_long_add_negative() - atomic add and test if negative with full ordering * @i: long value to add * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + @i) with full ordering. * * Safe to use in noinstr code; prefer atomic_long_add_negative() elsewhere. * * Return: @true if the resulting value of @v is negative, @false otherwise. 
*/ static __always_inline bool raw_atomic_long_add_negative(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_add_negative(i, v); #else return raw_atomic_add_negative(i, v); #endif } /** * raw_atomic_long_add_negative_acquire() - atomic add and test if negative with acquire ordering * @i: long value to add * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + @i) with acquire ordering. * * Safe to use in noinstr code; prefer atomic_long_add_negative_acquire() elsewhere. * * Return: @true if the resulting value of @v is negative, @false otherwise. */ static __always_inline bool raw_atomic_long_add_negative_acquire(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_add_negative_acquire(i, v); #else return raw_atomic_add_negative_acquire(i, v); #endif } /** * raw_atomic_long_add_negative_release() - atomic add and test if negative with release ordering * @i: long value to add * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + @i) with release ordering. * * Safe to use in noinstr code; prefer atomic_long_add_negative_release() elsewhere. * * Return: @true if the resulting value of @v is negative, @false otherwise. */ static __always_inline bool raw_atomic_long_add_negative_release(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_add_negative_release(i, v); #else return raw_atomic_add_negative_release(i, v); #endif } /** * raw_atomic_long_add_negative_relaxed() - atomic add and test if negative with relaxed ordering * @i: long value to add * @v: pointer to atomic_long_t * * Atomically updates @v to (@v + @i) with relaxed ordering. * * Safe to use in noinstr code; prefer atomic_long_add_negative_relaxed() elsewhere. * * Return: @true if the resulting value of @v is negative, @false otherwise. */ static __always_inline bool raw_atomic_long_add_negative_relaxed(long i, atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_add_negative_relaxed(i, v); #else return raw_atomic_add_negative_relaxed(i, v); #endif } /** * raw_atomic_long_fetch_add_unless() - atomic add unless value with full ordering * @v: pointer to atomic_long_t * @a: long value to add * @u: long value to compare with * * If (@v != @u), atomically updates @v to (@v + @a) with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_long_fetch_add_unless() elsewhere. * * Return: The original value of @v. */ static __always_inline long raw_atomic_long_fetch_add_unless(atomic_long_t *v, long a, long u) { #ifdef CONFIG_64BIT return raw_atomic64_fetch_add_unless(v, a, u); #else return raw_atomic_fetch_add_unless(v, a, u); #endif } /** * raw_atomic_long_add_unless() - atomic add unless value with full ordering * @v: pointer to atomic_long_t * @a: long value to add * @u: long value to compare with * * If (@v != @u), atomically updates @v to (@v + @a) with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_long_add_unless() elsewhere. * * Return: @true if @v was updated, @false otherwise. */ static __always_inline bool raw_atomic_long_add_unless(atomic_long_t *v, long a, long u) { #ifdef CONFIG_64BIT return raw_atomic64_add_unless(v, a, u); #else return raw_atomic_add_unless(v, a, u); #endif } /** * raw_atomic_long_inc_not_zero() - atomic increment unless zero with full ordering * @v: pointer to atomic_long_t * * If (@v != 0), atomically updates @v to (@v + 1) with full ordering. 
* Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_long_inc_not_zero() elsewhere. * * Return: @true if @v was updated, @false otherwise. */ static __always_inline bool raw_atomic_long_inc_not_zero(atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_inc_not_zero(v); #else return raw_atomic_inc_not_zero(v); #endif } /** * raw_atomic_long_inc_unless_negative() - atomic increment unless negative with full ordering * @v: pointer to atomic_long_t * * If (@v >= 0), atomically updates @v to (@v + 1) with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_long_inc_unless_negative() elsewhere. * * Return: @true if @v was updated, @false otherwise. */ static __always_inline bool raw_atomic_long_inc_unless_negative(atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_inc_unless_negative(v); #else return raw_atomic_inc_unless_negative(v); #endif } /** * raw_atomic_long_dec_unless_positive() - atomic decrement unless positive with full ordering * @v: pointer to atomic_long_t * * If (@v <= 0), atomically updates @v to (@v - 1) with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_long_dec_unless_positive() elsewhere. * * Return: @true if @v was updated, @false otherwise. */ static __always_inline bool raw_atomic_long_dec_unless_positive(atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_dec_unless_positive(v); #else return raw_atomic_dec_unless_positive(v); #endif } /** * raw_atomic_long_dec_if_positive() - atomic decrement if positive with full ordering * @v: pointer to atomic_long_t * * If (@v > 0), atomically updates @v to (@v - 1) with full ordering. * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_long_dec_if_positive() elsewhere. * * Return: The old value of (@v - 1), regardless of whether @v was updated. */ static __always_inline long raw_atomic_long_dec_if_positive(atomic_long_t *v) { #ifdef CONFIG_64BIT return raw_atomic64_dec_if_positive(v); #else return raw_atomic_dec_if_positive(v); #endif } #endif /* _LINUX_ATOMIC_LONG_H */ // eadf183c3600b8b92b91839dd3be6bcc560c752d |
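/*
 * Illustrative sketch (not part of the header above): one way the
 * raw_atomic_long_*() helpers might back a noinstr-safe usage count.
 * The struct and function names (my_ctx, my_ctx_get, my_ctx_put) are
 * hypothetical; only the raw_atomic_long_*() calls come from this file.
 */
#include <linux/atomic.h>

struct my_ctx {
        atomic_long_t usage;            /* initialised to 1 by the creator */
};

/* Take a reference only if the count has not already hit zero. */
static inline bool my_ctx_get(struct my_ctx *ctx)
{
        return raw_atomic_long_inc_not_zero(&ctx->usage);
}

/* Drop a reference; the caller that drops the last one runs @release. */
static inline void my_ctx_put(struct my_ctx *ctx,
                              void (*release)(struct my_ctx *))
{
        /* Full ordering: earlier stores are visible before @release runs. */
        if (raw_atomic_long_dec_and_test(&ctx->usage))
                release(ctx);
}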
8 7 7 7 4 3 9 2 8 8 1 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 | /* * llc_input.c - Minimal input path for LLC * * Copyright (c) 1997 by Procom Technology, Inc. * 2001-2003 by Arnaldo Carvalho de Melo <acme@conectiva.com.br> * * This program can be redistributed or modified under the terms of the * GNU General Public License as published by the Free Software Foundation. * This program is distributed without any warranty or implied warranty * of merchantability or fitness for a particular purpose. * * See the GNU General Public License for more details. */ #include <linux/netdevice.h> #include <linux/slab.h> #include <linux/export.h> #include <net/net_namespace.h> #include <net/llc.h> #include <net/llc_pdu.h> #include <net/llc_sap.h> #if 0 #define dprintk(args...) printk(KERN_DEBUG args) #else #define dprintk(args...) #endif /* * Packet handler for the station, registerable because in the minimal * LLC core that is taking shape only the very minimal subset of LLC that * is needed for things like IPX, Appletalk, etc will stay, with all the * rest in the llc1 and llc2 modules. */ static void (*llc_station_handler)(struct sk_buff *skb); /* * Packet handlers for LLC_DEST_SAP and LLC_DEST_CONN. */ static void (*llc_type_handlers[2])(struct llc_sap *sap, struct sk_buff *skb); void llc_add_pack(int type, void (*handler)(struct llc_sap *sap, struct sk_buff *skb)) { smp_wmb(); /* ensure initialisation is complete before it's called */ if (type == LLC_DEST_SAP || type == LLC_DEST_CONN) llc_type_handlers[type - 1] = handler; } void llc_remove_pack(int type) { if (type == LLC_DEST_SAP || type == LLC_DEST_CONN) llc_type_handlers[type - 1] = NULL; synchronize_net(); } void llc_set_station_handler(void (*handler)(struct sk_buff *skb)) { /* Ensure initialisation is complete before it's called */ if (handler) smp_wmb(); llc_station_handler = handler; if (!handler) synchronize_net(); } /** * llc_pdu_type - returns which LLC component must handle for PDU * @skb: input skb * * This function returns which LLC component must handle this PDU. 
*/ static __inline__ int llc_pdu_type(struct sk_buff *skb) { int type = LLC_DEST_CONN; /* I-PDU or S-PDU type */ struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); if ((pdu->ctrl_1 & LLC_PDU_TYPE_MASK) != LLC_PDU_TYPE_U) goto out; switch (LLC_U_PDU_CMD(pdu)) { case LLC_1_PDU_CMD_XID: case LLC_1_PDU_CMD_UI: case LLC_1_PDU_CMD_TEST: type = LLC_DEST_SAP; break; case LLC_2_PDU_CMD_SABME: case LLC_2_PDU_CMD_DISC: case LLC_2_PDU_RSP_UA: case LLC_2_PDU_RSP_DM: case LLC_2_PDU_RSP_FRMR: break; default: type = LLC_DEST_INVALID; break; } out: return type; } /** * llc_fixup_skb - initializes skb pointers * @skb: This argument points to incoming skb * * Initializes internal skb pointer to start of network layer by deriving * length of LLC header; finds length of LLC control field in LLC header * by looking at the two lowest-order bits of the first control field * byte; field is either 3 or 4 bytes long. */ static inline int llc_fixup_skb(struct sk_buff *skb) { u8 llc_len = 2; struct llc_pdu_un *pdu; if (unlikely(!pskb_may_pull(skb, sizeof(*pdu)))) return 0; pdu = (struct llc_pdu_un *)skb->data; if ((pdu->ctrl_1 & LLC_PDU_TYPE_MASK) == LLC_PDU_TYPE_U) llc_len = 1; llc_len += 2; if (unlikely(!pskb_may_pull(skb, llc_len))) return 0; skb_pull(skb, llc_len); skb_reset_transport_header(skb); if (skb->protocol == htons(ETH_P_802_2)) { __be16 pdulen; s32 data_size; if (skb->mac_len < ETH_HLEN) return 0; pdulen = eth_hdr(skb)->h_proto; data_size = ntohs(pdulen) - llc_len; if (data_size < 0 || !pskb_may_pull(skb, data_size)) return 0; if (unlikely(pskb_trim_rcsum(skb, data_size))) return 0; } return 1; } /** * llc_rcv - 802.2 entry point from net lower layers * @skb: received pdu * @dev: device that receive pdu * @pt: packet type * @orig_dev: the original receive net device * * When the system receives a 802.2 frame this function is called. It * checks SAP and connection of received pdu and passes frame to * llc_{station,sap,conn}_rcv for sending to proper state machine. If * the frame is related to a busy connection (a connection is sending * data now), it queues this frame in the connection's backlog. */ int llc_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { struct llc_sap *sap; struct llc_pdu_sn *pdu; int dest; int (*rcv)(struct sk_buff *, struct net_device *, struct packet_type *, struct net_device *); void (*sta_handler)(struct sk_buff *skb); void (*sap_handler)(struct llc_sap *sap, struct sk_buff *skb); /* * When the interface is in promisc. mode, drop all the crap that it * receives, do not try to analyse it. */ if (unlikely(skb->pkt_type == PACKET_OTHERHOST)) { dprintk("%s: PACKET_OTHERHOST\n", __func__); goto drop; } skb = skb_share_check(skb, GFP_ATOMIC); if (unlikely(!skb)) goto out; if (unlikely(!llc_fixup_skb(skb))) goto drop; pdu = llc_pdu_sn_hdr(skb); if (unlikely(!pdu->dsap)) /* NULL DSAP, refer to station */ goto handle_station; sap = llc_sap_find(pdu->dsap); if (unlikely(!sap)) {/* unknown SAP */ dprintk("%s: llc_sap_find(%02X) failed!\n", __func__, pdu->dsap); goto drop; } /* * First the upper layer protocols that don't need the full * LLC functionality */ rcv = rcu_dereference(sap->rcv_func); dest = llc_pdu_type(skb); sap_handler = dest ? 
READ_ONCE(llc_type_handlers[dest - 1]) : NULL; if (unlikely(!sap_handler)) { if (rcv) rcv(skb, dev, pt, orig_dev); else kfree_skb(skb); } else { if (rcv) { struct sk_buff *cskb = skb_clone(skb, GFP_ATOMIC); if (cskb) rcv(cskb, dev, pt, orig_dev); } sap_handler(sap, skb); } llc_sap_put(sap); out: return 0; drop: kfree_skb(skb); goto out; handle_station: sta_handler = READ_ONCE(llc_station_handler); if (!sta_handler) goto drop; sta_handler(skb); goto out; } EXPORT_SYMBOL(llc_add_pack); EXPORT_SYMBOL(llc_remove_pack); EXPORT_SYMBOL(llc_set_station_handler); |
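/*
 * Illustrative sketch (not part of llc_input.c): how a hypothetical upper
 * layer could hook into the dispatch path above. my_datagram_rcv,
 * my_proto_init and my_proto_exit are made-up names; llc_add_pack(),
 * llc_remove_pack() and LLC_DEST_SAP are the interfaces exported here.
 */
#include <linux/module.h>
#include <linux/skbuff.h>
#include <net/llc.h>

static void my_datagram_rcv(struct llc_sap *sap, struct sk_buff *skb)
{
        /* Type-1 PDUs (UI/XID/TEST) classified as LLC_DEST_SAP land here. */
        kfree_skb(skb);         /* a real handler would process or queue skb */
}

static int __init my_proto_init(void)
{
        llc_add_pack(LLC_DEST_SAP, my_datagram_rcv);
        return 0;
}

static void __exit my_proto_exit(void)
{
        /*
         * llc_remove_pack() ends with synchronize_net(), so in-flight
         * llc_rcv() callers have finished once this returns.
         */
        llc_remove_pack(LLC_DEST_SAP);
}

module_init(my_proto_init);
module_exit(my_proto_exit);
MODULE_LICENSE("GPL");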
187 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __ARM64_KVM_NESTED_H #define __ARM64_KVM_NESTED_H #include <linux/bitfield.h> #include <linux/kvm_host.h> #include <asm/kvm_emulate.h> #include <asm/kvm_pgtable.h> static inline bool vcpu_has_nv(const struct kvm_vcpu *vcpu) { return (!__is_defined(__KVM_NVHE_HYPERVISOR__) && cpus_have_final_cap(ARM64_HAS_NESTED_VIRT) && vcpu_has_feature(vcpu, KVM_ARM_VCPU_HAS_EL2)); } /* Translation helpers from non-VHE EL2 to EL1 */ static inline u64 tcr_el2_ps_to_tcr_el1_ips(u64 tcr_el2) { return (u64)FIELD_GET(TCR_EL2_PS_MASK, tcr_el2) << TCR_IPS_SHIFT; } static inline u64 translate_tcr_el2_to_tcr_el1(u64 tcr) { return TCR_EPD1_MASK | /* disable TTBR1_EL1 */ ((tcr & TCR_EL2_TBI) ? TCR_TBI0 : 0) | tcr_el2_ps_to_tcr_el1_ips(tcr) | (tcr & TCR_EL2_TG0_MASK) | (tcr & TCR_EL2_ORGN0_MASK) | (tcr & TCR_EL2_IRGN0_MASK) | (tcr & TCR_EL2_T0SZ_MASK); } static inline u64 translate_cptr_el2_to_cpacr_el1(u64 cptr_el2) { u64 cpacr_el1 = CPACR_EL1_RES1; if (cptr_el2 & CPTR_EL2_TTA) cpacr_el1 |= CPACR_EL1_TTA; if (!(cptr_el2 & CPTR_EL2_TFP)) cpacr_el1 |= CPACR_EL1_FPEN; if (!(cptr_el2 & CPTR_EL2_TZ)) cpacr_el1 |= CPACR_EL1_ZEN; cpacr_el1 |= cptr_el2 & (CPTR_EL2_TCPAC | CPTR_EL2_TAM); return cpacr_el1; } static inline u64 translate_sctlr_el2_to_sctlr_el1(u64 val) { /* Only preserve the minimal set of bits we support */ val &= (SCTLR_ELx_M | SCTLR_ELx_A | SCTLR_ELx_C | SCTLR_ELx_SA | SCTLR_ELx_I | SCTLR_ELx_IESB | SCTLR_ELx_WXN | SCTLR_ELx_EE); val |= SCTLR_EL1_RES1; return val; } static inline u64 translate_ttbr0_el2_to_ttbr0_el1(u64 ttbr0) { /* Clear the ASID field */ return ttbr0 & ~GENMASK_ULL(63, 48); } extern bool forward_smc_trap(struct kvm_vcpu *vcpu); extern bool forward_debug_exception(struct kvm_vcpu *vcpu); extern void kvm_init_nested(struct kvm *kvm); extern int kvm_vcpu_init_nested(struct kvm_vcpu *vcpu); extern void kvm_init_nested_s2_mmu(struct kvm_s2_mmu *mmu); extern struct kvm_s2_mmu *lookup_s2_mmu(struct kvm_vcpu *vcpu); union tlbi_info; extern void kvm_s2_mmu_iterate_by_vmid(struct kvm *kvm, u16 vmid, const union tlbi_info *info, void (*)(struct kvm_s2_mmu *, const union tlbi_info *)); extern void kvm_vcpu_load_hw_mmu(struct kvm_vcpu *vcpu); extern void kvm_vcpu_put_hw_mmu(struct kvm_vcpu *vcpu); extern void check_nested_vcpu_requests(struct kvm_vcpu *vcpu); struct kvm_s2_trans { phys_addr_t output; unsigned long block_size; bool writable; bool readable; int level; u32 esr; u64 desc; }; static inline phys_addr_t kvm_s2_trans_output(struct kvm_s2_trans *trans) { return trans->output; } static inline unsigned long kvm_s2_trans_size(struct kvm_s2_trans *trans) { return trans->block_size; 
} static inline u32 kvm_s2_trans_esr(struct kvm_s2_trans *trans) { return trans->esr; } static inline bool kvm_s2_trans_readable(struct kvm_s2_trans *trans) { return trans->readable; } static inline bool kvm_s2_trans_writable(struct kvm_s2_trans *trans) { return trans->writable; } static inline bool kvm_s2_trans_executable(struct kvm_s2_trans *trans) { return !(trans->desc & BIT(54)); } extern int kvm_walk_nested_s2(struct kvm_vcpu *vcpu, phys_addr_t gipa, struct kvm_s2_trans *result); extern int kvm_s2_handle_perm_fault(struct kvm_vcpu *vcpu, struct kvm_s2_trans *trans); extern int kvm_inject_s2_fault(struct kvm_vcpu *vcpu, u64 esr_el2); extern void kvm_nested_s2_wp(struct kvm *kvm); extern void kvm_nested_s2_unmap(struct kvm *kvm, bool may_block); extern void kvm_nested_s2_flush(struct kvm *kvm); unsigned long compute_tlb_inval_range(struct kvm_s2_mmu *mmu, u64 val); static inline bool kvm_supported_tlbi_s1e1_op(struct kvm_vcpu *vpcu, u32 instr) { struct kvm *kvm = vpcu->kvm; u8 CRm = sys_reg_CRm(instr); if (!(sys_reg_Op0(instr) == TLBI_Op0 && sys_reg_Op1(instr) == TLBI_Op1_EL1)) return false; if (!(sys_reg_CRn(instr) == TLBI_CRn_XS || (sys_reg_CRn(instr) == TLBI_CRn_nXS && kvm_has_feat(kvm, ID_AA64ISAR1_EL1, XS, IMP)))) return false; if (CRm == TLBI_CRm_nROS && !kvm_has_feat(kvm, ID_AA64ISAR0_EL1, TLB, OS)) return false; if ((CRm == TLBI_CRm_RIS || CRm == TLBI_CRm_ROS || CRm == TLBI_CRm_RNS) && !kvm_has_feat(kvm, ID_AA64ISAR0_EL1, TLB, RANGE)) return false; return true; } static inline bool kvm_supported_tlbi_s1e2_op(struct kvm_vcpu *vpcu, u32 instr) { struct kvm *kvm = vpcu->kvm; u8 CRm = sys_reg_CRm(instr); if (!(sys_reg_Op0(instr) == TLBI_Op0 && sys_reg_Op1(instr) == TLBI_Op1_EL2)) return false; if (!(sys_reg_CRn(instr) == TLBI_CRn_XS || (sys_reg_CRn(instr) == TLBI_CRn_nXS && kvm_has_feat(kvm, ID_AA64ISAR1_EL1, XS, IMP)))) return false; if (CRm == TLBI_CRm_IPAIS || CRm == TLBI_CRm_IPAONS) return false; if (CRm == TLBI_CRm_nROS && !kvm_has_feat(kvm, ID_AA64ISAR0_EL1, TLB, OS)) return false; if ((CRm == TLBI_CRm_RIS || CRm == TLBI_CRm_ROS || CRm == TLBI_CRm_RNS) && !kvm_has_feat(kvm, ID_AA64ISAR0_EL1, TLB, RANGE)) return false; return true; } int kvm_init_nv_sysregs(struct kvm_vcpu *vcpu); #ifdef CONFIG_ARM64_PTR_AUTH bool kvm_auth_eretax(struct kvm_vcpu *vcpu, u64 *elr); #else static inline bool kvm_auth_eretax(struct kvm_vcpu *vcpu, u64 *elr) { /* We really should never execute this... */ WARN_ON_ONCE(1); *elr = 0xbad9acc0debadbad; return false; } #endif #define KVM_NV_GUEST_MAP_SZ (KVM_PGTABLE_PROT_SW1 | KVM_PGTABLE_PROT_SW0) static inline u64 kvm_encode_nested_level(struct kvm_s2_trans *trans) { return FIELD_PREP(KVM_NV_GUEST_MAP_SZ, trans->level); } /* Adjust alignment for the contiguous bit as per StageOA() */ #define contiguous_bit_shift(d, wi, l) \ ({ \ u8 shift = 0; \ \ if ((d) & PTE_CONT) { \ switch (BIT((wi)->pgshift)) { \ case SZ_4K: \ shift = 4; \ break; \ case SZ_16K: \ shift = (l) == 2 ? 5 : 7; \ break; \ case SZ_64K: \ shift = 5; \ break; \ } \ } \ \ shift; \ }) static inline unsigned int ps_to_output_size(unsigned int ps) { switch (ps) { case 0: return 32; case 1: return 36; case 2: return 40; case 3: return 42; case 4: return 44; case 5: default: return 48; } } #endif /* __ARM64_KVM_NESTED_H */ |
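/*
 * Illustrative sketch (not part of the header above): roughly how an abort
 * handler might drive the nested stage-2 walk and consume the result via
 * the accessors. my_handle_nested_abort() is hypothetical, and whether a
 * denied write is best reflected with kvm_inject_s2_fault() is an
 * assumption; the walk and accessor functions are declared in this header.
 */
static int my_handle_nested_abort(struct kvm_vcpu *vcpu, phys_addr_t gipa,
                                  bool write_fault)
{
        struct kvm_s2_trans trans;
        int ret;

        ret = kvm_walk_nested_s2(vcpu, gipa, &trans);
        if (ret)
                return ret;

        /* The guest hypervisor mapped this read-only: hand the fault back. */
        if (write_fault && !kvm_s2_trans_writable(&trans))
                return kvm_inject_s2_fault(vcpu, kvm_s2_trans_esr(&trans));

        /*
         * A real handler would now map kvm_s2_trans_output(&trans) in the
         * shadow stage-2, covering at most kvm_s2_trans_size(&trans) bytes;
         * that part is elided here.
         */
        return 0;
}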
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2015 - ARM Ltd
 * Author: Marc Zyngier <marc.zyngier@arm.com>
 */

#ifndef __ARM64_KVM_HYP_FAULT_H__
#define __ARM64_KVM_HYP_FAULT_H__

#include <asm/kvm_asm.h>
#include <asm/kvm_emulate.h>
#include <asm/kvm_hyp.h>
#include <asm/kvm_mmu.h>

static inline bool __translate_far_to_hpfar(u64 far, u64 *hpfar)
{
        int ret;
        u64 par, tmp;

        /*
         * Resolve the IPA the hard way using the guest VA.
         *
         * Stage-1 translation already validated the memory access
         * rights. As such, we can use the EL1 translation regime, and
         * don't have to distinguish between EL0 and EL1 access.
         *
         * We do need to save/restore PAR_EL1 though, as we haven't
         * saved the guest context yet, and we may return early...
         */
        par = read_sysreg_par();
        ret = system_supports_poe() ? __kvm_at(OP_AT_S1E1A, far) :
                                      __kvm_at(OP_AT_S1E1R, far);
        if (!ret)
                tmp = read_sysreg_par();
        else
                tmp = SYS_PAR_EL1_F; /* back to the guest */
        write_sysreg(par, par_el1);

        if (unlikely(tmp & SYS_PAR_EL1_F))
                return false; /* Translation failed, back to guest */

        /* Convert PAR to HPFAR format */
        *hpfar = PAR_TO_HPFAR(tmp);
        return true;
}

static inline bool __get_fault_info(u64 esr, struct kvm_vcpu_fault_info *fault)
{
        u64 hpfar, far;

        far = read_sysreg_el2(SYS_FAR);

        /*
         * The HPFAR can be invalid if the stage 2 fault did not
         * happen during a stage 1 page table walk (the ESR_EL2.S1PTW
         * bit is clear) and one of the two following cases are true:
         * 1. The fault was due to a permission fault
         * 2. The processor carries errata 834220
         *
         * Therefore, for all non S1PTW faults where we either have a
         * permission fault or the errata workaround is enabled, we
         * resolve the IPA using the AT instruction.
         */
        if (!(esr & ESR_ELx_S1PTW) &&
            (cpus_have_final_cap(ARM64_WORKAROUND_834220) ||
             esr_fsc_is_permission_fault(esr))) {
                if (!__translate_far_to_hpfar(far, &hpfar))
                        return false;
        } else {
                hpfar = read_sysreg(hpfar_el2);
        }

        fault->far_el2 = far;
        fault->hpfar_el2 = hpfar;
        return true;
}

#endif
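/*
 * Illustrative sketch (not part of the header above): the usual call
 * pattern is to capture ESR_EL2 first and then let __get_fault_info()
 * fill in FAR/HPFAR. my_capture_fault() is a hypothetical wrapper; the
 * real callers live in the hyp switch code.
 */
static inline bool my_capture_fault(struct kvm_vcpu_fault_info *fault)
{
        u64 esr = read_sysreg_el2(SYS_ESR);

        fault->esr_el2 = esr;

        /*
         * A false return means the AT-based FAR->HPFAR resolution failed
         * (e.g. the guest raced us and changed its stage-1 tables); the
         * caller is expected to return to the guest and let it retry.
         */
        return __get_fault_info(esr, fault);
}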
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 2012,2013 - ARM Ltd
 * Author: Marc Zyngier <marc.zyngier@arm.com>
 */

#ifndef __KVM_ARM_PSCI_H__
#define __KVM_ARM_PSCI_H__

#include <linux/kvm_host.h>
#include <uapi/linux/psci.h>

#define KVM_ARM_PSCI_0_1        PSCI_VERSION(0, 1)
#define KVM_ARM_PSCI_0_2        PSCI_VERSION(0, 2)
#define KVM_ARM_PSCI_1_0        PSCI_VERSION(1, 0)
#define KVM_ARM_PSCI_1_1        PSCI_VERSION(1, 1)
#define KVM_ARM_PSCI_1_2        PSCI_VERSION(1, 2)
#define KVM_ARM_PSCI_1_3        PSCI_VERSION(1, 3)

#define KVM_ARM_PSCI_LATEST     KVM_ARM_PSCI_1_3

static inline int kvm_psci_version(struct kvm_vcpu *vcpu)
{
        /*
         * Our PSCI implementation stays the same across versions from
         * v0.2 onward, only adding the few mandatory functions (such
         * as FEATURES with 1.0) that are required by newer
         * revisions. It is thus safe to return the latest, unless
         * userspace has instructed us otherwise.
         */
        if (vcpu_has_feature(vcpu, KVM_ARM_VCPU_PSCI_0_2)) {
                if (vcpu->kvm->arch.psci_version)
                        return vcpu->kvm->arch.psci_version;

                return KVM_ARM_PSCI_LATEST;
        }

        return KVM_ARM_PSCI_0_1;
}

int kvm_psci_call(struct kvm_vcpu *vcpu);

#endif /* __KVM_ARM_PSCI_H__ */
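/*
 * Illustrative sketch (not part of the header above): the KVM_ARM_PSCI_*
 * values are packed with PSCI_VERSION(major, minor), so the value returned
 * by kvm_psci_version() can be split with the uapi helpers.
 * my_log_psci_version() is a hypothetical example.
 */
static inline void my_log_psci_version(struct kvm_vcpu *vcpu)
{
        int ver = kvm_psci_version(vcpu);

        /* KVM_ARM_PSCI_1_3 prints as "PSCIv1.3", KVM_ARM_PSCI_0_1 as "PSCIv0.1". */
        pr_debug("PSCIv%d.%d\n",
                 PSCI_VERSION_MAJOR(ver), PSCI_VERSION_MINOR(ver));
}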
1 1 1 1 4 3 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 | // SPDX-License-Identifier: GPL-2.0-only /* * VGIC system registers handling functions for AArch64 mode */ #include <linux/irqchip/arm-gic-v3.h> #include <linux/kvm.h> #include <linux/kvm_host.h> #include <asm/kvm_emulate.h> #include "vgic/vgic.h" #include "sys_regs.h" static int set_gic_ctlr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, u64 val) { u32 host_pri_bits, host_id_bits, host_seis, host_a3v, seis, a3v; struct vgic_cpu *vgic_v3_cpu = &vcpu->arch.vgic_cpu; struct vgic_vmcr vmcr; vgic_get_vmcr(vcpu, &vmcr); /* * Disallow restoring VM state if not supported by this * hardware. */ host_pri_bits = FIELD_GET(ICC_CTLR_EL1_PRI_BITS_MASK, val) + 1; if (host_pri_bits > vgic_v3_cpu->num_pri_bits) return -EINVAL; vgic_v3_cpu->num_pri_bits = host_pri_bits; host_id_bits = FIELD_GET(ICC_CTLR_EL1_ID_BITS_MASK, val); if (host_id_bits > vgic_v3_cpu->num_id_bits) return -EINVAL; vgic_v3_cpu->num_id_bits = host_id_bits; host_seis = FIELD_GET(ICH_VTR_SEIS_MASK, kvm_vgic_global_state.ich_vtr_el2); seis = FIELD_GET(ICC_CTLR_EL1_SEIS_MASK, val); if (host_seis != seis) return -EINVAL; host_a3v = FIELD_GET(ICH_VTR_A3V_MASK, kvm_vgic_global_state.ich_vtr_el2); a3v = FIELD_GET(ICC_CTLR_EL1_A3V_MASK, val); if (host_a3v != a3v) return -EINVAL; /* * Here set VMCR.CTLR in ICC_CTLR_EL1 layout. * The vgic_set_vmcr() will convert to ICH_VMCR layout. */ vmcr.cbpr = FIELD_GET(ICC_CTLR_EL1_CBPR_MASK, val); vmcr.eoim = FIELD_GET(ICC_CTLR_EL1_EOImode_MASK, val); vgic_set_vmcr(vcpu, &vmcr); return 0; } static int get_gic_ctlr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, u64 *valp) { struct vgic_cpu *vgic_v3_cpu = &vcpu->arch.vgic_cpu; struct vgic_vmcr vmcr; u64 val; vgic_get_vmcr(vcpu, &vmcr); val = 0; val |= FIELD_PREP(ICC_CTLR_EL1_PRI_BITS_MASK, vgic_v3_cpu->num_pri_bits - 1); val |= FIELD_PREP(ICC_CTLR_EL1_ID_BITS_MASK, vgic_v3_cpu->num_id_bits); val |= FIELD_PREP(ICC_CTLR_EL1_SEIS_MASK, FIELD_GET(ICH_VTR_SEIS_MASK, kvm_vgic_global_state.ich_vtr_el2)); val |= FIELD_PREP(ICC_CTLR_EL1_A3V_MASK, FIELD_GET(ICH_VTR_A3V_MASK, kvm_vgic_global_state.ich_vtr_el2)); /* * The VMCR.CTLR value is in ICC_CTLR_EL1 layout. 
* Extract it directly using ICC_CTLR_EL1 reg definitions. */ val |= FIELD_PREP(ICC_CTLR_EL1_CBPR_MASK, vmcr.cbpr); val |= FIELD_PREP(ICC_CTLR_EL1_EOImode_MASK, vmcr.eoim); *valp = val; return 0; } static int set_gic_pmr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, u64 val) { struct vgic_vmcr vmcr; vgic_get_vmcr(vcpu, &vmcr); vmcr.pmr = FIELD_GET(ICC_PMR_EL1_MASK, val); vgic_set_vmcr(vcpu, &vmcr); return 0; } static int get_gic_pmr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, u64 *val) { struct vgic_vmcr vmcr; vgic_get_vmcr(vcpu, &vmcr); *val = FIELD_PREP(ICC_PMR_EL1_MASK, vmcr.pmr); return 0; } static int set_gic_bpr0(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, u64 val) { struct vgic_vmcr vmcr; vgic_get_vmcr(vcpu, &vmcr); vmcr.bpr = FIELD_GET(ICC_BPR0_EL1_MASK, val); vgic_set_vmcr(vcpu, &vmcr); return 0; } static int get_gic_bpr0(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, u64 *val) { struct vgic_vmcr vmcr; vgic_get_vmcr(vcpu, &vmcr); *val = FIELD_PREP(ICC_BPR0_EL1_MASK, vmcr.bpr); return 0; } static int set_gic_bpr1(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, u64 val) { struct vgic_vmcr vmcr; vgic_get_vmcr(vcpu, &vmcr); if (!vmcr.cbpr) { vmcr.abpr = FIELD_GET(ICC_BPR1_EL1_MASK, val); vgic_set_vmcr(vcpu, &vmcr); } return 0; } static int get_gic_bpr1(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, u64 *val) { struct vgic_vmcr vmcr; vgic_get_vmcr(vcpu, &vmcr); if (!vmcr.cbpr) *val = FIELD_PREP(ICC_BPR1_EL1_MASK, vmcr.abpr); else *val = min((vmcr.bpr + 1), 7U); return 0; } static int set_gic_grpen0(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, u64 val) { struct vgic_vmcr vmcr; vgic_get_vmcr(vcpu, &vmcr); vmcr.grpen0 = FIELD_GET(ICC_IGRPEN0_EL1_MASK, val); vgic_set_vmcr(vcpu, &vmcr); return 0; } static int get_gic_grpen0(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, u64 *val) { struct vgic_vmcr vmcr; vgic_get_vmcr(vcpu, &vmcr); *val = FIELD_PREP(ICC_IGRPEN0_EL1_MASK, vmcr.grpen0); return 0; } static int set_gic_grpen1(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, u64 val) { struct vgic_vmcr vmcr; vgic_get_vmcr(vcpu, &vmcr); vmcr.grpen1 = FIELD_GET(ICC_IGRPEN1_EL1_MASK, val); vgic_set_vmcr(vcpu, &vmcr); return 0; } static int get_gic_grpen1(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, u64 *val) { struct vgic_vmcr vmcr; vgic_get_vmcr(vcpu, &vmcr); *val = FIELD_GET(ICC_IGRPEN1_EL1_MASK, vmcr.grpen1); return 0; } static void set_apr_reg(struct kvm_vcpu *vcpu, u64 val, u8 apr, u8 idx) { struct vgic_v3_cpu_if *vgicv3 = &vcpu->arch.vgic_cpu.vgic_v3; if (apr) vgicv3->vgic_ap1r[idx] = val; else vgicv3->vgic_ap0r[idx] = val; } static u64 get_apr_reg(struct kvm_vcpu *vcpu, u8 apr, u8 idx) { struct vgic_v3_cpu_if *vgicv3 = &vcpu->arch.vgic_cpu.vgic_v3; if (apr) return vgicv3->vgic_ap1r[idx]; else return vgicv3->vgic_ap0r[idx]; } static int set_gic_ap0r(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, u64 val) { u8 idx = r->Op2 & 3; if (idx > vgic_v3_max_apr_idx(vcpu)) return -EINVAL; set_apr_reg(vcpu, val, 0, idx); return 0; } static int get_gic_ap0r(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, u64 *val) { u8 idx = r->Op2 & 3; if (idx > vgic_v3_max_apr_idx(vcpu)) return -EINVAL; *val = get_apr_reg(vcpu, 0, idx); return 0; } static int set_gic_ap1r(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, u64 val) { u8 idx = r->Op2 & 3; if (idx > vgic_v3_max_apr_idx(vcpu)) return -EINVAL; set_apr_reg(vcpu, val, 1, idx); return 0; } static int get_gic_ap1r(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, u64 *val) { u8 
idx = r->Op2 & 3; if (idx > vgic_v3_max_apr_idx(vcpu)) return -EINVAL; *val = get_apr_reg(vcpu, 1, idx); return 0; } static int set_gic_sre(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, u64 val) { /* Validate SRE bit */ if (!(val & ICC_SRE_EL1_SRE)) return -EINVAL; return 0; } static int get_gic_sre(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, u64 *val) { struct vgic_v3_cpu_if *vgicv3 = &vcpu->arch.vgic_cpu.vgic_v3; *val = vgicv3->vgic_sre; return 0; } static const struct sys_reg_desc gic_v3_icc_reg_descs[] = { { SYS_DESC(SYS_ICC_PMR_EL1), .set_user = set_gic_pmr, .get_user = get_gic_pmr, }, { SYS_DESC(SYS_ICC_BPR0_EL1), .set_user = set_gic_bpr0, .get_user = get_gic_bpr0, }, { SYS_DESC(SYS_ICC_AP0R0_EL1), .set_user = set_gic_ap0r, .get_user = get_gic_ap0r, }, { SYS_DESC(SYS_ICC_AP0R1_EL1), .set_user = set_gic_ap0r, .get_user = get_gic_ap0r, }, { SYS_DESC(SYS_ICC_AP0R2_EL1), .set_user = set_gic_ap0r, .get_user = get_gic_ap0r, }, { SYS_DESC(SYS_ICC_AP0R3_EL1), .set_user = set_gic_ap0r, .get_user = get_gic_ap0r, }, { SYS_DESC(SYS_ICC_AP1R0_EL1), .set_user = set_gic_ap1r, .get_user = get_gic_ap1r, }, { SYS_DESC(SYS_ICC_AP1R1_EL1), .set_user = set_gic_ap1r, .get_user = get_gic_ap1r, }, { SYS_DESC(SYS_ICC_AP1R2_EL1), .set_user = set_gic_ap1r, .get_user = get_gic_ap1r, }, { SYS_DESC(SYS_ICC_AP1R3_EL1), .set_user = set_gic_ap1r, .get_user = get_gic_ap1r, }, { SYS_DESC(SYS_ICC_BPR1_EL1), .set_user = set_gic_bpr1, .get_user = get_gic_bpr1, }, { SYS_DESC(SYS_ICC_CTLR_EL1), .set_user = set_gic_ctlr, .get_user = get_gic_ctlr, }, { SYS_DESC(SYS_ICC_SRE_EL1), .set_user = set_gic_sre, .get_user = get_gic_sre, }, { SYS_DESC(SYS_ICC_IGRPEN0_EL1), .set_user = set_gic_grpen0, .get_user = get_gic_grpen0, }, { SYS_DESC(SYS_ICC_IGRPEN1_EL1), .set_user = set_gic_grpen1, .get_user = get_gic_grpen1, }, }; static u64 attr_to_id(u64 attr) { return ARM64_SYS_REG(FIELD_GET(KVM_REG_ARM_VGIC_SYSREG_OP0_MASK, attr), FIELD_GET(KVM_REG_ARM_VGIC_SYSREG_OP1_MASK, attr), FIELD_GET(KVM_REG_ARM_VGIC_SYSREG_CRN_MASK, attr), FIELD_GET(KVM_REG_ARM_VGIC_SYSREG_CRM_MASK, attr), FIELD_GET(KVM_REG_ARM_VGIC_SYSREG_OP2_MASK, attr)); } int vgic_v3_has_cpu_sysregs_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) { if (get_reg_by_id(attr_to_id(attr->attr), gic_v3_icc_reg_descs, ARRAY_SIZE(gic_v3_icc_reg_descs))) return 0; return -ENXIO; } int vgic_v3_cpu_sysregs_uaccess(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr, bool is_write) { struct kvm_one_reg reg = { .id = attr_to_id(attr->attr), .addr = attr->addr, }; if (is_write) return kvm_sys_reg_set_user(vcpu, ®, gic_v3_icc_reg_descs, ARRAY_SIZE(gic_v3_icc_reg_descs)); else return kvm_sys_reg_get_user(vcpu, ®, gic_v3_icc_reg_descs, ARRAY_SIZE(gic_v3_icc_reg_descs)); } |
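/*
 * Illustrative sketch (not part of this file): the expected calling order
 * for the two entry points above, as a device-attribute handler might use
 * them. my_set_icc_attr() is hypothetical; the real wiring lives in the
 * vgic device code.
 */
static int my_set_icc_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
{
        int ret;

        /* -ENXIO if the encoded sysreg is not in gic_v3_icc_reg_descs[]. */
        ret = vgic_v3_has_cpu_sysregs_attr(vcpu, attr);
        if (ret)
                return ret;

        /* is_write == true: read the value from attr->addr and apply it. */
        return vgic_v3_cpu_sysregs_uaccess(vcpu, attr, true);
}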
1495 1497 1495 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 | /* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright (C) 2012 ARM Ltd. */ #ifndef __ASM_STACKTRACE_H #define __ASM_STACKTRACE_H #include <linux/percpu.h> #include <linux/sched.h> #include <linux/sched/task_stack.h> #include <linux/llist.h> #include <asm/memory.h> #include <asm/pointer_auth.h> #include <asm/ptrace.h> #include <asm/sdei.h> #include <asm/stacktrace/common.h> extern void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk, const char *loglvl); DECLARE_PER_CPU(unsigned long *, irq_stack_ptr); static inline struct stack_info stackinfo_get_irq(void) { unsigned long low = (unsigned long)raw_cpu_read(irq_stack_ptr); unsigned long high = low + IRQ_STACK_SIZE; return (struct stack_info) { .low = low, .high = high, }; } static inline bool on_irq_stack(unsigned long sp, unsigned long size) { struct stack_info info = stackinfo_get_irq(); return stackinfo_on_stack(&info, sp, size); } static inline struct stack_info stackinfo_get_task(const struct task_struct *tsk) { unsigned long low = (unsigned long)task_stack_page(tsk); unsigned long high = low + THREAD_SIZE; return (struct stack_info) { .low = low, .high = high, }; } static inline bool on_task_stack(const struct task_struct *tsk, unsigned long sp, unsigned long size) { struct stack_info info = stackinfo_get_task(tsk); return stackinfo_on_stack(&info, sp, size); } #define on_thread_stack() (on_task_stack(current, current_stack_pointer, 1)) #ifdef CONFIG_VMAP_STACK DECLARE_PER_CPU(unsigned long [OVERFLOW_STACK_SIZE/sizeof(long)], overflow_stack); static inline struct stack_info stackinfo_get_overflow(void) { unsigned long low = (unsigned long)raw_cpu_ptr(overflow_stack); unsigned long high = low + OVERFLOW_STACK_SIZE; return (struct stack_info) { .low = low, .high = high, }; } #else #define stackinfo_get_overflow() stackinfo_get_unknown() #endif #if defined(CONFIG_ARM_SDE_INTERFACE) && defined(CONFIG_VMAP_STACK) DECLARE_PER_CPU(unsigned long *, sdei_stack_normal_ptr); DECLARE_PER_CPU(unsigned long *, sdei_stack_critical_ptr); static inline struct stack_info stackinfo_get_sdei_normal(void) { unsigned long low = (unsigned long)raw_cpu_read(sdei_stack_normal_ptr); unsigned long high = low + SDEI_STACK_SIZE; return (struct stack_info) { .low = low, .high = high, }; } static inline struct stack_info stackinfo_get_sdei_critical(void) { unsigned long low = (unsigned long)raw_cpu_read(sdei_stack_critical_ptr); unsigned long high = low + SDEI_STACK_SIZE; return (struct stack_info) { .low = low, .high = high, }; } #else #define stackinfo_get_sdei_normal() stackinfo_get_unknown() #define stackinfo_get_sdei_critical() stackinfo_get_unknown() #endif #ifdef CONFIG_EFI extern u64 *efi_rt_stack_top; static inline struct stack_info stackinfo_get_efi(void) { unsigned long high = (u64)efi_rt_stack_top; unsigned long low = high - THREAD_SIZE; return (struct stack_info) { .low = low, .high = high, }; } #endif #endif /* __ASM_STACKTRACE_H */ |
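/*
 * Illustrative sketch (not part of the header above): every
 * stackinfo_get_*() helper pairs with stackinfo_on_stack() the same way.
 * my_on_overflow_stack() is a hypothetical helper modelled on
 * on_irq_stack()/on_task_stack().
 */
static inline bool my_on_overflow_stack(unsigned long sp, unsigned long size)
{
        struct stack_info info = stackinfo_get_overflow();

        /* True iff [sp, sp + size) lies entirely within the overflow stack. */
        return stackinfo_on_stack(&info, sp, size);
}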
314 294 294 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 | /* SPDX-License-Identifier: GPL-2.0 */ /* * net/dst.h Protocol independent destination cache definitions. * * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> * */ #ifndef _NET_DST_H #define _NET_DST_H #include <net/dst_ops.h> #include <linux/netdevice.h> #include <linux/rtnetlink.h> #include <linux/rcupdate.h> #include <linux/bug.h> #include <linux/jiffies.h> #include <linux/refcount.h> #include <linux/rcuref.h> #include <net/neighbour.h> #include <asm/processor.h> #include <linux/indirect_call_wrapper.h> struct sk_buff; struct dst_entry { struct net_device *dev; struct dst_ops *ops; unsigned long _metrics; unsigned long expires; #ifdef CONFIG_XFRM struct xfrm_state *xfrm; #else void *__pad1; #endif int (*input)(struct sk_buff *); int (*output)(struct net *net, struct sock *sk, struct sk_buff *skb); unsigned short flags; #define DST_NOXFRM 0x0002 #define DST_NOPOLICY 0x0004 #define DST_NOCOUNT 0x0008 #define DST_FAKE_RTABLE 0x0010 #define DST_XFRM_TUNNEL 0x0020 #define DST_XFRM_QUEUE 0x0040 #define DST_METADATA 0x0080 /* A non-zero value of dst->obsolete forces by-hand validation * of the route entry. Positive values are set by the generic * dst layer to indicate that the entry has been forcefully * destroyed. 
* * Negative values are used by the implementation layer code to * force invocation of the dst_ops->check() method. */ short obsolete; #define DST_OBSOLETE_NONE 0 #define DST_OBSOLETE_DEAD 2 #define DST_OBSOLETE_FORCE_CHK -1 #define DST_OBSOLETE_KILL -2 unsigned short header_len; /* more space at head required */ unsigned short trailer_len; /* space to reserve at tail */ /* * __rcuref wants to be on a different cache line from * input/output/ops or performance tanks badly */ #ifdef CONFIG_64BIT rcuref_t __rcuref; /* 64-bit offset 64 */ #endif int __use; unsigned long lastuse; struct rcu_head rcu_head; short error; short __pad; __u32 tclassid; #ifndef CONFIG_64BIT struct lwtunnel_state *lwtstate; rcuref_t __rcuref; /* 32-bit offset 64 */ #endif netdevice_tracker dev_tracker; /* * Used by rtable and rt6_info. Moves lwtstate into the next cache * line on 64bit so that lwtstate does not cause false sharing with * __rcuref under contention of __rcuref. This also puts the * frequently accessed members of rtable and rt6_info out of the * __rcuref cache line. */ struct list_head rt_uncached; struct uncached_list *rt_uncached_list; #ifdef CONFIG_64BIT struct lwtunnel_state *lwtstate; #endif }; struct dst_metrics { u32 metrics[RTAX_MAX]; refcount_t refcnt; } __aligned(4); /* Low pointer bits contain DST_METRICS_FLAGS */ extern const struct dst_metrics dst_default_metrics; u32 *dst_cow_metrics_generic(struct dst_entry *dst, unsigned long old); #define DST_METRICS_READ_ONLY 0x1UL #define DST_METRICS_REFCOUNTED 0x2UL #define DST_METRICS_FLAGS 0x3UL #define __DST_METRICS_PTR(Y) \ ((u32 *)((Y) & ~DST_METRICS_FLAGS)) #define DST_METRICS_PTR(X) __DST_METRICS_PTR((X)->_metrics) static inline bool dst_metrics_read_only(const struct dst_entry *dst) { return dst->_metrics & DST_METRICS_READ_ONLY; } void __dst_destroy_metrics_generic(struct dst_entry *dst, unsigned long old); static inline void dst_destroy_metrics_generic(struct dst_entry *dst) { unsigned long val = dst->_metrics; if (!(val & DST_METRICS_READ_ONLY)) __dst_destroy_metrics_generic(dst, val); } static inline u32 *dst_metrics_write_ptr(struct dst_entry *dst) { unsigned long p = dst->_metrics; BUG_ON(!p); if (p & DST_METRICS_READ_ONLY) return dst->ops->cow_metrics(dst, p); return __DST_METRICS_PTR(p); } /* This may only be invoked before the entry has reached global * visibility. */ static inline void dst_init_metrics(struct dst_entry *dst, const u32 *src_metrics, bool read_only) { dst->_metrics = ((unsigned long) src_metrics) | (read_only ? 
DST_METRICS_READ_ONLY : 0); } static inline void dst_copy_metrics(struct dst_entry *dest, const struct dst_entry *src) { u32 *dst_metrics = dst_metrics_write_ptr(dest); if (dst_metrics) { u32 *src_metrics = DST_METRICS_PTR(src); memcpy(dst_metrics, src_metrics, RTAX_MAX * sizeof(u32)); } } static inline u32 *dst_metrics_ptr(struct dst_entry *dst) { return DST_METRICS_PTR(dst); } static inline u32 dst_metric_raw(const struct dst_entry *dst, const int metric) { u32 *p = DST_METRICS_PTR(dst); return p[metric-1]; } static inline u32 dst_metric(const struct dst_entry *dst, const int metric) { WARN_ON_ONCE(metric == RTAX_HOPLIMIT || metric == RTAX_ADVMSS || metric == RTAX_MTU); return dst_metric_raw(dst, metric); } static inline u32 dst_metric_advmss(const struct dst_entry *dst) { u32 advmss = dst_metric_raw(dst, RTAX_ADVMSS); if (!advmss) advmss = dst->ops->default_advmss(dst); return advmss; } static inline void dst_metric_set(struct dst_entry *dst, int metric, u32 val) { u32 *p = dst_metrics_write_ptr(dst); if (p) p[metric-1] = val; } /* Kernel-internal feature bits that are unallocated in user space. */ #define DST_FEATURE_ECN_CA (1U << 31) #define DST_FEATURE_MASK (DST_FEATURE_ECN_CA) #define DST_FEATURE_ECN_MASK (DST_FEATURE_ECN_CA | RTAX_FEATURE_ECN) static inline u32 dst_feature(const struct dst_entry *dst, u32 feature) { return dst_metric(dst, RTAX_FEATURES) & feature; } INDIRECT_CALLABLE_DECLARE(unsigned int ip6_mtu(const struct dst_entry *)); INDIRECT_CALLABLE_DECLARE(unsigned int ipv4_mtu(const struct dst_entry *)); static inline u32 dst_mtu(const struct dst_entry *dst) { return INDIRECT_CALL_INET(dst->ops->mtu, ip6_mtu, ipv4_mtu, dst); } /* RTT metrics are stored in milliseconds for user ABI, but used as jiffies */ static inline unsigned long dst_metric_rtt(const struct dst_entry *dst, int metric) { return msecs_to_jiffies(dst_metric(dst, metric)); } static inline int dst_metric_locked(const struct dst_entry *dst, int metric) { return dst_metric(dst, RTAX_LOCK) & (1 << metric); } static inline void dst_hold(struct dst_entry *dst) { /* * If your kernel compilation stops here, please check * the placement of __rcuref in struct dst_entry */ BUILD_BUG_ON(offsetof(struct dst_entry, __rcuref) & 63); WARN_ON(!rcuref_get(&dst->__rcuref)); } static inline void dst_use_noref(struct dst_entry *dst, unsigned long time) { if (unlikely(time != dst->lastuse)) { dst->__use++; dst->lastuse = time; } } static inline struct dst_entry *dst_clone(struct dst_entry *dst) { if (dst) dst_hold(dst); return dst; } void dst_release(struct dst_entry *dst); void dst_release_immediate(struct dst_entry *dst); static inline void refdst_drop(unsigned long refdst) { if (!(refdst & SKB_DST_NOREF)) dst_release((struct dst_entry *)(refdst & SKB_DST_PTRMASK)); } /** * skb_dst_drop - drops skb dst * @skb: buffer * * Drops dst reference count if a reference was taken. */ static inline void skb_dst_drop(struct sk_buff *skb) { if (skb->_skb_refdst) { refdst_drop(skb->_skb_refdst); skb->_skb_refdst = 0UL; } } static inline void __skb_dst_copy(struct sk_buff *nskb, unsigned long refdst) { nskb->slow_gro |= !!refdst; nskb->_skb_refdst = refdst; if (!(nskb->_skb_refdst & SKB_DST_NOREF)) dst_clone(skb_dst(nskb)); } static inline void skb_dst_copy(struct sk_buff *nskb, const struct sk_buff *oskb) { __skb_dst_copy(nskb, oskb->_skb_refdst); } /** * dst_hold_safe - Take a reference on a dst if possible * @dst: pointer to dst entry * * This helper returns false if it could not safely * take a reference on a dst. 
*/ static inline bool dst_hold_safe(struct dst_entry *dst) { return rcuref_get(&dst->__rcuref); } /** * skb_dst_force - makes sure skb dst is refcounted * @skb: buffer * * If dst is not yet refcounted and not destroyed, grab a ref on it. * Returns: true if dst is refcounted. */ static inline bool skb_dst_force(struct sk_buff *skb) { if (skb_dst_is_noref(skb)) { struct dst_entry *dst = skb_dst(skb); WARN_ON(!rcu_read_lock_held()); if (!dst_hold_safe(dst)) dst = NULL; skb->_skb_refdst = (unsigned long)dst; skb->slow_gro |= !!dst; } return skb->_skb_refdst != 0UL; } /** * __skb_tunnel_rx - prepare skb for rx reinsert * @skb: buffer * @dev: tunnel device * @net: netns for packet i/o * * After decapsulation, packet is going to re-enter (netif_rx()) our stack, * so make some cleanups. (no accounting done) */ static inline void __skb_tunnel_rx(struct sk_buff *skb, struct net_device *dev, struct net *net) { skb->dev = dev; /* * Clear hash so that we can recalculate the hash for the * encapsulated packet, unless we have already determine the hash * over the L4 4-tuple. */ skb_clear_hash_if_not_l4(skb); skb_set_queue_mapping(skb, 0); skb_scrub_packet(skb, !net_eq(net, dev_net(dev))); } /** * skb_tunnel_rx - prepare skb for rx reinsert * @skb: buffer * @dev: tunnel device * @net: netns for packet i/o * * After decapsulation, packet is going to re-enter (netif_rx()) our stack, * so make some cleanups, and perform accounting. * Note: this accounting is not SMP safe. */ static inline void skb_tunnel_rx(struct sk_buff *skb, struct net_device *dev, struct net *net) { DEV_STATS_INC(dev, rx_packets); DEV_STATS_ADD(dev, rx_bytes, skb->len); __skb_tunnel_rx(skb, dev, net); } static inline u32 dst_tclassid(const struct sk_buff *skb) { #ifdef CONFIG_IP_ROUTE_CLASSID const struct dst_entry *dst; dst = skb_dst(skb); if (dst) return dst->tclassid; #endif return 0; } int dst_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb); static inline int dst_discard(struct sk_buff *skb) { return dst_discard_out(&init_net, skb->sk, skb); } void *dst_alloc(struct dst_ops *ops, struct net_device *dev, int initial_obsolete, unsigned short flags); void dst_init(struct dst_entry *dst, struct dst_ops *ops, struct net_device *dev, int initial_obsolete, unsigned short flags); void dst_dev_put(struct dst_entry *dst); static inline void dst_confirm(struct dst_entry *dst) { } static inline struct neighbour *dst_neigh_lookup(const struct dst_entry *dst, const void *daddr) { struct neighbour *n = dst->ops->neigh_lookup(dst, NULL, daddr); return IS_ERR(n) ? NULL : n; } static inline struct neighbour *dst_neigh_lookup_skb(const struct dst_entry *dst, struct sk_buff *skb) { struct neighbour *n; if (WARN_ON_ONCE(!dst->ops->neigh_lookup)) return NULL; n = dst->ops->neigh_lookup(dst, skb, NULL); return IS_ERR(n) ? 
NULL : n; } static inline void dst_confirm_neigh(const struct dst_entry *dst, const void *daddr) { if (dst->ops->confirm_neigh) dst->ops->confirm_neigh(dst, daddr); } static inline void dst_link_failure(struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); if (dst && dst->ops && dst->ops->link_failure) dst->ops->link_failure(skb); } static inline void dst_set_expires(struct dst_entry *dst, int timeout) { unsigned long expires = jiffies + timeout; if (expires == 0) expires = 1; if (dst->expires == 0 || time_before(expires, dst->expires)) dst->expires = expires; } static inline unsigned int dst_dev_overhead(struct dst_entry *dst, struct sk_buff *skb) { if (likely(dst)) return LL_RESERVED_SPACE(dst->dev); return skb->mac_len; } INDIRECT_CALLABLE_DECLARE(int ip6_output(struct net *, struct sock *, struct sk_buff *)); INDIRECT_CALLABLE_DECLARE(int ip_output(struct net *, struct sock *, struct sk_buff *)); /* Output packet to network from transport. */ static inline int dst_output(struct net *net, struct sock *sk, struct sk_buff *skb) { return INDIRECT_CALL_INET(skb_dst(skb)->output, ip6_output, ip_output, net, sk, skb); } INDIRECT_CALLABLE_DECLARE(int ip6_input(struct sk_buff *)); INDIRECT_CALLABLE_DECLARE(int ip_local_deliver(struct sk_buff *)); /* Input packet from network to transport. */ static inline int dst_input(struct sk_buff *skb) { return INDIRECT_CALL_INET(skb_dst(skb)->input, ip6_input, ip_local_deliver, skb); } INDIRECT_CALLABLE_DECLARE(struct dst_entry *ip6_dst_check(struct dst_entry *, u32)); INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *, u32)); static inline struct dst_entry *dst_check(struct dst_entry *dst, u32 cookie) { if (dst->obsolete) dst = INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check, dst, cookie); return dst; } /* Flags for xfrm_lookup flags argument. 
 */
enum {
	XFRM_LOOKUP_ICMP = 1 << 0,
	XFRM_LOOKUP_QUEUE = 1 << 1,
	XFRM_LOOKUP_KEEP_DST_REF = 1 << 2,
};

struct flowi;

#ifndef CONFIG_XFRM
static inline struct dst_entry *xfrm_lookup(struct net *net,
					    struct dst_entry *dst_orig,
					    const struct flowi *fl,
					    const struct sock *sk,
					    int flags)
{
	return dst_orig;
}

static inline struct dst_entry *
xfrm_lookup_with_ifid(struct net *net, struct dst_entry *dst_orig,
		      const struct flowi *fl, const struct sock *sk,
		      int flags, u32 if_id)
{
	return dst_orig;
}

static inline struct dst_entry *xfrm_lookup_route(struct net *net,
						  struct dst_entry *dst_orig,
						  const struct flowi *fl,
						  const struct sock *sk,
						  int flags)
{
	return dst_orig;
}

static inline struct xfrm_state *dst_xfrm(const struct dst_entry *dst)
{
	return NULL;
}

#else
struct dst_entry *xfrm_lookup(struct net *net, struct dst_entry *dst_orig,
			      const struct flowi *fl, const struct sock *sk,
			      int flags);

struct dst_entry *xfrm_lookup_with_ifid(struct net *net,
					struct dst_entry *dst_orig,
					const struct flowi *fl,
					const struct sock *sk, int flags,
					u32 if_id);

struct dst_entry *xfrm_lookup_route(struct net *net,
				    struct dst_entry *dst_orig,
				    const struct flowi *fl,
				    const struct sock *sk, int flags);

/* skb attached with this dst needs transformation if dst->xfrm is valid */
static inline struct xfrm_state *dst_xfrm(const struct dst_entry *dst)
{
	return dst->xfrm;
}
#endif

static inline void skb_dst_update_pmtu(struct sk_buff *skb, u32 mtu)
{
	struct dst_entry *dst = skb_dst(skb);

	if (dst && dst->ops->update_pmtu)
		dst->ops->update_pmtu(dst, NULL, skb, mtu, true);
}

/* update dst pmtu but not do neighbor confirm */
static inline void skb_dst_update_pmtu_no_confirm(struct sk_buff *skb, u32 mtu)
{
	struct dst_entry *dst = skb_dst(skb);

	if (dst && dst->ops->update_pmtu)
		dst->ops->update_pmtu(dst, NULL, skb, mtu, false);
}

struct dst_entry *dst_blackhole_check(struct dst_entry *dst, u32 cookie);
void dst_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
			       struct sk_buff *skb, u32 mtu,
			       bool confirm_neigh);
void dst_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
			    struct sk_buff *skb);
u32 *dst_blackhole_cow_metrics(struct dst_entry *dst, unsigned long old);
struct neighbour *dst_blackhole_neigh_lookup(const struct dst_entry *dst,
					     struct sk_buff *skb,
					     const void *daddr);
unsigned int dst_blackhole_mtu(const struct dst_entry *dst);

#endif /* _NET_DST_H */
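/*
 * Editor's illustrative sketch (not part of net/dst.h): how the helpers
 * above are typically combined by a caller that must keep an skb's route
 * alive beyond the current RCU read-side section. skb_dst_force() upgrades
 * a reference-less (NOREF) dst to a real reference via dst_hold_safe();
 * dst_mtu() then dispatches to the protocol's ->mtu() method. The wrapper
 * name example_pin_dst_and_get_mtu() is hypothetical.
 */
static inline unsigned int example_pin_dst_and_get_mtu(struct sk_buff *skb)
{
	/* Returns false if the route was already being destroyed. */
	if (!skb_dst_force(skb))
		return 0;	/* caller should drop the packet */

	/* The skb now owns a real reference on the dst. */
	return dst_mtu(skb_dst(skb));
}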
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 2012,2013 - ARM Ltd
 * Author: Marc Zyngier <marc.zyngier@arm.com>
 *
 * Derived from arch/arm/include/kvm_emulate.h
 * Copyright (C) 2012 - Virtual Open Systems and Columbia University
 * Author: Christoffer Dall <c.dall@virtualopensystems.com>
 */

#ifndef __ARM64_KVM_EMULATE_H__
#define __ARM64_KVM_EMULATE_H__

#include <linux/bitfield.h>
#include <linux/kvm_host.h>

#include <asm/debug-monitors.h>
#include <asm/esr.h>
#include <asm/kvm_arm.h>
#include <asm/kvm_hyp.h>
#include <asm/kvm_nested.h>
#include <asm/ptrace.h>
#include <asm/cputype.h>
#include <asm/virt.h>

#define CURRENT_EL_SP_EL0_VECTOR	0x0
#define CURRENT_EL_SP_ELx_VECTOR	0x200
#define LOWER_EL_AArch64_VECTOR		0x400
#define LOWER_EL_AArch32_VECTOR		0x600

enum exception_type {
	except_type_sync	= 0,
	except_type_irq		= 0x80,
	except_type_fiq		= 0x100,
	except_type_serror	= 0x180,
};

#define kvm_exception_type_names		\
	{ except_type_sync,	"SYNC"   },	\
	{ except_type_irq,	"IRQ"    },	\
	{
except_type_fiq, "FIQ" }, \ { except_type_serror, "SERROR" } bool kvm_condition_valid32(const struct kvm_vcpu *vcpu); void kvm_skip_instr32(struct kvm_vcpu *vcpu); void kvm_inject_undefined(struct kvm_vcpu *vcpu); void kvm_inject_vabt(struct kvm_vcpu *vcpu); void kvm_inject_dabt(struct kvm_vcpu *vcpu, unsigned long addr); void kvm_inject_pabt(struct kvm_vcpu *vcpu, unsigned long addr); void kvm_inject_size_fault(struct kvm_vcpu *vcpu); void kvm_vcpu_wfi(struct kvm_vcpu *vcpu); void kvm_emulate_nested_eret(struct kvm_vcpu *vcpu); int kvm_inject_nested_sync(struct kvm_vcpu *vcpu, u64 esr_el2); int kvm_inject_nested_irq(struct kvm_vcpu *vcpu); static inline void kvm_inject_nested_sve_trap(struct kvm_vcpu *vcpu) { u64 esr = FIELD_PREP(ESR_ELx_EC_MASK, ESR_ELx_EC_SVE) | ESR_ELx_IL; kvm_inject_nested_sync(vcpu, esr); } #if defined(__KVM_VHE_HYPERVISOR__) || defined(__KVM_NVHE_HYPERVISOR__) static __always_inline bool vcpu_el1_is_32bit(struct kvm_vcpu *vcpu) { return !(vcpu->arch.hcr_el2 & HCR_RW); } #else static __always_inline bool vcpu_el1_is_32bit(struct kvm_vcpu *vcpu) { return vcpu_has_feature(vcpu, KVM_ARM_VCPU_EL1_32BIT); } #endif static inline void vcpu_reset_hcr(struct kvm_vcpu *vcpu) { if (!vcpu_has_run_once(vcpu)) vcpu->arch.hcr_el2 = HCR_GUEST_FLAGS; /* * For non-FWB CPUs, we trap VM ops (HCR_EL2.TVM) until M+C * get set in SCTLR_EL1 such that we can detect when the guest * MMU gets turned on and do the necessary cache maintenance * then. */ if (!cpus_have_final_cap(ARM64_HAS_STAGE2_FWB)) vcpu->arch.hcr_el2 |= HCR_TVM; } static inline unsigned long *vcpu_hcr(struct kvm_vcpu *vcpu) { return (unsigned long *)&vcpu->arch.hcr_el2; } static inline void vcpu_clear_wfx_traps(struct kvm_vcpu *vcpu) { vcpu->arch.hcr_el2 &= ~HCR_TWE; if (atomic_read(&vcpu->arch.vgic_cpu.vgic_v3.its_vpe.vlpi_count) || vcpu->kvm->arch.vgic.nassgireq) vcpu->arch.hcr_el2 &= ~HCR_TWI; else vcpu->arch.hcr_el2 |= HCR_TWI; } static inline void vcpu_set_wfx_traps(struct kvm_vcpu *vcpu) { vcpu->arch.hcr_el2 |= HCR_TWE; vcpu->arch.hcr_el2 |= HCR_TWI; } static inline unsigned long vcpu_get_vsesr(struct kvm_vcpu *vcpu) { return vcpu->arch.vsesr_el2; } static inline void vcpu_set_vsesr(struct kvm_vcpu *vcpu, u64 vsesr) { vcpu->arch.vsesr_el2 = vsesr; } static __always_inline unsigned long *vcpu_pc(const struct kvm_vcpu *vcpu) { return (unsigned long *)&vcpu_gp_regs(vcpu)->pc; } static __always_inline unsigned long *vcpu_cpsr(const struct kvm_vcpu *vcpu) { return (unsigned long *)&vcpu_gp_regs(vcpu)->pstate; } static __always_inline bool vcpu_mode_is_32bit(const struct kvm_vcpu *vcpu) { return !!(*vcpu_cpsr(vcpu) & PSR_MODE32_BIT); } static __always_inline bool kvm_condition_valid(const struct kvm_vcpu *vcpu) { if (vcpu_mode_is_32bit(vcpu)) return kvm_condition_valid32(vcpu); return true; } static inline void vcpu_set_thumb(struct kvm_vcpu *vcpu) { *vcpu_cpsr(vcpu) |= PSR_AA32_T_BIT; } /* * vcpu_get_reg and vcpu_set_reg should always be passed a register number * coming from a read of ESR_EL2. Otherwise, it may give the wrong result on * AArch32 with banked registers. */ static __always_inline unsigned long vcpu_get_reg(const struct kvm_vcpu *vcpu, u8 reg_num) { return (reg_num == 31) ? 
0 : vcpu_gp_regs(vcpu)->regs[reg_num]; } static __always_inline void vcpu_set_reg(struct kvm_vcpu *vcpu, u8 reg_num, unsigned long val) { if (reg_num != 31) vcpu_gp_regs(vcpu)->regs[reg_num] = val; } static inline bool vcpu_is_el2_ctxt(const struct kvm_cpu_context *ctxt) { switch (ctxt->regs.pstate & (PSR_MODE32_BIT | PSR_MODE_MASK)) { case PSR_MODE_EL2h: case PSR_MODE_EL2t: return true; default: return false; } } static inline bool vcpu_is_el2(const struct kvm_vcpu *vcpu) { return vcpu_is_el2_ctxt(&vcpu->arch.ctxt); } static inline bool vcpu_el2_e2h_is_set(const struct kvm_vcpu *vcpu) { return (!cpus_have_final_cap(ARM64_HAS_HCR_NV1) || (__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_E2H)); } static inline bool vcpu_el2_tge_is_set(const struct kvm_vcpu *vcpu) { return ctxt_sys_reg(&vcpu->arch.ctxt, HCR_EL2) & HCR_TGE; } static inline bool is_hyp_ctxt(const struct kvm_vcpu *vcpu) { bool e2h, tge; u64 hcr; if (!vcpu_has_nv(vcpu)) return false; hcr = __vcpu_sys_reg(vcpu, HCR_EL2); e2h = (hcr & HCR_E2H); tge = (hcr & HCR_TGE); /* * We are in a hypervisor context if the vcpu mode is EL2 or * E2H and TGE bits are set. The latter means we are in the user space * of the VHE kernel. ARMv8.1 ARM describes this as 'InHost' * * Note that the HCR_EL2.{E2H,TGE}={0,1} isn't really handled in the * rest of the KVM code, and will result in a misbehaving guest. */ return vcpu_is_el2(vcpu) || (e2h && tge) || tge; } static inline bool vcpu_is_host_el0(const struct kvm_vcpu *vcpu) { return is_hyp_ctxt(vcpu) && !vcpu_is_el2(vcpu); } /* * The layout of SPSR for an AArch32 state is different when observed from an * AArch64 SPSR_ELx or an AArch32 SPSR_*. This function generates the AArch32 * view given an AArch64 view. * * In ARM DDI 0487E.a see: * * - The AArch64 view (SPSR_EL2) in section C5.2.18, page C5-426 * - The AArch32 view (SPSR_abt) in section G8.2.126, page G8-6256 * - The AArch32 view (SPSR_und) in section G8.2.132, page G8-6280 * * Which show the following differences: * * | Bit | AA64 | AA32 | Notes | * +-----+------+------+-----------------------------| * | 24 | DIT | J | J is RES0 in ARMv8 | * | 21 | SS | DIT | SS doesn't exist in AArch32 | * * ... and all other bits are (currently) common. 
*/ static inline unsigned long host_spsr_to_spsr32(unsigned long spsr) { const unsigned long overlap = BIT(24) | BIT(21); unsigned long dit = !!(spsr & PSR_AA32_DIT_BIT); spsr &= ~overlap; spsr |= dit << 21; return spsr; } static inline bool vcpu_mode_priv(const struct kvm_vcpu *vcpu) { u32 mode; if (vcpu_mode_is_32bit(vcpu)) { mode = *vcpu_cpsr(vcpu) & PSR_AA32_MODE_MASK; return mode > PSR_AA32_MODE_USR; } mode = *vcpu_cpsr(vcpu) & PSR_MODE_MASK; return mode != PSR_MODE_EL0t; } static __always_inline u64 kvm_vcpu_get_esr(const struct kvm_vcpu *vcpu) { return vcpu->arch.fault.esr_el2; } static __always_inline int kvm_vcpu_get_condition(const struct kvm_vcpu *vcpu) { u64 esr = kvm_vcpu_get_esr(vcpu); if (esr & ESR_ELx_CV) return (esr & ESR_ELx_COND_MASK) >> ESR_ELx_COND_SHIFT; return -1; } static __always_inline unsigned long kvm_vcpu_get_hfar(const struct kvm_vcpu *vcpu) { return vcpu->arch.fault.far_el2; } static __always_inline phys_addr_t kvm_vcpu_get_fault_ipa(const struct kvm_vcpu *vcpu) { return ((phys_addr_t)vcpu->arch.fault.hpfar_el2 & HPFAR_MASK) << 8; } static inline u64 kvm_vcpu_get_disr(const struct kvm_vcpu *vcpu) { return vcpu->arch.fault.disr_el1; } static inline u32 kvm_vcpu_hvc_get_imm(const struct kvm_vcpu *vcpu) { return kvm_vcpu_get_esr(vcpu) & ESR_ELx_xVC_IMM_MASK; } static __always_inline bool kvm_vcpu_dabt_isvalid(const struct kvm_vcpu *vcpu) { return !!(kvm_vcpu_get_esr(vcpu) & ESR_ELx_ISV); } static inline unsigned long kvm_vcpu_dabt_iss_nisv_sanitized(const struct kvm_vcpu *vcpu) { return kvm_vcpu_get_esr(vcpu) & (ESR_ELx_CM | ESR_ELx_WNR | ESR_ELx_FSC); } static inline bool kvm_vcpu_dabt_issext(const struct kvm_vcpu *vcpu) { return !!(kvm_vcpu_get_esr(vcpu) & ESR_ELx_SSE); } static inline bool kvm_vcpu_dabt_issf(const struct kvm_vcpu *vcpu) { return !!(kvm_vcpu_get_esr(vcpu) & ESR_ELx_SF); } static __always_inline int kvm_vcpu_dabt_get_rd(const struct kvm_vcpu *vcpu) { return (kvm_vcpu_get_esr(vcpu) & ESR_ELx_SRT_MASK) >> ESR_ELx_SRT_SHIFT; } static __always_inline bool kvm_vcpu_abt_iss1tw(const struct kvm_vcpu *vcpu) { return !!(kvm_vcpu_get_esr(vcpu) & ESR_ELx_S1PTW); } /* Always check for S1PTW *before* using this. 
*/ static __always_inline bool kvm_vcpu_dabt_iswrite(const struct kvm_vcpu *vcpu) { return kvm_vcpu_get_esr(vcpu) & ESR_ELx_WNR; } static inline bool kvm_vcpu_dabt_is_cm(const struct kvm_vcpu *vcpu) { return !!(kvm_vcpu_get_esr(vcpu) & ESR_ELx_CM); } static __always_inline unsigned int kvm_vcpu_dabt_get_as(const struct kvm_vcpu *vcpu) { return 1 << ((kvm_vcpu_get_esr(vcpu) & ESR_ELx_SAS) >> ESR_ELx_SAS_SHIFT); } /* This one is not specific to Data Abort */ static __always_inline bool kvm_vcpu_trap_il_is32bit(const struct kvm_vcpu *vcpu) { return !!(kvm_vcpu_get_esr(vcpu) & ESR_ELx_IL); } static __always_inline u8 kvm_vcpu_trap_get_class(const struct kvm_vcpu *vcpu) { return ESR_ELx_EC(kvm_vcpu_get_esr(vcpu)); } static inline bool kvm_vcpu_trap_is_iabt(const struct kvm_vcpu *vcpu) { return kvm_vcpu_trap_get_class(vcpu) == ESR_ELx_EC_IABT_LOW; } static inline bool kvm_vcpu_trap_is_exec_fault(const struct kvm_vcpu *vcpu) { return kvm_vcpu_trap_is_iabt(vcpu) && !kvm_vcpu_abt_iss1tw(vcpu); } static __always_inline u8 kvm_vcpu_trap_get_fault(const struct kvm_vcpu *vcpu) { return kvm_vcpu_get_esr(vcpu) & ESR_ELx_FSC; } static inline bool kvm_vcpu_trap_is_permission_fault(const struct kvm_vcpu *vcpu) { return esr_fsc_is_permission_fault(kvm_vcpu_get_esr(vcpu)); } static inline bool kvm_vcpu_trap_is_translation_fault(const struct kvm_vcpu *vcpu) { return esr_fsc_is_translation_fault(kvm_vcpu_get_esr(vcpu)); } static inline u64 kvm_vcpu_trap_get_perm_fault_granule(const struct kvm_vcpu *vcpu) { unsigned long esr = kvm_vcpu_get_esr(vcpu); BUG_ON(!esr_fsc_is_permission_fault(esr)); return BIT(ARM64_HW_PGTABLE_LEVEL_SHIFT(esr & ESR_ELx_FSC_LEVEL)); } static __always_inline bool kvm_vcpu_abt_issea(const struct kvm_vcpu *vcpu) { switch (kvm_vcpu_trap_get_fault(vcpu)) { case ESR_ELx_FSC_EXTABT: case ESR_ELx_FSC_SEA_TTW(-1) ... ESR_ELx_FSC_SEA_TTW(3): case ESR_ELx_FSC_SECC: case ESR_ELx_FSC_SECC_TTW(-1) ... ESR_ELx_FSC_SECC_TTW(3): return true; default: return false; } } static __always_inline int kvm_vcpu_sys_get_rt(struct kvm_vcpu *vcpu) { u64 esr = kvm_vcpu_get_esr(vcpu); return ESR_ELx_SYS64_ISS_RT(esr); } static inline bool kvm_is_write_fault(struct kvm_vcpu *vcpu) { if (kvm_vcpu_abt_iss1tw(vcpu)) { /* * Only a permission fault on a S1PTW should be * considered as a write. Otherwise, page tables baked * in a read-only memslot will result in an exception * being delivered in the guest. * * The drawback is that we end-up faulting twice if the * guest is using any of HW AF/DB: a translation fault * to map the page containing the PT (read only at * first), then a permission fault to allow the flags * to be set. 
*/ return kvm_vcpu_trap_is_permission_fault(vcpu); } if (kvm_vcpu_trap_is_iabt(vcpu)) return false; return kvm_vcpu_dabt_iswrite(vcpu); } static inline unsigned long kvm_vcpu_get_mpidr_aff(struct kvm_vcpu *vcpu) { return __vcpu_sys_reg(vcpu, MPIDR_EL1) & MPIDR_HWID_BITMASK; } static inline void kvm_vcpu_set_be(struct kvm_vcpu *vcpu) { if (vcpu_mode_is_32bit(vcpu)) { *vcpu_cpsr(vcpu) |= PSR_AA32_E_BIT; } else { u64 sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL1); sctlr |= SCTLR_ELx_EE; vcpu_write_sys_reg(vcpu, sctlr, SCTLR_EL1); } } static inline bool kvm_vcpu_is_be(struct kvm_vcpu *vcpu) { if (vcpu_mode_is_32bit(vcpu)) return !!(*vcpu_cpsr(vcpu) & PSR_AA32_E_BIT); if (vcpu_mode_priv(vcpu)) return !!(vcpu_read_sys_reg(vcpu, SCTLR_EL1) & SCTLR_ELx_EE); else return !!(vcpu_read_sys_reg(vcpu, SCTLR_EL1) & SCTLR_EL1_E0E); } static inline unsigned long vcpu_data_guest_to_host(struct kvm_vcpu *vcpu, unsigned long data, unsigned int len) { if (kvm_vcpu_is_be(vcpu)) { switch (len) { case 1: return data & 0xff; case 2: return be16_to_cpu(data & 0xffff); case 4: return be32_to_cpu(data & 0xffffffff); default: return be64_to_cpu(data); } } else { switch (len) { case 1: return data & 0xff; case 2: return le16_to_cpu(data & 0xffff); case 4: return le32_to_cpu(data & 0xffffffff); default: return le64_to_cpu(data); } } return data; /* Leave LE untouched */ } static inline unsigned long vcpu_data_host_to_guest(struct kvm_vcpu *vcpu, unsigned long data, unsigned int len) { if (kvm_vcpu_is_be(vcpu)) { switch (len) { case 1: return data & 0xff; case 2: return cpu_to_be16(data & 0xffff); case 4: return cpu_to_be32(data & 0xffffffff); default: return cpu_to_be64(data); } } else { switch (len) { case 1: return data & 0xff; case 2: return cpu_to_le16(data & 0xffff); case 4: return cpu_to_le32(data & 0xffffffff); default: return cpu_to_le64(data); } } return data; /* Leave LE untouched */ } static __always_inline void kvm_incr_pc(struct kvm_vcpu *vcpu) { WARN_ON(vcpu_get_flag(vcpu, PENDING_EXCEPTION)); vcpu_set_flag(vcpu, INCREMENT_PC); } #define kvm_pend_exception(v, e) \ do { \ WARN_ON(vcpu_get_flag((v), INCREMENT_PC)); \ vcpu_set_flag((v), PENDING_EXCEPTION); \ vcpu_set_flag((v), e); \ } while (0) #define __build_check_all_or_none(r, bits) \ BUILD_BUG_ON(((r) & (bits)) && ((r) & (bits)) != (bits)) #define __cpacr_to_cptr_clr(clr, set) \ ({ \ u64 cptr = 0; \ \ if ((set) & CPACR_EL1_FPEN) \ cptr |= CPTR_EL2_TFP; \ if ((set) & CPACR_EL1_ZEN) \ cptr |= CPTR_EL2_TZ; \ if ((set) & CPACR_EL1_SMEN) \ cptr |= CPTR_EL2_TSM; \ if ((clr) & CPACR_EL1_TTA) \ cptr |= CPTR_EL2_TTA; \ if ((clr) & CPTR_EL2_TAM) \ cptr |= CPTR_EL2_TAM; \ if ((clr) & CPTR_EL2_TCPAC) \ cptr |= CPTR_EL2_TCPAC; \ \ cptr; \ }) #define __cpacr_to_cptr_set(clr, set) \ ({ \ u64 cptr = 0; \ \ if ((clr) & CPACR_EL1_FPEN) \ cptr |= CPTR_EL2_TFP; \ if ((clr) & CPACR_EL1_ZEN) \ cptr |= CPTR_EL2_TZ; \ if ((clr) & CPACR_EL1_SMEN) \ cptr |= CPTR_EL2_TSM; \ if ((set) & CPACR_EL1_TTA) \ cptr |= CPTR_EL2_TTA; \ if ((set) & CPTR_EL2_TAM) \ cptr |= CPTR_EL2_TAM; \ if ((set) & CPTR_EL2_TCPAC) \ cptr |= CPTR_EL2_TCPAC; \ \ cptr; \ }) #define cpacr_clear_set(clr, set) \ do { \ BUILD_BUG_ON((set) & CPTR_VHE_EL2_RES0); \ BUILD_BUG_ON((clr) & CPACR_EL1_E0POE); \ __build_check_all_or_none((clr), CPACR_EL1_FPEN); \ __build_check_all_or_none((set), CPACR_EL1_FPEN); \ __build_check_all_or_none((clr), CPACR_EL1_ZEN); \ __build_check_all_or_none((set), CPACR_EL1_ZEN); \ __build_check_all_or_none((clr), CPACR_EL1_SMEN); \ __build_check_all_or_none((set), CPACR_EL1_SMEN); \ \ if 
(has_vhe() || has_hvhe())					\
			sysreg_clear_set(cpacr_el1, clr, set);		\
		else							\
			sysreg_clear_set(cptr_el2,			\
					 __cpacr_to_cptr_clr(clr, set),	\
					 __cpacr_to_cptr_set(clr, set));\
	} while (0)

/*
 * Returns a 'sanitised' view of CPTR_EL2, translating from nVHE to the VHE
 * format if E2H isn't set.
 */
static inline u64 vcpu_sanitised_cptr_el2(const struct kvm_vcpu *vcpu)
{
	u64 cptr = __vcpu_sys_reg(vcpu, CPTR_EL2);

	if (!vcpu_el2_e2h_is_set(vcpu))
		cptr = translate_cptr_el2_to_cpacr_el1(cptr);

	return cptr;
}

static inline bool ____cptr_xen_trap_enabled(const struct kvm_vcpu *vcpu,
					     unsigned int xen)
{
	switch (xen) {
	case 0b00:
	case 0b10:
		return true;
	case 0b01:
		return vcpu_el2_tge_is_set(vcpu) && !vcpu_is_el2(vcpu);
	case 0b11:
	default:
		return false;
	}
}

#define __guest_hyp_cptr_xen_trap_enabled(vcpu, xen)			\
	(!vcpu_has_nv(vcpu) ? false :					\
	 ____cptr_xen_trap_enabled(vcpu,				\
				   SYS_FIELD_GET(CPACR_EL1, xen,	\
						 vcpu_sanitised_cptr_el2(vcpu))))

static inline bool guest_hyp_fpsimd_traps_enabled(const struct kvm_vcpu *vcpu)
{
	return __guest_hyp_cptr_xen_trap_enabled(vcpu, FPEN);
}

static inline bool guest_hyp_sve_traps_enabled(const struct kvm_vcpu *vcpu)
{
	return __guest_hyp_cptr_xen_trap_enabled(vcpu, ZEN);
}

#endif /* __ARM64_KVM_EMULATE_H__ */
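/*
 * Editor's illustrative sketch (not part of kvm_emulate.h): a typical way
 * the ESR accessors above are combined when decoding a data abort for MMIO
 * emulation. Only helpers defined in this header are used; the function
 * name example_decode_mmio_dabt() is hypothetical, and -EINVAL assumes
 * <linux/errno.h> is reachable through the existing includes.
 */
static inline int example_decode_mmio_dabt(struct kvm_vcpu *vcpu,
					   bool *is_write, int *len, int *rt)
{
	/* Without valid ISS info (or for a S1 PTW fault) we cannot decode. */
	if (!kvm_vcpu_dabt_isvalid(vcpu) || kvm_vcpu_abt_iss1tw(vcpu))
		return -EINVAL;

	*is_write = kvm_vcpu_dabt_iswrite(vcpu);	/* ESR_ELx.WnR */
	*len = kvm_vcpu_dabt_get_as(vcpu);		/* access size, bytes */
	*rt = kvm_vcpu_dabt_get_rd(vcpu);		/* transfer register */

	/* Ask the exception code to step the guest past the instruction. */
	kvm_incr_pc(vcpu);
	return 0;
}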
// SPDX-License-Identifier: GPL-2.0

/* net/sched/sch_taprio.c	Time Aware Priority Scheduler
 *
 * Authors:	Vinicius Costa Gomes <vinicius.gomes@intel.com>
 *
 */

#include <linux/ethtool.h>
#include <linux/ethtool_netlink.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/list.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <linux/math64.h>
#include <linux/module.h>
#include <linux/spinlock.h>
#include <linux/rcupdate.h>
#include <linux/time.h>
#include <net/gso.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
#include <net/pkt_cls.h>
#include <net/sch_generic.h>
#include <net/sock.h>
#include <net/tcp.h>

#define TAPRIO_STAT_NOT_SET	(~0ULL)

#include "sch_mqprio_lib.h"

static LIST_HEAD(taprio_list);
static struct static_key_false taprio_have_broken_mqprio;
static struct static_key_false taprio_have_working_mqprio;

#define TAPRIO_ALL_GATES_OPEN -1

#define TXTIME_ASSIST_IS_ENABLED(flags) ((flags) & TCA_TAPRIO_ATTR_FLAG_TXTIME_ASSIST)
#define FULL_OFFLOAD_IS_ENABLED(flags) ((flags) & TCA_TAPRIO_ATTR_FLAG_FULL_OFFLOAD)
#define TAPRIO_SUPPORTED_FLAGS \
	(TCA_TAPRIO_ATTR_FLAG_TXTIME_ASSIST | TCA_TAPRIO_ATTR_FLAG_FULL_OFFLOAD)
#define TAPRIO_FLAGS_INVALID U32_MAX

struct sched_entry {
	/* Durations between this GCL entry and the GCL entry where the
	 * respective traffic class gate closes
	 */
	u64 gate_duration[TC_MAX_QUEUE];
	atomic_t budget[TC_MAX_QUEUE];
	/* The qdisc makes some effort so that no packet leaves
	 * after this time
	 */
	ktime_t gate_close_time[TC_MAX_QUEUE];
	struct list_head list;

	/* Used to calculate when to advance the schedule */
	ktime_t end_time;
	ktime_t next_txtime;
	int index;
	u32 gate_mask;
	u32 interval;
	u8 command;
};

struct sched_gate_list {
	/* Longest non-zero contiguous gate durations per traffic class,
	 * or 0 if a traffic class gate never opens during the schedule.
*/ u64 max_open_gate_duration[TC_MAX_QUEUE]; u32 max_frm_len[TC_MAX_QUEUE]; /* for the fast path */ u32 max_sdu[TC_MAX_QUEUE]; /* for dump */ struct rcu_head rcu; struct list_head entries; size_t num_entries; ktime_t cycle_end_time; s64 cycle_time; s64 cycle_time_extension; s64 base_time; }; struct taprio_sched { struct Qdisc **qdiscs; struct Qdisc *root; u32 flags; enum tk_offsets tk_offset; int clockid; bool offloaded; bool detected_mqprio; bool broken_mqprio; atomic64_t picos_per_byte; /* Using picoseconds because for 10Gbps+ * speeds it's sub-nanoseconds per byte */ /* Protects the update side of the RCU protected current_entry */ spinlock_t current_entry_lock; struct sched_entry __rcu *current_entry; struct sched_gate_list __rcu *oper_sched; struct sched_gate_list __rcu *admin_sched; struct hrtimer advance_timer; struct list_head taprio_list; int cur_txq[TC_MAX_QUEUE]; u32 max_sdu[TC_MAX_QUEUE]; /* save info from the user */ u32 fp[TC_QOPT_MAX_QUEUE]; /* only for dump and offloading */ u32 txtime_delay; }; struct __tc_taprio_qopt_offload { refcount_t users; struct tc_taprio_qopt_offload offload; }; static void taprio_calculate_gate_durations(struct taprio_sched *q, struct sched_gate_list *sched) { struct net_device *dev = qdisc_dev(q->root); int num_tc = netdev_get_num_tc(dev); struct sched_entry *entry, *cur; int tc; list_for_each_entry(entry, &sched->entries, list) { u32 gates_still_open = entry->gate_mask; /* For each traffic class, calculate each open gate duration, * starting at this schedule entry and ending at the schedule * entry containing a gate close event for that TC. */ cur = entry; do { if (!gates_still_open) break; for (tc = 0; tc < num_tc; tc++) { if (!(gates_still_open & BIT(tc))) continue; if (cur->gate_mask & BIT(tc)) entry->gate_duration[tc] += cur->interval; else gates_still_open &= ~BIT(tc); } cur = list_next_entry_circular(cur, &sched->entries, list); } while (cur != entry); /* Keep track of the maximum gate duration for each traffic * class, taking care to not confuse a traffic class which is * temporarily closed with one that is always closed. 
*/ for (tc = 0; tc < num_tc; tc++) if (entry->gate_duration[tc] && sched->max_open_gate_duration[tc] < entry->gate_duration[tc]) sched->max_open_gate_duration[tc] = entry->gate_duration[tc]; } } static bool taprio_entry_allows_tx(ktime_t skb_end_time, struct sched_entry *entry, int tc) { return ktime_before(skb_end_time, entry->gate_close_time[tc]); } static ktime_t sched_base_time(const struct sched_gate_list *sched) { if (!sched) return KTIME_MAX; return ns_to_ktime(sched->base_time); } static ktime_t taprio_mono_to_any(const struct taprio_sched *q, ktime_t mono) { /* This pairs with WRITE_ONCE() in taprio_parse_clockid() */ enum tk_offsets tk_offset = READ_ONCE(q->tk_offset); switch (tk_offset) { case TK_OFFS_MAX: return mono; default: return ktime_mono_to_any(mono, tk_offset); } } static ktime_t taprio_get_time(const struct taprio_sched *q) { return taprio_mono_to_any(q, ktime_get()); } static void taprio_free_sched_cb(struct rcu_head *head) { struct sched_gate_list *sched = container_of(head, struct sched_gate_list, rcu); struct sched_entry *entry, *n; list_for_each_entry_safe(entry, n, &sched->entries, list) { list_del(&entry->list); kfree(entry); } kfree(sched); } static void switch_schedules(struct taprio_sched *q, struct sched_gate_list **admin, struct sched_gate_list **oper) { rcu_assign_pointer(q->oper_sched, *admin); rcu_assign_pointer(q->admin_sched, NULL); if (*oper) call_rcu(&(*oper)->rcu, taprio_free_sched_cb); *oper = *admin; *admin = NULL; } /* Get how much time has been already elapsed in the current cycle. */ static s32 get_cycle_time_elapsed(struct sched_gate_list *sched, ktime_t time) { ktime_t time_since_sched_start; s32 time_elapsed; time_since_sched_start = ktime_sub(time, sched->base_time); div_s64_rem(time_since_sched_start, sched->cycle_time, &time_elapsed); return time_elapsed; } static ktime_t get_interval_end_time(struct sched_gate_list *sched, struct sched_gate_list *admin, struct sched_entry *entry, ktime_t intv_start) { s32 cycle_elapsed = get_cycle_time_elapsed(sched, intv_start); ktime_t intv_end, cycle_ext_end, cycle_end; cycle_end = ktime_add_ns(intv_start, sched->cycle_time - cycle_elapsed); intv_end = ktime_add_ns(intv_start, entry->interval); cycle_ext_end = ktime_add(cycle_end, sched->cycle_time_extension); if (ktime_before(intv_end, cycle_end)) return intv_end; else if (admin && admin != sched && ktime_after(admin->base_time, cycle_end) && ktime_before(admin->base_time, cycle_ext_end)) return admin->base_time; else return cycle_end; } static int length_to_duration(struct taprio_sched *q, int len) { return div_u64(len * atomic64_read(&q->picos_per_byte), PSEC_PER_NSEC); } static int duration_to_length(struct taprio_sched *q, u64 duration) { return div_u64(duration * PSEC_PER_NSEC, atomic64_read(&q->picos_per_byte)); } /* Sets sched->max_sdu[] and sched->max_frm_len[] to the minimum between the * q->max_sdu[] requested by the user and the max_sdu dynamically determined by * the maximum open gate durations at the given link speed. 
*/ static void taprio_update_queue_max_sdu(struct taprio_sched *q, struct sched_gate_list *sched, struct qdisc_size_table *stab) { struct net_device *dev = qdisc_dev(q->root); int num_tc = netdev_get_num_tc(dev); u32 max_sdu_from_user; u32 max_sdu_dynamic; u32 max_sdu; int tc; for (tc = 0; tc < num_tc; tc++) { max_sdu_from_user = q->max_sdu[tc] ?: U32_MAX; /* TC gate never closes => keep the queueMaxSDU * selected by the user */ if (sched->max_open_gate_duration[tc] == sched->cycle_time) { max_sdu_dynamic = U32_MAX; } else { u32 max_frm_len; max_frm_len = duration_to_length(q, sched->max_open_gate_duration[tc]); /* Compensate for L1 overhead from size table, * but don't let the frame size go negative */ if (stab) { max_frm_len -= stab->szopts.overhead; max_frm_len = max_t(int, max_frm_len, dev->hard_header_len + 1); } max_sdu_dynamic = max_frm_len - dev->hard_header_len; if (max_sdu_dynamic > dev->max_mtu) max_sdu_dynamic = U32_MAX; } max_sdu = min(max_sdu_dynamic, max_sdu_from_user); if (max_sdu != U32_MAX) { sched->max_frm_len[tc] = max_sdu + dev->hard_header_len; sched->max_sdu[tc] = max_sdu; } else { sched->max_frm_len[tc] = U32_MAX; /* never oversized */ sched->max_sdu[tc] = 0; } } } /* Returns the entry corresponding to next available interval. If * validate_interval is set, it only validates whether the timestamp occurs * when the gate corresponding to the skb's traffic class is open. */ static struct sched_entry *find_entry_to_transmit(struct sk_buff *skb, struct Qdisc *sch, struct sched_gate_list *sched, struct sched_gate_list *admin, ktime_t time, ktime_t *interval_start, ktime_t *interval_end, bool validate_interval) { ktime_t curr_intv_start, curr_intv_end, cycle_end, packet_transmit_time; ktime_t earliest_txtime = KTIME_MAX, txtime, cycle, transmit_end_time; struct sched_entry *entry = NULL, *entry_found = NULL; struct taprio_sched *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); bool entry_available = false; s32 cycle_elapsed; int tc, n; tc = netdev_get_prio_tc_map(dev, skb->priority); packet_transmit_time = length_to_duration(q, qdisc_pkt_len(skb)); *interval_start = 0; *interval_end = 0; if (!sched) return NULL; cycle = sched->cycle_time; cycle_elapsed = get_cycle_time_elapsed(sched, time); curr_intv_end = ktime_sub_ns(time, cycle_elapsed); cycle_end = ktime_add_ns(curr_intv_end, cycle); list_for_each_entry(entry, &sched->entries, list) { curr_intv_start = curr_intv_end; curr_intv_end = get_interval_end_time(sched, admin, entry, curr_intv_start); if (ktime_after(curr_intv_start, cycle_end)) break; if (!(entry->gate_mask & BIT(tc)) || packet_transmit_time > entry->interval) continue; txtime = entry->next_txtime; if (ktime_before(txtime, time) || validate_interval) { transmit_end_time = ktime_add_ns(time, packet_transmit_time); if ((ktime_before(curr_intv_start, time) && ktime_before(transmit_end_time, curr_intv_end)) || (ktime_after(curr_intv_start, time) && !validate_interval)) { entry_found = entry; *interval_start = curr_intv_start; *interval_end = curr_intv_end; break; } else if (!entry_available && !validate_interval) { /* Here, we are just trying to find out the * first available interval in the next cycle. 
*/ entry_available = true; entry_found = entry; *interval_start = ktime_add_ns(curr_intv_start, cycle); *interval_end = ktime_add_ns(curr_intv_end, cycle); } } else if (ktime_before(txtime, earliest_txtime) && !entry_available) { earliest_txtime = txtime; entry_found = entry; n = div_s64(ktime_sub(txtime, curr_intv_start), cycle); *interval_start = ktime_add(curr_intv_start, n * cycle); *interval_end = ktime_add(curr_intv_end, n * cycle); } } return entry_found; } static bool is_valid_interval(struct sk_buff *skb, struct Qdisc *sch) { struct taprio_sched *q = qdisc_priv(sch); struct sched_gate_list *sched, *admin; ktime_t interval_start, interval_end; struct sched_entry *entry; rcu_read_lock(); sched = rcu_dereference(q->oper_sched); admin = rcu_dereference(q->admin_sched); entry = find_entry_to_transmit(skb, sch, sched, admin, skb->tstamp, &interval_start, &interval_end, true); rcu_read_unlock(); return entry; } /* This returns the tstamp value set by TCP in terms of the set clock. */ static ktime_t get_tcp_tstamp(struct taprio_sched *q, struct sk_buff *skb) { unsigned int offset = skb_network_offset(skb); const struct ipv6hdr *ipv6h; const struct iphdr *iph; struct ipv6hdr _ipv6h; ipv6h = skb_header_pointer(skb, offset, sizeof(_ipv6h), &_ipv6h); if (!ipv6h) return 0; if (ipv6h->version == 4) { iph = (struct iphdr *)ipv6h; offset += iph->ihl * 4; /* special-case 6in4 tunnelling, as that is a common way to get * v6 connectivity in the home */ if (iph->protocol == IPPROTO_IPV6) { ipv6h = skb_header_pointer(skb, offset, sizeof(_ipv6h), &_ipv6h); if (!ipv6h || ipv6h->nexthdr != IPPROTO_TCP) return 0; } else if (iph->protocol != IPPROTO_TCP) { return 0; } } else if (ipv6h->version == 6 && ipv6h->nexthdr != IPPROTO_TCP) { return 0; } return taprio_mono_to_any(q, skb->skb_mstamp_ns); } /* There are a few scenarios where we will have to modify the txtime from * what is read from next_txtime in sched_entry. They are: * 1. If txtime is in the past, * a. The gate for the traffic class is currently open and packet can be * transmitted before it closes, schedule the packet right away. * b. If the gate corresponding to the traffic class is going to open later * in the cycle, set the txtime of packet to the interval start. * 2. If txtime is in the future, there are packets corresponding to the * current traffic class waiting to be transmitted. So, the following * possibilities exist: * a. We can transmit the packet before the window containing the txtime * closes. * b. The window might close before the transmission can be completed * successfully. So, schedule the packet in the next open window. 
*/ static long get_packet_txtime(struct sk_buff *skb, struct Qdisc *sch) { ktime_t transmit_end_time, interval_end, interval_start, tcp_tstamp; struct taprio_sched *q = qdisc_priv(sch); struct sched_gate_list *sched, *admin; ktime_t minimum_time, now, txtime; int len, packet_transmit_time; struct sched_entry *entry; bool sched_changed; now = taprio_get_time(q); minimum_time = ktime_add_ns(now, q->txtime_delay); tcp_tstamp = get_tcp_tstamp(q, skb); minimum_time = max_t(ktime_t, minimum_time, tcp_tstamp); rcu_read_lock(); admin = rcu_dereference(q->admin_sched); sched = rcu_dereference(q->oper_sched); if (admin && ktime_after(minimum_time, admin->base_time)) switch_schedules(q, &admin, &sched); /* Until the schedule starts, all the queues are open */ if (!sched || ktime_before(minimum_time, sched->base_time)) { txtime = minimum_time; goto done; } len = qdisc_pkt_len(skb); packet_transmit_time = length_to_duration(q, len); do { sched_changed = false; entry = find_entry_to_transmit(skb, sch, sched, admin, minimum_time, &interval_start, &interval_end, false); if (!entry) { txtime = 0; goto done; } txtime = entry->next_txtime; txtime = max_t(ktime_t, txtime, minimum_time); txtime = max_t(ktime_t, txtime, interval_start); if (admin && admin != sched && ktime_after(txtime, admin->base_time)) { sched = admin; sched_changed = true; continue; } transmit_end_time = ktime_add(txtime, packet_transmit_time); minimum_time = transmit_end_time; /* Update the txtime of current entry to the next time it's * interval starts. */ if (ktime_after(transmit_end_time, interval_end)) entry->next_txtime = ktime_add(interval_start, sched->cycle_time); } while (sched_changed || ktime_after(transmit_end_time, interval_end)); entry->next_txtime = transmit_end_time; done: rcu_read_unlock(); return txtime; } /* Devices with full offload are expected to honor this in hardware */ static bool taprio_skb_exceeds_queue_max_sdu(struct Qdisc *sch, struct sk_buff *skb) { struct taprio_sched *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); struct sched_gate_list *sched; int prio = skb->priority; bool exceeds = false; u8 tc; tc = netdev_get_prio_tc_map(dev, prio); rcu_read_lock(); sched = rcu_dereference(q->oper_sched); if (sched && skb->len > sched->max_frm_len[tc]) exceeds = true; rcu_read_unlock(); return exceeds; } static int taprio_enqueue_one(struct sk_buff *skb, struct Qdisc *sch, struct Qdisc *child, struct sk_buff **to_free) { struct taprio_sched *q = qdisc_priv(sch); /* sk_flags are only safe to use on full sockets. 
*/ if (skb->sk && sk_fullsock(skb->sk) && sock_flag(skb->sk, SOCK_TXTIME)) { if (!is_valid_interval(skb, sch)) return qdisc_drop(skb, sch, to_free); } else if (TXTIME_ASSIST_IS_ENABLED(q->flags)) { skb->tstamp = get_packet_txtime(skb, sch); if (!skb->tstamp) return qdisc_drop(skb, sch, to_free); } qdisc_qstats_backlog_inc(sch, skb); sch->q.qlen++; return qdisc_enqueue(skb, child, to_free); } static int taprio_enqueue_segmented(struct sk_buff *skb, struct Qdisc *sch, struct Qdisc *child, struct sk_buff **to_free) { unsigned int slen = 0, numsegs = 0, len = qdisc_pkt_len(skb); netdev_features_t features = netif_skb_features(skb); struct sk_buff *segs, *nskb; int ret; segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK); if (IS_ERR_OR_NULL(segs)) return qdisc_drop(skb, sch, to_free); skb_list_walk_safe(segs, segs, nskb) { skb_mark_not_on_list(segs); qdisc_skb_cb(segs)->pkt_len = segs->len; slen += segs->len; /* FIXME: we should be segmenting to a smaller size * rather than dropping these */ if (taprio_skb_exceeds_queue_max_sdu(sch, segs)) ret = qdisc_drop(segs, sch, to_free); else ret = taprio_enqueue_one(segs, sch, child, to_free); if (ret != NET_XMIT_SUCCESS) { if (net_xmit_drop_count(ret)) qdisc_qstats_drop(sch); } else { numsegs++; } } if (numsegs > 1) qdisc_tree_reduce_backlog(sch, 1 - numsegs, len - slen); consume_skb(skb); return numsegs > 0 ? NET_XMIT_SUCCESS : NET_XMIT_DROP; } /* Will not be called in the full offload case, since the TX queues are * attached to the Qdisc created using qdisc_create_dflt() */ static int taprio_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { struct taprio_sched *q = qdisc_priv(sch); struct Qdisc *child; int queue; queue = skb_get_queue_mapping(skb); child = q->qdiscs[queue]; if (unlikely(!child)) return qdisc_drop(skb, sch, to_free); if (taprio_skb_exceeds_queue_max_sdu(sch, skb)) { /* Large packets might not be transmitted when the transmission * duration exceeds any configured interval. Therefore, segment * the skb into smaller chunks. Drivers with full offload are * expected to handle this in hardware. 
*/ if (skb_is_gso(skb)) return taprio_enqueue_segmented(skb, sch, child, to_free); return qdisc_drop(skb, sch, to_free); } return taprio_enqueue_one(skb, sch, child, to_free); } static struct sk_buff *taprio_peek(struct Qdisc *sch) { WARN_ONCE(1, "taprio only supports operating as root qdisc, peek() not implemented"); return NULL; } static void taprio_set_budgets(struct taprio_sched *q, struct sched_gate_list *sched, struct sched_entry *entry) { struct net_device *dev = qdisc_dev(q->root); int num_tc = netdev_get_num_tc(dev); int tc, budget; for (tc = 0; tc < num_tc; tc++) { /* Traffic classes which never close have infinite budget */ if (entry->gate_duration[tc] == sched->cycle_time) budget = INT_MAX; else budget = div64_u64((u64)entry->gate_duration[tc] * PSEC_PER_NSEC, atomic64_read(&q->picos_per_byte)); atomic_set(&entry->budget[tc], budget); } } /* When an skb is sent, it consumes from the budget of all traffic classes */ static int taprio_update_budgets(struct sched_entry *entry, size_t len, int tc_consumed, int num_tc) { int tc, budget, new_budget = 0; for (tc = 0; tc < num_tc; tc++) { budget = atomic_read(&entry->budget[tc]); /* Don't consume from infinite budget */ if (budget == INT_MAX) { if (tc == tc_consumed) new_budget = budget; continue; } if (tc == tc_consumed) new_budget = atomic_sub_return(len, &entry->budget[tc]); else atomic_sub(len, &entry->budget[tc]); } return new_budget; } static struct sk_buff *taprio_dequeue_from_txq(struct Qdisc *sch, int txq, struct sched_entry *entry, u32 gate_mask) { struct taprio_sched *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); struct Qdisc *child = q->qdiscs[txq]; int num_tc = netdev_get_num_tc(dev); struct sk_buff *skb; ktime_t guard; int prio; int len; u8 tc; if (unlikely(!child)) return NULL; if (TXTIME_ASSIST_IS_ENABLED(q->flags)) goto skip_peek_checks; skb = child->ops->peek(child); if (!skb) return NULL; prio = skb->priority; tc = netdev_get_prio_tc_map(dev, prio); if (!(gate_mask & BIT(tc))) return NULL; len = qdisc_pkt_len(skb); guard = ktime_add_ns(taprio_get_time(q), length_to_duration(q, len)); /* In the case that there's no gate entry, there's no * guard band ... */ if (gate_mask != TAPRIO_ALL_GATES_OPEN && !taprio_entry_allows_tx(guard, entry, tc)) return NULL; /* ... and no budget. 
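 * Both the guard band and the budget checks are bypassed while all gates are forced open (TAPRIO_ALL_GATES_OPEN), i.e. before the schedule has started.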
*/ if (gate_mask != TAPRIO_ALL_GATES_OPEN && taprio_update_budgets(entry, len, tc, num_tc) < 0) return NULL; skip_peek_checks: skb = child->ops->dequeue(child); if (unlikely(!skb)) return NULL; qdisc_bstats_update(sch, skb); qdisc_qstats_backlog_dec(sch, skb); sch->q.qlen--; return skb; } static void taprio_next_tc_txq(struct net_device *dev, int tc, int *txq) { int offset = dev->tc_to_txq[tc].offset; int count = dev->tc_to_txq[tc].count; (*txq)++; if (*txq == offset + count) *txq = offset; } /* Prioritize higher traffic classes, and select among TXQs belonging to the * same TC using round robin */ static struct sk_buff *taprio_dequeue_tc_priority(struct Qdisc *sch, struct sched_entry *entry, u32 gate_mask) { struct taprio_sched *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); int num_tc = netdev_get_num_tc(dev); struct sk_buff *skb; int tc; for (tc = num_tc - 1; tc >= 0; tc--) { int first_txq = q->cur_txq[tc]; if (!(gate_mask & BIT(tc))) continue; do { skb = taprio_dequeue_from_txq(sch, q->cur_txq[tc], entry, gate_mask); taprio_next_tc_txq(dev, tc, &q->cur_txq[tc]); if (q->cur_txq[tc] >= dev->num_tx_queues) q->cur_txq[tc] = first_txq; if (skb) return skb; } while (q->cur_txq[tc] != first_txq); } return NULL; } /* Broken way of prioritizing smaller TXQ indices and ignoring the traffic * class other than to determine whether the gate is open or not */ static struct sk_buff *taprio_dequeue_txq_priority(struct Qdisc *sch, struct sched_entry *entry, u32 gate_mask) { struct net_device *dev = qdisc_dev(sch); struct sk_buff *skb; int i; for (i = 0; i < dev->num_tx_queues; i++) { skb = taprio_dequeue_from_txq(sch, i, entry, gate_mask); if (skb) return skb; } return NULL; } /* Will not be called in the full offload case, since the TX queues are * attached to the Qdisc created using qdisc_create_dflt() */ static struct sk_buff *taprio_dequeue(struct Qdisc *sch) { struct taprio_sched *q = qdisc_priv(sch); struct sk_buff *skb = NULL; struct sched_entry *entry; u32 gate_mask; rcu_read_lock(); entry = rcu_dereference(q->current_entry); /* if there's no entry, it means that the schedule didn't * start yet, so force all gates to be open, this is in * accordance to IEEE 802.1Qbv-2015 Section 8.6.9.4.5 * "AdminGateStates" */ gate_mask = entry ? 
entry->gate_mask : TAPRIO_ALL_GATES_OPEN; if (!gate_mask) goto done; if (static_branch_unlikely(&taprio_have_broken_mqprio) && !static_branch_likely(&taprio_have_working_mqprio)) { /* Single NIC kind which is broken */ skb = taprio_dequeue_txq_priority(sch, entry, gate_mask); } else if (static_branch_likely(&taprio_have_working_mqprio) && !static_branch_unlikely(&taprio_have_broken_mqprio)) { /* Single NIC kind which prioritizes properly */ skb = taprio_dequeue_tc_priority(sch, entry, gate_mask); } else { /* Mixed NIC kinds present in system, need dynamic testing */ if (q->broken_mqprio) skb = taprio_dequeue_txq_priority(sch, entry, gate_mask); else skb = taprio_dequeue_tc_priority(sch, entry, gate_mask); } done: rcu_read_unlock(); return skb; } static bool should_restart_cycle(const struct sched_gate_list *oper, const struct sched_entry *entry) { if (list_is_last(&entry->list, &oper->entries)) return true; if (ktime_compare(entry->end_time, oper->cycle_end_time) == 0) return true; return false; } static bool should_change_schedules(const struct sched_gate_list *admin, const struct sched_gate_list *oper, ktime_t end_time) { ktime_t next_base_time, extension_time; if (!admin) return false; next_base_time = sched_base_time(admin); /* This is the simple case, the end_time would fall after * the next schedule base_time. */ if (ktime_compare(next_base_time, end_time) <= 0) return true; /* This is the cycle_time_extension case, if the end_time * plus the amount that can be extended would fall after the * next schedule base_time, we can extend the current schedule * for that amount. */ extension_time = ktime_add_ns(end_time, oper->cycle_time_extension); /* FIXME: the IEEE 802.1Q-2018 Specification isn't clear about * how precisely the extension should be made. So after * conformance testing, this logic may change. */ if (ktime_compare(next_base_time, extension_time) <= 0) return true; return false; } static enum hrtimer_restart advance_sched(struct hrtimer *timer) { struct taprio_sched *q = container_of(timer, struct taprio_sched, advance_timer); struct net_device *dev = qdisc_dev(q->root); struct sched_gate_list *oper, *admin; int num_tc = netdev_get_num_tc(dev); struct sched_entry *entry, *next; struct Qdisc *sch = q->root; ktime_t end_time; int tc; spin_lock(&q->current_entry_lock); entry = rcu_dereference_protected(q->current_entry, lockdep_is_held(&q->current_entry_lock)); oper = rcu_dereference_protected(q->oper_sched, lockdep_is_held(&q->current_entry_lock)); admin = rcu_dereference_protected(q->admin_sched, lockdep_is_held(&q->current_entry_lock)); if (!oper) switch_schedules(q, &admin, &oper); /* This can happen in two cases: 1. this is the very first run * of this function (i.e. we weren't running any schedule * previously); 2. The previous schedule just ended. The first * entry of all schedules are pre-calculated during the * schedule initialization. 
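 * In both cases, publish the first entry of the (possibly brand new) operational schedule and reuse the end_time that was pre-computed for it at schedule setup time.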
*/ if (unlikely(!entry || entry->end_time == oper->base_time)) { next = list_first_entry(&oper->entries, struct sched_entry, list); end_time = next->end_time; goto first_run; } if (should_restart_cycle(oper, entry)) { next = list_first_entry(&oper->entries, struct sched_entry, list); oper->cycle_end_time = ktime_add_ns(oper->cycle_end_time, oper->cycle_time); } else { next = list_next_entry(entry, list); } end_time = ktime_add_ns(entry->end_time, next->interval); end_time = min_t(ktime_t, end_time, oper->cycle_end_time); for (tc = 0; tc < num_tc; tc++) { if (next->gate_duration[tc] == oper->cycle_time) next->gate_close_time[tc] = KTIME_MAX; else next->gate_close_time[tc] = ktime_add_ns(entry->end_time, next->gate_duration[tc]); } if (should_change_schedules(admin, oper, end_time)) { /* Set things so the next time this runs, the new * schedule runs. */ end_time = sched_base_time(admin); switch_schedules(q, &admin, &oper); } next->end_time = end_time; taprio_set_budgets(q, oper, next); first_run: rcu_assign_pointer(q->current_entry, next); spin_unlock(&q->current_entry_lock); hrtimer_set_expires(&q->advance_timer, end_time); rcu_read_lock(); __netif_schedule(sch); rcu_read_unlock(); return HRTIMER_RESTART; } static const struct nla_policy entry_policy[TCA_TAPRIO_SCHED_ENTRY_MAX + 1] = { [TCA_TAPRIO_SCHED_ENTRY_INDEX] = { .type = NLA_U32 }, [TCA_TAPRIO_SCHED_ENTRY_CMD] = { .type = NLA_U8 }, [TCA_TAPRIO_SCHED_ENTRY_GATE_MASK] = { .type = NLA_U32 }, [TCA_TAPRIO_SCHED_ENTRY_INTERVAL] = { .type = NLA_U32 }, }; static const struct nla_policy taprio_tc_policy[TCA_TAPRIO_TC_ENTRY_MAX + 1] = { [TCA_TAPRIO_TC_ENTRY_INDEX] = NLA_POLICY_MAX(NLA_U32, TC_QOPT_MAX_QUEUE), [TCA_TAPRIO_TC_ENTRY_MAX_SDU] = { .type = NLA_U32 }, [TCA_TAPRIO_TC_ENTRY_FP] = NLA_POLICY_RANGE(NLA_U32, TC_FP_EXPRESS, TC_FP_PREEMPTIBLE), }; static const struct netlink_range_validation_signed taprio_cycle_time_range = { .min = 0, .max = INT_MAX, }; static const struct nla_policy taprio_policy[TCA_TAPRIO_ATTR_MAX + 1] = { [TCA_TAPRIO_ATTR_PRIOMAP] = { .len = sizeof(struct tc_mqprio_qopt) }, [TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST] = { .type = NLA_NESTED }, [TCA_TAPRIO_ATTR_SCHED_BASE_TIME] = { .type = NLA_S64 }, [TCA_TAPRIO_ATTR_SCHED_SINGLE_ENTRY] = { .type = NLA_NESTED }, [TCA_TAPRIO_ATTR_SCHED_CLOCKID] = { .type = NLA_S32 }, [TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME] = NLA_POLICY_FULL_RANGE_SIGNED(NLA_S64, &taprio_cycle_time_range), [TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME_EXTENSION] = { .type = NLA_S64 }, [TCA_TAPRIO_ATTR_FLAGS] = NLA_POLICY_MASK(NLA_U32, TAPRIO_SUPPORTED_FLAGS), [TCA_TAPRIO_ATTR_TXTIME_DELAY] = { .type = NLA_U32 }, [TCA_TAPRIO_ATTR_TC_ENTRY] = { .type = NLA_NESTED }, }; static int fill_sched_entry(struct taprio_sched *q, struct nlattr **tb, struct sched_entry *entry, struct netlink_ext_ack *extack) { int min_duration = length_to_duration(q, ETH_ZLEN); u32 interval = 0; if (tb[TCA_TAPRIO_SCHED_ENTRY_CMD]) entry->command = nla_get_u8( tb[TCA_TAPRIO_SCHED_ENTRY_CMD]); if (tb[TCA_TAPRIO_SCHED_ENTRY_GATE_MASK]) entry->gate_mask = nla_get_u32( tb[TCA_TAPRIO_SCHED_ENTRY_GATE_MASK]); if (tb[TCA_TAPRIO_SCHED_ENTRY_INTERVAL]) interval = nla_get_u32( tb[TCA_TAPRIO_SCHED_ENTRY_INTERVAL]); /* The interval should allow at least the minimum ethernet * frame to go out. 
*/ if (interval < min_duration) { NL_SET_ERR_MSG(extack, "Invalid interval for schedule entry"); return -EINVAL; } entry->interval = interval; return 0; } static int parse_sched_entry(struct taprio_sched *q, struct nlattr *n, struct sched_entry *entry, int index, struct netlink_ext_ack *extack) { struct nlattr *tb[TCA_TAPRIO_SCHED_ENTRY_MAX + 1] = { }; int err; err = nla_parse_nested_deprecated(tb, TCA_TAPRIO_SCHED_ENTRY_MAX, n, entry_policy, NULL); if (err < 0) { NL_SET_ERR_MSG(extack, "Could not parse nested entry"); return -EINVAL; } entry->index = index; return fill_sched_entry(q, tb, entry, extack); } static int parse_sched_list(struct taprio_sched *q, struct nlattr *list, struct sched_gate_list *sched, struct netlink_ext_ack *extack) { struct nlattr *n; int err, rem; int i = 0; if (!list) return -EINVAL; nla_for_each_nested(n, list, rem) { struct sched_entry *entry; if (nla_type(n) != TCA_TAPRIO_SCHED_ENTRY) { NL_SET_ERR_MSG(extack, "Attribute is not of type 'entry'"); continue; } entry = kzalloc(sizeof(*entry), GFP_KERNEL); if (!entry) { NL_SET_ERR_MSG(extack, "Not enough memory for entry"); return -ENOMEM; } err = parse_sched_entry(q, n, entry, i, extack); if (err < 0) { kfree(entry); return err; } list_add_tail(&entry->list, &sched->entries); i++; } sched->num_entries = i; return i; } static int parse_taprio_schedule(struct taprio_sched *q, struct nlattr **tb, struct sched_gate_list *new, struct netlink_ext_ack *extack) { int err = 0; if (tb[TCA_TAPRIO_ATTR_SCHED_SINGLE_ENTRY]) { NL_SET_ERR_MSG(extack, "Adding a single entry is not supported"); return -ENOTSUPP; } if (tb[TCA_TAPRIO_ATTR_SCHED_BASE_TIME]) new->base_time = nla_get_s64(tb[TCA_TAPRIO_ATTR_SCHED_BASE_TIME]); if (tb[TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME_EXTENSION]) new->cycle_time_extension = nla_get_s64(tb[TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME_EXTENSION]); if (tb[TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME]) new->cycle_time = nla_get_s64(tb[TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME]); if (tb[TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST]) err = parse_sched_list(q, tb[TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST], new, extack); if (err < 0) return err; if (!new->cycle_time) { struct sched_entry *entry; ktime_t cycle = 0; list_for_each_entry(entry, &new->entries, list) cycle = ktime_add_ns(cycle, entry->interval); if (cycle < 0 || cycle > INT_MAX) { NL_SET_ERR_MSG(extack, "'cycle_time' is too big"); return -EINVAL; } new->cycle_time = cycle; } if (new->cycle_time < new->num_entries * length_to_duration(q, ETH_ZLEN)) { NL_SET_ERR_MSG(extack, "'cycle_time' is too small"); return -EINVAL; } taprio_calculate_gate_durations(q, new); return 0; } static int taprio_parse_mqprio_opt(struct net_device *dev, struct tc_mqprio_qopt *qopt, struct netlink_ext_ack *extack, u32 taprio_flags) { bool allow_overlapping_txqs = TXTIME_ASSIST_IS_ENABLED(taprio_flags); if (!qopt) { if (!dev->num_tc) { NL_SET_ERR_MSG(extack, "'mqprio' configuration is necessary"); return -EINVAL; } return 0; } /* taprio imposes that traffic classes map 1:n to tx queues */ if (qopt->num_tc > dev->num_tx_queues) { NL_SET_ERR_MSG(extack, "Number of traffic classes is greater than number of HW queues"); return -EINVAL; } /* For some reason, in txtime-assist mode, we allow TXQ ranges for * different TCs to overlap, and just validate the TXQ ranges. 
*/ return mqprio_validate_qopt(dev, qopt, true, allow_overlapping_txqs, extack); } static int taprio_get_start_time(struct Qdisc *sch, struct sched_gate_list *sched, ktime_t *start) { struct taprio_sched *q = qdisc_priv(sch); ktime_t now, base, cycle; s64 n; base = sched_base_time(sched); now = taprio_get_time(q); if (ktime_after(base, now)) { *start = base; return 0; } cycle = sched->cycle_time; /* The qdisc is expected to have at least one sched_entry. Moreover, * any entry must have 'interval' > 0. Thus if the cycle time is zero, * something went really wrong. In that case, we should warn about this * inconsistent state and return error. */ if (WARN_ON(!cycle)) return -EFAULT; /* Schedule the start time for the beginning of the next * cycle. */ n = div64_s64(ktime_sub_ns(now, base), cycle); *start = ktime_add_ns(base, (n + 1) * cycle); return 0; } static void setup_first_end_time(struct taprio_sched *q, struct sched_gate_list *sched, ktime_t base) { struct net_device *dev = qdisc_dev(q->root); int num_tc = netdev_get_num_tc(dev); struct sched_entry *first; ktime_t cycle; int tc; first = list_first_entry(&sched->entries, struct sched_entry, list); cycle = sched->cycle_time; /* FIXME: find a better place to do this */ sched->cycle_end_time = ktime_add_ns(base, cycle); first->end_time = ktime_add_ns(base, first->interval); taprio_set_budgets(q, sched, first); for (tc = 0; tc < num_tc; tc++) { if (first->gate_duration[tc] == sched->cycle_time) first->gate_close_time[tc] = KTIME_MAX; else first->gate_close_time[tc] = ktime_add_ns(base, first->gate_duration[tc]); } rcu_assign_pointer(q->current_entry, NULL); } static void taprio_start_sched(struct Qdisc *sch, ktime_t start, struct sched_gate_list *new) { struct taprio_sched *q = qdisc_priv(sch); ktime_t expires; if (FULL_OFFLOAD_IS_ENABLED(q->flags)) return; expires = hrtimer_get_expires(&q->advance_timer); if (expires == 0) expires = KTIME_MAX; /* If the new schedule starts before the next expiration, we * reprogram it to the earliest one, so we change the admin * schedule to the operational one at the right time. 
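 * (Full offload returned early above and never arms the advance timer; promoting the admin schedule is then the hardware's job, see the comment above taprio_offload_config_changed() below.)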
*/ start = min_t(ktime_t, start, expires); hrtimer_start(&q->advance_timer, start, HRTIMER_MODE_ABS); } static void taprio_set_picos_per_byte(struct net_device *dev, struct taprio_sched *q) { struct ethtool_link_ksettings ecmd; int speed = SPEED_10; int picos_per_byte; int err; err = __ethtool_get_link_ksettings(dev, &ecmd); if (err < 0) goto skip; if (ecmd.base.speed && ecmd.base.speed != SPEED_UNKNOWN) speed = ecmd.base.speed; skip: picos_per_byte = (USEC_PER_SEC * 8) / speed; atomic64_set(&q->picos_per_byte, picos_per_byte); netdev_dbg(dev, "taprio: set %s's picos_per_byte to: %lld, linkspeed: %d\n", dev->name, (long long)atomic64_read(&q->picos_per_byte), ecmd.base.speed); } static int taprio_dev_notifier(struct notifier_block *nb, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct sched_gate_list *oper, *admin; struct qdisc_size_table *stab; struct taprio_sched *q; ASSERT_RTNL(); if (event != NETDEV_UP && event != NETDEV_CHANGE) return NOTIFY_DONE; list_for_each_entry(q, &taprio_list, taprio_list) { if (dev != qdisc_dev(q->root)) continue; taprio_set_picos_per_byte(dev, q); stab = rtnl_dereference(q->root->stab); oper = rtnl_dereference(q->oper_sched); if (oper) taprio_update_queue_max_sdu(q, oper, stab); admin = rtnl_dereference(q->admin_sched); if (admin) taprio_update_queue_max_sdu(q, admin, stab); break; } return NOTIFY_DONE; } static void setup_txtime(struct taprio_sched *q, struct sched_gate_list *sched, ktime_t base) { struct sched_entry *entry; u64 interval = 0; list_for_each_entry(entry, &sched->entries, list) { entry->next_txtime = ktime_add_ns(base, interval); interval += entry->interval; } } static struct tc_taprio_qopt_offload *taprio_offload_alloc(int num_entries) { struct __tc_taprio_qopt_offload *__offload; __offload = kzalloc(struct_size(__offload, offload.entries, num_entries), GFP_KERNEL); if (!__offload) return NULL; refcount_set(&__offload->users, 1); return &__offload->offload; } struct tc_taprio_qopt_offload *taprio_offload_get(struct tc_taprio_qopt_offload *offload) { struct __tc_taprio_qopt_offload *__offload; __offload = container_of(offload, struct __tc_taprio_qopt_offload, offload); refcount_inc(&__offload->users); return offload; } EXPORT_SYMBOL_GPL(taprio_offload_get); void taprio_offload_free(struct tc_taprio_qopt_offload *offload) { struct __tc_taprio_qopt_offload *__offload; __offload = container_of(offload, struct __tc_taprio_qopt_offload, offload); if (!refcount_dec_and_test(&__offload->users)) return; kfree(__offload); } EXPORT_SYMBOL_GPL(taprio_offload_free); /* The function will only serve to keep the pointers to the "oper" and "admin" * schedules valid in relation to their base times, so when calling dump() the * users looks at the right schedules. * When using full offload, the admin configuration is promoted to oper at the * base_time in the PHC time domain. But because the system time is not * necessarily in sync with that, we can't just trigger a hrtimer to call * switch_schedules at the right hardware time. * At the moment we call this by hand right away from taprio, but in the future * it will be useful to create a mechanism for drivers to notify taprio of the * offload state (PENDING, ACTIVE, INACTIVE) so it can be visible in dump(). * This is left as TODO. 
*/ static void taprio_offload_config_changed(struct taprio_sched *q) { struct sched_gate_list *oper, *admin; oper = rtnl_dereference(q->oper_sched); admin = rtnl_dereference(q->admin_sched); switch_schedules(q, &admin, &oper); } static u32 tc_map_to_queue_mask(struct net_device *dev, u32 tc_mask) { u32 i, queue_mask = 0; for (i = 0; i < dev->num_tc; i++) { u32 offset, count; if (!(tc_mask & BIT(i))) continue; offset = dev->tc_to_txq[i].offset; count = dev->tc_to_txq[i].count; queue_mask |= GENMASK(offset + count - 1, offset); } return queue_mask; } static void taprio_sched_to_offload(struct net_device *dev, struct sched_gate_list *sched, struct tc_taprio_qopt_offload *offload, const struct tc_taprio_caps *caps) { struct sched_entry *entry; int i = 0; offload->base_time = sched->base_time; offload->cycle_time = sched->cycle_time; offload->cycle_time_extension = sched->cycle_time_extension; list_for_each_entry(entry, &sched->entries, list) { struct tc_taprio_sched_entry *e = &offload->entries[i]; e->command = entry->command; e->interval = entry->interval; if (caps->gate_mask_per_txq) e->gate_mask = tc_map_to_queue_mask(dev, entry->gate_mask); else e->gate_mask = entry->gate_mask; i++; } offload->num_entries = i; } static void taprio_detect_broken_mqprio(struct taprio_sched *q) { struct net_device *dev = qdisc_dev(q->root); struct tc_taprio_caps caps; qdisc_offload_query_caps(dev, TC_SETUP_QDISC_TAPRIO, &caps, sizeof(caps)); q->broken_mqprio = caps.broken_mqprio; if (q->broken_mqprio) static_branch_inc(&taprio_have_broken_mqprio); else static_branch_inc(&taprio_have_working_mqprio); q->detected_mqprio = true; } static void taprio_cleanup_broken_mqprio(struct taprio_sched *q) { if (!q->detected_mqprio) return; if (q->broken_mqprio) static_branch_dec(&taprio_have_broken_mqprio); else static_branch_dec(&taprio_have_working_mqprio); } static int taprio_enable_offload(struct net_device *dev, struct taprio_sched *q, struct sched_gate_list *sched, struct netlink_ext_ack *extack) { const struct net_device_ops *ops = dev->netdev_ops; struct tc_taprio_qopt_offload *offload; struct tc_taprio_caps caps; int tc, err = 0; if (!ops->ndo_setup_tc) { NL_SET_ERR_MSG(extack, "Device does not support taprio offload"); return -EOPNOTSUPP; } qdisc_offload_query_caps(dev, TC_SETUP_QDISC_TAPRIO, &caps, sizeof(caps)); if (!caps.supports_queue_max_sdu) { for (tc = 0; tc < TC_MAX_QUEUE; tc++) { if (q->max_sdu[tc]) { NL_SET_ERR_MSG_MOD(extack, "Device does not handle queueMaxSDU"); return -EOPNOTSUPP; } } } offload = taprio_offload_alloc(sched->num_entries); if (!offload) { NL_SET_ERR_MSG(extack, "Not enough memory for enabling offload mode"); return -ENOMEM; } offload->cmd = TAPRIO_CMD_REPLACE; offload->extack = extack; mqprio_qopt_reconstruct(dev, &offload->mqprio.qopt); offload->mqprio.extack = extack; taprio_sched_to_offload(dev, sched, offload, &caps); mqprio_fp_to_offload(q->fp, &offload->mqprio); for (tc = 0; tc < TC_MAX_QUEUE; tc++) offload->max_sdu[tc] = q->max_sdu[tc]; err = ops->ndo_setup_tc(dev, TC_SETUP_QDISC_TAPRIO, offload); if (err < 0) { NL_SET_ERR_MSG_WEAK(extack, "Device failed to setup taprio offload"); goto done; } q->offloaded = true; done: /* The offload structure may linger around via a reference taken by the * device driver, so clear up the netlink extack pointer so that the * driver isn't tempted to dereference data which stopped being valid */ offload->extack = NULL; offload->mqprio.extack = NULL; taprio_offload_free(offload); return err; } static int taprio_disable_offload(struct net_device 
*dev, struct taprio_sched *q, struct netlink_ext_ack *extack) { const struct net_device_ops *ops = dev->netdev_ops; struct tc_taprio_qopt_offload *offload; int err; if (!q->offloaded) return 0; offload = taprio_offload_alloc(0); if (!offload) { NL_SET_ERR_MSG(extack, "Not enough memory to disable offload mode"); return -ENOMEM; } offload->cmd = TAPRIO_CMD_DESTROY; err = ops->ndo_setup_tc(dev, TC_SETUP_QDISC_TAPRIO, offload); if (err < 0) { NL_SET_ERR_MSG(extack, "Device failed to disable offload"); goto out; } q->offloaded = false; out: taprio_offload_free(offload); return err; } /* If full offload is enabled, the only possible clockid is the net device's * PHC. For that reason, specifying a clockid through netlink is incorrect. * For txtime-assist, it is implicitly assumed that the device's PHC is kept * in sync with the specified clockid via a user space daemon such as phc2sys. * For both software taprio and txtime-assist, the clockid is used for the * hrtimer that advances the schedule and hence mandatory. */ static int taprio_parse_clockid(struct Qdisc *sch, struct nlattr **tb, struct netlink_ext_ack *extack) { struct taprio_sched *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); int err = -EINVAL; if (FULL_OFFLOAD_IS_ENABLED(q->flags)) { const struct ethtool_ops *ops = dev->ethtool_ops; struct kernel_ethtool_ts_info info = { .cmd = ETHTOOL_GET_TS_INFO, .phc_index = -1, }; if (tb[TCA_TAPRIO_ATTR_SCHED_CLOCKID]) { NL_SET_ERR_MSG(extack, "The 'clockid' cannot be specified for full offload"); goto out; } if (ops && ops->get_ts_info) err = ops->get_ts_info(dev, &info); if (err || info.phc_index < 0) { NL_SET_ERR_MSG(extack, "Device does not have a PTP clock"); err = -ENOTSUPP; goto out; } } else if (tb[TCA_TAPRIO_ATTR_SCHED_CLOCKID]) { int clockid = nla_get_s32(tb[TCA_TAPRIO_ATTR_SCHED_CLOCKID]); enum tk_offsets tk_offset; /* We only support static clockids and we don't allow * for it to be modified after the first init. */ if (clockid < 0 || (q->clockid != -1 && q->clockid != clockid)) { NL_SET_ERR_MSG(extack, "Changing the 'clockid' of a running schedule is not supported"); err = -ENOTSUPP; goto out; } switch (clockid) { case CLOCK_REALTIME: tk_offset = TK_OFFS_REAL; break; case CLOCK_MONOTONIC: tk_offset = TK_OFFS_MAX; break; case CLOCK_BOOTTIME: tk_offset = TK_OFFS_BOOT; break; case CLOCK_TAI: tk_offset = TK_OFFS_TAI; break; default: NL_SET_ERR_MSG(extack, "Invalid 'clockid'"); err = -EINVAL; goto out; } /* This pairs with READ_ONCE() in taprio_mono_to_any */ WRITE_ONCE(q->tk_offset, tk_offset); q->clockid = clockid; } else { NL_SET_ERR_MSG(extack, "Specifying a 'clockid' is mandatory"); goto out; } /* Everything went ok, return success. 
*/ err = 0; out: return err; } static int taprio_parse_tc_entry(struct Qdisc *sch, struct nlattr *opt, u32 max_sdu[TC_QOPT_MAX_QUEUE], u32 fp[TC_QOPT_MAX_QUEUE], unsigned long *seen_tcs, struct netlink_ext_ack *extack) { struct nlattr *tb[TCA_TAPRIO_TC_ENTRY_MAX + 1] = { }; struct net_device *dev = qdisc_dev(sch); int err, tc; u32 val; err = nla_parse_nested(tb, TCA_TAPRIO_TC_ENTRY_MAX, opt, taprio_tc_policy, extack); if (err < 0) return err; if (!tb[TCA_TAPRIO_TC_ENTRY_INDEX]) { NL_SET_ERR_MSG_MOD(extack, "TC entry index missing"); return -EINVAL; } tc = nla_get_u32(tb[TCA_TAPRIO_TC_ENTRY_INDEX]); if (tc >= TC_QOPT_MAX_QUEUE) { NL_SET_ERR_MSG_MOD(extack, "TC entry index out of range"); return -ERANGE; } if (*seen_tcs & BIT(tc)) { NL_SET_ERR_MSG_MOD(extack, "Duplicate TC entry"); return -EINVAL; } *seen_tcs |= BIT(tc); if (tb[TCA_TAPRIO_TC_ENTRY_MAX_SDU]) { val = nla_get_u32(tb[TCA_TAPRIO_TC_ENTRY_MAX_SDU]); if (val > dev->max_mtu) { NL_SET_ERR_MSG_MOD(extack, "TC max SDU exceeds device max MTU"); return -ERANGE; } max_sdu[tc] = val; } if (tb[TCA_TAPRIO_TC_ENTRY_FP]) fp[tc] = nla_get_u32(tb[TCA_TAPRIO_TC_ENTRY_FP]); return 0; } static int taprio_parse_tc_entries(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { struct taprio_sched *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); u32 max_sdu[TC_QOPT_MAX_QUEUE]; bool have_preemption = false; unsigned long seen_tcs = 0; u32 fp[TC_QOPT_MAX_QUEUE]; struct nlattr *n; int tc, rem; int err = 0; for (tc = 0; tc < TC_QOPT_MAX_QUEUE; tc++) { max_sdu[tc] = q->max_sdu[tc]; fp[tc] = q->fp[tc]; } nla_for_each_nested_type(n, TCA_TAPRIO_ATTR_TC_ENTRY, opt, rem) { err = taprio_parse_tc_entry(sch, n, max_sdu, fp, &seen_tcs, extack); if (err) return err; } for (tc = 0; tc < TC_QOPT_MAX_QUEUE; tc++) { q->max_sdu[tc] = max_sdu[tc]; q->fp[tc] = fp[tc]; if (fp[tc] != TC_FP_EXPRESS) have_preemption = true; } if (have_preemption) { if (!FULL_OFFLOAD_IS_ENABLED(q->flags)) { NL_SET_ERR_MSG(extack, "Preemption only supported with full offload"); return -EOPNOTSUPP; } if (!ethtool_dev_mm_supported(dev)) { NL_SET_ERR_MSG(extack, "Device does not support preemption"); return -EOPNOTSUPP; } } return err; } static int taprio_mqprio_cmp(const struct net_device *dev, const struct tc_mqprio_qopt *mqprio) { int i; if (!mqprio || mqprio->num_tc != dev->num_tc) return -1; for (i = 0; i < mqprio->num_tc; i++) if (dev->tc_to_txq[i].count != mqprio->count[i] || dev->tc_to_txq[i].offset != mqprio->offset[i]) return -1; for (i = 0; i <= TC_BITMASK; i++) if (dev->prio_tc_map[i] != mqprio->prio_tc_map[i]) return -1; return 0; } static int taprio_change(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { struct qdisc_size_table *stab = rtnl_dereference(sch->stab); struct nlattr *tb[TCA_TAPRIO_ATTR_MAX + 1] = { }; struct sched_gate_list *oper, *admin, *new_admin; struct taprio_sched *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); struct tc_mqprio_qopt *mqprio = NULL; unsigned long flags; u32 taprio_flags; ktime_t start; int i, err; err = nla_parse_nested_deprecated(tb, TCA_TAPRIO_ATTR_MAX, opt, taprio_policy, extack); if (err < 0) return err; if (tb[TCA_TAPRIO_ATTR_PRIOMAP]) mqprio = nla_data(tb[TCA_TAPRIO_ATTR_PRIOMAP]); /* The semantics of the 'flags' argument in relation to 'change()' * requests, are interpreted following two rules (which are applied in * this order): (1) an omitted 'flags' argument is interpreted as * zero; (2) the 'flags' of a "running" taprio instance cannot be * changed. 
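 * Rule (1) is implemented by nla_get_u32_default(..., 0) right below; rule (2) by comparing the requested flags against q->flags, which is initialized to TAPRIO_FLAGS_INVALID in taprio_init().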
*/ taprio_flags = nla_get_u32_default(tb[TCA_TAPRIO_ATTR_FLAGS], 0); /* txtime-assist and full offload are mutually exclusive */ if ((taprio_flags & TCA_TAPRIO_ATTR_FLAG_TXTIME_ASSIST) && (taprio_flags & TCA_TAPRIO_ATTR_FLAG_FULL_OFFLOAD)) { NL_SET_ERR_MSG_ATTR(extack, tb[TCA_TAPRIO_ATTR_FLAGS], "TXTIME_ASSIST and FULL_OFFLOAD are mutually exclusive"); return -EINVAL; } if (q->flags != TAPRIO_FLAGS_INVALID && q->flags != taprio_flags) { NL_SET_ERR_MSG_MOD(extack, "Changing 'flags' of a running schedule is not supported"); return -EOPNOTSUPP; } q->flags = taprio_flags; /* Needed for length_to_duration() during netlink attribute parsing */ taprio_set_picos_per_byte(dev, q); err = taprio_parse_mqprio_opt(dev, mqprio, extack, q->flags); if (err < 0) return err; err = taprio_parse_tc_entries(sch, opt, extack); if (err) return err; new_admin = kzalloc(sizeof(*new_admin), GFP_KERNEL); if (!new_admin) { NL_SET_ERR_MSG(extack, "Not enough memory for a new schedule"); return -ENOMEM; } INIT_LIST_HEAD(&new_admin->entries); oper = rtnl_dereference(q->oper_sched); admin = rtnl_dereference(q->admin_sched); /* no changes - no new mqprio settings */ if (!taprio_mqprio_cmp(dev, mqprio)) mqprio = NULL; if (mqprio && (oper || admin)) { NL_SET_ERR_MSG(extack, "Changing the traffic mapping of a running schedule is not supported"); err = -ENOTSUPP; goto free_sched; } if (mqprio) { err = netdev_set_num_tc(dev, mqprio->num_tc); if (err) goto free_sched; for (i = 0; i < mqprio->num_tc; i++) { netdev_set_tc_queue(dev, i, mqprio->count[i], mqprio->offset[i]); q->cur_txq[i] = mqprio->offset[i]; } /* Always use supplied priority mappings */ for (i = 0; i <= TC_BITMASK; i++) netdev_set_prio_tc_map(dev, i, mqprio->prio_tc_map[i]); } err = parse_taprio_schedule(q, tb, new_admin, extack); if (err < 0) goto free_sched; if (new_admin->num_entries == 0) { NL_SET_ERR_MSG(extack, "There should be at least one entry in the schedule"); err = -EINVAL; goto free_sched; } err = taprio_parse_clockid(sch, tb, extack); if (err < 0) goto free_sched; taprio_update_queue_max_sdu(q, new_admin, stab); if (FULL_OFFLOAD_IS_ENABLED(q->flags)) err = taprio_enable_offload(dev, q, new_admin, extack); else err = taprio_disable_offload(dev, q, extack); if (err) goto free_sched; /* Protects against enqueue()/dequeue() */ spin_lock_bh(qdisc_lock(sch)); if (tb[TCA_TAPRIO_ATTR_TXTIME_DELAY]) { if (!TXTIME_ASSIST_IS_ENABLED(q->flags)) { NL_SET_ERR_MSG_MOD(extack, "txtime-delay can only be set when txtime-assist mode is enabled"); err = -EINVAL; goto unlock; } q->txtime_delay = nla_get_u32(tb[TCA_TAPRIO_ATTR_TXTIME_DELAY]); } if (!TXTIME_ASSIST_IS_ENABLED(q->flags) && !FULL_OFFLOAD_IS_ENABLED(q->flags) && !hrtimer_active(&q->advance_timer)) { hrtimer_init(&q->advance_timer, q->clockid, HRTIMER_MODE_ABS); q->advance_timer.function = advance_sched; } err = taprio_get_start_time(sch, new_admin, &start); if (err < 0) { NL_SET_ERR_MSG(extack, "Internal error: failed get start time"); goto unlock; } setup_txtime(q, new_admin, start); if (TXTIME_ASSIST_IS_ENABLED(q->flags)) { if (!oper) { rcu_assign_pointer(q->oper_sched, new_admin); err = 0; new_admin = NULL; goto unlock; } /* Not going to race against advance_sched(), but still */ admin = rcu_replace_pointer(q->admin_sched, new_admin, lockdep_rtnl_is_held()); if (admin) call_rcu(&admin->rcu, taprio_free_sched_cb); } else { setup_first_end_time(q, new_admin, start); /* Protects against advance_sched() */ spin_lock_irqsave(&q->current_entry_lock, flags); taprio_start_sched(sch, start, new_admin); admin = 
rcu_replace_pointer(q->admin_sched, new_admin, lockdep_rtnl_is_held()); if (admin) call_rcu(&admin->rcu, taprio_free_sched_cb); spin_unlock_irqrestore(&q->current_entry_lock, flags); if (FULL_OFFLOAD_IS_ENABLED(q->flags)) taprio_offload_config_changed(q); } new_admin = NULL; err = 0; if (!stab) NL_SET_ERR_MSG_MOD(extack, "Size table not specified, frame length estimations may be inaccurate"); unlock: spin_unlock_bh(qdisc_lock(sch)); free_sched: if (new_admin) call_rcu(&new_admin->rcu, taprio_free_sched_cb); return err; } static void taprio_reset(struct Qdisc *sch) { struct taprio_sched *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); int i; hrtimer_cancel(&q->advance_timer); if (q->qdiscs) { for (i = 0; i < dev->num_tx_queues; i++) if (q->qdiscs[i]) qdisc_reset(q->qdiscs[i]); } } static void taprio_destroy(struct Qdisc *sch) { struct taprio_sched *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); struct sched_gate_list *oper, *admin; unsigned int i; list_del(&q->taprio_list); /* Note that taprio_reset() might not be called if an error * happens in qdisc_create(), after taprio_init() has been called. */ hrtimer_cancel(&q->advance_timer); qdisc_synchronize(sch); taprio_disable_offload(dev, q, NULL); if (q->qdiscs) { for (i = 0; i < dev->num_tx_queues; i++) qdisc_put(q->qdiscs[i]); kfree(q->qdiscs); } q->qdiscs = NULL; netdev_reset_tc(dev); oper = rtnl_dereference(q->oper_sched); admin = rtnl_dereference(q->admin_sched); if (oper) call_rcu(&oper->rcu, taprio_free_sched_cb); if (admin) call_rcu(&admin->rcu, taprio_free_sched_cb); taprio_cleanup_broken_mqprio(q); } static int taprio_init(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { struct taprio_sched *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); int i, tc; spin_lock_init(&q->current_entry_lock); hrtimer_init(&q->advance_timer, CLOCK_TAI, HRTIMER_MODE_ABS); q->advance_timer.function = advance_sched; q->root = sch; /* We only support static clockids. Use an invalid value as default * and get the valid one on taprio_change(). 
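 * The CLOCK_TAI passed to hrtimer_init() above is only a placeholder; in the pure software case taprio_change() re-initializes the advance timer with the clockid received over netlink.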
*/ q->clockid = -1; q->flags = TAPRIO_FLAGS_INVALID; list_add(&q->taprio_list, &taprio_list); if (sch->parent != TC_H_ROOT) { NL_SET_ERR_MSG_MOD(extack, "Can only be attached as root qdisc"); return -EOPNOTSUPP; } if (!netif_is_multiqueue(dev)) { NL_SET_ERR_MSG_MOD(extack, "Multi-queue device is required"); return -EOPNOTSUPP; } q->qdiscs = kcalloc(dev->num_tx_queues, sizeof(q->qdiscs[0]), GFP_KERNEL); if (!q->qdiscs) return -ENOMEM; if (!opt) return -EINVAL; for (i = 0; i < dev->num_tx_queues; i++) { struct netdev_queue *dev_queue; struct Qdisc *qdisc; dev_queue = netdev_get_tx_queue(dev, i); qdisc = qdisc_create_dflt(dev_queue, &pfifo_qdisc_ops, TC_H_MAKE(TC_H_MAJ(sch->handle), TC_H_MIN(i + 1)), extack); if (!qdisc) return -ENOMEM; if (i < dev->real_num_tx_queues) qdisc_hash_add(qdisc, false); q->qdiscs[i] = qdisc; } for (tc = 0; tc < TC_QOPT_MAX_QUEUE; tc++) q->fp[tc] = TC_FP_EXPRESS; taprio_detect_broken_mqprio(q); return taprio_change(sch, opt, extack); } static void taprio_attach(struct Qdisc *sch) { struct taprio_sched *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); unsigned int ntx; /* Attach underlying qdisc */ for (ntx = 0; ntx < dev->num_tx_queues; ntx++) { struct netdev_queue *dev_queue = netdev_get_tx_queue(dev, ntx); struct Qdisc *old, *dev_queue_qdisc; if (FULL_OFFLOAD_IS_ENABLED(q->flags)) { struct Qdisc *qdisc = q->qdiscs[ntx]; /* In offload mode, the root taprio qdisc is bypassed * and the netdev TX queues see the children directly */ qdisc->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT; dev_queue_qdisc = qdisc; } else { /* In software mode, attach the root taprio qdisc * to all netdev TX queues, so that dev_qdisc_enqueue() * goes through taprio_enqueue(). */ dev_queue_qdisc = sch; } old = dev_graft_qdisc(dev_queue, dev_queue_qdisc); /* The qdisc's refcount requires to be elevated once * for each netdev TX queue it is grafted onto */ qdisc_refcount_inc(dev_queue_qdisc); if (old) qdisc_put(old); } } static struct netdev_queue *taprio_queue_get(struct Qdisc *sch, unsigned long cl) { struct net_device *dev = qdisc_dev(sch); unsigned long ntx = cl - 1; if (ntx >= dev->num_tx_queues) return NULL; return netdev_get_tx_queue(dev, ntx); } static int taprio_graft(struct Qdisc *sch, unsigned long cl, struct Qdisc *new, struct Qdisc **old, struct netlink_ext_ack *extack) { struct taprio_sched *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); struct netdev_queue *dev_queue = taprio_queue_get(sch, cl); if (!dev_queue) return -EINVAL; if (dev->flags & IFF_UP) dev_deactivate(dev); /* In offload mode, the child Qdisc is directly attached to the netdev * TX queue, and thus, we need to keep its refcount elevated in order * to counteract qdisc_graft()'s call to qdisc_put() once per TX queue. * However, save the reference to the new qdisc in the private array in * both software and offload cases, to have an up-to-date reference to * our children. 
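 * In software mode taprio_attach() grafted the root taprio qdisc itself onto every TX queue, so here only the q->qdiscs[] slot needs to change.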
*/ *old = q->qdiscs[cl - 1]; if (FULL_OFFLOAD_IS_ENABLED(q->flags)) { WARN_ON_ONCE(dev_graft_qdisc(dev_queue, new) != *old); if (new) qdisc_refcount_inc(new); if (*old) qdisc_put(*old); } q->qdiscs[cl - 1] = new; if (new) new->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT; if (dev->flags & IFF_UP) dev_activate(dev); return 0; } static int dump_entry(struct sk_buff *msg, const struct sched_entry *entry) { struct nlattr *item; item = nla_nest_start_noflag(msg, TCA_TAPRIO_SCHED_ENTRY); if (!item) return -ENOSPC; if (nla_put_u32(msg, TCA_TAPRIO_SCHED_ENTRY_INDEX, entry->index)) goto nla_put_failure; if (nla_put_u8(msg, TCA_TAPRIO_SCHED_ENTRY_CMD, entry->command)) goto nla_put_failure; if (nla_put_u32(msg, TCA_TAPRIO_SCHED_ENTRY_GATE_MASK, entry->gate_mask)) goto nla_put_failure; if (nla_put_u32(msg, TCA_TAPRIO_SCHED_ENTRY_INTERVAL, entry->interval)) goto nla_put_failure; return nla_nest_end(msg, item); nla_put_failure: nla_nest_cancel(msg, item); return -1; } static int dump_schedule(struct sk_buff *msg, const struct sched_gate_list *root) { struct nlattr *entry_list; struct sched_entry *entry; if (nla_put_s64(msg, TCA_TAPRIO_ATTR_SCHED_BASE_TIME, root->base_time, TCA_TAPRIO_PAD)) return -1; if (nla_put_s64(msg, TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME, root->cycle_time, TCA_TAPRIO_PAD)) return -1; if (nla_put_s64(msg, TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME_EXTENSION, root->cycle_time_extension, TCA_TAPRIO_PAD)) return -1; entry_list = nla_nest_start_noflag(msg, TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST); if (!entry_list) goto error_nest; list_for_each_entry(entry, &root->entries, list) { if (dump_entry(msg, entry) < 0) goto error_nest; } nla_nest_end(msg, entry_list); return 0; error_nest: nla_nest_cancel(msg, entry_list); return -1; } static int taprio_dump_tc_entries(struct sk_buff *skb, struct taprio_sched *q, struct sched_gate_list *sched) { struct nlattr *n; int tc; for (tc = 0; tc < TC_MAX_QUEUE; tc++) { n = nla_nest_start(skb, TCA_TAPRIO_ATTR_TC_ENTRY); if (!n) return -EMSGSIZE; if (nla_put_u32(skb, TCA_TAPRIO_TC_ENTRY_INDEX, tc)) goto nla_put_failure; if (nla_put_u32(skb, TCA_TAPRIO_TC_ENTRY_MAX_SDU, sched->max_sdu[tc])) goto nla_put_failure; if (nla_put_u32(skb, TCA_TAPRIO_TC_ENTRY_FP, q->fp[tc])) goto nla_put_failure; nla_nest_end(skb, n); } return 0; nla_put_failure: nla_nest_cancel(skb, n); return -EMSGSIZE; } static int taprio_put_stat(struct sk_buff *skb, u64 val, u16 attrtype) { if (val == TAPRIO_STAT_NOT_SET) return 0; if (nla_put_u64_64bit(skb, attrtype, val, TCA_TAPRIO_OFFLOAD_STATS_PAD)) return -EMSGSIZE; return 0; } static int taprio_dump_xstats(struct Qdisc *sch, struct gnet_dump *d, struct tc_taprio_qopt_offload *offload, struct tc_taprio_qopt_stats *stats) { struct net_device *dev = qdisc_dev(sch); const struct net_device_ops *ops; struct sk_buff *skb = d->skb; struct nlattr *xstats; int err; ops = qdisc_dev(sch)->netdev_ops; /* FIXME I could use qdisc_offload_dump_helper(), but that messes * with sch->flags depending on whether the device reports taprio * stats, and I'm not sure whether that's a good idea, considering * that stats are optional to the offload itself */ if (!ops->ndo_setup_tc) return 0; memset(stats, 0xff, sizeof(*stats)); err = ops->ndo_setup_tc(dev, TC_SETUP_QDISC_TAPRIO, offload); if (err == -EOPNOTSUPP) return 0; if (err) return err; xstats = nla_nest_start(skb, TCA_STATS_APP); if (!xstats) goto err; if (taprio_put_stat(skb, stats->window_drops, TCA_TAPRIO_OFFLOAD_STATS_WINDOW_DROPS) || taprio_put_stat(skb, stats->tx_overruns, TCA_TAPRIO_OFFLOAD_STATS_TX_OVERRUNS)) goto 
err_cancel; nla_nest_end(skb, xstats); return 0; err_cancel: nla_nest_cancel(skb, xstats); err: return -EMSGSIZE; } static int taprio_dump_stats(struct Qdisc *sch, struct gnet_dump *d) { struct tc_taprio_qopt_offload offload = { .cmd = TAPRIO_CMD_STATS, }; return taprio_dump_xstats(sch, d, &offload, &offload.stats); } static int taprio_dump(struct Qdisc *sch, struct sk_buff *skb) { struct taprio_sched *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); struct sched_gate_list *oper, *admin; struct tc_mqprio_qopt opt = { 0 }; struct nlattr *nest, *sched_nest; mqprio_qopt_reconstruct(dev, &opt); nest = nla_nest_start_noflag(skb, TCA_OPTIONS); if (!nest) goto start_error; if (nla_put(skb, TCA_TAPRIO_ATTR_PRIOMAP, sizeof(opt), &opt)) goto options_error; if (!FULL_OFFLOAD_IS_ENABLED(q->flags) && nla_put_s32(skb, TCA_TAPRIO_ATTR_SCHED_CLOCKID, q->clockid)) goto options_error; if (q->flags && nla_put_u32(skb, TCA_TAPRIO_ATTR_FLAGS, q->flags)) goto options_error; if (q->txtime_delay && nla_put_u32(skb, TCA_TAPRIO_ATTR_TXTIME_DELAY, q->txtime_delay)) goto options_error; rcu_read_lock(); oper = rtnl_dereference(q->oper_sched); admin = rtnl_dereference(q->admin_sched); if (oper && taprio_dump_tc_entries(skb, q, oper)) goto options_error_rcu; if (oper && dump_schedule(skb, oper)) goto options_error_rcu; if (!admin) goto done; sched_nest = nla_nest_start_noflag(skb, TCA_TAPRIO_ATTR_ADMIN_SCHED); if (!sched_nest) goto options_error_rcu; if (dump_schedule(skb, admin)) goto admin_error; nla_nest_end(skb, sched_nest); done: rcu_read_unlock(); return nla_nest_end(skb, nest); admin_error: nla_nest_cancel(skb, sched_nest); options_error_rcu: rcu_read_unlock(); options_error: nla_nest_cancel(skb, nest); start_error: return -ENOSPC; } static struct Qdisc *taprio_leaf(struct Qdisc *sch, unsigned long cl) { struct taprio_sched *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); unsigned int ntx = cl - 1; if (ntx >= dev->num_tx_queues) return NULL; return q->qdiscs[ntx]; } static unsigned long taprio_find(struct Qdisc *sch, u32 classid) { unsigned int ntx = TC_H_MIN(classid); if (!taprio_queue_get(sch, ntx)) return 0; return ntx; } static int taprio_dump_class(struct Qdisc *sch, unsigned long cl, struct sk_buff *skb, struct tcmsg *tcm) { struct Qdisc *child = taprio_leaf(sch, cl); tcm->tcm_parent = TC_H_ROOT; tcm->tcm_handle |= TC_H_MIN(cl); tcm->tcm_info = child->handle; return 0; } static int taprio_dump_class_stats(struct Qdisc *sch, unsigned long cl, struct gnet_dump *d) __releases(d->lock) __acquires(d->lock) { struct Qdisc *child = taprio_leaf(sch, cl); struct tc_taprio_qopt_offload offload = { .cmd = TAPRIO_CMD_QUEUE_STATS, .queue_stats = { .queue = cl - 1, }, }; if (gnet_stats_copy_basic(d, NULL, &child->bstats, true) < 0 || qdisc_qstats_copy(d, child) < 0) return -1; return taprio_dump_xstats(sch, d, &offload, &offload.queue_stats.stats); } static void taprio_walk(struct Qdisc *sch, struct qdisc_walker *arg) { struct net_device *dev = qdisc_dev(sch); unsigned long ntx; if (arg->stop) return; arg->count = arg->skip; for (ntx = arg->skip; ntx < dev->num_tx_queues; ntx++) { if (!tc_qdisc_stats_dump(sch, ntx + 1, arg)) break; } } static struct netdev_queue *taprio_select_queue(struct Qdisc *sch, struct tcmsg *tcm) { return taprio_queue_get(sch, TC_H_MIN(tcm->tcm_parent)); } static const struct Qdisc_class_ops taprio_class_ops = { .graft = taprio_graft, .leaf = taprio_leaf, .find = taprio_find, .walk = taprio_walk, .dump = taprio_dump_class, .dump_stats = taprio_dump_class_stats, 
.select_queue = taprio_select_queue, }; static struct Qdisc_ops taprio_qdisc_ops __read_mostly = { .cl_ops = &taprio_class_ops, .id = "taprio", .priv_size = sizeof(struct taprio_sched), .init = taprio_init, .change = taprio_change, .destroy = taprio_destroy, .reset = taprio_reset, .attach = taprio_attach, .peek = taprio_peek, .dequeue = taprio_dequeue, .enqueue = taprio_enqueue, .dump = taprio_dump, .dump_stats = taprio_dump_stats, .owner = THIS_MODULE, }; MODULE_ALIAS_NET_SCH("taprio"); static struct notifier_block taprio_device_notifier = { .notifier_call = taprio_dev_notifier, }; static int __init taprio_module_init(void) { int err = register_netdevice_notifier(&taprio_device_notifier); if (err) return err; return register_qdisc(&taprio_qdisc_ops); } static void __exit taprio_module_exit(void) { unregister_qdisc(&taprio_qdisc_ops); unregister_netdevice_notifier(&taprio_device_notifier); } module_init(taprio_module_init); module_exit(taprio_module_exit); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Time Aware Priority qdisc"); |
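The offload path above hands a struct tc_taprio_qopt_offload to the driver through ndo_setup_tc(TC_SETUP_QDISC_TAPRIO). What follows is a minimal, hypothetical driver-side sketch (not part of sch_taprio.c) of how such a callback might consume the commands issued by taprio_enable_offload(), taprio_disable_offload() and taprio_dump_stats(); the my_hw_*() helpers are made-up stand-ins for real device programming code and are stubbed out so the sketch is self-contained.

#include <linux/netdevice.h>
#include <net/pkt_sched.h>

/* Made-up hardware helpers (assumptions, not a real driver API). */
static int my_hw_program_entry(struct net_device *dev, int idx,
			       u32 gate_mask, u32 interval_ns)
{
	netdev_dbg(dev, "entry %d: gates %#x for %u ns\n", idx, gate_mask, interval_ns);
	return 0;
}

static int my_hw_start_schedule(struct net_device *dev, ktime_t base_time, u64 cycle_time)
{
	netdev_dbg(dev, "schedule starts at %lld, cycle %llu ns\n",
		   (long long)base_time, (unsigned long long)cycle_time);
	return 0;
}

static int my_hw_stop_schedule(struct net_device *dev)
{
	netdev_dbg(dev, "gate schedule disabled\n");
	return 0;
}

static int my_setup_taprio(struct net_device *dev, struct tc_taprio_qopt_offload *offload)
{
	int i, err;

	switch (offload->cmd) {
	case TAPRIO_CMD_REPLACE:
		/* Program every gate entry. gate_mask is per TXQ or per TC
		 * depending on the gate_mask_per_txq capability the driver
		 * reported (see taprio_sched_to_offload() above).
		 */
		for (i = 0; i < offload->num_entries; i++) {
			struct tc_taprio_sched_entry *e = &offload->entries[i];

			err = my_hw_program_entry(dev, i, e->gate_mask, e->interval);
			if (err)
				return err;
		}
		return my_hw_start_schedule(dev, offload->base_time, offload->cycle_time);
	case TAPRIO_CMD_DESTROY:
		return my_hw_stop_schedule(dev);
	default:
		/* TAPRIO_CMD_STATS / TAPRIO_CMD_QUEUE_STATS are optional. */
		return -EOPNOTSUPP;
	}
}

/* .ndo_setup_tc callback of this hypothetical driver. */
static int my_ndo_setup_tc(struct net_device *dev, enum tc_setup_type type, void *type_data)
{
	if (type != TC_SETUP_QDISC_TAPRIO)
		return -EOPNOTSUPP;
	return my_setup_taprio(dev, type_data);
}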
// SPDX-License-Identifier: GPL-2.0 /* * security/tomoyo/condition.c * * Copyright (C) 2005-2011 NTT DATA CORPORATION */ #include "common.h" #include <linux/slab.h> /* List of "struct tomoyo_condition". */ LIST_HEAD(tomoyo_condition_list); /** * tomoyo_argv - Check argv[] in "struct linux_binprm". * * @index: Index number of @arg_ptr. * @arg_ptr: Contents of argv[@index]. * @argc: Length of @argv. * @argv: Pointer to "struct tomoyo_argv". * @checked: Set to true if @argv[@index] was found. * * Returns true on success, false otherwise. */ static bool tomoyo_argv(const unsigned int index, const char *arg_ptr, const int argc, const struct tomoyo_argv *argv, u8 *checked) { int i; struct tomoyo_path_info arg; arg.name = arg_ptr; for (i = 0; i < argc; argv++, checked++, i++) { bool result; if (index != argv->index) continue; *checked = 1; tomoyo_fill_path_info(&arg); result = tomoyo_path_matches_pattern(&arg, argv->value); if (argv->is_not) result = !result; if (!result) return false; } return true; } /** * tomoyo_envp - Check envp[] in "struct linux_binprm". * * @env_name: The name of the environment variable. * @env_value: The value of the environment variable. * @envc: Length of @envp. * @envp: Pointer to "struct tomoyo_envp". * @checked: Set to true if @envp[@env_name] was found. * * Returns true on success, false otherwise. */ static bool tomoyo_envp(const char *env_name, const char *env_value, const int envc, const struct tomoyo_envp *envp, u8 *checked) { int i; struct tomoyo_path_info name; struct tomoyo_path_info value; name.name = env_name; tomoyo_fill_path_info(&name); value.name = env_value; tomoyo_fill_path_info(&value); for (i = 0; i < envc; envp++, checked++, i++) { bool result; if (!tomoyo_path_matches_pattern(&name, envp->name)) continue; *checked = 1; if (envp->value) { result = tomoyo_path_matches_pattern(&value, envp->value); if (envp->is_not) result = !result; } else { result = true; if (!envp->is_not) result = !result; } if (!result) return false; } return true; } /** * tomoyo_scan_bprm - Scan "struct linux_binprm". * * @ee: Pointer to "struct tomoyo_execve". * @argc: Length of @argv. * @argv: Pointer to "struct tomoyo_argv". * @envc: Length of @envp. * @envp: Pointer to "struct tomoyo_envp". * * Returns true on success, false otherwise. 
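 * The function walks the argv[]/envp[] strings still sitting in the bprm pages, escapes backslashes and non-printable bytes into \ooo form, and feeds every string to tomoyo_argv()/tomoyo_envp() above; entries that were never reached during the scan are checked once more at the end (see the "Check not-yet-checked entries" block).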
*/ static bool tomoyo_scan_bprm(struct tomoyo_execve *ee, const u16 argc, const struct tomoyo_argv *argv, const u16 envc, const struct tomoyo_envp *envp) { struct linux_binprm *bprm = ee->bprm; struct tomoyo_page_dump *dump = &ee->dump; char *arg_ptr = ee->tmp; int arg_len = 0; unsigned long pos = bprm->p; int offset = pos % PAGE_SIZE; int argv_count = bprm->argc; int envp_count = bprm->envc; bool result = true; u8 local_checked[32]; u8 *checked; if (argc + envc <= sizeof(local_checked)) { checked = local_checked; memset(local_checked, 0, sizeof(local_checked)); } else { checked = kzalloc(argc + envc, GFP_NOFS); if (!checked) return false; } while (argv_count || envp_count) { if (!tomoyo_dump_page(bprm, pos, dump)) { result = false; goto out; } pos += PAGE_SIZE - offset; while (offset < PAGE_SIZE) { /* Read. */ const char *kaddr = dump->data; const unsigned char c = kaddr[offset++]; if (c && arg_len < TOMOYO_EXEC_TMPSIZE - 10) { if (c == '\\') { arg_ptr[arg_len++] = '\\'; arg_ptr[arg_len++] = '\\'; } else if (c > ' ' && c < 127) { arg_ptr[arg_len++] = c; } else { arg_ptr[arg_len++] = '\\'; arg_ptr[arg_len++] = (c >> 6) + '0'; arg_ptr[arg_len++] = ((c >> 3) & 7) + '0'; arg_ptr[arg_len++] = (c & 7) + '0'; } } else { arg_ptr[arg_len] = '\0'; } if (c) continue; /* Check. */ if (argv_count) { if (!tomoyo_argv(bprm->argc - argv_count, arg_ptr, argc, argv, checked)) { result = false; break; } argv_count--; } else if (envp_count) { char *cp = strchr(arg_ptr, '='); if (cp) { *cp = '\0'; if (!tomoyo_envp(arg_ptr, cp + 1, envc, envp, checked + argc)) { result = false; break; } } envp_count--; } else { break; } arg_len = 0; } offset = 0; if (!result) break; } out: if (result) { int i; /* Check not-yet-checked entries. */ for (i = 0; i < argc; i++) { if (checked[i]) continue; /* * Return true only if all unchecked indexes in * bprm->argv[] are not matched. */ if (argv[i].is_not) continue; result = false; break; } for (i = 0; i < envc; envp++, i++) { if (checked[argc + i]) continue; /* * Return true only if all unchecked environ variables * in bprm->envp[] are either undefined or not matched. */ if ((!envp->value && !envp->is_not) || (envp->value && envp->is_not)) continue; result = false; break; } } if (checked != local_checked) kfree(checked); return result; } /** * tomoyo_scan_exec_realpath - Check "exec.realpath" parameter of "struct tomoyo_condition". * * @file: Pointer to "struct file". * @ptr: Pointer to "struct tomoyo_name_union". * @match: True if "exec.realpath=", false if "exec.realpath!=". * * Returns true on success, false otherwise. */ static bool tomoyo_scan_exec_realpath(struct file *file, const struct tomoyo_name_union *ptr, const bool match) { bool result; struct tomoyo_path_info exe; if (!file) return false; exe.name = tomoyo_realpath_from_path(&file->f_path); if (!exe.name) return false; tomoyo_fill_path_info(&exe); result = tomoyo_compare_name_union(&exe, ptr); kfree(exe.name); return result == match; } /** * tomoyo_get_dqword - tomoyo_get_name() for a quoted string. * * @start: String to save. * * Returns pointer to "struct tomoyo_path_info" on success, NULL otherwise. */ static const struct tomoyo_path_info *tomoyo_get_dqword(char *start) { char *cp = start + strlen(start) - 1; if (cp == start || *start++ != '"' || *cp != '"') return NULL; *cp = '\0'; if (*start && !tomoyo_correct_word(start)) return NULL; return tomoyo_get_name(start); } /** * tomoyo_parse_name_union_quoted - Parse a quoted word. * * @param: Pointer to "struct tomoyo_acl_param". 
* @ptr: Pointer to "struct tomoyo_name_union". * * Returns true on success, false otherwise. */ static bool tomoyo_parse_name_union_quoted(struct tomoyo_acl_param *param, struct tomoyo_name_union *ptr) { char *filename = param->data; if (*filename == '@') return tomoyo_parse_name_union(param, ptr); ptr->filename = tomoyo_get_dqword(filename); return ptr->filename != NULL; } /** * tomoyo_parse_argv - Parse an argv[] condition part. * * @left: Lefthand value. * @right: Righthand value. * @argv: Pointer to "struct tomoyo_argv". * * Returns true on success, false otherwise. */ static bool tomoyo_parse_argv(char *left, char *right, struct tomoyo_argv *argv) { if (tomoyo_parse_ulong(&argv->index, &left) != TOMOYO_VALUE_TYPE_DECIMAL || *left++ != ']' || *left) return false; argv->value = tomoyo_get_dqword(right); return argv->value != NULL; } /** * tomoyo_parse_envp - Parse an envp[] condition part. * * @left: Lefthand value. * @right: Righthand value. * @envp: Pointer to "struct tomoyo_envp". * * Returns true on success, false otherwise. */ static bool tomoyo_parse_envp(char *left, char *right, struct tomoyo_envp *envp) { const struct tomoyo_path_info *name; const struct tomoyo_path_info *value; char *cp = left + strlen(left) - 1; if (*cp-- != ']' || *cp != '"') goto out; *cp = '\0'; if (!tomoyo_correct_word(left)) goto out; name = tomoyo_get_name(left); if (!name) goto out; if (!strcmp(right, "NULL")) { value = NULL; } else { value = tomoyo_get_dqword(right); if (!value) { tomoyo_put_name(name); goto out; } } envp->name = name; envp->value = value; return true; out: return false; } /** * tomoyo_same_condition - Check for duplicated "struct tomoyo_condition" entry. * * @a: Pointer to "struct tomoyo_condition". * @b: Pointer to "struct tomoyo_condition". * * Returns true if @a == @b, false otherwise. */ static inline bool tomoyo_same_condition(const struct tomoyo_condition *a, const struct tomoyo_condition *b) { return a->size == b->size && a->condc == b->condc && a->numbers_count == b->numbers_count && a->names_count == b->names_count && a->argc == b->argc && a->envc == b->envc && a->grant_log == b->grant_log && a->transit == b->transit && !memcmp(a + 1, b + 1, a->size - sizeof(*a)); } /** * tomoyo_condition_type - Get condition type. * * @word: Keyword string. * * Returns one of values in "enum tomoyo_conditions_index" on success, * TOMOYO_MAX_CONDITION_KEYWORD otherwise. */ static u8 tomoyo_condition_type(const char *word) { u8 i; for (i = 0; i < TOMOYO_MAX_CONDITION_KEYWORD; i++) { if (!strcmp(word, tomoyo_condition_keyword[i])) break; } return i; } /* Define this to enable debug mode. */ /* #define DEBUG_CONDITION */ #ifdef DEBUG_CONDITION #define dprintk printk #else #define dprintk(...) do { } while (0) #endif /** * tomoyo_commit_condition - Commit "struct tomoyo_condition". * * @entry: Pointer to "struct tomoyo_condition". * * Returns pointer to "struct tomoyo_condition" on success, NULL otherwise. * * This function merges duplicated entries. This function returns NULL if * @entry is not duplicated but memory quota for policy has exceeded. 
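 * On a duplicate hit, the existing entry's usage count is incremented and @entry itself is deleted; the same cleanup path is taken when the policy lock cannot be taken or tomoyo_memory_ok() rejects the allocation.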
*/ static struct tomoyo_condition *tomoyo_commit_condition (struct tomoyo_condition *entry) { struct tomoyo_condition *ptr; bool found = false; if (mutex_lock_interruptible(&tomoyo_policy_lock)) { dprintk(KERN_WARNING "%u: %s failed\n", __LINE__, __func__); ptr = NULL; found = true; goto out; } list_for_each_entry(ptr, &tomoyo_condition_list, head.list) { if (!tomoyo_same_condition(ptr, entry) || atomic_read(&ptr->head.users) == TOMOYO_GC_IN_PROGRESS) continue; /* Same entry found. Share this entry. */ atomic_inc(&ptr->head.users); found = true; break; } if (!found) { if (tomoyo_memory_ok(entry)) { atomic_set(&entry->head.users, 1); list_add(&entry->head.list, &tomoyo_condition_list); } else { found = true; ptr = NULL; } } mutex_unlock(&tomoyo_policy_lock); out: if (found) { tomoyo_del_condition(&entry->head.list); kfree(entry); entry = ptr; } return entry; } /** * tomoyo_get_transit_preference - Parse domain transition preference for execve(). * * @param: Pointer to "struct tomoyo_acl_param". * @e: Pointer to "struct tomoyo_condition". * * Returns the condition string part. */ static char *tomoyo_get_transit_preference(struct tomoyo_acl_param *param, struct tomoyo_condition *e) { char * const pos = param->data; bool flag; if (*pos == '<') { e->transit = tomoyo_get_domainname(param); goto done; } { char *cp = strchr(pos, ' '); if (cp) *cp = '\0'; flag = tomoyo_correct_path(pos) || !strcmp(pos, "keep") || !strcmp(pos, "initialize") || !strcmp(pos, "reset") || !strcmp(pos, "child") || !strcmp(pos, "parent"); if (cp) *cp = ' '; } if (!flag) return pos; e->transit = tomoyo_get_name(tomoyo_read_token(param)); done: if (e->transit) return param->data; /* * Return a bad read-only condition string that will let * tomoyo_get_condition() return NULL. */ return "/"; } /** * tomoyo_get_condition - Parse condition part. * * @param: Pointer to "struct tomoyo_acl_param". * * Returns pointer to "struct tomoyo_condition" on success, NULL otherwise. */ struct tomoyo_condition *tomoyo_get_condition(struct tomoyo_acl_param *param) { struct tomoyo_condition *entry = NULL; struct tomoyo_condition_element *condp = NULL; struct tomoyo_number_union *numbers_p = NULL; struct tomoyo_name_union *names_p = NULL; struct tomoyo_argv *argv = NULL; struct tomoyo_envp *envp = NULL; struct tomoyo_condition e = { }; char * const start_of_string = tomoyo_get_transit_preference(param, &e); char * const end_of_string = start_of_string + strlen(start_of_string); char *pos; rerun: pos = start_of_string; while (1) { u8 left = -1; u8 right = -1; char *left_word = pos; char *cp; char *right_word; bool is_not; if (!*left_word) break; /* * Since left-hand condition does not allow use of "path_group" * or "number_group" and environment variable's names do not * accept '=', it is guaranteed that the original line consists * of one or more repetition of $left$operator$right blocks * where "$left is free from '=' and ' '" and "$operator is * either '=' or '!='" and "$right is free from ' '". * Therefore, we can reconstruct the original line at the end * of dry run even if we overwrite $operator with '\0'. */ cp = strchr(pos, ' '); if (cp) { *cp = '\0'; /* Will restore later. */ pos = cp + 1; } else { pos = ""; } right_word = strchr(left_word, '='); if (!right_word || right_word == left_word) goto out; is_not = *(right_word - 1) == '!'; if (is_not) *(right_word++ - 1) = '\0'; /* Will restore later. */ else if (*(right_word + 1) != '=') *right_word++ = '\0'; /* Will restore later. 
*/ else goto out; dprintk(KERN_WARNING "%u: <%s>%s=<%s>\n", __LINE__, left_word, is_not ? "!" : "", right_word); if (!strcmp(left_word, "grant_log")) { if (entry) { if (is_not || entry->grant_log != TOMOYO_GRANTLOG_AUTO) goto out; else if (!strcmp(right_word, "yes")) entry->grant_log = TOMOYO_GRANTLOG_YES; else if (!strcmp(right_word, "no")) entry->grant_log = TOMOYO_GRANTLOG_NO; else goto out; } continue; } if (!strncmp(left_word, "exec.argv[", 10)) { if (!argv) { e.argc++; e.condc++; } else { e.argc--; e.condc--; left = TOMOYO_ARGV_ENTRY; argv->is_not = is_not; if (!tomoyo_parse_argv(left_word + 10, right_word, argv++)) goto out; } goto store_value; } if (!strncmp(left_word, "exec.envp[\"", 11)) { if (!envp) { e.envc++; e.condc++; } else { e.envc--; e.condc--; left = TOMOYO_ENVP_ENTRY; envp->is_not = is_not; if (!tomoyo_parse_envp(left_word + 11, right_word, envp++)) goto out; } goto store_value; } left = tomoyo_condition_type(left_word); dprintk(KERN_WARNING "%u: <%s> left=%u\n", __LINE__, left_word, left); if (left == TOMOYO_MAX_CONDITION_KEYWORD) { if (!numbers_p) { e.numbers_count++; } else { e.numbers_count--; left = TOMOYO_NUMBER_UNION; param->data = left_word; if (*left_word == '@' || !tomoyo_parse_number_union(param, numbers_p++)) goto out; } } if (!condp) e.condc++; else e.condc--; if (left == TOMOYO_EXEC_REALPATH || left == TOMOYO_SYMLINK_TARGET) { if (!names_p) { e.names_count++; } else { e.names_count--; right = TOMOYO_NAME_UNION; param->data = right_word; if (!tomoyo_parse_name_union_quoted(param, names_p++)) goto out; } goto store_value; } right = tomoyo_condition_type(right_word); if (right == TOMOYO_MAX_CONDITION_KEYWORD) { if (!numbers_p) { e.numbers_count++; } else { e.numbers_count--; right = TOMOYO_NUMBER_UNION; param->data = right_word; if (!tomoyo_parse_number_union(param, numbers_p++)) goto out; } } store_value: if (!condp) { dprintk(KERN_WARNING "%u: dry_run left=%u right=%u match=%u\n", __LINE__, left, right, !is_not); continue; } condp->left = left; condp->right = right; condp->equals = !is_not; dprintk(KERN_WARNING "%u: left=%u right=%u match=%u\n", __LINE__, condp->left, condp->right, condp->equals); condp++; } dprintk(KERN_INFO "%u: cond=%u numbers=%u names=%u ac=%u ec=%u\n", __LINE__, e.condc, e.numbers_count, e.names_count, e.argc, e.envc); if (entry) { BUG_ON(e.names_count | e.numbers_count | e.argc | e.envc | e.condc); return tomoyo_commit_condition(entry); } e.size = sizeof(*entry) + e.condc * sizeof(struct tomoyo_condition_element) + e.numbers_count * sizeof(struct tomoyo_number_union) + e.names_count * sizeof(struct tomoyo_name_union) + e.argc * sizeof(struct tomoyo_argv) + e.envc * sizeof(struct tomoyo_envp); entry = kzalloc(e.size, GFP_NOFS); if (!entry) goto out2; *entry = e; e.transit = NULL; condp = (struct tomoyo_condition_element *) (entry + 1); numbers_p = (struct tomoyo_number_union *) (condp + e.condc); names_p = (struct tomoyo_name_union *) (numbers_p + e.numbers_count); argv = (struct tomoyo_argv *) (names_p + e.names_count); envp = (struct tomoyo_envp *) (argv + e.argc); { bool flag = false; for (pos = start_of_string; pos < end_of_string; pos++) { if (*pos) continue; if (flag) /* Restore " ". */ *pos = ' '; else if (*(pos + 1) == '=') /* Restore "!=". */ *pos = '!'; else /* Restore "=". 
*/ *pos = '='; flag = !flag; } } goto rerun; out: dprintk(KERN_WARNING "%u: %s failed\n", __LINE__, __func__); if (entry) { tomoyo_del_condition(&entry->head.list); kfree(entry); } out2: tomoyo_put_name(e.transit); return NULL; } /** * tomoyo_get_attributes - Revalidate "struct inode". * * @obj: Pointer to "struct tomoyo_obj_info". * * Returns nothing. */ void tomoyo_get_attributes(struct tomoyo_obj_info *obj) { u8 i; struct dentry *dentry = NULL; for (i = 0; i < TOMOYO_MAX_PATH_STAT; i++) { struct inode *inode; switch (i) { case TOMOYO_PATH1: dentry = obj->path1.dentry; if (!dentry) continue; break; case TOMOYO_PATH2: dentry = obj->path2.dentry; if (!dentry) continue; break; default: if (!dentry) continue; dentry = dget_parent(dentry); break; } inode = d_backing_inode(dentry); if (inode) { struct tomoyo_mini_stat *stat = &obj->stat[i]; stat->uid = inode->i_uid; stat->gid = inode->i_gid; stat->ino = inode->i_ino; stat->mode = inode->i_mode; stat->dev = inode->i_sb->s_dev; stat->rdev = inode->i_rdev; obj->stat_valid[i] = true; } if (i & 1) /* TOMOYO_PATH1_PARENT or TOMOYO_PATH2_PARENT */ dput(dentry); } } /** * tomoyo_condition - Check condition part. * * @r: Pointer to "struct tomoyo_request_info". * @cond: Pointer to "struct tomoyo_condition". Maybe NULL. * * Returns true on success, false otherwise. * * Caller holds tomoyo_read_lock(). */ bool tomoyo_condition(struct tomoyo_request_info *r, const struct tomoyo_condition *cond) { u32 i; unsigned long min_v[2] = { 0, 0 }; unsigned long max_v[2] = { 0, 0 }; const struct tomoyo_condition_element *condp; const struct tomoyo_number_union *numbers_p; const struct tomoyo_name_union *names_p; const struct tomoyo_argv *argv; const struct tomoyo_envp *envp; struct tomoyo_obj_info *obj; u16 condc; u16 argc; u16 envc; struct linux_binprm *bprm = NULL; if (!cond) return true; condc = cond->condc; argc = cond->argc; envc = cond->envc; obj = r->obj; if (r->ee) bprm = r->ee->bprm; if (!bprm && (argc || envc)) return false; condp = (struct tomoyo_condition_element *) (cond + 1); numbers_p = (const struct tomoyo_number_union *) (condp + condc); names_p = (const struct tomoyo_name_union *) (numbers_p + cond->numbers_count); argv = (const struct tomoyo_argv *) (names_p + cond->names_count); envp = (const struct tomoyo_envp *) (argv + argc); for (i = 0; i < condc; i++) { const bool match = condp->equals; const u8 left = condp->left; const u8 right = condp->right; bool is_bitop[2] = { false, false }; u8 j; condp++; /* Check argv[] and envp[] later. */ if (left == TOMOYO_ARGV_ENTRY || left == TOMOYO_ENVP_ENTRY) continue; /* Check string expressions. */ if (right == TOMOYO_NAME_UNION) { const struct tomoyo_name_union *ptr = names_p++; struct tomoyo_path_info *symlink; struct tomoyo_execve *ee; struct file *file; switch (left) { case TOMOYO_SYMLINK_TARGET: symlink = obj ? obj->symlink_target : NULL; if (!symlink || !tomoyo_compare_name_union(symlink, ptr) == match) goto out; break; case TOMOYO_EXEC_REALPATH: ee = r->ee; file = ee ? ee->bprm->file : NULL; if (!tomoyo_scan_exec_realpath(file, ptr, match)) goto out; break; } continue; } /* Check numeric or bit-op expressions. */ for (j = 0; j < 2; j++) { const u8 index = j ? 
right : left; unsigned long value = 0; switch (index) { case TOMOYO_TASK_UID: value = from_kuid(&init_user_ns, current_uid()); break; case TOMOYO_TASK_EUID: value = from_kuid(&init_user_ns, current_euid()); break; case TOMOYO_TASK_SUID: value = from_kuid(&init_user_ns, current_suid()); break; case TOMOYO_TASK_FSUID: value = from_kuid(&init_user_ns, current_fsuid()); break; case TOMOYO_TASK_GID: value = from_kgid(&init_user_ns, current_gid()); break; case TOMOYO_TASK_EGID: value = from_kgid(&init_user_ns, current_egid()); break; case TOMOYO_TASK_SGID: value = from_kgid(&init_user_ns, current_sgid()); break; case TOMOYO_TASK_FSGID: value = from_kgid(&init_user_ns, current_fsgid()); break; case TOMOYO_TASK_PID: value = tomoyo_sys_getpid(); break; case TOMOYO_TASK_PPID: value = tomoyo_sys_getppid(); break; case TOMOYO_TYPE_IS_SOCKET: value = S_IFSOCK; break; case TOMOYO_TYPE_IS_SYMLINK: value = S_IFLNK; break; case TOMOYO_TYPE_IS_FILE: value = S_IFREG; break; case TOMOYO_TYPE_IS_BLOCK_DEV: value = S_IFBLK; break; case TOMOYO_TYPE_IS_DIRECTORY: value = S_IFDIR; break; case TOMOYO_TYPE_IS_CHAR_DEV: value = S_IFCHR; break; case TOMOYO_TYPE_IS_FIFO: value = S_IFIFO; break; case TOMOYO_MODE_SETUID: value = S_ISUID; break; case TOMOYO_MODE_SETGID: value = S_ISGID; break; case TOMOYO_MODE_STICKY: value = S_ISVTX; break; case TOMOYO_MODE_OWNER_READ: value = 0400; break; case TOMOYO_MODE_OWNER_WRITE: value = 0200; break; case TOMOYO_MODE_OWNER_EXECUTE: value = 0100; break; case TOMOYO_MODE_GROUP_READ: value = 0040; break; case TOMOYO_MODE_GROUP_WRITE: value = 0020; break; case TOMOYO_MODE_GROUP_EXECUTE: value = 0010; break; case TOMOYO_MODE_OTHERS_READ: value = 0004; break; case TOMOYO_MODE_OTHERS_WRITE: value = 0002; break; case TOMOYO_MODE_OTHERS_EXECUTE: value = 0001; break; case TOMOYO_EXEC_ARGC: if (!bprm) goto out; value = bprm->argc; break; case TOMOYO_EXEC_ENVC: if (!bprm) goto out; value = bprm->envc; break; case TOMOYO_NUMBER_UNION: /* Fetch values later. 
*/ break; default: if (!obj) goto out; if (!obj->validate_done) { tomoyo_get_attributes(obj); obj->validate_done = true; } { u8 stat_index; struct tomoyo_mini_stat *stat; switch (index) { case TOMOYO_PATH1_UID: case TOMOYO_PATH1_GID: case TOMOYO_PATH1_INO: case TOMOYO_PATH1_MAJOR: case TOMOYO_PATH1_MINOR: case TOMOYO_PATH1_TYPE: case TOMOYO_PATH1_DEV_MAJOR: case TOMOYO_PATH1_DEV_MINOR: case TOMOYO_PATH1_PERM: stat_index = TOMOYO_PATH1; break; case TOMOYO_PATH2_UID: case TOMOYO_PATH2_GID: case TOMOYO_PATH2_INO: case TOMOYO_PATH2_MAJOR: case TOMOYO_PATH2_MINOR: case TOMOYO_PATH2_TYPE: case TOMOYO_PATH2_DEV_MAJOR: case TOMOYO_PATH2_DEV_MINOR: case TOMOYO_PATH2_PERM: stat_index = TOMOYO_PATH2; break; case TOMOYO_PATH1_PARENT_UID: case TOMOYO_PATH1_PARENT_GID: case TOMOYO_PATH1_PARENT_INO: case TOMOYO_PATH1_PARENT_PERM: stat_index = TOMOYO_PATH1_PARENT; break; case TOMOYO_PATH2_PARENT_UID: case TOMOYO_PATH2_PARENT_GID: case TOMOYO_PATH2_PARENT_INO: case TOMOYO_PATH2_PARENT_PERM: stat_index = TOMOYO_PATH2_PARENT; break; default: goto out; } if (!obj->stat_valid[stat_index]) goto out; stat = &obj->stat[stat_index]; switch (index) { case TOMOYO_PATH1_UID: case TOMOYO_PATH2_UID: case TOMOYO_PATH1_PARENT_UID: case TOMOYO_PATH2_PARENT_UID: value = from_kuid(&init_user_ns, stat->uid); break; case TOMOYO_PATH1_GID: case TOMOYO_PATH2_GID: case TOMOYO_PATH1_PARENT_GID: case TOMOYO_PATH2_PARENT_GID: value = from_kgid(&init_user_ns, stat->gid); break; case TOMOYO_PATH1_INO: case TOMOYO_PATH2_INO: case TOMOYO_PATH1_PARENT_INO: case TOMOYO_PATH2_PARENT_INO: value = stat->ino; break; case TOMOYO_PATH1_MAJOR: case TOMOYO_PATH2_MAJOR: value = MAJOR(stat->dev); break; case TOMOYO_PATH1_MINOR: case TOMOYO_PATH2_MINOR: value = MINOR(stat->dev); break; case TOMOYO_PATH1_TYPE: case TOMOYO_PATH2_TYPE: value = stat->mode & S_IFMT; break; case TOMOYO_PATH1_DEV_MAJOR: case TOMOYO_PATH2_DEV_MAJOR: value = MAJOR(stat->rdev); break; case TOMOYO_PATH1_DEV_MINOR: case TOMOYO_PATH2_DEV_MINOR: value = MINOR(stat->rdev); break; case TOMOYO_PATH1_PERM: case TOMOYO_PATH2_PERM: case TOMOYO_PATH1_PARENT_PERM: case TOMOYO_PATH2_PARENT_PERM: value = stat->mode & S_IALLUGO; break; } } break; } max_v[j] = value; min_v[j] = value; switch (index) { case TOMOYO_MODE_SETUID: case TOMOYO_MODE_SETGID: case TOMOYO_MODE_STICKY: case TOMOYO_MODE_OWNER_READ: case TOMOYO_MODE_OWNER_WRITE: case TOMOYO_MODE_OWNER_EXECUTE: case TOMOYO_MODE_GROUP_READ: case TOMOYO_MODE_GROUP_WRITE: case TOMOYO_MODE_GROUP_EXECUTE: case TOMOYO_MODE_OTHERS_READ: case TOMOYO_MODE_OTHERS_WRITE: case TOMOYO_MODE_OTHERS_EXECUTE: is_bitop[j] = true; } } if (left == TOMOYO_NUMBER_UNION) { /* Fetch values now. */ const struct tomoyo_number_union *ptr = numbers_p++; min_v[0] = ptr->values[0]; max_v[0] = ptr->values[1]; } if (right == TOMOYO_NUMBER_UNION) { /* Fetch values now. */ const struct tomoyo_number_union *ptr = numbers_p++; if (ptr->group) { if (tomoyo_number_matches_group(min_v[0], max_v[0], ptr->group) == match) continue; } else { if ((min_v[0] <= ptr->values[1] && max_v[0] >= ptr->values[0]) == match) continue; } goto out; } /* * Bit operation is valid only when counterpart value * represents permission. 
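 *
 * For example, when one side of the expression is TOMOYO_MODE_OWNER_EXECUTE
 * (value 0100) and the other is TOMOYO_PATH1_PERM (stat->mode & S_IALLUGO),
 * the test below reduces to checking whether the owner-execute bit is set
 * in the file's mode, i.e. whether (max_v[0] & max_v[1]) is non-zero.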
*/ if (is_bitop[0] && is_bitop[1]) { goto out; } else if (is_bitop[0]) { switch (right) { case TOMOYO_PATH1_PERM: case TOMOYO_PATH1_PARENT_PERM: case TOMOYO_PATH2_PERM: case TOMOYO_PATH2_PARENT_PERM: if (!(max_v[0] & max_v[1]) == !match) continue; } goto out; } else if (is_bitop[1]) { switch (left) { case TOMOYO_PATH1_PERM: case TOMOYO_PATH1_PARENT_PERM: case TOMOYO_PATH2_PERM: case TOMOYO_PATH2_PARENT_PERM: if (!(max_v[0] & max_v[1]) == !match) continue; } goto out; } /* Normal value range comparison. */ if ((min_v[0] <= max_v[1] && max_v[0] >= min_v[1]) == match) continue; out: return false; } /* Check argv[] and envp[] now. */ if (r->ee && (argc || envc)) return tomoyo_scan_bprm(r->ee, argc, argv, envc, envp); return true; } |
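/*
 * Illustrative sketch, not part of the TOMOYO sources above: the numeric
 * comparison in tomoyo_condition() treats both sides as [min, max] ranges
 * and the "=" form succeeds when the two ranges overlap (e.g. a parsed
 * range value such as "100-200" matches a task uid of 150).  The helper
 * below uses an assumed name and exists only to restate that test.
 */
static inline bool example_tomoyo_ranges_overlap(unsigned long min0,
                                                 unsigned long max0,
                                                 unsigned long min1,
                                                 unsigned long max1)
{
        /* Same shape as: (min_v[0] <= max_v[1] && max_v[0] >= min_v[1]) */
        return min0 <= max1 && max0 >= min1;
}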
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __BEN_VLAN_802_1Q_INC__
#define __BEN_VLAN_802_1Q_INC__

#include <linux/if_vlan.h>
#include <linux/u64_stats_sync.h>
#include <linux/list.h>

/* if this changes, algorithm will have to be reworked because this
 * depends on completely exhausting the VLAN identifier space.  Thus
 * it gives constant time look-up, but in many cases it wastes memory.
 */
#define VLAN_GROUP_ARRAY_SPLIT_PARTS 8
#define VLAN_GROUP_ARRAY_PART_LEN (VLAN_N_VID/VLAN_GROUP_ARRAY_SPLIT_PARTS)

enum vlan_protos {
        VLAN_PROTO_8021Q = 0,
        VLAN_PROTO_8021AD,
        VLAN_PROTO_NUM,
};

struct vlan_group {
        unsigned int nr_vlan_devs;
        struct hlist_node hlist;        /* linked list */
        struct net_device **vlan_devices_arrays[VLAN_PROTO_NUM]
                                               [VLAN_GROUP_ARRAY_SPLIT_PARTS];
};

struct vlan_info {
        struct net_device *real_dev;    /* The ethernet(like) device
                                         * the vlan is attached to.
                                         */
        struct vlan_group grp;
        struct list_head vid_list;
        unsigned int nr_vids;
        struct rcu_head rcu;
};

static inline int vlan_proto_idx(__be16 proto)
{
        switch (proto) {
        case htons(ETH_P_8021Q):
                return VLAN_PROTO_8021Q;
        case htons(ETH_P_8021AD):
                return VLAN_PROTO_8021AD;
        default:
                WARN(1, "invalid VLAN protocol: 0x%04x\n", ntohs(proto));
                return -EINVAL;
        }
}

static inline struct net_device *__vlan_group_get_device(struct vlan_group *vg,
                                                         unsigned int pidx,
                                                         u16 vlan_id)
{
        struct net_device **array;

        array = vg->vlan_devices_arrays[pidx]
                                       [vlan_id / VLAN_GROUP_ARRAY_PART_LEN];

        /* paired with smp_wmb() in vlan_group_prealloc_vid() */
        smp_rmb();

        return array ? array[vlan_id % VLAN_GROUP_ARRAY_PART_LEN] : NULL;
}

static inline struct net_device *vlan_group_get_device(struct vlan_group *vg,
                                                       __be16 vlan_proto,
                                                       u16 vlan_id)
{
        int pidx = vlan_proto_idx(vlan_proto);

        if (pidx < 0)
                return NULL;

        return __vlan_group_get_device(vg, pidx, vlan_id);
}

static inline void vlan_group_set_device(struct vlan_group *vg,
                                         __be16 vlan_proto, u16 vlan_id,
                                         struct net_device *dev)
{
        int pidx = vlan_proto_idx(vlan_proto);
        struct net_device **array;

        if (!vg || pidx < 0)
                return;

        array = vg->vlan_devices_arrays[pidx]
                                       [vlan_id / VLAN_GROUP_ARRAY_PART_LEN];
        array[vlan_id % VLAN_GROUP_ARRAY_PART_LEN] = dev;
}

/* Must be invoked with rcu_read_lock or with RTNL.
*/ static inline struct net_device *vlan_find_dev(struct net_device *real_dev, __be16 vlan_proto, u16 vlan_id) { struct vlan_info *vlan_info = rcu_dereference_rtnl(real_dev->vlan_info); if (vlan_info) return vlan_group_get_device(&vlan_info->grp, vlan_proto, vlan_id); return NULL; } static inline netdev_features_t vlan_tnl_features(struct net_device *real_dev) { netdev_features_t ret; ret = real_dev->hw_enc_features & (NETIF_F_CSUM_MASK | NETIF_F_GSO_SOFTWARE | NETIF_F_GSO_ENCAP_ALL); if ((ret & NETIF_F_GSO_ENCAP_ALL) && (ret & NETIF_F_CSUM_MASK)) return (ret & ~NETIF_F_CSUM_MASK) | NETIF_F_HW_CSUM; return 0; } #define vlan_group_for_each_dev(grp, i, dev) \ for ((i) = 0; i < VLAN_PROTO_NUM * VLAN_N_VID; i++) \ if (((dev) = __vlan_group_get_device((grp), (i) / VLAN_N_VID, \ (i) % VLAN_N_VID))) int vlan_filter_push_vids(struct vlan_info *vlan_info, __be16 proto); void vlan_filter_drop_vids(struct vlan_info *vlan_info, __be16 proto); /* found in vlan_dev.c */ void vlan_dev_set_ingress_priority(const struct net_device *dev, u32 skb_prio, u16 vlan_prio); int vlan_dev_set_egress_priority(const struct net_device *dev, u32 skb_prio, u16 vlan_prio); void vlan_dev_free_egress_priority(const struct net_device *dev); int vlan_dev_change_flags(const struct net_device *dev, u32 flag, u32 mask); void vlan_dev_get_realdev_name(const struct net_device *dev, char *result, size_t size); int vlan_check_real_dev(struct net_device *real_dev, __be16 protocol, u16 vlan_id, struct netlink_ext_ack *extack); void vlan_setup(struct net_device *dev); int register_vlan_dev(struct net_device *dev, struct netlink_ext_ack *extack); void unregister_vlan_dev(struct net_device *dev, struct list_head *head); bool vlan_dev_inherit_address(struct net_device *dev, struct net_device *real_dev); static inline u32 vlan_get_ingress_priority(struct net_device *dev, u16 vlan_tci) { struct vlan_dev_priv *vip = vlan_dev_priv(dev); return vip->ingress_priority_map[(vlan_tci >> VLAN_PRIO_SHIFT) & 0x7]; } #ifdef CONFIG_VLAN_8021Q_GVRP int vlan_gvrp_request_join(const struct net_device *dev); void vlan_gvrp_request_leave(const struct net_device *dev); int vlan_gvrp_init_applicant(struct net_device *dev); void vlan_gvrp_uninit_applicant(struct net_device *dev); int vlan_gvrp_init(void); void vlan_gvrp_uninit(void); #else static inline int vlan_gvrp_request_join(const struct net_device *dev) { return 0; } static inline void vlan_gvrp_request_leave(const struct net_device *dev) {} static inline int vlan_gvrp_init_applicant(struct net_device *dev) { return 0; } static inline void vlan_gvrp_uninit_applicant(struct net_device *dev) {} static inline int vlan_gvrp_init(void) { return 0; } static inline void vlan_gvrp_uninit(void) {} #endif #ifdef CONFIG_VLAN_8021Q_MVRP int vlan_mvrp_request_join(const struct net_device *dev); void vlan_mvrp_request_leave(const struct net_device *dev); int vlan_mvrp_init_applicant(struct net_device *dev); void vlan_mvrp_uninit_applicant(struct net_device *dev); int vlan_mvrp_init(void); void vlan_mvrp_uninit(void); #else static inline int vlan_mvrp_request_join(const struct net_device *dev) { return 0; } static inline void vlan_mvrp_request_leave(const struct net_device *dev) {} static inline int vlan_mvrp_init_applicant(struct net_device *dev) { return 0; } static inline void vlan_mvrp_uninit_applicant(struct net_device *dev) {} static inline int vlan_mvrp_init(void) { return 0; } static inline void vlan_mvrp_uninit(void) {} #endif extern const char vlan_fullname[]; extern const char vlan_version[]; int 
vlan_netlink_init(void); void vlan_netlink_fini(void); extern struct rtnl_link_ops vlan_link_ops; extern unsigned int vlan_net_id; struct proc_dir_entry; struct vlan_net { /* /proc/net/vlan */ struct proc_dir_entry *proc_vlan_dir; /* /proc/net/vlan/config */ struct proc_dir_entry *proc_vlan_conf; /* Determines interface naming scheme. */ unsigned short name_type; }; #endif /* !(__BEN_VLAN_802_1Q_INC__) */ |
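/*
 * Illustrative sketch, not part of vlan.h above: the per-protocol device
 * table is split into VLAN_GROUP_ARRAY_SPLIT_PARTS chunks, so locating a
 * VLAN device is two index computations plus a NULL check.  The helper
 * below uses an assumed name and merely spells out the arithmetic used by
 * __vlan_group_get_device(); the real helper additionally issues an
 * smp_rmb() paired with the smp_wmb() in vlan_group_prealloc_vid().
 */
static inline struct net_device *example_vlan_slot(struct vlan_group *vg,
                                                   unsigned int pidx,
                                                   u16 vlan_id)
{
        unsigned int part = vlan_id / VLAN_GROUP_ARRAY_PART_LEN; /* which chunk */
        unsigned int slot = vlan_id % VLAN_GROUP_ARRAY_PART_LEN; /* offset in chunk */
        struct net_device **array = vg->vlan_devices_arrays[pidx][part];

        return array ? array[slot] : NULL;
}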
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef LINUX_MM_INLINE_H
#define LINUX_MM_INLINE_H

#include <linux/atomic.h>
#include <linux/huge_mm.h>
#include <linux/mm_types.h>
#include <linux/swap.h>
#include <linux/string.h>
#include <linux/userfaultfd_k.h>
#include <linux/swapops.h>

/**
 * folio_is_file_lru - Should the folio be on a file LRU or anon LRU?
 * @folio: The folio to test.
 *
 * We would like to get this info without a page flag, but the state
 * needs to survive until the folio is last deleted from the LRU, which
 * could be as far down as __page_cache_release.
 *
 * Return: An integer (not a boolean!) used to sort a folio onto the
 * right LRU list and to account folios correctly.
 * 1 if @folio is a regular filesystem backed page cache folio
 * or a lazily freed anonymous folio (e.g. via MADV_FREE).
 * 0 if @folio is a normal anonymous folio, a tmpfs folio or otherwise
 * ram or swap backed folio.
*/ static inline int folio_is_file_lru(struct folio *folio) { return !folio_test_swapbacked(folio); } static inline int page_is_file_lru(struct page *page) { return folio_is_file_lru(page_folio(page)); } static __always_inline void __update_lru_size(struct lruvec *lruvec, enum lru_list lru, enum zone_type zid, long nr_pages) { struct pglist_data *pgdat = lruvec_pgdat(lruvec); lockdep_assert_held(&lruvec->lru_lock); WARN_ON_ONCE(nr_pages != (int)nr_pages); __mod_lruvec_state(lruvec, NR_LRU_BASE + lru, nr_pages); __mod_zone_page_state(&pgdat->node_zones[zid], NR_ZONE_LRU_BASE + lru, nr_pages); } static __always_inline void update_lru_size(struct lruvec *lruvec, enum lru_list lru, enum zone_type zid, long nr_pages) { __update_lru_size(lruvec, lru, zid, nr_pages); #ifdef CONFIG_MEMCG mem_cgroup_update_lru_size(lruvec, lru, zid, nr_pages); #endif } /** * __folio_clear_lru_flags - Clear page lru flags before releasing a page. * @folio: The folio that was on lru and now has a zero reference. */ static __always_inline void __folio_clear_lru_flags(struct folio *folio) { VM_BUG_ON_FOLIO(!folio_test_lru(folio), folio); __folio_clear_lru(folio); /* this shouldn't happen, so leave the flags to bad_page() */ if (folio_test_active(folio) && folio_test_unevictable(folio)) return; __folio_clear_active(folio); __folio_clear_unevictable(folio); } /** * folio_lru_list - Which LRU list should a folio be on? * @folio: The folio to test. * * Return: The LRU list a folio should be on, as an index * into the array of LRU lists. */ static __always_inline enum lru_list folio_lru_list(struct folio *folio) { enum lru_list lru; VM_BUG_ON_FOLIO(folio_test_active(folio) && folio_test_unevictable(folio), folio); if (folio_test_unevictable(folio)) return LRU_UNEVICTABLE; lru = folio_is_file_lru(folio) ? LRU_INACTIVE_FILE : LRU_INACTIVE_ANON; if (folio_test_active(folio)) lru += LRU_ACTIVE; return lru; } #ifdef CONFIG_LRU_GEN #ifdef CONFIG_LRU_GEN_ENABLED static inline bool lru_gen_enabled(void) { DECLARE_STATIC_KEY_TRUE(lru_gen_caps[NR_LRU_GEN_CAPS]); return static_branch_likely(&lru_gen_caps[LRU_GEN_CORE]); } #else static inline bool lru_gen_enabled(void) { DECLARE_STATIC_KEY_FALSE(lru_gen_caps[NR_LRU_GEN_CAPS]); return static_branch_unlikely(&lru_gen_caps[LRU_GEN_CORE]); } #endif static inline bool lru_gen_in_fault(void) { return current->in_lru_fault; } static inline int lru_gen_from_seq(unsigned long seq) { return seq % MAX_NR_GENS; } static inline int lru_hist_from_seq(unsigned long seq) { return seq % NR_HIST_GENS; } static inline int lru_tier_from_refs(int refs, bool workingset) { VM_WARN_ON_ONCE(refs > BIT(LRU_REFS_WIDTH)); /* see the comment on MAX_NR_TIERS */ return workingset ? MAX_NR_TIERS - 1 : order_base_2(refs); } static inline int folio_lru_refs(struct folio *folio) { unsigned long flags = READ_ONCE(folio->flags); if (!(flags & BIT(PG_referenced))) return 0; /* * Return the total number of accesses including PG_referenced. Also see * the comment on LRU_REFS_FLAGS. 
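 *
 * For example, a folio with PG_referenced set and a value of 2 stored in
 * the LRU_REFS_MASK bits reports 3 accesses; without PG_referenced the
 * function returns 0 regardless of those bits.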
*/ return ((flags & LRU_REFS_MASK) >> LRU_REFS_PGOFF) + 1; } static inline int folio_lru_gen(struct folio *folio) { unsigned long flags = READ_ONCE(folio->flags); return ((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; } static inline bool lru_gen_is_active(struct lruvec *lruvec, int gen) { unsigned long max_seq = lruvec->lrugen.max_seq; VM_WARN_ON_ONCE(gen >= MAX_NR_GENS); /* see the comment on MIN_NR_GENS */ return gen == lru_gen_from_seq(max_seq) || gen == lru_gen_from_seq(max_seq - 1); } static inline void lru_gen_update_size(struct lruvec *lruvec, struct folio *folio, int old_gen, int new_gen) { int type = folio_is_file_lru(folio); int zone = folio_zonenum(folio); int delta = folio_nr_pages(folio); enum lru_list lru = type * LRU_INACTIVE_FILE; struct lru_gen_folio *lrugen = &lruvec->lrugen; VM_WARN_ON_ONCE(old_gen != -1 && old_gen >= MAX_NR_GENS); VM_WARN_ON_ONCE(new_gen != -1 && new_gen >= MAX_NR_GENS); VM_WARN_ON_ONCE(old_gen == -1 && new_gen == -1); if (old_gen >= 0) WRITE_ONCE(lrugen->nr_pages[old_gen][type][zone], lrugen->nr_pages[old_gen][type][zone] - delta); if (new_gen >= 0) WRITE_ONCE(lrugen->nr_pages[new_gen][type][zone], lrugen->nr_pages[new_gen][type][zone] + delta); /* addition */ if (old_gen < 0) { if (lru_gen_is_active(lruvec, new_gen)) lru += LRU_ACTIVE; __update_lru_size(lruvec, lru, zone, delta); return; } /* deletion */ if (new_gen < 0) { if (lru_gen_is_active(lruvec, old_gen)) lru += LRU_ACTIVE; __update_lru_size(lruvec, lru, zone, -delta); return; } /* promotion */ if (!lru_gen_is_active(lruvec, old_gen) && lru_gen_is_active(lruvec, new_gen)) { __update_lru_size(lruvec, lru, zone, -delta); __update_lru_size(lruvec, lru + LRU_ACTIVE, zone, delta); } /* demotion requires isolation, e.g., lru_deactivate_fn() */ VM_WARN_ON_ONCE(lru_gen_is_active(lruvec, old_gen) && !lru_gen_is_active(lruvec, new_gen)); } static inline unsigned long lru_gen_folio_seq(struct lruvec *lruvec, struct folio *folio, bool reclaiming) { int gen; int type = folio_is_file_lru(folio); struct lru_gen_folio *lrugen = &lruvec->lrugen; /* * +-----------------------------------+-----------------------------------+ * | Accessed through page tables and | Accessed through file descriptors | * | promoted by folio_update_gen() | and protected by folio_inc_gen() | * +-----------------------------------+-----------------------------------+ * | PG_active (set while isolated) | | * +-----------------+-----------------+-----------------+-----------------+ * | PG_workingset | PG_referenced | PG_workingset | LRU_REFS_FLAGS | * +-----------------------------------+-----------------------------------+ * |<---------- MIN_NR_GENS ---------->| | * |<---------------------------- MAX_NR_GENS ---------------------------->| */ if (folio_test_active(folio)) gen = MIN_NR_GENS - folio_test_workingset(folio); else if (reclaiming) gen = MAX_NR_GENS; else if ((!folio_is_file_lru(folio) && !folio_test_swapcache(folio)) || (folio_test_reclaim(folio) && (folio_test_dirty(folio) || folio_test_writeback(folio)))) gen = MIN_NR_GENS; else gen = MAX_NR_GENS - folio_test_workingset(folio); return max(READ_ONCE(lrugen->max_seq) - gen + 1, READ_ONCE(lrugen->min_seq[type])); } static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming) { unsigned long seq; unsigned long flags; int gen = folio_lru_gen(folio); int type = folio_is_file_lru(folio); int zone = folio_zonenum(folio); struct lru_gen_folio *lrugen = &lruvec->lrugen; VM_WARN_ON_ONCE_FOLIO(gen != -1, folio); if (folio_test_unevictable(folio) || 
!lrugen->enabled) return false; seq = lru_gen_folio_seq(lruvec, folio, reclaiming); gen = lru_gen_from_seq(seq); flags = (gen + 1UL) << LRU_GEN_PGOFF; /* see the comment on MIN_NR_GENS about PG_active */ set_mask_bits(&folio->flags, LRU_GEN_MASK | BIT(PG_active), flags); lru_gen_update_size(lruvec, folio, -1, gen); /* for folio_rotate_reclaimable() */ if (reclaiming) list_add_tail(&folio->lru, &lrugen->folios[gen][type][zone]); else list_add(&folio->lru, &lrugen->folios[gen][type][zone]); return true; } static inline bool lru_gen_del_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming) { unsigned long flags; int gen = folio_lru_gen(folio); if (gen < 0) return false; VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio), folio); VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio); /* for folio_migrate_flags() */ flags = !reclaiming && lru_gen_is_active(lruvec, gen) ? BIT(PG_active) : 0; flags = set_mask_bits(&folio->flags, LRU_GEN_MASK, flags); gen = ((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; lru_gen_update_size(lruvec, folio, gen, -1); list_del(&folio->lru); return true; } static inline void folio_migrate_refs(struct folio *new, struct folio *old) { unsigned long refs = READ_ONCE(old->flags) & LRU_REFS_MASK; set_mask_bits(&new->flags, LRU_REFS_MASK, refs); } #else /* !CONFIG_LRU_GEN */ static inline bool lru_gen_enabled(void) { return false; } static inline bool lru_gen_in_fault(void) { return false; } static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming) { return false; } static inline bool lru_gen_del_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming) { return false; } static inline void folio_migrate_refs(struct folio *new, struct folio *old) { } #endif /* CONFIG_LRU_GEN */ static __always_inline void lruvec_add_folio(struct lruvec *lruvec, struct folio *folio) { enum lru_list lru = folio_lru_list(folio); if (lru_gen_add_folio(lruvec, folio, false)) return; update_lru_size(lruvec, lru, folio_zonenum(folio), folio_nr_pages(folio)); if (lru != LRU_UNEVICTABLE) list_add(&folio->lru, &lruvec->lists[lru]); } static __always_inline void lruvec_add_folio_tail(struct lruvec *lruvec, struct folio *folio) { enum lru_list lru = folio_lru_list(folio); if (lru_gen_add_folio(lruvec, folio, true)) return; update_lru_size(lruvec, lru, folio_zonenum(folio), folio_nr_pages(folio)); /* This is not expected to be used on LRU_UNEVICTABLE */ list_add_tail(&folio->lru, &lruvec->lists[lru]); } static __always_inline void lruvec_del_folio(struct lruvec *lruvec, struct folio *folio) { enum lru_list lru = folio_lru_list(folio); if (lru_gen_del_folio(lruvec, folio, false)) return; if (lru != LRU_UNEVICTABLE) list_del(&folio->lru); update_lru_size(lruvec, lru, folio_zonenum(folio), -folio_nr_pages(folio)); } #ifdef CONFIG_ANON_VMA_NAME /* mmap_lock should be read-locked */ static inline void anon_vma_name_get(struct anon_vma_name *anon_name) { if (anon_name) kref_get(&anon_name->kref); } static inline void anon_vma_name_put(struct anon_vma_name *anon_name) { if (anon_name) kref_put(&anon_name->kref, anon_vma_name_free); } static inline struct anon_vma_name *anon_vma_name_reuse(struct anon_vma_name *anon_name) { /* Prevent anon_name refcount saturation early on */ if (kref_read(&anon_name->kref) < REFCOUNT_MAX) { anon_vma_name_get(anon_name); return anon_name; } return anon_vma_name_alloc(anon_name->name); } static inline void dup_anon_vma_name(struct vm_area_struct *orig_vma, struct vm_area_struct *new_vma) { struct anon_vma_name 
*anon_name = anon_vma_name(orig_vma); if (anon_name) new_vma->anon_name = anon_vma_name_reuse(anon_name); } static inline void free_anon_vma_name(struct vm_area_struct *vma) { /* * Not using anon_vma_name because it generates a warning if mmap_lock * is not held, which might be the case here. */ anon_vma_name_put(vma->anon_name); } static inline bool anon_vma_name_eq(struct anon_vma_name *anon_name1, struct anon_vma_name *anon_name2) { if (anon_name1 == anon_name2) return true; return anon_name1 && anon_name2 && !strcmp(anon_name1->name, anon_name2->name); } #else /* CONFIG_ANON_VMA_NAME */ static inline void anon_vma_name_get(struct anon_vma_name *anon_name) {} static inline void anon_vma_name_put(struct anon_vma_name *anon_name) {} static inline void dup_anon_vma_name(struct vm_area_struct *orig_vma, struct vm_area_struct *new_vma) {} static inline void free_anon_vma_name(struct vm_area_struct *vma) {} static inline bool anon_vma_name_eq(struct anon_vma_name *anon_name1, struct anon_vma_name *anon_name2) { return true; } #endif /* CONFIG_ANON_VMA_NAME */ static inline void init_tlb_flush_pending(struct mm_struct *mm) { atomic_set(&mm->tlb_flush_pending, 0); } static inline void inc_tlb_flush_pending(struct mm_struct *mm) { atomic_inc(&mm->tlb_flush_pending); /* * The only time this value is relevant is when there are indeed pages * to flush. And we'll only flush pages after changing them, which * requires the PTL. * * So the ordering here is: * * atomic_inc(&mm->tlb_flush_pending); * spin_lock(&ptl); * ... * set_pte_at(); * spin_unlock(&ptl); * * spin_lock(&ptl) * mm_tlb_flush_pending(); * .... * spin_unlock(&ptl); * * flush_tlb_range(); * atomic_dec(&mm->tlb_flush_pending); * * Where the increment if constrained by the PTL unlock, it thus * ensures that the increment is visible if the PTE modification is * visible. After all, if there is no PTE modification, nobody cares * about TLB flushes either. * * This very much relies on users (mm_tlb_flush_pending() and * mm_tlb_flush_nested()) only caring about _specific_ PTEs (and * therefore specific PTLs), because with SPLIT_PTE_PTLOCKS and RCpc * locks (PPC) the unlock of one doesn't order against the lock of * another PTL. * * The decrement is ordered by the flush_tlb_range(), such that * mm_tlb_flush_pending() will not return false unless all flushes have * completed. */ } static inline void dec_tlb_flush_pending(struct mm_struct *mm) { /* * See inc_tlb_flush_pending(). * * This cannot be smp_mb__before_atomic() because smp_mb() simply does * not order against TLB invalidate completion, which is what we need. * * Therefore we must rely on tlb_flush_*() to guarantee order. */ atomic_dec(&mm->tlb_flush_pending); } static inline bool mm_tlb_flush_pending(struct mm_struct *mm) { /* * Must be called after having acquired the PTL; orders against that * PTLs release and therefore ensures that if we observe the modified * PTE we must also observe the increment from inc_tlb_flush_pending(). * * That is, it only guarantees to return true if there is a flush * pending for _this_ PTL. */ return atomic_read(&mm->tlb_flush_pending); } static inline bool mm_tlb_flush_nested(struct mm_struct *mm) { /* * Similar to mm_tlb_flush_pending(), we must have acquired the PTL * for which there is a TLB flush pending in order to guarantee * we've seen both that PTE modification and the increment. 
* * (no requirement on actually still holding the PTL, that is irrelevant) */ return atomic_read(&mm->tlb_flush_pending) > 1; } #ifdef CONFIG_MMU /* * Computes the pte marker to copy from the given source entry into dst_vma. * If no marker should be copied, returns 0. * The caller should insert a new pte created with make_pte_marker(). */ static inline pte_marker copy_pte_marker( swp_entry_t entry, struct vm_area_struct *dst_vma) { pte_marker srcm = pte_marker_get(entry); /* Always copy error entries. */ pte_marker dstm = srcm & (PTE_MARKER_POISONED | PTE_MARKER_GUARD); /* Only copy PTE markers if UFFD register matches. */ if ((srcm & PTE_MARKER_UFFD_WP) && userfaultfd_wp(dst_vma)) dstm |= PTE_MARKER_UFFD_WP; return dstm; } #endif /* * If this pte is wr-protected by uffd-wp in any form, arm the special pte to * replace a none pte. NOTE! This should only be called when *pte is already * cleared so we will never accidentally replace something valuable. Meanwhile * none pte also means we are not demoting the pte so tlb flushed is not needed. * E.g., when pte cleared the caller should have taken care of the tlb flush. * * Must be called with pgtable lock held so that no thread will see the none * pte, and if they see it, they'll fault and serialize at the pgtable lock. * * Returns true if an uffd-wp pte was installed, false otherwise. */ static inline bool pte_install_uffd_wp_if_needed(struct vm_area_struct *vma, unsigned long addr, pte_t *pte, pte_t pteval) { #ifdef CONFIG_PTE_MARKER_UFFD_WP bool arm_uffd_pte = false; /* The current status of the pte should be "cleared" before calling */ WARN_ON_ONCE(!pte_none(ptep_get(pte))); /* * NOTE: userfaultfd_wp_unpopulated() doesn't need this whole * thing, because when zapping either it means it's dropping the * page, or in TTU where the present pte will be quickly replaced * with a swap pte. There's no way of leaking the bit. */ if (vma_is_anonymous(vma) || !userfaultfd_wp(vma)) return false; /* A uffd-wp wr-protected normal pte */ if (unlikely(pte_present(pteval) && pte_uffd_wp(pteval))) arm_uffd_pte = true; /* * A uffd-wp wr-protected swap pte. Note: this should even cover an * existing pte marker with uffd-wp bit set. */ if (unlikely(pte_swp_uffd_wp_any(pteval))) arm_uffd_pte = true; if (unlikely(arm_uffd_pte)) { set_pte_at(vma->vm_mm, addr, pte, make_pte_marker(PTE_MARKER_UFFD_WP)); return true; } #endif return false; } static inline bool vma_has_recency(struct vm_area_struct *vma) { if (vma->vm_flags & (VM_SEQ_READ | VM_RAND_READ)) return false; if (vma->vm_file && (vma->vm_file->f_mode & FMODE_NOREUSE)) return false; return true; } #endif |
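/*
 * Illustrative sketch, not part of mm_inline.h above: the multi-gen LRU
 * stores the generation number in folio->flags biased by one, so an
 * all-zero LRU_GEN_MASK field means "not on a lru_gen list" and
 * folio_lru_gen() returns -1.  The helpers below use assumed names and
 * only restate the encoding seen in lru_gen_add_folio() and
 * folio_lru_gen() above.
 */
static inline unsigned long example_encode_lru_gen(int gen)
{
        /* as in lru_gen_add_folio(): flags = (gen + 1UL) << LRU_GEN_PGOFF */
        return (gen + 1UL) << LRU_GEN_PGOFF;
}

static inline int example_decode_lru_gen(unsigned long flags)
{
        /* as in folio_lru_gen(): ((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1 */
        return ((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
}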
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 2014 Linaro Ltd. <ard.biesheuvel@linaro.org>
 */

#ifndef __ASM_CPUFEATURE_H
#define __ASM_CPUFEATURE_H

#include <asm/alternative-macros.h>
#include <asm/cpucaps.h>
#include <asm/cputype.h>
#include <asm/hwcap.h>
#include <asm/sysreg.h>

#define MAX_CPU_FEATURES 192
#define cpu_feature(x) KERNEL_HWCAP_ ## x

#define ARM64_SW_FEATURE_OVERRIDE_NOKASLR 0
#define ARM64_SW_FEATURE_OVERRIDE_HVHE 4
#define ARM64_SW_FEATURE_OVERRIDE_RODATA_OFF 8

#ifndef __ASSEMBLY__

#include <linux/bug.h>
#include <linux/jump_label.h>
#include <linux/kernel.h>
#include <linux/cpumask.h>

/*
 * CPU feature register tracking
 *
 * The safe value of a CPUID feature field is dependent on the implications
 * of the values assigned to it by the architecture. Based on the relationship
 * between the values, the features are classified into 3 types - LOWER_SAFE,
 * HIGHER_SAFE and EXACT.
 *
 * The lowest value of all the CPUs is chosen for LOWER_SAFE and highest
 * for HIGHER_SAFE. It is expected that all CPUs have the same value for
 * a field when EXACT is specified, failing which, the safe value specified
 * in the table is chosen.
 */

enum ftr_type {
        FTR_EXACT,                      /* Use a predefined safe value */
        FTR_LOWER_SAFE,                 /* Smaller value is safe */
        FTR_HIGHER_SAFE,                /* Bigger value is safe */
        FTR_HIGHER_OR_ZERO_SAFE,        /* Bigger value is safe, but 0 is biggest */
};

#define FTR_STRICT      true    /* SANITY check strict matching required */
#define FTR_NONSTRICT   false   /* SANITY check ignored */
#define FTR_SIGNED      true    /* Value should be treated as signed */
#define FTR_UNSIGNED    false   /* Value should be treated as unsigned */

#define FTR_VISIBLE     true    /* Feature visible to the user space */
#define FTR_HIDDEN      false   /* Feature is hidden from the user */

#define FTR_VISIBLE_IF_IS_ENABLED(config)               \
        (IS_ENABLED(config) ? FTR_VISIBLE : FTR_HIDDEN)

struct arm64_ftr_bits {
        bool            sign;   /* Value is signed ? */
        bool            visible;
        bool            strict; /* CPU Sanity check: strict matching required ? */
        enum ftr_type   type;
        u8              shift;
        u8              width;
        s64             safe_val; /* safe value for FTR_EXACT features */
};

/*
 * Describe the early feature override to the core override code:
 *
 * @val                 Values that are to be merged into the final
 *                      sanitised value of the register. Only the bitfields
 *                      set to 1 in @mask are valid
 * @mask                Mask of the features that are overridden by @val
 *
 * A @mask field set to full-1 indicates that the corresponding field
 * in @val is a valid override.
 *
 * A @mask field set to full-0 with the corresponding @val field set
 * to full-0 denotes that this field has no override
 *
 * A @mask field set to full-0 with the corresponding @val field set
 * to full-1 denotes that this field has an invalid override.
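 *
 * For example (illustrative only), forcing a single 4-bit field that
 * starts at bit 28 to the value 1, while leaving every other field
 * untouched, could be described as:
 *
 *      mask = GENMASK_ULL(31, 28);
 *      val  = 1ULL << 28;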
*/ struct arm64_ftr_override { u64 val; u64 mask; }; /* * @arm64_ftr_reg - Feature register * @strict_mask Bits which should match across all CPUs for sanity. * @sys_val Safe value across the CPUs (system view) */ struct arm64_ftr_reg { const char *name; u64 strict_mask; u64 user_mask; u64 sys_val; u64 user_val; struct arm64_ftr_override *override; const struct arm64_ftr_bits *ftr_bits; }; extern struct arm64_ftr_reg arm64_ftr_reg_ctrel0; /* * CPU capabilities: * * We use arm64_cpu_capabilities to represent system features, errata work * arounds (both used internally by kernel and tracked in system_cpucaps) and * ELF HWCAPs (which are exposed to user). * * To support systems with heterogeneous CPUs, we need to make sure that we * detect the capabilities correctly on the system and take appropriate * measures to ensure there are no incompatibilities. * * This comment tries to explain how we treat the capabilities. * Each capability has the following list of attributes : * * 1) Scope of Detection : The system detects a given capability by * performing some checks at runtime. This could be, e.g, checking the * value of a field in CPU ID feature register or checking the cpu * model. The capability provides a call back ( @matches() ) to * perform the check. Scope defines how the checks should be performed. * There are three cases: * * a) SCOPE_LOCAL_CPU: check all the CPUs and "detect" if at least one * matches. This implies, we have to run the check on all the * booting CPUs, until the system decides that state of the * capability is finalised. (See section 2 below) * Or * b) SCOPE_SYSTEM: check all the CPUs and "detect" if all the CPUs * matches. This implies, we run the check only once, when the * system decides to finalise the state of the capability. If the * capability relies on a field in one of the CPU ID feature * registers, we use the sanitised value of the register from the * CPU feature infrastructure to make the decision. * Or * c) SCOPE_BOOT_CPU: Check only on the primary boot CPU to detect the * feature. This category is for features that are "finalised" * (or used) by the kernel very early even before the SMP cpus * are brought up. * * The process of detection is usually denoted by "update" capability * state in the code. * * 2) Finalise the state : The kernel should finalise the state of a * capability at some point during its execution and take necessary * actions if any. Usually, this is done, after all the boot-time * enabled CPUs are brought up by the kernel, so that it can make * better decision based on the available set of CPUs. However, there * are some special cases, where the action is taken during the early * boot by the primary boot CPU. (e.g, running the kernel at EL2 with * Virtualisation Host Extensions). The kernel usually disallows any * changes to the state of a capability once it finalises the capability * and takes any action, as it may be impossible to execute the actions * safely. A CPU brought up after a capability is "finalised" is * referred to as "Late CPU" w.r.t the capability. e.g, all secondary * CPUs are treated "late CPUs" for capabilities determined by the boot * CPU. * * At the moment there are two passes of finalising the capabilities. * a) Boot CPU scope capabilities - Finalised by primary boot CPU via * setup_boot_cpu_capabilities(). * b) Everything except (a) - Run via setup_system_capabilities(). 
* * 3) Verification: When a CPU is brought online (e.g, by user or by the * kernel), the kernel should make sure that it is safe to use the CPU, * by verifying that the CPU is compliant with the state of the * capabilities finalised already. This happens via : * * secondary_start_kernel()-> check_local_cpu_capabilities() * * As explained in (2) above, capabilities could be finalised at * different points in the execution. Each newly booted CPU is verified * against the capabilities that have been finalised by the time it * boots. * * a) SCOPE_BOOT_CPU : All CPUs are verified against the capability * except for the primary boot CPU. * * b) SCOPE_LOCAL_CPU, SCOPE_SYSTEM: All CPUs hotplugged on by the * user after the kernel boot are verified against the capability. * * If there is a conflict, the kernel takes an action, based on the * severity (e.g, a CPU could be prevented from booting or cause a * kernel panic). The CPU is allowed to "affect" the state of the * capability, if it has not been finalised already. See section 5 * for more details on conflicts. * * 4) Action: As mentioned in (2), the kernel can take an action for each * detected capability, on all CPUs on the system. Appropriate actions * include, turning on an architectural feature, modifying the control * registers (e.g, SCTLR, TCR etc.) or patching the kernel via * alternatives. The kernel patching is batched and performed at later * point. The actions are always initiated only after the capability * is finalised. This is usally denoted by "enabling" the capability. * The actions are initiated as follows : * a) Action is triggered on all online CPUs, after the capability is * finalised, invoked within the stop_machine() context from * enable_cpu_capabilitie(). * * b) Any late CPU, brought up after (1), the action is triggered via: * * check_local_cpu_capabilities() -> verify_local_cpu_capabilities() * * 5) Conflicts: Based on the state of the capability on a late CPU vs. * the system state, we could have the following combinations : * * x-----------------------------x * | Type | System | Late CPU | * |-----------------------------| * | a | y | n | * |-----------------------------| * | b | n | y | * x-----------------------------x * * Two separate flag bits are defined to indicate whether each kind of * conflict can be allowed: * ARM64_CPUCAP_OPTIONAL_FOR_LATE_CPU - Case(a) is allowed * ARM64_CPUCAP_PERMITTED_FOR_LATE_CPU - Case(b) is allowed * * Case (a) is not permitted for a capability that the system requires * all CPUs to have in order for the capability to be enabled. This is * typical for capabilities that represent enhanced functionality. * * Case (b) is not permitted for a capability that must be enabled * during boot if any CPU in the system requires it in order to run * safely. This is typical for erratum work arounds that cannot be * enabled after the corresponding capability is finalised. * * In some non-typical cases either both (a) and (b), or neither, * should be permitted. This can be described by including neither * or both flags in the capability's type field. * * In case of a conflict, the CPU is prevented from booting. If the * ARM64_CPUCAP_PANIC_ON_CONFLICT flag is specified for the capability, * then a kernel panic is triggered. */ /* * Decide how the capability is detected. 
* On any local CPU vs System wide vs the primary boot CPU */ #define ARM64_CPUCAP_SCOPE_LOCAL_CPU ((u16)BIT(0)) #define ARM64_CPUCAP_SCOPE_SYSTEM ((u16)BIT(1)) /* * The capabilitiy is detected on the Boot CPU and is used by kernel * during early boot. i.e, the capability should be "detected" and * "enabled" as early as possibly on all booting CPUs. */ #define ARM64_CPUCAP_SCOPE_BOOT_CPU ((u16)BIT(2)) #define ARM64_CPUCAP_SCOPE_MASK \ (ARM64_CPUCAP_SCOPE_SYSTEM | \ ARM64_CPUCAP_SCOPE_LOCAL_CPU | \ ARM64_CPUCAP_SCOPE_BOOT_CPU) #define SCOPE_SYSTEM ARM64_CPUCAP_SCOPE_SYSTEM #define SCOPE_LOCAL_CPU ARM64_CPUCAP_SCOPE_LOCAL_CPU #define SCOPE_BOOT_CPU ARM64_CPUCAP_SCOPE_BOOT_CPU #define SCOPE_ALL ARM64_CPUCAP_SCOPE_MASK /* * Is it permitted for a late CPU to have this capability when system * hasn't already enabled it ? */ #define ARM64_CPUCAP_PERMITTED_FOR_LATE_CPU ((u16)BIT(4)) /* Is it safe for a late CPU to miss this capability when system has it */ #define ARM64_CPUCAP_OPTIONAL_FOR_LATE_CPU ((u16)BIT(5)) /* Panic when a conflict is detected */ #define ARM64_CPUCAP_PANIC_ON_CONFLICT ((u16)BIT(6)) /* * CPU errata workarounds that need to be enabled at boot time if one or * more CPUs in the system requires it. When one of these capabilities * has been enabled, it is safe to allow any CPU to boot that doesn't * require the workaround. However, it is not safe if a "late" CPU * requires a workaround and the system hasn't enabled it already. */ #define ARM64_CPUCAP_LOCAL_CPU_ERRATUM \ (ARM64_CPUCAP_SCOPE_LOCAL_CPU | ARM64_CPUCAP_OPTIONAL_FOR_LATE_CPU) /* * CPU feature detected at boot time based on system-wide value of a * feature. It is safe for a late CPU to have this feature even though * the system hasn't enabled it, although the feature will not be used * by Linux in this case. If the system has enabled this feature already, * then every late CPU must have it. */ #define ARM64_CPUCAP_SYSTEM_FEATURE \ (ARM64_CPUCAP_SCOPE_SYSTEM | ARM64_CPUCAP_PERMITTED_FOR_LATE_CPU) /* * CPU feature detected at boot time based on feature of one or more CPUs. * All possible conflicts for a late CPU are ignored. * NOTE: this means that a late CPU with the feature will *not* cause the * capability to be advertised by cpus_have_*cap()! */ #define ARM64_CPUCAP_WEAK_LOCAL_CPU_FEATURE \ (ARM64_CPUCAP_SCOPE_LOCAL_CPU | \ ARM64_CPUCAP_OPTIONAL_FOR_LATE_CPU | \ ARM64_CPUCAP_PERMITTED_FOR_LATE_CPU) /* * CPU feature detected at boot time, on one or more CPUs. A late CPU * is not allowed to have the capability when the system doesn't have it. * It is Ok for a late CPU to miss the feature. */ #define ARM64_CPUCAP_BOOT_RESTRICTED_CPU_LOCAL_FEATURE \ (ARM64_CPUCAP_SCOPE_LOCAL_CPU | \ ARM64_CPUCAP_OPTIONAL_FOR_LATE_CPU) /* * CPU feature used early in the boot based on the boot CPU. All secondary * CPUs must match the state of the capability as detected by the boot CPU. In * case of a conflict, a kernel panic is triggered. */ #define ARM64_CPUCAP_STRICT_BOOT_CPU_FEATURE \ (ARM64_CPUCAP_SCOPE_BOOT_CPU | ARM64_CPUCAP_PANIC_ON_CONFLICT) /* * CPU feature used early in the boot based on the boot CPU. It is safe for a * late CPU to have this feature even though the boot CPU hasn't enabled it, * although the feature will not be used by Linux in this case. If the boot CPU * has enabled this feature already, then every late CPU must have it. 
*/ #define ARM64_CPUCAP_BOOT_CPU_FEATURE \ (ARM64_CPUCAP_SCOPE_BOOT_CPU | ARM64_CPUCAP_PERMITTED_FOR_LATE_CPU) struct arm64_cpu_capabilities { const char *desc; u16 capability; u16 type; bool (*matches)(const struct arm64_cpu_capabilities *caps, int scope); /* * Take the appropriate actions to configure this capability * for this CPU. If the capability is detected by the kernel * this will be called on all the CPUs in the system, * including the hotplugged CPUs, regardless of whether the * capability is available on that specific CPU. This is * useful for some capabilities (e.g., working around CPU * errata), where all the CPUs must take some action (e.g., * changing system control/configuration). Thus, if an action * is required only if the CPU has the capability, then the * routine must check it before taking any action. */ void (*cpu_enable)(const struct arm64_cpu_capabilities *cap); union { struct { /* To be used for erratum handling only */ struct midr_range midr_range; const struct arm64_midr_revidr { u32 midr_rv; /* revision/variant */ u32 revidr_mask; } * const fixed_revs; }; const struct midr_range *midr_range_list; struct { /* Feature register checking */ u32 sys_reg; u8 field_pos; u8 field_width; u8 min_field_value; u8 max_field_value; u8 hwcap_type; bool sign; unsigned long hwcap; }; }; /* * An optional list of "matches/cpu_enable" pairs for the same * "capability" of the same "type" as described by the parent. * Only matches(), cpu_enable() and fields relevant to these * methods are significant in the list. The cpu_enable() method is * invoked only if the corresponding entry matches(). * However, if a cpu_enable() method is associated * with multiple matches(), care should be taken that either * the match criteria are mutually exclusive, or that the * method is robust against being called multiple times. */ const struct arm64_cpu_capabilities *match_list; const struct cpumask *cpus; }; static inline int cpucap_default_scope(const struct arm64_cpu_capabilities *cap) { return cap->type & ARM64_CPUCAP_SCOPE_MASK; } /* * Generic helper for handling capabilities with multiple (match,enable) pairs * of callbacks, sharing the same capability bit. * Iterate over each entry to see if at least one matches. 
*/ static inline bool cpucap_multi_entry_cap_matches(const struct arm64_cpu_capabilities *entry, int scope) { const struct arm64_cpu_capabilities *caps; for (caps = entry->match_list; caps->matches; caps++) if (caps->matches(caps, scope)) return true; return false; } static __always_inline bool is_vhe_hyp_code(void) { /* Only defined for code run in VHE hyp context */ return __is_defined(__KVM_VHE_HYPERVISOR__); } static __always_inline bool is_nvhe_hyp_code(void) { /* Only defined for code run in NVHE hyp context */ return __is_defined(__KVM_NVHE_HYPERVISOR__); } static __always_inline bool is_hyp_code(void) { return is_vhe_hyp_code() || is_nvhe_hyp_code(); } extern DECLARE_BITMAP(system_cpucaps, ARM64_NCAPS); extern DECLARE_BITMAP(boot_cpucaps, ARM64_NCAPS); #define for_each_available_cap(cap) \ for_each_set_bit(cap, system_cpucaps, ARM64_NCAPS) bool this_cpu_has_cap(unsigned int cap); void cpu_set_feature(unsigned int num); bool cpu_have_feature(unsigned int num); unsigned long cpu_get_elf_hwcap(void); unsigned long cpu_get_elf_hwcap2(void); unsigned long cpu_get_elf_hwcap3(void); #define cpu_set_named_feature(name) cpu_set_feature(cpu_feature(name)) #define cpu_have_named_feature(name) cpu_have_feature(cpu_feature(name)) static __always_inline bool boot_capabilities_finalized(void) { return alternative_has_cap_likely(ARM64_ALWAYS_BOOT); } static __always_inline bool system_capabilities_finalized(void) { return alternative_has_cap_likely(ARM64_ALWAYS_SYSTEM); } /* * Test for a capability with a runtime check. * * Before the capability is detected, this returns false. */ static __always_inline bool cpus_have_cap(unsigned int num) { if (__builtin_constant_p(num) && !cpucap_is_possible(num)) return false; if (num >= ARM64_NCAPS) return false; return arch_test_bit(num, system_cpucaps); } /* * Test for a capability without a runtime check. * * Before boot capabilities are finalized, this will BUG(). * After boot capabilities are finalized, this is patched to avoid a runtime * check. * * @num must be a compile-time constant. */ static __always_inline bool cpus_have_final_boot_cap(int num) { if (boot_capabilities_finalized()) return alternative_has_cap_unlikely(num); else BUG(); } /* * Test for a capability without a runtime check. * * Before system capabilities are finalized, this will BUG(). * After system capabilities are finalized, this is patched to avoid a runtime * check. * * @num must be a compile-time constant. */ static __always_inline bool cpus_have_final_cap(int num) { if (system_capabilities_finalized()) return alternative_has_cap_unlikely(num); else BUG(); } static inline int __attribute_const__ cpuid_feature_extract_signed_field_width(u64 features, int field, int width) { return (s64)(features << (64 - width - field)) >> (64 - width); } static inline int __attribute_const__ cpuid_feature_extract_signed_field(u64 features, int field) { return cpuid_feature_extract_signed_field_width(features, field, 4); } static __always_inline unsigned int __attribute_const__ cpuid_feature_extract_unsigned_field_width(u64 features, int field, int width) { return (u64)(features << (64 - width - field)) >> (64 - width); } static __always_inline unsigned int __attribute_const__ cpuid_feature_extract_unsigned_field(u64 features, int field) { return cpuid_feature_extract_unsigned_field_width(features, field, 4); } /* * Fields that identify the version of the Performance Monitors Extension do * not follow the standard ID scheme. 
See ARM DDI 0487E.a page D13-2825, * "Alternative ID scheme used for the Performance Monitors Extension version". */ static inline u64 __attribute_const__ cpuid_feature_cap_perfmon_field(u64 features, int field, u64 cap) { u64 val = cpuid_feature_extract_unsigned_field(features, field); u64 mask = GENMASK_ULL(field + 3, field); /* Treat IMPLEMENTATION DEFINED functionality as unimplemented */ if (val == ID_AA64DFR0_EL1_PMUVer_IMP_DEF) val = 0; if (val > cap) { features &= ~mask; features |= (cap << field) & mask; } return features; } static inline u64 arm64_ftr_mask(const struct arm64_ftr_bits *ftrp) { return (u64)GENMASK(ftrp->shift + ftrp->width - 1, ftrp->shift); } static inline u64 arm64_ftr_reg_user_value(const struct arm64_ftr_reg *reg) { return (reg->user_val | (reg->sys_val & reg->user_mask)); } static inline int __attribute_const__ cpuid_feature_extract_field_width(u64 features, int field, int width, bool sign) { if (WARN_ON_ONCE(!width)) width = 4; return (sign) ? cpuid_feature_extract_signed_field_width(features, field, width) : cpuid_feature_extract_unsigned_field_width(features, field, width); } static inline int __attribute_const__ cpuid_feature_extract_field(u64 features, int field, bool sign) { return cpuid_feature_extract_field_width(features, field, 4, sign); } static inline s64 arm64_ftr_value(const struct arm64_ftr_bits *ftrp, u64 val) { return (s64)cpuid_feature_extract_field_width(val, ftrp->shift, ftrp->width, ftrp->sign); } static inline bool id_aa64mmfr0_mixed_endian_el0(u64 mmfr0) { return cpuid_feature_extract_unsigned_field(mmfr0, ID_AA64MMFR0_EL1_BIGEND_SHIFT) == 0x1 || cpuid_feature_extract_unsigned_field(mmfr0, ID_AA64MMFR0_EL1_BIGENDEL0_SHIFT) == 0x1; } static inline bool id_aa64pfr0_32bit_el1(u64 pfr0) { u32 val = cpuid_feature_extract_unsigned_field(pfr0, ID_AA64PFR0_EL1_EL1_SHIFT); return val == ID_AA64PFR0_EL1_EL1_AARCH32; } static inline bool id_aa64pfr0_32bit_el0(u64 pfr0) { u32 val = cpuid_feature_extract_unsigned_field(pfr0, ID_AA64PFR0_EL1_EL0_SHIFT); return val == ID_AA64PFR0_EL1_EL0_AARCH32; } static inline bool id_aa64pfr0_sve(u64 pfr0) { u32 val = cpuid_feature_extract_unsigned_field(pfr0, ID_AA64PFR0_EL1_SVE_SHIFT); return val > 0; } static inline bool id_aa64pfr1_sme(u64 pfr1) { u32 val = cpuid_feature_extract_unsigned_field(pfr1, ID_AA64PFR1_EL1_SME_SHIFT); return val > 0; } static inline bool id_aa64pfr0_mpam(u64 pfr0) { u32 val = cpuid_feature_extract_unsigned_field(pfr0, ID_AA64PFR0_EL1_MPAM_SHIFT); return val > 0; } static inline bool id_aa64pfr1_mte(u64 pfr1) { u32 val = cpuid_feature_extract_unsigned_field(pfr1, ID_AA64PFR1_EL1_MTE_SHIFT); return val >= ID_AA64PFR1_EL1_MTE_MTE2; } void __init setup_boot_cpu_features(void); void __init setup_system_features(void); void __init setup_user_features(void); void check_local_cpu_capabilities(void); u64 read_sanitised_ftr_reg(u32 id); u64 __read_sysreg_by_encoding(u32 sys_id); static inline bool cpu_supports_mixed_endian_el0(void) { return id_aa64mmfr0_mixed_endian_el0(read_cpuid(ID_AA64MMFR0_EL1)); } static inline bool supports_csv2p3(int scope) { u64 pfr0; u8 csv2_val; if (scope == SCOPE_LOCAL_CPU) pfr0 = read_sysreg_s(SYS_ID_AA64PFR0_EL1); else pfr0 = read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1); csv2_val = cpuid_feature_extract_unsigned_field(pfr0, ID_AA64PFR0_EL1_CSV2_SHIFT); return csv2_val == 3; } static inline bool supports_clearbhb(int scope) { u64 isar2; if (scope == SCOPE_LOCAL_CPU) isar2 = read_sysreg_s(SYS_ID_AA64ISAR2_EL1); else isar2 = read_sanitised_ftr_reg(SYS_ID_AA64ISAR2_EL1); 
return cpuid_feature_extract_unsigned_field(isar2, ID_AA64ISAR2_EL1_CLRBHB_SHIFT); } const struct cpumask *system_32bit_el0_cpumask(void); const struct cpumask *fallback_32bit_el0_cpumask(void); DECLARE_STATIC_KEY_FALSE(arm64_mismatched_32bit_el0); static inline bool system_supports_32bit_el0(void) { u64 pfr0 = read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1); return static_branch_unlikely(&arm64_mismatched_32bit_el0) || id_aa64pfr0_32bit_el0(pfr0); } static inline bool system_supports_4kb_granule(void) { u64 mmfr0; u32 val; mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1); val = cpuid_feature_extract_unsigned_field(mmfr0, ID_AA64MMFR0_EL1_TGRAN4_SHIFT); return (val >= ID_AA64MMFR0_EL1_TGRAN4_SUPPORTED_MIN) && (val <= ID_AA64MMFR0_EL1_TGRAN4_SUPPORTED_MAX); } static inline bool system_supports_64kb_granule(void) { u64 mmfr0; u32 val; mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1); val = cpuid_feature_extract_unsigned_field(mmfr0, ID_AA64MMFR0_EL1_TGRAN64_SHIFT); return (val >= ID_AA64MMFR0_EL1_TGRAN64_SUPPORTED_MIN) && (val <= ID_AA64MMFR0_EL1_TGRAN64_SUPPORTED_MAX); } static inline bool system_supports_16kb_granule(void) { u64 mmfr0; u32 val; mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1); val = cpuid_feature_extract_unsigned_field(mmfr0, ID_AA64MMFR0_EL1_TGRAN16_SHIFT); return (val >= ID_AA64MMFR0_EL1_TGRAN16_SUPPORTED_MIN) && (val <= ID_AA64MMFR0_EL1_TGRAN16_SUPPORTED_MAX); } static inline bool system_supports_mixed_endian_el0(void) { return id_aa64mmfr0_mixed_endian_el0(read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1)); } static inline bool system_supports_mixed_endian(void) { u64 mmfr0; u32 val; mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1); val = cpuid_feature_extract_unsigned_field(mmfr0, ID_AA64MMFR0_EL1_BIGEND_SHIFT); return val == 0x1; } static __always_inline bool system_supports_fpsimd(void) { return alternative_has_cap_likely(ARM64_HAS_FPSIMD); } static inline bool system_uses_hw_pan(void) { return alternative_has_cap_unlikely(ARM64_HAS_PAN); } static inline bool system_uses_ttbr0_pan(void) { return IS_ENABLED(CONFIG_ARM64_SW_TTBR0_PAN) && !system_uses_hw_pan(); } static __always_inline bool system_supports_sve(void) { return alternative_has_cap_unlikely(ARM64_SVE); } static __always_inline bool system_supports_sme(void) { return alternative_has_cap_unlikely(ARM64_SME); } static __always_inline bool system_supports_sme2(void) { return alternative_has_cap_unlikely(ARM64_SME2); } static __always_inline bool system_supports_fa64(void) { return alternative_has_cap_unlikely(ARM64_SME_FA64); } static __always_inline bool system_supports_tpidr2(void) { return system_supports_sme(); } static __always_inline bool system_supports_fpmr(void) { return alternative_has_cap_unlikely(ARM64_HAS_FPMR); } static __always_inline bool system_supports_cnp(void) { return alternative_has_cap_unlikely(ARM64_HAS_CNP); } static inline bool system_supports_address_auth(void) { return cpus_have_final_boot_cap(ARM64_HAS_ADDRESS_AUTH); } static inline bool system_supports_generic_auth(void) { return alternative_has_cap_unlikely(ARM64_HAS_GENERIC_AUTH); } static inline bool system_has_full_ptr_auth(void) { return system_supports_address_auth() && system_supports_generic_auth(); } static __always_inline bool system_uses_irq_prio_masking(void) { return alternative_has_cap_unlikely(ARM64_HAS_GIC_PRIO_MASKING); } static inline bool system_supports_mte(void) { return alternative_has_cap_unlikely(ARM64_MTE); } static inline bool system_has_prio_mask_debugging(void) { return 
IS_ENABLED(CONFIG_ARM64_DEBUG_PRIORITY_MASKING) && system_uses_irq_prio_masking(); } static inline bool system_supports_bti(void) { return cpus_have_final_cap(ARM64_BTI); } static inline bool system_supports_bti_kernel(void) { return IS_ENABLED(CONFIG_ARM64_BTI_KERNEL) && cpus_have_final_boot_cap(ARM64_BTI); } static inline bool system_supports_tlb_range(void) { return alternative_has_cap_unlikely(ARM64_HAS_TLB_RANGE); } static inline bool system_supports_lpa2(void) { return cpus_have_final_cap(ARM64_HAS_LPA2); } static inline bool system_supports_poe(void) { return alternative_has_cap_unlikely(ARM64_HAS_S1POE); } static inline bool system_supports_gcs(void) { return alternative_has_cap_unlikely(ARM64_HAS_GCS); } static inline bool system_supports_haft(void) { return cpus_have_final_cap(ARM64_HAFT); } static __always_inline bool system_supports_mpam(void) { return alternative_has_cap_unlikely(ARM64_MPAM); } static __always_inline bool system_supports_mpam_hcr(void) { return alternative_has_cap_unlikely(ARM64_MPAM_HCR); } int do_emulate_mrs(struct pt_regs *regs, u32 sys_reg, u32 rt); bool try_emulate_mrs(struct pt_regs *regs, u32 isn); static inline u32 id_aa64mmfr0_parange_to_phys_shift(int parange) { switch (parange) { case ID_AA64MMFR0_EL1_PARANGE_32: return 32; case ID_AA64MMFR0_EL1_PARANGE_36: return 36; case ID_AA64MMFR0_EL1_PARANGE_40: return 40; case ID_AA64MMFR0_EL1_PARANGE_42: return 42; case ID_AA64MMFR0_EL1_PARANGE_44: return 44; case ID_AA64MMFR0_EL1_PARANGE_48: return 48; case ID_AA64MMFR0_EL1_PARANGE_52: return 52; /* * A future PE could use a value unknown to the kernel. * However, by the "D10.1.4 Principles of the ID scheme * for fields in ID registers", ARM DDI 0487C.a, any new * value is guaranteed to be higher than what we know already. * As a safe limit, we return the limit supported by the kernel. */ default: return CONFIG_ARM64_PA_BITS; } } /* Check whether hardware update of the Access flag is supported */ static inline bool cpu_has_hw_af(void) { u64 mmfr1; if (!IS_ENABLED(CONFIG_ARM64_HW_AFDBM)) return false; /* * Use cached version to avoid emulated msr operation on KVM * guests. */ mmfr1 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1); return cpuid_feature_extract_unsigned_field(mmfr1, ID_AA64MMFR1_EL1_HAFDBS_SHIFT); } static inline bool cpu_has_pan(void) { u64 mmfr1 = read_cpuid(ID_AA64MMFR1_EL1); return cpuid_feature_extract_unsigned_field(mmfr1, ID_AA64MMFR1_EL1_PAN_SHIFT); } #ifdef CONFIG_ARM64_AMU_EXTN /* Check whether the cpu supports the Activity Monitors Unit (AMU) */ extern bool cpu_has_amu_feat(int cpu); #else static inline bool cpu_has_amu_feat(int cpu) { return false; } #endif /* Get a cpu that supports the Activity Monitors Unit (AMU) */ extern int get_cpu_with_amu_feat(void); static inline unsigned int get_vmid_bits(u64 mmfr1) { int vmid_bits; vmid_bits = cpuid_feature_extract_unsigned_field(mmfr1, ID_AA64MMFR1_EL1_VMIDBits_SHIFT); if (vmid_bits == ID_AA64MMFR1_EL1_VMIDBits_16) return 16; /* * Return the default here even if any reserved * value is fetched from the system register. 
*/ return 8; } s64 arm64_ftr_safe_value(const struct arm64_ftr_bits *ftrp, s64 new, s64 cur); struct arm64_ftr_reg *get_arm64_ftr_reg(u32 sys_id); extern struct arm64_ftr_override id_aa64mmfr0_override; extern struct arm64_ftr_override id_aa64mmfr1_override; extern struct arm64_ftr_override id_aa64mmfr2_override; extern struct arm64_ftr_override id_aa64pfr0_override; extern struct arm64_ftr_override id_aa64pfr1_override; extern struct arm64_ftr_override id_aa64zfr0_override; extern struct arm64_ftr_override id_aa64smfr0_override; extern struct arm64_ftr_override id_aa64isar1_override; extern struct arm64_ftr_override id_aa64isar2_override; extern struct arm64_ftr_override arm64_sw_feature_override; static inline u64 arm64_apply_feature_override(u64 val, int feat, int width, const struct arm64_ftr_override *override) { u64 oval = override->val; /* * When it encounters an invalid override (e.g., an override that * cannot be honoured due to a missing CPU feature), the early idreg * override code will set the mask to 0x0 and the value to non-zero for * the field in question. In order to determine whether the override is * valid or not for the field we are interested in, we first need to * disregard bits belonging to other fields. */ oval &= GENMASK_ULL(feat + width - 1, feat); /* * The override is valid if all value bits are accounted for in the * mask. If so, replace the masked bits with the override value. */ if (oval == (oval & override->mask)) { val &= ~override->mask; val |= oval; } /* Extract the field from the updated value */ return cpuid_feature_extract_unsigned_field(val, feat); } static inline bool arm64_test_sw_feature_override(int feat) { /* * Software features are pseudo CPU features that have no underlying * CPUID system register value to apply the override to. */ return arm64_apply_feature_override(0, feat, 4, &arm64_sw_feature_override); } static inline bool kaslr_disabled_cmdline(void) { return arm64_test_sw_feature_override(ARM64_SW_FEATURE_OVERRIDE_NOKASLR); } u32 get_kvm_ipa_limit(void); void dump_cpu_features(void); static inline bool cpu_has_bti(void) { if (!IS_ENABLED(CONFIG_ARM64_BTI)) return false; return arm64_apply_feature_override(read_cpuid(ID_AA64PFR1_EL1), ID_AA64PFR1_EL1_BT_SHIFT, 4, &id_aa64pfr1_override); } static inline bool cpu_has_pac(void) { u64 isar1, isar2; if (!IS_ENABLED(CONFIG_ARM64_PTR_AUTH)) return false; isar1 = read_cpuid(ID_AA64ISAR1_EL1); isar2 = read_cpuid(ID_AA64ISAR2_EL1); if (arm64_apply_feature_override(isar1, ID_AA64ISAR1_EL1_APA_SHIFT, 4, &id_aa64isar1_override)) return true; if (arm64_apply_feature_override(isar1, ID_AA64ISAR1_EL1_API_SHIFT, 4, &id_aa64isar1_override)) return true; return arm64_apply_feature_override(isar2, ID_AA64ISAR2_EL1_APA3_SHIFT, 4, &id_aa64isar2_override); } static inline bool cpu_has_lva(void) { u64 mmfr2; mmfr2 = read_sysreg_s(SYS_ID_AA64MMFR2_EL1); mmfr2 &= ~id_aa64mmfr2_override.mask; mmfr2 |= id_aa64mmfr2_override.val; return cpuid_feature_extract_unsigned_field(mmfr2, ID_AA64MMFR2_EL1_VARange_SHIFT); } static inline bool cpu_has_lpa2(void) { #ifdef CONFIG_ARM64_LPA2 u64 mmfr0; int feat; mmfr0 = read_sysreg(id_aa64mmfr0_el1); mmfr0 &= ~id_aa64mmfr0_override.mask; mmfr0 |= id_aa64mmfr0_override.val; feat = cpuid_feature_extract_signed_field(mmfr0, ID_AA64MMFR0_EL1_TGRAN_SHIFT); return feat >= ID_AA64MMFR0_EL1_TGRAN_LPA2; #else return false; #endif } #endif /* __ASSEMBLY__ */ #endif |
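/*
 * Illustrative sketch only: a hypothetical capability table entry showing how
 * the scope/type macros and the struct arm64_cpu_capabilities fields above fit
 * together. The capability ID (ARM64_EXAMPLE_CAP), the matcher and the table
 * are placeholders rather than real kernel symbols, and the block is guarded
 * out so it has no effect.
 */
#if 0
static bool example_matches(const struct arm64_cpu_capabilities *cap, int scope)
{
	/* Read the raw register locally, or the sanitised system-wide view */
	u64 pfr0 = (scope == SCOPE_LOCAL_CPU) ?
		   read_sysreg_s(SYS_ID_AA64PFR0_EL1) :
		   read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1);

	return cpuid_feature_extract_field_width(pfr0, cap->field_pos,
						 cap->field_width, cap->sign) >=
	       cap->min_field_value;
}

static const struct arm64_cpu_capabilities example_caps[] = {
	{
		.desc = "Example system-wide feature",
		.capability = ARM64_EXAMPLE_CAP,	/* hypothetical cap ID */
		.type = ARM64_CPUCAP_SYSTEM_FEATURE,
		.matches = example_matches,
		.sys_reg = SYS_ID_AA64PFR0_EL1,
		.field_pos = ID_AA64PFR0_EL1_SVE_SHIFT,
		.field_width = 4,
		.sign = false,
		.min_field_value = 1,
	},
	{},
};
#endif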
/* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright (C) 2012 ARM Ltd. */ #ifndef __ASM_MMU_H #define __ASM_MMU_H #include <asm/cputype.h> #define MMCF_AARCH32 0x1 /* mm context flag for AArch32 executables */ #define USER_ASID_BIT 48 #define USER_ASID_FLAG (UL(1) << USER_ASID_BIT) #define TTBR_ASID_MASK (UL(0xffff) << 48) #ifndef __ASSEMBLY__ #include <linux/refcount.h> #include <asm/cpufeature.h> typedef struct { atomic64_t id; #ifdef CONFIG_COMPAT void *sigpage; #endif refcount_t pinned; void *vdso; unsigned long flags; u8 pkey_allocation_map; } mm_context_t; /* * We use atomic64_read() here because the ASID for an 'mm_struct' can * be reallocated when scheduling one of its threads following a * rollover event (see new_context() and flush_context()). In this case, * a concurrent TLBI (e.g. via try_to_unmap_one() and ptep_clear_flush()) * may use a stale ASID. This is fine in principle as the new ASID is * guaranteed to be clean in the TLB, but the TLBI routines have to take * care to handle the following race: * * CPU 0 CPU 1 CPU 2 * * // ptep_clear_flush(mm) * xchg_relaxed(pte, 0) * DSB ISHST * old = ASID(mm) * | <rollover> * | new = new_context(mm) * \-----------------> atomic_set(mm->context.id, new) * cpu_switch_mm(mm) * // Hardware walk of pte using new ASID * TLBI(old) * * In this scenario, the barrier on CPU 0 and the dependency on CPU 1 * ensure that the page-table walker on CPU 1 *must* see the invalid PTE * written by CPU 0. */ #define ASID(mm) (atomic64_read(&(mm)->context.id) & 0xffff) static inline bool arm64_kernel_unmapped_at_el0(void) { return alternative_has_cap_unlikely(ARM64_UNMAP_KERNEL_AT_EL0); } extern void arm64_memblock_init(void); extern void paging_init(void); extern void bootmem_init(void); extern void create_mapping_noalloc(phys_addr_t phys, unsigned long virt, phys_addr_t size, pgprot_t prot); extern void create_pgd_mapping(struct mm_struct *mm, phys_addr_t phys, unsigned long virt, phys_addr_t size, pgprot_t prot, bool page_mappings_only); extern void *fixmap_remap_fdt(phys_addr_t dt_phys, int *size, pgprot_t prot); extern void mark_linear_text_alias_ro(void); /* * This check is triggered during the early boot before the cpufeature * framework is initialised. Checking the status on the local CPU allows the * boot CPU to detect the need for non-global mappings and thus avoid a * pagetable re-write after all the CPUs are booted. This check will be * run anyway on individual CPUs, allowing us to get a consistent * state once the SMP CPUs are up and thus make the switch to non-global * mappings if required. */ static inline bool kaslr_requires_kpti(void) { /* * E0PD does a similar job to KPTI so can be used instead * where available. */ if (IS_ENABLED(CONFIG_ARM64_E0PD)) { u64 mmfr2 = read_sysreg_s(SYS_ID_AA64MMFR2_EL1); if (cpuid_feature_extract_unsigned_field(mmfr2, ID_AA64MMFR2_EL1_E0PD_SHIFT)) return false; } /* * Systems affected by Cavium erratum 27456 are incompatible * with KPTI. 
*/ if (IS_ENABLED(CONFIG_CAVIUM_ERRATUM_27456)) { extern const struct midr_range cavium_erratum_27456_cpus[]; if (is_midr_in_range_list(read_cpuid_id(), cavium_erratum_27456_cpus)) return false; } return true; } #endif /* !__ASSEMBLY__ */ #endif |
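/*
 * Illustrative sketch only: how the ASID-related macros above carve up a TTBR
 * value. The ASID appears to live in bits 63:48 (TTBR_ASID_MASK), with
 * USER_ASID_FLAG setting bit 48 so the user ASID can differ from the kernel
 * one when kernel unmapping at EL0 is in use. The helper names are
 * hypothetical and the block is guarded out.
 */
#if 0
static inline u64 example_ttbr_asid(u64 ttbr)
{
	/* Extract the 16-bit ASID field from a TTBR value */
	return (ttbr & TTBR_ASID_MASK) >> USER_ASID_BIT;
}

static inline u64 example_ttbr_clear_asid(u64 ttbr)
{
	/* Drop the ASID field, keeping only the table base address bits */
	return ttbr & ~TTBR_ASID_MASK;
}
#endif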
// SPDX-License-Identifier: GPL-2.0-only #undef DEBUG /* * ARM performance counter support. * * Copyright (C) 2009 picoChip Designs, Ltd., Jamie Iles * Copyright (C) 2010 ARM Ltd., Will Deacon <will.deacon@arm.com> * * This code is based on the sparc64 perf event code, which is in turn based * on the x86 code. */ #define pr_fmt(fmt) "hw perfevents: " fmt #include <linux/bitmap.h> #include <linux/cpumask.h> #include <linux/cpu_pm.h> #include <linux/export.h> #include <linux/kernel.h> #include <linux/perf/arm_pmu.h> #include <linux/slab.h> #include <linux/sched/clock.h> #include <linux/spinlock.h> #include <linux/irq.h> #include <linux/irqdesc.h> #include <asm/irq_regs.h> static int armpmu_count_irq_users(const int irq); struct pmu_irq_ops { void (*enable_pmuirq)(unsigned int irq); void (*disable_pmuirq)(unsigned int irq); void (*free_pmuirq)(unsigned int irq, int cpu, void __percpu *devid); }; static void armpmu_free_pmuirq(unsigned int irq, int cpu, void __percpu *devid) { free_irq(irq, per_cpu_ptr(devid, cpu)); } static const struct pmu_irq_ops pmuirq_ops = { .enable_pmuirq = enable_irq, .disable_pmuirq = disable_irq_nosync, .free_pmuirq = armpmu_free_pmuirq }; static void armpmu_free_pmunmi(unsigned int irq, int cpu, void __percpu *devid) { free_nmi(irq, per_cpu_ptr(devid, cpu)); } static const struct pmu_irq_ops pmunmi_ops = { .enable_pmuirq = enable_nmi, .disable_pmuirq = disable_nmi_nosync, .free_pmuirq = armpmu_free_pmunmi }; static void armpmu_enable_percpu_pmuirq(unsigned int irq) { enable_percpu_irq(irq, IRQ_TYPE_NONE); } static void armpmu_free_percpu_pmuirq(unsigned int irq, int cpu, void __percpu *devid) { if (armpmu_count_irq_users(irq) == 1) free_percpu_irq(irq, devid); } static const struct pmu_irq_ops percpu_pmuirq_ops = { .enable_pmuirq = armpmu_enable_percpu_pmuirq, .disable_pmuirq = disable_percpu_irq, .free_pmuirq = armpmu_free_percpu_pmuirq }; static void armpmu_enable_percpu_pmunmi(unsigned int irq) { if (!prepare_percpu_nmi(irq)) enable_percpu_nmi(irq, IRQ_TYPE_NONE); } static void armpmu_disable_percpu_pmunmi(unsigned int irq) { disable_percpu_nmi(irq); teardown_percpu_nmi(irq); } static void armpmu_free_percpu_pmunmi(unsigned int irq, int cpu, void __percpu *devid) { if (armpmu_count_irq_users(irq) == 1) free_percpu_nmi(irq, devid); } static const struct pmu_irq_ops percpu_pmunmi_ops = { .enable_pmuirq = armpmu_enable_percpu_pmunmi, .disable_pmuirq = armpmu_disable_percpu_pmunmi, .free_pmuirq = armpmu_free_percpu_pmunmi }; static DEFINE_PER_CPU(struct arm_pmu *, cpu_armpmu); static DEFINE_PER_CPU(int, cpu_irq); static DEFINE_PER_CPU(const struct pmu_irq_ops *, cpu_irq_ops); static bool has_nmi; static inline u64 arm_pmu_event_max_period(struct perf_event *event) { if (event->hw.flags & ARMPMU_EVT_64BIT) return GENMASK_ULL(63, 0); else if (event->hw.flags & ARMPMU_EVT_63BIT) return GENMASK_ULL(62, 0); else if (event->hw.flags & ARMPMU_EVT_47BIT) return GENMASK_ULL(46, 0); else return GENMASK_ULL(31, 0); } static int armpmu_map_cache_event(const unsigned (*cache_map) [PERF_COUNT_HW_CACHE_MAX] [PERF_COUNT_HW_CACHE_OP_MAX] [PERF_COUNT_HW_CACHE_RESULT_MAX], u64 config) { unsigned int cache_type, cache_op, cache_result, ret; cache_type = (config >> 0) & 0xff; if (cache_type >= PERF_COUNT_HW_CACHE_MAX) return -EINVAL; cache_op = (config 
>> 8) & 0xff; if (cache_op >= PERF_COUNT_HW_CACHE_OP_MAX) return -EINVAL; cache_result = (config >> 16) & 0xff; if (cache_result >= PERF_COUNT_HW_CACHE_RESULT_MAX) return -EINVAL; if (!cache_map) return -ENOENT; ret = (int)(*cache_map)[cache_type][cache_op][cache_result]; if (ret == CACHE_OP_UNSUPPORTED) return -ENOENT; return ret; } static int armpmu_map_hw_event(const unsigned (*event_map)[PERF_COUNT_HW_MAX], u64 config) { int mapping; if (config >= PERF_COUNT_HW_MAX) return -EINVAL; if (!event_map) return -ENOENT; mapping = (*event_map)[config]; return mapping == HW_OP_UNSUPPORTED ? -ENOENT : mapping; } static int armpmu_map_raw_event(u32 raw_event_mask, u64 config) { return (int)(config & raw_event_mask); } int armpmu_map_event(struct perf_event *event, const unsigned (*event_map)[PERF_COUNT_HW_MAX], const unsigned (*cache_map) [PERF_COUNT_HW_CACHE_MAX] [PERF_COUNT_HW_CACHE_OP_MAX] [PERF_COUNT_HW_CACHE_RESULT_MAX], u32 raw_event_mask) { u64 config = event->attr.config; int type = event->attr.type; if (type == event->pmu->type) return armpmu_map_raw_event(raw_event_mask, config); switch (type) { case PERF_TYPE_HARDWARE: return armpmu_map_hw_event(event_map, config); case PERF_TYPE_HW_CACHE: return armpmu_map_cache_event(cache_map, config); case PERF_TYPE_RAW: return armpmu_map_raw_event(raw_event_mask, config); } return -ENOENT; } int armpmu_event_set_period(struct perf_event *event) { struct arm_pmu *armpmu = to_arm_pmu(event->pmu); struct hw_perf_event *hwc = &event->hw; s64 left = local64_read(&hwc->period_left); s64 period = hwc->sample_period; u64 max_period; int ret = 0; max_period = arm_pmu_event_max_period(event); if (unlikely(left <= -period)) { left = period; local64_set(&hwc->period_left, left); hwc->last_period = period; ret = 1; } if (unlikely(left <= 0)) { left += period; local64_set(&hwc->period_left, left); hwc->last_period = period; ret = 1; } /* * Limit the maximum period to prevent the counter value * from overtaking the one we are about to program. In * effect we are reducing max_period to account for * interrupt latency (and we are being very conservative). */ if (left > (max_period >> 1)) left = (max_period >> 1); local64_set(&hwc->prev_count, (u64)-left); armpmu->write_counter(event, (u64)(-left) & max_period); perf_event_update_userpage(event); return ret; } u64 armpmu_event_update(struct perf_event *event) { struct arm_pmu *armpmu = to_arm_pmu(event->pmu); struct hw_perf_event *hwc = &event->hw; u64 delta, prev_raw_count, new_raw_count; u64 max_period = arm_pmu_event_max_period(event); again: prev_raw_count = local64_read(&hwc->prev_count); new_raw_count = armpmu->read_counter(event); if (local64_cmpxchg(&hwc->prev_count, prev_raw_count, new_raw_count) != prev_raw_count) goto again; delta = (new_raw_count - prev_raw_count) & max_period; local64_add(delta, &event->count); local64_sub(delta, &hwc->period_left); return new_raw_count; } static void armpmu_read(struct perf_event *event) { armpmu_event_update(event); } static void armpmu_stop(struct perf_event *event, int flags) { struct arm_pmu *armpmu = to_arm_pmu(event->pmu); struct hw_perf_event *hwc = &event->hw; /* * ARM pmu always has to update the counter, so ignore * PERF_EF_UPDATE, see comments in armpmu_start(). 
*/ if (!(hwc->state & PERF_HES_STOPPED)) { armpmu->disable(event); armpmu_event_update(event); hwc->state |= PERF_HES_STOPPED | PERF_HES_UPTODATE; } } static void armpmu_start(struct perf_event *event, int flags) { struct arm_pmu *armpmu = to_arm_pmu(event->pmu); struct hw_perf_event *hwc = &event->hw; /* * ARM pmu always has to reprogram the period, so ignore * PERF_EF_RELOAD, see the comment below. */ if (flags & PERF_EF_RELOAD) WARN_ON_ONCE(!(hwc->state & PERF_HES_UPTODATE)); hwc->state = 0; /* * Set the period again. Some counters can't be stopped, so when we * were stopped we simply disabled the IRQ source and the counter * may have been left counting. If we don't do this step then we may * get an interrupt too soon or *way* too late if the overflow has * happened since disabling. */ armpmu_event_set_period(event); armpmu->enable(event); } static void armpmu_del(struct perf_event *event, int flags) { struct arm_pmu *armpmu = to_arm_pmu(event->pmu); struct pmu_hw_events *hw_events = this_cpu_ptr(armpmu->hw_events); struct hw_perf_event *hwc = &event->hw; int idx = hwc->idx; armpmu_stop(event, PERF_EF_UPDATE); hw_events->events[idx] = NULL; armpmu->clear_event_idx(hw_events, event); perf_event_update_userpage(event); /* Clear the allocated counter */ hwc->idx = -1; } static int armpmu_add(struct perf_event *event, int flags) { struct arm_pmu *armpmu = to_arm_pmu(event->pmu); struct pmu_hw_events *hw_events = this_cpu_ptr(armpmu->hw_events); struct hw_perf_event *hwc = &event->hw; int idx; /* An event following a process won't be stopped earlier */ if (!cpumask_test_cpu(smp_processor_id(), &armpmu->supported_cpus)) return -ENOENT; /* If we don't have a space for the counter then finish early. */ idx = armpmu->get_event_idx(hw_events, event); if (idx < 0) return idx; /* * If there is an event in the counter we are going to use then make * sure it is disabled. */ event->hw.idx = idx; armpmu->disable(event); hw_events->events[idx] = event; hwc->state = PERF_HES_STOPPED | PERF_HES_UPTODATE; if (flags & PERF_EF_START) armpmu_start(event, PERF_EF_RELOAD); /* Propagate our changes to the userspace mapping. */ perf_event_update_userpage(event); return 0; } static int validate_event(struct pmu *pmu, struct pmu_hw_events *hw_events, struct perf_event *event) { struct arm_pmu *armpmu; if (is_software_event(event)) return 1; /* * Reject groups spanning multiple HW PMUs (e.g. CPU + CCI). The * core perf code won't check that the pmu->ctx == leader->ctx * until after pmu->event_init(event). */ if (event->pmu != pmu) return 0; if (event->state < PERF_EVENT_STATE_OFF) return 1; if (event->state == PERF_EVENT_STATE_OFF && !event->attr.enable_on_exec) return 1; armpmu = to_arm_pmu(event->pmu); return armpmu->get_event_idx(hw_events, event) >= 0; } static int validate_group(struct perf_event *event) { struct perf_event *sibling, *leader = event->group_leader; struct pmu_hw_events fake_pmu; /* * Initialise the fake PMU. We only need to populate the * used_mask for the purposes of validation. 
*/ memset(&fake_pmu.used_mask, 0, sizeof(fake_pmu.used_mask)); if (!validate_event(event->pmu, &fake_pmu, leader)) return -EINVAL; if (event == leader) return 0; for_each_sibling_event(sibling, leader) { if (!validate_event(event->pmu, &fake_pmu, sibling)) return -EINVAL; } if (!validate_event(event->pmu, &fake_pmu, event)) return -EINVAL; return 0; } static irqreturn_t armpmu_dispatch_irq(int irq, void *dev) { struct arm_pmu *armpmu; int ret; u64 start_clock, finish_clock; /* * We request the IRQ with a (possibly percpu) struct arm_pmu**, but * the handlers expect a struct arm_pmu*. The percpu_irq framework will * do any necessary shifting, we just need to perform the first * dereference. */ armpmu = *(void **)dev; if (WARN_ON_ONCE(!armpmu)) return IRQ_NONE; start_clock = sched_clock(); ret = armpmu->handle_irq(armpmu); finish_clock = sched_clock(); perf_sample_event_took(finish_clock - start_clock); return ret; } static int __hw_perf_event_init(struct perf_event *event) { struct arm_pmu *armpmu = to_arm_pmu(event->pmu); struct hw_perf_event *hwc = &event->hw; int mapping, ret; hwc->flags = 0; mapping = armpmu->map_event(event); if (mapping < 0) { pr_debug("event %x:%llx not supported\n", event->attr.type, event->attr.config); return mapping; } /* * We don't assign an index until we actually place the event onto * hardware. Use -1 to signify that we haven't decided where to put it * yet. For SMP systems, each core has its own PMU so we can't do any * clever allocation or constraints checking at this point. */ hwc->idx = -1; hwc->config_base = 0; hwc->config = 0; hwc->event_base = 0; /* * Check whether we need to exclude the counter from certain modes. */ if (armpmu->set_event_filter) { ret = armpmu->set_event_filter(hwc, &event->attr); if (ret) return ret; } /* * Store the event encoding into the config_base field. */ hwc->config_base |= (unsigned long)mapping; if (!is_sampling_event(event)) { /* * For non-sampling runs, limit the sample_period to half * of the counter width. That way, the new counter value * is far less likely to overtake the previous one unless * you have some serious IRQ latency issues. */ hwc->sample_period = arm_pmu_event_max_period(event) >> 1; hwc->last_period = hwc->sample_period; local64_set(&hwc->period_left, hwc->sample_period); } return validate_group(event); } static int armpmu_event_init(struct perf_event *event) { struct arm_pmu *armpmu = to_arm_pmu(event->pmu); /* * Reject CPU-affine events for CPUs that are of a different class to * that which this PMU handles. Process-following events (where * event->cpu == -1) can be migrated between CPUs, and thus we have to * reject them later (in armpmu_add) if they're scheduled on a * different class of CPU. 
*/ if (event->cpu != -1 && !cpumask_test_cpu(event->cpu, &armpmu->supported_cpus)) return -ENOENT; /* does not support taken branch sampling */ if (has_branch_stack(event)) return -EOPNOTSUPP; return __hw_perf_event_init(event); } static void armpmu_enable(struct pmu *pmu) { struct arm_pmu *armpmu = to_arm_pmu(pmu); struct pmu_hw_events *hw_events = this_cpu_ptr(armpmu->hw_events); bool enabled = !bitmap_empty(hw_events->used_mask, ARMPMU_MAX_HWEVENTS); /* For task-bound events we may be called on other CPUs */ if (!cpumask_test_cpu(smp_processor_id(), &armpmu->supported_cpus)) return; if (enabled) armpmu->start(armpmu); } static void armpmu_disable(struct pmu *pmu) { struct arm_pmu *armpmu = to_arm_pmu(pmu); /* For task-bound events we may be called on other CPUs */ if (!cpumask_test_cpu(smp_processor_id(), &armpmu->supported_cpus)) return; armpmu->stop(armpmu); } /* * In heterogeneous systems, events are specific to a particular * microarchitecture, and aren't suitable for another. Thus, only match CPUs of * the same microarchitecture. */ static bool armpmu_filter(struct pmu *pmu, int cpu) { struct arm_pmu *armpmu = to_arm_pmu(pmu); return !cpumask_test_cpu(cpu, &armpmu->supported_cpus); } static ssize_t cpus_show(struct device *dev, struct device_attribute *attr, char *buf) { struct arm_pmu *armpmu = to_arm_pmu(dev_get_drvdata(dev)); return cpumap_print_to_pagebuf(true, buf, &armpmu->supported_cpus); } static DEVICE_ATTR_RO(cpus); static struct attribute *armpmu_common_attrs[] = { &dev_attr_cpus.attr, NULL, }; static const struct attribute_group armpmu_common_attr_group = { .attrs = armpmu_common_attrs, }; static int armpmu_count_irq_users(const int irq) { int cpu, count = 0; for_each_possible_cpu(cpu) { if (per_cpu(cpu_irq, cpu) == irq) count++; } return count; } static const struct pmu_irq_ops *armpmu_find_irq_ops(int irq) { const struct pmu_irq_ops *ops = NULL; int cpu; for_each_possible_cpu(cpu) { if (per_cpu(cpu_irq, cpu) != irq) continue; ops = per_cpu(cpu_irq_ops, cpu); if (ops) break; } return ops; } void armpmu_free_irq(int irq, int cpu) { if (per_cpu(cpu_irq, cpu) == 0) return; if (WARN_ON(irq != per_cpu(cpu_irq, cpu))) return; per_cpu(cpu_irq_ops, cpu)->free_pmuirq(irq, cpu, &cpu_armpmu); per_cpu(cpu_irq, cpu) = 0; per_cpu(cpu_irq_ops, cpu) = NULL; } int armpmu_request_irq(int irq, int cpu) { int err = 0; const irq_handler_t handler = armpmu_dispatch_irq; const struct pmu_irq_ops *irq_ops; if (!irq) return 0; if (!irq_is_percpu_devid(irq)) { unsigned long irq_flags; err = irq_force_affinity(irq, cpumask_of(cpu)); if (err && num_possible_cpus() > 1) { pr_warn("unable to set irq affinity (irq=%d, cpu=%u)\n", irq, cpu); goto err_out; } irq_flags = IRQF_PERCPU | IRQF_NOBALANCING | IRQF_NO_AUTOEN | IRQF_NO_THREAD; err = request_nmi(irq, handler, irq_flags, "arm-pmu", per_cpu_ptr(&cpu_armpmu, cpu)); /* If cannot get an NMI, get a normal interrupt */ if (err) { err = request_irq(irq, handler, irq_flags, "arm-pmu", per_cpu_ptr(&cpu_armpmu, cpu)); irq_ops = &pmuirq_ops; } else { has_nmi = true; irq_ops = &pmunmi_ops; } } else if (armpmu_count_irq_users(irq) == 0) { err = request_percpu_nmi(irq, handler, "arm-pmu", &cpu_armpmu); /* If cannot get an NMI, get a normal interrupt */ if (err) { err = request_percpu_irq(irq, handler, "arm-pmu", &cpu_armpmu); irq_ops = &percpu_pmuirq_ops; } else { has_nmi = true; irq_ops = &percpu_pmunmi_ops; } } else { /* Per cpudevid irq was already requested by another CPU */ irq_ops = armpmu_find_irq_ops(irq); if (WARN_ON(!irq_ops)) err = -EINVAL; } if 
(err) goto err_out; per_cpu(cpu_irq, cpu) = irq; per_cpu(cpu_irq_ops, cpu) = irq_ops; return 0; err_out: pr_err("unable to request IRQ%d for ARM PMU counters\n", irq); return err; } static int armpmu_get_cpu_irq(struct arm_pmu *pmu, int cpu) { struct pmu_hw_events __percpu *hw_events = pmu->hw_events; return per_cpu(hw_events->irq, cpu); } bool arm_pmu_irq_is_nmi(void) { return has_nmi; } /* * PMU hardware loses all context when a CPU goes offline. * When a CPU is hotplugged back in, since some hardware registers are * UNKNOWN at reset, the PMU must be explicitly reset to avoid reading * junk values out of them. */ static int arm_perf_starting_cpu(unsigned int cpu, struct hlist_node *node) { struct arm_pmu *pmu = hlist_entry_safe(node, struct arm_pmu, node); int irq; if (!cpumask_test_cpu(cpu, &pmu->supported_cpus)) return 0; if (pmu->reset) pmu->reset(pmu); per_cpu(cpu_armpmu, cpu) = pmu; irq = armpmu_get_cpu_irq(pmu, cpu); if (irq) per_cpu(cpu_irq_ops, cpu)->enable_pmuirq(irq); return 0; } static int arm_perf_teardown_cpu(unsigned int cpu, struct hlist_node *node) { struct arm_pmu *pmu = hlist_entry_safe(node, struct arm_pmu, node); int irq; if (!cpumask_test_cpu(cpu, &pmu->supported_cpus)) return 0; irq = armpmu_get_cpu_irq(pmu, cpu); if (irq) per_cpu(cpu_irq_ops, cpu)->disable_pmuirq(irq); per_cpu(cpu_armpmu, cpu) = NULL; return 0; } #ifdef CONFIG_CPU_PM static void cpu_pm_pmu_setup(struct arm_pmu *armpmu, unsigned long cmd) { struct pmu_hw_events *hw_events = this_cpu_ptr(armpmu->hw_events); struct perf_event *event; int idx; for_each_set_bit(idx, armpmu->cntr_mask, ARMPMU_MAX_HWEVENTS) { event = hw_events->events[idx]; if (!event) continue; switch (cmd) { case CPU_PM_ENTER: /* * Stop and update the counter */ armpmu_stop(event, PERF_EF_UPDATE); break; case CPU_PM_EXIT: case CPU_PM_ENTER_FAILED: /* * Restore and enable the counter. */ armpmu_start(event, PERF_EF_RELOAD); break; default: break; } } } static int cpu_pm_pmu_notify(struct notifier_block *b, unsigned long cmd, void *v) { struct arm_pmu *armpmu = container_of(b, struct arm_pmu, cpu_pm_nb); struct pmu_hw_events *hw_events = this_cpu_ptr(armpmu->hw_events); bool enabled = !bitmap_empty(hw_events->used_mask, ARMPMU_MAX_HWEVENTS); if (!cpumask_test_cpu(smp_processor_id(), &armpmu->supported_cpus)) return NOTIFY_DONE; /* * Always reset the PMU registers on power-up even if * there are no events running. 
*/ if (cmd == CPU_PM_EXIT && armpmu->reset) armpmu->reset(armpmu); if (!enabled) return NOTIFY_OK; switch (cmd) { case CPU_PM_ENTER: armpmu->stop(armpmu); cpu_pm_pmu_setup(armpmu, cmd); break; case CPU_PM_EXIT: case CPU_PM_ENTER_FAILED: cpu_pm_pmu_setup(armpmu, cmd); armpmu->start(armpmu); break; default: return NOTIFY_DONE; } return NOTIFY_OK; } static int cpu_pm_pmu_register(struct arm_pmu *cpu_pmu) { cpu_pmu->cpu_pm_nb.notifier_call = cpu_pm_pmu_notify; return cpu_pm_register_notifier(&cpu_pmu->cpu_pm_nb); } static void cpu_pm_pmu_unregister(struct arm_pmu *cpu_pmu) { cpu_pm_unregister_notifier(&cpu_pmu->cpu_pm_nb); } #else static inline int cpu_pm_pmu_register(struct arm_pmu *cpu_pmu) { return 0; } static inline void cpu_pm_pmu_unregister(struct arm_pmu *cpu_pmu) { } #endif static int cpu_pmu_init(struct arm_pmu *cpu_pmu) { int err; err = cpuhp_state_add_instance(CPUHP_AP_PERF_ARM_STARTING, &cpu_pmu->node); if (err) goto out; err = cpu_pm_pmu_register(cpu_pmu); if (err) goto out_unregister; return 0; out_unregister: cpuhp_state_remove_instance_nocalls(CPUHP_AP_PERF_ARM_STARTING, &cpu_pmu->node); out: return err; } static void cpu_pmu_destroy(struct arm_pmu *cpu_pmu) { cpu_pm_pmu_unregister(cpu_pmu); cpuhp_state_remove_instance_nocalls(CPUHP_AP_PERF_ARM_STARTING, &cpu_pmu->node); } struct arm_pmu *armpmu_alloc(void) { struct arm_pmu *pmu; int cpu; pmu = kzalloc(sizeof(*pmu), GFP_KERNEL); if (!pmu) goto out; pmu->hw_events = alloc_percpu_gfp(struct pmu_hw_events, GFP_KERNEL); if (!pmu->hw_events) { pr_info("failed to allocate per-cpu PMU data.\n"); goto out_free_pmu; } pmu->pmu = (struct pmu) { .pmu_enable = armpmu_enable, .pmu_disable = armpmu_disable, .event_init = armpmu_event_init, .add = armpmu_add, .del = armpmu_del, .start = armpmu_start, .stop = armpmu_stop, .read = armpmu_read, .filter = armpmu_filter, .attr_groups = pmu->attr_groups, /* * This is a CPU PMU potentially in a heterogeneous * configuration (e.g. big.LITTLE) so * PERF_PMU_CAP_EXTENDED_HW_TYPE is required to open * PERF_TYPE_HARDWARE and PERF_TYPE_HW_CACHE events on a * specific PMU. */ .capabilities = PERF_PMU_CAP_EXTENDED_REGS | PERF_PMU_CAP_EXTENDED_HW_TYPE, }; pmu->attr_groups[ARMPMU_ATTR_GROUP_COMMON] = &armpmu_common_attr_group; for_each_possible_cpu(cpu) { struct pmu_hw_events *events; events = per_cpu_ptr(pmu->hw_events, cpu); events->percpu_pmu = pmu; } return pmu; out_free_pmu: kfree(pmu); out: return NULL; } void armpmu_free(struct arm_pmu *pmu) { free_percpu(pmu->hw_events); kfree(pmu); } int armpmu_register(struct arm_pmu *pmu) { int ret; ret = cpu_pmu_init(pmu); if (ret) return ret; if (!pmu->set_event_filter) pmu->pmu.capabilities |= PERF_PMU_CAP_NO_EXCLUDE; ret = perf_pmu_register(&pmu->pmu, pmu->name, -1); if (ret) goto out_destroy; pr_info("enabled with %s PMU driver, %d (%*pb) counters available%s\n", pmu->name, bitmap_weight(pmu->cntr_mask, ARMPMU_MAX_HWEVENTS), ARMPMU_MAX_HWEVENTS, &pmu->cntr_mask, has_nmi ? ", using NMIs" : ""); kvm_host_pmu_init(pmu); return 0; out_destroy: cpu_pmu_destroy(pmu); return ret; } static int arm_pmu_hp_init(void) { int ret; ret = cpuhp_setup_state_multi(CPUHP_AP_PERF_ARM_STARTING, "perf/arm/pmu:starting", arm_perf_starting_cpu, arm_perf_teardown_cpu); if (ret) pr_err("CPU hotplug notifier for ARM PMU could not be registered: %d\n", ret); return ret; } subsys_initcall(arm_pmu_hp_init); |
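/*
 * Illustrative sketch only: the wrap-around arithmetic used by
 * armpmu_event_update() above. Masking the difference with the counter's
 * max_period yields the correct delta even when the raw counter value has
 * wrapped past zero. The helper below is a hypothetical standalone
 * restatement, guarded out so it has no effect.
 */
#if 0
static u64 example_counter_delta(u64 prev, u64 new_count, u64 max_period)
{
	/*
	 * For a 32-bit counter (max_period == GENMASK_ULL(31, 0)):
	 * prev = 0xfffffff0, new_count = 0x10 -> delta = 0x20.
	 */
	return (new_count - prev) & max_period;
}
#endif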
// SPDX-License-Identifier: GPL-2.0-only #include <linux/blkdev.h> #include <linux/wait.h> #include <linux/rbtree.h> #include <linux/kthread.h> #include <linux/backing-dev.h> #include <linux/blk-cgroup.h> #include <linux/freezer.h> #include <linux/fs.h> #include <linux/pagemap.h> #include <linux/mm.h> #include <linux/sched/mm.h> #include <linux/sched.h> #include <linux/module.h> #include <linux/writeback.h> #include <linux/device.h> #include <trace/events/writeback.h> #include "internal.h" struct backing_dev_info noop_backing_dev_info; EXPORT_SYMBOL_GPL(noop_backing_dev_info); static const char *bdi_unknown_name = "(unknown)"; /* * bdi_lock protects bdi_tree and updates to bdi_list. bdi_list has RCU * reader side locking. 
*/ DEFINE_SPINLOCK(bdi_lock); static u64 bdi_id_cursor; static struct rb_root bdi_tree = RB_ROOT; LIST_HEAD(bdi_list); /* bdi_wq serves all asynchronous writeback tasks */ struct workqueue_struct *bdi_wq; #ifdef CONFIG_DEBUG_FS #include <linux/debugfs.h> #include <linux/seq_file.h> struct wb_stats { unsigned long nr_dirty; unsigned long nr_io; unsigned long nr_more_io; unsigned long nr_dirty_time; unsigned long nr_writeback; unsigned long nr_reclaimable; unsigned long nr_dirtied; unsigned long nr_written; unsigned long dirty_thresh; unsigned long wb_thresh; }; static struct dentry *bdi_debug_root; static void bdi_debug_init(void) { bdi_debug_root = debugfs_create_dir("bdi", NULL); } static void collect_wb_stats(struct wb_stats *stats, struct bdi_writeback *wb) { struct inode *inode; spin_lock(&wb->list_lock); list_for_each_entry(inode, &wb->b_dirty, i_io_list) stats->nr_dirty++; list_for_each_entry(inode, &wb->b_io, i_io_list) stats->nr_io++; list_for_each_entry(inode, &wb->b_more_io, i_io_list) stats->nr_more_io++; list_for_each_entry(inode, &wb->b_dirty_time, i_io_list) if (inode->i_state & I_DIRTY_TIME) stats->nr_dirty_time++; spin_unlock(&wb->list_lock); stats->nr_writeback += wb_stat(wb, WB_WRITEBACK); stats->nr_reclaimable += wb_stat(wb, WB_RECLAIMABLE); stats->nr_dirtied += wb_stat(wb, WB_DIRTIED); stats->nr_written += wb_stat(wb, WB_WRITTEN); stats->wb_thresh += wb_calc_thresh(wb, stats->dirty_thresh); } #ifdef CONFIG_CGROUP_WRITEBACK static void bdi_collect_stats(struct backing_dev_info *bdi, struct wb_stats *stats) { struct bdi_writeback *wb; rcu_read_lock(); list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node) { if (!wb_tryget(wb)) continue; collect_wb_stats(stats, wb); wb_put(wb); } rcu_read_unlock(); } #else static void bdi_collect_stats(struct backing_dev_info *bdi, struct wb_stats *stats) { collect_wb_stats(stats, &bdi->wb); } #endif static int bdi_debug_stats_show(struct seq_file *m, void *v) { struct backing_dev_info *bdi = m->private; unsigned long background_thresh; unsigned long dirty_thresh; struct wb_stats stats; unsigned long tot_bw; global_dirty_limits(&background_thresh, &dirty_thresh); memset(&stats, 0, sizeof(stats)); stats.dirty_thresh = dirty_thresh; bdi_collect_stats(bdi, &stats); tot_bw = atomic_long_read(&bdi->tot_write_bandwidth); seq_printf(m, "BdiWriteback: %10lu kB\n" "BdiReclaimable: %10lu kB\n" "BdiDirtyThresh: %10lu kB\n" "DirtyThresh: %10lu kB\n" "BackgroundThresh: %10lu kB\n" "BdiDirtied: %10lu kB\n" "BdiWritten: %10lu kB\n" "BdiWriteBandwidth: %10lu kBps\n" "b_dirty: %10lu\n" "b_io: %10lu\n" "b_more_io: %10lu\n" "b_dirty_time: %10lu\n" "bdi_list: %10u\n" "state: %10lx\n", K(stats.nr_writeback), K(stats.nr_reclaimable), K(stats.wb_thresh), K(dirty_thresh), K(background_thresh), K(stats.nr_dirtied), K(stats.nr_written), K(tot_bw), stats.nr_dirty, stats.nr_io, stats.nr_more_io, stats.nr_dirty_time, !list_empty(&bdi->bdi_list), bdi->wb.state); return 0; } DEFINE_SHOW_ATTRIBUTE(bdi_debug_stats); static void wb_stats_show(struct seq_file *m, struct bdi_writeback *wb, struct wb_stats *stats) { seq_printf(m, "WbCgIno: %10lu\n" "WbWriteback: %10lu kB\n" "WbReclaimable: %10lu kB\n" "WbDirtyThresh: %10lu kB\n" "WbDirtied: %10lu kB\n" "WbWritten: %10lu kB\n" "WbWriteBandwidth: %10lu kBps\n" "b_dirty: %10lu\n" "b_io: %10lu\n" "b_more_io: %10lu\n" "b_dirty_time: %10lu\n" "state: %10lx\n\n", #ifdef CONFIG_CGROUP_WRITEBACK cgroup_ino(wb->memcg_css->cgroup), #else 1ul, #endif K(stats->nr_writeback), K(stats->nr_reclaimable), K(stats->wb_thresh), 
K(stats->nr_dirtied), K(stats->nr_written), K(wb->avg_write_bandwidth), stats->nr_dirty, stats->nr_io, stats->nr_more_io, stats->nr_dirty_time, wb->state); } static int cgwb_debug_stats_show(struct seq_file *m, void *v) { struct backing_dev_info *bdi = m->private; unsigned long background_thresh; unsigned long dirty_thresh; struct bdi_writeback *wb; global_dirty_limits(&background_thresh, &dirty_thresh); rcu_read_lock(); list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node) { struct wb_stats stats = { .dirty_thresh = dirty_thresh }; if (!wb_tryget(wb)) continue; collect_wb_stats(&stats, wb); /* * Calculate thresh of wb in writeback cgroup which is min of * thresh in global domain and thresh in cgroup domain. Drop * rcu lock because cgwb_calc_thresh may sleep in * cgroup_rstat_flush. We can do so here because we have a ref. */ if (mem_cgroup_wb_domain(wb)) { rcu_read_unlock(); stats.wb_thresh = min(stats.wb_thresh, cgwb_calc_thresh(wb)); rcu_read_lock(); } wb_stats_show(m, wb, &stats); wb_put(wb); } rcu_read_unlock(); return 0; } DEFINE_SHOW_ATTRIBUTE(cgwb_debug_stats); static void bdi_debug_register(struct backing_dev_info *bdi, const char *name) { bdi->debug_dir = debugfs_create_dir(name, bdi_debug_root); debugfs_create_file("stats", 0444, bdi->debug_dir, bdi, &bdi_debug_stats_fops); debugfs_create_file("wb_stats", 0444, bdi->debug_dir, bdi, &cgwb_debug_stats_fops); } static void bdi_debug_unregister(struct backing_dev_info *bdi) { debugfs_remove_recursive(bdi->debug_dir); } #else /* CONFIG_DEBUG_FS */ static inline void bdi_debug_init(void) { } static inline void bdi_debug_register(struct backing_dev_info *bdi, const char *name) { } static inline void bdi_debug_unregister(struct backing_dev_info *bdi) { } #endif /* CONFIG_DEBUG_FS */ static ssize_t read_ahead_kb_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct backing_dev_info *bdi = dev_get_drvdata(dev); unsigned long read_ahead_kb; ssize_t ret; ret = kstrtoul(buf, 10, &read_ahead_kb); if (ret < 0) return ret; bdi->ra_pages = read_ahead_kb >> (PAGE_SHIFT - 10); return count; } #define BDI_SHOW(name, expr) \ static ssize_t name##_show(struct device *dev, \ struct device_attribute *attr, char *buf) \ { \ struct backing_dev_info *bdi = dev_get_drvdata(dev); \ \ return sysfs_emit(buf, "%lld\n", (long long)expr); \ } \ static DEVICE_ATTR_RW(name); BDI_SHOW(read_ahead_kb, K(bdi->ra_pages)) static ssize_t min_ratio_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct backing_dev_info *bdi = dev_get_drvdata(dev); unsigned int ratio; ssize_t ret; ret = kstrtouint(buf, 10, &ratio); if (ret < 0) return ret; ret = bdi_set_min_ratio(bdi, ratio); if (!ret) ret = count; return ret; } BDI_SHOW(min_ratio, bdi->min_ratio / BDI_RATIO_SCALE) static ssize_t min_ratio_fine_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct backing_dev_info *bdi = dev_get_drvdata(dev); unsigned int ratio; ssize_t ret; ret = kstrtouint(buf, 10, &ratio); if (ret < 0) return ret; ret = bdi_set_min_ratio_no_scale(bdi, ratio); if (!ret) ret = count; return ret; } BDI_SHOW(min_ratio_fine, bdi->min_ratio) static ssize_t max_ratio_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct backing_dev_info *bdi = dev_get_drvdata(dev); unsigned int ratio; ssize_t ret; ret = kstrtouint(buf, 10, &ratio); if (ret < 0) return ret; ret = bdi_set_max_ratio(bdi, ratio); if (!ret) ret = count; return ret; } 
BDI_SHOW(max_ratio, bdi->max_ratio / BDI_RATIO_SCALE) static ssize_t max_ratio_fine_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct backing_dev_info *bdi = dev_get_drvdata(dev); unsigned int ratio; ssize_t ret; ret = kstrtouint(buf, 10, &ratio); if (ret < 0) return ret; ret = bdi_set_max_ratio_no_scale(bdi, ratio); if (!ret) ret = count; return ret; } BDI_SHOW(max_ratio_fine, bdi->max_ratio) static ssize_t min_bytes_show(struct device *dev, struct device_attribute *attr, char *buf) { struct backing_dev_info *bdi = dev_get_drvdata(dev); return sysfs_emit(buf, "%llu\n", bdi_get_min_bytes(bdi)); } static ssize_t min_bytes_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct backing_dev_info *bdi = dev_get_drvdata(dev); u64 bytes; ssize_t ret; ret = kstrtoull(buf, 10, &bytes); if (ret < 0) return ret; ret = bdi_set_min_bytes(bdi, bytes); if (!ret) ret = count; return ret; } static DEVICE_ATTR_RW(min_bytes); static ssize_t max_bytes_show(struct device *dev, struct device_attribute *attr, char *buf) { struct backing_dev_info *bdi = dev_get_drvdata(dev); return sysfs_emit(buf, "%llu\n", bdi_get_max_bytes(bdi)); } static ssize_t max_bytes_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct backing_dev_info *bdi = dev_get_drvdata(dev); u64 bytes; ssize_t ret; ret = kstrtoull(buf, 10, &bytes); if (ret < 0) return ret; ret = bdi_set_max_bytes(bdi, bytes); if (!ret) ret = count; return ret; } static DEVICE_ATTR_RW(max_bytes); static ssize_t stable_pages_required_show(struct device *dev, struct device_attribute *attr, char *buf) { dev_warn_once(dev, "the stable_pages_required attribute has been removed. Use the stable_writes queue attribute instead.\n"); return sysfs_emit(buf, "%d\n", 0); } static DEVICE_ATTR_RO(stable_pages_required); static ssize_t strict_limit_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct backing_dev_info *bdi = dev_get_drvdata(dev); unsigned int strict_limit; ssize_t ret; ret = kstrtouint(buf, 10, &strict_limit); if (ret < 0) return ret; ret = bdi_set_strict_limit(bdi, strict_limit); if (!ret) ret = count; return ret; } static ssize_t strict_limit_show(struct device *dev, struct device_attribute *attr, char *buf) { struct backing_dev_info *bdi = dev_get_drvdata(dev); return sysfs_emit(buf, "%d\n", !!(bdi->capabilities & BDI_CAP_STRICTLIMIT)); } static DEVICE_ATTR_RW(strict_limit); static struct attribute *bdi_dev_attrs[] = { &dev_attr_read_ahead_kb.attr, &dev_attr_min_ratio.attr, &dev_attr_min_ratio_fine.attr, &dev_attr_max_ratio.attr, &dev_attr_max_ratio_fine.attr, &dev_attr_min_bytes.attr, &dev_attr_max_bytes.attr, &dev_attr_stable_pages_required.attr, &dev_attr_strict_limit.attr, NULL, }; ATTRIBUTE_GROUPS(bdi_dev); static const struct class bdi_class = { .name = "bdi", .dev_groups = bdi_dev_groups, }; static __init int bdi_class_init(void) { int ret; ret = class_register(&bdi_class); if (ret) return ret; bdi_debug_init(); return 0; } postcore_initcall(bdi_class_init); static int __init default_bdi_init(void) { bdi_wq = alloc_workqueue("writeback", WQ_MEM_RECLAIM | WQ_UNBOUND | WQ_SYSFS, 0); if (!bdi_wq) return -ENOMEM; return 0; } subsys_initcall(default_bdi_init); static void wb_update_bandwidth_workfn(struct work_struct *work) { struct bdi_writeback *wb = container_of(to_delayed_work(work), struct bdi_writeback, bw_dwork); wb_update_bandwidth(wb); } /* * Initial write bandwidth: 100 MB/s */ 
#define INIT_BW (100 << (20 - PAGE_SHIFT)) static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi, gfp_t gfp) { int err; memset(wb, 0, sizeof(*wb)); wb->bdi = bdi; wb->last_old_flush = jiffies; INIT_LIST_HEAD(&wb->b_dirty); INIT_LIST_HEAD(&wb->b_io); INIT_LIST_HEAD(&wb->b_more_io); INIT_LIST_HEAD(&wb->b_dirty_time); spin_lock_init(&wb->list_lock); atomic_set(&wb->writeback_inodes, 0); wb->bw_time_stamp = jiffies; wb->balanced_dirty_ratelimit = INIT_BW; wb->dirty_ratelimit = INIT_BW; wb->write_bandwidth = INIT_BW; wb->avg_write_bandwidth = INIT_BW; spin_lock_init(&wb->work_lock); INIT_LIST_HEAD(&wb->work_list); INIT_DELAYED_WORK(&wb->dwork, wb_workfn); INIT_DELAYED_WORK(&wb->bw_dwork, wb_update_bandwidth_workfn); err = fprop_local_init_percpu(&wb->completions, gfp); if (err) return err; err = percpu_counter_init_many(wb->stat, 0, gfp, NR_WB_STAT_ITEMS); if (err) fprop_local_destroy_percpu(&wb->completions); return err; } static void cgwb_remove_from_bdi_list(struct bdi_writeback *wb); /* * Remove bdi from the global list and shutdown any threads we have running */ static void wb_shutdown(struct bdi_writeback *wb) { /* Make sure nobody queues further work */ spin_lock_irq(&wb->work_lock); if (!test_and_clear_bit(WB_registered, &wb->state)) { spin_unlock_irq(&wb->work_lock); return; } spin_unlock_irq(&wb->work_lock); cgwb_remove_from_bdi_list(wb); /* * Drain work list and shutdown the delayed_work. !WB_registered * tells wb_workfn() that @wb is dying and its work_list needs to * be drained no matter what. */ mod_delayed_work(bdi_wq, &wb->dwork, 0); flush_delayed_work(&wb->dwork); WARN_ON(!list_empty(&wb->work_list)); flush_delayed_work(&wb->bw_dwork); } static void wb_exit(struct bdi_writeback *wb) { WARN_ON(delayed_work_pending(&wb->dwork)); percpu_counter_destroy_many(wb->stat, NR_WB_STAT_ITEMS); fprop_local_destroy_percpu(&wb->completions); } #ifdef CONFIG_CGROUP_WRITEBACK #include <linux/memcontrol.h> /* * cgwb_lock protects bdi->cgwb_tree, blkcg->cgwb_list, offline_cgwbs and * memcg->cgwb_list. bdi->cgwb_tree is also RCU protected. 
*/ static DEFINE_SPINLOCK(cgwb_lock); static struct workqueue_struct *cgwb_release_wq; static LIST_HEAD(offline_cgwbs); static void cleanup_offline_cgwbs_workfn(struct work_struct *work); static DECLARE_WORK(cleanup_offline_cgwbs_work, cleanup_offline_cgwbs_workfn); static void cgwb_free_rcu(struct rcu_head *rcu_head) { struct bdi_writeback *wb = container_of(rcu_head, struct bdi_writeback, rcu); percpu_ref_exit(&wb->refcnt); kfree(wb); } static void cgwb_release_workfn(struct work_struct *work) { struct bdi_writeback *wb = container_of(work, struct bdi_writeback, release_work); struct backing_dev_info *bdi = wb->bdi; mutex_lock(&wb->bdi->cgwb_release_mutex); wb_shutdown(wb); css_put(wb->memcg_css); css_put(wb->blkcg_css); mutex_unlock(&wb->bdi->cgwb_release_mutex); /* triggers blkg destruction if no online users left */ blkcg_unpin_online(wb->blkcg_css); fprop_local_destroy_percpu(&wb->memcg_completions); spin_lock_irq(&cgwb_lock); list_del(&wb->offline_node); spin_unlock_irq(&cgwb_lock); wb_exit(wb); bdi_put(bdi); WARN_ON_ONCE(!list_empty(&wb->b_attached)); call_rcu(&wb->rcu, cgwb_free_rcu); } static void cgwb_release(struct percpu_ref *refcnt) { struct bdi_writeback *wb = container_of(refcnt, struct bdi_writeback, refcnt); queue_work(cgwb_release_wq, &wb->release_work); } static void cgwb_kill(struct bdi_writeback *wb) { lockdep_assert_held(&cgwb_lock); WARN_ON(!radix_tree_delete(&wb->bdi->cgwb_tree, wb->memcg_css->id)); list_del(&wb->memcg_node); list_del(&wb->blkcg_node); list_add(&wb->offline_node, &offline_cgwbs); percpu_ref_kill(&wb->refcnt); } static void cgwb_remove_from_bdi_list(struct bdi_writeback *wb) { spin_lock_irq(&cgwb_lock); list_del_rcu(&wb->bdi_node); spin_unlock_irq(&cgwb_lock); } static int cgwb_create(struct backing_dev_info *bdi, struct cgroup_subsys_state *memcg_css, gfp_t gfp) { struct mem_cgroup *memcg; struct cgroup_subsys_state *blkcg_css; struct list_head *memcg_cgwb_list, *blkcg_cgwb_list; struct bdi_writeback *wb; unsigned long flags; int ret = 0; memcg = mem_cgroup_from_css(memcg_css); blkcg_css = cgroup_get_e_css(memcg_css->cgroup, &io_cgrp_subsys); memcg_cgwb_list = &memcg->cgwb_list; blkcg_cgwb_list = blkcg_get_cgwb_list(blkcg_css); /* look up again under lock and discard on blkcg mismatch */ spin_lock_irqsave(&cgwb_lock, flags); wb = radix_tree_lookup(&bdi->cgwb_tree, memcg_css->id); if (wb && wb->blkcg_css != blkcg_css) { cgwb_kill(wb); wb = NULL; } spin_unlock_irqrestore(&cgwb_lock, flags); if (wb) goto out_put; /* need to create a new one */ wb = kmalloc(sizeof(*wb), gfp); if (!wb) { ret = -ENOMEM; goto out_put; } ret = wb_init(wb, bdi, gfp); if (ret) goto err_free; ret = percpu_ref_init(&wb->refcnt, cgwb_release, 0, gfp); if (ret) goto err_wb_exit; ret = fprop_local_init_percpu(&wb->memcg_completions, gfp); if (ret) goto err_ref_exit; wb->memcg_css = memcg_css; wb->blkcg_css = blkcg_css; INIT_LIST_HEAD(&wb->b_attached); INIT_WORK(&wb->release_work, cgwb_release_workfn); set_bit(WB_registered, &wb->state); bdi_get(bdi); /* * The root wb determines the registered state of the whole bdi and * memcg_cgwb_list and blkcg_cgwb_list's next pointers indicate * whether they're still online. Don't link @wb if any is dead. * See wb_memcg_offline() and wb_blkcg_offline(). 
*/ ret = -ENODEV; spin_lock_irqsave(&cgwb_lock, flags); if (test_bit(WB_registered, &bdi->wb.state) && blkcg_cgwb_list->next && memcg_cgwb_list->next) { /* we might have raced another instance of this function */ ret = radix_tree_insert(&bdi->cgwb_tree, memcg_css->id, wb); if (!ret) { list_add_tail_rcu(&wb->bdi_node, &bdi->wb_list); list_add(&wb->memcg_node, memcg_cgwb_list); list_add(&wb->blkcg_node, blkcg_cgwb_list); blkcg_pin_online(blkcg_css); css_get(memcg_css); css_get(blkcg_css); } } spin_unlock_irqrestore(&cgwb_lock, flags); if (ret) { if (ret == -EEXIST) ret = 0; goto err_fprop_exit; } goto out_put; err_fprop_exit: bdi_put(bdi); fprop_local_destroy_percpu(&wb->memcg_completions); err_ref_exit: percpu_ref_exit(&wb->refcnt); err_wb_exit: wb_exit(wb); err_free: kfree(wb); out_put: css_put(blkcg_css); return ret; } /** * wb_get_lookup - get wb for a given memcg * @bdi: target bdi * @memcg_css: cgroup_subsys_state of the target memcg (must have positive ref) * * Try to get the wb for @memcg_css on @bdi. The returned wb has its * refcount incremented. * * This function uses css_get() on @memcg_css and thus expects its refcnt * to be positive on invocation. IOW, rcu_read_lock() protection on * @memcg_css isn't enough. try_get it before calling this function. * * A wb is keyed by its associated memcg. As blkcg implicitly enables * memcg on the default hierarchy, memcg association is guaranteed to be * more specific (equal or descendant to the associated blkcg) and thus can * identify both the memcg and blkcg associations. * * Because the blkcg associated with a memcg may change as blkcg is enabled * and disabled closer to root in the hierarchy, each wb keeps track of * both the memcg and blkcg associated with it and verifies the blkcg on * each lookup. On mismatch, the existing wb is discarded and a new one is * created. */ struct bdi_writeback *wb_get_lookup(struct backing_dev_info *bdi, struct cgroup_subsys_state *memcg_css) { struct bdi_writeback *wb; if (!memcg_css->parent) return &bdi->wb; rcu_read_lock(); wb = radix_tree_lookup(&bdi->cgwb_tree, memcg_css->id); if (wb) { struct cgroup_subsys_state *blkcg_css; /* see whether the blkcg association has changed */ blkcg_css = cgroup_get_e_css(memcg_css->cgroup, &io_cgrp_subsys); if (unlikely(wb->blkcg_css != blkcg_css || !wb_tryget(wb))) wb = NULL; css_put(blkcg_css); } rcu_read_unlock(); return wb; } /** * wb_get_create - get wb for a given memcg, create if necessary * @bdi: target bdi * @memcg_css: cgroup_subsys_state of the target memcg (must have positive ref) * @gfp: allocation mask to use * * Try to get the wb for @memcg_css on @bdi. If it doesn't exist, try to * create one. See wb_get_lookup() for more details. 
*/ struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi, struct cgroup_subsys_state *memcg_css, gfp_t gfp) { struct bdi_writeback *wb; might_alloc(gfp); do { wb = wb_get_lookup(bdi, memcg_css); } while (!wb && !cgwb_create(bdi, memcg_css, gfp)); return wb; } static int cgwb_bdi_init(struct backing_dev_info *bdi) { int ret; INIT_RADIX_TREE(&bdi->cgwb_tree, GFP_ATOMIC); mutex_init(&bdi->cgwb_release_mutex); init_rwsem(&bdi->wb_switch_rwsem); ret = wb_init(&bdi->wb, bdi, GFP_KERNEL); if (!ret) { bdi->wb.memcg_css = &root_mem_cgroup->css; bdi->wb.blkcg_css = blkcg_root_css; } return ret; } static void cgwb_bdi_unregister(struct backing_dev_info *bdi) { struct radix_tree_iter iter; void **slot; struct bdi_writeback *wb; WARN_ON(test_bit(WB_registered, &bdi->wb.state)); spin_lock_irq(&cgwb_lock); radix_tree_for_each_slot(slot, &bdi->cgwb_tree, &iter, 0) cgwb_kill(*slot); spin_unlock_irq(&cgwb_lock); mutex_lock(&bdi->cgwb_release_mutex); spin_lock_irq(&cgwb_lock); while (!list_empty(&bdi->wb_list)) { wb = list_first_entry(&bdi->wb_list, struct bdi_writeback, bdi_node); spin_unlock_irq(&cgwb_lock); wb_shutdown(wb); spin_lock_irq(&cgwb_lock); } spin_unlock_irq(&cgwb_lock); mutex_unlock(&bdi->cgwb_release_mutex); } /* * cleanup_offline_cgwbs_workfn - try to release dying cgwbs * * Try to release dying cgwbs by switching attached inodes to the nearest * living ancestor's writeback. Processed wbs are placed at the end * of the list to guarantee the forward progress. */ static void cleanup_offline_cgwbs_workfn(struct work_struct *work) { struct bdi_writeback *wb; LIST_HEAD(processed); spin_lock_irq(&cgwb_lock); while (!list_empty(&offline_cgwbs)) { wb = list_first_entry(&offline_cgwbs, struct bdi_writeback, offline_node); list_move(&wb->offline_node, &processed); /* * If wb is dirty, cleaning up the writeback by switching * attached inodes will result in an effective removal of any * bandwidth restrictions, which isn't the goal. Instead, * it can be postponed until the next time, when all io * will be likely completed. If in the meantime some inodes * will get re-dirtied, they should be eventually switched to * a new cgwb. */ if (wb_has_dirty_io(wb)) continue; if (!wb_tryget(wb)) continue; spin_unlock_irq(&cgwb_lock); while (cleanup_offline_cgwb(wb)) cond_resched(); spin_lock_irq(&cgwb_lock); wb_put(wb); } if (!list_empty(&processed)) list_splice_tail(&processed, &offline_cgwbs); spin_unlock_irq(&cgwb_lock); } /** * wb_memcg_offline - kill all wb's associated with a memcg being offlined * @memcg: memcg being offlined * * Also prevents creation of any new wb's associated with @memcg. */ void wb_memcg_offline(struct mem_cgroup *memcg) { struct list_head *memcg_cgwb_list = &memcg->cgwb_list; struct bdi_writeback *wb, *next; spin_lock_irq(&cgwb_lock); list_for_each_entry_safe(wb, next, memcg_cgwb_list, memcg_node) cgwb_kill(wb); memcg_cgwb_list->next = NULL; /* prevent new wb's */ spin_unlock_irq(&cgwb_lock); queue_work(system_unbound_wq, &cleanup_offline_cgwbs_work); } /** * wb_blkcg_offline - kill all wb's associated with a blkcg being offlined * @css: blkcg being offlined * * Also prevents creation of any new wb's associated with @blkcg. 
*/ void wb_blkcg_offline(struct cgroup_subsys_state *css) { struct bdi_writeback *wb, *next; struct list_head *list = blkcg_get_cgwb_list(css); spin_lock_irq(&cgwb_lock); list_for_each_entry_safe(wb, next, list, blkcg_node) cgwb_kill(wb); list->next = NULL; /* prevent new wb's */ spin_unlock_irq(&cgwb_lock); } static void cgwb_bdi_register(struct backing_dev_info *bdi) { spin_lock_irq(&cgwb_lock); list_add_tail_rcu(&bdi->wb.bdi_node, &bdi->wb_list); spin_unlock_irq(&cgwb_lock); } static int __init cgwb_init(void) { /* * There can be many concurrent release work items overwhelming * system_wq. Put them in a separate wq and limit concurrency. * There's no point in executing many of these in parallel. */ cgwb_release_wq = alloc_workqueue("cgwb_release", 0, 1); if (!cgwb_release_wq) return -ENOMEM; return 0; } subsys_initcall(cgwb_init); #else /* CONFIG_CGROUP_WRITEBACK */ static int cgwb_bdi_init(struct backing_dev_info *bdi) { return wb_init(&bdi->wb, bdi, GFP_KERNEL); } static void cgwb_bdi_unregister(struct backing_dev_info *bdi) { } static void cgwb_bdi_register(struct backing_dev_info *bdi) { list_add_tail_rcu(&bdi->wb.bdi_node, &bdi->wb_list); } static void cgwb_remove_from_bdi_list(struct bdi_writeback *wb) { list_del_rcu(&wb->bdi_node); } #endif /* CONFIG_CGROUP_WRITEBACK */ int bdi_init(struct backing_dev_info *bdi) { bdi->dev = NULL; kref_init(&bdi->refcnt); bdi->min_ratio = 0; bdi->max_ratio = 100 * BDI_RATIO_SCALE; bdi->max_prop_frac = FPROP_FRAC_BASE; INIT_LIST_HEAD(&bdi->bdi_list); INIT_LIST_HEAD(&bdi->wb_list); init_waitqueue_head(&bdi->wb_waitq); bdi->last_bdp_sleep = jiffies; return cgwb_bdi_init(bdi); } struct backing_dev_info *bdi_alloc(int node_id) { struct backing_dev_info *bdi; bdi = kzalloc_node(sizeof(*bdi), GFP_KERNEL, node_id); if (!bdi) return NULL; if (bdi_init(bdi)) { kfree(bdi); return NULL; } bdi->capabilities = BDI_CAP_WRITEBACK | BDI_CAP_WRITEBACK_ACCT; bdi->ra_pages = VM_READAHEAD_PAGES; bdi->io_pages = VM_READAHEAD_PAGES; timer_setup(&bdi->laptop_mode_wb_timer, laptop_mode_timer_fn, 0); return bdi; } EXPORT_SYMBOL(bdi_alloc); static struct rb_node **bdi_lookup_rb_node(u64 id, struct rb_node **parentp) { struct rb_node **p = &bdi_tree.rb_node; struct rb_node *parent = NULL; struct backing_dev_info *bdi; lockdep_assert_held(&bdi_lock); while (*p) { parent = *p; bdi = rb_entry(parent, struct backing_dev_info, rb_node); if (bdi->id > id) p = &(*p)->rb_left; else if (bdi->id < id) p = &(*p)->rb_right; else break; } if (parentp) *parentp = parent; return p; } /** * bdi_get_by_id - lookup and get bdi from its id * @id: bdi id to lookup * * Find bdi matching @id and get it. Returns NULL if the matching bdi * doesn't exist or is already unregistered. 
*/ struct backing_dev_info *bdi_get_by_id(u64 id) { struct backing_dev_info *bdi = NULL; struct rb_node **p; spin_lock_bh(&bdi_lock); p = bdi_lookup_rb_node(id, NULL); if (*p) { bdi = rb_entry(*p, struct backing_dev_info, rb_node); bdi_get(bdi); } spin_unlock_bh(&bdi_lock); return bdi; } int bdi_register_va(struct backing_dev_info *bdi, const char *fmt, va_list args) { struct device *dev; struct rb_node *parent, **p; if (bdi->dev) /* The driver needs to use separate queues per device */ return 0; vsnprintf(bdi->dev_name, sizeof(bdi->dev_name), fmt, args); dev = device_create(&bdi_class, NULL, MKDEV(0, 0), bdi, bdi->dev_name); if (IS_ERR(dev)) return PTR_ERR(dev); cgwb_bdi_register(bdi); bdi->dev = dev; bdi_debug_register(bdi, dev_name(dev)); set_bit(WB_registered, &bdi->wb.state); spin_lock_bh(&bdi_lock); bdi->id = ++bdi_id_cursor; p = bdi_lookup_rb_node(bdi->id, &parent); rb_link_node(&bdi->rb_node, parent, p); rb_insert_color(&bdi->rb_node, &bdi_tree); list_add_tail_rcu(&bdi->bdi_list, &bdi_list); spin_unlock_bh(&bdi_lock); trace_writeback_bdi_register(bdi); return 0; } int bdi_register(struct backing_dev_info *bdi, const char *fmt, ...) { va_list args; int ret; va_start(args, fmt); ret = bdi_register_va(bdi, fmt, args); va_end(args); return ret; } EXPORT_SYMBOL(bdi_register); void bdi_set_owner(struct backing_dev_info *bdi, struct device *owner) { WARN_ON_ONCE(bdi->owner); bdi->owner = owner; get_device(owner); } /* * Remove bdi from bdi_list, and ensure that it is no longer visible */ static void bdi_remove_from_list(struct backing_dev_info *bdi) { spin_lock_bh(&bdi_lock); rb_erase(&bdi->rb_node, &bdi_tree); list_del_rcu(&bdi->bdi_list); spin_unlock_bh(&bdi_lock); synchronize_rcu_expedited(); } void bdi_unregister(struct backing_dev_info *bdi) { del_timer_sync(&bdi->laptop_mode_wb_timer); /* make sure nobody finds us on the bdi_list anymore */ bdi_remove_from_list(bdi); wb_shutdown(&bdi->wb); cgwb_bdi_unregister(bdi); /* * If this BDI's min ratio has been set, use bdi_set_min_ratio() to * update the global bdi_min_ratio. */ if (bdi->min_ratio) bdi_set_min_ratio(bdi, 0); if (bdi->dev) { bdi_debug_unregister(bdi); device_unregister(bdi->dev); bdi->dev = NULL; } if (bdi->owner) { put_device(bdi->owner); bdi->owner = NULL; } } EXPORT_SYMBOL(bdi_unregister); static void release_bdi(struct kref *ref) { struct backing_dev_info *bdi = container_of(ref, struct backing_dev_info, refcnt); WARN_ON_ONCE(test_bit(WB_registered, &bdi->wb.state)); WARN_ON_ONCE(bdi->dev); wb_exit(&bdi->wb); kfree(bdi); } void bdi_put(struct backing_dev_info *bdi) { kref_put(&bdi->refcnt, release_bdi); } EXPORT_SYMBOL(bdi_put); struct backing_dev_info *inode_to_bdi(struct inode *inode) { struct super_block *sb; if (!inode) return &noop_backing_dev_info; sb = inode->i_sb; #ifdef CONFIG_BLOCK if (sb_is_blkdev_sb(sb)) return I_BDEV(inode)->bd_disk->bdi; #endif return sb->s_bdi; } EXPORT_SYMBOL(inode_to_bdi); const char *bdi_dev_name(struct backing_dev_info *bdi) { if (!bdi || !bdi->dev) return bdi_unknown_name; return bdi->dev_name; } EXPORT_SYMBOL_GPL(bdi_dev_name); |
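The exported helpers above (bdi_alloc(), bdi_register(), bdi_unregister(), bdi_put()) form the usual lifecycle of a driver-owned backing_dev_info. Below is a minimal usage sketch; the example%d device name, the minor parameter and the ERR_PTR-based error handling are illustrative assumptions and do not come from the file above.

/*
 * Illustrative sketch only (not part of the original file above): typical
 * setup/teardown of a backing_dev_info using the exported helpers.
 */
static struct backing_dev_info *example_bdi_setup(int minor)
{
	struct backing_dev_info *bdi;
	int err;

	/* bdi_alloc() returns an initialised bdi with its refcount at 1. */
	bdi = bdi_alloc(NUMA_NO_NODE);
	if (!bdi)
		return ERR_PTR(-ENOMEM);

	/* Creates the "bdi" class device and links the bdi into bdi_list. */
	err = bdi_register(bdi, "example%d", minor);
	if (err) {
		bdi_put(bdi);
		return ERR_PTR(err);
	}
	return bdi;
}

static void example_bdi_teardown(struct backing_dev_info *bdi)
{
	bdi_unregister(bdi);	/* shuts down writeback, removes sysfs/debugfs */
	bdi_put(bdi);		/* final ref drop frees the bdi via release_bdi() */
}

Note that bdi_unregister() only undoes the registration; the structure itself is freed by the final bdi_put().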
// SPDX-License-Identifier: GPL-2.0-only /* * VGIC MMIO handling functions */ #include <linux/bitops.h> #include <linux/bsearch.h> #include <linux/interrupt.h> #include <linux/irq.h> #include <linux/kvm.h> #include <linux/kvm_host.h> #include <kvm/iodev.h> #include <kvm/arm_arch_timer.h> #include <kvm/arm_vgic.h> #include "vgic.h" #include "vgic-mmio.h" unsigned long vgic_mmio_read_raz(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len) { return 0; } unsigned long vgic_mmio_read_rao(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len) { return -1UL; } void vgic_mmio_write_wi(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len, unsigned long val) { /* Ignore */ } int vgic_mmio_uaccess_write_wi(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len, unsigned long val) { /* Ignore */ return 0; } unsigned long vgic_mmio_read_group(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len) { u32 intid = VGIC_ADDR_TO_INTID(addr, 1); u32 value = 0; int i; /* Loop over all IRQs affected by this read */ for (i = 0; i < len * 8; i++) { struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, intid + i); if (irq->group) value |= BIT(i); vgic_put_irq(vcpu->kvm, irq); } return value; } static void vgic_update_vsgi(struct vgic_irq *irq) { WARN_ON(its_prop_update_vsgi(irq->host_irq, irq->priority, irq->group)); } void vgic_mmio_write_group(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len, unsigned long val) { u32 intid = VGIC_ADDR_TO_INTID(addr, 1); int i; unsigned long flags; for (i = 0; i < len * 8; i++) { struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, intid + i); raw_spin_lock_irqsave(&irq->irq_lock, flags); irq->group = !!(val & BIT(i)); if (irq->hw && vgic_irq_is_sgi(irq->intid)) { vgic_update_vsgi(irq); raw_spin_unlock_irqrestore(&irq->irq_lock, flags); } else { vgic_queue_irq_unlock(vcpu->kvm, irq, flags); } vgic_put_irq(vcpu->kvm, irq); } } /* * Read accesses to both GICD_ICENABLER and GICD_ISENABLER return the value * of the enabled bit, so there is only one function for both here.
*/ unsigned long vgic_mmio_read_enable(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len) { u32 intid = VGIC_ADDR_TO_INTID(addr, 1); u32 value = 0; int i; /* Loop over all IRQs affected by this read */ for (i = 0; i < len * 8; i++) { struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, intid + i); if (irq->enabled) value |= (1U << i); vgic_put_irq(vcpu->kvm, irq); } return value; } void vgic_mmio_write_senable(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len, unsigned long val) { u32 intid = VGIC_ADDR_TO_INTID(addr, 1); int i; unsigned long flags; for_each_set_bit(i, &val, len * 8) { struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, intid + i); raw_spin_lock_irqsave(&irq->irq_lock, flags); if (irq->hw && vgic_irq_is_sgi(irq->intid)) { if (!irq->enabled) { struct irq_data *data; irq->enabled = true; data = &irq_to_desc(irq->host_irq)->irq_data; while (irqd_irq_disabled(data)) enable_irq(irq->host_irq); } raw_spin_unlock_irqrestore(&irq->irq_lock, flags); vgic_put_irq(vcpu->kvm, irq); continue; } else if (vgic_irq_is_mapped_level(irq)) { bool was_high = irq->line_level; /* * We need to update the state of the interrupt because * the guest might have changed the state of the device * while the interrupt was disabled at the VGIC level. */ irq->line_level = vgic_get_phys_line_level(irq); /* * Deactivate the physical interrupt so the GIC will let * us know when it is asserted again. */ if (!irq->active && was_high && !irq->line_level) vgic_irq_set_phys_active(irq, false); } irq->enabled = true; vgic_queue_irq_unlock(vcpu->kvm, irq, flags); vgic_put_irq(vcpu->kvm, irq); } } void vgic_mmio_write_cenable(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len, unsigned long val) { u32 intid = VGIC_ADDR_TO_INTID(addr, 1); int i; unsigned long flags; for_each_set_bit(i, &val, len * 8) { struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, intid + i); raw_spin_lock_irqsave(&irq->irq_lock, flags); if (irq->hw && vgic_irq_is_sgi(irq->intid) && irq->enabled) disable_irq_nosync(irq->host_irq); irq->enabled = false; raw_spin_unlock_irqrestore(&irq->irq_lock, flags); vgic_put_irq(vcpu->kvm, irq); } } int vgic_uaccess_write_senable(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len, unsigned long val) { u32 intid = VGIC_ADDR_TO_INTID(addr, 1); int i; unsigned long flags; for_each_set_bit(i, &val, len * 8) { struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, intid + i); raw_spin_lock_irqsave(&irq->irq_lock, flags); irq->enabled = true; vgic_queue_irq_unlock(vcpu->kvm, irq, flags); vgic_put_irq(vcpu->kvm, irq); } return 0; } int vgic_uaccess_write_cenable(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len, unsigned long val) { u32 intid = VGIC_ADDR_TO_INTID(addr, 1); int i; unsigned long flags; for_each_set_bit(i, &val, len * 8) { struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, intid + i); raw_spin_lock_irqsave(&irq->irq_lock, flags); irq->enabled = false; raw_spin_unlock_irqrestore(&irq->irq_lock, flags); vgic_put_irq(vcpu->kvm, irq); } return 0; } static unsigned long __read_pending(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len, bool is_user) { u32 intid = VGIC_ADDR_TO_INTID(addr, 1); u32 value = 0; int i; /* Loop over all IRQs affected by this read */ for (i = 0; i < len * 8; i++) { struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, intid + i); unsigned long flags; bool val; /* * When used from userspace with a GICv3 model: * * Pending state of interrupt is latched in pending_latch * variable. Userspace will save and restore pending state * and line_level separately. 
* Refer to Documentation/virt/kvm/devices/arm-vgic-v3.rst * for handling of ISPENDR and ICPENDR. */ raw_spin_lock_irqsave(&irq->irq_lock, flags); if (irq->hw && vgic_irq_is_sgi(irq->intid)) { int err; val = false; err = irq_get_irqchip_state(irq->host_irq, IRQCHIP_STATE_PENDING, &val); WARN_RATELIMIT(err, "IRQ %d", irq->host_irq); } else if (!is_user && vgic_irq_is_mapped_level(irq)) { val = vgic_get_phys_line_level(irq); } else { switch (vcpu->kvm->arch.vgic.vgic_model) { case KVM_DEV_TYPE_ARM_VGIC_V3: if (is_user) { val = irq->pending_latch; break; } fallthrough; default: val = irq_is_pending(irq); break; } } value |= ((u32)val << i); raw_spin_unlock_irqrestore(&irq->irq_lock, flags); vgic_put_irq(vcpu->kvm, irq); } return value; } unsigned long vgic_mmio_read_pending(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len) { return __read_pending(vcpu, addr, len, false); } unsigned long vgic_uaccess_read_pending(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len) { return __read_pending(vcpu, addr, len, true); } static bool is_vgic_v2_sgi(struct kvm_vcpu *vcpu, struct vgic_irq *irq) { return (vgic_irq_is_sgi(irq->intid) && vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V2); } static void __set_pending(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len, unsigned long val, bool is_user) { u32 intid = VGIC_ADDR_TO_INTID(addr, 1); int i; unsigned long flags; for_each_set_bit(i, &val, len * 8) { struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, intid + i); /* GICD_ISPENDR0 SGI bits are WI when written from the guest. */ if (is_vgic_v2_sgi(vcpu, irq) && !is_user) { vgic_put_irq(vcpu->kvm, irq); continue; } raw_spin_lock_irqsave(&irq->irq_lock, flags); /* * GICv2 SGIs are terribly broken. We can't restore * the source of the interrupt, so just pick the vcpu * itself as the source... */ if (is_vgic_v2_sgi(vcpu, irq)) irq->source |= BIT(vcpu->vcpu_id); if (irq->hw && vgic_irq_is_sgi(irq->intid)) { /* HW SGI? Ask the GIC to inject it */ int err; err = irq_set_irqchip_state(irq->host_irq, IRQCHIP_STATE_PENDING, true); WARN_RATELIMIT(err, "IRQ %d", irq->host_irq); raw_spin_unlock_irqrestore(&irq->irq_lock, flags); vgic_put_irq(vcpu->kvm, irq); continue; } irq->pending_latch = true; if (irq->hw && !is_user) vgic_irq_set_phys_active(irq, true); vgic_queue_irq_unlock(vcpu->kvm, irq, flags); vgic_put_irq(vcpu->kvm, irq); } } void vgic_mmio_write_spending(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len, unsigned long val) { __set_pending(vcpu, addr, len, val, false); } int vgic_uaccess_write_spending(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len, unsigned long val) { __set_pending(vcpu, addr, len, val, true); return 0; } /* Must be called with irq->irq_lock held */ static void vgic_hw_irq_cpending(struct kvm_vcpu *vcpu, struct vgic_irq *irq) { irq->pending_latch = false; /* * We don't want the guest to effectively mask the physical * interrupt by doing a write to SPENDR followed by a write to * CPENDR for HW interrupts, so we clear the active state on * the physical side if the virtual interrupt is not active. * This may lead to taking an additional interrupt on the * host, but that should not be a problem as the worst that * can happen is an additional vgic injection. We also clear * the pending state to maintain proper semantics for edge HW * interrupts. 
*/ vgic_irq_set_phys_pending(irq, false); if (!irq->active) vgic_irq_set_phys_active(irq, false); } static void __clear_pending(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len, unsigned long val, bool is_user) { u32 intid = VGIC_ADDR_TO_INTID(addr, 1); int i; unsigned long flags; for_each_set_bit(i, &val, len * 8) { struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, intid + i); /* GICD_ICPENDR0 SGI bits are WI when written from the guest. */ if (is_vgic_v2_sgi(vcpu, irq) && !is_user) { vgic_put_irq(vcpu->kvm, irq); continue; } raw_spin_lock_irqsave(&irq->irq_lock, flags); /* * More fun with GICv2 SGIs! If we're clearing one of them * from userspace, which source vcpu to clear? Let's not * even think of it, and blow the whole set. */ if (is_vgic_v2_sgi(vcpu, irq)) irq->source = 0; if (irq->hw && vgic_irq_is_sgi(irq->intid)) { /* HW SGI? Ask the GIC to clear its pending bit */ int err; err = irq_set_irqchip_state(irq->host_irq, IRQCHIP_STATE_PENDING, false); WARN_RATELIMIT(err, "IRQ %d", irq->host_irq); raw_spin_unlock_irqrestore(&irq->irq_lock, flags); vgic_put_irq(vcpu->kvm, irq); continue; } if (irq->hw && !is_user) vgic_hw_irq_cpending(vcpu, irq); else irq->pending_latch = false; raw_spin_unlock_irqrestore(&irq->irq_lock, flags); vgic_put_irq(vcpu->kvm, irq); } } void vgic_mmio_write_cpending(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len, unsigned long val) { __clear_pending(vcpu, addr, len, val, false); } int vgic_uaccess_write_cpending(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len, unsigned long val) { __clear_pending(vcpu, addr, len, val, true); return 0; } /* * If we are fiddling with an IRQ's active state, we have to make sure the IRQ * is not queued on some running VCPU's LRs, because then the change to the * active state can be overwritten when the VCPU's state is synced coming back * from the guest. * * For shared interrupts as well as GICv3 private interrupts accessed from the * non-owning CPU, we have to stop all the VCPUs because interrupts can be * migrated while we don't hold the IRQ locks and we don't want to be chasing * moving targets. * * For GICv2 private interrupts we don't have to do anything because * userspace accesses to the VGIC state already require all VCPUs to be * stopped, and only the VCPU itself can modify its private interrupts * active state, which guarantees that the VCPU is not running. */ static void vgic_access_active_prepare(struct kvm_vcpu *vcpu, u32 intid) { if ((vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3 && vcpu != kvm_get_running_vcpu()) || intid >= VGIC_NR_PRIVATE_IRQS) kvm_arm_halt_guest(vcpu->kvm); } /* See vgic_access_active_prepare */ static void vgic_access_active_finish(struct kvm_vcpu *vcpu, u32 intid) { if ((vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3 && vcpu != kvm_get_running_vcpu()) || intid >= VGIC_NR_PRIVATE_IRQS) kvm_arm_resume_guest(vcpu->kvm); } static unsigned long __vgic_mmio_read_active(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len) { u32 intid = VGIC_ADDR_TO_INTID(addr, 1); u32 value = 0; int i; /* Loop over all IRQs affected by this read */ for (i = 0; i < len * 8; i++) { struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, intid + i); /* * Even for HW interrupts, don't evaluate the HW state as * all the guest is interested in is the virtual state. 
*/ if (irq->active) value |= (1U << i); vgic_put_irq(vcpu->kvm, irq); } return value; } unsigned long vgic_mmio_read_active(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len) { u32 intid = VGIC_ADDR_TO_INTID(addr, 1); u32 val; mutex_lock(&vcpu->kvm->arch.config_lock); vgic_access_active_prepare(vcpu, intid); val = __vgic_mmio_read_active(vcpu, addr, len); vgic_access_active_finish(vcpu, intid); mutex_unlock(&vcpu->kvm->arch.config_lock); return val; } unsigned long vgic_uaccess_read_active(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len) { return __vgic_mmio_read_active(vcpu, addr, len); } /* Must be called with irq->irq_lock held */ static void vgic_hw_irq_change_active(struct kvm_vcpu *vcpu, struct vgic_irq *irq, bool active, bool is_uaccess) { if (is_uaccess) return; irq->active = active; vgic_irq_set_phys_active(irq, active); } static void vgic_mmio_change_active(struct kvm_vcpu *vcpu, struct vgic_irq *irq, bool active) { unsigned long flags; struct kvm_vcpu *requester_vcpu = kvm_get_running_vcpu(); raw_spin_lock_irqsave(&irq->irq_lock, flags); if (irq->hw && !vgic_irq_is_sgi(irq->intid)) { vgic_hw_irq_change_active(vcpu, irq, active, !requester_vcpu); } else if (irq->hw && vgic_irq_is_sgi(irq->intid)) { /* * GICv4.1 VSGI feature doesn't track an active state, * so let's not kid ourselves, there is nothing we can * do here. */ irq->active = false; } else { u32 model = vcpu->kvm->arch.vgic.vgic_model; u8 active_source; irq->active = active; /* * The GICv2 architecture indicates that the source CPUID for * an SGI should be provided during an EOI which implies that * the active state is stored somewhere, but at the same time * this state is not architecturally exposed anywhere and we * have no way of knowing the right source. * * This may lead to a VCPU not being able to receive * additional instances of a particular SGI after migration * for a GICv2 VM on some GIC implementations. Oh well. */ active_source = (requester_vcpu) ? 
requester_vcpu->vcpu_id : 0; if (model == KVM_DEV_TYPE_ARM_VGIC_V2 && active && vgic_irq_is_sgi(irq->intid)) irq->active_source = active_source; } if (irq->active) vgic_queue_irq_unlock(vcpu->kvm, irq, flags); else raw_spin_unlock_irqrestore(&irq->irq_lock, flags); } static void __vgic_mmio_write_cactive(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len, unsigned long val) { u32 intid = VGIC_ADDR_TO_INTID(addr, 1); int i; for_each_set_bit(i, &val, len * 8) { struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, intid + i); vgic_mmio_change_active(vcpu, irq, false); vgic_put_irq(vcpu->kvm, irq); } } void vgic_mmio_write_cactive(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len, unsigned long val) { u32 intid = VGIC_ADDR_TO_INTID(addr, 1); mutex_lock(&vcpu->kvm->arch.config_lock); vgic_access_active_prepare(vcpu, intid); __vgic_mmio_write_cactive(vcpu, addr, len, val); vgic_access_active_finish(vcpu, intid); mutex_unlock(&vcpu->kvm->arch.config_lock); } int vgic_mmio_uaccess_write_cactive(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len, unsigned long val) { __vgic_mmio_write_cactive(vcpu, addr, len, val); return 0; } static void __vgic_mmio_write_sactive(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len, unsigned long val) { u32 intid = VGIC_ADDR_TO_INTID(addr, 1); int i; for_each_set_bit(i, &val, len * 8) { struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, intid + i); vgic_mmio_change_active(vcpu, irq, true); vgic_put_irq(vcpu->kvm, irq); } } void vgic_mmio_write_sactive(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len, unsigned long val) { u32 intid = VGIC_ADDR_TO_INTID(addr, 1); mutex_lock(&vcpu->kvm->arch.config_lock); vgic_access_active_prepare(vcpu, intid); __vgic_mmio_write_sactive(vcpu, addr, len, val); vgic_access_active_finish(vcpu, intid); mutex_unlock(&vcpu->kvm->arch.config_lock); } int vgic_mmio_uaccess_write_sactive(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len, unsigned long val) { __vgic_mmio_write_sactive(vcpu, addr, len, val); return 0; } unsigned long vgic_mmio_read_priority(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len) { u32 intid = VGIC_ADDR_TO_INTID(addr, 8); int i; u64 val = 0; for (i = 0; i < len; i++) { struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, intid + i); val |= (u64)irq->priority << (i * 8); vgic_put_irq(vcpu->kvm, irq); } return val; } /* * We currently don't handle changing the priority of an interrupt that * is already pending on a VCPU. If there is a need for this, we would * need to make this VCPU exit and re-evaluate the priorities, potentially * leading to this interrupt getting presented now to the guest (if it has * been masked by the priority mask before). 
*/ void vgic_mmio_write_priority(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len, unsigned long val) { u32 intid = VGIC_ADDR_TO_INTID(addr, 8); int i; unsigned long flags; for (i = 0; i < len; i++) { struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, intid + i); raw_spin_lock_irqsave(&irq->irq_lock, flags); /* Narrow the priority range to what we actually support */ irq->priority = (val >> (i * 8)) & GENMASK(7, 8 - VGIC_PRI_BITS); if (irq->hw && vgic_irq_is_sgi(irq->intid)) vgic_update_vsgi(irq); raw_spin_unlock_irqrestore(&irq->irq_lock, flags); vgic_put_irq(vcpu->kvm, irq); } } unsigned long vgic_mmio_read_config(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len) { u32 intid = VGIC_ADDR_TO_INTID(addr, 2); u32 value = 0; int i; for (i = 0; i < len * 4; i++) { struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, intid + i); if (irq->config == VGIC_CONFIG_EDGE) value |= (2U << (i * 2)); vgic_put_irq(vcpu->kvm, irq); } return value; } void vgic_mmio_write_config(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len, unsigned long val) { u32 intid = VGIC_ADDR_TO_INTID(addr, 2); int i; unsigned long flags; for (i = 0; i < len * 4; i++) { struct vgic_irq *irq; /* * The configuration cannot be changed for SGIs in general, * for PPIs this is IMPLEMENTATION DEFINED. The arch timer * code relies on PPIs being level triggered, so we also * make them read-only here. */ if (intid + i < VGIC_NR_PRIVATE_IRQS) continue; irq = vgic_get_irq(vcpu->kvm, intid + i); raw_spin_lock_irqsave(&irq->irq_lock, flags); if (test_bit(i * 2 + 1, &val)) irq->config = VGIC_CONFIG_EDGE; else irq->config = VGIC_CONFIG_LEVEL; raw_spin_unlock_irqrestore(&irq->irq_lock, flags); vgic_put_irq(vcpu->kvm, irq); } } u32 vgic_read_irq_line_level_info(struct kvm_vcpu *vcpu, u32 intid) { int i; u32 val = 0; int nr_irqs = vcpu->kvm->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS; for (i = 0; i < 32; i++) { struct vgic_irq *irq; if ((intid + i) < VGIC_NR_SGIS || (intid + i) >= nr_irqs) continue; irq = vgic_get_vcpu_irq(vcpu, intid + i); if (irq->config == VGIC_CONFIG_LEVEL && irq->line_level) val |= (1U << i); vgic_put_irq(vcpu->kvm, irq); } return val; } void vgic_write_irq_line_level_info(struct kvm_vcpu *vcpu, u32 intid, const u32 val) { int i; int nr_irqs = vcpu->kvm->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS; unsigned long flags; for (i = 0; i < 32; i++) { struct vgic_irq *irq; bool new_level; if ((intid + i) < VGIC_NR_SGIS || (intid + i) >= nr_irqs) continue; irq = vgic_get_vcpu_irq(vcpu, intid + i); /* * Line level is set irrespective of irq type * (level or edge) to avoid dependency that VM should * restore irq config before line level. 
*/ new_level = !!(val & (1U << i)); raw_spin_lock_irqsave(&irq->irq_lock, flags); irq->line_level = new_level; if (new_level) vgic_queue_irq_unlock(vcpu->kvm, irq, flags); else raw_spin_unlock_irqrestore(&irq->irq_lock, flags); vgic_put_irq(vcpu->kvm, irq); } } static int match_region(const void *key, const void *elt) { const unsigned int offset = (unsigned long)key; const struct vgic_register_region *region = elt; if (offset < region->reg_offset) return -1; if (offset >= region->reg_offset + region->len) return 1; return 0; } const struct vgic_register_region * vgic_find_mmio_region(const struct vgic_register_region *regions, int nr_regions, unsigned int offset) { return bsearch((void *)(uintptr_t)offset, regions, nr_regions, sizeof(regions[0]), match_region); } void vgic_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr) { if (kvm_vgic_global_state.type == VGIC_V2) vgic_v2_set_vmcr(vcpu, vmcr); else vgic_v3_set_vmcr(vcpu, vmcr); } void vgic_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr) { if (kvm_vgic_global_state.type == VGIC_V2) vgic_v2_get_vmcr(vcpu, vmcr); else vgic_v3_get_vmcr(vcpu, vmcr); } /* * kvm_mmio_read_buf() returns a value in a format where it can be converted * to a byte array and be directly observed as the guest wanted it to appear * in memory if it had done the store itself, which is LE for the GIC, as the * guest knows the GIC is always LE. * * We convert this value to the CPUs native format to deal with it as a data * value. */ unsigned long vgic_data_mmio_bus_to_host(const void *val, unsigned int len) { unsigned long data = kvm_mmio_read_buf(val, len); switch (len) { case 1: return data; case 2: return le16_to_cpu(data); case 4: return le32_to_cpu(data); default: return le64_to_cpu(data); } } /* * kvm_mmio_write_buf() expects a value in a format such that if converted to * a byte array it is observed as the guest would see it if it could perform * the load directly. Since the GIC is LE, and the guest knows this, the * guest expects a value in little endian format. * * We convert the data value from the CPUs native format to LE so that the * value is returned in the proper format. */ void vgic_data_host_to_mmio_bus(void *buf, unsigned int len, unsigned long data) { switch (len) { case 1: break; case 2: data = cpu_to_le16(data); break; case 4: data = cpu_to_le32(data); break; default: data = cpu_to_le64(data); } kvm_mmio_write_buf(buf, len, data); } static struct vgic_io_device *kvm_to_vgic_iodev(const struct kvm_io_device *dev) { return container_of(dev, struct vgic_io_device, dev); } static bool check_region(const struct kvm *kvm, const struct vgic_register_region *region, gpa_t addr, int len) { int flags, nr_irqs = kvm->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS; switch (len) { case sizeof(u8): flags = VGIC_ACCESS_8bit; break; case sizeof(u32): flags = VGIC_ACCESS_32bit; break; case sizeof(u64): flags = VGIC_ACCESS_64bit; break; default: return false; } if ((region->access_flags & flags) && IS_ALIGNED(addr, len)) { if (!region->bits_per_irq) return true; /* Do we access a non-allocated IRQ? 
*/ return VGIC_ADDR_TO_INTID(addr, region->bits_per_irq) < nr_irqs; } return false; } const struct vgic_register_region * vgic_get_mmio_region(struct kvm_vcpu *vcpu, struct vgic_io_device *iodev, gpa_t addr, int len) { const struct vgic_register_region *region; region = vgic_find_mmio_region(iodev->regions, iodev->nr_regions, addr - iodev->base_addr); if (!region || !check_region(vcpu->kvm, region, addr, len)) return NULL; return region; } static int vgic_uaccess_read(struct kvm_vcpu *vcpu, struct vgic_io_device *iodev, gpa_t addr, u32 *val) { const struct vgic_register_region *region; struct kvm_vcpu *r_vcpu; region = vgic_get_mmio_region(vcpu, iodev, addr, sizeof(u32)); if (!region) { *val = 0; return 0; } r_vcpu = iodev->redist_vcpu ? iodev->redist_vcpu : vcpu; if (region->uaccess_read) *val = region->uaccess_read(r_vcpu, addr, sizeof(u32)); else *val = region->read(r_vcpu, addr, sizeof(u32)); return 0; } static int vgic_uaccess_write(struct kvm_vcpu *vcpu, struct vgic_io_device *iodev, gpa_t addr, const u32 *val) { const struct vgic_register_region *region; struct kvm_vcpu *r_vcpu; region = vgic_get_mmio_region(vcpu, iodev, addr, sizeof(u32)); if (!region) return 0; r_vcpu = iodev->redist_vcpu ? iodev->redist_vcpu : vcpu; if (region->uaccess_write) return region->uaccess_write(r_vcpu, addr, sizeof(u32), *val); region->write(r_vcpu, addr, sizeof(u32), *val); return 0; } /* * Userland access to VGIC registers. */ int vgic_uaccess(struct kvm_vcpu *vcpu, struct vgic_io_device *dev, bool is_write, int offset, u32 *val) { if (is_write) return vgic_uaccess_write(vcpu, dev, offset, val); else return vgic_uaccess_read(vcpu, dev, offset, val); } static int dispatch_mmio_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev, gpa_t addr, int len, void *val) { struct vgic_io_device *iodev = kvm_to_vgic_iodev(dev); const struct vgic_register_region *region; unsigned long data = 0; region = vgic_get_mmio_region(vcpu, iodev, addr, len); if (!region) { memset(val, 0, len); return 0; } switch (iodev->iodev_type) { case IODEV_CPUIF: data = region->read(vcpu, addr, len); break; case IODEV_DIST: data = region->read(vcpu, addr, len); break; case IODEV_REDIST: data = region->read(iodev->redist_vcpu, addr, len); break; case IODEV_ITS: data = region->its_read(vcpu->kvm, iodev->its, addr, len); break; } vgic_data_host_to_mmio_bus(val, len, data); return 0; } static int dispatch_mmio_write(struct kvm_vcpu *vcpu, struct kvm_io_device *dev, gpa_t addr, int len, const void *val) { struct vgic_io_device *iodev = kvm_to_vgic_iodev(dev); const struct vgic_register_region *region; unsigned long data = vgic_data_mmio_bus_to_host(val, len); region = vgic_get_mmio_region(vcpu, iodev, addr, len); if (!region) return 0; switch (iodev->iodev_type) { case IODEV_CPUIF: region->write(vcpu, addr, len, data); break; case IODEV_DIST: region->write(vcpu, addr, len, data); break; case IODEV_REDIST: region->write(iodev->redist_vcpu, addr, len, data); break; case IODEV_ITS: region->its_write(vcpu->kvm, iodev->its, addr, len, data); break; } return 0; } const struct kvm_io_device_ops kvm_io_gic_ops = { .read = dispatch_mmio_read, .write = dispatch_mmio_write, }; int vgic_register_dist_iodev(struct kvm *kvm, gpa_t dist_base_address, enum vgic_type type) { struct vgic_io_device *io_device = &kvm->arch.vgic.dist_iodev; unsigned int len; switch (type) { case VGIC_V2: len = vgic_v2_init_dist_iodev(io_device); break; case VGIC_V3: len = vgic_v3_init_dist_iodev(io_device); break; default: BUG_ON(1); } io_device->base_addr = dist_base_address; 
io_device->iodev_type = IODEV_DIST; io_device->redist_vcpu = NULL; return kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, dist_base_address, len, &io_device->dev); }
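To make the byte-order contract of vgic_data_mmio_bus_to_host() and vgic_data_host_to_mmio_bus() concrete, here is a small round-trip sketch; the helper name and the WARN_ON() check are illustrative assumptions, not taken from the file above.

/*
 * Illustrative sketch only (not part of the original file above): round-trip
 * a 32-bit register value through the byte-order helpers. The byte buffer is
 * what a guest store would produce (little-endian, as the GIC mandates); the
 * unsigned long is the CPU-native value the register handlers work with.
 */
static void example_endian_roundtrip(void)
{
	u8 buf[4];
	unsigned long host_val;

	/* Encode a native value as the guest would observe it in memory. */
	vgic_data_host_to_mmio_bus(buf, sizeof(buf), 0x01020304UL);

	/* Decode it back to a native value, as dispatch_mmio_write() does. */
	host_val = vgic_data_mmio_bus_to_host(buf, sizeof(buf));

	WARN_ON(host_val != 0x01020304UL);
}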
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_FIND_H_ #define __LINUX_FIND_H_ #ifndef __LINUX_BITMAP_H #error only <linux/bitmap.h> can be included directly #endif #include <linux/bitops.h> unsigned long _find_next_bit(const unsigned long *addr1, unsigned long nbits, unsigned long start); unsigned long _find_next_and_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long nbits, unsigned long start); unsigned long _find_next_andnot_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long nbits, unsigned long start); unsigned long _find_next_or_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long nbits, unsigned long start); unsigned long _find_next_zero_bit(const unsigned long *addr, unsigned long nbits, unsigned long start); extern unsigned long _find_first_bit(const unsigned
long *addr, unsigned long size); unsigned long __find_nth_bit(const unsigned long *addr, unsigned long size, unsigned long n); unsigned long __find_nth_and_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long size, unsigned long n); unsigned long __find_nth_andnot_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long size, unsigned long n); unsigned long __find_nth_and_andnot_bit(const unsigned long *addr1, const unsigned long *addr2, const unsigned long *addr3, unsigned long size, unsigned long n); extern unsigned long _find_first_and_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long size); unsigned long _find_first_and_and_bit(const unsigned long *addr1, const unsigned long *addr2, const unsigned long *addr3, unsigned long size); extern unsigned long _find_first_zero_bit(const unsigned long *addr, unsigned long size); extern unsigned long _find_last_bit(const unsigned long *addr, unsigned long size); #ifdef __BIG_ENDIAN unsigned long _find_first_zero_bit_le(const unsigned long *addr, unsigned long size); unsigned long _find_next_zero_bit_le(const unsigned long *addr, unsigned long size, unsigned long offset); unsigned long _find_next_bit_le(const unsigned long *addr, unsigned long size, unsigned long offset); #endif #ifndef find_next_bit /** * find_next_bit - find the next set bit in a memory region * @addr: The address to base the search on * @size: The bitmap size in bits * @offset: The bitnumber to start searching at * * Returns the bit number for the next set bit * If no bits are set, returns @size. */ static __always_inline unsigned long find_next_bit(const unsigned long *addr, unsigned long size, unsigned long offset) { if (small_const_nbits(size)) { unsigned long val; if (unlikely(offset >= size)) return size; val = *addr & GENMASK(size - 1, offset); return val ? __ffs(val) : size; } return _find_next_bit(addr, size, offset); } #endif #ifndef find_next_and_bit /** * find_next_and_bit - find the next set bit in both memory regions * @addr1: The first address to base the search on * @addr2: The second address to base the search on * @size: The bitmap size in bits * @offset: The bitnumber to start searching at * * Returns the bit number for the next set bit * If no bits are set, returns @size. */ static __always_inline unsigned long find_next_and_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long size, unsigned long offset) { if (small_const_nbits(size)) { unsigned long val; if (unlikely(offset >= size)) return size; val = *addr1 & *addr2 & GENMASK(size - 1, offset); return val ? __ffs(val) : size; } return _find_next_and_bit(addr1, addr2, size, offset); } #endif #ifndef find_next_andnot_bit /** * find_next_andnot_bit - find the next set bit in *addr1 excluding all the bits * in *addr2 * @addr1: The first address to base the search on * @addr2: The second address to base the search on * @size: The bitmap size in bits * @offset: The bitnumber to start searching at * * Returns the bit number for the next set bit * If no bits are set, returns @size. */ static __always_inline unsigned long find_next_andnot_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long size, unsigned long offset) { if (small_const_nbits(size)) { unsigned long val; if (unlikely(offset >= size)) return size; val = *addr1 & ~*addr2 & GENMASK(size - 1, offset); return val ? 
__ffs(val) : size; } return _find_next_andnot_bit(addr1, addr2, size, offset); } #endif #ifndef find_next_or_bit /** * find_next_or_bit - find the next set bit in either memory regions * @addr1: The first address to base the search on * @addr2: The second address to base the search on * @size: The bitmap size in bits * @offset: The bitnumber to start searching at * * Returns the bit number for the next set bit * If no bits are set, returns @size. */ static __always_inline unsigned long find_next_or_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long size, unsigned long offset) { if (small_const_nbits(size)) { unsigned long val; if (unlikely(offset >= size)) return size; val = (*addr1 | *addr2) & GENMASK(size - 1, offset); return val ? __ffs(val) : size; } return _find_next_or_bit(addr1, addr2, size, offset); } #endif #ifndef find_next_zero_bit /** * find_next_zero_bit - find the next cleared bit in a memory region * @addr: The address to base the search on * @size: The bitmap size in bits * @offset: The bitnumber to start searching at * * Returns the bit number of the next zero bit * If no bits are zero, returns @size. */ static __always_inline unsigned long find_next_zero_bit(const unsigned long *addr, unsigned long size, unsigned long offset) { if (small_const_nbits(size)) { unsigned long val; if (unlikely(offset >= size)) return size; val = *addr | ~GENMASK(size - 1, offset); return val == ~0UL ? size : ffz(val); } return _find_next_zero_bit(addr, size, offset); } #endif #ifndef find_first_bit /** * find_first_bit - find the first set bit in a memory region * @addr: The address to start the search at * @size: The maximum number of bits to search * * Returns the bit number of the first set bit. * If no bits are set, returns @size. */ static __always_inline unsigned long find_first_bit(const unsigned long *addr, unsigned long size) { if (small_const_nbits(size)) { unsigned long val = *addr & GENMASK(size - 1, 0); return val ? __ffs(val) : size; } return _find_first_bit(addr, size); } #endif /** * find_nth_bit - find N'th set bit in a memory region * @addr: The address to start the search at * @size: The maximum number of bits to search * @n: The number of set bit, which position is needed, counting from 0 * * The following is semantically equivalent: * idx = find_nth_bit(addr, size, 0); * idx = find_first_bit(addr, size); * * Returns the bit number of the N'th set bit. * If no such, returns >= @size. */ static __always_inline unsigned long find_nth_bit(const unsigned long *addr, unsigned long size, unsigned long n) { if (n >= size) return size; if (small_const_nbits(size)) { unsigned long val = *addr & GENMASK(size - 1, 0); return val ? fns(val, n) : size; } return __find_nth_bit(addr, size, n); } /** * find_nth_and_bit - find N'th set bit in 2 memory regions * @addr1: The 1st address to start the search at * @addr2: The 2nd address to start the search at * @size: The maximum number of bits to search * @n: The number of set bit, which position is needed, counting from 0 * * Returns the bit number of the N'th set bit. * If no such, returns @size. */ static __always_inline unsigned long find_nth_and_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long size, unsigned long n) { if (n >= size) return size; if (small_const_nbits(size)) { unsigned long val = *addr1 & *addr2 & GENMASK(size - 1, 0); return val ? 
fns(val, n) : size; } return __find_nth_and_bit(addr1, addr2, size, n); } /** * find_nth_andnot_bit - find N'th set bit in 2 memory regions, * flipping bits in 2nd region * @addr1: The 1st address to start the search at * @addr2: The 2nd address to start the search at * @size: The maximum number of bits to search * @n: The number of set bit, which position is needed, counting from 0 * * Returns the bit number of the N'th set bit. * If no such, returns @size. */ static __always_inline unsigned long find_nth_andnot_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long size, unsigned long n) { if (n >= size) return size; if (small_const_nbits(size)) { unsigned long val = *addr1 & (~*addr2) & GENMASK(size - 1, 0); return val ? fns(val, n) : size; } return __find_nth_andnot_bit(addr1, addr2, size, n); } /** * find_nth_and_andnot_bit - find N'th set bit in 2 memory regions, * excluding those set in 3rd region * @addr1: The 1st address to start the search at * @addr2: The 2nd address to start the search at * @addr3: The 3rd address to start the search at * @size: The maximum number of bits to search * @n: The number of set bit, which position is needed, counting from 0 * * Returns the bit number of the N'th set bit. * If no such, returns @size. */ static __always_inline unsigned long find_nth_and_andnot_bit(const unsigned long *addr1, const unsigned long *addr2, const unsigned long *addr3, unsigned long size, unsigned long n) { if (n >= size) return size; if (small_const_nbits(size)) { unsigned long val = *addr1 & *addr2 & (~*addr3) & GENMASK(size - 1, 0); return val ? fns(val, n) : size; } return __find_nth_and_andnot_bit(addr1, addr2, addr3, size, n); } #ifndef find_first_and_bit /** * find_first_and_bit - find the first set bit in both memory regions * @addr1: The first address to base the search on * @addr2: The second address to base the search on * @size: The bitmap size in bits * * Returns the bit number for the next set bit * If no bits are set, returns @size. */ static __always_inline unsigned long find_first_and_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long size) { if (small_const_nbits(size)) { unsigned long val = *addr1 & *addr2 & GENMASK(size - 1, 0); return val ? __ffs(val) : size; } return _find_first_and_bit(addr1, addr2, size); } #endif /** * find_first_and_and_bit - find the first set bit in 3 memory regions * @addr1: The first address to base the search on * @addr2: The second address to base the search on * @addr3: The third address to base the search on * @size: The bitmap size in bits * * Returns the bit number for the first set bit * If no bits are set, returns @size. */ static __always_inline unsigned long find_first_and_and_bit(const unsigned long *addr1, const unsigned long *addr2, const unsigned long *addr3, unsigned long size) { if (small_const_nbits(size)) { unsigned long val = *addr1 & *addr2 & *addr3 & GENMASK(size - 1, 0); return val ? __ffs(val) : size; } return _find_first_and_and_bit(addr1, addr2, addr3, size); } #ifndef find_first_zero_bit /** * find_first_zero_bit - find the first cleared bit in a memory region * @addr: The address to start the search at * @size: The maximum number of bits to search * * Returns the bit number of the first cleared bit. * If no bits are zero, returns @size. */ static __always_inline unsigned long find_first_zero_bit(const unsigned long *addr, unsigned long size) { if (small_const_nbits(size)) { unsigned long val = *addr | ~GENMASK(size - 1, 0); return val == ~0UL ? 
size : ffz(val); } return _find_first_zero_bit(addr, size); } #endif #ifndef find_last_bit /** * find_last_bit - find the last set bit in a memory region * @addr: The address to start the search at * @size: The number of bits to search * * Returns the bit number of the last set bit, or size. */ static __always_inline unsigned long find_last_bit(const unsigned long *addr, unsigned long size) { if (small_const_nbits(size)) { unsigned long val = *addr & GENMASK(size - 1, 0); return val ? __fls(val) : size; } return _find_last_bit(addr, size); } #endif /** * find_next_and_bit_wrap - find the next set bit in both memory regions * @addr1: The first address to base the search on * @addr2: The second address to base the search on * @size: The bitmap size in bits * @offset: The bitnumber to start searching at * * Returns the bit number for the next set bit, or first set bit up to @offset * If no bits are set, returns @size. */ static __always_inline unsigned long find_next_and_bit_wrap(const unsigned long *addr1, const unsigned long *addr2, unsigned long size, unsigned long offset) { unsigned long bit = find_next_and_bit(addr1, addr2, size, offset); if (bit < size || offset == 0) return bit; bit = find_first_and_bit(addr1, addr2, offset); return bit < offset ? bit : size; } /** * find_next_bit_wrap - find the next set bit in a memory region * @addr: The address to base the search on * @size: The bitmap size in bits * @offset: The bitnumber to start searching at * * Returns the bit number for the next set bit, or first set bit up to @offset * If no bits are set, returns @size. */ static __always_inline unsigned long find_next_bit_wrap(const unsigned long *addr, unsigned long size, unsigned long offset) { unsigned long bit = find_next_bit(addr, size, offset); if (bit < size || offset == 0) return bit; bit = find_first_bit(addr, offset); return bit < offset ? bit : size; } /* * Helper for for_each_set_bit_wrap(). Make sure you're doing right thing * before using it alone. */ static __always_inline unsigned long __for_each_wrap(const unsigned long *bitmap, unsigned long size, unsigned long start, unsigned long n) { unsigned long bit; /* If not wrapped around */ if (n > start) { /* and have a bit, just return it. */ bit = find_next_bit(bitmap, size, n); if (bit < size) return bit; /* Otherwise, wrap around and ... */ n = 0; } /* Search the other part. */ bit = find_next_bit(bitmap, start, n); return bit < start ? bit : size; } /** * find_next_clump8 - find next 8-bit clump with set bits in a memory region * @clump: location to store copy of found clump * @addr: address to base the search on * @size: bitmap size in number of bits * @offset: bit offset at which to start searching * * Returns the bit offset for the next set clump; the found clump value is * copied to the location pointed by @clump. If no bits are set, returns @size. 
*/ extern unsigned long find_next_clump8(unsigned long *clump, const unsigned long *addr, unsigned long size, unsigned long offset); #define find_first_clump8(clump, bits, size) \ find_next_clump8((clump), (bits), (size), 0) #if defined(__LITTLE_ENDIAN) static __always_inline unsigned long find_next_zero_bit_le(const void *addr, unsigned long size, unsigned long offset) { return find_next_zero_bit(addr, size, offset); } static __always_inline unsigned long find_next_bit_le(const void *addr, unsigned long size, unsigned long offset) { return find_next_bit(addr, size, offset); } static __always_inline unsigned long find_first_zero_bit_le(const void *addr, unsigned long size) { return find_first_zero_bit(addr, size); } #elif defined(__BIG_ENDIAN) #ifndef find_next_zero_bit_le static __always_inline unsigned long find_next_zero_bit_le(const void *addr, unsigned long size, unsigned long offset) { if (small_const_nbits(size)) { unsigned long val = *(const unsigned long *)addr; if (unlikely(offset >= size)) return size; val = swab(val) | ~GENMASK(size - 1, offset); return val == ~0UL ? size : ffz(val); } return _find_next_zero_bit_le(addr, size, offset); } #endif #ifndef find_first_zero_bit_le static __always_inline unsigned long find_first_zero_bit_le(const void *addr, unsigned long size) { if (small_const_nbits(size)) { unsigned long val = swab(*(const unsigned long *)addr) | ~GENMASK(size - 1, 0); return val == ~0UL ? size : ffz(val); } return _find_first_zero_bit_le(addr, size); } #endif #ifndef find_next_bit_le static __always_inline unsigned long find_next_bit_le(const void *addr, unsigned long size, unsigned long offset) { if (small_const_nbits(size)) { unsigned long val = *(const unsigned long *)addr; if (unlikely(offset >= size)) return size; val = swab(val) & GENMASK(size - 1, offset); return val ? 
__ffs(val) : size; } return _find_next_bit_le(addr, size, offset); } #endif #else #error "Please fix <asm/byteorder.h>" #endif #define for_each_set_bit(bit, addr, size) \ for ((bit) = 0; (bit) = find_next_bit((addr), (size), (bit)), (bit) < (size); (bit)++) #define for_each_and_bit(bit, addr1, addr2, size) \ for ((bit) = 0; \ (bit) = find_next_and_bit((addr1), (addr2), (size), (bit)), (bit) < (size);\ (bit)++) #define for_each_andnot_bit(bit, addr1, addr2, size) \ for ((bit) = 0; \ (bit) = find_next_andnot_bit((addr1), (addr2), (size), (bit)), (bit) < (size);\ (bit)++) #define for_each_or_bit(bit, addr1, addr2, size) \ for ((bit) = 0; \ (bit) = find_next_or_bit((addr1), (addr2), (size), (bit)), (bit) < (size);\ (bit)++) /* same as for_each_set_bit() but use bit as value to start with */ #define for_each_set_bit_from(bit, addr, size) \ for (; (bit) = find_next_bit((addr), (size), (bit)), (bit) < (size); (bit)++) #define for_each_clear_bit(bit, addr, size) \ for ((bit) = 0; \ (bit) = find_next_zero_bit((addr), (size), (bit)), (bit) < (size); \ (bit)++) /* same as for_each_clear_bit() but use bit as value to start with */ #define for_each_clear_bit_from(bit, addr, size) \ for (; (bit) = find_next_zero_bit((addr), (size), (bit)), (bit) < (size); (bit)++) /** * for_each_set_bitrange - iterate over all set bit ranges [b; e) * @b: bit offset of start of current bitrange (first set bit) * @e: bit offset of end of current bitrange (first unset bit) * @addr: bitmap address to base the search on * @size: bitmap size in number of bits */ #define for_each_set_bitrange(b, e, addr, size) \ for ((b) = 0; \ (b) = find_next_bit((addr), (size), b), \ (e) = find_next_zero_bit((addr), (size), (b) + 1), \ (b) < (size); \ (b) = (e) + 1) /** * for_each_set_bitrange_from - iterate over all set bit ranges [b; e) * @b: bit offset of start of current bitrange (first set bit); must be initialized * @e: bit offset of end of current bitrange (first unset bit) * @addr: bitmap address to base the search on * @size: bitmap size in number of bits */ #define for_each_set_bitrange_from(b, e, addr, size) \ for (; \ (b) = find_next_bit((addr), (size), (b)), \ (e) = find_next_zero_bit((addr), (size), (b) + 1), \ (b) < (size); \ (b) = (e) + 1) /** * for_each_clear_bitrange - iterate over all unset bit ranges [b; e) * @b: bit offset of start of current bitrange (first unset bit) * @e: bit offset of end of current bitrange (first set bit) * @addr: bitmap address to base the search on * @size: bitmap size in number of bits */ #define for_each_clear_bitrange(b, e, addr, size) \ for ((b) = 0; \ (b) = find_next_zero_bit((addr), (size), (b)), \ (e) = find_next_bit((addr), (size), (b) + 1), \ (b) < (size); \ (b) = (e) + 1) /** * for_each_clear_bitrange_from - iterate over all unset bit ranges [b; e) * @b: bit offset of start of current bitrange (first set bit); must be initialized * @e: bit offset of end of current bitrange (first unset bit) * @addr: bitmap address to base the search on * @size: bitmap size in number of bits */ #define for_each_clear_bitrange_from(b, e, addr, size) \ for (; \ (b) = find_next_zero_bit((addr), (size), (b)), \ (e) = find_next_bit((addr), (size), (b) + 1), \ (b) < (size); \ (b) = (e) + 1) /** * for_each_set_bit_wrap - iterate over all set bits starting from @start, and * wrapping around the end of bitmap. 
 * @bit: offset for current iteration
 * @addr: bitmap address to base the search on
 * @size: bitmap size in number of bits
 * @start: Starting bit for bitmap traversing, wrapping around the bitmap end
 */
#define for_each_set_bit_wrap(bit, addr, size, start) \
	for ((bit) = find_next_bit_wrap((addr), (size), (start));		\
	     (bit) < (size);							\
	     (bit) = __for_each_wrap((addr), (size), (start), (bit) + 1))

/**
 * for_each_set_clump8 - iterate over bitmap for each 8-bit clump with set bits
 * @start: bit offset to start search and to store the current iteration offset
 * @clump: location to store copy of current 8-bit clump
 * @bits: bitmap address to base the search on
 * @size: bitmap size in number of bits
 */
#define for_each_set_clump8(start, clump, bits, size) \
	for ((start) = find_first_clump8(&(clump), (bits), (size)); \
	     (start) < (size); \
	     (start) = find_next_clump8(&(clump), (bits), (size), (start) + 8))

#endif /*__LINUX_FIND_H_ */
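/*
 * Illustrative usage sketch for the interfaces declared above (not part of
 * linux/find.h itself).  The bitmap, its 64-bit size and the example function
 * name are made up; the find_*() helpers and for_each_set_bit() are the ones
 * defined above and are normally reached by including <linux/bitmap.h>.
 */
#include <linux/bitmap.h>
#include <linux/printk.h>

static void example_walk_bitmap(void)
{
	DECLARE_BITMAP(map, 64);	/* hypothetical 64-bit bitmap */
	unsigned long bit;

	bitmap_zero(map, 64);
	__set_bit(3, map);
	__set_bit(17, map);

	/* First set bit: returns 3 here, or 64 if the bitmap were empty. */
	bit = find_first_bit(map, 64);
	pr_info("first set bit: %lu\n", bit);

	/* Visit every set bit in ascending order: 3, then 17. */
	for_each_set_bit(bit, map, 64)
		pr_info("set bit: %lu\n", bit);

	/* First clear bit at or after offset 3: returns 4 here. */
	bit = find_next_zero_bit(map, 64, 3);
	pr_info("next zero bit from 3: %lu\n", bit);
}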
/* CPU control.
 * (C) 2001, 2002, 2003, 2004 Rusty Russell
 *
 * This code is licenced under the GPL.
 */
#include <linux/sched/mm.h>
#include <linux/proc_fs.h>
#include <linux/smp.h>
#include <linux/init.h>
#include <linux/notifier.h>
#include <linux/sched/signal.h>
#include <linux/sched/hotplug.h>
#include <linux/sched/isolation.h>
#include <linux/sched/task.h>
#include <linux/sched/smt.h>
#include <linux/unistd.h>
#include <linux/cpu.h>
#include <linux/oom.h>
#include <linux/rcupdate.h>
#include <linux/delay.h>
#include <linux/export.h>
#include <linux/bug.h>
#include <linux/kthread.h>
#include <linux/stop_machine.h>
#include <linux/mutex.h>
#include <linux/gfp.h>
#include <linux/suspend.h>
#include <linux/lockdep.h>
#include <linux/tick.h>
#include <linux/irq.h>
#include <linux/nmi.h>
#include <linux/smpboot.h>
#include <linux/relay.h>
#include <linux/slab.h>
#include <linux/scs.h>
#include <linux/percpu-rwsem.h>
#include <linux/cpuset.h>
#include <linux/random.h>
#include <linux/cc_platform.h>

#include <trace/events/power.h>
#define CREATE_TRACE_POINTS
#include <trace/events/cpuhp.h>

#include "smpboot.h"

/**
 * struct cpuhp_cpu_state - Per cpu hotplug state storage
 * @state:	The current cpu state
 * @target:	The target state
 * @fail:	Current CPU hotplug callback state
 * @thread:	Pointer to the hotplug thread
 * @should_run:	Thread should execute
 * @rollback:	Perform a rollback
 * @single:	Single callback invocation
 * @bringup:	Single callback bringup or teardown selector
 * @node:	Remote CPU node; for multi-instance, do a
 *		single entry callback for install/remove
 * @last:	For multi-instance rollback, remember how far we got
 * @cb_state:	The state for a single callback (install/uninstall)
 * @result:	Result of the operation
 * @ap_sync_state:	State for AP synchronization
 * @done_up:	Signal completion to the issuer of the task for cpu-up
 * @done_down:	Signal completion to the issuer of the task for cpu-down
 */
struct cpuhp_cpu_state {
	enum cpuhp_state	state;
	enum cpuhp_state	target;
	enum cpuhp_state	fail;
#ifdef CONFIG_SMP
	struct task_struct	*thread;
	bool			should_run;
	bool			rollback;
	bool			single;
	bool			bringup;
	struct hlist_node	*node;
	struct hlist_node	*last;
	enum cpuhp_state	cb_state;
	int			result;
	atomic_t		ap_sync_state;
	struct completion	done_up;
	struct completion	done_down;
#endif
};

static DEFINE_PER_CPU(struct cpuhp_cpu_state, cpuhp_state) = {
	.fail = CPUHP_INVALID,
};

#ifdef CONFIG_SMP
cpumask_t cpus_booted_once_mask;
#endif

#if defined(CONFIG_LOCKDEP) && defined(CONFIG_SMP)
static struct lockdep_map cpuhp_state_up_map =
	STATIC_LOCKDEP_MAP_INIT("cpuhp_state-up", &cpuhp_state_up_map);
static struct lockdep_map cpuhp_state_down_map =
STATIC_LOCKDEP_MAP_INIT("cpuhp_state-down", &cpuhp_state_down_map); static inline void cpuhp_lock_acquire(bool bringup) { lock_map_acquire(bringup ? &cpuhp_state_up_map : &cpuhp_state_down_map); } static inline void cpuhp_lock_release(bool bringup) { lock_map_release(bringup ? &cpuhp_state_up_map : &cpuhp_state_down_map); } #else static inline void cpuhp_lock_acquire(bool bringup) { } static inline void cpuhp_lock_release(bool bringup) { } #endif /** * struct cpuhp_step - Hotplug state machine step * @name: Name of the step * @startup: Startup function of the step * @teardown: Teardown function of the step * @cant_stop: Bringup/teardown can't be stopped at this step * @multi_instance: State has multiple instances which get added afterwards */ struct cpuhp_step { const char *name; union { int (*single)(unsigned int cpu); int (*multi)(unsigned int cpu, struct hlist_node *node); } startup; union { int (*single)(unsigned int cpu); int (*multi)(unsigned int cpu, struct hlist_node *node); } teardown; /* private: */ struct hlist_head list; /* public: */ bool cant_stop; bool multi_instance; }; static DEFINE_MUTEX(cpuhp_state_mutex); static struct cpuhp_step cpuhp_hp_states[]; static struct cpuhp_step *cpuhp_get_step(enum cpuhp_state state) { return cpuhp_hp_states + state; } static bool cpuhp_step_empty(bool bringup, struct cpuhp_step *step) { return bringup ? !step->startup.single : !step->teardown.single; } /** * cpuhp_invoke_callback - Invoke the callbacks for a given state * @cpu: The cpu for which the callback should be invoked * @state: The state to do callbacks for * @bringup: True if the bringup callback should be invoked * @node: For multi-instance, do a single entry callback for install/remove * @lastp: For multi-instance rollback, remember how far we got * * Called from cpu hotplug and from the state register machinery. * * Return: %0 on success or a negative errno code */ static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state state, bool bringup, struct hlist_node *node, struct hlist_node **lastp) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); struct cpuhp_step *step = cpuhp_get_step(state); int (*cbm)(unsigned int cpu, struct hlist_node *node); int (*cb)(unsigned int cpu); int ret, cnt; if (st->fail == state) { st->fail = CPUHP_INVALID; return -EAGAIN; } if (cpuhp_step_empty(bringup, step)) { WARN_ON_ONCE(1); return 0; } if (!step->multi_instance) { WARN_ON_ONCE(lastp && *lastp); cb = bringup ? step->startup.single : step->teardown.single; trace_cpuhp_enter(cpu, st->target, state, cb); ret = cb(cpu); trace_cpuhp_exit(cpu, st->state, state, ret); return ret; } cbm = bringup ? step->startup.multi : step->teardown.multi; /* Single invocation for instance add/remove */ if (node) { WARN_ON_ONCE(lastp && *lastp); trace_cpuhp_multi_enter(cpu, st->target, state, cbm, node); ret = cbm(cpu, node); trace_cpuhp_exit(cpu, st->state, state, ret); return ret; } /* State transition. Invoke on all instances */ cnt = 0; hlist_for_each(node, &step->list) { if (lastp && node == *lastp) break; trace_cpuhp_multi_enter(cpu, st->target, state, cbm, node); ret = cbm(cpu, node); trace_cpuhp_exit(cpu, st->state, state, ret); if (ret) { if (!lastp) goto err; *lastp = node; return ret; } cnt++; } if (lastp) *lastp = NULL; return 0; err: /* Rollback the instances if one failed */ cbm = !bringup ? 
step->startup.multi : step->teardown.multi; if (!cbm) return ret; hlist_for_each(node, &step->list) { if (!cnt--) break; trace_cpuhp_multi_enter(cpu, st->target, state, cbm, node); ret = cbm(cpu, node); trace_cpuhp_exit(cpu, st->state, state, ret); /* * Rollback must not fail, */ WARN_ON_ONCE(ret); } return ret; } #ifdef CONFIG_SMP static bool cpuhp_is_ap_state(enum cpuhp_state state) { /* * The extra check for CPUHP_TEARDOWN_CPU is only for documentation * purposes as that state is handled explicitly in cpu_down. */ return state > CPUHP_BRINGUP_CPU && state != CPUHP_TEARDOWN_CPU; } static inline void wait_for_ap_thread(struct cpuhp_cpu_state *st, bool bringup) { struct completion *done = bringup ? &st->done_up : &st->done_down; wait_for_completion(done); } static inline void complete_ap_thread(struct cpuhp_cpu_state *st, bool bringup) { struct completion *done = bringup ? &st->done_up : &st->done_down; complete(done); } /* * The former STARTING/DYING states, ran with IRQs disabled and must not fail. */ static bool cpuhp_is_atomic_state(enum cpuhp_state state) { return CPUHP_AP_IDLE_DEAD <= state && state < CPUHP_AP_ONLINE; } /* Synchronization state management */ enum cpuhp_sync_state { SYNC_STATE_DEAD, SYNC_STATE_KICKED, SYNC_STATE_SHOULD_DIE, SYNC_STATE_ALIVE, SYNC_STATE_SHOULD_ONLINE, SYNC_STATE_ONLINE, }; #ifdef CONFIG_HOTPLUG_CORE_SYNC /** * cpuhp_ap_update_sync_state - Update synchronization state during bringup/teardown * @state: The synchronization state to set * * No synchronization point. Just update of the synchronization state, but implies * a full barrier so that the AP changes are visible before the control CPU proceeds. */ static inline void cpuhp_ap_update_sync_state(enum cpuhp_sync_state state) { atomic_t *st = this_cpu_ptr(&cpuhp_state.ap_sync_state); (void)atomic_xchg(st, state); } void __weak arch_cpuhp_sync_state_poll(void) { cpu_relax(); } static bool cpuhp_wait_for_sync_state(unsigned int cpu, enum cpuhp_sync_state state, enum cpuhp_sync_state next_state) { atomic_t *st = per_cpu_ptr(&cpuhp_state.ap_sync_state, cpu); ktime_t now, end, start = ktime_get(); int sync; end = start + 10ULL * NSEC_PER_SEC; sync = atomic_read(st); while (1) { if (sync == state) { if (!atomic_try_cmpxchg(st, &sync, next_state)) continue; return true; } now = ktime_get(); if (now > end) { /* Timeout. Leave the state unchanged */ return false; } else if (now - start < NSEC_PER_MSEC) { /* Poll for one millisecond */ arch_cpuhp_sync_state_poll(); } else { usleep_range(USEC_PER_MSEC, 2 * USEC_PER_MSEC); } sync = atomic_read(st); } return true; } #else /* CONFIG_HOTPLUG_CORE_SYNC */ static inline void cpuhp_ap_update_sync_state(enum cpuhp_sync_state state) { } #endif /* !CONFIG_HOTPLUG_CORE_SYNC */ #ifdef CONFIG_HOTPLUG_CORE_SYNC_DEAD /** * cpuhp_ap_report_dead - Update synchronization state to DEAD * * No synchronization point. Just update of the synchronization state. */ void cpuhp_ap_report_dead(void) { cpuhp_ap_update_sync_state(SYNC_STATE_DEAD); } void __weak arch_cpuhp_cleanup_dead_cpu(unsigned int cpu) { } /* * Late CPU shutdown synchronization point. Cannot use cpuhp_state::done_down * because the AP cannot issue complete() at this stage. */ static void cpuhp_bp_sync_dead(unsigned int cpu) { atomic_t *st = per_cpu_ptr(&cpuhp_state.ap_sync_state, cpu); int sync = atomic_read(st); do { /* CPU can have reported dead already. Don't overwrite that! 
*/ if (sync == SYNC_STATE_DEAD) break; } while (!atomic_try_cmpxchg(st, &sync, SYNC_STATE_SHOULD_DIE)); if (cpuhp_wait_for_sync_state(cpu, SYNC_STATE_DEAD, SYNC_STATE_DEAD)) { /* CPU reached dead state. Invoke the cleanup function */ arch_cpuhp_cleanup_dead_cpu(cpu); return; } /* No further action possible. Emit message and give up. */ pr_err("CPU%u failed to report dead state\n", cpu); } #else /* CONFIG_HOTPLUG_CORE_SYNC_DEAD */ static inline void cpuhp_bp_sync_dead(unsigned int cpu) { } #endif /* !CONFIG_HOTPLUG_CORE_SYNC_DEAD */ #ifdef CONFIG_HOTPLUG_CORE_SYNC_FULL /** * cpuhp_ap_sync_alive - Synchronize AP with the control CPU once it is alive * * Updates the AP synchronization state to SYNC_STATE_ALIVE and waits * for the BP to release it. */ void cpuhp_ap_sync_alive(void) { atomic_t *st = this_cpu_ptr(&cpuhp_state.ap_sync_state); cpuhp_ap_update_sync_state(SYNC_STATE_ALIVE); /* Wait for the control CPU to release it. */ while (atomic_read(st) != SYNC_STATE_SHOULD_ONLINE) cpu_relax(); } static bool cpuhp_can_boot_ap(unsigned int cpu) { atomic_t *st = per_cpu_ptr(&cpuhp_state.ap_sync_state, cpu); int sync = atomic_read(st); again: switch (sync) { case SYNC_STATE_DEAD: /* CPU is properly dead */ break; case SYNC_STATE_KICKED: /* CPU did not come up in previous attempt */ break; case SYNC_STATE_ALIVE: /* CPU is stuck cpuhp_ap_sync_alive(). */ break; default: /* CPU failed to report online or dead and is in limbo state. */ return false; } /* Prepare for booting */ if (!atomic_try_cmpxchg(st, &sync, SYNC_STATE_KICKED)) goto again; return true; } void __weak arch_cpuhp_cleanup_kick_cpu(unsigned int cpu) { } /* * Early CPU bringup synchronization point. Cannot use cpuhp_state::done_up * because the AP cannot issue complete() so early in the bringup. */ static int cpuhp_bp_sync_alive(unsigned int cpu) { int ret = 0; if (!IS_ENABLED(CONFIG_HOTPLUG_CORE_SYNC_FULL)) return 0; if (!cpuhp_wait_for_sync_state(cpu, SYNC_STATE_ALIVE, SYNC_STATE_SHOULD_ONLINE)) { pr_err("CPU%u failed to report alive state\n", cpu); ret = -EIO; } /* Let the architecture cleanup the kick alive mechanics. */ arch_cpuhp_cleanup_kick_cpu(cpu); return ret; } #else /* CONFIG_HOTPLUG_CORE_SYNC_FULL */ static inline int cpuhp_bp_sync_alive(unsigned int cpu) { return 0; } static inline bool cpuhp_can_boot_ap(unsigned int cpu) { return true; } #endif /* !CONFIG_HOTPLUG_CORE_SYNC_FULL */ /* Serializes the updates to cpu_online_mask, cpu_present_mask */ static DEFINE_MUTEX(cpu_add_remove_lock); bool cpuhp_tasks_frozen; EXPORT_SYMBOL_GPL(cpuhp_tasks_frozen); /* * The following two APIs (cpu_maps_update_begin/done) must be used when * attempting to serialize the updates to cpu_online_mask & cpu_present_mask. */ void cpu_maps_update_begin(void) { mutex_lock(&cpu_add_remove_lock); } void cpu_maps_update_done(void) { mutex_unlock(&cpu_add_remove_lock); } /* * If set, cpu_up and cpu_down will return -EBUSY and do nothing. 
* Should always be manipulated under cpu_add_remove_lock */ static int cpu_hotplug_disabled; #ifdef CONFIG_HOTPLUG_CPU DEFINE_STATIC_PERCPU_RWSEM(cpu_hotplug_lock); static bool cpu_hotplug_offline_disabled __ro_after_init; void cpus_read_lock(void) { percpu_down_read(&cpu_hotplug_lock); } EXPORT_SYMBOL_GPL(cpus_read_lock); int cpus_read_trylock(void) { return percpu_down_read_trylock(&cpu_hotplug_lock); } EXPORT_SYMBOL_GPL(cpus_read_trylock); void cpus_read_unlock(void) { percpu_up_read(&cpu_hotplug_lock); } EXPORT_SYMBOL_GPL(cpus_read_unlock); void cpus_write_lock(void) { percpu_down_write(&cpu_hotplug_lock); } void cpus_write_unlock(void) { percpu_up_write(&cpu_hotplug_lock); } void lockdep_assert_cpus_held(void) { /* * We can't have hotplug operations before userspace starts running, * and some init codepaths will knowingly not take the hotplug lock. * This is all valid, so mute lockdep until it makes sense to report * unheld locks. */ if (system_state < SYSTEM_RUNNING) return; percpu_rwsem_assert_held(&cpu_hotplug_lock); } #ifdef CONFIG_LOCKDEP int lockdep_is_cpus_held(void) { return percpu_rwsem_is_held(&cpu_hotplug_lock); } #endif static void lockdep_acquire_cpus_lock(void) { rwsem_acquire(&cpu_hotplug_lock.dep_map, 0, 0, _THIS_IP_); } static void lockdep_release_cpus_lock(void) { rwsem_release(&cpu_hotplug_lock.dep_map, _THIS_IP_); } /* Declare CPU offlining not supported */ void cpu_hotplug_disable_offlining(void) { cpu_maps_update_begin(); cpu_hotplug_offline_disabled = true; cpu_maps_update_done(); } /* * Wait for currently running CPU hotplug operations to complete (if any) and * disable future CPU hotplug (from sysfs). The 'cpu_add_remove_lock' protects * the 'cpu_hotplug_disabled' flag. The same lock is also acquired by the * hotplug path before performing hotplug operations. So acquiring that lock * guarantees mutual exclusion from any currently running hotplug operations. */ void cpu_hotplug_disable(void) { cpu_maps_update_begin(); cpu_hotplug_disabled++; cpu_maps_update_done(); } EXPORT_SYMBOL_GPL(cpu_hotplug_disable); static void __cpu_hotplug_enable(void) { if (WARN_ONCE(!cpu_hotplug_disabled, "Unbalanced cpu hotplug enable\n")) return; cpu_hotplug_disabled--; } void cpu_hotplug_enable(void) { cpu_maps_update_begin(); __cpu_hotplug_enable(); cpu_maps_update_done(); } EXPORT_SYMBOL_GPL(cpu_hotplug_enable); #else static void lockdep_acquire_cpus_lock(void) { } static void lockdep_release_cpus_lock(void) { } #endif /* CONFIG_HOTPLUG_CPU */ /* * Architectures that need SMT-specific errata handling during SMT hotplug * should override this. */ void __weak arch_smt_update(void) { } #ifdef CONFIG_HOTPLUG_SMT enum cpuhp_smt_control cpu_smt_control __read_mostly = CPU_SMT_ENABLED; static unsigned int cpu_smt_max_threads __ro_after_init; unsigned int cpu_smt_num_threads __read_mostly = UINT_MAX; void __init cpu_smt_disable(bool force) { if (!cpu_smt_possible()) return; if (force) { pr_info("SMT: Force disabled\n"); cpu_smt_control = CPU_SMT_FORCE_DISABLED; } else { pr_info("SMT: disabled\n"); cpu_smt_control = CPU_SMT_DISABLED; } cpu_smt_num_threads = 1; } /* * The decision whether SMT is supported can only be done after the full * CPU identification. Called from architecture code. 
*/ void __init cpu_smt_set_num_threads(unsigned int num_threads, unsigned int max_threads) { WARN_ON(!num_threads || (num_threads > max_threads)); if (max_threads == 1) cpu_smt_control = CPU_SMT_NOT_SUPPORTED; cpu_smt_max_threads = max_threads; /* * If SMT has been disabled via the kernel command line or SMT is * not supported, set cpu_smt_num_threads to 1 for consistency. * If enabled, take the architecture requested number of threads * to bring up into account. */ if (cpu_smt_control != CPU_SMT_ENABLED) cpu_smt_num_threads = 1; else if (num_threads < cpu_smt_num_threads) cpu_smt_num_threads = num_threads; } static int __init smt_cmdline_disable(char *str) { cpu_smt_disable(str && !strcmp(str, "force")); return 0; } early_param("nosmt", smt_cmdline_disable); /* * For Archicture supporting partial SMT states check if the thread is allowed. * Otherwise this has already been checked through cpu_smt_max_threads when * setting the SMT level. */ static inline bool cpu_smt_thread_allowed(unsigned int cpu) { #ifdef CONFIG_SMT_NUM_THREADS_DYNAMIC return topology_smt_thread_allowed(cpu); #else return true; #endif } static inline bool cpu_bootable(unsigned int cpu) { if (cpu_smt_control == CPU_SMT_ENABLED && cpu_smt_thread_allowed(cpu)) return true; /* All CPUs are bootable if controls are not configured */ if (cpu_smt_control == CPU_SMT_NOT_IMPLEMENTED) return true; /* All CPUs are bootable if CPU is not SMT capable */ if (cpu_smt_control == CPU_SMT_NOT_SUPPORTED) return true; if (topology_is_primary_thread(cpu)) return true; /* * On x86 it's required to boot all logical CPUs at least once so * that the init code can get a chance to set CR4.MCE on each * CPU. Otherwise, a broadcasted MCE observing CR4.MCE=0b on any * core will shutdown the machine. */ return !cpumask_test_cpu(cpu, &cpus_booted_once_mask); } /* Returns true if SMT is supported and not forcefully (irreversibly) disabled */ bool cpu_smt_possible(void) { return cpu_smt_control != CPU_SMT_FORCE_DISABLED && cpu_smt_control != CPU_SMT_NOT_SUPPORTED; } EXPORT_SYMBOL_GPL(cpu_smt_possible); #else static inline bool cpu_bootable(unsigned int cpu) { return true; } #endif static inline enum cpuhp_state cpuhp_set_state(int cpu, struct cpuhp_cpu_state *st, enum cpuhp_state target) { enum cpuhp_state prev_state = st->state; bool bringup = st->state < target; st->rollback = false; st->last = NULL; st->target = target; st->single = false; st->bringup = bringup; if (cpu_dying(cpu) != !bringup) set_cpu_dying(cpu, !bringup); return prev_state; } static inline void cpuhp_reset_state(int cpu, struct cpuhp_cpu_state *st, enum cpuhp_state prev_state) { bool bringup = !st->bringup; st->target = prev_state; /* * Already rolling back. No need invert the bringup value or to change * the current state. */ if (st->rollback) return; st->rollback = true; /* * If we have st->last we need to undo partial multi_instance of this * state first. Otherwise start undo at the previous state. */ if (!st->last) { if (st->bringup) st->state--; else st->state++; } st->bringup = bringup; if (cpu_dying(cpu) != !bringup) set_cpu_dying(cpu, !bringup); } /* Regular hotplug invocation of the AP hotplug thread */ static void __cpuhp_kick_ap(struct cpuhp_cpu_state *st) { if (!st->single && st->state == st->target) return; st->result = 0; /* * Make sure the above stores are visible before should_run becomes * true. 
Paired with the mb() above in cpuhp_thread_fun() */ smp_mb(); st->should_run = true; wake_up_process(st->thread); wait_for_ap_thread(st, st->bringup); } static int cpuhp_kick_ap(int cpu, struct cpuhp_cpu_state *st, enum cpuhp_state target) { enum cpuhp_state prev_state; int ret; prev_state = cpuhp_set_state(cpu, st, target); __cpuhp_kick_ap(st); if ((ret = st->result)) { cpuhp_reset_state(cpu, st, prev_state); __cpuhp_kick_ap(st); } return ret; } static int bringup_wait_for_ap_online(unsigned int cpu) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); /* Wait for the CPU to reach CPUHP_AP_ONLINE_IDLE */ wait_for_ap_thread(st, true); if (WARN_ON_ONCE((!cpu_online(cpu)))) return -ECANCELED; /* Unpark the hotplug thread of the target cpu */ kthread_unpark(st->thread); /* * SMT soft disabling on X86 requires to bring the CPU out of the * BIOS 'wait for SIPI' state in order to set the CR4.MCE bit. The * CPU marked itself as booted_once in notify_cpu_starting() so the * cpu_bootable() check will now return false if this is not the * primary sibling. */ if (!cpu_bootable(cpu)) return -ECANCELED; return 0; } #ifdef CONFIG_HOTPLUG_SPLIT_STARTUP static int cpuhp_kick_ap_alive(unsigned int cpu) { if (!cpuhp_can_boot_ap(cpu)) return -EAGAIN; return arch_cpuhp_kick_ap_alive(cpu, idle_thread_get(cpu)); } static int cpuhp_bringup_ap(unsigned int cpu) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); int ret; /* * Some architectures have to walk the irq descriptors to * setup the vector space for the cpu which comes online. * Prevent irq alloc/free across the bringup. */ irq_lock_sparse(); ret = cpuhp_bp_sync_alive(cpu); if (ret) goto out_unlock; ret = bringup_wait_for_ap_online(cpu); if (ret) goto out_unlock; irq_unlock_sparse(); if (st->target <= CPUHP_AP_ONLINE_IDLE) return 0; return cpuhp_kick_ap(cpu, st, st->target); out_unlock: irq_unlock_sparse(); return ret; } #else static int bringup_cpu(unsigned int cpu) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); struct task_struct *idle = idle_thread_get(cpu); int ret; if (!cpuhp_can_boot_ap(cpu)) return -EAGAIN; /* * Some architectures have to walk the irq descriptors to * setup the vector space for the cpu which comes online. * * Prevent irq alloc/free across the bringup by acquiring the * sparse irq lock. Hold it until the upcoming CPU completes the * startup in cpuhp_online_idle() which allows to avoid * intermediate synchronization points in the architecture code. */ irq_lock_sparse(); ret = __cpu_up(cpu, idle); if (ret) goto out_unlock; ret = cpuhp_bp_sync_alive(cpu); if (ret) goto out_unlock; ret = bringup_wait_for_ap_online(cpu); if (ret) goto out_unlock; irq_unlock_sparse(); if (st->target <= CPUHP_AP_ONLINE_IDLE) return 0; return cpuhp_kick_ap(cpu, st, st->target); out_unlock: irq_unlock_sparse(); return ret; } #endif static int finish_cpu(unsigned int cpu) { struct task_struct *idle = idle_thread_get(cpu); struct mm_struct *mm = idle->active_mm; /* * sched_force_init_mm() ensured the use of &init_mm, * drop that refcount now that the CPU has stopped. */ WARN_ON(mm != &init_mm); idle->active_mm = NULL; mmdrop_lazy_tlb(mm); return 0; } /* * Hotplug state machine related functions */ /* * Get the next state to run. Empty ones will be skipped. Returns true if a * state must be run. * * st->state will be modified ahead of time, to match state_to_run, as if it * has already ran. 
*/ static bool cpuhp_next_state(bool bringup, enum cpuhp_state *state_to_run, struct cpuhp_cpu_state *st, enum cpuhp_state target) { do { if (bringup) { if (st->state >= target) return false; *state_to_run = ++st->state; } else { if (st->state <= target) return false; *state_to_run = st->state--; } if (!cpuhp_step_empty(bringup, cpuhp_get_step(*state_to_run))) break; } while (true); return true; } static int __cpuhp_invoke_callback_range(bool bringup, unsigned int cpu, struct cpuhp_cpu_state *st, enum cpuhp_state target, bool nofail) { enum cpuhp_state state; int ret = 0; while (cpuhp_next_state(bringup, &state, st, target)) { int err; err = cpuhp_invoke_callback(cpu, state, bringup, NULL, NULL); if (!err) continue; if (nofail) { pr_warn("CPU %u %s state %s (%d) failed (%d)\n", cpu, bringup ? "UP" : "DOWN", cpuhp_get_step(st->state)->name, st->state, err); ret = -1; } else { ret = err; break; } } return ret; } static inline int cpuhp_invoke_callback_range(bool bringup, unsigned int cpu, struct cpuhp_cpu_state *st, enum cpuhp_state target) { return __cpuhp_invoke_callback_range(bringup, cpu, st, target, false); } static inline void cpuhp_invoke_callback_range_nofail(bool bringup, unsigned int cpu, struct cpuhp_cpu_state *st, enum cpuhp_state target) { __cpuhp_invoke_callback_range(bringup, cpu, st, target, true); } static inline bool can_rollback_cpu(struct cpuhp_cpu_state *st) { if (IS_ENABLED(CONFIG_HOTPLUG_CPU)) return true; /* * When CPU hotplug is disabled, then taking the CPU down is not * possible because takedown_cpu() and the architecture and * subsystem specific mechanisms are not available. So the CPU * which would be completely unplugged again needs to stay around * in the current state. */ return st->state <= CPUHP_BRINGUP_CPU; } static int cpuhp_up_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st, enum cpuhp_state target) { enum cpuhp_state prev_state = st->state; int ret = 0; ret = cpuhp_invoke_callback_range(true, cpu, st, target); if (ret) { pr_debug("CPU UP failed (%d) CPU %u state %s (%d)\n", ret, cpu, cpuhp_get_step(st->state)->name, st->state); cpuhp_reset_state(cpu, st, prev_state); if (can_rollback_cpu(st)) WARN_ON(cpuhp_invoke_callback_range(false, cpu, st, prev_state)); } return ret; } /* * The cpu hotplug threads manage the bringup and teardown of the cpus */ static int cpuhp_should_run(unsigned int cpu) { struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state); return st->should_run; } /* * Execute teardown/startup callbacks on the plugged cpu. Also used to invoke * callbacks when a state gets [un]installed at runtime. * * Each invocation of this function by the smpboot thread does a single AP * state callback. * * It has 3 modes of operation: * - single: runs st->cb_state * - up: runs ++st->state, while st->state < st->target * - down: runs st->state--, while st->state > st->target * * When complete or on error, should_run is cleared and the completion is fired. */ static void cpuhp_thread_fun(unsigned int cpu) { struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state); bool bringup = st->bringup; enum cpuhp_state state; if (WARN_ON_ONCE(!st->should_run)) return; /* * ACQUIRE for the cpuhp_should_run() load of ->should_run. Ensures * that if we see ->should_run we also see the rest of the state. */ smp_mb(); /* * The BP holds the hotplug lock, but we're now running on the AP, * ensure that anybody asserting the lock is held, will actually find * it so. 
*/ lockdep_acquire_cpus_lock(); cpuhp_lock_acquire(bringup); if (st->single) { state = st->cb_state; st->should_run = false; } else { st->should_run = cpuhp_next_state(bringup, &state, st, st->target); if (!st->should_run) goto end; } WARN_ON_ONCE(!cpuhp_is_ap_state(state)); if (cpuhp_is_atomic_state(state)) { local_irq_disable(); st->result = cpuhp_invoke_callback(cpu, state, bringup, st->node, &st->last); local_irq_enable(); /* * STARTING/DYING must not fail! */ WARN_ON_ONCE(st->result); } else { st->result = cpuhp_invoke_callback(cpu, state, bringup, st->node, &st->last); } if (st->result) { /* * If we fail on a rollback, we're up a creek without no * paddle, no way forward, no way back. We loose, thanks for * playing. */ WARN_ON_ONCE(st->rollback); st->should_run = false; } end: cpuhp_lock_release(bringup); lockdep_release_cpus_lock(); if (!st->should_run) complete_ap_thread(st, bringup); } /* Invoke a single callback on a remote cpu */ static int cpuhp_invoke_ap_callback(int cpu, enum cpuhp_state state, bool bringup, struct hlist_node *node) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); int ret; if (!cpu_online(cpu)) return 0; cpuhp_lock_acquire(false); cpuhp_lock_release(false); cpuhp_lock_acquire(true); cpuhp_lock_release(true); /* * If we are up and running, use the hotplug thread. For early calls * we invoke the thread function directly. */ if (!st->thread) return cpuhp_invoke_callback(cpu, state, bringup, node, NULL); st->rollback = false; st->last = NULL; st->node = node; st->bringup = bringup; st->cb_state = state; st->single = true; __cpuhp_kick_ap(st); /* * If we failed and did a partial, do a rollback. */ if ((ret = st->result) && st->last) { st->rollback = true; st->bringup = !bringup; __cpuhp_kick_ap(st); } /* * Clean up the leftovers so the next hotplug operation wont use stale * data. */ st->node = st->last = NULL; return ret; } static int cpuhp_kick_ap_work(unsigned int cpu) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); enum cpuhp_state prev_state = st->state; int ret; cpuhp_lock_acquire(false); cpuhp_lock_release(false); cpuhp_lock_acquire(true); cpuhp_lock_release(true); trace_cpuhp_enter(cpu, st->target, prev_state, cpuhp_kick_ap_work); ret = cpuhp_kick_ap(cpu, st, st->target); trace_cpuhp_exit(cpu, st->state, prev_state, ret); return ret; } static struct smp_hotplug_thread cpuhp_threads = { .store = &cpuhp_state.thread, .thread_should_run = cpuhp_should_run, .thread_fn = cpuhp_thread_fun, .thread_comm = "cpuhp/%u", .selfparking = true, }; static __init void cpuhp_init_state(void) { struct cpuhp_cpu_state *st; int cpu; for_each_possible_cpu(cpu) { st = per_cpu_ptr(&cpuhp_state, cpu); init_completion(&st->done_up); init_completion(&st->done_down); } } void __init cpuhp_threads_init(void) { cpuhp_init_state(); BUG_ON(smpboot_register_percpu_thread(&cpuhp_threads)); kthread_unpark(this_cpu_read(cpuhp_state.thread)); } #ifdef CONFIG_HOTPLUG_CPU #ifndef arch_clear_mm_cpumask_cpu #define arch_clear_mm_cpumask_cpu(cpu, mm) cpumask_clear_cpu(cpu, mm_cpumask(mm)) #endif /** * clear_tasks_mm_cpumask - Safely clear tasks' mm_cpumask for a CPU * @cpu: a CPU id * * This function walks all processes, finds a valid mm struct for each one and * then clears a corresponding bit in mm's cpumask. While this all sounds * trivial, there are various non-obvious corner cases, which this function * tries to solve in a safe manner. * * Also note that the function uses a somewhat relaxed locking scheme, so it may * be called only for an already offlined CPU. 
*/ void clear_tasks_mm_cpumask(int cpu) { struct task_struct *p; /* * This function is called after the cpu is taken down and marked * offline, so its not like new tasks will ever get this cpu set in * their mm mask. -- Peter Zijlstra * Thus, we may use rcu_read_lock() here, instead of grabbing * full-fledged tasklist_lock. */ WARN_ON(cpu_online(cpu)); rcu_read_lock(); for_each_process(p) { struct task_struct *t; /* * Main thread might exit, but other threads may still have * a valid mm. Find one. */ t = find_lock_task_mm(p); if (!t) continue; arch_clear_mm_cpumask_cpu(cpu, t->mm); task_unlock(t); } rcu_read_unlock(); } /* Take this CPU down. */ static int take_cpu_down(void *_param) { struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state); enum cpuhp_state target = max((int)st->target, CPUHP_AP_OFFLINE); int err, cpu = smp_processor_id(); /* Ensure this CPU doesn't handle any more interrupts. */ err = __cpu_disable(); if (err < 0) return err; /* * Must be called from CPUHP_TEARDOWN_CPU, which means, as we are going * down, that the current state is CPUHP_TEARDOWN_CPU - 1. */ WARN_ON(st->state != (CPUHP_TEARDOWN_CPU - 1)); /* * Invoke the former CPU_DYING callbacks. DYING must not fail! */ cpuhp_invoke_callback_range_nofail(false, cpu, st, target); /* Park the stopper thread */ stop_machine_park(cpu); return 0; } static int takedown_cpu(unsigned int cpu) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); int err; /* Park the smpboot threads */ kthread_park(st->thread); /* * Prevent irq alloc/free while the dying cpu reorganizes the * interrupt affinities. */ irq_lock_sparse(); /* * So now all preempt/rcu users must observe !cpu_active(). */ err = stop_machine_cpuslocked(take_cpu_down, NULL, cpumask_of(cpu)); if (err) { /* CPU refused to die */ irq_unlock_sparse(); /* Unpark the hotplug thread so we can rollback there */ kthread_unpark(st->thread); return err; } BUG_ON(cpu_online(cpu)); /* * The teardown callback for CPUHP_AP_SCHED_STARTING will have removed * all runnable tasks from the CPU, there's only the idle task left now * that the migration thread is done doing the stop_machine thing. * * Wait for the stop thread to go away. */ wait_for_ap_thread(st, false); BUG_ON(st->state != CPUHP_AP_IDLE_DEAD); /* Interrupts are moved away from the dying cpu, reenable alloc/free */ irq_unlock_sparse(); hotplug_cpu__broadcast_tick_pull(cpu); /* This actually kills the CPU. */ __cpu_die(cpu); cpuhp_bp_sync_dead(cpu); lockdep_cleanup_dead_cpu(cpu, idle_thread_get(cpu)); /* * Callbacks must be re-integrated right away to the RCU state machine. * Otherwise an RCU callback could block a further teardown function * waiting for its completion. */ rcutree_migrate_callbacks(cpu); return 0; } static void cpuhp_complete_idle_dead(void *arg) { struct cpuhp_cpu_state *st = arg; complete_ap_thread(st, false); } void cpuhp_report_idle_dead(void) { struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state); BUG_ON(st->state != CPUHP_AP_OFFLINE); tick_assert_timekeeping_handover(); rcutree_report_cpu_dead(); st->state = CPUHP_AP_IDLE_DEAD; /* * We cannot call complete after rcutree_report_cpu_dead() so we delegate it * to an online cpu. 
*/ smp_call_function_single(cpumask_first(cpu_online_mask), cpuhp_complete_idle_dead, st, 0); } static int cpuhp_down_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st, enum cpuhp_state target) { enum cpuhp_state prev_state = st->state; int ret = 0; ret = cpuhp_invoke_callback_range(false, cpu, st, target); if (ret) { pr_debug("CPU DOWN failed (%d) CPU %u state %s (%d)\n", ret, cpu, cpuhp_get_step(st->state)->name, st->state); cpuhp_reset_state(cpu, st, prev_state); if (st->state < prev_state) WARN_ON(cpuhp_invoke_callback_range(true, cpu, st, prev_state)); } return ret; } /* Requires cpu_add_remove_lock to be held */ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen, enum cpuhp_state target) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); int prev_state, ret = 0; if (num_online_cpus() == 1) return -EBUSY; if (!cpu_present(cpu)) return -EINVAL; cpus_write_lock(); cpuhp_tasks_frozen = tasks_frozen; prev_state = cpuhp_set_state(cpu, st, target); /* * If the current CPU state is in the range of the AP hotplug thread, * then we need to kick the thread. */ if (st->state > CPUHP_TEARDOWN_CPU) { st->target = max((int)target, CPUHP_TEARDOWN_CPU); ret = cpuhp_kick_ap_work(cpu); /* * The AP side has done the error rollback already. Just * return the error code.. */ if (ret) goto out; /* * We might have stopped still in the range of the AP hotplug * thread. Nothing to do anymore. */ if (st->state > CPUHP_TEARDOWN_CPU) goto out; st->target = target; } /* * The AP brought itself down to CPUHP_TEARDOWN_CPU. So we need * to do the further cleanups. */ ret = cpuhp_down_callbacks(cpu, st, target); if (ret && st->state < prev_state) { if (st->state == CPUHP_TEARDOWN_CPU) { cpuhp_reset_state(cpu, st, prev_state); __cpuhp_kick_ap(st); } else { WARN(1, "DEAD callback error for CPU%d", cpu); } } out: cpus_write_unlock(); /* * Do post unplug cleanup. This is still protected against * concurrent CPU hotplug via cpu_add_remove_lock. */ lockup_detector_cleanup(); arch_smt_update(); return ret; } struct cpu_down_work { unsigned int cpu; enum cpuhp_state target; }; static long __cpu_down_maps_locked(void *arg) { struct cpu_down_work *work = arg; return _cpu_down(work->cpu, 0, work->target); } static int cpu_down_maps_locked(unsigned int cpu, enum cpuhp_state target) { struct cpu_down_work work = { .cpu = cpu, .target = target, }; /* * If the platform does not support hotplug, report it explicitly to * differentiate it from a transient offlining failure. */ if (cpu_hotplug_offline_disabled) return -EOPNOTSUPP; if (cpu_hotplug_disabled) return -EBUSY; /* * Ensure that the control task does not run on the to be offlined * CPU to prevent a deadlock against cfs_b->period_timer. * Also keep at least one housekeeping cpu onlined to avoid generating * an empty sched_domain span. */ for_each_cpu_and(cpu, cpu_online_mask, housekeeping_cpumask(HK_TYPE_DOMAIN)) { if (cpu != work.cpu) return work_on_cpu(cpu, __cpu_down_maps_locked, &work); } return -EBUSY; } static int cpu_down(unsigned int cpu, enum cpuhp_state target) { int err; cpu_maps_update_begin(); err = cpu_down_maps_locked(cpu, target); cpu_maps_update_done(); return err; } /** * cpu_device_down - Bring down a cpu device * @dev: Pointer to the cpu device to offline * * This function is meant to be used by device core cpu subsystem only. * * Other subsystems should use remove_cpu() instead. 
* * Return: %0 on success or a negative errno code */ int cpu_device_down(struct device *dev) { return cpu_down(dev->id, CPUHP_OFFLINE); } int remove_cpu(unsigned int cpu) { int ret; lock_device_hotplug(); ret = device_offline(get_cpu_device(cpu)); unlock_device_hotplug(); return ret; } EXPORT_SYMBOL_GPL(remove_cpu); void smp_shutdown_nonboot_cpus(unsigned int primary_cpu) { unsigned int cpu; int error; cpu_maps_update_begin(); /* * Make certain the cpu I'm about to reboot on is online. * * This is inline to what migrate_to_reboot_cpu() already do. */ if (!cpu_online(primary_cpu)) primary_cpu = cpumask_first(cpu_online_mask); for_each_online_cpu(cpu) { if (cpu == primary_cpu) continue; error = cpu_down_maps_locked(cpu, CPUHP_OFFLINE); if (error) { pr_err("Failed to offline CPU%d - error=%d", cpu, error); break; } } /* * Ensure all but the reboot CPU are offline. */ BUG_ON(num_online_cpus() > 1); /* * Make sure the CPUs won't be enabled by someone else after this * point. Kexec will reboot to a new kernel shortly resetting * everything along the way. */ cpu_hotplug_disabled++; cpu_maps_update_done(); } #else #define takedown_cpu NULL #endif /*CONFIG_HOTPLUG_CPU*/ /** * notify_cpu_starting(cpu) - Invoke the callbacks on the starting CPU * @cpu: cpu that just started * * It must be called by the arch code on the new cpu, before the new cpu * enables interrupts and before the "boot" cpu returns from __cpu_up(). */ void notify_cpu_starting(unsigned int cpu) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); enum cpuhp_state target = min((int)st->target, CPUHP_AP_ONLINE); rcutree_report_cpu_starting(cpu); /* Enables RCU usage on this CPU. */ cpumask_set_cpu(cpu, &cpus_booted_once_mask); /* * STARTING must not fail! */ cpuhp_invoke_callback_range_nofail(true, cpu, st, target); } /* * Called from the idle task. Wake up the controlling task which brings the * hotplug thread of the upcoming CPU up and then delegates the rest of the * online bringup to the hotplug thread. */ void cpuhp_online_idle(enum cpuhp_state state) { struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state); /* Happens for the boot cpu */ if (state != CPUHP_AP_ONLINE_IDLE) return; cpuhp_ap_update_sync_state(SYNC_STATE_ONLINE); /* * Unpark the stopper thread before we start the idle loop (and start * scheduling); this ensures the stopper task is always available. */ stop_machine_unpark(smp_processor_id()); st->state = CPUHP_AP_ONLINE_IDLE; complete_ap_thread(st, true); } /* Requires cpu_add_remove_lock to be held */ static int _cpu_up(unsigned int cpu, int tasks_frozen, enum cpuhp_state target) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); struct task_struct *idle; int ret = 0; cpus_write_lock(); if (!cpu_present(cpu)) { ret = -EINVAL; goto out; } /* * The caller of cpu_up() might have raced with another * caller. Nothing to do. */ if (st->state >= target) goto out; if (st->state == CPUHP_OFFLINE) { /* Let it fail before we try to bring the cpu up */ idle = idle_thread_get(cpu); if (IS_ERR(idle)) { ret = PTR_ERR(idle); goto out; } /* * Reset stale stack state from the last time this CPU was online. */ scs_task_reset(idle); kasan_unpoison_task_stack(idle); } cpuhp_tasks_frozen = tasks_frozen; cpuhp_set_state(cpu, st, target); /* * If the current CPU state is in the range of the AP hotplug thread, * then we need to kick the thread once more. */ if (st->state > CPUHP_BRINGUP_CPU) { ret = cpuhp_kick_ap_work(cpu); /* * The AP side has done the error rollback already. Just * return the error code.. 
*/ if (ret) goto out; } /* * Try to reach the target state. We max out on the BP at * CPUHP_BRINGUP_CPU. After that the AP hotplug thread is * responsible for bringing it up to the target state. */ target = min((int)target, CPUHP_BRINGUP_CPU); ret = cpuhp_up_callbacks(cpu, st, target); out: cpus_write_unlock(); arch_smt_update(); return ret; } static int cpu_up(unsigned int cpu, enum cpuhp_state target) { int err = 0; if (!cpu_possible(cpu)) { pr_err("can't online cpu %d because it is not configured as may-hotadd at boot time\n", cpu); return -EINVAL; } err = try_online_node(cpu_to_node(cpu)); if (err) return err; cpu_maps_update_begin(); if (cpu_hotplug_disabled) { err = -EBUSY; goto out; } if (!cpu_bootable(cpu)) { err = -EPERM; goto out; } err = _cpu_up(cpu, 0, target); out: cpu_maps_update_done(); return err; } /** * cpu_device_up - Bring up a cpu device * @dev: Pointer to the cpu device to online * * This function is meant to be used by device core cpu subsystem only. * * Other subsystems should use add_cpu() instead. * * Return: %0 on success or a negative errno code */ int cpu_device_up(struct device *dev) { return cpu_up(dev->id, CPUHP_ONLINE); } int add_cpu(unsigned int cpu) { int ret; lock_device_hotplug(); ret = device_online(get_cpu_device(cpu)); unlock_device_hotplug(); return ret; } EXPORT_SYMBOL_GPL(add_cpu); /** * bringup_hibernate_cpu - Bring up the CPU that we hibernated on * @sleep_cpu: The cpu we hibernated on and should be brought up. * * On some architectures like arm64, we can hibernate on any CPU, but on * wake up the CPU we hibernated on might be offline as a side effect of * using maxcpus= for example. * * Return: %0 on success or a negative errno code */ int bringup_hibernate_cpu(unsigned int sleep_cpu) { int ret; if (!cpu_online(sleep_cpu)) { pr_info("Hibernated on a CPU that is offline! Bringing CPU up.\n"); ret = cpu_up(sleep_cpu, CPUHP_ONLINE); if (ret) { pr_err("Failed to bring hibernate-CPU up!\n"); return ret; } } return 0; } static void __init cpuhp_bringup_mask(const struct cpumask *mask, unsigned int ncpus, enum cpuhp_state target) { unsigned int cpu; for_each_cpu(cpu, mask) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); if (cpu_up(cpu, target) && can_rollback_cpu(st)) { /* * If this failed then cpu_up() might have only * rolled back to CPUHP_BP_KICK_AP for the final * online. Clean it up. NOOP if already rolled back. */ WARN_ON(cpuhp_invoke_callback_range(false, cpu, st, CPUHP_OFFLINE)); } if (!--ncpus) break; } } #ifdef CONFIG_HOTPLUG_PARALLEL static bool __cpuhp_parallel_bringup __ro_after_init = true; static int __init parallel_bringup_parse_param(char *arg) { return kstrtobool(arg, &__cpuhp_parallel_bringup); } early_param("cpuhp.parallel", parallel_bringup_parse_param); #ifdef CONFIG_HOTPLUG_SMT static inline bool cpuhp_smt_aware(void) { return cpu_smt_max_threads > 1; } static inline const struct cpumask *cpuhp_get_primary_thread_mask(void) { return cpu_primary_thread_mask; } #else static inline bool cpuhp_smt_aware(void) { return false; } static inline const struct cpumask *cpuhp_get_primary_thread_mask(void) { return cpu_none_mask; } #endif bool __weak arch_cpuhp_init_parallel_bringup(void) { return true; } /* * On architectures which have enabled parallel bringup this invokes all BP * prepare states for each of the to be onlined APs first. The last state * sends the startup IPI to the APs. 
The APs proceed through the low level * bringup code in parallel and then wait for the control CPU to release * them one by one for the final onlining procedure. * * This avoids waiting for each AP to respond to the startup IPI in * CPUHP_BRINGUP_CPU. */ static bool __init cpuhp_bringup_cpus_parallel(unsigned int ncpus) { const struct cpumask *mask = cpu_present_mask; if (__cpuhp_parallel_bringup) __cpuhp_parallel_bringup = arch_cpuhp_init_parallel_bringup(); if (!__cpuhp_parallel_bringup) return false; if (cpuhp_smt_aware()) { const struct cpumask *pmask = cpuhp_get_primary_thread_mask(); static struct cpumask tmp_mask __initdata; /* * X86 requires to prevent that SMT siblings stopped while * the primary thread does a microcode update for various * reasons. Bring the primary threads up first. */ cpumask_and(&tmp_mask, mask, pmask); cpuhp_bringup_mask(&tmp_mask, ncpus, CPUHP_BP_KICK_AP); cpuhp_bringup_mask(&tmp_mask, ncpus, CPUHP_ONLINE); /* Account for the online CPUs */ ncpus -= num_online_cpus(); if (!ncpus) return true; /* Create the mask for secondary CPUs */ cpumask_andnot(&tmp_mask, mask, pmask); mask = &tmp_mask; } /* Bring the not-yet started CPUs up */ cpuhp_bringup_mask(mask, ncpus, CPUHP_BP_KICK_AP); cpuhp_bringup_mask(mask, ncpus, CPUHP_ONLINE); return true; } #else static inline bool cpuhp_bringup_cpus_parallel(unsigned int ncpus) { return false; } #endif /* CONFIG_HOTPLUG_PARALLEL */ void __init bringup_nonboot_cpus(unsigned int max_cpus) { if (!max_cpus) return; /* Try parallel bringup optimization if enabled */ if (cpuhp_bringup_cpus_parallel(max_cpus)) return; /* Full per CPU serialized bringup */ cpuhp_bringup_mask(cpu_present_mask, max_cpus, CPUHP_ONLINE); } #ifdef CONFIG_PM_SLEEP_SMP static cpumask_var_t frozen_cpus; int freeze_secondary_cpus(int primary) { int cpu, error = 0; cpu_maps_update_begin(); if (primary == -1) { primary = cpumask_first(cpu_online_mask); if (!housekeeping_cpu(primary, HK_TYPE_TIMER)) primary = housekeeping_any_cpu(HK_TYPE_TIMER); } else { if (!cpu_online(primary)) primary = cpumask_first(cpu_online_mask); } /* * We take down all of the non-boot CPUs in one shot to avoid races * with the userspace trying to use the CPU hotplug at the same time */ cpumask_clear(frozen_cpus); pr_info("Disabling non-boot CPUs ...\n"); for (cpu = nr_cpu_ids - 1; cpu >= 0; cpu--) { if (!cpu_online(cpu) || cpu == primary) continue; if (pm_wakeup_pending()) { pr_info("Wakeup pending. Abort CPU freeze\n"); error = -EBUSY; break; } trace_suspend_resume(TPS("CPU_OFF"), cpu, true); error = _cpu_down(cpu, 1, CPUHP_OFFLINE); trace_suspend_resume(TPS("CPU_OFF"), cpu, false); if (!error) cpumask_set_cpu(cpu, frozen_cpus); else { pr_err("Error taking CPU%d down: %d\n", cpu, error); break; } } if (!error) BUG_ON(num_online_cpus() > 1); else pr_err("Non-boot CPUs are not disabled\n"); /* * Make sure the CPUs won't be enabled by someone else. We need to do * this even in case of failure as all freeze_secondary_cpus() users are * supposed to do thaw_secondary_cpus() on the failure path. 
*/ cpu_hotplug_disabled++; cpu_maps_update_done(); return error; } void __weak arch_thaw_secondary_cpus_begin(void) { } void __weak arch_thaw_secondary_cpus_end(void) { } void thaw_secondary_cpus(void) { int cpu, error; /* Allow everyone to use the CPU hotplug again */ cpu_maps_update_begin(); __cpu_hotplug_enable(); if (cpumask_empty(frozen_cpus)) goto out; pr_info("Enabling non-boot CPUs ...\n"); arch_thaw_secondary_cpus_begin(); for_each_cpu(cpu, frozen_cpus) { trace_suspend_resume(TPS("CPU_ON"), cpu, true); error = _cpu_up(cpu, 1, CPUHP_ONLINE); trace_suspend_resume(TPS("CPU_ON"), cpu, false); if (!error) { pr_info("CPU%d is up\n", cpu); continue; } pr_warn("Error taking CPU%d up: %d\n", cpu, error); } arch_thaw_secondary_cpus_end(); cpumask_clear(frozen_cpus); out: cpu_maps_update_done(); } static int __init alloc_frozen_cpus(void) { if (!alloc_cpumask_var(&frozen_cpus, GFP_KERNEL|__GFP_ZERO)) return -ENOMEM; return 0; } core_initcall(alloc_frozen_cpus); /* * When callbacks for CPU hotplug notifications are being executed, we must * ensure that the state of the system with respect to the tasks being frozen * or not, as reported by the notification, remains unchanged *throughout the * duration* of the execution of the callbacks. * Hence we need to prevent the freezer from racing with regular CPU hotplug. * * This synchronization is implemented by mutually excluding regular CPU * hotplug and Suspend/Hibernate call paths by hooking onto the Suspend/ * Hibernate notifications. */ static int cpu_hotplug_pm_callback(struct notifier_block *nb, unsigned long action, void *ptr) { switch (action) { case PM_SUSPEND_PREPARE: case PM_HIBERNATION_PREPARE: cpu_hotplug_disable(); break; case PM_POST_SUSPEND: case PM_POST_HIBERNATION: cpu_hotplug_enable(); break; default: return NOTIFY_DONE; } return NOTIFY_OK; } static int __init cpu_hotplug_pm_sync_init(void) { /* * cpu_hotplug_pm_callback has higher priority than x86 * bsp_pm_callback which depends on cpu_hotplug_pm_callback * to disable cpu hotplug to avoid cpu hotplug race. 
*/ pm_notifier(cpu_hotplug_pm_callback, 0); return 0; } core_initcall(cpu_hotplug_pm_sync_init); #endif /* CONFIG_PM_SLEEP_SMP */ int __boot_cpu_id; #endif /* CONFIG_SMP */ /* Boot processor state steps */ static struct cpuhp_step cpuhp_hp_states[] = { [CPUHP_OFFLINE] = { .name = "offline", .startup.single = NULL, .teardown.single = NULL, }, #ifdef CONFIG_SMP [CPUHP_CREATE_THREADS]= { .name = "threads:prepare", .startup.single = smpboot_create_threads, .teardown.single = NULL, .cant_stop = true, }, [CPUHP_PERF_PREPARE] = { .name = "perf:prepare", .startup.single = perf_event_init_cpu, .teardown.single = perf_event_exit_cpu, }, [CPUHP_RANDOM_PREPARE] = { .name = "random:prepare", .startup.single = random_prepare_cpu, .teardown.single = NULL, }, [CPUHP_WORKQUEUE_PREP] = { .name = "workqueue:prepare", .startup.single = workqueue_prepare_cpu, .teardown.single = NULL, }, [CPUHP_HRTIMERS_PREPARE] = { .name = "hrtimers:prepare", .startup.single = hrtimers_prepare_cpu, .teardown.single = NULL, }, [CPUHP_SMPCFD_PREPARE] = { .name = "smpcfd:prepare", .startup.single = smpcfd_prepare_cpu, .teardown.single = smpcfd_dead_cpu, }, [CPUHP_RELAY_PREPARE] = { .name = "relay:prepare", .startup.single = relay_prepare_cpu, .teardown.single = NULL, }, [CPUHP_RCUTREE_PREP] = { .name = "RCU/tree:prepare", .startup.single = rcutree_prepare_cpu, .teardown.single = rcutree_dead_cpu, }, /* * On the tear-down path, timers_dead_cpu() must be invoked * before blk_mq_queue_reinit_notify() from notify_dead(), * otherwise a RCU stall occurs. */ [CPUHP_TIMERS_PREPARE] = { .name = "timers:prepare", .startup.single = timers_prepare_cpu, .teardown.single = timers_dead_cpu, }, #ifdef CONFIG_HOTPLUG_SPLIT_STARTUP /* * Kicks the AP alive. AP will wait in cpuhp_ap_sync_alive() until * the next step will release it. */ [CPUHP_BP_KICK_AP] = { .name = "cpu:kick_ap", .startup.single = cpuhp_kick_ap_alive, }, /* * Waits for the AP to reach cpuhp_ap_sync_alive() and then * releases it for the complete bringup. */ [CPUHP_BRINGUP_CPU] = { .name = "cpu:bringup", .startup.single = cpuhp_bringup_ap, .teardown.single = finish_cpu, .cant_stop = true, }, #else /* * All-in-one CPU bringup state which includes the kick alive. */ [CPUHP_BRINGUP_CPU] = { .name = "cpu:bringup", .startup.single = bringup_cpu, .teardown.single = finish_cpu, .cant_stop = true, }, #endif /* Final state before CPU kills itself */ [CPUHP_AP_IDLE_DEAD] = { .name = "idle:dead", }, /* * Last state before CPU enters the idle loop to die. Transient state * for synchronization. */ [CPUHP_AP_OFFLINE] = { .name = "ap:offline", .cant_stop = true, }, /* First state is scheduler control. Interrupts are disabled */ [CPUHP_AP_SCHED_STARTING] = { .name = "sched:starting", .startup.single = sched_cpu_starting, .teardown.single = sched_cpu_dying, }, [CPUHP_AP_RCUTREE_DYING] = { .name = "RCU/tree:dying", .startup.single = NULL, .teardown.single = rcutree_dying_cpu, }, [CPUHP_AP_SMPCFD_DYING] = { .name = "smpcfd:dying", .startup.single = NULL, .teardown.single = smpcfd_dying_cpu, }, [CPUHP_AP_HRTIMERS_DYING] = { .name = "hrtimers:dying", .startup.single = hrtimers_cpu_starting, .teardown.single = hrtimers_cpu_dying, }, [CPUHP_AP_TICK_DYING] = { .name = "tick:dying", .startup.single = NULL, .teardown.single = tick_cpu_dying, }, /* Entry state on starting. Interrupts enabled from here on. Transient * state for synchronsization */ [CPUHP_AP_ONLINE] = { .name = "ap:online", }, /* * Handled on control processor until the plugged processor manages * this itself. 
*/ [CPUHP_TEARDOWN_CPU] = { .name = "cpu:teardown", .startup.single = NULL, .teardown.single = takedown_cpu, .cant_stop = true, }, [CPUHP_AP_SCHED_WAIT_EMPTY] = { .name = "sched:waitempty", .startup.single = NULL, .teardown.single = sched_cpu_wait_empty, }, /* Handle smpboot threads park/unpark */ [CPUHP_AP_SMPBOOT_THREADS] = { .name = "smpboot/threads:online", .startup.single = smpboot_unpark_threads, .teardown.single = smpboot_park_threads, }, [CPUHP_AP_IRQ_AFFINITY_ONLINE] = { .name = "irq/affinity:online", .startup.single = irq_affinity_online_cpu, .teardown.single = NULL, }, [CPUHP_AP_PERF_ONLINE] = { .name = "perf:online", .startup.single = perf_event_init_cpu, .teardown.single = perf_event_exit_cpu, }, [CPUHP_AP_WATCHDOG_ONLINE] = { .name = "lockup_detector:online", .startup.single = lockup_detector_online_cpu, .teardown.single = lockup_detector_offline_cpu, }, [CPUHP_AP_WORKQUEUE_ONLINE] = { .name = "workqueue:online", .startup.single = workqueue_online_cpu, .teardown.single = workqueue_offline_cpu, }, [CPUHP_AP_RANDOM_ONLINE] = { .name = "random:online", .startup.single = random_online_cpu, .teardown.single = NULL, }, [CPUHP_AP_RCUTREE_ONLINE] = { .name = "RCU/tree:online", .startup.single = rcutree_online_cpu, .teardown.single = rcutree_offline_cpu, }, #endif /* * The dynamically registered state space is here */ #ifdef CONFIG_SMP /* Last state is scheduler control setting the cpu active */ [CPUHP_AP_ACTIVE] = { .name = "sched:active", .startup.single = sched_cpu_activate, .teardown.single = sched_cpu_deactivate, }, #endif /* CPU is fully up and running. */ [CPUHP_ONLINE] = { .name = "online", .startup.single = NULL, .teardown.single = NULL, }, }; /* Sanity check for callbacks */ static int cpuhp_cb_check(enum cpuhp_state state) { if (state <= CPUHP_OFFLINE || state >= CPUHP_ONLINE) return -EINVAL; return 0; } /* * Returns a free for dynamic slot assignment of the Online state. The states * are protected by the cpuhp_slot_states mutex and an empty slot is identified * by having no name assigned. */ static int cpuhp_reserve_state(enum cpuhp_state state) { enum cpuhp_state i, end; struct cpuhp_step *step; switch (state) { case CPUHP_AP_ONLINE_DYN: step = cpuhp_hp_states + CPUHP_AP_ONLINE_DYN; end = CPUHP_AP_ONLINE_DYN_END; break; case CPUHP_BP_PREPARE_DYN: step = cpuhp_hp_states + CPUHP_BP_PREPARE_DYN; end = CPUHP_BP_PREPARE_DYN_END; break; default: return -EINVAL; } for (i = state; i <= end; i++, step++) { if (!step->name) return i; } WARN(1, "No more dynamic states available for CPU hotplug\n"); return -ENOSPC; } static int cpuhp_store_callbacks(enum cpuhp_state state, const char *name, int (*startup)(unsigned int cpu), int (*teardown)(unsigned int cpu), bool multi_instance) { /* (Un)Install the callbacks for further cpu hotplug operations */ struct cpuhp_step *sp; int ret = 0; /* * If name is NULL, then the state gets removed. * * CPUHP_AP_ONLINE_DYN and CPUHP_BP_PREPARE_DYN are handed out on * the first allocation from these dynamic ranges, so the removal * would trigger a new allocation and clear the wrong (already * empty) state, leaving the callbacks of the to be cleared state * dangling, which causes wreckage on the next hotplug operation. 
*/ if (name && (state == CPUHP_AP_ONLINE_DYN || state == CPUHP_BP_PREPARE_DYN)) { ret = cpuhp_reserve_state(state); if (ret < 0) return ret; state = ret; } sp = cpuhp_get_step(state); if (name && sp->name) return -EBUSY; sp->startup.single = startup; sp->teardown.single = teardown; sp->name = name; sp->multi_instance = multi_instance; INIT_HLIST_HEAD(&sp->list); return ret; } static void *cpuhp_get_teardown_cb(enum cpuhp_state state) { return cpuhp_get_step(state)->teardown.single; } /* * Call the startup/teardown function for a step either on the AP or * on the current CPU. */ static int cpuhp_issue_call(int cpu, enum cpuhp_state state, bool bringup, struct hlist_node *node) { struct cpuhp_step *sp = cpuhp_get_step(state); int ret; /* * If there's nothing to do, we done. * Relies on the union for multi_instance. */ if (cpuhp_step_empty(bringup, sp)) return 0; /* * The non AP bound callbacks can fail on bringup. On teardown * e.g. module removal we crash for now. */ #ifdef CONFIG_SMP if (cpuhp_is_ap_state(state)) ret = cpuhp_invoke_ap_callback(cpu, state, bringup, node); else ret = cpuhp_invoke_callback(cpu, state, bringup, node, NULL); #else ret = cpuhp_invoke_callback(cpu, state, bringup, node, NULL); #endif BUG_ON(ret && !bringup); return ret; } /* * Called from __cpuhp_setup_state on a recoverable failure. * * Note: The teardown callbacks for rollback are not allowed to fail! */ static void cpuhp_rollback_install(int failedcpu, enum cpuhp_state state, struct hlist_node *node) { int cpu; /* Roll back the already executed steps on the other cpus */ for_each_present_cpu(cpu) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); int cpustate = st->state; if (cpu >= failedcpu) break; /* Did we invoke the startup call on that cpu ? */ if (cpustate >= state) cpuhp_issue_call(cpu, state, false, node); } } int __cpuhp_state_add_instance_cpuslocked(enum cpuhp_state state, struct hlist_node *node, bool invoke) { struct cpuhp_step *sp; int cpu; int ret; lockdep_assert_cpus_held(); sp = cpuhp_get_step(state); if (sp->multi_instance == false) return -EINVAL; mutex_lock(&cpuhp_state_mutex); if (!invoke || !sp->startup.multi) goto add_node; /* * Try to call the startup callback for each present cpu * depending on the hotplug state of the cpu. */ for_each_present_cpu(cpu) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); int cpustate = st->state; if (cpustate < state) continue; ret = cpuhp_issue_call(cpu, state, true, node); if (ret) { if (sp->teardown.multi) cpuhp_rollback_install(cpu, state, node); goto unlock; } } add_node: ret = 0; hlist_add_head(node, &sp->list); unlock: mutex_unlock(&cpuhp_state_mutex); return ret; } int __cpuhp_state_add_instance(enum cpuhp_state state, struct hlist_node *node, bool invoke) { int ret; cpus_read_lock(); ret = __cpuhp_state_add_instance_cpuslocked(state, node, invoke); cpus_read_unlock(); return ret; } EXPORT_SYMBOL_GPL(__cpuhp_state_add_instance); /** * __cpuhp_setup_state_cpuslocked - Setup the callbacks for an hotplug machine state * @state: The state to setup * @name: Name of the step * @invoke: If true, the startup function is invoked for cpus where * cpu state >= @state * @startup: startup callback function * @teardown: teardown callback function * @multi_instance: State is set up for multiple instances which get * added afterwards. * * The caller needs to hold cpus read locked while calling this function. 
* Return: * On success: * Positive state number if @state is CPUHP_AP_ONLINE_DYN or CPUHP_BP_PREPARE_DYN; * 0 for all other states * On failure: proper (negative) error code */ int __cpuhp_setup_state_cpuslocked(enum cpuhp_state state, const char *name, bool invoke, int (*startup)(unsigned int cpu), int (*teardown)(unsigned int cpu), bool multi_instance) { int cpu, ret = 0; bool dynstate; lockdep_assert_cpus_held(); if (cpuhp_cb_check(state) || !name) return -EINVAL; mutex_lock(&cpuhp_state_mutex); ret = cpuhp_store_callbacks(state, name, startup, teardown, multi_instance); dynstate = state == CPUHP_AP_ONLINE_DYN || state == CPUHP_BP_PREPARE_DYN; if (ret > 0 && dynstate) { state = ret; ret = 0; } if (ret || !invoke || !startup) goto out; /* * Try to call the startup callback for each present cpu * depending on the hotplug state of the cpu. */ for_each_present_cpu(cpu) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); int cpustate = st->state; if (cpustate < state) continue; ret = cpuhp_issue_call(cpu, state, true, NULL); if (ret) { if (teardown) cpuhp_rollback_install(cpu, state, NULL); cpuhp_store_callbacks(state, NULL, NULL, NULL, false); goto out; } } out: mutex_unlock(&cpuhp_state_mutex); /* * If the requested state is CPUHP_AP_ONLINE_DYN or CPUHP_BP_PREPARE_DYN, * return the dynamically allocated state in case of success. */ if (!ret && dynstate) return state; return ret; } EXPORT_SYMBOL(__cpuhp_setup_state_cpuslocked); int __cpuhp_setup_state(enum cpuhp_state state, const char *name, bool invoke, int (*startup)(unsigned int cpu), int (*teardown)(unsigned int cpu), bool multi_instance) { int ret; cpus_read_lock(); ret = __cpuhp_setup_state_cpuslocked(state, name, invoke, startup, teardown, multi_instance); cpus_read_unlock(); return ret; } EXPORT_SYMBOL(__cpuhp_setup_state); int __cpuhp_state_remove_instance(enum cpuhp_state state, struct hlist_node *node, bool invoke) { struct cpuhp_step *sp = cpuhp_get_step(state); int cpu; BUG_ON(cpuhp_cb_check(state)); if (!sp->multi_instance) return -EINVAL; cpus_read_lock(); mutex_lock(&cpuhp_state_mutex); if (!invoke || !cpuhp_get_teardown_cb(state)) goto remove; /* * Call the teardown callback for each present cpu depending * on the hotplug state of the cpu. This function is not * allowed to fail currently! */ for_each_present_cpu(cpu) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); int cpustate = st->state; if (cpustate >= state) cpuhp_issue_call(cpu, state, false, node); } remove: hlist_del(node); mutex_unlock(&cpuhp_state_mutex); cpus_read_unlock(); return 0; } EXPORT_SYMBOL_GPL(__cpuhp_state_remove_instance); /** * __cpuhp_remove_state_cpuslocked - Remove the callbacks for an hotplug machine state * @state: The state to remove * @invoke: If true, the teardown function is invoked for cpus where * cpu state >= @state * * The caller needs to hold cpus read locked while calling this function. * The teardown callback is currently not allowed to fail. Think * about module removal! */ void __cpuhp_remove_state_cpuslocked(enum cpuhp_state state, bool invoke) { struct cpuhp_step *sp = cpuhp_get_step(state); int cpu; BUG_ON(cpuhp_cb_check(state)); lockdep_assert_cpus_held(); mutex_lock(&cpuhp_state_mutex); if (sp->multi_instance) { WARN(!hlist_empty(&sp->list), "Error: Removing state %d which has instances left.\n", state); goto remove; } if (!invoke || !cpuhp_get_teardown_cb(state)) goto remove; /* * Call the teardown callback for each present cpu depending * on the hotplug state of the cpu. 
This function is not * allowed to fail currently! */ for_each_present_cpu(cpu) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); int cpustate = st->state; if (cpustate >= state) cpuhp_issue_call(cpu, state, false, NULL); } remove: cpuhp_store_callbacks(state, NULL, NULL, NULL, false); mutex_unlock(&cpuhp_state_mutex); } EXPORT_SYMBOL(__cpuhp_remove_state_cpuslocked); void __cpuhp_remove_state(enum cpuhp_state state, bool invoke) { cpus_read_lock(); __cpuhp_remove_state_cpuslocked(state, invoke); cpus_read_unlock(); } EXPORT_SYMBOL(__cpuhp_remove_state); #ifdef CONFIG_HOTPLUG_SMT static void cpuhp_offline_cpu_device(unsigned int cpu) { struct device *dev = get_cpu_device(cpu); dev->offline = true; /* Tell user space about the state change */ kobject_uevent(&dev->kobj, KOBJ_OFFLINE); } static void cpuhp_online_cpu_device(unsigned int cpu) { struct device *dev = get_cpu_device(cpu); dev->offline = false; /* Tell user space about the state change */ kobject_uevent(&dev->kobj, KOBJ_ONLINE); } int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval) { int cpu, ret = 0; cpu_maps_update_begin(); for_each_online_cpu(cpu) { if (topology_is_primary_thread(cpu)) continue; /* * Disable can be called with CPU_SMT_ENABLED when changing * from a higher to lower number of SMT threads per core. */ if (ctrlval == CPU_SMT_ENABLED && cpu_smt_thread_allowed(cpu)) continue; ret = cpu_down_maps_locked(cpu, CPUHP_OFFLINE); if (ret) break; /* * As this needs to hold the cpu maps lock it's impossible * to call device_offline() because that ends up calling * cpu_down() which takes cpu maps lock. cpu maps lock * needs to be held as this might race against in kernel * abusers of the hotplug machinery (thermal management). * * So nothing would update device:offline state. That would * leave the sysfs entry stale and prevent onlining after * smt control has been changed to 'off' again. This is * called under the sysfs hotplug lock, so it is properly * serialized against the regular offline usage. 
*/ cpuhp_offline_cpu_device(cpu); } if (!ret) cpu_smt_control = ctrlval; cpu_maps_update_done(); return ret; } /* Check if the core a CPU belongs to is online */ #if !defined(topology_is_core_online) static inline bool topology_is_core_online(unsigned int cpu) { return true; } #endif int cpuhp_smt_enable(void) { int cpu, ret = 0; cpu_maps_update_begin(); cpu_smt_control = CPU_SMT_ENABLED; for_each_present_cpu(cpu) { /* Skip online CPUs and CPUs on offline nodes */ if (cpu_online(cpu) || !node_online(cpu_to_node(cpu))) continue; if (!cpu_smt_thread_allowed(cpu) || !topology_is_core_online(cpu)) continue; ret = _cpu_up(cpu, 0, CPUHP_ONLINE); if (ret) break; /* See comment in cpuhp_smt_disable() */ cpuhp_online_cpu_device(cpu); } cpu_maps_update_done(); return ret; } #endif #if defined(CONFIG_SYSFS) && defined(CONFIG_HOTPLUG_CPU) static ssize_t state_show(struct device *dev, struct device_attribute *attr, char *buf) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id); return sprintf(buf, "%d\n", st->state); } static DEVICE_ATTR_RO(state); static ssize_t target_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id); struct cpuhp_step *sp; int target, ret; ret = kstrtoint(buf, 10, &target); if (ret) return ret; #ifdef CONFIG_CPU_HOTPLUG_STATE_CONTROL if (target < CPUHP_OFFLINE || target > CPUHP_ONLINE) return -EINVAL; #else if (target != CPUHP_OFFLINE && target != CPUHP_ONLINE) return -EINVAL; #endif ret = lock_device_hotplug_sysfs(); if (ret) return ret; mutex_lock(&cpuhp_state_mutex); sp = cpuhp_get_step(target); ret = !sp->name || sp->cant_stop ? -EINVAL : 0; mutex_unlock(&cpuhp_state_mutex); if (ret) goto out; if (st->state < target) ret = cpu_up(dev->id, target); else if (st->state > target) ret = cpu_down(dev->id, target); else if (WARN_ON(st->target != target)) st->target = target; out: unlock_device_hotplug(); return ret ? ret : count; } static ssize_t target_show(struct device *dev, struct device_attribute *attr, char *buf) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id); return sprintf(buf, "%d\n", st->target); } static DEVICE_ATTR_RW(target); static ssize_t fail_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id); struct cpuhp_step *sp; int fail, ret; ret = kstrtoint(buf, 10, &fail); if (ret) return ret; if (fail == CPUHP_INVALID) { st->fail = fail; return count; } if (fail < CPUHP_OFFLINE || fail > CPUHP_ONLINE) return -EINVAL; /* * Cannot fail STARTING/DYING callbacks. */ if (cpuhp_is_atomic_state(fail)) return -EINVAL; /* * DEAD callbacks cannot fail... * ... neither can CPUHP_BRINGUP_CPU during hotunplug. The latter * triggering STARTING callbacks, a failure in this state would * hinder rollback. */ if (fail <= CPUHP_BRINGUP_CPU && st->state > CPUHP_BRINGUP_CPU) return -EINVAL; /* * Cannot fail anything that doesn't have callbacks. 
*/ mutex_lock(&cpuhp_state_mutex); sp = cpuhp_get_step(fail); if (!sp->startup.single && !sp->teardown.single) ret = -EINVAL; mutex_unlock(&cpuhp_state_mutex); if (ret) return ret; st->fail = fail; return count; } static ssize_t fail_show(struct device *dev, struct device_attribute *attr, char *buf) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id); return sprintf(buf, "%d\n", st->fail); } static DEVICE_ATTR_RW(fail); static struct attribute *cpuhp_cpu_attrs[] = { &dev_attr_state.attr, &dev_attr_target.attr, &dev_attr_fail.attr, NULL }; static const struct attribute_group cpuhp_cpu_attr_group = { .attrs = cpuhp_cpu_attrs, .name = "hotplug", }; static ssize_t states_show(struct device *dev, struct device_attribute *attr, char *buf) { ssize_t cur, res = 0; int i; mutex_lock(&cpuhp_state_mutex); for (i = CPUHP_OFFLINE; i <= CPUHP_ONLINE; i++) { struct cpuhp_step *sp = cpuhp_get_step(i); if (sp->name) { cur = sprintf(buf, "%3d: %s\n", i, sp->name); buf += cur; res += cur; } } mutex_unlock(&cpuhp_state_mutex); return res; } static DEVICE_ATTR_RO(states); static struct attribute *cpuhp_cpu_root_attrs[] = { &dev_attr_states.attr, NULL }; static const struct attribute_group cpuhp_cpu_root_attr_group = { .attrs = cpuhp_cpu_root_attrs, .name = "hotplug", }; #ifdef CONFIG_HOTPLUG_SMT static bool cpu_smt_num_threads_valid(unsigned int threads) { if (IS_ENABLED(CONFIG_SMT_NUM_THREADS_DYNAMIC)) return threads >= 1 && threads <= cpu_smt_max_threads; return threads == 1 || threads == cpu_smt_max_threads; } static ssize_t __store_smt_control(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { int ctrlval, ret, num_threads, orig_threads; bool force_off; if (cpu_smt_control == CPU_SMT_FORCE_DISABLED) return -EPERM; if (cpu_smt_control == CPU_SMT_NOT_SUPPORTED) return -ENODEV; if (sysfs_streq(buf, "on")) { ctrlval = CPU_SMT_ENABLED; num_threads = cpu_smt_max_threads; } else if (sysfs_streq(buf, "off")) { ctrlval = CPU_SMT_DISABLED; num_threads = 1; } else if (sysfs_streq(buf, "forceoff")) { ctrlval = CPU_SMT_FORCE_DISABLED; num_threads = 1; } else if (kstrtoint(buf, 10, &num_threads) == 0) { if (num_threads == 1) ctrlval = CPU_SMT_DISABLED; else if (cpu_smt_num_threads_valid(num_threads)) ctrlval = CPU_SMT_ENABLED; else return -EINVAL; } else { return -EINVAL; } ret = lock_device_hotplug_sysfs(); if (ret) return ret; orig_threads = cpu_smt_num_threads; cpu_smt_num_threads = num_threads; force_off = ctrlval != cpu_smt_control && ctrlval == CPU_SMT_FORCE_DISABLED; if (num_threads > orig_threads) ret = cpuhp_smt_enable(); else if (num_threads < orig_threads || force_off) ret = cpuhp_smt_disable(ctrlval); unlock_device_hotplug(); return ret ? ret : count; } #else /* !CONFIG_HOTPLUG_SMT */ static ssize_t __store_smt_control(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { return -ENODEV; } #endif /* CONFIG_HOTPLUG_SMT */ static const char *smt_states[] = { [CPU_SMT_ENABLED] = "on", [CPU_SMT_DISABLED] = "off", [CPU_SMT_FORCE_DISABLED] = "forceoff", [CPU_SMT_NOT_SUPPORTED] = "notsupported", [CPU_SMT_NOT_IMPLEMENTED] = "notimplemented", }; static ssize_t control_show(struct device *dev, struct device_attribute *attr, char *buf) { const char *state = smt_states[cpu_smt_control]; #ifdef CONFIG_HOTPLUG_SMT /* * If SMT is enabled but not all threads are enabled then show the * number of threads. If all threads are enabled show "on". Otherwise * show the state name. 
*/ if (cpu_smt_control == CPU_SMT_ENABLED && cpu_smt_num_threads != cpu_smt_max_threads) return sysfs_emit(buf, "%d\n", cpu_smt_num_threads); #endif return sysfs_emit(buf, "%s\n", state); } static ssize_t control_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { return __store_smt_control(dev, attr, buf, count); } static DEVICE_ATTR_RW(control); static ssize_t active_show(struct device *dev, struct device_attribute *attr, char *buf) { return sysfs_emit(buf, "%d\n", sched_smt_active()); } static DEVICE_ATTR_RO(active); static struct attribute *cpuhp_smt_attrs[] = { &dev_attr_control.attr, &dev_attr_active.attr, NULL }; static const struct attribute_group cpuhp_smt_attr_group = { .attrs = cpuhp_smt_attrs, .name = "smt", }; static int __init cpu_smt_sysfs_init(void) { struct device *dev_root; int ret = -ENODEV; dev_root = bus_get_dev_root(&cpu_subsys); if (dev_root) { ret = sysfs_create_group(&dev_root->kobj, &cpuhp_smt_attr_group); put_device(dev_root); } return ret; } static int __init cpuhp_sysfs_init(void) { struct device *dev_root; int cpu, ret; ret = cpu_smt_sysfs_init(); if (ret) return ret; dev_root = bus_get_dev_root(&cpu_subsys); if (dev_root) { ret = sysfs_create_group(&dev_root->kobj, &cpuhp_cpu_root_attr_group); put_device(dev_root); if (ret) return ret; } for_each_possible_cpu(cpu) { struct device *dev = get_cpu_device(cpu); if (!dev) continue; ret = sysfs_create_group(&dev->kobj, &cpuhp_cpu_attr_group); if (ret) return ret; } return 0; } device_initcall(cpuhp_sysfs_init); #endif /* CONFIG_SYSFS && CONFIG_HOTPLUG_CPU */ /* * cpu_bit_bitmap[] is a special, "compressed" data structure that * represents all NR_CPUS bits binary values of 1<<nr. * * It is used by cpumask_of() to get a constant address to a CPU * mask value that has a single bit set only. 
*/ /* cpu_bit_bitmap[0] is empty - so we can back into it */ #define MASK_DECLARE_1(x) [x+1][0] = (1UL << (x)) #define MASK_DECLARE_2(x) MASK_DECLARE_1(x), MASK_DECLARE_1(x+1) #define MASK_DECLARE_4(x) MASK_DECLARE_2(x), MASK_DECLARE_2(x+2) #define MASK_DECLARE_8(x) MASK_DECLARE_4(x), MASK_DECLARE_4(x+4) const unsigned long cpu_bit_bitmap[BITS_PER_LONG+1][BITS_TO_LONGS(NR_CPUS)] = { MASK_DECLARE_8(0), MASK_DECLARE_8(8), MASK_DECLARE_8(16), MASK_DECLARE_8(24), #if BITS_PER_LONG > 32 MASK_DECLARE_8(32), MASK_DECLARE_8(40), MASK_DECLARE_8(48), MASK_DECLARE_8(56), #endif }; EXPORT_SYMBOL_GPL(cpu_bit_bitmap); const DECLARE_BITMAP(cpu_all_bits, NR_CPUS) = CPU_BITS_ALL; EXPORT_SYMBOL(cpu_all_bits); #ifdef CONFIG_INIT_ALL_POSSIBLE struct cpumask __cpu_possible_mask __ro_after_init = {CPU_BITS_ALL}; #else struct cpumask __cpu_possible_mask __ro_after_init; #endif EXPORT_SYMBOL(__cpu_possible_mask); struct cpumask __cpu_online_mask __read_mostly; EXPORT_SYMBOL(__cpu_online_mask); struct cpumask __cpu_enabled_mask __read_mostly; EXPORT_SYMBOL(__cpu_enabled_mask); struct cpumask __cpu_present_mask __read_mostly; EXPORT_SYMBOL(__cpu_present_mask); struct cpumask __cpu_active_mask __read_mostly; EXPORT_SYMBOL(__cpu_active_mask); struct cpumask __cpu_dying_mask __read_mostly; EXPORT_SYMBOL(__cpu_dying_mask); atomic_t __num_online_cpus __read_mostly; EXPORT_SYMBOL(__num_online_cpus); void init_cpu_present(const struct cpumask *src) { cpumask_copy(&__cpu_present_mask, src); } void init_cpu_possible(const struct cpumask *src) { cpumask_copy(&__cpu_possible_mask, src); } void set_cpu_online(unsigned int cpu, bool online) { /* * atomic_inc/dec() is required to handle the horrid abuse of this * function by the reboot and kexec code which invoke it from * IPI/NMI broadcasts when shutting down CPUs. Invocation from * regular CPU hotplug is properly serialized. * * Note, that the fact that __num_online_cpus is of type atomic_t * does not protect readers which are not serialized against * concurrent hotplug operations. */ if (online) { if (!cpumask_test_and_set_cpu(cpu, &__cpu_online_mask)) atomic_inc(&__num_online_cpus); } else { if (cpumask_test_and_clear_cpu(cpu, &__cpu_online_mask)) atomic_dec(&__num_online_cpus); } } /* * Activate the first processor. */ void __init boot_cpu_init(void) { int cpu = smp_processor_id(); /* Mark the boot cpu "present", "online" etc for SMP and UP case */ set_cpu_online(cpu, true); set_cpu_active(cpu, true); set_cpu_present(cpu, true); set_cpu_possible(cpu, true); #ifdef CONFIG_SMP __boot_cpu_id = cpu; #endif } /* * Must be called _AFTER_ setting up the per_cpu areas */ void __init boot_cpu_hotplug_init(void) { #ifdef CONFIG_SMP cpumask_set_cpu(smp_processor_id(), &cpus_booted_once_mask); atomic_set(this_cpu_ptr(&cpuhp_state.ap_sync_state), SYNC_STATE_ONLINE); #endif this_cpu_write(cpuhp_state.state, CPUHP_ONLINE); this_cpu_write(cpuhp_state.target, CPUHP_ONLINE); } #ifdef CONFIG_CPU_MITIGATIONS /* * These are used for a global "mitigations=" cmdline option for toggling * optional CPU mitigations. 
*/ enum cpu_mitigations { CPU_MITIGATIONS_OFF, CPU_MITIGATIONS_AUTO, CPU_MITIGATIONS_AUTO_NOSMT, }; static enum cpu_mitigations cpu_mitigations __ro_after_init = CPU_MITIGATIONS_AUTO; static int __init mitigations_parse_cmdline(char *arg) { if (!strcmp(arg, "off")) cpu_mitigations = CPU_MITIGATIONS_OFF; else if (!strcmp(arg, "auto")) cpu_mitigations = CPU_MITIGATIONS_AUTO; else if (!strcmp(arg, "auto,nosmt")) cpu_mitigations = CPU_MITIGATIONS_AUTO_NOSMT; else pr_crit("Unsupported mitigations=%s, system may still be vulnerable\n", arg); return 0; } /* mitigations=off */ bool cpu_mitigations_off(void) { return cpu_mitigations == CPU_MITIGATIONS_OFF; } EXPORT_SYMBOL_GPL(cpu_mitigations_off); /* mitigations=auto,nosmt */ bool cpu_mitigations_auto_nosmt(void) { return cpu_mitigations == CPU_MITIGATIONS_AUTO_NOSMT; } EXPORT_SYMBOL_GPL(cpu_mitigations_auto_nosmt); #else static int __init mitigations_parse_cmdline(char *arg) { pr_crit("Kernel compiled without mitigations, ignoring 'mitigations'; system may still be vulnerable\n"); return 0; } #endif early_param("mitigations", mitigations_parse_cmdline); |
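The dynamic-state plumbing above (cpuhp_reserve_state(), cpuhp_store_callbacks(), __cpuhp_setup_state() and friends) is what ordinary subsystems reach through the cpuhp_setup_state()/cpuhp_remove_state() wrappers declared in <linux/cpuhotplug.h>. The sketch below is not part of kernel/cpu.c; it is a minimal, hypothetical client (all example_* names are invented) showing how a driver would claim a CPUHP_AP_ONLINE_DYN slot and have its callbacks run by the hotplug thread on each CPU.

#include <linux/cpuhotplug.h>
#include <linux/module.h>
#include <linux/printk.h>

static enum cpuhp_state example_state;

/* Runs on the CPU that is coming up, once it reaches the dynamic AP state. */
static int example_cpu_online(unsigned int cpu)
{
	pr_info("example: CPU%u reached the dynamic online state\n", cpu);
	return 0;
}

/* Runs on teardown, before the CPU drops below the dynamic AP state. */
static int example_cpu_offline(unsigned int cpu)
{
	pr_info("example: CPU%u leaving the dynamic online state\n", cpu);
	return 0;
}

static int __init example_init(void)
{
	int ret;

	/*
	 * CPUHP_AP_ONLINE_DYN makes cpuhp_reserve_state() pick a free slot;
	 * the allocated state number is returned so it can be handed back to
	 * cpuhp_remove_state() later. Because the wrapper passes invoke=true,
	 * the startup callback is run right away on every present CPU that
	 * has already passed the new state.
	 */
	ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "example:online",
				example_cpu_online, example_cpu_offline);
	if (ret < 0)
		return ret;

	example_state = ret;
	return 0;
}

static void __exit example_exit(void)
{
	/* Invokes example_cpu_offline() on all online CPUs, then frees the slot. */
	cpuhp_remove_state(example_state);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");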
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 2013 Huawei Ltd.
 * Author: Jiang Liu <liuj97@gmail.com>
 *
 * Based on arch/arm/include/asm/jump_label.h
 */
#ifndef __ASM_JUMP_LABEL_H
#define __ASM_JUMP_LABEL_H

#ifndef __ASSEMBLY__

#include <linux/types.h>
#include <asm/insn.h>

#define HAVE_JUMP_LABEL_BATCH
#define JUMP_LABEL_NOP_SIZE		AARCH64_INSN_SIZE

#define JUMP_TABLE_ENTRY(key, label)			\
	".pushsection	__jump_table, \"aw\"\n\t"	\
	".align		3\n\t"				\
	".long		1b - ., " label " - .\n\t"	\
	".quad		" key " - .\n\t"		\
	".popsection\n\t"

/* This macro is also expanded on the Rust side. */
#define ARCH_STATIC_BRANCH_ASM(key, label)		\
	"1:	nop\n\t"				\
	JUMP_TABLE_ENTRY(key, label)

static __always_inline bool arch_static_branch(struct static_key * const key,
					       const bool branch)
{
	char *k = &((char *)key)[branch];

	asm goto(
		ARCH_STATIC_BRANCH_ASM("%c0", "%l[l_yes]")
		: : "i"(k) : : l_yes
		);

	return false;
l_yes:
	return true;
}

static __always_inline bool arch_static_branch_jump(struct static_key * const key,
						    const bool branch)
{
	char *k = &((char *)key)[branch];

	asm goto(
		"1:	b	%l[l_yes]	\n\t"
		JUMP_TABLE_ENTRY("%c0", "%l[l_yes]")
		: : "i"(k) : : l_yes
		);

	return false;
l_yes:
	return true;
}

#endif  /* __ASSEMBLY__ */
#endif	/* __ASM_JUMP_LABEL_H */
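arch_static_branch() and arch_static_branch_jump() above are never called directly; they sit underneath the generic static-key API in <linux/jump_label.h>. The following sketch is a hypothetical user of that generic API (the example_* names are invented, not from the header) to show what the NOP/branch patching buys a hot path.

#include <linux/jump_label.h>
#include <linux/printk.h>

/* Hypothetical key guarding an optional, normally-disabled feature. */
static DEFINE_STATIC_KEY_FALSE(example_feature_key);

static void example_slow_path(void)
{
	pr_info("example: optional feature engaged\n");
}

void example_fast_path(void)
{
	/*
	 * Compiles down to the "1: nop" emitted by ARCH_STATIC_BRANCH_ASM
	 * plus a __jump_table entry; the branch to the slow path only exists
	 * after the key is enabled and the site has been patched.
	 */
	if (static_branch_unlikely(&example_feature_key))
		example_slow_path();
}

void example_enable_feature(void)
{
	/* Rewrites every site referencing the key via its __jump_table entries. */
	static_branch_enable(&example_feature_key);
}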
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM timer

#if !defined(_TRACE_TIMER_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_TIMER_H

#include <linux/tracepoint.h>
#include <linux/hrtimer.h>
#include <linux/timer.h>

DECLARE_EVENT_CLASS(timer_class,

	TP_PROTO(struct timer_list *timer),

	TP_ARGS(timer),

	TP_STRUCT__entry(
		__field( void *,	timer	)
	),

	TP_fast_assign(
		__entry->timer	= timer;
	),

	TP_printk("timer=%p", __entry->timer)
);

/**
 * timer_init - called when the timer is initialized
 * @timer:	pointer to struct timer_list
 */
DEFINE_EVENT(timer_class, timer_init,

	TP_PROTO(struct timer_list *timer),

	TP_ARGS(timer)
);

#define decode_timer_flags(flags)			\
	__print_flags(flags, "|",			\
		{  TIMER_MIGRATING,	"M" },		\
		{  TIMER_DEFERRABLE,	"D" },		\
		{  TIMER_PINNED,	"P" },		\
		{  TIMER_IRQSAFE,	"I" })

/**
 * timer_start - called when the timer is started
 * @timer:		pointer to struct timer_list
 * @bucket_expiry:	the bucket expiry time
 */
TRACE_EVENT(timer_start,

	TP_PROTO(struct timer_list *timer, unsigned long bucket_expiry),

	TP_ARGS(timer, bucket_expiry),

	TP_STRUCT__entry(
		__field( void *,	timer		)
		__field( void *,	function	)
		__field( unsigned long,	expires		)
		__field( unsigned long,	bucket_expiry	)
		__field( unsigned long,	now		)
		__field( unsigned int,	flags		)
	),

	TP_fast_assign(
		__entry->timer		= timer;
		__entry->function	= timer->function;
		__entry->expires	= timer->expires;
		__entry->bucket_expiry	= bucket_expiry;
		__entry->now		= jiffies;
		__entry->flags		= timer->flags;
	),

	TP_printk("timer=%p function=%ps expires=%lu [timeout=%ld] bucket_expiry=%lu cpu=%u idx=%u flags=%s",
		  __entry->timer, __entry->function, __entry->expires,
		  (long)__entry->expires - __entry->now, __entry->bucket_expiry,
		  __entry->flags & TIMER_CPUMASK,
		  __entry->flags >> TIMER_ARRAYSHIFT,
		  decode_timer_flags(__entry->flags &
TIMER_TRACE_FLAGMASK)) ); /** * timer_expire_entry - called immediately before the timer callback * @timer: pointer to struct timer_list * @baseclk: value of timer_base::clk when timer expires * * Allows to determine the timer latency. */ TRACE_EVENT(timer_expire_entry, TP_PROTO(struct timer_list *timer, unsigned long baseclk), TP_ARGS(timer, baseclk), TP_STRUCT__entry( __field( void *, timer ) __field( unsigned long, now ) __field( void *, function) __field( unsigned long, baseclk ) ), TP_fast_assign( __entry->timer = timer; __entry->now = jiffies; __entry->function = timer->function; __entry->baseclk = baseclk; ), TP_printk("timer=%p function=%ps now=%lu baseclk=%lu", __entry->timer, __entry->function, __entry->now, __entry->baseclk) ); /** * timer_expire_exit - called immediately after the timer callback returns * @timer: pointer to struct timer_list * * When used in combination with the timer_expire_entry tracepoint we can * determine the runtime of the timer callback function. * * NOTE: Do NOT dereference timer in TP_fast_assign. The pointer might * be invalid. We solely track the pointer. */ DEFINE_EVENT(timer_class, timer_expire_exit, TP_PROTO(struct timer_list *timer), TP_ARGS(timer) ); /** * timer_cancel - called when the timer is canceled * @timer: pointer to struct timer_list */ DEFINE_EVENT(timer_class, timer_cancel, TP_PROTO(struct timer_list *timer), TP_ARGS(timer) ); TRACE_EVENT(timer_base_idle, TP_PROTO(bool is_idle, unsigned int cpu), TP_ARGS(is_idle, cpu), TP_STRUCT__entry( __field( bool, is_idle ) __field( unsigned int, cpu ) ), TP_fast_assign( __entry->is_idle = is_idle; __entry->cpu = cpu; ), TP_printk("is_idle=%d cpu=%d", __entry->is_idle, __entry->cpu) ); #define decode_clockid(type) \ __print_symbolic(type, \ { CLOCK_REALTIME, "CLOCK_REALTIME" }, \ { CLOCK_MONOTONIC, "CLOCK_MONOTONIC" }, \ { CLOCK_BOOTTIME, "CLOCK_BOOTTIME" }, \ { CLOCK_TAI, "CLOCK_TAI" }) #define decode_hrtimer_mode(mode) \ __print_symbolic(mode, \ { HRTIMER_MODE_ABS, "ABS" }, \ { HRTIMER_MODE_REL, "REL" }, \ { HRTIMER_MODE_ABS_PINNED, "ABS|PINNED" }, \ { HRTIMER_MODE_REL_PINNED, "REL|PINNED" }, \ { HRTIMER_MODE_ABS_SOFT, "ABS|SOFT" }, \ { HRTIMER_MODE_REL_SOFT, "REL|SOFT" }, \ { HRTIMER_MODE_ABS_PINNED_SOFT, "ABS|PINNED|SOFT" }, \ { HRTIMER_MODE_REL_PINNED_SOFT, "REL|PINNED|SOFT" }, \ { HRTIMER_MODE_ABS_HARD, "ABS|HARD" }, \ { HRTIMER_MODE_REL_HARD, "REL|HARD" }, \ { HRTIMER_MODE_ABS_PINNED_HARD, "ABS|PINNED|HARD" }, \ { HRTIMER_MODE_REL_PINNED_HARD, "REL|PINNED|HARD" }) /** * hrtimer_init - called when the hrtimer is initialized * @hrtimer: pointer to struct hrtimer * @clockid: the hrtimers clock * @mode: the hrtimers mode */ TRACE_EVENT(hrtimer_init, TP_PROTO(struct hrtimer *hrtimer, clockid_t clockid, enum hrtimer_mode mode), TP_ARGS(hrtimer, clockid, mode), TP_STRUCT__entry( __field( void *, hrtimer ) __field( clockid_t, clockid ) __field( enum hrtimer_mode, mode ) ), TP_fast_assign( __entry->hrtimer = hrtimer; __entry->clockid = clockid; __entry->mode = mode; ), TP_printk("hrtimer=%p clockid=%s mode=%s", __entry->hrtimer, decode_clockid(__entry->clockid), decode_hrtimer_mode(__entry->mode)) ); /** * hrtimer_start - called when the hrtimer is started * @hrtimer: pointer to struct hrtimer * @mode: the hrtimers mode */ TRACE_EVENT(hrtimer_start, TP_PROTO(struct hrtimer *hrtimer, enum hrtimer_mode mode), TP_ARGS(hrtimer, mode), TP_STRUCT__entry( __field( void *, hrtimer ) __field( void *, function ) __field( s64, expires ) __field( s64, softexpires ) __field( enum hrtimer_mode, mode ) ), 
TP_fast_assign( __entry->hrtimer = hrtimer; __entry->function = hrtimer->function; __entry->expires = hrtimer_get_expires(hrtimer); __entry->softexpires = hrtimer_get_softexpires(hrtimer); __entry->mode = mode; ), TP_printk("hrtimer=%p function=%ps expires=%llu softexpires=%llu " "mode=%s", __entry->hrtimer, __entry->function, (unsigned long long) __entry->expires, (unsigned long long) __entry->softexpires, decode_hrtimer_mode(__entry->mode)) ); /** * hrtimer_expire_entry - called immediately before the hrtimer callback * @hrtimer: pointer to struct hrtimer * @now: pointer to variable which contains current time of the * timers base. * * Allows to determine the timer latency. */ TRACE_EVENT(hrtimer_expire_entry, TP_PROTO(struct hrtimer *hrtimer, ktime_t *now), TP_ARGS(hrtimer, now), TP_STRUCT__entry( __field( void *, hrtimer ) __field( s64, now ) __field( void *, function) ), TP_fast_assign( __entry->hrtimer = hrtimer; __entry->now = *now; __entry->function = hrtimer->function; ), TP_printk("hrtimer=%p function=%ps now=%llu", __entry->hrtimer, __entry->function, (unsigned long long) __entry->now) ); DECLARE_EVENT_CLASS(hrtimer_class, TP_PROTO(struct hrtimer *hrtimer), TP_ARGS(hrtimer), TP_STRUCT__entry( __field( void *, hrtimer ) ), TP_fast_assign( __entry->hrtimer = hrtimer; ), TP_printk("hrtimer=%p", __entry->hrtimer) ); /** * hrtimer_expire_exit - called immediately after the hrtimer callback returns * @hrtimer: pointer to struct hrtimer * * When used in combination with the hrtimer_expire_entry tracepoint we can * determine the runtime of the callback function. */ DEFINE_EVENT(hrtimer_class, hrtimer_expire_exit, TP_PROTO(struct hrtimer *hrtimer), TP_ARGS(hrtimer) ); /** * hrtimer_cancel - called when the hrtimer is canceled * @hrtimer: pointer to struct hrtimer */ DEFINE_EVENT(hrtimer_class, hrtimer_cancel, TP_PROTO(struct hrtimer *hrtimer), TP_ARGS(hrtimer) ); /** * itimer_state - called when itimer is started or canceled * @which: name of the interval timer * @value: the itimers value, itimer is canceled if value->it_value is * zero, otherwise it is started * @expires: the itimers expiry time */ TRACE_EVENT(itimer_state, TP_PROTO(int which, const struct itimerspec64 *const value, unsigned long long expires), TP_ARGS(which, value, expires), TP_STRUCT__entry( __field( int, which ) __field( unsigned long long, expires ) __field( long, value_sec ) __field( long, value_nsec ) __field( long, interval_sec ) __field( long, interval_nsec ) ), TP_fast_assign( __entry->which = which; __entry->expires = expires; __entry->value_sec = value->it_value.tv_sec; __entry->value_nsec = value->it_value.tv_nsec; __entry->interval_sec = value->it_interval.tv_sec; __entry->interval_nsec = value->it_interval.tv_nsec; ), TP_printk("which=%d expires=%llu it_value=%ld.%06ld it_interval=%ld.%06ld", __entry->which, __entry->expires, __entry->value_sec, __entry->value_nsec / NSEC_PER_USEC, __entry->interval_sec, __entry->interval_nsec / NSEC_PER_USEC) ); /** * itimer_expire - called when itimer expires * @which: type of the interval timer * @pid: pid of the process which owns the timer * @now: current time, used to calculate the latency of itimer */ TRACE_EVENT(itimer_expire, TP_PROTO(int which, struct pid *pid, unsigned long long now), TP_ARGS(which, pid, now), TP_STRUCT__entry( __field( int , which ) __field( pid_t, pid ) __field( unsigned long long, now ) ), TP_fast_assign( __entry->which = which; __entry->now = now; __entry->pid = pid_nr(pid); ), TP_printk("which=%d pid=%d now=%llu", __entry->which, (int) 
__entry->pid, __entry->now) ); #ifdef CONFIG_NO_HZ_COMMON #define TICK_DEP_NAMES \ tick_dep_mask_name(NONE) \ tick_dep_name(POSIX_TIMER) \ tick_dep_name(PERF_EVENTS) \ tick_dep_name(SCHED) \ tick_dep_name(CLOCK_UNSTABLE) \ tick_dep_name(RCU) \ tick_dep_name_end(RCU_EXP) #undef tick_dep_name #undef tick_dep_mask_name #undef tick_dep_name_end /* The MASK will convert to their bits and they need to be processed too */ #define tick_dep_name(sdep) TRACE_DEFINE_ENUM(TICK_DEP_BIT_##sdep); \ TRACE_DEFINE_ENUM(TICK_DEP_MASK_##sdep); #define tick_dep_name_end(sdep) TRACE_DEFINE_ENUM(TICK_DEP_BIT_##sdep); \ TRACE_DEFINE_ENUM(TICK_DEP_MASK_##sdep); /* NONE only has a mask defined for it */ #define tick_dep_mask_name(sdep) TRACE_DEFINE_ENUM(TICK_DEP_MASK_##sdep); TICK_DEP_NAMES #undef tick_dep_name #undef tick_dep_mask_name #undef tick_dep_name_end #define tick_dep_name(sdep) { TICK_DEP_MASK_##sdep, #sdep }, #define tick_dep_mask_name(sdep) { TICK_DEP_MASK_##sdep, #sdep }, #define tick_dep_name_end(sdep) { TICK_DEP_MASK_##sdep, #sdep } #define show_tick_dep_name(val) \ __print_symbolic(val, TICK_DEP_NAMES) TRACE_EVENT(tick_stop, TP_PROTO(int success, int dependency), TP_ARGS(success, dependency), TP_STRUCT__entry( __field( int , success ) __field( int , dependency ) ), TP_fast_assign( __entry->success = success; __entry->dependency = dependency; ), TP_printk("success=%d dependency=%s", __entry->success, \ show_tick_dep_name(__entry->dependency)) ); #endif #endif /* _TRACE_TIMER_H */ /* This part must be outside protection */ #include <trace/define_trace.h> |
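/*
 * Editor's illustrative sketch (not part of the header above): how kernel code
 * could attach a probe to the timer_start tracepoint defined above. The
 * register_trace_timer_start()/unregister_trace_timer_start() helpers are the
 * ones generated by the TRACE_EVENT() machinery; whether they are reachable
 * from a module depends on the tracepoint being exported, so treat this purely
 * as a demonstration of the probe signature (the TP_PROTO() arguments preceded
 * by a private-data pointer). The probe and module names here are made up.
 */
#include <linux/module.h>
#include <linux/printk.h>
#include <linux/timer.h>
#include <trace/events/timer.h>

/* Called each time trace_timer_start() fires, i.e. whenever a timer is armed. */
static void probe_timer_start(void *data, struct timer_list *timer,
                              unsigned long bucket_expiry)
{
        pr_debug("timer %p armed: expires=%lu bucket_expiry=%lu\n",
                 timer, timer->expires, bucket_expiry);
}

static int __init timer_start_probe_init(void)
{
        /* NULL is the private data handed back to the probe as @data. */
        return register_trace_timer_start(probe_timer_start, NULL);
}

static void __exit timer_start_probe_exit(void)
{
        unregister_trace_timer_start(probe_timer_start, NULL);
}

module_init(timer_start_probe_init);
module_exit(timer_start_probe_exit);
MODULE_DESCRIPTION("Illustrative timer_start tracepoint probe");
MODULE_LICENSE("GPL");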
// SPDX-License-Identifier: GPL-2.0
/*
 * HugeTLB Vmemmap Optimization (HVO)
 *
 * Copyright (c) 2020, ByteDance. All rights reserved.
 *
 * Author: Muchun Song <songmuchun@bytedance.com>
 */
#ifndef _LINUX_HUGETLB_VMEMMAP_H
#define _LINUX_HUGETLB_VMEMMAP_H
#include <linux/hugetlb.h>

/*
 * Reserve one vmemmap page, all vmemmap addresses are mapped to it. See
 * Documentation/mm/vmemmap_dedup.rst.
 */
#define HUGETLB_VMEMMAP_RESERVE_SIZE	PAGE_SIZE
#define HUGETLB_VMEMMAP_RESERVE_PAGES	(HUGETLB_VMEMMAP_RESERVE_SIZE / sizeof(struct page))

#ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
int hugetlb_vmemmap_restore_folio(const struct hstate *h, struct folio *folio);
long hugetlb_vmemmap_restore_folios(const struct hstate *h,
                        struct list_head *folio_list,
                        struct list_head *non_hvo_folios);
void hugetlb_vmemmap_optimize_folio(const struct hstate *h, struct folio *folio);
void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_list);

static inline unsigned int hugetlb_vmemmap_size(const struct hstate *h)
{
        return pages_per_huge_page(h) * sizeof(struct page);
}

/*
 * Return how many vmemmap size associated with a HugeTLB page that can be
 * optimized and can be freed to the buddy allocator.
 */
static inline unsigned int hugetlb_vmemmap_optimizable_size(const struct hstate *h)
{
        int size = hugetlb_vmemmap_size(h) - HUGETLB_VMEMMAP_RESERVE_SIZE;

        if (!is_power_of_2(sizeof(struct page)))
                return 0;

        return size > 0 ? size : 0;
}
#else
static inline int hugetlb_vmemmap_restore_folio(const struct hstate *h, struct folio *folio)
{
        return 0;
}

static inline long hugetlb_vmemmap_restore_folios(const struct hstate *h,
                        struct list_head *folio_list,
                        struct list_head *non_hvo_folios)
{
        list_splice_init(folio_list, non_hvo_folios);
        return 0;
}

static inline void hugetlb_vmemmap_optimize_folio(const struct hstate *h, struct folio *folio)
{
}

static inline void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_list)
{
}

static inline unsigned int hugetlb_vmemmap_optimizable_size(const struct hstate *h)
{
        return 0;
}
#endif /* CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP */

static inline bool hugetlb_vmemmap_optimizable(const struct hstate *h)
{
        return hugetlb_vmemmap_optimizable_size(h) != 0;
}
#endif /* _LINUX_HUGETLB_VMEMMAP_H */
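/*
 * Editor's worked example (not kernel code): the savings that
 * hugetlb_vmemmap_optimizable_size() above computes, mirrored as a tiny
 * stand-alone userspace program. The 4 KiB base page size and the 64-byte
 * struct page are assumptions for illustration; both are architecture and
 * config dependent, so only the shape of the arithmetic carries over.
 *
 * With those assumptions, a 2 MiB HugeTLB page needs 512 * 64 B = 32 KiB of
 * vmemmap (8 pages); one page is kept as the shared mapping, so 28 KiB is
 * freeable. A 1 GiB page needs 16 MiB of vmemmap, all but 4 KiB of it freeable.
 */
#include <stdio.h>

#define EXAMPLE_PAGE_SIZE       4096UL  /* assumed base page size */
#define EXAMPLE_STRUCT_PAGE_SZ  64UL    /* assumed sizeof(struct page) */

static unsigned long example_optimizable_size(unsigned long huge_page_size)
{
        unsigned long vmemmap = (huge_page_size / EXAMPLE_PAGE_SIZE) * EXAMPLE_STRUCT_PAGE_SZ;

        /* One vmemmap page is always kept, like HUGETLB_VMEMMAP_RESERVE_SIZE above. */
        return vmemmap > EXAMPLE_PAGE_SIZE ? vmemmap - EXAMPLE_PAGE_SIZE : 0;
}

int main(void)
{
        printf("2 MiB huge page: %lu bytes of vmemmap freeable\n",
               example_optimizable_size(2UL << 20));    /* 28672 */
        printf("1 GiB huge page: %lu bytes of vmemmap freeable\n",
               example_optimizable_size(1UL << 30));    /* 16773120 */
        return 0;
}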
// SPDX-License-Identifier: GPL-2.0 #include <linux/mm.h> #include <linux/mmzone.h> #include <linux/memblock.h> #include <linux/page_ext.h> #include <linux/memory.h> #include <linux/vmalloc.h> #include <linux/kmemleak.h> #include <linux/page_owner.h> #include <linux/page_idle.h> #include <linux/page_table_check.h> #include <linux/rcupdate.h> #include <linux/pgalloc_tag.h> /* * struct page extension * * This is the feature to manage memory for extended data per page. * * Until now, we must modify struct page itself to store extra data per page. * This requires rebuilding the kernel and it is really time consuming process. * And, sometimes, rebuild is impossible due to third party module dependency. * At last, enlarging struct page could cause un-wanted system behaviour change. * * This feature is intended to overcome above mentioned problems. This feature * allocates memory for extended data per page in certain place rather than * the struct page itself. This memory can be accessed by the accessor * functions provided by this code. During the boot process, it checks whether * allocation of huge chunk of memory is needed or not. If not, it avoids * allocating memory at all. With this advantage, we can include this feature * into the kernel in default and can avoid rebuild and solve related problems. * * To help these things to work well, there are two callbacks for clients.
One * is the need callback which is mandatory if user wants to avoid useless * memory allocation at boot-time. The other is optional, init callback, which * is used to do proper initialization after memory is allocated. * * The need callback is used to decide whether extended memory allocation is * needed or not. Sometimes users want to deactivate some features in this * boot and extra memory would be unnecessary. In this case, to avoid * allocating huge chunk of memory, each clients represent their need of * extra memory through the need callback. If one of the need callbacks * returns true, it means that someone needs extra memory so that * page extension core should allocates memory for page extension. If * none of need callbacks return true, memory isn't needed at all in this boot * and page extension core can skip to allocate memory. As result, * none of memory is wasted. * * When need callback returns true, page_ext checks if there is a request for * extra memory through size in struct page_ext_operations. If it is non-zero, * extra space is allocated for each page_ext entry and offset is returned to * user through offset in struct page_ext_operations. * * The init callback is used to do proper initialization after page extension * is completely initialized. In sparse memory system, extra memory is * allocated some time later than memmap is allocated. In other words, lifetime * of memory for page extension isn't same with memmap for struct page. * Therefore, clients can't store extra data until page extension is * initialized, even if pages are allocated and used freely. This could * cause inadequate state of extra data per page, so, to prevent it, client * can utilize this callback to initialize the state of it correctly. */ #ifdef CONFIG_SPARSEMEM #define PAGE_EXT_INVALID (0x1) #endif #if defined(CONFIG_PAGE_IDLE_FLAG) && !defined(CONFIG_64BIT) static bool need_page_idle(void) { return true; } static struct page_ext_operations page_idle_ops __initdata = { .need = need_page_idle, .need_shared_flags = true, }; #endif static struct page_ext_operations *page_ext_ops[] __initdata = { #ifdef CONFIG_PAGE_OWNER &page_owner_ops, #endif #if defined(CONFIG_PAGE_IDLE_FLAG) && !defined(CONFIG_64BIT) &page_idle_ops, #endif #ifdef CONFIG_MEM_ALLOC_PROFILING &page_alloc_tagging_ops, #endif #ifdef CONFIG_PAGE_TABLE_CHECK &page_table_check_ops, #endif }; unsigned long page_ext_size; static unsigned long total_usage; #ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG /* * To ensure correct allocation tagging for pages, page_ext should be available * before the first page allocation. Otherwise early task stacks will be * allocated before page_ext initialization and missing tags will be flagged. 
*/ bool early_page_ext __meminitdata = true; #else bool early_page_ext __meminitdata; #endif static int __init setup_early_page_ext(char *str) { early_page_ext = true; return 0; } early_param("early_page_ext", setup_early_page_ext); static bool __init invoke_need_callbacks(void) { int i; int entries = ARRAY_SIZE(page_ext_ops); bool need = false; for (i = 0; i < entries; i++) { if (page_ext_ops[i]->need()) { if (page_ext_ops[i]->need_shared_flags) { page_ext_size = sizeof(struct page_ext); break; } } } for (i = 0; i < entries; i++) { if (page_ext_ops[i]->need()) { page_ext_ops[i]->offset = page_ext_size; page_ext_size += page_ext_ops[i]->size; need = true; } } return need; } static void __init invoke_init_callbacks(void) { int i; int entries = ARRAY_SIZE(page_ext_ops); for (i = 0; i < entries; i++) { if (page_ext_ops[i]->init) page_ext_ops[i]->init(); } } static inline struct page_ext *get_entry(void *base, unsigned long index) { return base + page_ext_size * index; } #ifndef CONFIG_SPARSEMEM void __init page_ext_init_flatmem_late(void) { invoke_init_callbacks(); } void __meminit pgdat_page_ext_init(struct pglist_data *pgdat) { pgdat->node_page_ext = NULL; } static struct page_ext *lookup_page_ext(const struct page *page) { unsigned long pfn = page_to_pfn(page); unsigned long index; struct page_ext *base; WARN_ON_ONCE(!rcu_read_lock_held()); base = NODE_DATA(page_to_nid(page))->node_page_ext; /* * The sanity checks the page allocator does upon freeing a * page can reach here before the page_ext arrays are * allocated when feeding a range of pages to the allocator * for the first time during bootup or memory hotplug. */ if (unlikely(!base)) return NULL; index = pfn - round_down(node_start_pfn(page_to_nid(page)), MAX_ORDER_NR_PAGES); return get_entry(base, index); } static int __init alloc_node_page_ext(int nid) { struct page_ext *base; unsigned long table_size; unsigned long nr_pages; nr_pages = NODE_DATA(nid)->node_spanned_pages; if (!nr_pages) return 0; /* * Need extra space if node range is not aligned with * MAX_ORDER_NR_PAGES. When page allocator's buddy algorithm * checks buddy's status, range could be out of exact node range. 
*/ if (!IS_ALIGNED(node_start_pfn(nid), MAX_ORDER_NR_PAGES) || !IS_ALIGNED(node_end_pfn(nid), MAX_ORDER_NR_PAGES)) nr_pages += MAX_ORDER_NR_PAGES; table_size = page_ext_size * nr_pages; base = memblock_alloc_try_nid( table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS), MEMBLOCK_ALLOC_ACCESSIBLE, nid); if (!base) return -ENOMEM; NODE_DATA(nid)->node_page_ext = base; total_usage += table_size; memmap_boot_pages_add(DIV_ROUND_UP(table_size, PAGE_SIZE)); return 0; } void __init page_ext_init_flatmem(void) { int nid, fail; if (!invoke_need_callbacks()) return; for_each_online_node(nid) { fail = alloc_node_page_ext(nid); if (fail) goto fail; } pr_info("allocated %ld bytes of page_ext\n", total_usage); return; fail: pr_crit("allocation of page_ext failed.\n"); panic("Out of memory"); } #else /* CONFIG_SPARSEMEM */ static bool page_ext_invalid(struct page_ext *page_ext) { return !page_ext || (((unsigned long)page_ext & PAGE_EXT_INVALID) == PAGE_EXT_INVALID); } static struct page_ext *lookup_page_ext(const struct page *page) { unsigned long pfn = page_to_pfn(page); struct mem_section *section = __pfn_to_section(pfn); struct page_ext *page_ext = READ_ONCE(section->page_ext); WARN_ON_ONCE(!rcu_read_lock_held()); /* * The sanity checks the page allocator does upon freeing a * page can reach here before the page_ext arrays are * allocated when feeding a range of pages to the allocator * for the first time during bootup or memory hotplug. */ if (page_ext_invalid(page_ext)) return NULL; return get_entry(page_ext, pfn); } static void *__meminit alloc_page_ext(size_t size, int nid) { gfp_t flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN; void *addr = NULL; addr = alloc_pages_exact_nid(nid, size, flags); if (addr) kmemleak_alloc(addr, size, 1, flags); else addr = vzalloc_node(size, nid); if (addr) memmap_pages_add(DIV_ROUND_UP(size, PAGE_SIZE)); return addr; } static int __meminit init_section_page_ext(unsigned long pfn, int nid) { struct mem_section *section; struct page_ext *base; unsigned long table_size; section = __pfn_to_section(pfn); if (section->page_ext) return 0; table_size = page_ext_size * PAGES_PER_SECTION; base = alloc_page_ext(table_size, nid); /* * The value stored in section->page_ext is (base - pfn) * and it does not point to the memory block allocated above, * causing kmemleak false positives. */ kmemleak_not_leak(base); if (!base) { pr_err("page ext allocation failure\n"); return -ENOMEM; } /* * The passed "pfn" may not be aligned to SECTION. For the calculation * we need to apply a mask. */ pfn &= PAGE_SECTION_MASK; section->page_ext = (void *)base - page_ext_size * pfn; total_usage += table_size; return 0; } static void free_page_ext(void *addr) { size_t table_size; struct page *page; table_size = page_ext_size * PAGES_PER_SECTION; memmap_pages_add(-1L * (DIV_ROUND_UP(table_size, PAGE_SIZE))); if (is_vmalloc_addr(addr)) { vfree(addr); } else { page = virt_to_page(addr); BUG_ON(PageReserved(page)); kmemleak_free(addr); free_pages_exact(addr, table_size); } } static void __free_page_ext(unsigned long pfn) { struct mem_section *ms; struct page_ext *base; ms = __pfn_to_section(pfn); if (!ms || !ms->page_ext) return; base = READ_ONCE(ms->page_ext); /* * page_ext here can be valid while doing the roll back * operation in online_page_ext(). 
*/ if (page_ext_invalid(base)) base = (void *)base - PAGE_EXT_INVALID; WRITE_ONCE(ms->page_ext, NULL); base = get_entry(base, pfn); free_page_ext(base); } static void __invalidate_page_ext(unsigned long pfn) { struct mem_section *ms; void *val; ms = __pfn_to_section(pfn); if (!ms || !ms->page_ext) return; val = (void *)ms->page_ext + PAGE_EXT_INVALID; WRITE_ONCE(ms->page_ext, val); } static int __meminit online_page_ext(unsigned long start_pfn, unsigned long nr_pages, int nid) { unsigned long start, end, pfn; int fail = 0; start = SECTION_ALIGN_DOWN(start_pfn); end = SECTION_ALIGN_UP(start_pfn + nr_pages); if (nid == NUMA_NO_NODE) { /* * In this case, "nid" already exists and contains valid memory. * "start_pfn" passed to us is a pfn which is an arg for * online__pages(), and start_pfn should exist. */ nid = pfn_to_nid(start_pfn); VM_BUG_ON(!node_online(nid)); } for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION) fail = init_section_page_ext(pfn, nid); if (!fail) return 0; /* rollback */ end = pfn - PAGES_PER_SECTION; for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) __free_page_ext(pfn); return -ENOMEM; } static void __meminit offline_page_ext(unsigned long start_pfn, unsigned long nr_pages) { unsigned long start, end, pfn; start = SECTION_ALIGN_DOWN(start_pfn); end = SECTION_ALIGN_UP(start_pfn + nr_pages); /* * Freeing of page_ext is done in 3 steps to avoid * use-after-free of it: * 1) Traverse all the sections and mark their page_ext * as invalid. * 2) Wait for all the existing users of page_ext who * started before invalidation to finish. * 3) Free the page_ext. */ for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) __invalidate_page_ext(pfn); synchronize_rcu(); for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) __free_page_ext(pfn); } static int __meminit page_ext_callback(struct notifier_block *self, unsigned long action, void *arg) { struct memory_notify *mn = arg; int ret = 0; switch (action) { case MEM_GOING_ONLINE: ret = online_page_ext(mn->start_pfn, mn->nr_pages, mn->status_change_nid); break; case MEM_OFFLINE: offline_page_ext(mn->start_pfn, mn->nr_pages); break; case MEM_CANCEL_ONLINE: offline_page_ext(mn->start_pfn, mn->nr_pages); break; case MEM_GOING_OFFLINE: break; case MEM_ONLINE: case MEM_CANCEL_OFFLINE: break; } return notifier_from_errno(ret); } void __init page_ext_init(void) { unsigned long pfn; int nid; if (!invoke_need_callbacks()) return; for_each_node_state(nid, N_MEMORY) { unsigned long start_pfn, end_pfn; start_pfn = node_start_pfn(nid); end_pfn = node_end_pfn(nid); /* * start_pfn and end_pfn may not be aligned to SECTION and the * page->flags of out of node pages are not initialized. So we * scan [start_pfn, the biggest section's pfn < end_pfn) here. */ for (pfn = start_pfn; pfn < end_pfn; pfn = ALIGN(pfn + 1, PAGES_PER_SECTION)) { if (!pfn_valid(pfn)) continue; /* * Nodes's pfns can be overlapping. * We know some arch can have a nodes layout such as * -------------pfn--------------> * N0 | N1 | N2 | N0 | N1 | N2|.... */ if (pfn_to_nid(pfn) != nid) continue; if (init_section_page_ext(pfn, nid)) goto oom; cond_resched(); } } hotplug_memory_notifier(page_ext_callback, DEFAULT_CALLBACK_PRI); pr_info("allocated %ld bytes of page_ext\n", total_usage); invoke_init_callbacks(); return; oom: panic("Out of memory"); } void __meminit pgdat_page_ext_init(struct pglist_data *pgdat) { } #endif /** * page_ext_get() - Get the extended information for a page. * @page: The page we're interested in. 
* * Ensures that the page_ext will remain valid until page_ext_put() * is called. * * Return: NULL if no page_ext exists for this page. * Context: Any context. Caller may not sleep until they have called * page_ext_put(). */ struct page_ext *page_ext_get(const struct page *page) { struct page_ext *page_ext; rcu_read_lock(); page_ext = lookup_page_ext(page); if (!page_ext) { rcu_read_unlock(); return NULL; } return page_ext; } /** * page_ext_put() - Working with page extended information is done. * @page_ext: Page extended information received from page_ext_get(). * * The page extended information of the page may not be valid after this * function is called. * * Return: None. * Context: Any context with corresponding page_ext_get() is called. */ void page_ext_put(struct page_ext *page_ext) { if (unlikely(!page_ext)) return; rcu_read_unlock(); } |
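/*
 * Editor's illustrative sketch (not part of page_ext.c): a hypothetical
 * page_ext client wired up the way the comment at the top of this file
 * describes. Everything named "foo" is invented for the example; a real
 * client (page_owner, page_table_check, ...) follows the same pattern and,
 * in addition, must be listed in the page_ext_ops[] array above so that its
 * need()/init() callbacks run and its .offset gets assigned at boot.
 */
#include <linux/mm.h>
#include <linux/page_ext.h>

/* Example per-page payload stored in the extra space requested via .size. */
struct foo_ext {
        unsigned long stamp;
};

static bool need_foo(void)
{
        /* A real client would typically gate this on a boot parameter. */
        return true;
}

static void init_foo(void)
{
        /* Runs once page_ext storage exists for all pages. */
}

struct page_ext_operations foo_ext_ops = {
        .size   = sizeof(struct foo_ext),
        .need   = need_foo,
        .init   = init_foo,
};

/* Translate a page's page_ext into this client's slot, via the assigned offset. */
static struct foo_ext *get_foo_ext(struct page_ext *page_ext)
{
        return (void *)page_ext + foo_ext_ops.offset;
}

/* Usage under the page_ext_get()/page_ext_put() protocol defined above. */
static void __maybe_unused foo_stamp_page(struct page *page, unsigned long stamp)
{
        struct page_ext *page_ext = page_ext_get(page);

        if (!page_ext)
                return;
        get_foo_ext(page_ext)->stamp = stamp;
        page_ext_put(page_ext);
}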
8042 8043 8044 8045 8046 8047 8048 8049 8050 8051 8052 8053 8054 8055 8056 8057 8058 8059 8060 8061 8062 8063 8064 8065 8066 8067 8068 8069 8070 8071 8072 8073 8074 8075 8076 8077 8078 8079 8080 8081 8082 8083 8084 8085 8086 8087 8088 8089 8090 8091 8092 8093 8094 8095 8096 8097 8098 8099 8100 8101 8102 8103 8104 8105 8106 8107 8108 8109 8110 8111 8112 8113 8114 8115 8116 8117 8118 8119 8120 8121 8122 8123 8124 8125 8126 8127 8128 8129 8130 8131 8132 8133 8134 8135 8136 8137 8138 8139 8140 8141 8142 8143 8144 8145 8146 8147 8148 8149 8150 8151 8152 8153 8154 8155 8156 8157 8158 8159 8160 8161 8162 8163 8164 8165 8166 8167 8168 8169 8170 8171 8172 8173 8174 8175 8176 8177 8178 8179 8180 8181 8182 8183 8184 8185 8186 8187 8188 8189 8190 8191 8192 8193 8194 8195 8196 8197 8198 8199 8200 8201 8202 8203 8204 8205 8206 8207 8208 8209 8210 8211 8212 8213 8214 8215 8216 8217 8218 8219 8220 8221 8222 8223 8224 8225 8226 8227 8228 8229 8230 8231 8232 8233 8234 8235 8236 8237 8238 8239 8240 8241 8242 8243 8244 8245 8246 8247 8248 8249 8250 8251 8252 8253 8254 8255 8256 8257 8258 8259 8260 8261 8262 8263 8264 8265 8266 8267 8268 8269 8270 8271 8272 8273 8274 8275 8276 8277 8278 8279 8280 8281 8282 8283 8284 8285 8286 8287 8288 8289 8290 8291 8292 8293 8294 8295 8296 8297 8298 8299 8300 8301 8302 8303 8304 8305 8306 8307 8308 8309 8310 8311 8312 8313 8314 8315 8316 8317 8318 8319 8320 8321 8322 8323 8324 8325 8326 8327 8328 8329 8330 8331 8332 8333 8334 8335 8336 8337 8338 8339 8340 8341 8342 8343 8344 8345 8346 8347 8348 8349 8350 8351 8352 8353 8354 8355 8356 8357 8358 8359 8360 8361 8362 8363 8364 8365 8366 8367 8368 8369 8370 8371 8372 8373 8374 8375 8376 8377 8378 8379 8380 8381 8382 8383 8384 8385 8386 8387 8388 8389 8390 8391 8392 8393 8394 8395 8396 8397 8398 8399 8400 8401 8402 8403 8404 8405 8406 8407 8408 8409 8410 8411 8412 8413 8414 8415 8416 8417 8418 8419 8420 8421 8422 8423 8424 8425 8426 8427 8428 8429 8430 8431 8432 8433 8434 8435 8436 8437 8438 8439 8440 8441 8442 8443 8444 8445 8446 8447 8448 8449 8450 8451 8452 8453 8454 8455 8456 8457 8458 8459 8460 8461 8462 8463 8464 8465 8466 8467 8468 8469 8470 8471 8472 8473 8474 8475 8476 8477 8478 8479 8480 8481 8482 8483 8484 8485 8486 8487 8488 8489 8490 8491 8492 8493 8494 8495 8496 8497 8498 8499 8500 8501 8502 8503 8504 8505 8506 8507 8508 8509 8510 8511 8512 8513 8514 8515 8516 8517 8518 8519 8520 8521 8522 8523 8524 8525 8526 8527 8528 8529 8530 8531 8532 8533 8534 8535 8536 8537 8538 8539 8540 8541 8542 8543 8544 8545 8546 8547 8548 8549 8550 8551 8552 8553 8554 8555 8556 8557 8558 8559 8560 8561 8562 8563 8564 8565 8566 8567 8568 8569 8570 8571 8572 8573 8574 8575 8576 8577 8578 8579 8580 8581 8582 8583 8584 8585 8586 8587 8588 8589 8590 8591 8592 8593 8594 8595 8596 8597 8598 8599 8600 8601 8602 8603 8604 8605 8606 8607 8608 8609 8610 8611 8612 8613 8614 8615 8616 8617 8618 8619 8620 8621 8622 8623 8624 8625 8626 8627 8628 8629 8630 8631 8632 8633 8634 8635 8636 8637 8638 8639 8640 8641 8642 8643 8644 8645 8646 8647 8648 8649 8650 8651 8652 8653 8654 8655 8656 8657 8658 8659 8660 8661 8662 8663 8664 8665 8666 8667 8668 8669 8670 8671 8672 8673 8674 8675 8676 8677 8678 8679 8680 8681 8682 8683 8684 8685 8686 8687 8688 8689 8690 8691 8692 8693 8694 8695 8696 8697 8698 8699 8700 8701 8702 8703 8704 8705 8706 8707 8708 8709 8710 8711 8712 8713 8714 8715 8716 8717 8718 8719 8720 8721 8722 8723 8724 8725 8726 8727 8728 8729 8730 8731 8732 8733 8734 8735 8736 8737 8738 8739 8740 8741 8742 8743 8744 8745 8746 8747 8748 8749 8750 8751 8752 
8753 8754 8755 8756 8757 8758 8759 8760 8761 8762 8763 8764 8765 8766 8767 8768 8769 8770 8771 8772 8773 8774 8775 8776 8777 8778 8779 8780 8781 8782 8783 8784 8785 8786 8787 8788 8789 8790 8791 8792 8793 8794 8795 8796 8797 8798 8799 8800 8801 8802 8803 8804 8805 8806 8807 8808 8809 8810 8811 8812 8813 8814 8815 8816 8817 8818 8819 8820 8821 8822 8823 8824 8825 8826 8827 8828 8829 8830 8831 8832 8833 8834 8835 8836 8837 8838 8839 8840 8841 8842 8843 8844 8845 8846 8847 8848 8849 8850 8851 8852 8853 8854 8855 8856 8857 8858 8859 8860 8861 8862 8863 8864 8865 8866 8867 8868 8869 8870 8871 8872 8873 8874 8875 8876 8877 8878 8879 8880 8881 8882 8883 8884 8885 8886 8887 8888 8889 8890 8891 8892 8893 8894 8895 8896 8897 8898 8899 8900 8901 8902 8903 8904 8905 8906 8907 8908 8909 8910 8911 8912 8913 8914 8915 8916 8917 8918 8919 8920 8921 8922 8923 8924 8925 8926 8927 8928 8929 8930 8931 8932 8933 8934 8935 8936 8937 8938 8939 8940 8941 8942 8943 8944 8945 8946 8947 8948 8949 8950 8951 8952 8953 8954 8955 8956 8957 8958 8959 8960 8961 8962 8963 8964 8965 8966 8967 8968 8969 8970 8971 8972 8973 8974 8975 8976 8977 8978 8979 8980 8981 8982 8983 8984 8985 8986 8987 8988 8989 8990 8991 8992 8993 8994 8995 8996 8997 8998 8999 9000 9001 9002 9003 9004 9005 9006 9007 9008 9009 9010 9011 9012 9013 9014 9015 9016 9017 9018 9019 9020 9021 9022 9023 9024 9025 9026 9027 9028 9029 9030 9031 9032 9033 9034 9035 9036 9037 9038 9039 9040 9041 9042 9043 9044 9045 9046 9047 9048 9049 9050 9051 9052 9053 9054 9055 9056 9057 9058 9059 9060 9061 9062 9063 9064 9065 9066 9067 9068 9069 9070 9071 9072 9073 9074 9075 9076 9077 9078 9079 9080 9081 9082 9083 9084 9085 9086 9087 9088 9089 9090 9091 9092 9093 9094 9095 9096 9097 9098 9099 9100 9101 9102 9103 9104 9105 9106 9107 9108 9109 9110 9111 9112 9113 9114 9115 9116 9117 9118 9119 9120 9121 9122 9123 9124 9125 9126 9127 9128 9129 9130 9131 9132 9133 9134 9135 9136 9137 9138 9139 9140 9141 9142 9143 9144 9145 9146 9147 9148 9149 9150 9151 9152 9153 9154 9155 9156 9157 9158 9159 9160 9161 9162 9163 9164 9165 9166 9167 9168 9169 9170 9171 9172 9173 9174 9175 9176 9177 9178 9179 9180 9181 9182 9183 9184 9185 9186 9187 9188 9189 9190 9191 9192 9193 9194 9195 9196 9197 9198 9199 9200 9201 9202 9203 9204 9205 9206 9207 9208 9209 9210 9211 9212 9213 9214 9215 9216 9217 9218 9219 9220 9221 9222 9223 9224 9225 9226 9227 9228 9229 9230 9231 9232 9233 9234 9235 9236 9237 9238 9239 9240 9241 9242 9243 9244 9245 9246 9247 9248 9249 9250 9251 9252 9253 9254 9255 9256 9257 9258 9259 9260 9261 9262 9263 9264 9265 9266 9267 9268 9269 9270 9271 9272 9273 9274 9275 9276 9277 9278 9279 9280 9281 9282 9283 9284 9285 9286 9287 9288 9289 9290 9291 9292 9293 9294 9295 9296 9297 9298 9299 9300 9301 9302 9303 9304 9305 9306 9307 9308 9309 9310 9311 9312 9313 9314 9315 9316 9317 9318 9319 9320 9321 9322 9323 9324 9325 9326 9327 9328 9329 9330 9331 9332 9333 9334 9335 9336 9337 9338 9339 9340 9341 9342 9343 9344 9345 9346 9347 9348 9349 9350 9351 9352 9353 9354 9355 9356 9357 9358 9359 9360 9361 9362 9363 9364 9365 9366 9367 9368 9369 9370 9371 9372 9373 9374 9375 9376 9377 9378 9379 9380 9381 9382 9383 9384 9385 9386 9387 9388 9389 9390 9391 9392 9393 9394 9395 9396 9397 9398 9399 9400 9401 9402 9403 9404 9405 9406 9407 9408 9409 9410 9411 9412 9413 9414 9415 9416 9417 9418 9419 9420 9421 9422 9423 9424 9425 9426 9427 9428 9429 9430 9431 9432 9433 9434 9435 9436 9437 9438 9439 9440 9441 9442 9443 9444 9445 9446 9447 9448 9449 9450 9451 9452 9453 9454 9455 9456 9457 9458 9459 9460 9461 9462 9463 
9464 9465 9466 9467 9468 9469 9470 9471 9472 9473 9474 9475 9476 9477 9478 9479 9480 9481 9482 9483 9484 9485 9486 9487 9488 9489 9490 9491 9492 9493 9494 9495 9496 9497 9498 9499 9500 9501 9502 9503 9504 9505 9506 9507 9508 9509 9510 9511 9512 9513 9514 9515 9516 9517 9518 9519 9520 9521 9522 9523 9524 9525 9526 9527 9528 9529 9530 9531 9532 9533 9534 9535 9536 9537 9538 9539 9540 9541 9542 9543 9544 9545 9546 9547 9548 9549 9550 9551 9552 9553 9554 9555 9556 9557 9558 9559 9560 9561 9562 9563 9564 9565 9566 9567 9568 9569 9570 9571 9572 9573 9574 9575 9576 9577 9578 9579 9580 9581 9582 9583 9584 9585 9586 9587 9588 9589 9590 9591 9592 9593 9594 9595 9596 9597 9598 9599 9600 9601 9602 9603 9604 9605 9606 9607 9608 9609 9610 9611 9612 9613 9614 9615 9616 9617 9618 9619 9620 9621 9622 9623 9624 9625 9626 9627 9628 9629 9630 9631 9632 9633 9634 9635 9636 9637 9638 9639 9640 9641 9642 9643 9644 9645 9646 9647 9648 9649 9650 9651 9652 9653 9654 9655 9656 9657 9658 9659 9660 9661 9662 9663 9664 9665 9666 9667 9668 9669 9670 9671 9672 9673 9674 9675 9676 9677 9678 9679 9680 9681 9682 9683 9684 9685 9686 9687 9688 9689 9690 9691 9692 9693 9694 9695 9696 9697 9698 9699 9700 9701 9702 9703 9704 9705 9706 9707 9708 9709 9710 9711 9712 9713 9714 9715 9716 9717 9718 9719 9720 9721 9722 9723 9724 9725 9726 9727 9728 9729 9730 9731 9732 9733 9734 9735 9736 9737 9738 9739 9740 9741 9742 9743 9744 9745 9746 9747 9748 9749 9750 9751 9752 9753 9754 9755 9756 9757 9758 9759 9760 9761 9762 9763 9764 9765 9766 9767 9768 9769 9770 9771 9772 9773 9774 9775 9776 9777 9778 9779 9780 9781 9782 9783 9784 9785 9786 9787 9788 9789 9790 9791 9792 9793 9794 9795 9796 9797 9798 9799 9800 9801 9802 9803 9804 9805 9806 9807 9808 9809 9810 9811 9812 9813 9814 9815 9816 9817 9818 9819 9820 9821 9822 9823 9824 9825 9826 9827 9828 9829 9830 9831 9832 9833 9834 9835 9836 9837 9838 9839 9840 9841 9842 9843 9844 9845 9846 9847 9848 9849 9850 9851 9852 9853 9854 9855 9856 9857 9858 9859 9860 9861 9862 9863 9864 9865 9866 9867 9868 9869 9870 9871 9872 9873 9874 9875 9876 9877 9878 9879 9880 9881 9882 9883 9884 9885 9886 9887 9888 9889 9890 9891 9892 9893 9894 9895 9896 9897 9898 9899 9900 9901 9902 9903 9904 9905 9906 9907 9908 9909 9910 9911 9912 9913 9914 9915 9916 9917 9918 9919 9920 9921 9922 9923 9924 9925 9926 9927 9928 9929 9930 9931 9932 9933 9934 9935 9936 9937 9938 9939 9940 9941 9942 9943 9944 9945 9946 9947 9948 9949 9950 9951 9952 9953 9954 9955 9956 9957 9958 9959 9960 9961 9962 9963 9964 9965 9966 9967 9968 9969 9970 9971 9972 9973 9974 9975 9976 9977 9978 9979 9980 9981 9982 9983 9984 9985 9986 9987 9988 9989 9990 9991 9992 9993 9994 9995 9996 9997 9998 9999 10000 10001 10002 10003 10004 10005 10006 10007 10008 10009 10010 10011 10012 10013 10014 10015 10016 10017 10018 10019 10020 10021 10022 10023 10024 10025 10026 10027 10028 10029 10030 10031 10032 10033 10034 10035 10036 10037 10038 10039 10040 10041 10042 10043 10044 10045 10046 10047 10048 10049 10050 10051 10052 10053 10054 10055 10056 10057 10058 10059 10060 10061 10062 10063 10064 10065 10066 10067 10068 10069 10070 10071 10072 10073 10074 10075 10076 10077 10078 10079 10080 10081 10082 10083 10084 10085 10086 10087 10088 10089 10090 10091 10092 10093 10094 10095 10096 10097 10098 10099 10100 10101 10102 10103 10104 10105 10106 10107 10108 10109 10110 10111 10112 10113 10114 10115 10116 10117 10118 10119 10120 10121 10122 10123 10124 10125 10126 10127 10128 10129 10130 10131 10132 10133 10134 10135 10136 10137 10138 10139 10140 10141 10142 10143 10144 
10145 10146 10147 10148 10149 10150 10151 10152 10153 10154 10155 10156 10157 10158 10159 10160 10161 10162 10163 10164 10165 10166 10167 10168 10169 10170 10171 10172 10173 10174 10175 10176 10177 10178 10179 10180 10181 10182 10183 10184 10185 10186 10187 10188 10189 10190 10191 10192 10193 10194 10195 10196 10197 10198 10199 10200 10201 10202 10203 10204 10205 10206 10207 10208 10209 10210 10211 10212 10213 10214 10215 10216 10217 10218 10219 10220 10221 10222 10223 10224 10225 10226 10227 10228 10229 10230 10231 10232 10233 10234 10235 10236 10237 10238 10239 10240 10241 10242 10243 10244 10245 10246 10247 10248 10249 10250 10251 10252 10253 10254 10255 10256 10257 10258 10259 10260 10261 10262 10263 10264 10265 10266 10267 10268 10269 10270 10271 10272 10273 10274 10275 10276 10277 10278 10279 10280 10281 10282 10283 10284 10285 10286 10287 10288 10289 10290 10291 10292 10293 10294 10295 10296 10297 10298 10299 10300 10301 10302 10303 10304 10305 10306 10307 10308 10309 10310 10311 10312 10313 10314 10315 10316 10317 10318 10319 10320 10321 10322 10323 10324 10325 10326 10327 10328 10329 10330 10331 10332 10333 10334 10335 10336 10337 10338 10339 10340 10341 10342 10343 10344 10345 10346 10347 10348 10349 10350 10351 10352 10353 10354 10355 10356 10357 10358 10359 10360 10361 10362 10363 10364 10365 10366 10367 10368 10369 10370 10371 10372 10373 10374 10375 10376 10377 10378 10379 10380 10381 10382 10383 10384 10385 10386 10387 10388 10389 10390 10391 10392 10393 10394 10395 10396 10397 10398 10399 10400 10401 10402 10403 10404 10405 10406 10407 10408 10409 10410 10411 10412 10413 10414 10415 10416 10417 10418 10419 10420 10421 10422 10423 10424 10425 10426 10427 10428 10429 10430 10431 10432 10433 10434 10435 10436 10437 10438 10439 10440 10441 10442 10443 10444 10445 10446 10447 10448 10449 10450 10451 10452 10453 10454 10455 10456 10457 10458 10459 10460 10461 10462 10463 10464 10465 10466 10467 10468 10469 10470 10471 10472 10473 10474 10475 10476 10477 10478 10479 10480 10481 10482 10483 10484 10485 10486 10487 10488 10489 10490 10491 10492 10493 10494 10495 10496 10497 10498 10499 10500 10501 10502 10503 10504 10505 10506 10507 10508 10509 10510 10511 10512 10513 10514 10515 10516 10517 10518 10519 10520 10521 10522 10523 10524 10525 10526 10527 10528 10529 10530 10531 10532 10533 10534 10535 10536 10537 10538 10539 10540 10541 10542 10543 10544 10545 10546 10547 10548 10549 10550 10551 10552 10553 10554 10555 10556 10557 10558 10559 10560 10561 10562 10563 10564 10565 10566 10567 10568 10569 10570 10571 10572 10573 10574 10575 10576 10577 10578 10579 10580 10581 10582 10583 10584 10585 10586 10587 10588 10589 10590 10591 10592 10593 10594 10595 10596 10597 10598 10599 10600 10601 10602 10603 10604 10605 10606 10607 10608 10609 10610 10611 10612 10613 10614 10615 10616 10617 10618 10619 10620 10621 10622 10623 10624 10625 10626 10627 10628 10629 10630 10631 10632 10633 10634 10635 10636 10637 10638 10639 10640 10641 10642 10643 10644 10645 10646 10647 10648 10649 10650 10651 10652 10653 10654 10655 10656 10657 10658 10659 10660 10661 10662 10663 10664 10665 10666 10667 10668 10669 10670 10671 10672 10673 10674 10675 10676 10677 10678 10679 10680 10681 10682 10683 10684 10685 10686 10687 10688 10689 10690 10691 10692 10693 10694 10695 10696 10697 10698 10699 10700 10701 10702 10703 10704 10705 10706 10707 10708 10709 10710 10711 10712 10713 10714 10715 10716 10717 10718 10719 10720 10721 10722 10723 10724 10725 10726 10727 10728 10729 10730 10731 10732 10733 10734 10735 10736 
10737 10738 10739 10740 10741 10742 10743 10744 10745 10746 10747 10748 10749 10750 10751 10752 10753 10754 10755 10756 10757 10758 10759 10760 10761 10762 10763 10764 10765 10766 10767 10768 10769 10770 10771 10772 10773 10774 10775 10776 10777 10778 10779 10780 10781 10782 10783 10784 10785 10786 10787 10788 10789 10790 10791 10792 10793 10794 10795 10796 10797 10798 10799 10800 10801 10802 10803 10804 10805 10806 10807 10808 10809 10810 10811 10812 10813 10814 10815 10816 10817 10818 10819 10820 10821 10822 10823 10824 10825 10826 10827 10828 10829 10830 10831 10832 10833 10834 10835 10836 10837 10838 10839 10840 10841 10842 10843 10844 10845 10846 10847 10848 10849 10850 10851 10852 10853 10854 10855 10856 10857 10858 10859 10860 10861 10862 10863 10864 10865 10866 10867 10868 10869 10870 10871 10872 10873 10874 10875 10876 10877 10878 10879 10880 10881 10882 10883 10884 10885 10886 10887 10888 10889 10890 10891 10892 10893 10894 10895 10896 10897 10898 10899 10900 10901 10902 10903 10904 10905 10906 10907 10908 10909 10910 10911 10912 10913 10914 10915 10916 10917 10918 10919 10920 10921 10922 10923 10924 10925 10926 10927 10928 10929 10930 10931 10932 10933 10934 10935 10936 10937 10938 10939 10940 10941 10942 10943 10944 10945 10946 10947 10948 10949 10950 10951 10952 10953 10954 10955 10956 10957 10958 10959 10960 10961 10962 10963 10964 10965 10966 10967 10968 10969 10970 10971 10972 10973 10974 10975 10976 10977 10978 10979 10980 10981 10982 10983 10984 10985 10986 10987 10988 10989 10990 10991 10992 10993 10994 10995 10996 10997 10998 10999 11000 11001 11002 11003 11004 11005 11006 11007 11008 11009 11010 11011 11012 11013 11014 11015 11016 11017 11018 11019 11020 11021 11022 11023 11024 11025 11026 11027 11028 11029 11030 11031 11032 11033 11034 11035 11036 11037 11038 11039 11040 11041 11042 11043 11044 11045 11046 11047 11048 11049 11050 11051 11052 11053 11054 11055 11056 11057 11058 11059 11060 11061 11062 11063 11064 11065 11066 11067 11068 11069 11070 11071 11072 11073 11074 11075 11076 11077 11078 11079 11080 11081 11082 11083 11084 11085 11086 11087 11088 11089 11090 11091 11092 11093 11094 11095 11096 11097 11098 11099 11100 11101 11102 11103 11104 11105 11106 11107 11108 11109 11110 11111 11112 11113 11114 11115 11116 11117 11118 11119 11120 11121 11122 11123 11124 11125 11126 11127 11128 11129 11130 11131 11132 11133 11134 11135 11136 11137 11138 11139 11140 11141 11142 11143 11144 11145 11146 11147 11148 11149 11150 11151 11152 11153 11154 11155 11156 11157 11158 11159 11160 11161 11162 11163 11164 11165 11166 11167 11168 11169 11170 11171 11172 11173 11174 11175 11176 11177 11178 11179 11180 11181 11182 11183 11184 11185 11186 11187 11188 11189 11190 11191 11192 11193 11194 11195 11196 11197 11198 11199 11200 11201 11202 11203 11204 11205 11206 11207 11208 11209 11210 11211 11212 11213 11214 11215 11216 11217 11218 11219 11220 11221 11222 11223 11224 11225 11226 11227 11228 11229 11230 11231 11232 11233 11234 11235 11236 11237 11238 11239 11240 11241 11242 11243 11244 11245 11246 11247 11248 11249 11250 11251 11252 11253 11254 11255 11256 11257 11258 11259 11260 11261 11262 11263 11264 11265 11266 11267 11268 11269 11270 11271 11272 11273 11274 11275 11276 11277 11278 11279 11280 11281 11282 11283 11284 11285 11286 11287 11288 11289 11290 11291 11292 11293 11294 11295 11296 11297 11298 11299 11300 11301 11302 11303 11304 11305 11306 11307 11308 11309 11310 11311 11312 11313 11314 11315 11316 11317 11318 11319 11320 11321 11322 11323 11324 11325 11326 11327 11328 
11329 11330 11331 11332 11333 11334 11335 11336 11337 11338 11339 11340 11341 11342 11343 11344 11345 11346 11347 11348 11349 11350 11351 11352 11353 11354 11355 11356 11357 11358 11359 11360 11361 11362 11363 11364 11365 11366 11367 11368 11369 11370 11371 11372 11373 11374 11375 11376 11377 11378 11379 11380 11381 11382 11383 11384 11385 11386 11387 11388 11389 11390 11391 11392 11393 11394 11395 11396 11397 11398 11399 11400 11401 11402 11403 11404 11405 11406 11407 11408 11409 11410 11411 11412 11413 11414 11415 11416 11417 11418 11419 11420 11421 11422 11423 11424 11425 11426 11427 11428 11429 11430 11431 11432 11433 11434 11435 11436 11437 11438 11439 11440 11441 11442 11443 11444 11445 11446 11447 11448 11449 11450 11451 11452 11453 11454 11455 11456 11457 11458 11459 11460 11461 11462 11463 11464 11465 11466 11467 11468 11469 11470 11471 11472 11473 11474 11475 11476 11477 11478 11479 11480 11481 11482 11483 11484 11485 11486 11487 11488 11489 11490 11491 11492 11493 11494 11495 11496 11497 11498 11499 11500 11501 11502 11503 11504 11505 11506 11507 11508 11509 11510 11511 11512 11513 11514 11515 11516 11517 11518 11519 11520 11521 11522 11523 11524 11525 11526 11527 11528 11529 11530 11531 11532 11533 11534 11535 11536 11537 11538 11539 11540 11541 11542 11543 11544 11545 11546 11547 11548 11549 11550 11551 11552 11553 11554 11555 11556 11557 11558 11559 11560 11561 11562 11563 11564 11565 11566 11567 11568 11569 11570 11571 11572 11573 11574 11575 11576 11577 11578 11579 11580 11581 11582 11583 11584 11585 11586 11587 11588 11589 11590 11591 11592 11593 11594 11595 11596 11597 11598 11599 11600 11601 11602 11603 11604 11605 11606 11607 11608 11609 11610 11611 11612 11613 11614 11615 11616 11617 11618 11619 11620 11621 11622 11623 11624 11625 11626 11627 11628 11629 11630 11631 11632 11633 11634 11635 11636 11637 11638 11639 11640 11641 11642 11643 11644 11645 11646 11647 11648 11649 11650 11651 11652 11653 11654 11655 11656 11657 11658 11659 11660 11661 11662 11663 11664 11665 11666 11667 11668 11669 11670 11671 11672 11673 11674 11675 11676 11677 11678 11679 11680 11681 11682 11683 11684 11685 11686 11687 11688 11689 11690 11691 11692 11693 11694 11695 11696 11697 11698 11699 11700 11701 11702 11703 11704 11705 11706 11707 11708 11709 11710 11711 11712 11713 11714 11715 11716 11717 11718 11719 11720 11721 11722 11723 11724 11725 11726 11727 11728 11729 11730 11731 11732 11733 11734 11735 11736 11737 11738 11739 11740 11741 11742 11743 11744 11745 11746 11747 11748 11749 11750 11751 11752 11753 11754 11755 11756 11757 11758 11759 11760 11761 11762 11763 11764 11765 11766 11767 11768 11769 11770 11771 11772 11773 11774 11775 11776 11777 11778 11779 11780 11781 11782 11783 11784 11785 11786 11787 11788 11789 11790 11791 11792 11793 11794 11795 11796 11797 11798 11799 11800 11801 11802 11803 11804 11805 11806 11807 11808 11809 11810 11811 11812 11813 11814 11815 11816 11817 11818 11819 11820 11821 11822 11823 11824 11825 11826 11827 11828 11829 11830 11831 11832 11833 11834 11835 11836 11837 11838 11839 11840 11841 11842 11843 11844 11845 11846 11847 11848 11849 11850 11851 11852 11853 11854 11855 11856 11857 11858 11859 11860 11861 11862 11863 11864 11865 11866 11867 11868 11869 11870 11871 11872 11873 11874 11875 11876 11877 11878 11879 11880 11881 11882 11883 11884 11885 11886 11887 11888 11889 11890 11891 11892 11893 11894 11895 11896 11897 11898 11899 11900 11901 11902 11903 11904 11905 11906 11907 11908 11909 11910 11911 11912 11913 11914 11915 11916 11917 11918 11919 11920 
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (c) 2007-2009 Patrick McHardy <kaber@trash.net>
 *
 * Development of this code funded by Astaro AG (http://www.astaro.com/)
 */

#include <linux/module.h>
#include <linux/init.h>
#include <linux/list.h>
#include <linux/skbuff.h>
#include <linux/netlink.h>
#include <linux/vmalloc.h>
#include <linux/rhashtable.h>
#include <linux/audit.h>
#include <linux/netfilter.h>
#include <linux/netfilter/nfnetlink.h>
#include <linux/netfilter/nf_tables.h>
#include <net/netfilter/nf_flow_table.h>
#include <net/netfilter/nf_tables_core.h>
#include <net/netfilter/nf_tables.h>
#include <net/netfilter/nf_tables_offload.h>
#include <net/net_namespace.h>
#include <net/sock.h>

#define NFT_MODULE_AUTOLOAD_LIMIT (MODULE_NAME_LEN - sizeof("nft-expr-255-"))
#define NFT_SET_MAX_ANONLEN 16

/* limit compaction to avoid huge kmalloc/krealloc sizes. */
#define NFT_MAX_SET_NELEMS ((2048 - sizeof(struct nft_trans_elem)) / sizeof(struct nft_trans_one_elem))

unsigned int nf_tables_net_id __read_mostly;

static LIST_HEAD(nf_tables_expressions);
static LIST_HEAD(nf_tables_objects);
static LIST_HEAD(nf_tables_flowtables);
static LIST_HEAD(nf_tables_destroy_list);
static LIST_HEAD(nf_tables_gc_list);
static DEFINE_SPINLOCK(nf_tables_destroy_list_lock);
static DEFINE_SPINLOCK(nf_tables_gc_list_lock);

enum {
	NFT_VALIDATE_SKIP = 0,
	NFT_VALIDATE_NEED,
	NFT_VALIDATE_DO,
};

static struct rhltable nft_objname_ht;

static u32 nft_chain_hash(const void *data, u32 len, u32 seed);
static u32 nft_chain_hash_obj(const void *data, u32 len, u32 seed);
static int nft_chain_hash_cmp(struct rhashtable_compare_arg *, const void *);

static u32 nft_objname_hash(const void *data, u32 len, u32 seed);
static u32 nft_objname_hash_obj(const void *data, u32 len, u32 seed);
static int nft_objname_hash_cmp(struct rhashtable_compare_arg *, const void *);

static const struct rhashtable_params nft_chain_ht_params = {
	.head_offset = offsetof(struct nft_chain, rhlhead),
	.key_offset = offsetof(struct nft_chain, name),
	.hashfn = nft_chain_hash,
	.obj_hashfn = nft_chain_hash_obj,
	.obj_cmpfn = nft_chain_hash_cmp,
	.automatic_shrinking = true,
};

static const struct rhashtable_params nft_objname_ht_params = {
	.head_offset = offsetof(struct nft_object, rhlhead),
	.key_offset = offsetof(struct nft_object, key),
	.hashfn = nft_objname_hash,
	.obj_hashfn = nft_objname_hash_obj,
	.obj_cmpfn = nft_objname_hash_cmp,
	.automatic_shrinking = true,
};

struct nft_audit_data {
	struct nft_table *table;
	int entries;
	int op;
	struct list_head list;
};

static const u8 nft2audit_op[NFT_MSG_MAX] = { // enum nf_tables_msg_types
	[NFT_MSG_NEWTABLE] = AUDIT_NFT_OP_TABLE_REGISTER,
	[NFT_MSG_GETTABLE] = AUDIT_NFT_OP_INVALID,
	[NFT_MSG_DELTABLE] = AUDIT_NFT_OP_TABLE_UNREGISTER,
	[NFT_MSG_NEWCHAIN] =
AUDIT_NFT_OP_CHAIN_REGISTER, [NFT_MSG_GETCHAIN] = AUDIT_NFT_OP_INVALID, [NFT_MSG_DELCHAIN] = AUDIT_NFT_OP_CHAIN_UNREGISTER, [NFT_MSG_NEWRULE] = AUDIT_NFT_OP_RULE_REGISTER, [NFT_MSG_GETRULE] = AUDIT_NFT_OP_INVALID, [NFT_MSG_DELRULE] = AUDIT_NFT_OP_RULE_UNREGISTER, [NFT_MSG_NEWSET] = AUDIT_NFT_OP_SET_REGISTER, [NFT_MSG_GETSET] = AUDIT_NFT_OP_INVALID, [NFT_MSG_DELSET] = AUDIT_NFT_OP_SET_UNREGISTER, [NFT_MSG_NEWSETELEM] = AUDIT_NFT_OP_SETELEM_REGISTER, [NFT_MSG_GETSETELEM] = AUDIT_NFT_OP_INVALID, [NFT_MSG_DELSETELEM] = AUDIT_NFT_OP_SETELEM_UNREGISTER, [NFT_MSG_NEWGEN] = AUDIT_NFT_OP_GEN_REGISTER, [NFT_MSG_GETGEN] = AUDIT_NFT_OP_INVALID, [NFT_MSG_TRACE] = AUDIT_NFT_OP_INVALID, [NFT_MSG_NEWOBJ] = AUDIT_NFT_OP_OBJ_REGISTER, [NFT_MSG_GETOBJ] = AUDIT_NFT_OP_INVALID, [NFT_MSG_DELOBJ] = AUDIT_NFT_OP_OBJ_UNREGISTER, [NFT_MSG_GETOBJ_RESET] = AUDIT_NFT_OP_OBJ_RESET, [NFT_MSG_NEWFLOWTABLE] = AUDIT_NFT_OP_FLOWTABLE_REGISTER, [NFT_MSG_GETFLOWTABLE] = AUDIT_NFT_OP_INVALID, [NFT_MSG_DELFLOWTABLE] = AUDIT_NFT_OP_FLOWTABLE_UNREGISTER, [NFT_MSG_GETSETELEM_RESET] = AUDIT_NFT_OP_SETELEM_RESET, }; static void nft_validate_state_update(struct nft_table *table, u8 new_validate_state) { switch (table->validate_state) { case NFT_VALIDATE_SKIP: WARN_ON_ONCE(new_validate_state == NFT_VALIDATE_DO); break; case NFT_VALIDATE_NEED: break; case NFT_VALIDATE_DO: if (new_validate_state == NFT_VALIDATE_NEED) return; } table->validate_state = new_validate_state; } static void nf_tables_trans_destroy_work(struct work_struct *w); static DECLARE_WORK(trans_destroy_work, nf_tables_trans_destroy_work); static void nft_trans_gc_work(struct work_struct *work); static DECLARE_WORK(trans_gc_work, nft_trans_gc_work); static void nft_ctx_init(struct nft_ctx *ctx, struct net *net, const struct sk_buff *skb, const struct nlmsghdr *nlh, u8 family, struct nft_table *table, struct nft_chain *chain, const struct nlattr * const *nla) { ctx->net = net; ctx->family = family; ctx->level = 0; ctx->table = table; ctx->chain = chain; ctx->nla = nla; ctx->portid = NETLINK_CB(skb).portid; ctx->report = nlmsg_report(nlh); ctx->flags = nlh->nlmsg_flags; ctx->seq = nlh->nlmsg_seq; bitmap_zero(ctx->reg_inited, NFT_REG32_NUM); } static struct nft_trans *nft_trans_alloc_gfp(const struct nft_ctx *ctx, int msg_type, u32 size, gfp_t gfp) { struct nft_trans *trans; trans = kzalloc(size, gfp); if (trans == NULL) return NULL; INIT_LIST_HEAD(&trans->list); trans->msg_type = msg_type; trans->net = ctx->net; trans->table = ctx->table; trans->seq = ctx->seq; trans->flags = ctx->flags; trans->report = ctx->report; return trans; } static struct nft_trans *nft_trans_alloc(const struct nft_ctx *ctx, int msg_type, u32 size) { return nft_trans_alloc_gfp(ctx, msg_type, size, GFP_KERNEL); } static struct nft_trans_binding *nft_trans_get_binding(struct nft_trans *trans) { switch (trans->msg_type) { case NFT_MSG_NEWCHAIN: case NFT_MSG_NEWSET: return container_of(trans, struct nft_trans_binding, nft_trans); } return NULL; } static void nft_trans_list_del(struct nft_trans *trans) { struct nft_trans_binding *trans_binding; list_del(&trans->list); trans_binding = nft_trans_get_binding(trans); if (trans_binding) list_del(&trans_binding->binding_list); } static void nft_trans_destroy(struct nft_trans *trans) { nft_trans_list_del(trans); kfree(trans); } static void __nft_set_trans_bind(const struct nft_ctx *ctx, struct nft_set *set, bool bind) { struct nftables_pernet *nft_net; struct net *net = ctx->net; struct nft_trans *trans; if (!nft_set_is_anonymous(set)) return; nft_net = 
nft_pernet(net); list_for_each_entry_reverse(trans, &nft_net->commit_list, list) { switch (trans->msg_type) { case NFT_MSG_NEWSET: if (nft_trans_set(trans) == set) nft_trans_set_bound(trans) = bind; break; case NFT_MSG_NEWSETELEM: if (nft_trans_elem_set(trans) == set) nft_trans_elem_set_bound(trans) = bind; break; } } } static void nft_set_trans_bind(const struct nft_ctx *ctx, struct nft_set *set) { return __nft_set_trans_bind(ctx, set, true); } static void nft_set_trans_unbind(const struct nft_ctx *ctx, struct nft_set *set) { return __nft_set_trans_bind(ctx, set, false); } static void __nft_chain_trans_bind(const struct nft_ctx *ctx, struct nft_chain *chain, bool bind) { struct nftables_pernet *nft_net; struct net *net = ctx->net; struct nft_trans *trans; if (!nft_chain_binding(chain)) return; nft_net = nft_pernet(net); list_for_each_entry_reverse(trans, &nft_net->commit_list, list) { switch (trans->msg_type) { case NFT_MSG_NEWCHAIN: if (nft_trans_chain(trans) == chain) nft_trans_chain_bound(trans) = bind; break; case NFT_MSG_NEWRULE: if (nft_trans_rule_chain(trans) == chain) nft_trans_rule_bound(trans) = bind; break; } } } static void nft_chain_trans_bind(const struct nft_ctx *ctx, struct nft_chain *chain) { __nft_chain_trans_bind(ctx, chain, true); } int nf_tables_bind_chain(const struct nft_ctx *ctx, struct nft_chain *chain) { if (!nft_chain_binding(chain)) return 0; if (nft_chain_binding(ctx->chain)) return -EOPNOTSUPP; if (chain->bound) return -EBUSY; if (!nft_use_inc(&chain->use)) return -EMFILE; chain->bound = true; nft_chain_trans_bind(ctx, chain); return 0; } void nf_tables_unbind_chain(const struct nft_ctx *ctx, struct nft_chain *chain) { __nft_chain_trans_bind(ctx, chain, false); } static int nft_netdev_register_hooks(struct net *net, struct list_head *hook_list) { struct nft_hook *hook; int err, j; j = 0; list_for_each_entry(hook, hook_list, list) { err = nf_register_net_hook(net, &hook->ops); if (err < 0) goto err_register; j++; } return 0; err_register: list_for_each_entry(hook, hook_list, list) { if (j-- <= 0) break; nf_unregister_net_hook(net, &hook->ops); } return err; } static void nft_netdev_unregister_hooks(struct net *net, struct list_head *hook_list, bool release_netdev) { struct nft_hook *hook, *next; list_for_each_entry_safe(hook, next, hook_list, list) { nf_unregister_net_hook(net, &hook->ops); if (release_netdev) { list_del(&hook->list); kfree_rcu(hook, rcu); } } } static int nf_tables_register_hook(struct net *net, const struct nft_table *table, struct nft_chain *chain) { struct nft_base_chain *basechain; const struct nf_hook_ops *ops; if (table->flags & NFT_TABLE_F_DORMANT || !nft_is_base_chain(chain)) return 0; basechain = nft_base_chain(chain); ops = &basechain->ops; if (basechain->type->ops_register) return basechain->type->ops_register(net, ops); if (nft_base_chain_netdev(table->family, basechain->ops.hooknum)) return nft_netdev_register_hooks(net, &basechain->hook_list); return nf_register_net_hook(net, &basechain->ops); } static void __nf_tables_unregister_hook(struct net *net, const struct nft_table *table, struct nft_chain *chain, bool release_netdev) { struct nft_base_chain *basechain; const struct nf_hook_ops *ops; if (table->flags & NFT_TABLE_F_DORMANT || !nft_is_base_chain(chain)) return; basechain = nft_base_chain(chain); ops = &basechain->ops; if (basechain->type->ops_unregister) return basechain->type->ops_unregister(net, ops); if (nft_base_chain_netdev(table->family, basechain->ops.hooknum)) nft_netdev_unregister_hooks(net, 
&basechain->hook_list, release_netdev); else nf_unregister_net_hook(net, &basechain->ops); } static void nf_tables_unregister_hook(struct net *net, const struct nft_table *table, struct nft_chain *chain) { return __nf_tables_unregister_hook(net, table, chain, false); } static bool nft_trans_collapse_set_elem_allowed(const struct nft_trans_elem *a, const struct nft_trans_elem *b) { /* NB: the ->bound equality check is defensive, at this time we only merge * a new nft_trans_elem transaction request with the transaction tail * element, but a->bound != b->bound would imply a NEWRULE transaction * is queued in-between. * * The set check is mandatory, the NFT_MAX_SET_NELEMS check prevents * huge krealloc() requests. */ return a->set == b->set && a->bound == b->bound && a->nelems < NFT_MAX_SET_NELEMS; } static bool nft_trans_collapse_set_elem(struct nftables_pernet *nft_net, struct nft_trans_elem *tail, struct nft_trans_elem *trans, gfp_t gfp) { unsigned int nelems, old_nelems = tail->nelems; struct nft_trans_elem *new_trans; if (!nft_trans_collapse_set_elem_allowed(tail, trans)) return false; /* "cannot happen", at this time userspace element add * requests always allocate a new transaction element. * * This serves as a reminder to adjust the list_add_tail * logic below in case this ever changes. */ if (WARN_ON_ONCE(trans->nelems != 1)) return false; if (check_add_overflow(old_nelems, trans->nelems, &nelems)) return false; /* krealloc might free tail which invalidates list pointers */ list_del_init(&tail->nft_trans.list); new_trans = krealloc(tail, struct_size(tail, elems, nelems), gfp); if (!new_trans) { list_add_tail(&tail->nft_trans.list, &nft_net->commit_list); return false; } /* * new_trans->nft_trans.list contains garbage, but * list_add_tail() doesn't care. 
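	 * list_add_tail() below only writes the new entry's ->next and
	 * ->prev pointers and never reads them, so the stale values left
	 * behind by krealloc() are never dereferenced.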
*/ new_trans->nelems = nelems; new_trans->elems[old_nelems] = trans->elems[0]; list_add_tail(&new_trans->nft_trans.list, &nft_net->commit_list); return true; } static bool nft_trans_try_collapse(struct nftables_pernet *nft_net, struct nft_trans *trans, gfp_t gfp) { struct nft_trans *tail; if (list_empty(&nft_net->commit_list)) return false; tail = list_last_entry(&nft_net->commit_list, struct nft_trans, list); if (tail->msg_type != trans->msg_type) return false; switch (trans->msg_type) { case NFT_MSG_NEWSETELEM: case NFT_MSG_DELSETELEM: return nft_trans_collapse_set_elem(nft_net, nft_trans_container_elem(tail), nft_trans_container_elem(trans), gfp); } return false; } static void nft_trans_commit_list_add_tail(struct net *net, struct nft_trans *trans) { struct nftables_pernet *nft_net = nft_pernet(net); struct nft_trans_binding *binding; struct nft_trans_set *trans_set; list_add_tail(&trans->list, &nft_net->commit_list); binding = nft_trans_get_binding(trans); if (!binding) return; switch (trans->msg_type) { case NFT_MSG_NEWSET: trans_set = nft_trans_container_set(trans); if (!nft_trans_set_update(trans) && nft_set_is_anonymous(nft_trans_set(trans))) list_add_tail(&binding->binding_list, &nft_net->binding_list); list_add_tail(&trans_set->list_trans_newset, &nft_net->commit_set_list); break; case NFT_MSG_NEWCHAIN: if (!nft_trans_chain_update(trans) && nft_chain_binding(nft_trans_chain(trans))) list_add_tail(&binding->binding_list, &nft_net->binding_list); break; } } static void nft_trans_commit_list_add_elem(struct net *net, struct nft_trans *trans, gfp_t gfp) { struct nftables_pernet *nft_net = nft_pernet(net); WARN_ON_ONCE(trans->msg_type != NFT_MSG_NEWSETELEM && trans->msg_type != NFT_MSG_DELSETELEM); might_alloc(gfp); if (nft_trans_try_collapse(nft_net, trans, gfp)) { kfree(trans); return; } nft_trans_commit_list_add_tail(net, trans); } static int nft_trans_table_add(struct nft_ctx *ctx, int msg_type) { struct nft_trans *trans; trans = nft_trans_alloc(ctx, msg_type, sizeof(struct nft_trans_table)); if (trans == NULL) return -ENOMEM; if (msg_type == NFT_MSG_NEWTABLE) nft_activate_next(ctx->net, ctx->table); nft_trans_commit_list_add_tail(ctx->net, trans); return 0; } static int nft_deltable(struct nft_ctx *ctx) { int err; err = nft_trans_table_add(ctx, NFT_MSG_DELTABLE); if (err < 0) return err; nft_deactivate_next(ctx->net, ctx->table); return err; } static struct nft_trans * nft_trans_alloc_chain(const struct nft_ctx *ctx, int msg_type) { struct nft_trans_chain *trans_chain; struct nft_trans *trans; trans = nft_trans_alloc(ctx, msg_type, sizeof(struct nft_trans_chain)); if (!trans) return NULL; trans_chain = nft_trans_container_chain(trans); INIT_LIST_HEAD(&trans_chain->nft_trans_binding.binding_list); trans_chain->chain = ctx->chain; return trans; } static struct nft_trans *nft_trans_chain_add(struct nft_ctx *ctx, int msg_type) { struct nft_trans *trans; trans = nft_trans_alloc_chain(ctx, msg_type); if (trans == NULL) return ERR_PTR(-ENOMEM); if (msg_type == NFT_MSG_NEWCHAIN) { nft_activate_next(ctx->net, ctx->chain); if (ctx->nla[NFTA_CHAIN_ID]) { nft_trans_chain_id(trans) = ntohl(nla_get_be32(ctx->nla[NFTA_CHAIN_ID])); } } nft_trans_commit_list_add_tail(ctx->net, trans); return trans; } static int nft_delchain(struct nft_ctx *ctx) { struct nft_trans *trans; trans = nft_trans_chain_add(ctx, NFT_MSG_DELCHAIN); if (IS_ERR(trans)) return PTR_ERR(trans); nft_use_dec(&ctx->table->use); nft_deactivate_next(ctx->net, ctx->chain); return 0; } void nft_rule_expr_activate(const struct nft_ctx 
*ctx, struct nft_rule *rule) { struct nft_expr *expr; expr = nft_expr_first(rule); while (nft_expr_more(rule, expr)) { if (expr->ops->activate) expr->ops->activate(ctx, expr); expr = nft_expr_next(expr); } } void nft_rule_expr_deactivate(const struct nft_ctx *ctx, struct nft_rule *rule, enum nft_trans_phase phase) { struct nft_expr *expr; expr = nft_expr_first(rule); while (nft_expr_more(rule, expr)) { if (expr->ops->deactivate) expr->ops->deactivate(ctx, expr, phase); expr = nft_expr_next(expr); } } static int nf_tables_delrule_deactivate(struct nft_ctx *ctx, struct nft_rule *rule) { /* You cannot delete the same rule twice */ if (nft_is_active_next(ctx->net, rule)) { nft_deactivate_next(ctx->net, rule); nft_use_dec(&ctx->chain->use); return 0; } return -ENOENT; } static struct nft_trans *nft_trans_rule_add(struct nft_ctx *ctx, int msg_type, struct nft_rule *rule) { struct nft_trans *trans; trans = nft_trans_alloc(ctx, msg_type, sizeof(struct nft_trans_rule)); if (trans == NULL) return NULL; if (msg_type == NFT_MSG_NEWRULE && ctx->nla[NFTA_RULE_ID] != NULL) { nft_trans_rule_id(trans) = ntohl(nla_get_be32(ctx->nla[NFTA_RULE_ID])); } nft_trans_rule(trans) = rule; nft_trans_rule_chain(trans) = ctx->chain; nft_trans_commit_list_add_tail(ctx->net, trans); return trans; } static int nft_delrule(struct nft_ctx *ctx, struct nft_rule *rule) { struct nft_flow_rule *flow; struct nft_trans *trans; int err; trans = nft_trans_rule_add(ctx, NFT_MSG_DELRULE, rule); if (trans == NULL) return -ENOMEM; if (ctx->chain->flags & NFT_CHAIN_HW_OFFLOAD) { flow = nft_flow_rule_create(ctx->net, rule); if (IS_ERR(flow)) { nft_trans_destroy(trans); return PTR_ERR(flow); } nft_trans_flow_rule(trans) = flow; } err = nf_tables_delrule_deactivate(ctx, rule); if (err < 0) { nft_trans_destroy(trans); return err; } nft_rule_expr_deactivate(ctx, rule, NFT_TRANS_PREPARE); return 0; } static int nft_delrule_by_chain(struct nft_ctx *ctx) { struct nft_rule *rule; int err; list_for_each_entry(rule, &ctx->chain->rules, list) { if (!nft_is_active_next(ctx->net, rule)) continue; err = nft_delrule(ctx, rule); if (err < 0) return err; } return 0; } static int __nft_trans_set_add(const struct nft_ctx *ctx, int msg_type, struct nft_set *set, const struct nft_set_desc *desc) { struct nft_trans_set *trans_set; struct nft_trans *trans; trans = nft_trans_alloc(ctx, msg_type, sizeof(struct nft_trans_set)); if (trans == NULL) return -ENOMEM; trans_set = nft_trans_container_set(trans); INIT_LIST_HEAD(&trans_set->nft_trans_binding.binding_list); INIT_LIST_HEAD(&trans_set->list_trans_newset); if (msg_type == NFT_MSG_NEWSET && ctx->nla[NFTA_SET_ID] && !desc) { nft_trans_set_id(trans) = ntohl(nla_get_be32(ctx->nla[NFTA_SET_ID])); nft_activate_next(ctx->net, set); } nft_trans_set(trans) = set; if (desc) { nft_trans_set_update(trans) = true; nft_trans_set_gc_int(trans) = desc->gc_int; nft_trans_set_timeout(trans) = desc->timeout; nft_trans_set_size(trans) = desc->size; } nft_trans_commit_list_add_tail(ctx->net, trans); return 0; } static int nft_trans_set_add(const struct nft_ctx *ctx, int msg_type, struct nft_set *set) { return __nft_trans_set_add(ctx, msg_type, set, NULL); } static int nft_mapelem_deactivate(const struct nft_ctx *ctx, struct nft_set *set, const struct nft_set_iter *iter, struct nft_elem_priv *elem_priv) { struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv); if (!nft_set_elem_active(ext, iter->genmask)) return 0; nft_set_elem_change_active(ctx->net, set, ext); nft_setelem_data_deactivate(ctx->net, set, elem_priv); return 0; 
} struct nft_set_elem_catchall { struct list_head list; struct rcu_head rcu; struct nft_elem_priv *elem; }; static void nft_map_catchall_deactivate(const struct nft_ctx *ctx, struct nft_set *set) { u8 genmask = nft_genmask_next(ctx->net); struct nft_set_elem_catchall *catchall; struct nft_set_ext *ext; list_for_each_entry(catchall, &set->catchall_list, list) { ext = nft_set_elem_ext(set, catchall->elem); if (!nft_set_elem_active(ext, genmask)) continue; nft_set_elem_change_active(ctx->net, set, ext); nft_setelem_data_deactivate(ctx->net, set, catchall->elem); break; } } static void nft_map_deactivate(const struct nft_ctx *ctx, struct nft_set *set) { struct nft_set_iter iter = { .genmask = nft_genmask_next(ctx->net), .type = NFT_ITER_UPDATE, .fn = nft_mapelem_deactivate, }; set->ops->walk(ctx, set, &iter); WARN_ON_ONCE(iter.err); nft_map_catchall_deactivate(ctx, set); } static int nft_delset(const struct nft_ctx *ctx, struct nft_set *set) { int err; err = nft_trans_set_add(ctx, NFT_MSG_DELSET, set); if (err < 0) return err; if (set->flags & (NFT_SET_MAP | NFT_SET_OBJECT)) nft_map_deactivate(ctx, set); nft_deactivate_next(ctx->net, set); nft_use_dec(&ctx->table->use); return err; } static int nft_trans_obj_add(struct nft_ctx *ctx, int msg_type, struct nft_object *obj) { struct nft_trans *trans; trans = nft_trans_alloc(ctx, msg_type, sizeof(struct nft_trans_obj)); if (trans == NULL) return -ENOMEM; if (msg_type == NFT_MSG_NEWOBJ) nft_activate_next(ctx->net, obj); nft_trans_obj(trans) = obj; nft_trans_commit_list_add_tail(ctx->net, trans); return 0; } static int nft_delobj(struct nft_ctx *ctx, struct nft_object *obj) { int err; err = nft_trans_obj_add(ctx, NFT_MSG_DELOBJ, obj); if (err < 0) return err; nft_deactivate_next(ctx->net, obj); nft_use_dec(&ctx->table->use); return err; } static struct nft_trans * nft_trans_flowtable_add(struct nft_ctx *ctx, int msg_type, struct nft_flowtable *flowtable) { struct nft_trans *trans; trans = nft_trans_alloc(ctx, msg_type, sizeof(struct nft_trans_flowtable)); if (trans == NULL) return ERR_PTR(-ENOMEM); if (msg_type == NFT_MSG_NEWFLOWTABLE) nft_activate_next(ctx->net, flowtable); INIT_LIST_HEAD(&nft_trans_flowtable_hooks(trans)); nft_trans_flowtable(trans) = flowtable; nft_trans_commit_list_add_tail(ctx->net, trans); return trans; } static int nft_delflowtable(struct nft_ctx *ctx, struct nft_flowtable *flowtable) { struct nft_trans *trans; trans = nft_trans_flowtable_add(ctx, NFT_MSG_DELFLOWTABLE, flowtable); if (IS_ERR(trans)) return PTR_ERR(trans); nft_deactivate_next(ctx->net, flowtable); nft_use_dec(&ctx->table->use); return 0; } static void __nft_reg_track_clobber(struct nft_regs_track *track, u8 dreg) { int i; for (i = track->regs[dreg].num_reg; i > 0; i--) __nft_reg_track_cancel(track, dreg - i); } static void __nft_reg_track_update(struct nft_regs_track *track, const struct nft_expr *expr, u8 dreg, u8 num_reg) { track->regs[dreg].selector = expr; track->regs[dreg].bitwise = NULL; track->regs[dreg].num_reg = num_reg; } void nft_reg_track_update(struct nft_regs_track *track, const struct nft_expr *expr, u8 dreg, u8 len) { unsigned int regcount; int i; __nft_reg_track_clobber(track, dreg); regcount = DIV_ROUND_UP(len, NFT_REG32_SIZE); for (i = 0; i < regcount; i++, dreg++) __nft_reg_track_update(track, expr, dreg, i); } EXPORT_SYMBOL_GPL(nft_reg_track_update); void nft_reg_track_cancel(struct nft_regs_track *track, u8 dreg, u8 len) { unsigned int regcount; int i; __nft_reg_track_clobber(track, dreg); regcount = DIV_ROUND_UP(len, NFT_REG32_SIZE); for 
(i = 0; i < regcount; i++, dreg++) __nft_reg_track_cancel(track, dreg); } EXPORT_SYMBOL_GPL(nft_reg_track_cancel); void __nft_reg_track_cancel(struct nft_regs_track *track, u8 dreg) { track->regs[dreg].selector = NULL; track->regs[dreg].bitwise = NULL; track->regs[dreg].num_reg = 0; } EXPORT_SYMBOL_GPL(__nft_reg_track_cancel); /* * Tables */ static struct nft_table *nft_table_lookup(const struct net *net, const struct nlattr *nla, u8 family, u8 genmask, u32 nlpid) { struct nftables_pernet *nft_net; struct nft_table *table; if (nla == NULL) return ERR_PTR(-EINVAL); nft_net = nft_pernet(net); list_for_each_entry_rcu(table, &nft_net->tables, list, lockdep_is_held(&nft_net->commit_mutex)) { if (!nla_strcmp(nla, table->name) && table->family == family && nft_active_genmask(table, genmask)) { if (nft_table_has_owner(table) && nlpid && table->nlpid != nlpid) return ERR_PTR(-EPERM); return table; } } return ERR_PTR(-ENOENT); } static struct nft_table *nft_table_lookup_byhandle(const struct net *net, const struct nlattr *nla, int family, u8 genmask, u32 nlpid) { struct nftables_pernet *nft_net; struct nft_table *table; nft_net = nft_pernet(net); list_for_each_entry(table, &nft_net->tables, list) { if (be64_to_cpu(nla_get_be64(nla)) == table->handle && table->family == family && nft_active_genmask(table, genmask)) { if (nft_table_has_owner(table) && nlpid && table->nlpid != nlpid) return ERR_PTR(-EPERM); return table; } } return ERR_PTR(-ENOENT); } static inline u64 nf_tables_alloc_handle(struct nft_table *table) { return ++table->hgenerator; } static const struct nft_chain_type *chain_type[NFPROTO_NUMPROTO][NFT_CHAIN_T_MAX]; static const struct nft_chain_type * __nft_chain_type_get(u8 family, enum nft_chain_types type) { if (family >= NFPROTO_NUMPROTO || type >= NFT_CHAIN_T_MAX) return NULL; return chain_type[family][type]; } static const struct nft_chain_type * __nf_tables_chain_type_lookup(const struct nlattr *nla, u8 family) { const struct nft_chain_type *type; int i; for (i = 0; i < NFT_CHAIN_T_MAX; i++) { type = __nft_chain_type_get(family, i); if (!type) continue; if (!nla_strcmp(nla, type->name)) return type; } return NULL; } struct nft_module_request { struct list_head list; char module[MODULE_NAME_LEN]; bool done; }; #ifdef CONFIG_MODULES __printf(2, 3) int nft_request_module(struct net *net, const char *fmt, ...) { char module_name[MODULE_NAME_LEN]; struct nftables_pernet *nft_net; struct nft_module_request *req; va_list args; int ret; va_start(args, fmt); ret = vsnprintf(module_name, MODULE_NAME_LEN, fmt, args); va_end(args); if (ret >= MODULE_NAME_LEN) return 0; nft_net = nft_pernet(net); list_for_each_entry(req, &nft_net->module_list, list) { if (!strcmp(req->module, module_name)) { if (req->done) return 0; /* A request to load this module already exists. 
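		 * Returning -EAGAIN makes nfnetlink abort the current batch
		 * and replay it once the pending module request has been
		 * serviced (e.g. a chain-type lookup of "nat" for
		 * NFPROTO_IPV4 ends up requesting the "nft-chain-2-nat"
		 * module alias).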
*/ return -EAGAIN; } } req = kmalloc(sizeof(*req), GFP_KERNEL); if (!req) return -ENOMEM; req->done = false; strscpy(req->module, module_name, MODULE_NAME_LEN); list_add_tail(&req->list, &nft_net->module_list); return -EAGAIN; } EXPORT_SYMBOL_GPL(nft_request_module); #endif static void lockdep_nfnl_nft_mutex_not_held(void) { #ifdef CONFIG_PROVE_LOCKING if (debug_locks) WARN_ON_ONCE(lockdep_nfnl_is_held(NFNL_SUBSYS_NFTABLES)); #endif } static const struct nft_chain_type * nf_tables_chain_type_lookup(struct net *net, const struct nlattr *nla, u8 family, bool autoload) { const struct nft_chain_type *type; type = __nf_tables_chain_type_lookup(nla, family); if (type != NULL) return type; lockdep_nfnl_nft_mutex_not_held(); #ifdef CONFIG_MODULES if (autoload) { if (nft_request_module(net, "nft-chain-%u-%.*s", family, nla_len(nla), (const char *)nla_data(nla)) == -EAGAIN) return ERR_PTR(-EAGAIN); } #endif return ERR_PTR(-ENOENT); } static __be16 nft_base_seq(const struct net *net) { struct nftables_pernet *nft_net = nft_pernet(net); return htons(nft_net->base_seq & 0xffff); } static const struct nla_policy nft_table_policy[NFTA_TABLE_MAX + 1] = { [NFTA_TABLE_NAME] = { .type = NLA_STRING, .len = NFT_TABLE_MAXNAMELEN - 1 }, [NFTA_TABLE_FLAGS] = { .type = NLA_U32 }, [NFTA_TABLE_HANDLE] = { .type = NLA_U64 }, [NFTA_TABLE_USERDATA] = { .type = NLA_BINARY, .len = NFT_USERDATA_MAXLEN } }; static int nf_tables_fill_table_info(struct sk_buff *skb, struct net *net, u32 portid, u32 seq, int event, u32 flags, int family, const struct nft_table *table) { struct nlmsghdr *nlh; event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event); nlh = nfnl_msg_put(skb, portid, seq, event, flags, family, NFNETLINK_V0, nft_base_seq(net)); if (!nlh) goto nla_put_failure; if (nla_put_string(skb, NFTA_TABLE_NAME, table->name) || nla_put_be32(skb, NFTA_TABLE_USE, htonl(table->use)) || nla_put_be64(skb, NFTA_TABLE_HANDLE, cpu_to_be64(table->handle), NFTA_TABLE_PAD)) goto nla_put_failure; if (event == NFT_MSG_DELTABLE) { nlmsg_end(skb, nlh); return 0; } if (nla_put_be32(skb, NFTA_TABLE_FLAGS, htonl(table->flags & NFT_TABLE_F_MASK))) goto nla_put_failure; if (nft_table_has_owner(table) && nla_put_be32(skb, NFTA_TABLE_OWNER, htonl(table->nlpid))) goto nla_put_failure; if (table->udata) { if (nla_put(skb, NFTA_TABLE_USERDATA, table->udlen, table->udata)) goto nla_put_failure; } nlmsg_end(skb, nlh); return 0; nla_put_failure: nlmsg_trim(skb, nlh); return -1; } struct nftnl_skb_parms { bool report; }; #define NFT_CB(skb) (*(struct nftnl_skb_parms*)&((skb)->cb)) static void nft_notify_enqueue(struct sk_buff *skb, bool report, struct list_head *notify_list) { NFT_CB(skb).report = report; list_add_tail(&skb->list, notify_list); } static void nf_tables_table_notify(const struct nft_ctx *ctx, int event) { struct nftables_pernet *nft_net; struct sk_buff *skb; u16 flags = 0; int err; if (!ctx->report && !nfnetlink_has_listeners(ctx->net, NFNLGRP_NFTABLES)) return; skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); if (skb == NULL) goto err; if (ctx->flags & (NLM_F_CREATE | NLM_F_EXCL)) flags |= ctx->flags & (NLM_F_CREATE | NLM_F_EXCL); err = nf_tables_fill_table_info(skb, ctx->net, ctx->portid, ctx->seq, event, flags, ctx->family, ctx->table); if (err < 0) { kfree_skb(skb); goto err; } nft_net = nft_pernet(ctx->net); nft_notify_enqueue(skb, ctx->report, &nft_net->notify_list); return; err: nfnetlink_set_err(ctx->net, ctx->portid, NFNLGRP_NFTABLES, -ENOBUFS); } static int nf_tables_dump_tables(struct sk_buff *skb, struct netlink_callback *cb) { const 
struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh); struct nftables_pernet *nft_net; const struct nft_table *table; unsigned int idx = 0, s_idx = cb->args[0]; struct net *net = sock_net(skb->sk); int family = nfmsg->nfgen_family; rcu_read_lock(); nft_net = nft_pernet(net); cb->seq = READ_ONCE(nft_net->base_seq); list_for_each_entry_rcu(table, &nft_net->tables, list) { if (family != NFPROTO_UNSPEC && family != table->family) continue; if (idx < s_idx) goto cont; if (idx > s_idx) memset(&cb->args[1], 0, sizeof(cb->args) - sizeof(cb->args[0])); if (!nft_is_active(net, table)) continue; if (nf_tables_fill_table_info(skb, net, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NFT_MSG_NEWTABLE, NLM_F_MULTI, table->family, table) < 0) goto done; nl_dump_check_consistent(cb, nlmsg_hdr(skb)); cont: idx++; } done: rcu_read_unlock(); cb->args[0] = idx; return skb->len; } static int nft_netlink_dump_start_rcu(struct sock *nlsk, struct sk_buff *skb, const struct nlmsghdr *nlh, struct netlink_dump_control *c) { int err; if (!try_module_get(THIS_MODULE)) return -EINVAL; rcu_read_unlock(); err = netlink_dump_start(nlsk, skb, nlh, c); rcu_read_lock(); module_put(THIS_MODULE); return err; } /* called with rcu_read_lock held */ static int nf_tables_gettable(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nla[]) { struct netlink_ext_ack *extack = info->extack; u8 genmask = nft_genmask_cur(info->net); u8 family = info->nfmsg->nfgen_family; const struct nft_table *table; struct net *net = info->net; struct sk_buff *skb2; int err; if (info->nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { .dump = nf_tables_dump_tables, .module = THIS_MODULE, }; return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c); } table = nft_table_lookup(net, nla[NFTA_TABLE_NAME], family, genmask, 0); if (IS_ERR(table)) { NL_SET_BAD_ATTR(extack, nla[NFTA_TABLE_NAME]); return PTR_ERR(table); } skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC); if (!skb2) return -ENOMEM; err = nf_tables_fill_table_info(skb2, net, NETLINK_CB(skb).portid, info->nlh->nlmsg_seq, NFT_MSG_NEWTABLE, 0, family, table); if (err < 0) goto err_fill_table_info; return nfnetlink_unicast(skb2, net, NETLINK_CB(skb).portid); err_fill_table_info: kfree_skb(skb2); return err; } static void nft_table_disable(struct net *net, struct nft_table *table, u32 cnt) { struct nft_chain *chain; u32 i = 0; list_for_each_entry(chain, &table->chains, list) { if (!nft_is_active_next(net, chain)) continue; if (!nft_is_base_chain(chain)) continue; if (cnt && i++ == cnt) break; nf_tables_unregister_hook(net, table, chain); } } static int nf_tables_table_enable(struct net *net, struct nft_table *table) { struct nft_chain *chain; int err, i = 0; list_for_each_entry(chain, &table->chains, list) { if (!nft_is_active_next(net, chain)) continue; if (!nft_is_base_chain(chain)) continue; err = nf_tables_register_hook(net, table, chain); if (err < 0) goto err_register_hooks; i++; } return 0; err_register_hooks: if (i) nft_table_disable(net, table, i); return err; } static void nf_tables_table_disable(struct net *net, struct nft_table *table) { table->flags &= ~NFT_TABLE_F_DORMANT; nft_table_disable(net, table, 0); table->flags |= NFT_TABLE_F_DORMANT; } #define __NFT_TABLE_F_INTERNAL (NFT_TABLE_F_MASK + 1) #define __NFT_TABLE_F_WAS_DORMANT (__NFT_TABLE_F_INTERNAL << 0) #define __NFT_TABLE_F_WAS_AWAKEN (__NFT_TABLE_F_INTERNAL << 1) #define __NFT_TABLE_F_WAS_ORPHAN (__NFT_TABLE_F_INTERNAL << 2) #define __NFT_TABLE_F_UPDATE (__NFT_TABLE_F_WAS_DORMANT | \ 
__NFT_TABLE_F_WAS_AWAKEN | \ __NFT_TABLE_F_WAS_ORPHAN) static bool nft_table_pending_update(const struct nft_ctx *ctx) { struct nftables_pernet *nft_net = nft_pernet(ctx->net); struct nft_trans *trans; if (ctx->table->flags & __NFT_TABLE_F_UPDATE) return true; list_for_each_entry(trans, &nft_net->commit_list, list) { if (trans->table == ctx->table && ((trans->msg_type == NFT_MSG_NEWCHAIN && nft_trans_chain_update(trans)) || (trans->msg_type == NFT_MSG_DELCHAIN && nft_is_base_chain(nft_trans_chain(trans))))) return true; } return false; } static int nf_tables_updtable(struct nft_ctx *ctx) { struct nft_trans *trans; u32 flags; int ret; if (!ctx->nla[NFTA_TABLE_FLAGS]) return 0; flags = ntohl(nla_get_be32(ctx->nla[NFTA_TABLE_FLAGS])); if (flags & ~NFT_TABLE_F_MASK) return -EOPNOTSUPP; if (flags == (ctx->table->flags & NFT_TABLE_F_MASK)) return 0; if ((nft_table_has_owner(ctx->table) && !(flags & NFT_TABLE_F_OWNER)) || (flags & NFT_TABLE_F_OWNER && !nft_table_is_orphan(ctx->table))) return -EOPNOTSUPP; if ((flags ^ ctx->table->flags) & NFT_TABLE_F_PERSIST) return -EOPNOTSUPP; /* No dormant off/on/off/on games in single transaction */ if (nft_table_pending_update(ctx)) return -EINVAL; trans = nft_trans_alloc(ctx, NFT_MSG_NEWTABLE, sizeof(struct nft_trans_table)); if (trans == NULL) return -ENOMEM; if ((flags & NFT_TABLE_F_DORMANT) && !(ctx->table->flags & NFT_TABLE_F_DORMANT)) { ctx->table->flags |= NFT_TABLE_F_DORMANT; if (!(ctx->table->flags & __NFT_TABLE_F_UPDATE)) ctx->table->flags |= __NFT_TABLE_F_WAS_AWAKEN; } else if (!(flags & NFT_TABLE_F_DORMANT) && ctx->table->flags & NFT_TABLE_F_DORMANT) { ctx->table->flags &= ~NFT_TABLE_F_DORMANT; if (!(ctx->table->flags & __NFT_TABLE_F_UPDATE)) { ret = nf_tables_table_enable(ctx->net, ctx->table); if (ret < 0) goto err_register_hooks; ctx->table->flags |= __NFT_TABLE_F_WAS_DORMANT; } } if ((flags & NFT_TABLE_F_OWNER) && !nft_table_has_owner(ctx->table)) { ctx->table->nlpid = ctx->portid; ctx->table->flags |= NFT_TABLE_F_OWNER | __NFT_TABLE_F_WAS_ORPHAN; } nft_trans_table_update(trans) = true; nft_trans_commit_list_add_tail(ctx->net, trans); return 0; err_register_hooks: ctx->table->flags |= NFT_TABLE_F_DORMANT; nft_trans_destroy(trans); return ret; } static u32 nft_chain_hash(const void *data, u32 len, u32 seed) { const char *name = data; return jhash(name, strlen(name), seed); } static u32 nft_chain_hash_obj(const void *data, u32 len, u32 seed) { const struct nft_chain *chain = data; return nft_chain_hash(chain->name, 0, seed); } static int nft_chain_hash_cmp(struct rhashtable_compare_arg *arg, const void *ptr) { const struct nft_chain *chain = ptr; const char *name = arg->key; return strcmp(chain->name, name); } static u32 nft_objname_hash(const void *data, u32 len, u32 seed) { const struct nft_object_hash_key *k = data; seed ^= hash_ptr(k->table, 32); return jhash(k->name, strlen(k->name), seed); } static u32 nft_objname_hash_obj(const void *data, u32 len, u32 seed) { const struct nft_object *obj = data; return nft_objname_hash(&obj->key, 0, seed); } static int nft_objname_hash_cmp(struct rhashtable_compare_arg *arg, const void *ptr) { const struct nft_object_hash_key *k = arg->key; const struct nft_object *obj = ptr; if (obj->key.table != k->table) return -1; return strcmp(obj->key.name, k->name); } static bool nft_supported_family(u8 family) { return false #ifdef CONFIG_NF_TABLES_INET || family == NFPROTO_INET #endif #ifdef CONFIG_NF_TABLES_IPV4 || family == NFPROTO_IPV4 #endif #ifdef CONFIG_NF_TABLES_ARP || family == NFPROTO_ARP #endif #ifdef 
CONFIG_NF_TABLES_NETDEV || family == NFPROTO_NETDEV #endif #if IS_ENABLED(CONFIG_NF_TABLES_BRIDGE) || family == NFPROTO_BRIDGE #endif #ifdef CONFIG_NF_TABLES_IPV6 || family == NFPROTO_IPV6 #endif ; } static int nf_tables_newtable(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nla[]) { struct nftables_pernet *nft_net = nft_pernet(info->net); struct netlink_ext_ack *extack = info->extack; u8 genmask = nft_genmask_next(info->net); u8 family = info->nfmsg->nfgen_family; struct net *net = info->net; const struct nlattr *attr; struct nft_table *table; struct nft_ctx ctx; u32 flags = 0; int err; if (!nft_supported_family(family)) return -EOPNOTSUPP; lockdep_assert_held(&nft_net->commit_mutex); attr = nla[NFTA_TABLE_NAME]; table = nft_table_lookup(net, attr, family, genmask, NETLINK_CB(skb).portid); if (IS_ERR(table)) { if (PTR_ERR(table) != -ENOENT) return PTR_ERR(table); } else { if (info->nlh->nlmsg_flags & NLM_F_EXCL) { NL_SET_BAD_ATTR(extack, attr); return -EEXIST; } if (info->nlh->nlmsg_flags & NLM_F_REPLACE) return -EOPNOTSUPP; nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla); return nf_tables_updtable(&ctx); } if (nla[NFTA_TABLE_FLAGS]) { flags = ntohl(nla_get_be32(nla[NFTA_TABLE_FLAGS])); if (flags & ~NFT_TABLE_F_MASK) return -EOPNOTSUPP; } err = -ENOMEM; table = kzalloc(sizeof(*table), GFP_KERNEL_ACCOUNT); if (table == NULL) goto err_kzalloc; table->validate_state = nft_net->validate_state; table->name = nla_strdup(attr, GFP_KERNEL_ACCOUNT); if (table->name == NULL) goto err_strdup; if (nla[NFTA_TABLE_USERDATA]) { table->udata = nla_memdup(nla[NFTA_TABLE_USERDATA], GFP_KERNEL_ACCOUNT); if (table->udata == NULL) goto err_table_udata; table->udlen = nla_len(nla[NFTA_TABLE_USERDATA]); } err = rhltable_init(&table->chains_ht, &nft_chain_ht_params); if (err) goto err_chain_ht; INIT_LIST_HEAD(&table->chains); INIT_LIST_HEAD(&table->sets); INIT_LIST_HEAD(&table->objects); INIT_LIST_HEAD(&table->flowtables); table->family = family; table->flags = flags; table->handle = ++nft_net->table_handle; if (table->flags & NFT_TABLE_F_OWNER) table->nlpid = NETLINK_CB(skb).portid; nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla); err = nft_trans_table_add(&ctx, NFT_MSG_NEWTABLE); if (err < 0) goto err_trans; list_add_tail_rcu(&table->list, &nft_net->tables); return 0; err_trans: rhltable_destroy(&table->chains_ht); err_chain_ht: kfree(table->udata); err_table_udata: kfree(table->name); err_strdup: kfree(table); err_kzalloc: return err; } static int nft_flush_table(struct nft_ctx *ctx) { struct nft_flowtable *flowtable, *nft; struct nft_chain *chain, *nc; struct nft_object *obj, *ne; struct nft_set *set, *ns; int err; list_for_each_entry(chain, &ctx->table->chains, list) { if (!nft_is_active_next(ctx->net, chain)) continue; if (nft_chain_binding(chain)) continue; ctx->chain = chain; err = nft_delrule_by_chain(ctx); if (err < 0) goto out; } list_for_each_entry_safe(set, ns, &ctx->table->sets, list) { if (!nft_is_active_next(ctx->net, set)) continue; if (nft_set_is_anonymous(set)) continue; err = nft_delset(ctx, set); if (err < 0) goto out; } list_for_each_entry_safe(flowtable, nft, &ctx->table->flowtables, list) { if (!nft_is_active_next(ctx->net, flowtable)) continue; err = nft_delflowtable(ctx, flowtable); if (err < 0) goto out; } list_for_each_entry_safe(obj, ne, &ctx->table->objects, list) { if (!nft_is_active_next(ctx->net, obj)) continue; err = nft_delobj(ctx, obj); if (err < 0) goto out; } list_for_each_entry_safe(chain, nc, 
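/* Descriptive note (added for illustration; not in the upstream file):
 * nft_flush_table() tears a table down in dependency order -- rules first,
 * then sets, flowtables and stateful objects, and only then the chains
 * themselves (bound chains and anonymous sets are skipped), finishing with
 * nft_deltable().  A chain's use count must reach zero before it can be
 * destroyed, which is why the rule removal above has to come first.
 */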
&ctx->table->chains, list) { if (!nft_is_active_next(ctx->net, chain)) continue; if (nft_chain_binding(chain)) continue; ctx->chain = chain; err = nft_delchain(ctx); if (err < 0) goto out; } err = nft_deltable(ctx); out: return err; } static int nft_flush(struct nft_ctx *ctx, int family) { struct nftables_pernet *nft_net = nft_pernet(ctx->net); const struct nlattr * const *nla = ctx->nla; struct nft_table *table, *nt; int err = 0; list_for_each_entry_safe(table, nt, &nft_net->tables, list) { if (family != AF_UNSPEC && table->family != family) continue; ctx->family = table->family; if (!nft_is_active_next(ctx->net, table)) continue; if (nft_table_has_owner(table) && table->nlpid != ctx->portid) continue; if (nla[NFTA_TABLE_NAME] && nla_strcmp(nla[NFTA_TABLE_NAME], table->name) != 0) continue; ctx->table = table; err = nft_flush_table(ctx); if (err < 0) goto out; } out: return err; } static int nf_tables_deltable(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nla[]) { struct netlink_ext_ack *extack = info->extack; u8 genmask = nft_genmask_next(info->net); u8 family = info->nfmsg->nfgen_family; struct net *net = info->net; const struct nlattr *attr; struct nft_table *table; struct nft_ctx ctx; nft_ctx_init(&ctx, net, skb, info->nlh, 0, NULL, NULL, nla); if (family == AF_UNSPEC || (!nla[NFTA_TABLE_NAME] && !nla[NFTA_TABLE_HANDLE])) return nft_flush(&ctx, family); if (nla[NFTA_TABLE_HANDLE]) { attr = nla[NFTA_TABLE_HANDLE]; table = nft_table_lookup_byhandle(net, attr, family, genmask, NETLINK_CB(skb).portid); } else { attr = nla[NFTA_TABLE_NAME]; table = nft_table_lookup(net, attr, family, genmask, NETLINK_CB(skb).portid); } if (IS_ERR(table)) { if (PTR_ERR(table) == -ENOENT && NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_DESTROYTABLE) return 0; NL_SET_BAD_ATTR(extack, attr); return PTR_ERR(table); } if (info->nlh->nlmsg_flags & NLM_F_NONREC && table->use > 0) return -EBUSY; ctx.family = family; ctx.table = table; return nft_flush_table(&ctx); } static void nf_tables_table_destroy(struct nft_table *table) { if (WARN_ON(table->use > 0)) return; rhltable_destroy(&table->chains_ht); kfree(table->name); kfree(table->udata); kfree(table); } void nft_register_chain_type(const struct nft_chain_type *ctype) { nfnl_lock(NFNL_SUBSYS_NFTABLES); if (WARN_ON(__nft_chain_type_get(ctype->family, ctype->type))) { nfnl_unlock(NFNL_SUBSYS_NFTABLES); return; } chain_type[ctype->family][ctype->type] = ctype; nfnl_unlock(NFNL_SUBSYS_NFTABLES); } EXPORT_SYMBOL_GPL(nft_register_chain_type); void nft_unregister_chain_type(const struct nft_chain_type *ctype) { nfnl_lock(NFNL_SUBSYS_NFTABLES); chain_type[ctype->family][ctype->type] = NULL; nfnl_unlock(NFNL_SUBSYS_NFTABLES); } EXPORT_SYMBOL_GPL(nft_unregister_chain_type); /* * Chains */ static struct nft_chain * nft_chain_lookup_byhandle(const struct nft_table *table, u64 handle, u8 genmask) { struct nft_chain *chain; list_for_each_entry(chain, &table->chains, list) { if (chain->handle == handle && nft_active_genmask(chain, genmask)) return chain; } return ERR_PTR(-ENOENT); } static bool lockdep_commit_lock_is_held(const struct net *net) { #ifdef CONFIG_PROVE_LOCKING struct nftables_pernet *nft_net = nft_pernet(net); return lockdep_is_held(&nft_net->commit_mutex); #else return true; #endif } static struct nft_chain *nft_chain_lookup(struct net *net, struct nft_table *table, const struct nlattr *nla, u8 genmask) { char search[NFT_CHAIN_MAXNAMELEN + 1]; struct rhlist_head *tmp, *list; struct nft_chain *chain; if (nla == NULL) return 
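/* Illustrative sketch (not in the upstream file): chains live in a
 * per-table rhltable keyed by name, and an old and a new generation of the
 * same chain may share that key while a transaction is in flight.  The
 * lookup below therefore walks the whole bucket and returns the entry
 * whose genmask matches the generation the caller asked for, roughly:
 *
 *	list = rhltable_lookup(&table->chains_ht, search, nft_chain_ht_params);
 *	rhl_for_each_entry_rcu(chain, tmp, list, rhlhead)
 *		if (nft_active_genmask(chain, genmask))
 *			return chain;
 */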
ERR_PTR(-EINVAL); nla_strscpy(search, nla, sizeof(search)); WARN_ON(!rcu_read_lock_held() && !lockdep_commit_lock_is_held(net)); chain = ERR_PTR(-ENOENT); rcu_read_lock(); list = rhltable_lookup(&table->chains_ht, search, nft_chain_ht_params); if (!list) goto out_unlock; rhl_for_each_entry_rcu(chain, tmp, list, rhlhead) { if (nft_active_genmask(chain, genmask)) goto out_unlock; } chain = ERR_PTR(-ENOENT); out_unlock: rcu_read_unlock(); return chain; } static const struct nla_policy nft_chain_policy[NFTA_CHAIN_MAX + 1] = { [NFTA_CHAIN_TABLE] = { .type = NLA_STRING, .len = NFT_TABLE_MAXNAMELEN - 1 }, [NFTA_CHAIN_HANDLE] = { .type = NLA_U64 }, [NFTA_CHAIN_NAME] = { .type = NLA_STRING, .len = NFT_CHAIN_MAXNAMELEN - 1 }, [NFTA_CHAIN_HOOK] = { .type = NLA_NESTED }, [NFTA_CHAIN_POLICY] = { .type = NLA_U32 }, [NFTA_CHAIN_TYPE] = { .type = NLA_STRING, .len = NFT_MODULE_AUTOLOAD_LIMIT }, [NFTA_CHAIN_COUNTERS] = { .type = NLA_NESTED }, [NFTA_CHAIN_FLAGS] = { .type = NLA_U32 }, [NFTA_CHAIN_ID] = { .type = NLA_U32 }, [NFTA_CHAIN_USERDATA] = { .type = NLA_BINARY, .len = NFT_USERDATA_MAXLEN }, }; static const struct nla_policy nft_hook_policy[NFTA_HOOK_MAX + 1] = { [NFTA_HOOK_HOOKNUM] = { .type = NLA_U32 }, [NFTA_HOOK_PRIORITY] = { .type = NLA_U32 }, [NFTA_HOOK_DEV] = { .type = NLA_STRING, .len = IFNAMSIZ - 1 }, }; static int nft_dump_stats(struct sk_buff *skb, struct nft_stats __percpu *stats) { struct nft_stats *cpu_stats, total; struct nlattr *nest; unsigned int seq; u64 pkts, bytes; int cpu; if (!stats) return 0; memset(&total, 0, sizeof(total)); for_each_possible_cpu(cpu) { cpu_stats = per_cpu_ptr(stats, cpu); do { seq = u64_stats_fetch_begin(&cpu_stats->syncp); pkts = cpu_stats->pkts; bytes = cpu_stats->bytes; } while (u64_stats_fetch_retry(&cpu_stats->syncp, seq)); total.pkts += pkts; total.bytes += bytes; } nest = nla_nest_start_noflag(skb, NFTA_CHAIN_COUNTERS); if (nest == NULL) goto nla_put_failure; if (nla_put_be64(skb, NFTA_COUNTER_PACKETS, cpu_to_be64(total.pkts), NFTA_COUNTER_PAD) || nla_put_be64(skb, NFTA_COUNTER_BYTES, cpu_to_be64(total.bytes), NFTA_COUNTER_PAD)) goto nla_put_failure; nla_nest_end(skb, nest); return 0; nla_put_failure: return -ENOSPC; } static int nft_dump_basechain_hook(struct sk_buff *skb, const struct net *net, int family, const struct nft_base_chain *basechain, const struct list_head *hook_list) { const struct nf_hook_ops *ops = &basechain->ops; struct nft_hook *hook, *first = NULL; struct nlattr *nest, *nest_devs; int n = 0; nest = nla_nest_start_noflag(skb, NFTA_CHAIN_HOOK); if (nest == NULL) goto nla_put_failure; if (nla_put_be32(skb, NFTA_HOOK_HOOKNUM, htonl(ops->hooknum))) goto nla_put_failure; if (nla_put_be32(skb, NFTA_HOOK_PRIORITY, htonl(ops->priority))) goto nla_put_failure; if (nft_base_chain_netdev(family, ops->hooknum)) { nest_devs = nla_nest_start_noflag(skb, NFTA_HOOK_DEVS); if (!nest_devs) goto nla_put_failure; if (!hook_list) hook_list = &basechain->hook_list; list_for_each_entry_rcu(hook, hook_list, list, lockdep_commit_lock_is_held(net)) { if (!first) first = hook; if (nla_put(skb, NFTA_DEVICE_NAME, hook->ifnamelen, hook->ifname)) goto nla_put_failure; n++; } nla_nest_end(skb, nest_devs); if (n == 1 && nla_put(skb, NFTA_HOOK_DEV, first->ifnamelen, first->ifname)) goto nla_put_failure; } nla_nest_end(skb, nest); return 0; nla_put_failure: return -1; } static int nf_tables_fill_chain_info(struct sk_buff *skb, struct net *net, u32 portid, u32 seq, int event, u32 flags, int family, const struct nft_table *table, const struct nft_chain *chain, const 
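/* Illustrative sketch (not in the upstream file): nft_dump_stats() above
 * folds the per-cpu counters into a single total.  Each CPU's pair is
 * sampled under its u64_stats sequence counter, so a 64-bit value torn by
 * a concurrent writer on a 32-bit host is simply re-read:
 *
 *	do {
 *		seq   = u64_stats_fetch_begin(&cpu_stats->syncp);
 *		pkts  = cpu_stats->pkts;
 *		bytes = cpu_stats->bytes;
 *	} while (u64_stats_fetch_retry(&cpu_stats->syncp, seq));
 */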
struct list_head *hook_list) { struct nlmsghdr *nlh; event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event); nlh = nfnl_msg_put(skb, portid, seq, event, flags, family, NFNETLINK_V0, nft_base_seq(net)); if (!nlh) goto nla_put_failure; if (nla_put_string(skb, NFTA_CHAIN_TABLE, table->name) || nla_put_string(skb, NFTA_CHAIN_NAME, chain->name) || nla_put_be64(skb, NFTA_CHAIN_HANDLE, cpu_to_be64(chain->handle), NFTA_CHAIN_PAD)) goto nla_put_failure; if (event == NFT_MSG_DELCHAIN && !hook_list) { nlmsg_end(skb, nlh); return 0; } if (nft_is_base_chain(chain)) { const struct nft_base_chain *basechain = nft_base_chain(chain); struct nft_stats __percpu *stats; if (nft_dump_basechain_hook(skb, net, family, basechain, hook_list)) goto nla_put_failure; if (nla_put_be32(skb, NFTA_CHAIN_POLICY, htonl(basechain->policy))) goto nla_put_failure; if (nla_put_string(skb, NFTA_CHAIN_TYPE, basechain->type->name)) goto nla_put_failure; stats = rcu_dereference_check(basechain->stats, lockdep_commit_lock_is_held(net)); if (nft_dump_stats(skb, stats)) goto nla_put_failure; } if (chain->flags && nla_put_be32(skb, NFTA_CHAIN_FLAGS, htonl(chain->flags))) goto nla_put_failure; if (nla_put_be32(skb, NFTA_CHAIN_USE, htonl(chain->use))) goto nla_put_failure; if (chain->udata && nla_put(skb, NFTA_CHAIN_USERDATA, chain->udlen, chain->udata)) goto nla_put_failure; nlmsg_end(skb, nlh); return 0; nla_put_failure: nlmsg_trim(skb, nlh); return -1; } static void nf_tables_chain_notify(const struct nft_ctx *ctx, int event, const struct list_head *hook_list) { struct nftables_pernet *nft_net; struct sk_buff *skb; u16 flags = 0; int err; if (!ctx->report && !nfnetlink_has_listeners(ctx->net, NFNLGRP_NFTABLES)) return; skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); if (skb == NULL) goto err; if (ctx->flags & (NLM_F_CREATE | NLM_F_EXCL)) flags |= ctx->flags & (NLM_F_CREATE | NLM_F_EXCL); err = nf_tables_fill_chain_info(skb, ctx->net, ctx->portid, ctx->seq, event, flags, ctx->family, ctx->table, ctx->chain, hook_list); if (err < 0) { kfree_skb(skb); goto err; } nft_net = nft_pernet(ctx->net); nft_notify_enqueue(skb, ctx->report, &nft_net->notify_list); return; err: nfnetlink_set_err(ctx->net, ctx->portid, NFNLGRP_NFTABLES, -ENOBUFS); } static int nf_tables_dump_chains(struct sk_buff *skb, struct netlink_callback *cb) { const struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh); unsigned int idx = 0, s_idx = cb->args[0]; struct net *net = sock_net(skb->sk); int family = nfmsg->nfgen_family; struct nftables_pernet *nft_net; const struct nft_table *table; const struct nft_chain *chain; rcu_read_lock(); nft_net = nft_pernet(net); cb->seq = READ_ONCE(nft_net->base_seq); list_for_each_entry_rcu(table, &nft_net->tables, list) { if (family != NFPROTO_UNSPEC && family != table->family) continue; list_for_each_entry_rcu(chain, &table->chains, list) { if (idx < s_idx) goto cont; if (idx > s_idx) memset(&cb->args[1], 0, sizeof(cb->args) - sizeof(cb->args[0])); if (!nft_is_active(net, chain)) continue; if (nf_tables_fill_chain_info(skb, net, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NFT_MSG_NEWCHAIN, NLM_F_MULTI, table->family, table, chain, NULL) < 0) goto done; nl_dump_check_consistent(cb, nlmsg_hdr(skb)); cont: idx++; } } done: rcu_read_unlock(); cb->args[0] = idx; return skb->len; } /* called with rcu_read_lock held */ static int nf_tables_getchain(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nla[]) { struct netlink_ext_ack *extack = info->extack; u8 genmask = nft_genmask_cur(info->net); u8 family = 
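/* Descriptive note (added for illustration; not in the upstream file):
 * as with tables, a chain GET carrying NLM_F_DUMP is turned into a netlink
 * dump via nf_tables_dump_chains(), while a plain GET resolves one
 * table/chain pair, fills a freshly allocated skb and unicasts it to the
 * requesting portid.  The handler runs under rcu_read_lock (see the
 * comment just above it), hence the GFP_ATOMIC allocation of the reply skb.
 */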
info->nfmsg->nfgen_family; const struct nft_chain *chain; struct net *net = info->net; struct nft_table *table; struct sk_buff *skb2; int err; if (info->nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { .dump = nf_tables_dump_chains, .module = THIS_MODULE, }; return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c); } table = nft_table_lookup(net, nla[NFTA_CHAIN_TABLE], family, genmask, 0); if (IS_ERR(table)) { NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_TABLE]); return PTR_ERR(table); } chain = nft_chain_lookup(net, table, nla[NFTA_CHAIN_NAME], genmask); if (IS_ERR(chain)) { NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_NAME]); return PTR_ERR(chain); } skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC); if (!skb2) return -ENOMEM; err = nf_tables_fill_chain_info(skb2, net, NETLINK_CB(skb).portid, info->nlh->nlmsg_seq, NFT_MSG_NEWCHAIN, 0, family, table, chain, NULL); if (err < 0) goto err_fill_chain_info; return nfnetlink_unicast(skb2, net, NETLINK_CB(skb).portid); err_fill_chain_info: kfree_skb(skb2); return err; } static const struct nla_policy nft_counter_policy[NFTA_COUNTER_MAX + 1] = { [NFTA_COUNTER_PACKETS] = { .type = NLA_U64 }, [NFTA_COUNTER_BYTES] = { .type = NLA_U64 }, }; static struct nft_stats __percpu *nft_stats_alloc(const struct nlattr *attr) { struct nlattr *tb[NFTA_COUNTER_MAX+1]; struct nft_stats __percpu *newstats; struct nft_stats *stats; int err; err = nla_parse_nested_deprecated(tb, NFTA_COUNTER_MAX, attr, nft_counter_policy, NULL); if (err < 0) return ERR_PTR_PCPU(err); if (!tb[NFTA_COUNTER_BYTES] || !tb[NFTA_COUNTER_PACKETS]) return ERR_PTR_PCPU(-EINVAL); newstats = netdev_alloc_pcpu_stats(struct nft_stats); if (newstats == NULL) return ERR_PTR_PCPU(-ENOMEM); /* Restore old counters on this cpu, no problem. Per-cpu statistics * are not exposed to userspace. 
*/ preempt_disable(); stats = this_cpu_ptr(newstats); stats->bytes = be64_to_cpu(nla_get_be64(tb[NFTA_COUNTER_BYTES])); stats->pkts = be64_to_cpu(nla_get_be64(tb[NFTA_COUNTER_PACKETS])); preempt_enable(); return newstats; } static void nft_chain_stats_replace(struct nft_trans_chain *trans) { const struct nft_trans *t = &trans->nft_trans_binding.nft_trans; struct nft_base_chain *chain = nft_base_chain(trans->chain); if (!trans->stats) return; trans->stats = rcu_replace_pointer(chain->stats, trans->stats, lockdep_commit_lock_is_held(t->net)); if (!trans->stats) static_branch_inc(&nft_counters_enabled); } static void nf_tables_chain_free_chain_rules(struct nft_chain *chain) { struct nft_rule_blob *g0 = rcu_dereference_raw(chain->blob_gen_0); struct nft_rule_blob *g1 = rcu_dereference_raw(chain->blob_gen_1); if (g0 != g1) kvfree(g1); kvfree(g0); /* should be NULL either via abort or via successful commit */ WARN_ON_ONCE(chain->blob_next); kvfree(chain->blob_next); } void nf_tables_chain_destroy(struct nft_chain *chain) { const struct nft_table *table = chain->table; struct nft_hook *hook, *next; if (WARN_ON(chain->use > 0)) return; /* no concurrent access possible anymore */ nf_tables_chain_free_chain_rules(chain); if (nft_is_base_chain(chain)) { struct nft_base_chain *basechain = nft_base_chain(chain); if (nft_base_chain_netdev(table->family, basechain->ops.hooknum)) { list_for_each_entry_safe(hook, next, &basechain->hook_list, list) { list_del_rcu(&hook->list); kfree_rcu(hook, rcu); } } module_put(basechain->type->owner); if (rcu_access_pointer(basechain->stats)) { static_branch_dec(&nft_counters_enabled); free_percpu(rcu_dereference_raw(basechain->stats)); } kfree(chain->name); kfree(chain->udata); kfree(basechain); } else { kfree(chain->name); kfree(chain->udata); kfree(chain); } } static struct nft_hook *nft_netdev_hook_alloc(struct net *net, const struct nlattr *attr) { struct net_device *dev; struct nft_hook *hook; int err; hook = kzalloc(sizeof(struct nft_hook), GFP_KERNEL_ACCOUNT); if (!hook) { err = -ENOMEM; goto err_hook_alloc; } err = nla_strscpy(hook->ifname, attr, IFNAMSIZ); if (err < 0) goto err_hook_dev; hook->ifnamelen = nla_len(attr); /* nf_tables_netdev_event() is called under rtnl_mutex, this is * indirectly serializing all the other holders of the commit_mutex with * the rtnl_mutex. 
*/ dev = __dev_get_by_name(net, hook->ifname); if (!dev) { err = -ENOENT; goto err_hook_dev; } hook->ops.dev = dev; return hook; err_hook_dev: kfree(hook); err_hook_alloc: return ERR_PTR(err); } static struct nft_hook *nft_hook_list_find(struct list_head *hook_list, const struct nft_hook *this) { struct nft_hook *hook; list_for_each_entry(hook, hook_list, list) { if (!strcmp(hook->ifname, this->ifname)) return hook; } return NULL; } static int nf_tables_parse_netdev_hooks(struct net *net, const struct nlattr *attr, struct list_head *hook_list, struct netlink_ext_ack *extack) { struct nft_hook *hook, *next; const struct nlattr *tmp; int rem, n = 0, err; nla_for_each_nested(tmp, attr, rem) { if (nla_type(tmp) != NFTA_DEVICE_NAME) { err = -EINVAL; goto err_hook; } hook = nft_netdev_hook_alloc(net, tmp); if (IS_ERR(hook)) { NL_SET_BAD_ATTR(extack, tmp); err = PTR_ERR(hook); goto err_hook; } if (nft_hook_list_find(hook_list, hook)) { NL_SET_BAD_ATTR(extack, tmp); kfree(hook); err = -EEXIST; goto err_hook; } list_add_tail(&hook->list, hook_list); n++; if (n == NFT_NETDEVICE_MAX) { err = -EFBIG; goto err_hook; } } return 0; err_hook: list_for_each_entry_safe(hook, next, hook_list, list) { list_del(&hook->list); kfree(hook); } return err; } struct nft_chain_hook { u32 num; s32 priority; const struct nft_chain_type *type; struct list_head list; }; static int nft_chain_parse_netdev(struct net *net, struct nlattr *tb[], struct list_head *hook_list, struct netlink_ext_ack *extack, u32 flags) { struct nft_hook *hook; int err; if (tb[NFTA_HOOK_DEV]) { hook = nft_netdev_hook_alloc(net, tb[NFTA_HOOK_DEV]); if (IS_ERR(hook)) { NL_SET_BAD_ATTR(extack, tb[NFTA_HOOK_DEV]); return PTR_ERR(hook); } list_add_tail(&hook->list, hook_list); } else if (tb[NFTA_HOOK_DEVS]) { err = nf_tables_parse_netdev_hooks(net, tb[NFTA_HOOK_DEVS], hook_list, extack); if (err < 0) return err; } if (flags & NFT_CHAIN_HW_OFFLOAD && list_empty(hook_list)) return -EINVAL; return 0; } static int nft_chain_parse_hook(struct net *net, struct nft_base_chain *basechain, const struct nlattr * const nla[], struct nft_chain_hook *hook, u8 family, u32 flags, struct netlink_ext_ack *extack) { struct nftables_pernet *nft_net = nft_pernet(net); struct nlattr *ha[NFTA_HOOK_MAX + 1]; const struct nft_chain_type *type; int err; lockdep_assert_held(&nft_net->commit_mutex); lockdep_nfnl_nft_mutex_not_held(); err = nla_parse_nested_deprecated(ha, NFTA_HOOK_MAX, nla[NFTA_CHAIN_HOOK], nft_hook_policy, NULL); if (err < 0) return err; if (!basechain) { if (!ha[NFTA_HOOK_HOOKNUM] || !ha[NFTA_HOOK_PRIORITY]) { NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_NAME]); return -ENOENT; } hook->num = ntohl(nla_get_be32(ha[NFTA_HOOK_HOOKNUM])); hook->priority = ntohl(nla_get_be32(ha[NFTA_HOOK_PRIORITY])); type = __nft_chain_type_get(family, NFT_CHAIN_T_DEFAULT); if (!type) return -EOPNOTSUPP; if (nla[NFTA_CHAIN_TYPE]) { type = nf_tables_chain_type_lookup(net, nla[NFTA_CHAIN_TYPE], family, true); if (IS_ERR(type)) { NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_TYPE]); return PTR_ERR(type); } } if (hook->num >= NFT_MAX_HOOKS || !(type->hook_mask & (1 << hook->num))) return -EOPNOTSUPP; if (type->type == NFT_CHAIN_T_NAT && hook->priority <= NF_IP_PRI_CONNTRACK) return -EOPNOTSUPP; } else { if (ha[NFTA_HOOK_HOOKNUM]) { hook->num = ntohl(nla_get_be32(ha[NFTA_HOOK_HOOKNUM])); if (hook->num != basechain->ops.hooknum) return -EOPNOTSUPP; } if (ha[NFTA_HOOK_PRIORITY]) { hook->priority = ntohl(nla_get_be32(ha[NFTA_HOOK_PRIORITY])); if (hook->priority != basechain->ops.priority) return 
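/* Descriptive note (added for illustration; not in the upstream file):
 * for netdev-family base chains the hook specification may name a single
 * device via NFTA_HOOK_DEV or several via a nested NFTA_HOOK_DEVS list;
 * nf_tables_parse_netdev_hooks() above rejects duplicate names, caps the
 * list at NFT_NETDEVICE_MAX entries and frees every allocated nft_hook on
 * error.  Device resolution uses __dev_get_by_name(), relying on the
 * rtnl/commit_mutex serialization noted in nft_netdev_hook_alloc().
 */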
-EOPNOTSUPP; } if (nla[NFTA_CHAIN_TYPE]) { type = __nf_tables_chain_type_lookup(nla[NFTA_CHAIN_TYPE], family); if (!type) { NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_TYPE]); return -ENOENT; } } else { type = basechain->type; } } if (!try_module_get(type->owner)) { if (nla[NFTA_CHAIN_TYPE]) NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_TYPE]); return -ENOENT; } hook->type = type; INIT_LIST_HEAD(&hook->list); if (nft_base_chain_netdev(family, hook->num)) { err = nft_chain_parse_netdev(net, ha, &hook->list, extack, flags); if (err < 0) { module_put(type->owner); return err; } } else if (ha[NFTA_HOOK_DEV] || ha[NFTA_HOOK_DEVS]) { module_put(type->owner); return -EOPNOTSUPP; } return 0; } static void nft_chain_release_hook(struct nft_chain_hook *hook) { struct nft_hook *h, *next; list_for_each_entry_safe(h, next, &hook->list, list) { list_del(&h->list); kfree(h); } module_put(hook->type->owner); } static void nft_last_rule(const struct nft_chain *chain, const void *ptr) { struct nft_rule_dp_last *lrule; BUILD_BUG_ON(offsetof(struct nft_rule_dp_last, end) != 0); lrule = (struct nft_rule_dp_last *)ptr; lrule->end.is_last = 1; lrule->chain = chain; /* blob size does not include the trailer rule */ } static struct nft_rule_blob *nf_tables_chain_alloc_rules(const struct nft_chain *chain, unsigned int size) { struct nft_rule_blob *blob; if (size > INT_MAX) return NULL; size += sizeof(struct nft_rule_blob) + sizeof(struct nft_rule_dp_last); blob = kvmalloc(size, GFP_KERNEL_ACCOUNT); if (!blob) return NULL; blob->size = 0; nft_last_rule(chain, blob->data); return blob; } static void nft_basechain_hook_init(struct nf_hook_ops *ops, u8 family, const struct nft_chain_hook *hook, struct nft_chain *chain) { ops->pf = family; ops->hooknum = hook->num; ops->priority = hook->priority; ops->priv = chain; ops->hook = hook->type->hooks[ops->hooknum]; ops->hook_ops_type = NF_HOOK_OP_NF_TABLES; } static int nft_basechain_init(struct nft_base_chain *basechain, u8 family, struct nft_chain_hook *hook, u32 flags) { struct nft_chain *chain; struct nft_hook *h; basechain->type = hook->type; INIT_LIST_HEAD(&basechain->hook_list); chain = &basechain->chain; if (nft_base_chain_netdev(family, hook->num)) { list_splice_init(&hook->list, &basechain->hook_list); list_for_each_entry(h, &basechain->hook_list, list) nft_basechain_hook_init(&h->ops, family, hook, chain); } nft_basechain_hook_init(&basechain->ops, family, hook, chain); chain->flags |= NFT_CHAIN_BASE | flags; basechain->policy = NF_ACCEPT; if (chain->flags & NFT_CHAIN_HW_OFFLOAD && !nft_chain_offload_support(basechain)) { list_splice_init(&basechain->hook_list, &hook->list); return -EOPNOTSUPP; } flow_block_init(&basechain->flow_block); return 0; } int nft_chain_add(struct nft_table *table, struct nft_chain *chain) { int err; err = rhltable_insert_key(&table->chains_ht, chain->name, &chain->rhlhead, nft_chain_ht_params); if (err) return err; list_add_tail_rcu(&chain->list, &table->chains); return 0; } static u64 chain_id; static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 policy, u32 flags, struct netlink_ext_ack *extack) { const struct nlattr * const *nla = ctx->nla; struct nft_table *table = ctx->table; struct nft_base_chain *basechain; struct net *net = ctx->net; char name[NFT_NAME_MAXLEN]; struct nft_rule_blob *blob; struct nft_trans *trans; struct nft_chain *chain; int err; if (nla[NFTA_CHAIN_HOOK]) { struct nft_stats __percpu *stats = NULL; struct nft_chain_hook hook = {}; if (table->flags & __NFT_TABLE_F_UPDATE) return -EINVAL; if (flags & 
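/* Illustrative sketch (not in the upstream file): the rule blob created by
 * nf_tables_chain_alloc_rules() above reserves room for a trailing
 * struct nft_rule_dp_last so the datapath can walk a chain's rules without
 * a separate length check.  In memory a populated blob looks roughly like
 *
 *	+---------------------+--------+--------+-----+--------------------+
 *	| struct nft_rule_blob | rule 0 | rule 1 | ... | last (is_last = 1) |
 *	+---------------------+--------+--------+-----+--------------------+
 *
 * and, as nft_last_rule() notes, blob->size does not include the trailer.
 */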
NFT_CHAIN_BINDING) return -EOPNOTSUPP; err = nft_chain_parse_hook(net, NULL, nla, &hook, family, flags, extack); if (err < 0) return err; basechain = kzalloc(sizeof(*basechain), GFP_KERNEL_ACCOUNT); if (basechain == NULL) { nft_chain_release_hook(&hook); return -ENOMEM; } chain = &basechain->chain; if (nla[NFTA_CHAIN_COUNTERS]) { stats = nft_stats_alloc(nla[NFTA_CHAIN_COUNTERS]); if (IS_ERR_PCPU(stats)) { nft_chain_release_hook(&hook); kfree(basechain); return PTR_ERR_PCPU(stats); } rcu_assign_pointer(basechain->stats, stats); } err = nft_basechain_init(basechain, family, &hook, flags); if (err < 0) { nft_chain_release_hook(&hook); kfree(basechain); free_percpu(stats); return err; } if (stats) static_branch_inc(&nft_counters_enabled); } else { if (flags & NFT_CHAIN_BASE) return -EINVAL; if (flags & NFT_CHAIN_HW_OFFLOAD) return -EOPNOTSUPP; chain = kzalloc(sizeof(*chain), GFP_KERNEL_ACCOUNT); if (chain == NULL) return -ENOMEM; chain->flags = flags; } ctx->chain = chain; INIT_LIST_HEAD(&chain->rules); chain->handle = nf_tables_alloc_handle(table); chain->table = table; if (nla[NFTA_CHAIN_NAME]) { chain->name = nla_strdup(nla[NFTA_CHAIN_NAME], GFP_KERNEL_ACCOUNT); } else { if (!(flags & NFT_CHAIN_BINDING)) { err = -EINVAL; goto err_destroy_chain; } snprintf(name, sizeof(name), "__chain%llu", ++chain_id); chain->name = kstrdup(name, GFP_KERNEL_ACCOUNT); } if (!chain->name) { err = -ENOMEM; goto err_destroy_chain; } if (nla[NFTA_CHAIN_USERDATA]) { chain->udata = nla_memdup(nla[NFTA_CHAIN_USERDATA], GFP_KERNEL_ACCOUNT); if (chain->udata == NULL) { err = -ENOMEM; goto err_destroy_chain; } chain->udlen = nla_len(nla[NFTA_CHAIN_USERDATA]); } blob = nf_tables_chain_alloc_rules(chain, 0); if (!blob) { err = -ENOMEM; goto err_destroy_chain; } RCU_INIT_POINTER(chain->blob_gen_0, blob); RCU_INIT_POINTER(chain->blob_gen_1, blob); if (!nft_use_inc(&table->use)) { err = -EMFILE; goto err_destroy_chain; } trans = nft_trans_chain_add(ctx, NFT_MSG_NEWCHAIN); if (IS_ERR(trans)) { err = PTR_ERR(trans); goto err_trans; } nft_trans_chain_policy(trans) = NFT_CHAIN_POLICY_UNSET; if (nft_is_base_chain(chain)) nft_trans_chain_policy(trans) = policy; err = nft_chain_add(table, chain); if (err < 0) goto err_chain_add; /* This must be LAST to ensure no packets are walking over this chain. 
*/ err = nf_tables_register_hook(net, table, chain); if (err < 0) goto err_register_hook; return 0; err_register_hook: nft_chain_del(chain); err_chain_add: nft_trans_destroy(trans); err_trans: nft_use_dec_restore(&table->use); err_destroy_chain: nf_tables_chain_destroy(chain); return err; } static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy, u32 flags, const struct nlattr *attr, struct netlink_ext_ack *extack) { const struct nlattr * const *nla = ctx->nla; struct nft_base_chain *basechain = NULL; struct nft_table *table = ctx->table; struct nft_chain *chain = ctx->chain; struct nft_chain_hook hook = {}; struct nft_stats __percpu *stats = NULL; struct nft_hook *h, *next; struct nf_hook_ops *ops; struct nft_trans *trans; bool unregister = false; int err; if (chain->flags ^ flags) return -EOPNOTSUPP; INIT_LIST_HEAD(&hook.list); if (nla[NFTA_CHAIN_HOOK]) { if (!nft_is_base_chain(chain)) { NL_SET_BAD_ATTR(extack, attr); return -EEXIST; } basechain = nft_base_chain(chain); err = nft_chain_parse_hook(ctx->net, basechain, nla, &hook, ctx->family, flags, extack); if (err < 0) return err; if (basechain->type != hook.type) { nft_chain_release_hook(&hook); NL_SET_BAD_ATTR(extack, attr); return -EEXIST; } if (nft_base_chain_netdev(ctx->family, basechain->ops.hooknum)) { list_for_each_entry_safe(h, next, &hook.list, list) { h->ops.pf = basechain->ops.pf; h->ops.hooknum = basechain->ops.hooknum; h->ops.priority = basechain->ops.priority; h->ops.priv = basechain->ops.priv; h->ops.hook = basechain->ops.hook; if (nft_hook_list_find(&basechain->hook_list, h)) { list_del(&h->list); kfree(h); } } } else { ops = &basechain->ops; if (ops->hooknum != hook.num || ops->priority != hook.priority) { nft_chain_release_hook(&hook); NL_SET_BAD_ATTR(extack, attr); return -EEXIST; } } } if (nla[NFTA_CHAIN_HANDLE] && nla[NFTA_CHAIN_NAME]) { struct nft_chain *chain2; chain2 = nft_chain_lookup(ctx->net, table, nla[NFTA_CHAIN_NAME], genmask); if (!IS_ERR(chain2)) { NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_NAME]); err = -EEXIST; goto err_hooks; } } if (table->flags & __NFT_TABLE_F_UPDATE && !list_empty(&hook.list)) { NL_SET_BAD_ATTR(extack, attr); err = -EOPNOTSUPP; goto err_hooks; } if (!(table->flags & NFT_TABLE_F_DORMANT) && nft_is_base_chain(chain) && !list_empty(&hook.list)) { basechain = nft_base_chain(chain); ops = &basechain->ops; if (nft_base_chain_netdev(table->family, basechain->ops.hooknum)) { err = nft_netdev_register_hooks(ctx->net, &hook.list); if (err < 0) goto err_hooks; } } unregister = true; if (nla[NFTA_CHAIN_COUNTERS]) { if (!nft_is_base_chain(chain)) { err = -EOPNOTSUPP; goto err_hooks; } stats = nft_stats_alloc(nla[NFTA_CHAIN_COUNTERS]); if (IS_ERR_PCPU(stats)) { err = PTR_ERR_PCPU(stats); goto err_hooks; } } err = -ENOMEM; trans = nft_trans_alloc_chain(ctx, NFT_MSG_NEWCHAIN); if (trans == NULL) goto err_trans; nft_trans_chain_stats(trans) = stats; nft_trans_chain_update(trans) = true; if (nla[NFTA_CHAIN_POLICY]) nft_trans_chain_policy(trans) = policy; else nft_trans_chain_policy(trans) = -1; if (nla[NFTA_CHAIN_HANDLE] && nla[NFTA_CHAIN_NAME]) { struct nftables_pernet *nft_net = nft_pernet(ctx->net); struct nft_trans *tmp; char *name; err = -ENOMEM; name = nla_strdup(nla[NFTA_CHAIN_NAME], GFP_KERNEL_ACCOUNT); if (!name) goto err_trans; err = -EEXIST; list_for_each_entry(tmp, &nft_net->commit_list, list) { if (tmp->msg_type == NFT_MSG_NEWCHAIN && tmp->table == table && nft_trans_chain_update(tmp) && nft_trans_chain_name(tmp) && strcmp(name, nft_trans_chain_name(tmp)) == 0) { 
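/* Descriptive comment (added for illustration; not in the upstream file):
 * a rename already queued in this transaction batch claims the requested
 * name, so fail with -EEXIST here rather than queueing a second,
 * conflicting NEWCHAIN update for the same table.
 */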
NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_NAME]); kfree(name); goto err_trans; } } nft_trans_chain_name(trans) = name; } nft_trans_basechain(trans) = basechain; INIT_LIST_HEAD(&nft_trans_chain_hooks(trans)); list_splice(&hook.list, &nft_trans_chain_hooks(trans)); if (nla[NFTA_CHAIN_HOOK]) module_put(hook.type->owner); nft_trans_commit_list_add_tail(ctx->net, trans); return 0; err_trans: free_percpu(stats); kfree(trans); err_hooks: if (nla[NFTA_CHAIN_HOOK]) { list_for_each_entry_safe(h, next, &hook.list, list) { if (unregister) nf_unregister_net_hook(ctx->net, &h->ops); list_del(&h->list); kfree_rcu(h, rcu); } module_put(hook.type->owner); } return err; } static struct nft_chain *nft_chain_lookup_byid(const struct net *net, const struct nft_table *table, const struct nlattr *nla, u8 genmask) { struct nftables_pernet *nft_net = nft_pernet(net); u32 id = ntohl(nla_get_be32(nla)); struct nft_trans *trans; list_for_each_entry(trans, &nft_net->commit_list, list) { if (trans->msg_type == NFT_MSG_NEWCHAIN && nft_trans_chain(trans)->table == table && id == nft_trans_chain_id(trans) && nft_active_genmask(nft_trans_chain(trans), genmask)) return nft_trans_chain(trans); } return ERR_PTR(-ENOENT); } static int nf_tables_newchain(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nla[]) { struct nftables_pernet *nft_net = nft_pernet(info->net); struct netlink_ext_ack *extack = info->extack; u8 genmask = nft_genmask_next(info->net); u8 family = info->nfmsg->nfgen_family; struct nft_chain *chain = NULL; struct net *net = info->net; const struct nlattr *attr; struct nft_table *table; u8 policy = NF_ACCEPT; struct nft_ctx ctx; u64 handle = 0; u32 flags = 0; lockdep_assert_held(&nft_net->commit_mutex); table = nft_table_lookup(net, nla[NFTA_CHAIN_TABLE], family, genmask, NETLINK_CB(skb).portid); if (IS_ERR(table)) { NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_TABLE]); return PTR_ERR(table); } chain = NULL; attr = nla[NFTA_CHAIN_NAME]; if (nla[NFTA_CHAIN_HANDLE]) { handle = be64_to_cpu(nla_get_be64(nla[NFTA_CHAIN_HANDLE])); chain = nft_chain_lookup_byhandle(table, handle, genmask); if (IS_ERR(chain)) { NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_HANDLE]); return PTR_ERR(chain); } attr = nla[NFTA_CHAIN_HANDLE]; } else if (nla[NFTA_CHAIN_NAME]) { chain = nft_chain_lookup(net, table, attr, genmask); if (IS_ERR(chain)) { if (PTR_ERR(chain) != -ENOENT) { NL_SET_BAD_ATTR(extack, attr); return PTR_ERR(chain); } chain = NULL; } } else if (!nla[NFTA_CHAIN_ID]) { return -EINVAL; } if (nla[NFTA_CHAIN_POLICY]) { if (chain != NULL && !nft_is_base_chain(chain)) { NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_POLICY]); return -EOPNOTSUPP; } if (chain == NULL && nla[NFTA_CHAIN_HOOK] == NULL) { NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_POLICY]); return -EOPNOTSUPP; } policy = ntohl(nla_get_be32(nla[NFTA_CHAIN_POLICY])); switch (policy) { case NF_DROP: case NF_ACCEPT: break; default: return -EINVAL; } } if (nla[NFTA_CHAIN_FLAGS]) flags = ntohl(nla_get_be32(nla[NFTA_CHAIN_FLAGS])); else if (chain) flags = chain->flags; if (flags & ~NFT_CHAIN_FLAGS) return -EOPNOTSUPP; nft_ctx_init(&ctx, net, skb, info->nlh, family, table, chain, nla); if (chain != NULL) { if (chain->flags & NFT_CHAIN_BINDING) return -EINVAL; if (info->nlh->nlmsg_flags & NLM_F_EXCL) { NL_SET_BAD_ATTR(extack, attr); return -EEXIST; } if (info->nlh->nlmsg_flags & NLM_F_REPLACE) return -EOPNOTSUPP; flags |= chain->flags & NFT_CHAIN_BASE; return nf_tables_updchain(&ctx, genmask, policy, flags, attr, extack); } return nf_tables_addchain(&ctx, family, policy, 
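/* Descriptive note (added for illustration; not in the upstream file):
 * nf_tables_newchain() ends in one of two paths: if the chain was found by
 * name or handle, the request is treated as an update and handed to
 * nf_tables_updchain() after the NLM_F_EXCL/NLM_F_REPLACE checks;
 * otherwise a brand new chain is built via nf_tables_addchain().
 */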
flags, extack); } static int nft_delchain_hook(struct nft_ctx *ctx, struct nft_base_chain *basechain, struct netlink_ext_ack *extack) { const struct nft_chain *chain = &basechain->chain; const struct nlattr * const *nla = ctx->nla; struct nft_chain_hook chain_hook = {}; struct nft_hook *this, *hook; LIST_HEAD(chain_del_list); struct nft_trans *trans; int err; if (ctx->table->flags & __NFT_TABLE_F_UPDATE) return -EOPNOTSUPP; err = nft_chain_parse_hook(ctx->net, basechain, nla, &chain_hook, ctx->family, chain->flags, extack); if (err < 0) return err; list_for_each_entry(this, &chain_hook.list, list) { hook = nft_hook_list_find(&basechain->hook_list, this); if (!hook) { err = -ENOENT; goto err_chain_del_hook; } list_move(&hook->list, &chain_del_list); } trans = nft_trans_alloc_chain(ctx, NFT_MSG_DELCHAIN); if (!trans) { err = -ENOMEM; goto err_chain_del_hook; } nft_trans_basechain(trans) = basechain; nft_trans_chain_update(trans) = true; INIT_LIST_HEAD(&nft_trans_chain_hooks(trans)); list_splice(&chain_del_list, &nft_trans_chain_hooks(trans)); nft_chain_release_hook(&chain_hook); nft_trans_commit_list_add_tail(ctx->net, trans); return 0; err_chain_del_hook: list_splice(&chain_del_list, &basechain->hook_list); nft_chain_release_hook(&chain_hook); return err; } static int nf_tables_delchain(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nla[]) { struct netlink_ext_ack *extack = info->extack; u8 genmask = nft_genmask_next(info->net); u8 family = info->nfmsg->nfgen_family; struct net *net = info->net; const struct nlattr *attr; struct nft_table *table; struct nft_chain *chain; struct nft_rule *rule; struct nft_ctx ctx; u64 handle; u32 use; int err; table = nft_table_lookup(net, nla[NFTA_CHAIN_TABLE], family, genmask, NETLINK_CB(skb).portid); if (IS_ERR(table)) { NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_TABLE]); return PTR_ERR(table); } if (nla[NFTA_CHAIN_HANDLE]) { attr = nla[NFTA_CHAIN_HANDLE]; handle = be64_to_cpu(nla_get_be64(attr)); chain = nft_chain_lookup_byhandle(table, handle, genmask); } else { attr = nla[NFTA_CHAIN_NAME]; chain = nft_chain_lookup(net, table, attr, genmask); } if (IS_ERR(chain)) { if (PTR_ERR(chain) == -ENOENT && NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_DESTROYCHAIN) return 0; NL_SET_BAD_ATTR(extack, attr); return PTR_ERR(chain); } if (nft_chain_binding(chain)) return -EOPNOTSUPP; nft_ctx_init(&ctx, net, skb, info->nlh, family, table, chain, nla); if (nla[NFTA_CHAIN_HOOK]) { if (NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_DESTROYCHAIN || chain->flags & NFT_CHAIN_HW_OFFLOAD) return -EOPNOTSUPP; if (nft_is_base_chain(chain)) { struct nft_base_chain *basechain = nft_base_chain(chain); if (nft_base_chain_netdev(table->family, basechain->ops.hooknum)) return nft_delchain_hook(&ctx, basechain, extack); } } if (info->nlh->nlmsg_flags & NLM_F_NONREC && chain->use > 0) return -EBUSY; use = chain->use; list_for_each_entry(rule, &chain->rules, list) { if (!nft_is_active_next(net, rule)) continue; use--; err = nft_delrule(&ctx, rule); if (err < 0) return err; } /* There are rules and elements that are still holding references to us, * we cannot do a recursive removal in this case. */ if (use > 0) { NL_SET_BAD_ATTR(extack, attr); return -EBUSY; } return nft_delchain(&ctx); } /* * Expressions */ /** * nft_register_expr - register nf_tables expr type * @type: expr type * * Registers the expr type for use with nf_tables. Returns zero on * success or a negative errno code otherwise. 
 */
int nft_register_expr(struct nft_expr_type *type)
{
	if (WARN_ON_ONCE(type->maxattr > NFT_EXPR_MAXATTR))
		return -ENOMEM;

	nfnl_lock(NFNL_SUBSYS_NFTABLES);
	if (type->family == NFPROTO_UNSPEC)
		list_add_tail_rcu(&type->list, &nf_tables_expressions);
	else
		list_add_rcu(&type->list, &nf_tables_expressions);
	nfnl_unlock(NFNL_SUBSYS_NFTABLES);
	return 0;
}
EXPORT_SYMBOL_GPL(nft_register_expr);

/**
 *	nft_unregister_expr - unregister nf_tables expr type
 *	@type: expr type
 *
 *	Unregisters the expr type for use with nf_tables.
 */
void nft_unregister_expr(struct nft_expr_type *type)
{
	nfnl_lock(NFNL_SUBSYS_NFTABLES);
	list_del_rcu(&type->list);
	nfnl_unlock(NFNL_SUBSYS_NFTABLES);
}
EXPORT_SYMBOL_GPL(nft_unregister_expr);

static const struct nft_expr_type *__nft_expr_type_get(u8 family,
						       struct nlattr *nla)
{
	const struct nft_expr_type *type, *candidate = NULL;

	list_for_each_entry_rcu(type, &nf_tables_expressions, list) {
		if (!nla_strcmp(nla, type->name)) {
			if (!type->family && !candidate)
				candidate = type;
			else if (type->family == family)
				candidate = type;
		}
	}
	return candidate;
}

#ifdef CONFIG_MODULES
static int nft_expr_type_request_module(struct net *net, u8 family,
					struct nlattr *nla)
{
	if (nft_request_module(net, "nft-expr-%u-%.*s", family,
			       nla_len(nla), (char *)nla_data(nla)) == -EAGAIN)
		return -EAGAIN;

	return 0;
}
#endif

static const struct nft_expr_type *nft_expr_type_get(struct net *net,
						     u8 family,
						     struct nlattr *nla)
{
	const struct nft_expr_type *type;

	if (nla == NULL)
		return ERR_PTR(-EINVAL);

	rcu_read_lock();
	type = __nft_expr_type_get(family, nla);
	if (type != NULL && try_module_get(type->owner)) {
		rcu_read_unlock();
		return type;
	}
	rcu_read_unlock();

	lockdep_nfnl_nft_mutex_not_held();
#ifdef CONFIG_MODULES
	if (type == NULL) {
		if (nft_expr_type_request_module(net, family, nla) == -EAGAIN)
			return ERR_PTR(-EAGAIN);

		if (nft_request_module(net, "nft-expr-%.*s",
				       nla_len(nla),
				       (char *)nla_data(nla)) == -EAGAIN)
			return ERR_PTR(-EAGAIN);
	}
#endif
	return ERR_PTR(-ENOENT);
}

static const struct nla_policy nft_expr_policy[NFTA_EXPR_MAX + 1] = {
	[NFTA_EXPR_NAME]	= { .type = NLA_STRING,
				    .len = NFT_MODULE_AUTOLOAD_LIMIT },
	[NFTA_EXPR_DATA]	= { .type = NLA_NESTED },
};

static int nf_tables_fill_expr_info(struct sk_buff *skb,
				    const struct nft_expr *expr, bool reset)
{
	if (nla_put_string(skb, NFTA_EXPR_NAME, expr->ops->type->name))
		goto nla_put_failure;

	if (expr->ops->dump) {
		struct nlattr *data = nla_nest_start_noflag(skb,
							    NFTA_EXPR_DATA);

		if (data == NULL)
			goto nla_put_failure;
		if (expr->ops->dump(skb, expr, reset) < 0)
			goto nla_put_failure;
		nla_nest_end(skb, data);
	}

	return skb->len;

nla_put_failure:
	return -1;
};

int nft_expr_dump(struct sk_buff *skb, unsigned int attr,
		  const struct nft_expr *expr, bool reset)
{
	struct nlattr *nest;

	nest = nla_nest_start_noflag(skb, attr);
	if (!nest)
		goto nla_put_failure;
	if (nf_tables_fill_expr_info(skb, expr, reset) < 0)
		goto nla_put_failure;
	nla_nest_end(skb, nest);
	return 0;

nla_put_failure:
	return -1;
}

struct nft_expr_info {
	const struct nft_expr_ops	*ops;
	const struct nlattr		*attr;
	struct nlattr			*tb[NFT_EXPR_MAXATTR + 1];
};

static int nf_tables_expr_parse(const struct nft_ctx *ctx,
				const struct nlattr *nla,
				struct nft_expr_info *info)
{
	const struct nft_expr_type *type;
	const struct nft_expr_ops *ops;
	struct nlattr *tb[NFTA_EXPR_MAX + 1];
	int err;

	err = nla_parse_nested_deprecated(tb, NFTA_EXPR_MAX, nla,
					  nft_expr_policy, NULL);
	if (err < 0)
		return err;

	type = nft_expr_type_get(ctx->net, ctx->family, tb[NFTA_EXPR_NAME]);
	if (IS_ERR(type))
		return PTR_ERR(type);

	if
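/* Illustrative sketch (not in the upstream file): __nft_expr_type_get()
 * above prefers a family-specific expression type over a family-agnostic
 * one registered under the same name -- the generic type is only kept as a
 * fallback candidate, while an exact family match always wins.  If no type
 * is found, nft_expr_type_get() asks for module autoload via
 * nft_request_module() with the "nft-expr-<family>-<name>" alias and then
 * the plain "nft-expr-<name>" alias before giving up with -ENOENT.
 */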
(tb[NFTA_EXPR_DATA]) { err = nla_parse_nested_deprecated(info->tb, type->maxattr, tb[NFTA_EXPR_DATA], type->policy, NULL); if (err < 0) goto err1; } else memset(info->tb, 0, sizeof(info->tb[0]) * (type->maxattr + 1)); if (type->select_ops != NULL) { ops = type->select_ops(ctx, (const struct nlattr * const *)info->tb); if (IS_ERR(ops)) { err = PTR_ERR(ops); #ifdef CONFIG_MODULES if (err == -EAGAIN) if (nft_expr_type_request_module(ctx->net, ctx->family, tb[NFTA_EXPR_NAME]) != -EAGAIN) err = -ENOENT; #endif goto err1; } } else ops = type->ops; info->attr = nla; info->ops = ops; return 0; err1: module_put(type->owner); return err; } int nft_expr_inner_parse(const struct nft_ctx *ctx, const struct nlattr *nla, struct nft_expr_info *info) { struct nlattr *tb[NFTA_EXPR_MAX + 1]; const struct nft_expr_type *type; int err; err = nla_parse_nested_deprecated(tb, NFTA_EXPR_MAX, nla, nft_expr_policy, NULL); if (err < 0) return err; if (!tb[NFTA_EXPR_DATA] || !tb[NFTA_EXPR_NAME]) return -EINVAL; rcu_read_lock(); type = __nft_expr_type_get(ctx->family, tb[NFTA_EXPR_NAME]); if (!type) { err = -ENOENT; goto out_unlock; } if (!type->inner_ops) { err = -EOPNOTSUPP; goto out_unlock; } err = nla_parse_nested_deprecated(info->tb, type->maxattr, tb[NFTA_EXPR_DATA], type->policy, NULL); if (err < 0) goto out_unlock; info->attr = nla; info->ops = type->inner_ops; /* No module reference will be taken on type->owner. * Presence of type->inner_ops implies that the expression * is builtin, so it cannot go away. */ rcu_read_unlock(); return 0; out_unlock: rcu_read_unlock(); return err; } static int nf_tables_newexpr(const struct nft_ctx *ctx, const struct nft_expr_info *expr_info, struct nft_expr *expr) { const struct nft_expr_ops *ops = expr_info->ops; int err; expr->ops = ops; if (ops->init) { err = ops->init(ctx, expr, (const struct nlattr **)expr_info->tb); if (err < 0) goto err1; } return 0; err1: expr->ops = NULL; return err; } static void nf_tables_expr_destroy(const struct nft_ctx *ctx, struct nft_expr *expr) { const struct nft_expr_type *type = expr->ops->type; if (expr->ops->destroy) expr->ops->destroy(ctx, expr); module_put(type->owner); } static struct nft_expr *nft_expr_init(const struct nft_ctx *ctx, const struct nlattr *nla) { struct nft_expr_info expr_info; struct nft_expr *expr; struct module *owner; int err; err = nf_tables_expr_parse(ctx, nla, &expr_info); if (err < 0) goto err_expr_parse; err = -EOPNOTSUPP; if (!(expr_info.ops->type->flags & NFT_EXPR_STATEFUL)) goto err_expr_stateful; err = -ENOMEM; expr = kzalloc(expr_info.ops->size, GFP_KERNEL_ACCOUNT); if (expr == NULL) goto err_expr_stateful; err = nf_tables_newexpr(ctx, &expr_info, expr); if (err < 0) goto err_expr_new; return expr; err_expr_new: kfree(expr); err_expr_stateful: owner = expr_info.ops->type->owner; if (expr_info.ops->type->release_ops) expr_info.ops->type->release_ops(expr_info.ops); module_put(owner); err_expr_parse: return ERR_PTR(err); } int nft_expr_clone(struct nft_expr *dst, struct nft_expr *src, gfp_t gfp) { int err; if (WARN_ON_ONCE(!src->ops->clone)) return -EINVAL; dst->ops = src->ops; err = src->ops->clone(dst, src, gfp); if (err < 0) return err; __module_get(src->ops->type->owner); return 0; } void nft_expr_destroy(const struct nft_ctx *ctx, struct nft_expr *expr) { nf_tables_expr_destroy(ctx, expr); kfree(expr); } /* * Rules */ static struct nft_rule *__nft_rule_lookup(const struct net *net, const struct nft_chain *chain, u64 handle) { struct nft_rule *rule; // FIXME: this sucks list_for_each_entry_rcu(rule, 
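/* Editorial aside (not in the upstream file): as the FIXME above hints,
 * rule handles are resolved by a linear scan of the chain's rule list, so
 * resolving many handles in one batch scales poorly on very large chains;
 * it is fine for occasional control-plane requests.
 */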
&chain->rules, list, lockdep_commit_lock_is_held(net)) { if (handle == rule->handle) return rule; } return ERR_PTR(-ENOENT); } static struct nft_rule *nft_rule_lookup(const struct net *net, const struct nft_chain *chain, const struct nlattr *nla) { if (nla == NULL) return ERR_PTR(-EINVAL); return __nft_rule_lookup(net, chain, be64_to_cpu(nla_get_be64(nla))); } static const struct nla_policy nft_rule_policy[NFTA_RULE_MAX + 1] = { [NFTA_RULE_TABLE] = { .type = NLA_STRING, .len = NFT_TABLE_MAXNAMELEN - 1 }, [NFTA_RULE_CHAIN] = { .type = NLA_STRING, .len = NFT_CHAIN_MAXNAMELEN - 1 }, [NFTA_RULE_HANDLE] = { .type = NLA_U64 }, [NFTA_RULE_EXPRESSIONS] = NLA_POLICY_NESTED_ARRAY(nft_expr_policy), [NFTA_RULE_COMPAT] = { .type = NLA_NESTED }, [NFTA_RULE_POSITION] = { .type = NLA_U64 }, [NFTA_RULE_USERDATA] = { .type = NLA_BINARY, .len = NFT_USERDATA_MAXLEN }, [NFTA_RULE_ID] = { .type = NLA_U32 }, [NFTA_RULE_POSITION_ID] = { .type = NLA_U32 }, [NFTA_RULE_CHAIN_ID] = { .type = NLA_U32 }, }; static int nf_tables_fill_rule_info(struct sk_buff *skb, struct net *net, u32 portid, u32 seq, int event, u32 flags, int family, const struct nft_table *table, const struct nft_chain *chain, const struct nft_rule *rule, u64 handle, bool reset) { struct nlmsghdr *nlh; const struct nft_expr *expr, *next; struct nlattr *list; u16 type = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event); nlh = nfnl_msg_put(skb, portid, seq, type, flags, family, NFNETLINK_V0, nft_base_seq(net)); if (!nlh) goto nla_put_failure; if (nla_put_string(skb, NFTA_RULE_TABLE, table->name)) goto nla_put_failure; if (nla_put_string(skb, NFTA_RULE_CHAIN, chain->name)) goto nla_put_failure; if (nla_put_be64(skb, NFTA_RULE_HANDLE, cpu_to_be64(rule->handle), NFTA_RULE_PAD)) goto nla_put_failure; if (event != NFT_MSG_DELRULE && handle) { if (nla_put_be64(skb, NFTA_RULE_POSITION, cpu_to_be64(handle), NFTA_RULE_PAD)) goto nla_put_failure; } if (chain->flags & NFT_CHAIN_HW_OFFLOAD) nft_flow_rule_stats(chain, rule); list = nla_nest_start_noflag(skb, NFTA_RULE_EXPRESSIONS); if (list == NULL) goto nla_put_failure; nft_rule_for_each_expr(expr, next, rule) { if (nft_expr_dump(skb, NFTA_LIST_ELEM, expr, reset) < 0) goto nla_put_failure; } nla_nest_end(skb, list); if (rule->udata) { struct nft_userdata *udata = nft_userdata(rule); if (nla_put(skb, NFTA_RULE_USERDATA, udata->len + 1, udata->data) < 0) goto nla_put_failure; } nlmsg_end(skb, nlh); return 0; nla_put_failure: nlmsg_trim(skb, nlh); return -1; } static void nf_tables_rule_notify(const struct nft_ctx *ctx, const struct nft_rule *rule, int event) { struct nftables_pernet *nft_net = nft_pernet(ctx->net); const struct nft_rule *prule; struct sk_buff *skb; u64 handle = 0; u16 flags = 0; int err; if (!ctx->report && !nfnetlink_has_listeners(ctx->net, NFNLGRP_NFTABLES)) return; skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); if (skb == NULL) goto err; if (event == NFT_MSG_NEWRULE && !list_is_first(&rule->list, &ctx->chain->rules) && !list_is_last(&rule->list, &ctx->chain->rules)) { prule = list_prev_entry(rule, list); handle = prule->handle; } if (ctx->flags & (NLM_F_APPEND | NLM_F_REPLACE)) flags |= NLM_F_APPEND; if (ctx->flags & (NLM_F_CREATE | NLM_F_EXCL)) flags |= ctx->flags & (NLM_F_CREATE | NLM_F_EXCL); err = nf_tables_fill_rule_info(skb, ctx->net, ctx->portid, ctx->seq, event, flags, ctx->family, ctx->table, ctx->chain, rule, handle, false); if (err < 0) { kfree_skb(skb); goto err; } nft_notify_enqueue(skb, ctx->report, &nft_net->notify_list); return; err: nfnetlink_set_err(ctx->net, ctx->portid, 
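/* Descriptive note (added for illustration; not in the upstream file):
 * for a NEWRULE event on a rule that is neither first nor last in its
 * chain, the preceding rule's handle is passed down and ends up as
 * NFTA_RULE_POSITION, letting listeners reconstruct where the rule was
 * inserted.  If the notification skb cannot be allocated or filled, the
 * group is signalled with -ENOBUFS instead of silently losing the event.
 */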
NFNLGRP_NFTABLES, -ENOBUFS);
}

static void audit_log_rule_reset(const struct nft_table *table,
				 unsigned int base_seq,
				 unsigned int nentries)
{
	char *buf = kasprintf(GFP_ATOMIC, "%s:%u", table->name, base_seq);

	audit_log_nfcfg(buf, table->family, nentries,
			AUDIT_NFT_OP_RULE_RESET, GFP_ATOMIC);
	kfree(buf);
}

struct nft_rule_dump_ctx {
	unsigned int s_idx;
	char *table;
	char *chain;
	bool reset;
};

static int __nf_tables_dump_rules(struct sk_buff *skb,
				  unsigned int *idx,
				  struct netlink_callback *cb,
				  const struct nft_table *table,
				  const struct nft_chain *chain)
{
	struct nft_rule_dump_ctx *ctx = (void *)cb->ctx;
	struct net *net = sock_net(skb->sk);
	const struct nft_rule *rule, *prule;
	unsigned int entries = 0;
	int ret = 0;
	u64 handle;

	prule = NULL;
	list_for_each_entry_rcu(rule, &chain->rules, list) {
		if (!nft_is_active(net, rule))
			goto cont_skip;
		if (*idx < ctx->s_idx)
			goto cont;
		if (prule)
			handle = prule->handle;
		else
			handle = 0;

		if (nf_tables_fill_rule_info(skb, net, NETLINK_CB(cb->skb).portid,
					     cb->nlh->nlmsg_seq,
					     NFT_MSG_NEWRULE,
					     NLM_F_MULTI | NLM_F_APPEND,
					     table->family,
					     table, chain, rule, handle, ctx->reset) < 0) {
			ret = 1;
			break;
		}
		entries++;
		nl_dump_check_consistent(cb, nlmsg_hdr(skb));
cont:
		prule = rule;
cont_skip:
		(*idx)++;
	}

	if (ctx->reset && entries)
		audit_log_rule_reset(table, cb->seq, entries);

	return ret;
}

static int nf_tables_dump_rules(struct sk_buff *skb,
				struct netlink_callback *cb)
{
	const struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh);
	struct nft_rule_dump_ctx *ctx = (void *)cb->ctx;
	struct nft_table *table;
	const struct nft_chain *chain;
	unsigned int idx = 0;
	struct net *net = sock_net(skb->sk);
	int family = nfmsg->nfgen_family;
	struct nftables_pernet *nft_net;

	rcu_read_lock();
	nft_net = nft_pernet(net);
	cb->seq = READ_ONCE(nft_net->base_seq);

	list_for_each_entry_rcu(table, &nft_net->tables, list) {
		if (family != NFPROTO_UNSPEC && family != table->family)
			continue;

		if (ctx->table && strcmp(ctx->table, table->name) != 0)
			continue;

		if (ctx->table && ctx->chain) {
			struct rhlist_head *list, *tmp;

			list = rhltable_lookup(&table->chains_ht, ctx->chain,
					       nft_chain_ht_params);
			if (!list)
				goto done;

			rhl_for_each_entry_rcu(chain, tmp, list, rhlhead) {
				if (!nft_is_active(net, chain))
					continue;
				__nf_tables_dump_rules(skb, &idx, cb, table, chain);
				break;
			}
			goto done;
		}

		list_for_each_entry_rcu(chain, &table->chains, list) {
			if (__nf_tables_dump_rules(skb, &idx, cb, table, chain))
				goto done;
		}

		if (ctx->table)
			break;
	}
done:
	rcu_read_unlock();

	ctx->s_idx = idx;
	return skb->len;
}

static int nf_tables_dumpreset_rules(struct sk_buff *skb,
				     struct netlink_callback *cb)
{
	struct nftables_pernet *nft_net = nft_pernet(sock_net(skb->sk));
	int ret;

	/* The mutex is held to prevent two concurrent dump-and-reset calls
	 * from underrunning counters and quotas. The commit_mutex is used
	 * for lack of a better lock; this is not the transaction path.
*/ mutex_lock(&nft_net->commit_mutex); ret = nf_tables_dump_rules(skb, cb); mutex_unlock(&nft_net->commit_mutex); return ret; } static int nf_tables_dump_rules_start(struct netlink_callback *cb) { struct nft_rule_dump_ctx *ctx = (void *)cb->ctx; const struct nlattr * const *nla = cb->data; BUILD_BUG_ON(sizeof(*ctx) > sizeof(cb->ctx)); if (nla[NFTA_RULE_TABLE]) { ctx->table = nla_strdup(nla[NFTA_RULE_TABLE], GFP_ATOMIC); if (!ctx->table) return -ENOMEM; } if (nla[NFTA_RULE_CHAIN]) { ctx->chain = nla_strdup(nla[NFTA_RULE_CHAIN], GFP_ATOMIC); if (!ctx->chain) { kfree(ctx->table); return -ENOMEM; } } return 0; } static int nf_tables_dumpreset_rules_start(struct netlink_callback *cb) { struct nft_rule_dump_ctx *ctx = (void *)cb->ctx; ctx->reset = true; return nf_tables_dump_rules_start(cb); } static int nf_tables_dump_rules_done(struct netlink_callback *cb) { struct nft_rule_dump_ctx *ctx = (void *)cb->ctx; kfree(ctx->table); kfree(ctx->chain); return 0; } /* Caller must hold rcu read lock or transaction mutex */ static struct sk_buff * nf_tables_getrule_single(u32 portid, const struct nfnl_info *info, const struct nlattr * const nla[], bool reset) { struct netlink_ext_ack *extack = info->extack; u8 genmask = nft_genmask_cur(info->net); u8 family = info->nfmsg->nfgen_family; const struct nft_chain *chain; const struct nft_rule *rule; struct net *net = info->net; struct nft_table *table; struct sk_buff *skb2; int err; table = nft_table_lookup(net, nla[NFTA_RULE_TABLE], family, genmask, 0); if (IS_ERR(table)) { NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_TABLE]); return ERR_CAST(table); } chain = nft_chain_lookup(net, table, nla[NFTA_RULE_CHAIN], genmask); if (IS_ERR(chain)) { NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_CHAIN]); return ERR_CAST(chain); } rule = nft_rule_lookup(net, chain, nla[NFTA_RULE_HANDLE]); if (IS_ERR(rule)) { NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_HANDLE]); return ERR_CAST(rule); } skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC); if (!skb2) return ERR_PTR(-ENOMEM); err = nf_tables_fill_rule_info(skb2, net, portid, info->nlh->nlmsg_seq, NFT_MSG_NEWRULE, 0, family, table, chain, rule, 0, reset); if (err < 0) { kfree_skb(skb2); return ERR_PTR(err); } return skb2; } static int nf_tables_getrule(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nla[]) { u32 portid = NETLINK_CB(skb).portid; struct net *net = info->net; struct sk_buff *skb2; if (info->nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { .start= nf_tables_dump_rules_start, .dump = nf_tables_dump_rules, .done = nf_tables_dump_rules_done, .module = THIS_MODULE, .data = (void *)nla, }; return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c); } skb2 = nf_tables_getrule_single(portid, info, nla, false); if (IS_ERR(skb2)) return PTR_ERR(skb2); return nfnetlink_unicast(skb2, net, portid); } static int nf_tables_getrule_reset(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nla[]) { struct nftables_pernet *nft_net = nft_pernet(info->net); u32 portid = NETLINK_CB(skb).portid; struct net *net = info->net; struct sk_buff *skb2; char *buf; if (info->nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { .start= nf_tables_dumpreset_rules_start, .dump = nf_tables_dumpreset_rules, .done = nf_tables_dump_rules_done, .module = THIS_MODULE, .data = (void *)nla, }; return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c); } if (!try_module_get(THIS_MODULE)) return -EINVAL; rcu_read_unlock(); mutex_lock(&nft_net->commit_mutex); skb2 = 
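/* Descriptive note (added for illustration; not in the upstream file):
 * the reset variant cannot stay under the caller's rcu_read_lock because
 * it needs commit_mutex, so it pins the module, drops the RCU read side,
 * takes the mutex around the single-rule reset and re-acquires RCU before
 * returning; the reset is then logged via audit_log_nfcfg() with an
 * AUDIT_NFT_OP_RULE_RESET record naming the table and generation.
 */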
nf_tables_getrule_single(portid, info, nla, true); mutex_unlock(&nft_net->commit_mutex); rcu_read_lock(); module_put(THIS_MODULE); if (IS_ERR(skb2)) return PTR_ERR(skb2); buf = kasprintf(GFP_ATOMIC, "%.*s:%u", nla_len(nla[NFTA_RULE_TABLE]), (char *)nla_data(nla[NFTA_RULE_TABLE]), nft_net->base_seq); audit_log_nfcfg(buf, info->nfmsg->nfgen_family, 1, AUDIT_NFT_OP_RULE_RESET, GFP_ATOMIC); kfree(buf); return nfnetlink_unicast(skb2, net, portid); } void nf_tables_rule_destroy(const struct nft_ctx *ctx, struct nft_rule *rule) { struct nft_expr *expr, *next; /* * Careful: some expressions might not be initialized in case this * is called on error from nf_tables_newrule(). */ expr = nft_expr_first(rule); while (nft_expr_more(rule, expr)) { next = nft_expr_next(expr); nf_tables_expr_destroy(ctx, expr); expr = next; } kfree(rule); } /* can only be used if rule is no longer visible to dumps */ static void nf_tables_rule_release(const struct nft_ctx *ctx, struct nft_rule *rule) { lockdep_commit_lock_is_held(ctx->net); nft_rule_expr_deactivate(ctx, rule, NFT_TRANS_RELEASE); nf_tables_rule_destroy(ctx, rule); } /** nft_chain_validate - loop detection and hook validation * * @ctx: context containing call depth and base chain * @chain: chain to validate * * Walk through the rules of the given chain and chase all jumps/gotos * and set lookups until either the jump limit is hit or all reachable * chains have been validated. */ int nft_chain_validate(const struct nft_ctx *ctx, const struct nft_chain *chain) { struct nft_expr *expr, *last; struct nft_rule *rule; int err; if (ctx->level == NFT_JUMP_STACK_SIZE) return -EMLINK; list_for_each_entry(rule, &chain->rules, list) { if (fatal_signal_pending(current)) return -EINTR; if (!nft_is_active_next(ctx->net, rule)) continue; nft_rule_for_each_expr(expr, last, rule) { if (!expr->ops->validate) continue; /* This may call nft_chain_validate() recursively, * callers that do so must increment ctx->level. 
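			 * The recursion depth is bounded by NFT_JUMP_STACK_SIZE:
			 * the check at the top of this function returns -EMLINK
			 * once ctx->level reaches that limit.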
*/ err = expr->ops->validate(ctx, expr); if (err < 0) return err; } } return 0; } EXPORT_SYMBOL_GPL(nft_chain_validate); static int nft_table_validate(struct net *net, const struct nft_table *table) { struct nft_chain *chain; struct nft_ctx ctx = { .net = net, .family = table->family, }; int err; list_for_each_entry(chain, &table->chains, list) { if (!nft_is_base_chain(chain)) continue; ctx.chain = chain; err = nft_chain_validate(&ctx, chain); if (err < 0) return err; cond_resched(); } return 0; } int nft_setelem_validate(const struct nft_ctx *ctx, struct nft_set *set, const struct nft_set_iter *iter, struct nft_elem_priv *elem_priv) { const struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv); struct nft_ctx *pctx = (struct nft_ctx *)ctx; const struct nft_data *data; int err; if (!nft_set_elem_active(ext, iter->genmask)) return 0; if (nft_set_ext_exists(ext, NFT_SET_EXT_FLAGS) && *nft_set_ext_flags(ext) & NFT_SET_ELEM_INTERVAL_END) return 0; data = nft_set_ext_data(ext); switch (data->verdict.code) { case NFT_JUMP: case NFT_GOTO: pctx->level++; err = nft_chain_validate(ctx, data->verdict.chain); if (err < 0) return err; pctx->level--; break; default: break; } return 0; } int nft_set_catchall_validate(const struct nft_ctx *ctx, struct nft_set *set) { struct nft_set_iter dummy_iter = { .genmask = nft_genmask_next(ctx->net), }; struct nft_set_elem_catchall *catchall; struct nft_set_ext *ext; int ret = 0; list_for_each_entry_rcu(catchall, &set->catchall_list, list, lockdep_commit_lock_is_held(ctx->net)) { ext = nft_set_elem_ext(set, catchall->elem); if (!nft_set_elem_active(ext, dummy_iter.genmask)) continue; ret = nft_setelem_validate(ctx, set, &dummy_iter, catchall->elem); if (ret < 0) return ret; } return ret; } static struct nft_rule *nft_rule_lookup_byid(const struct net *net, const struct nft_chain *chain, const struct nlattr *nla); #define NFT_RULE_MAXEXPRS 128 static int nf_tables_newrule(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nla[]) { struct nftables_pernet *nft_net = nft_pernet(info->net); struct netlink_ext_ack *extack = info->extack; unsigned int size, i, n, ulen = 0, usize = 0; u8 genmask = nft_genmask_next(info->net); struct nft_rule *rule, *old_rule = NULL; struct nft_expr_info *expr_info = NULL; u8 family = info->nfmsg->nfgen_family; struct nft_flow_rule *flow = NULL; struct net *net = info->net; struct nft_userdata *udata; struct nft_table *table; struct nft_chain *chain; struct nft_trans *trans; u64 handle, pos_handle; struct nft_expr *expr; struct nft_ctx ctx; struct nlattr *tmp; int err, rem; lockdep_assert_held(&nft_net->commit_mutex); table = nft_table_lookup(net, nla[NFTA_RULE_TABLE], family, genmask, NETLINK_CB(skb).portid); if (IS_ERR(table)) { NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_TABLE]); return PTR_ERR(table); } if (nla[NFTA_RULE_CHAIN]) { chain = nft_chain_lookup(net, table, nla[NFTA_RULE_CHAIN], genmask); if (IS_ERR(chain)) { NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_CHAIN]); return PTR_ERR(chain); } } else if (nla[NFTA_RULE_CHAIN_ID]) { chain = nft_chain_lookup_byid(net, table, nla[NFTA_RULE_CHAIN_ID], genmask); if (IS_ERR(chain)) { NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_CHAIN_ID]); return PTR_ERR(chain); } } else { return -EINVAL; } if (nft_chain_is_bound(chain)) return -EOPNOTSUPP; if (nla[NFTA_RULE_HANDLE]) { handle = be64_to_cpu(nla_get_be64(nla[NFTA_RULE_HANDLE])); rule = __nft_rule_lookup(net, chain, handle); if (IS_ERR(rule)) { NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_HANDLE]); return PTR_ERR(rule); } if 
(info->nlh->nlmsg_flags & NLM_F_EXCL) { NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_HANDLE]); return -EEXIST; } if (info->nlh->nlmsg_flags & NLM_F_REPLACE) old_rule = rule; else return -EOPNOTSUPP; } else { if (!(info->nlh->nlmsg_flags & NLM_F_CREATE) || info->nlh->nlmsg_flags & NLM_F_REPLACE) return -EINVAL; handle = nf_tables_alloc_handle(table); if (nla[NFTA_RULE_POSITION]) { pos_handle = be64_to_cpu(nla_get_be64(nla[NFTA_RULE_POSITION])); old_rule = __nft_rule_lookup(net, chain, pos_handle); if (IS_ERR(old_rule)) { NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_POSITION]); return PTR_ERR(old_rule); } } else if (nla[NFTA_RULE_POSITION_ID]) { old_rule = nft_rule_lookup_byid(net, chain, nla[NFTA_RULE_POSITION_ID]); if (IS_ERR(old_rule)) { NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_POSITION_ID]); return PTR_ERR(old_rule); } } } nft_ctx_init(&ctx, net, skb, info->nlh, family, table, chain, nla); n = 0; size = 0; if (nla[NFTA_RULE_EXPRESSIONS]) { expr_info = kvmalloc_array(NFT_RULE_MAXEXPRS, sizeof(struct nft_expr_info), GFP_KERNEL); if (!expr_info) return -ENOMEM; nla_for_each_nested(tmp, nla[NFTA_RULE_EXPRESSIONS], rem) { err = -EINVAL; if (nla_type(tmp) != NFTA_LIST_ELEM) goto err_release_expr; if (n == NFT_RULE_MAXEXPRS) goto err_release_expr; err = nf_tables_expr_parse(&ctx, tmp, &expr_info[n]); if (err < 0) { NL_SET_BAD_ATTR(extack, tmp); goto err_release_expr; } size += expr_info[n].ops->size; n++; } } /* Check for overflow of dlen field */ err = -EFBIG; if (size >= 1 << 12) goto err_release_expr; if (nla[NFTA_RULE_USERDATA]) { ulen = nla_len(nla[NFTA_RULE_USERDATA]); if (ulen > 0) usize = sizeof(struct nft_userdata) + ulen; } err = -ENOMEM; rule = kzalloc(sizeof(*rule) + size + usize, GFP_KERNEL_ACCOUNT); if (rule == NULL) goto err_release_expr; nft_activate_next(net, rule); rule->handle = handle; rule->dlen = size; rule->udata = ulen ? 
1 : 0; if (ulen) { udata = nft_userdata(rule); udata->len = ulen - 1; nla_memcpy(udata->data, nla[NFTA_RULE_USERDATA], ulen); } expr = nft_expr_first(rule); for (i = 0; i < n; i++) { err = nf_tables_newexpr(&ctx, &expr_info[i], expr); if (err < 0) { NL_SET_BAD_ATTR(extack, expr_info[i].attr); goto err_release_rule; } if (expr_info[i].ops->validate) nft_validate_state_update(table, NFT_VALIDATE_NEED); expr_info[i].ops = NULL; expr = nft_expr_next(expr); } if (chain->flags & NFT_CHAIN_HW_OFFLOAD) { flow = nft_flow_rule_create(net, rule); if (IS_ERR(flow)) { err = PTR_ERR(flow); goto err_release_rule; } } if (!nft_use_inc(&chain->use)) { err = -EMFILE; goto err_release_rule; } if (info->nlh->nlmsg_flags & NLM_F_REPLACE) { if (nft_chain_binding(chain)) { err = -EOPNOTSUPP; goto err_destroy_flow_rule; } err = nft_delrule(&ctx, old_rule); if (err < 0) goto err_destroy_flow_rule; trans = nft_trans_rule_add(&ctx, NFT_MSG_NEWRULE, rule); if (trans == NULL) { err = -ENOMEM; goto err_destroy_flow_rule; } list_add_tail_rcu(&rule->list, &old_rule->list); } else { trans = nft_trans_rule_add(&ctx, NFT_MSG_NEWRULE, rule); if (!trans) { err = -ENOMEM; goto err_destroy_flow_rule; } if (info->nlh->nlmsg_flags & NLM_F_APPEND) { if (old_rule) list_add_rcu(&rule->list, &old_rule->list); else list_add_tail_rcu(&rule->list, &chain->rules); } else { if (old_rule) list_add_tail_rcu(&rule->list, &old_rule->list); else list_add_rcu(&rule->list, &chain->rules); } } kvfree(expr_info); if (flow) nft_trans_flow_rule(trans) = flow; if (table->validate_state == NFT_VALIDATE_DO) return nft_table_validate(net, table); return 0; err_destroy_flow_rule: nft_use_dec_restore(&chain->use); if (flow) nft_flow_rule_destroy(flow); err_release_rule: nft_rule_expr_deactivate(&ctx, rule, NFT_TRANS_PREPARE_ERROR); nf_tables_rule_destroy(&ctx, rule); err_release_expr: for (i = 0; i < n; i++) { if (expr_info[i].ops) { module_put(expr_info[i].ops->type->owner); if (expr_info[i].ops->type->release_ops) expr_info[i].ops->type->release_ops(expr_info[i].ops); } } kvfree(expr_info); return err; } static struct nft_rule *nft_rule_lookup_byid(const struct net *net, const struct nft_chain *chain, const struct nlattr *nla) { struct nftables_pernet *nft_net = nft_pernet(net); u32 id = ntohl(nla_get_be32(nla)); struct nft_trans *trans; list_for_each_entry(trans, &nft_net->commit_list, list) { if (trans->msg_type == NFT_MSG_NEWRULE && nft_trans_rule_chain(trans) == chain && id == nft_trans_rule_id(trans)) return nft_trans_rule(trans); } return ERR_PTR(-ENOENT); } static int nf_tables_delrule(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nla[]) { struct netlink_ext_ack *extack = info->extack; u8 genmask = nft_genmask_next(info->net); u8 family = info->nfmsg->nfgen_family; struct nft_chain *chain = NULL; struct net *net = info->net; struct nft_table *table; struct nft_rule *rule; struct nft_ctx ctx; int err = 0; table = nft_table_lookup(net, nla[NFTA_RULE_TABLE], family, genmask, NETLINK_CB(skb).portid); if (IS_ERR(table)) { NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_TABLE]); return PTR_ERR(table); } if (nla[NFTA_RULE_CHAIN]) { chain = nft_chain_lookup(net, table, nla[NFTA_RULE_CHAIN], genmask); if (IS_ERR(chain)) { if (PTR_ERR(chain) == -ENOENT && NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_DESTROYRULE) return 0; NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_CHAIN]); return PTR_ERR(chain); } if (nft_chain_binding(chain)) return -EOPNOTSUPP; } nft_ctx_init(&ctx, net, skb, info->nlh, family, table, chain, nla); if (chain) { if 
(nla[NFTA_RULE_HANDLE]) { rule = nft_rule_lookup(info->net, chain, nla[NFTA_RULE_HANDLE]); if (IS_ERR(rule)) { if (PTR_ERR(rule) == -ENOENT && NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_DESTROYRULE) return 0; NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_HANDLE]); return PTR_ERR(rule); } err = nft_delrule(&ctx, rule); } else if (nla[NFTA_RULE_ID]) { rule = nft_rule_lookup_byid(net, chain, nla[NFTA_RULE_ID]); if (IS_ERR(rule)) { NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_ID]); return PTR_ERR(rule); } err = nft_delrule(&ctx, rule); } else { err = nft_delrule_by_chain(&ctx); } } else { list_for_each_entry(chain, &table->chains, list) { if (!nft_is_active_next(net, chain)) continue; if (nft_chain_binding(chain)) continue; ctx.chain = chain; err = nft_delrule_by_chain(&ctx); if (err < 0) break; } } return err; } /* * Sets */ static const struct nft_set_type *nft_set_types[] = { &nft_set_hash_fast_type, &nft_set_hash_type, &nft_set_rhash_type, &nft_set_bitmap_type, &nft_set_rbtree_type, #if defined(CONFIG_X86_64) && !defined(CONFIG_UML) &nft_set_pipapo_avx2_type, #endif &nft_set_pipapo_type, }; #define NFT_SET_FEATURES (NFT_SET_INTERVAL | NFT_SET_MAP | \ NFT_SET_TIMEOUT | NFT_SET_OBJECT | \ NFT_SET_EVAL) static bool nft_set_ops_candidate(const struct nft_set_type *type, u32 flags) { return (flags & type->features) == (flags & NFT_SET_FEATURES); } /* * Select a set implementation based on the data characteristics and the * given policy. The total memory use might not be known if no size is * given, in that case the amount of memory per element is used. */ static const struct nft_set_ops * nft_select_set_ops(const struct nft_ctx *ctx, u32 flags, const struct nft_set_desc *desc) { struct nftables_pernet *nft_net = nft_pernet(ctx->net); const struct nft_set_ops *ops, *bops; struct nft_set_estimate est, best; const struct nft_set_type *type; int i; lockdep_assert_held(&nft_net->commit_mutex); lockdep_nfnl_nft_mutex_not_held(); bops = NULL; best.size = ~0; best.lookup = ~0; best.space = ~0; for (i = 0; i < ARRAY_SIZE(nft_set_types); i++) { type = nft_set_types[i]; ops = &type->ops; if (!nft_set_ops_candidate(type, flags)) continue; if (!ops->estimate(desc, flags, &est)) continue; switch (desc->policy) { case NFT_SET_POL_PERFORMANCE: if (est.lookup < best.lookup) break; if (est.lookup == best.lookup && est.space < best.space) break; continue; case NFT_SET_POL_MEMORY: if (!desc->size) { if (est.space < best.space) break; if (est.space == best.space && est.lookup < best.lookup) break; } else if (est.size < best.size || !bops) { break; } continue; default: break; } bops = ops; best = est; } if (bops != NULL) return bops; return ERR_PTR(-EOPNOTSUPP); } static const struct nla_policy nft_set_policy[NFTA_SET_MAX + 1] = { [NFTA_SET_TABLE] = { .type = NLA_STRING, .len = NFT_TABLE_MAXNAMELEN - 1 }, [NFTA_SET_NAME] = { .type = NLA_STRING, .len = NFT_SET_MAXNAMELEN - 1 }, [NFTA_SET_FLAGS] = { .type = NLA_U32 }, [NFTA_SET_KEY_TYPE] = { .type = NLA_U32 }, [NFTA_SET_KEY_LEN] = { .type = NLA_U32 }, [NFTA_SET_DATA_TYPE] = { .type = NLA_U32 }, [NFTA_SET_DATA_LEN] = { .type = NLA_U32 }, [NFTA_SET_POLICY] = { .type = NLA_U32 }, [NFTA_SET_DESC] = { .type = NLA_NESTED }, [NFTA_SET_ID] = { .type = NLA_U32 }, [NFTA_SET_TIMEOUT] = { .type = NLA_U64 }, [NFTA_SET_GC_INTERVAL] = { .type = NLA_U32 }, [NFTA_SET_USERDATA] = { .type = NLA_BINARY, .len = NFT_USERDATA_MAXLEN }, [NFTA_SET_OBJ_TYPE] = { .type = NLA_U32 }, [NFTA_SET_HANDLE] = { .type = NLA_U64 }, [NFTA_SET_EXPR] = { .type = NLA_NESTED }, [NFTA_SET_EXPRESSIONS] = 
NLA_POLICY_NESTED_ARRAY(nft_expr_policy), }; static const struct nla_policy nft_concat_policy[NFTA_SET_FIELD_MAX + 1] = { [NFTA_SET_FIELD_LEN] = { .type = NLA_U32 }, }; static const struct nla_policy nft_set_desc_policy[NFTA_SET_DESC_MAX + 1] = { [NFTA_SET_DESC_SIZE] = { .type = NLA_U32 }, [NFTA_SET_DESC_CONCAT] = NLA_POLICY_NESTED_ARRAY(nft_concat_policy), }; static struct nft_set *nft_set_lookup(const struct net *net, const struct nft_table *table, const struct nlattr *nla, u8 genmask) { struct nft_set *set; if (nla == NULL) return ERR_PTR(-EINVAL); list_for_each_entry_rcu(set, &table->sets, list, lockdep_commit_lock_is_held(net)) { if (!nla_strcmp(nla, set->name) && nft_active_genmask(set, genmask)) return set; } return ERR_PTR(-ENOENT); } static struct nft_set *nft_set_lookup_byhandle(const struct nft_table *table, const struct nlattr *nla, u8 genmask) { struct nft_set *set; list_for_each_entry(set, &table->sets, list) { if (be64_to_cpu(nla_get_be64(nla)) == set->handle && nft_active_genmask(set, genmask)) return set; } return ERR_PTR(-ENOENT); } static struct nft_set *nft_set_lookup_byid(const struct net *net, const struct nft_table *table, const struct nlattr *nla, u8 genmask) { struct nftables_pernet *nft_net = nft_pernet(net); u32 id = ntohl(nla_get_be32(nla)); struct nft_trans_set *trans; /* its likely the id we need is at the tail, not at start */ list_for_each_entry_reverse(trans, &nft_net->commit_set_list, list_trans_newset) { struct nft_set *set = trans->set; if (id == trans->set_id && set->table == table && nft_active_genmask(set, genmask)) return set; } return ERR_PTR(-ENOENT); } struct nft_set *nft_set_lookup_global(const struct net *net, const struct nft_table *table, const struct nlattr *nla_set_name, const struct nlattr *nla_set_id, u8 genmask) { struct nft_set *set; set = nft_set_lookup(net, table, nla_set_name, genmask); if (IS_ERR(set)) { if (!nla_set_id) return set; set = nft_set_lookup_byid(net, table, nla_set_id, genmask); } return set; } EXPORT_SYMBOL_GPL(nft_set_lookup_global); static int nf_tables_set_alloc_name(struct nft_ctx *ctx, struct nft_set *set, const char *name) { const struct nft_set *i; const char *p; unsigned long *inuse; unsigned int n = 0, min = 0; p = strchr(name, '%'); if (p != NULL) { if (p[1] != 'd' || strchr(p + 2, '%')) return -EINVAL; if (strnlen(name, NFT_SET_MAX_ANONLEN) >= NFT_SET_MAX_ANONLEN) return -EINVAL; inuse = (unsigned long *)get_zeroed_page(GFP_KERNEL); if (inuse == NULL) return -ENOMEM; cont: list_for_each_entry(i, &ctx->table->sets, list) { int tmp; if (!nft_is_active_next(ctx->net, i)) continue; if (!sscanf(i->name, name, &tmp)) continue; if (tmp < min || tmp >= min + BITS_PER_BYTE * PAGE_SIZE) continue; set_bit(tmp - min, inuse); } n = find_first_zero_bit(inuse, BITS_PER_BYTE * PAGE_SIZE); if (n >= BITS_PER_BYTE * PAGE_SIZE) { min += BITS_PER_BYTE * PAGE_SIZE; memset(inuse, 0, PAGE_SIZE); goto cont; } free_page((unsigned long)inuse); } set->name = kasprintf(GFP_KERNEL_ACCOUNT, name, min + n); if (!set->name) return -ENOMEM; list_for_each_entry(i, &ctx->table->sets, list) { if (!nft_is_active_next(ctx->net, i)) continue; if (!strcmp(set->name, i->name)) { kfree(set->name); set->name = NULL; return -ENFILE; } } return 0; } int nf_msecs_to_jiffies64(const struct nlattr *nla, u64 *result) { u64 ms = be64_to_cpu(nla_get_be64(nla)); u64 max = (u64)(~((u64)0)); max = div_u64(max, NSEC_PER_MSEC); if (ms >= max) return -ERANGE; ms *= NSEC_PER_MSEC; *result = nsecs_to_jiffies64(ms) ? 
: !!ms; return 0; } __be64 nf_jiffies64_to_msecs(u64 input) { return cpu_to_be64(jiffies64_to_msecs(input)); } static int nf_tables_fill_set_concat(struct sk_buff *skb, const struct nft_set *set) { struct nlattr *concat, *field; int i; concat = nla_nest_start_noflag(skb, NFTA_SET_DESC_CONCAT); if (!concat) return -ENOMEM; for (i = 0; i < set->field_count; i++) { field = nla_nest_start_noflag(skb, NFTA_LIST_ELEM); if (!field) return -ENOMEM; if (nla_put_be32(skb, NFTA_SET_FIELD_LEN, htonl(set->field_len[i]))) return -ENOMEM; nla_nest_end(skb, field); } nla_nest_end(skb, concat); return 0; } static u32 nft_set_userspace_size(const struct nft_set_ops *ops, u32 size) { if (ops->usize) return ops->usize(size); return size; } static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx, const struct nft_set *set, u16 event, u16 flags) { u64 timeout = READ_ONCE(set->timeout); u32 gc_int = READ_ONCE(set->gc_int); u32 portid = ctx->portid; struct nlmsghdr *nlh; struct nlattr *nest; u32 seq = ctx->seq; int i; event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event); nlh = nfnl_msg_put(skb, portid, seq, event, flags, ctx->family, NFNETLINK_V0, nft_base_seq(ctx->net)); if (!nlh) goto nla_put_failure; if (nla_put_string(skb, NFTA_SET_TABLE, ctx->table->name)) goto nla_put_failure; if (nla_put_string(skb, NFTA_SET_NAME, set->name)) goto nla_put_failure; if (nla_put_be64(skb, NFTA_SET_HANDLE, cpu_to_be64(set->handle), NFTA_SET_PAD)) goto nla_put_failure; if (event == NFT_MSG_DELSET) { nlmsg_end(skb, nlh); return 0; } if (set->flags != 0) if (nla_put_be32(skb, NFTA_SET_FLAGS, htonl(set->flags))) goto nla_put_failure; if (nla_put_be32(skb, NFTA_SET_KEY_TYPE, htonl(set->ktype))) goto nla_put_failure; if (nla_put_be32(skb, NFTA_SET_KEY_LEN, htonl(set->klen))) goto nla_put_failure; if (set->flags & NFT_SET_MAP) { if (nla_put_be32(skb, NFTA_SET_DATA_TYPE, htonl(set->dtype))) goto nla_put_failure; if (nla_put_be32(skb, NFTA_SET_DATA_LEN, htonl(set->dlen))) goto nla_put_failure; } if (set->flags & NFT_SET_OBJECT && nla_put_be32(skb, NFTA_SET_OBJ_TYPE, htonl(set->objtype))) goto nla_put_failure; if (timeout && nla_put_be64(skb, NFTA_SET_TIMEOUT, nf_jiffies64_to_msecs(timeout), NFTA_SET_PAD)) goto nla_put_failure; if (gc_int && nla_put_be32(skb, NFTA_SET_GC_INTERVAL, htonl(gc_int))) goto nla_put_failure; if (set->policy != NFT_SET_POL_PERFORMANCE) { if (nla_put_be32(skb, NFTA_SET_POLICY, htonl(set->policy))) goto nla_put_failure; } if (set->udata && nla_put(skb, NFTA_SET_USERDATA, set->udlen, set->udata)) goto nla_put_failure; nest = nla_nest_start_noflag(skb, NFTA_SET_DESC); if (!nest) goto nla_put_failure; if (set->size && nla_put_be32(skb, NFTA_SET_DESC_SIZE, htonl(nft_set_userspace_size(set->ops, set->size)))) goto nla_put_failure; if (set->field_count > 1 && nf_tables_fill_set_concat(skb, set)) goto nla_put_failure; nla_nest_end(skb, nest); if (set->num_exprs == 1) { nest = nla_nest_start_noflag(skb, NFTA_SET_EXPR); if (nf_tables_fill_expr_info(skb, set->exprs[0], false) < 0) goto nla_put_failure; nla_nest_end(skb, nest); } else if (set->num_exprs > 1) { nest = nla_nest_start_noflag(skb, NFTA_SET_EXPRESSIONS); if (nest == NULL) goto nla_put_failure; for (i = 0; i < set->num_exprs; i++) { if (nft_expr_dump(skb, NFTA_LIST_ELEM, set->exprs[i], false) < 0) goto nla_put_failure; } nla_nest_end(skb, nest); } nlmsg_end(skb, nlh); return 0; nla_put_failure: nlmsg_trim(skb, nlh); return -1; } static void nf_tables_set_notify(const struct nft_ctx *ctx, const struct nft_set *set, int event, gfp_t gfp_flags) { 
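	/*
	 * Queue a set notification on the per-netns notify list. This is a
	 * no-op when nobody subscribed to NFNLGRP_NFTABLES and no report was
	 * requested; on allocation or fill failure the error is signalled to
	 * listeners via nfnetlink_set_err(-ENOBUFS).
	 */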
struct nftables_pernet *nft_net = nft_pernet(ctx->net); u32 portid = ctx->portid; struct sk_buff *skb; u16 flags = 0; int err; if (!ctx->report && !nfnetlink_has_listeners(ctx->net, NFNLGRP_NFTABLES)) return; skb = nlmsg_new(NLMSG_GOODSIZE, gfp_flags); if (skb == NULL) goto err; if (ctx->flags & (NLM_F_CREATE | NLM_F_EXCL)) flags |= ctx->flags & (NLM_F_CREATE | NLM_F_EXCL); err = nf_tables_fill_set(skb, ctx, set, event, flags); if (err < 0) { kfree_skb(skb); goto err; } nft_notify_enqueue(skb, ctx->report, &nft_net->notify_list); return; err: nfnetlink_set_err(ctx->net, portid, NFNLGRP_NFTABLES, -ENOBUFS); } static int nf_tables_dump_sets(struct sk_buff *skb, struct netlink_callback *cb) { const struct nft_set *set; unsigned int idx, s_idx = cb->args[0]; struct nft_table *table, *cur_table = (struct nft_table *)cb->args[2]; struct net *net = sock_net(skb->sk); struct nft_ctx *ctx = cb->data, ctx_set; struct nftables_pernet *nft_net; if (cb->args[1]) return skb->len; rcu_read_lock(); nft_net = nft_pernet(net); cb->seq = READ_ONCE(nft_net->base_seq); list_for_each_entry_rcu(table, &nft_net->tables, list) { if (ctx->family != NFPROTO_UNSPEC && ctx->family != table->family) continue; if (ctx->table && ctx->table != table) continue; if (cur_table) { if (cur_table != table) continue; cur_table = NULL; } idx = 0; list_for_each_entry_rcu(set, &table->sets, list) { if (idx < s_idx) goto cont; if (!nft_is_active(net, set)) goto cont; ctx_set = *ctx; ctx_set.table = table; ctx_set.family = table->family; if (nf_tables_fill_set(skb, &ctx_set, set, NFT_MSG_NEWSET, NLM_F_MULTI) < 0) { cb->args[0] = idx; cb->args[2] = (unsigned long) table; goto done; } nl_dump_check_consistent(cb, nlmsg_hdr(skb)); cont: idx++; } if (s_idx) s_idx = 0; } cb->args[1] = 1; done: rcu_read_unlock(); return skb->len; } static int nf_tables_dump_sets_start(struct netlink_callback *cb) { struct nft_ctx *ctx_dump = NULL; ctx_dump = kmemdup(cb->data, sizeof(*ctx_dump), GFP_ATOMIC); if (ctx_dump == NULL) return -ENOMEM; cb->data = ctx_dump; return 0; } static int nf_tables_dump_sets_done(struct netlink_callback *cb) { kfree(cb->data); return 0; } /* called with rcu_read_lock held */ static int nf_tables_getset(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nla[]) { struct netlink_ext_ack *extack = info->extack; u8 genmask = nft_genmask_cur(info->net); u8 family = info->nfmsg->nfgen_family; struct nft_table *table = NULL; struct net *net = info->net; const struct nft_set *set; struct sk_buff *skb2; struct nft_ctx ctx; int err; if (nla[NFTA_SET_TABLE]) { table = nft_table_lookup(net, nla[NFTA_SET_TABLE], family, genmask, 0); if (IS_ERR(table)) { NL_SET_BAD_ATTR(extack, nla[NFTA_SET_TABLE]); return PTR_ERR(table); } } nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla); if (info->nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { .start = nf_tables_dump_sets_start, .dump = nf_tables_dump_sets, .done = nf_tables_dump_sets_done, .data = &ctx, .module = THIS_MODULE, }; return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c); } /* Only accept unspec with dump */ if (info->nfmsg->nfgen_family == NFPROTO_UNSPEC) return -EAFNOSUPPORT; if (!nla[NFTA_SET_TABLE]) return -EINVAL; set = nft_set_lookup(net, table, nla[NFTA_SET_NAME], genmask); if (IS_ERR(set)) { NL_SET_BAD_ATTR(extack, nla[NFTA_SET_NAME]); return PTR_ERR(set); } skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC); if (skb2 == NULL) return -ENOMEM; err = nf_tables_fill_set(skb2, &ctx, set, NFT_MSG_NEWSET, 0); if (err < 
0) goto err_fill_set_info; return nfnetlink_unicast(skb2, net, NETLINK_CB(skb).portid); err_fill_set_info: kfree_skb(skb2); return err; } static int nft_set_desc_concat_parse(const struct nlattr *attr, struct nft_set_desc *desc) { struct nlattr *tb[NFTA_SET_FIELD_MAX + 1]; u32 len; int err; if (desc->field_count >= ARRAY_SIZE(desc->field_len)) return -E2BIG; err = nla_parse_nested_deprecated(tb, NFTA_SET_FIELD_MAX, attr, nft_concat_policy, NULL); if (err < 0) return err; if (!tb[NFTA_SET_FIELD_LEN]) return -EINVAL; len = ntohl(nla_get_be32(tb[NFTA_SET_FIELD_LEN])); if (!len || len > U8_MAX) return -EINVAL; desc->field_len[desc->field_count++] = len; return 0; } static int nft_set_desc_concat(struct nft_set_desc *desc, const struct nlattr *nla) { u32 len = 0, num_regs; struct nlattr *attr; int rem, err, i; nla_for_each_nested(attr, nla, rem) { if (nla_type(attr) != NFTA_LIST_ELEM) return -EINVAL; err = nft_set_desc_concat_parse(attr, desc); if (err < 0) return err; } for (i = 0; i < desc->field_count; i++) len += round_up(desc->field_len[i], sizeof(u32)); if (len != desc->klen) return -EINVAL; num_regs = DIV_ROUND_UP(desc->klen, sizeof(u32)); if (num_regs > NFT_REG32_COUNT) return -E2BIG; return 0; } static int nf_tables_set_desc_parse(struct nft_set_desc *desc, const struct nlattr *nla) { struct nlattr *da[NFTA_SET_DESC_MAX + 1]; int err; err = nla_parse_nested_deprecated(da, NFTA_SET_DESC_MAX, nla, nft_set_desc_policy, NULL); if (err < 0) return err; if (da[NFTA_SET_DESC_SIZE] != NULL) desc->size = ntohl(nla_get_be32(da[NFTA_SET_DESC_SIZE])); if (da[NFTA_SET_DESC_CONCAT]) err = nft_set_desc_concat(desc, da[NFTA_SET_DESC_CONCAT]); return err; } static int nft_set_expr_alloc(struct nft_ctx *ctx, struct nft_set *set, const struct nlattr * const *nla, struct nft_expr **exprs, int *num_exprs, u32 flags) { struct nft_expr *expr; int err, i; if (nla[NFTA_SET_EXPR]) { expr = nft_set_elem_expr_alloc(ctx, set, nla[NFTA_SET_EXPR]); if (IS_ERR(expr)) { err = PTR_ERR(expr); goto err_set_expr_alloc; } exprs[0] = expr; (*num_exprs)++; } else if (nla[NFTA_SET_EXPRESSIONS]) { struct nlattr *tmp; int left; if (!(flags & NFT_SET_EXPR)) { err = -EINVAL; goto err_set_expr_alloc; } i = 0; nla_for_each_nested(tmp, nla[NFTA_SET_EXPRESSIONS], left) { if (i == NFT_SET_EXPR_MAX) { err = -E2BIG; goto err_set_expr_alloc; } if (nla_type(tmp) != NFTA_LIST_ELEM) { err = -EINVAL; goto err_set_expr_alloc; } expr = nft_set_elem_expr_alloc(ctx, set, tmp); if (IS_ERR(expr)) { err = PTR_ERR(expr); goto err_set_expr_alloc; } exprs[i++] = expr; (*num_exprs)++; } } return 0; err_set_expr_alloc: for (i = 0; i < *num_exprs; i++) nft_expr_destroy(ctx, exprs[i]); return err; } static bool nft_set_is_same(const struct nft_set *set, const struct nft_set_desc *desc, struct nft_expr *exprs[], u32 num_exprs, u32 flags) { int i; if (set->ktype != desc->ktype || set->dtype != desc->dtype || set->flags != flags || set->klen != desc->klen || set->dlen != desc->dlen || set->field_count != desc->field_count || set->num_exprs != num_exprs) return false; for (i = 0; i < desc->field_count; i++) { if (set->field_len[i] != desc->field_len[i]) return false; } for (i = 0; i < num_exprs; i++) { if (set->exprs[i]->ops != exprs[i]->ops) return false; } return true; } static u32 nft_set_kernel_size(const struct nft_set_ops *ops, const struct nft_set_desc *desc) { if (ops->ksize) return ops->ksize(desc->size); return desc->size; } static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nla[]) { struct 
netlink_ext_ack *extack = info->extack; u8 genmask = nft_genmask_next(info->net); u8 family = info->nfmsg->nfgen_family; const struct nft_set_ops *ops; struct net *net = info->net; struct nft_set_desc desc; struct nft_table *table; unsigned char *udata; struct nft_set *set; struct nft_ctx ctx; size_t alloc_size; int num_exprs = 0; char *name; int err, i; u16 udlen; u32 flags; u64 size; if (nla[NFTA_SET_TABLE] == NULL || nla[NFTA_SET_NAME] == NULL || nla[NFTA_SET_KEY_LEN] == NULL || nla[NFTA_SET_ID] == NULL) return -EINVAL; memset(&desc, 0, sizeof(desc)); desc.ktype = NFT_DATA_VALUE; if (nla[NFTA_SET_KEY_TYPE] != NULL) { desc.ktype = ntohl(nla_get_be32(nla[NFTA_SET_KEY_TYPE])); if ((desc.ktype & NFT_DATA_RESERVED_MASK) == NFT_DATA_RESERVED_MASK) return -EINVAL; } desc.klen = ntohl(nla_get_be32(nla[NFTA_SET_KEY_LEN])); if (desc.klen == 0 || desc.klen > NFT_DATA_VALUE_MAXLEN) return -EINVAL; flags = 0; if (nla[NFTA_SET_FLAGS] != NULL) { flags = ntohl(nla_get_be32(nla[NFTA_SET_FLAGS])); if (flags & ~(NFT_SET_ANONYMOUS | NFT_SET_CONSTANT | NFT_SET_INTERVAL | NFT_SET_TIMEOUT | NFT_SET_MAP | NFT_SET_EVAL | NFT_SET_OBJECT | NFT_SET_CONCAT | NFT_SET_EXPR)) return -EOPNOTSUPP; /* Only one of these operations is supported */ if ((flags & (NFT_SET_MAP | NFT_SET_OBJECT)) == (NFT_SET_MAP | NFT_SET_OBJECT)) return -EOPNOTSUPP; if ((flags & (NFT_SET_EVAL | NFT_SET_OBJECT)) == (NFT_SET_EVAL | NFT_SET_OBJECT)) return -EOPNOTSUPP; if ((flags & (NFT_SET_ANONYMOUS | NFT_SET_TIMEOUT | NFT_SET_EVAL)) == (NFT_SET_ANONYMOUS | NFT_SET_TIMEOUT)) return -EOPNOTSUPP; if ((flags & (NFT_SET_CONSTANT | NFT_SET_TIMEOUT)) == (NFT_SET_CONSTANT | NFT_SET_TIMEOUT)) return -EOPNOTSUPP; } desc.dtype = 0; if (nla[NFTA_SET_DATA_TYPE] != NULL) { if (!(flags & NFT_SET_MAP)) return -EINVAL; desc.dtype = ntohl(nla_get_be32(nla[NFTA_SET_DATA_TYPE])); if ((desc.dtype & NFT_DATA_RESERVED_MASK) == NFT_DATA_RESERVED_MASK && desc.dtype != NFT_DATA_VERDICT) return -EINVAL; if (desc.dtype != NFT_DATA_VERDICT) { if (nla[NFTA_SET_DATA_LEN] == NULL) return -EINVAL; desc.dlen = ntohl(nla_get_be32(nla[NFTA_SET_DATA_LEN])); if (desc.dlen == 0 || desc.dlen > NFT_DATA_VALUE_MAXLEN) return -EINVAL; } else desc.dlen = sizeof(struct nft_verdict); } else if (flags & NFT_SET_MAP) return -EINVAL; if (nla[NFTA_SET_OBJ_TYPE] != NULL) { if (!(flags & NFT_SET_OBJECT)) return -EINVAL; desc.objtype = ntohl(nla_get_be32(nla[NFTA_SET_OBJ_TYPE])); if (desc.objtype == NFT_OBJECT_UNSPEC || desc.objtype > NFT_OBJECT_MAX) return -EOPNOTSUPP; } else if (flags & NFT_SET_OBJECT) return -EINVAL; else desc.objtype = NFT_OBJECT_UNSPEC; desc.timeout = 0; if (nla[NFTA_SET_TIMEOUT] != NULL) { if (!(flags & NFT_SET_TIMEOUT)) return -EINVAL; if (flags & NFT_SET_ANONYMOUS) return -EOPNOTSUPP; err = nf_msecs_to_jiffies64(nla[NFTA_SET_TIMEOUT], &desc.timeout); if (err) return err; } desc.gc_int = 0; if (nla[NFTA_SET_GC_INTERVAL] != NULL) { if (!(flags & NFT_SET_TIMEOUT)) return -EINVAL; if (flags & NFT_SET_ANONYMOUS) return -EOPNOTSUPP; desc.gc_int = ntohl(nla_get_be32(nla[NFTA_SET_GC_INTERVAL])); } desc.policy = NFT_SET_POL_PERFORMANCE; if (nla[NFTA_SET_POLICY] != NULL) { desc.policy = ntohl(nla_get_be32(nla[NFTA_SET_POLICY])); switch (desc.policy) { case NFT_SET_POL_PERFORMANCE: case NFT_SET_POL_MEMORY: break; default: return -EOPNOTSUPP; } } if (nla[NFTA_SET_DESC] != NULL) { err = nf_tables_set_desc_parse(&desc, nla[NFTA_SET_DESC]); if (err < 0) return err; if (desc.field_count > 1) { if (!(flags & NFT_SET_CONCAT)) return -EINVAL; } else if (flags & NFT_SET_CONCAT) { return 
-EINVAL; } } else if (flags & NFT_SET_CONCAT) { return -EINVAL; } if (nla[NFTA_SET_EXPR] || nla[NFTA_SET_EXPRESSIONS]) desc.expr = true; table = nft_table_lookup(net, nla[NFTA_SET_TABLE], family, genmask, NETLINK_CB(skb).portid); if (IS_ERR(table)) { NL_SET_BAD_ATTR(extack, nla[NFTA_SET_TABLE]); return PTR_ERR(table); } nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla); set = nft_set_lookup(net, table, nla[NFTA_SET_NAME], genmask); if (IS_ERR(set)) { if (PTR_ERR(set) != -ENOENT) { NL_SET_BAD_ATTR(extack, nla[NFTA_SET_NAME]); return PTR_ERR(set); } } else { struct nft_expr *exprs[NFT_SET_EXPR_MAX] = {}; if (info->nlh->nlmsg_flags & NLM_F_EXCL) { NL_SET_BAD_ATTR(extack, nla[NFTA_SET_NAME]); return -EEXIST; } if (info->nlh->nlmsg_flags & NLM_F_REPLACE) return -EOPNOTSUPP; if (nft_set_is_anonymous(set)) return -EOPNOTSUPP; err = nft_set_expr_alloc(&ctx, set, nla, exprs, &num_exprs, flags); if (err < 0) return err; if (desc.size) desc.size = nft_set_kernel_size(set->ops, &desc); err = 0; if (!nft_set_is_same(set, &desc, exprs, num_exprs, flags)) { NL_SET_BAD_ATTR(extack, nla[NFTA_SET_NAME]); err = -EEXIST; } for (i = 0; i < num_exprs; i++) nft_expr_destroy(&ctx, exprs[i]); if (err < 0) return err; return __nft_trans_set_add(&ctx, NFT_MSG_NEWSET, set, &desc); } if (!(info->nlh->nlmsg_flags & NLM_F_CREATE)) return -ENOENT; ops = nft_select_set_ops(&ctx, flags, &desc); if (IS_ERR(ops)) return PTR_ERR(ops); if (desc.size) desc.size = nft_set_kernel_size(ops, &desc); udlen = 0; if (nla[NFTA_SET_USERDATA]) udlen = nla_len(nla[NFTA_SET_USERDATA]); size = 0; if (ops->privsize != NULL) size = ops->privsize(nla, &desc); alloc_size = sizeof(*set) + size + udlen; if (alloc_size < size || alloc_size > INT_MAX) return -ENOMEM; if (!nft_use_inc(&table->use)) return -EMFILE; set = kvzalloc(alloc_size, GFP_KERNEL_ACCOUNT); if (!set) { err = -ENOMEM; goto err_alloc; } name = nla_strdup(nla[NFTA_SET_NAME], GFP_KERNEL_ACCOUNT); if (!name) { err = -ENOMEM; goto err_set_name; } err = nf_tables_set_alloc_name(&ctx, set, name); kfree(name); if (err < 0) goto err_set_name; udata = NULL; if (udlen) { udata = set->data + size; nla_memcpy(udata, nla[NFTA_SET_USERDATA], udlen); } INIT_LIST_HEAD(&set->bindings); INIT_LIST_HEAD(&set->catchall_list); refcount_set(&set->refs, 1); set->table = table; write_pnet(&set->net, net); set->ops = ops; set->ktype = desc.ktype; set->klen = desc.klen; set->dtype = desc.dtype; set->objtype = desc.objtype; set->dlen = desc.dlen; set->flags = flags; set->size = desc.size; set->policy = desc.policy; set->udlen = udlen; set->udata = udata; set->timeout = desc.timeout; set->gc_int = desc.gc_int; set->field_count = desc.field_count; for (i = 0; i < desc.field_count; i++) set->field_len[i] = desc.field_len[i]; err = ops->init(set, &desc, nla); if (err < 0) goto err_set_init; err = nft_set_expr_alloc(&ctx, set, nla, set->exprs, &num_exprs, flags); if (err < 0) goto err_set_destroy; set->num_exprs = num_exprs; set->handle = nf_tables_alloc_handle(table); INIT_LIST_HEAD(&set->pending_update); err = nft_trans_set_add(&ctx, NFT_MSG_NEWSET, set); if (err < 0) goto err_set_expr_alloc; list_add_tail_rcu(&set->list, &table->sets); return 0; err_set_expr_alloc: for (i = 0; i < set->num_exprs; i++) nft_expr_destroy(&ctx, set->exprs[i]); err_set_destroy: ops->destroy(&ctx, set); err_set_init: kfree(set->name); err_set_name: kvfree(set); err_alloc: nft_use_dec_restore(&table->use); return err; } static void nft_set_catchall_destroy(const struct nft_ctx *ctx, struct nft_set *set) { struct 
nft_set_elem_catchall *next, *catchall; list_for_each_entry_safe(catchall, next, &set->catchall_list, list) { list_del_rcu(&catchall->list); nf_tables_set_elem_destroy(ctx, set, catchall->elem); kfree_rcu(catchall, rcu); } } static void nft_set_put(struct nft_set *set) { if (refcount_dec_and_test(&set->refs)) { kfree(set->name); kvfree(set); } } static void nft_set_destroy(const struct nft_ctx *ctx, struct nft_set *set) { int i; if (WARN_ON(set->use > 0)) return; for (i = 0; i < set->num_exprs; i++) nft_expr_destroy(ctx, set->exprs[i]); set->ops->destroy(ctx, set); nft_set_catchall_destroy(ctx, set); nft_set_put(set); } static int nf_tables_delset(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nla[]) { struct netlink_ext_ack *extack = info->extack; u8 genmask = nft_genmask_next(info->net); u8 family = info->nfmsg->nfgen_family; struct net *net = info->net; const struct nlattr *attr; struct nft_table *table; struct nft_set *set; struct nft_ctx ctx; if (info->nfmsg->nfgen_family == NFPROTO_UNSPEC) return -EAFNOSUPPORT; table = nft_table_lookup(net, nla[NFTA_SET_TABLE], family, genmask, NETLINK_CB(skb).portid); if (IS_ERR(table)) { NL_SET_BAD_ATTR(extack, nla[NFTA_SET_TABLE]); return PTR_ERR(table); } if (nla[NFTA_SET_HANDLE]) { attr = nla[NFTA_SET_HANDLE]; set = nft_set_lookup_byhandle(table, attr, genmask); } else { attr = nla[NFTA_SET_NAME]; set = nft_set_lookup(net, table, attr, genmask); } if (IS_ERR(set)) { if (PTR_ERR(set) == -ENOENT && NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_DESTROYSET) return 0; NL_SET_BAD_ATTR(extack, attr); return PTR_ERR(set); } if (set->use || (info->nlh->nlmsg_flags & NLM_F_NONREC && atomic_read(&set->nelems) > 0)) { NL_SET_BAD_ATTR(extack, attr); return -EBUSY; } nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla); return nft_delset(&ctx, set); } static int nft_validate_register_store(const struct nft_ctx *ctx, enum nft_registers reg, const struct nft_data *data, enum nft_data_types type, unsigned int len); static int nft_setelem_data_validate(const struct nft_ctx *ctx, struct nft_set *set, struct nft_elem_priv *elem_priv) { const struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv); enum nft_registers dreg; dreg = nft_type_to_reg(set->dtype); return nft_validate_register_store(ctx, dreg, nft_set_ext_data(ext), set->dtype == NFT_DATA_VERDICT ? 
NFT_DATA_VERDICT : NFT_DATA_VALUE, set->dlen); } static int nf_tables_bind_check_setelem(const struct nft_ctx *ctx, struct nft_set *set, const struct nft_set_iter *iter, struct nft_elem_priv *elem_priv) { const struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv); if (!nft_set_elem_active(ext, iter->genmask)) return 0; return nft_setelem_data_validate(ctx, set, elem_priv); } static int nft_set_catchall_bind_check(const struct nft_ctx *ctx, struct nft_set *set) { u8 genmask = nft_genmask_next(ctx->net); struct nft_set_elem_catchall *catchall; struct nft_set_ext *ext; int ret = 0; list_for_each_entry_rcu(catchall, &set->catchall_list, list, lockdep_commit_lock_is_held(ctx->net)) { ext = nft_set_elem_ext(set, catchall->elem); if (!nft_set_elem_active(ext, genmask)) continue; ret = nft_setelem_data_validate(ctx, set, catchall->elem); if (ret < 0) break; } return ret; } int nf_tables_bind_set(const struct nft_ctx *ctx, struct nft_set *set, struct nft_set_binding *binding) { struct nft_set_binding *i; struct nft_set_iter iter; if (!list_empty(&set->bindings) && nft_set_is_anonymous(set)) return -EBUSY; if (binding->flags & NFT_SET_MAP) { /* If the set is already bound to the same chain all * jumps are already validated for that chain. */ list_for_each_entry(i, &set->bindings, list) { if (i->flags & NFT_SET_MAP && i->chain == binding->chain) goto bind; } iter.genmask = nft_genmask_next(ctx->net); iter.type = NFT_ITER_UPDATE; iter.skip = 0; iter.count = 0; iter.err = 0; iter.fn = nf_tables_bind_check_setelem; set->ops->walk(ctx, set, &iter); if (!iter.err) iter.err = nft_set_catchall_bind_check(ctx, set); if (iter.err < 0) return iter.err; } bind: if (!nft_use_inc(&set->use)) return -EMFILE; binding->chain = ctx->chain; list_add_tail_rcu(&binding->list, &set->bindings); nft_set_trans_bind(ctx, set); return 0; } EXPORT_SYMBOL_GPL(nf_tables_bind_set); static void nf_tables_unbind_set(const struct nft_ctx *ctx, struct nft_set *set, struct nft_set_binding *binding, bool event) { list_del_rcu(&binding->list); if (list_empty(&set->bindings) && nft_set_is_anonymous(set)) { list_del_rcu(&set->list); set->dead = 1; if (event) nf_tables_set_notify(ctx, set, NFT_MSG_DELSET, GFP_KERNEL); } } static void nft_setelem_data_activate(const struct net *net, const struct nft_set *set, struct nft_elem_priv *elem_priv); static int nft_mapelem_activate(const struct nft_ctx *ctx, struct nft_set *set, const struct nft_set_iter *iter, struct nft_elem_priv *elem_priv) { struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv); /* called from abort path, reverse check to undo changes. 
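	 * Elements still active in the next generation are skipped; only
	 * those deactivated by the aborted transaction are cleared back to
	 * active and have their data references restored via
	 * nft_setelem_data_activate().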
*/ if (nft_set_elem_active(ext, iter->genmask)) return 0; nft_clear(ctx->net, ext); nft_setelem_data_activate(ctx->net, set, elem_priv); return 0; } static void nft_map_catchall_activate(const struct nft_ctx *ctx, struct nft_set *set) { u8 genmask = nft_genmask_next(ctx->net); struct nft_set_elem_catchall *catchall; struct nft_set_ext *ext; list_for_each_entry(catchall, &set->catchall_list, list) { ext = nft_set_elem_ext(set, catchall->elem); if (!nft_set_elem_active(ext, genmask)) continue; nft_clear(ctx->net, ext); nft_setelem_data_activate(ctx->net, set, catchall->elem); break; } } static void nft_map_activate(const struct nft_ctx *ctx, struct nft_set *set) { struct nft_set_iter iter = { .genmask = nft_genmask_next(ctx->net), .type = NFT_ITER_UPDATE, .fn = nft_mapelem_activate, }; set->ops->walk(ctx, set, &iter); WARN_ON_ONCE(iter.err); nft_map_catchall_activate(ctx, set); } void nf_tables_activate_set(const struct nft_ctx *ctx, struct nft_set *set) { if (nft_set_is_anonymous(set)) { if (set->flags & (NFT_SET_MAP | NFT_SET_OBJECT)) nft_map_activate(ctx, set); nft_clear(ctx->net, set); } nft_use_inc_restore(&set->use); } EXPORT_SYMBOL_GPL(nf_tables_activate_set); void nf_tables_deactivate_set(const struct nft_ctx *ctx, struct nft_set *set, struct nft_set_binding *binding, enum nft_trans_phase phase) { lockdep_commit_lock_is_held(ctx->net); switch (phase) { case NFT_TRANS_PREPARE_ERROR: nft_set_trans_unbind(ctx, set); if (nft_set_is_anonymous(set)) nft_deactivate_next(ctx->net, set); else list_del_rcu(&binding->list); nft_use_dec(&set->use); break; case NFT_TRANS_PREPARE: if (nft_set_is_anonymous(set)) { if (set->flags & (NFT_SET_MAP | NFT_SET_OBJECT)) nft_map_deactivate(ctx, set); nft_deactivate_next(ctx->net, set); } nft_use_dec(&set->use); return; case NFT_TRANS_ABORT: case NFT_TRANS_RELEASE: if (nft_set_is_anonymous(set) && set->flags & (NFT_SET_MAP | NFT_SET_OBJECT)) nft_map_deactivate(ctx, set); nft_use_dec(&set->use); fallthrough; default: nf_tables_unbind_set(ctx, set, binding, phase == NFT_TRANS_COMMIT); } } EXPORT_SYMBOL_GPL(nf_tables_deactivate_set); void nf_tables_destroy_set(const struct nft_ctx *ctx, struct nft_set *set) { if (list_empty(&set->bindings) && nft_set_is_anonymous(set)) nft_set_destroy(ctx, set); } EXPORT_SYMBOL_GPL(nf_tables_destroy_set); const struct nft_set_ext_type nft_set_ext_types[] = { [NFT_SET_EXT_KEY] = { .align = __alignof__(u32), }, [NFT_SET_EXT_DATA] = { .align = __alignof__(u32), }, [NFT_SET_EXT_EXPRESSIONS] = { .align = __alignof__(struct nft_set_elem_expr), }, [NFT_SET_EXT_OBJREF] = { .len = sizeof(struct nft_object *), .align = __alignof__(struct nft_object *), }, [NFT_SET_EXT_FLAGS] = { .len = sizeof(u8), .align = __alignof__(u8), }, [NFT_SET_EXT_TIMEOUT] = { .len = sizeof(struct nft_timeout), .align = __alignof__(struct nft_timeout), }, [NFT_SET_EXT_USERDATA] = { .len = sizeof(struct nft_userdata), .align = __alignof__(struct nft_userdata), }, [NFT_SET_EXT_KEY_END] = { .align = __alignof__(u32), }, }; /* * Set elements */ static const struct nla_policy nft_set_elem_policy[NFTA_SET_ELEM_MAX + 1] = { [NFTA_SET_ELEM_KEY] = { .type = NLA_NESTED }, [NFTA_SET_ELEM_DATA] = { .type = NLA_NESTED }, [NFTA_SET_ELEM_FLAGS] = { .type = NLA_U32 }, [NFTA_SET_ELEM_TIMEOUT] = { .type = NLA_U64 }, [NFTA_SET_ELEM_EXPIRATION] = { .type = NLA_U64 }, [NFTA_SET_ELEM_USERDATA] = { .type = NLA_BINARY, .len = NFT_USERDATA_MAXLEN }, [NFTA_SET_ELEM_EXPR] = { .type = NLA_NESTED }, [NFTA_SET_ELEM_OBJREF] = { .type = NLA_STRING, .len = NFT_OBJ_MAXNAMELEN - 1 }, 
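	/* closing key of a ranged element (nested nft_data, like NFTA_SET_ELEM_KEY) */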
[NFTA_SET_ELEM_KEY_END] = { .type = NLA_NESTED }, [NFTA_SET_ELEM_EXPRESSIONS] = NLA_POLICY_NESTED_ARRAY(nft_expr_policy), }; static const struct nla_policy nft_set_elem_list_policy[NFTA_SET_ELEM_LIST_MAX + 1] = { [NFTA_SET_ELEM_LIST_TABLE] = { .type = NLA_STRING, .len = NFT_TABLE_MAXNAMELEN - 1 }, [NFTA_SET_ELEM_LIST_SET] = { .type = NLA_STRING, .len = NFT_SET_MAXNAMELEN - 1 }, [NFTA_SET_ELEM_LIST_ELEMENTS] = NLA_POLICY_NESTED_ARRAY(nft_set_elem_policy), [NFTA_SET_ELEM_LIST_SET_ID] = { .type = NLA_U32 }, }; static int nft_set_elem_expr_dump(struct sk_buff *skb, const struct nft_set *set, const struct nft_set_ext *ext, bool reset) { struct nft_set_elem_expr *elem_expr; u32 size, num_exprs = 0; struct nft_expr *expr; struct nlattr *nest; elem_expr = nft_set_ext_expr(ext); nft_setelem_expr_foreach(expr, elem_expr, size) num_exprs++; if (num_exprs == 1) { expr = nft_setelem_expr_at(elem_expr, 0); if (nft_expr_dump(skb, NFTA_SET_ELEM_EXPR, expr, reset) < 0) return -1; return 0; } else if (num_exprs > 1) { nest = nla_nest_start_noflag(skb, NFTA_SET_ELEM_EXPRESSIONS); if (nest == NULL) goto nla_put_failure; nft_setelem_expr_foreach(expr, elem_expr, size) { expr = nft_setelem_expr_at(elem_expr, size); if (nft_expr_dump(skb, NFTA_LIST_ELEM, expr, reset) < 0) goto nla_put_failure; } nla_nest_end(skb, nest); } return 0; nla_put_failure: return -1; } static int nf_tables_fill_setelem(struct sk_buff *skb, const struct nft_set *set, const struct nft_elem_priv *elem_priv, bool reset) { const struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv); unsigned char *b = skb_tail_pointer(skb); struct nlattr *nest; nest = nla_nest_start_noflag(skb, NFTA_LIST_ELEM); if (nest == NULL) goto nla_put_failure; if (nft_set_ext_exists(ext, NFT_SET_EXT_KEY) && nft_data_dump(skb, NFTA_SET_ELEM_KEY, nft_set_ext_key(ext), NFT_DATA_VALUE, set->klen) < 0) goto nla_put_failure; if (nft_set_ext_exists(ext, NFT_SET_EXT_KEY_END) && nft_data_dump(skb, NFTA_SET_ELEM_KEY_END, nft_set_ext_key_end(ext), NFT_DATA_VALUE, set->klen) < 0) goto nla_put_failure; if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA) && nft_data_dump(skb, NFTA_SET_ELEM_DATA, nft_set_ext_data(ext), nft_set_datatype(set), set->dlen) < 0) goto nla_put_failure; if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPRESSIONS) && nft_set_elem_expr_dump(skb, set, ext, reset)) goto nla_put_failure; if (nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF) && nla_put_string(skb, NFTA_SET_ELEM_OBJREF, (*nft_set_ext_obj(ext))->key.name) < 0) goto nla_put_failure; if (nft_set_ext_exists(ext, NFT_SET_EXT_FLAGS) && nla_put_be32(skb, NFTA_SET_ELEM_FLAGS, htonl(*nft_set_ext_flags(ext)))) goto nla_put_failure; if (nft_set_ext_exists(ext, NFT_SET_EXT_TIMEOUT)) { u64 timeout = READ_ONCE(nft_set_ext_timeout(ext)->timeout); u64 set_timeout = READ_ONCE(set->timeout); __be64 msecs = 0; if (set_timeout != timeout) { msecs = nf_jiffies64_to_msecs(timeout); if (nla_put_be64(skb, NFTA_SET_ELEM_TIMEOUT, msecs, NFTA_SET_ELEM_PAD)) goto nla_put_failure; } if (timeout > 0) { u64 expires, now = get_jiffies_64(); expires = READ_ONCE(nft_set_ext_timeout(ext)->expiration); if (time_before64(now, expires)) expires -= now; else expires = 0; if (nla_put_be64(skb, NFTA_SET_ELEM_EXPIRATION, nf_jiffies64_to_msecs(expires), NFTA_SET_ELEM_PAD)) goto nla_put_failure; } } if (nft_set_ext_exists(ext, NFT_SET_EXT_USERDATA)) { struct nft_userdata *udata; udata = nft_set_ext_userdata(ext); if (nla_put(skb, NFTA_SET_ELEM_USERDATA, udata->len + 1, udata->data)) goto nla_put_failure; } nla_nest_end(skb, nest); return 0; 
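	/*
	 * Error path: trim the skb back to the tail pointer saved on entry so
	 * no partially filled element attribute is left behind, and let the
	 * caller handle -EMSGSIZE.
	 */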
nla_put_failure: nlmsg_trim(skb, b); return -EMSGSIZE; } struct nft_set_dump_args { const struct netlink_callback *cb; struct nft_set_iter iter; struct sk_buff *skb; bool reset; }; static int nf_tables_dump_setelem(const struct nft_ctx *ctx, struct nft_set *set, const struct nft_set_iter *iter, struct nft_elem_priv *elem_priv) { const struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv); struct nft_set_dump_args *args; if (!nft_set_elem_active(ext, iter->genmask)) return 0; if (nft_set_elem_expired(ext) || nft_set_elem_is_dead(ext)) return 0; args = container_of(iter, struct nft_set_dump_args, iter); return nf_tables_fill_setelem(args->skb, set, elem_priv, args->reset); } static void audit_log_nft_set_reset(const struct nft_table *table, unsigned int base_seq, unsigned int nentries) { char *buf = kasprintf(GFP_ATOMIC, "%s:%u", table->name, base_seq); audit_log_nfcfg(buf, table->family, nentries, AUDIT_NFT_OP_SETELEM_RESET, GFP_ATOMIC); kfree(buf); } struct nft_set_dump_ctx { const struct nft_set *set; struct nft_ctx ctx; bool reset; }; static int nft_set_catchall_dump(struct net *net, struct sk_buff *skb, const struct nft_set *set, bool reset, unsigned int base_seq) { struct nft_set_elem_catchall *catchall; u8 genmask = nft_genmask_cur(net); struct nft_set_ext *ext; int ret = 0; list_for_each_entry_rcu(catchall, &set->catchall_list, list) { ext = nft_set_elem_ext(set, catchall->elem); if (!nft_set_elem_active(ext, genmask) || nft_set_elem_expired(ext)) continue; ret = nf_tables_fill_setelem(skb, set, catchall->elem, reset); if (reset && !ret) audit_log_nft_set_reset(set->table, base_seq, 1); break; } return ret; } static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb) { struct nft_set_dump_ctx *dump_ctx = cb->data; struct net *net = sock_net(skb->sk); struct nftables_pernet *nft_net; struct nft_table *table; struct nft_set *set; struct nft_set_dump_args args; bool set_found = false; struct nlmsghdr *nlh; struct nlattr *nest; u32 portid, seq; int event; rcu_read_lock(); nft_net = nft_pernet(net); cb->seq = READ_ONCE(nft_net->base_seq); list_for_each_entry_rcu(table, &nft_net->tables, list) { if (dump_ctx->ctx.family != NFPROTO_UNSPEC && dump_ctx->ctx.family != table->family) continue; if (table != dump_ctx->ctx.table) continue; list_for_each_entry_rcu(set, &table->sets, list) { if (set == dump_ctx->set) { set_found = true; break; } } break; } if (!set_found) { rcu_read_unlock(); return -ENOENT; } event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, NFT_MSG_NEWSETELEM); portid = NETLINK_CB(cb->skb).portid; seq = cb->nlh->nlmsg_seq; nlh = nfnl_msg_put(skb, portid, seq, event, NLM_F_MULTI, table->family, NFNETLINK_V0, nft_base_seq(net)); if (!nlh) goto nla_put_failure; if (nla_put_string(skb, NFTA_SET_ELEM_LIST_TABLE, table->name)) goto nla_put_failure; if (nla_put_string(skb, NFTA_SET_ELEM_LIST_SET, set->name)) goto nla_put_failure; nest = nla_nest_start_noflag(skb, NFTA_SET_ELEM_LIST_ELEMENTS); if (nest == NULL) goto nla_put_failure; args.cb = cb; args.skb = skb; args.reset = dump_ctx->reset; args.iter.genmask = nft_genmask_cur(net); args.iter.type = NFT_ITER_READ; args.iter.skip = cb->args[0]; args.iter.count = 0; args.iter.err = 0; args.iter.fn = nf_tables_dump_setelem; set->ops->walk(&dump_ctx->ctx, set, &args.iter); if (!args.iter.err && args.iter.count == cb->args[0]) args.iter.err = nft_set_catchall_dump(net, skb, set, dump_ctx->reset, cb->seq); nla_nest_end(skb, nest); nlmsg_end(skb, nlh); rcu_read_unlock(); if (args.iter.err && args.iter.err != -EMSGSIZE) return 
args.iter.err; if (args.iter.count == cb->args[0]) return 0; cb->args[0] = args.iter.count; return skb->len; nla_put_failure: rcu_read_unlock(); return -ENOSPC; } static int nf_tables_dumpreset_set(struct sk_buff *skb, struct netlink_callback *cb) { struct nftables_pernet *nft_net = nft_pernet(sock_net(skb->sk)); struct nft_set_dump_ctx *dump_ctx = cb->data; int ret, skip = cb->args[0]; mutex_lock(&nft_net->commit_mutex); ret = nf_tables_dump_set(skb, cb); if (cb->args[0] > skip) audit_log_nft_set_reset(dump_ctx->ctx.table, cb->seq, cb->args[0] - skip); mutex_unlock(&nft_net->commit_mutex); return ret; } static int nf_tables_dump_set_start(struct netlink_callback *cb) { struct nft_set_dump_ctx *dump_ctx = cb->data; cb->data = kmemdup(dump_ctx, sizeof(*dump_ctx), GFP_ATOMIC); return cb->data ? 0 : -ENOMEM; } static int nf_tables_dump_set_done(struct netlink_callback *cb) { kfree(cb->data); return 0; } static int nf_tables_fill_setelem_info(struct sk_buff *skb, const struct nft_ctx *ctx, u32 seq, u32 portid, int event, u16 flags, const struct nft_set *set, const struct nft_elem_priv *elem_priv, bool reset) { struct nlmsghdr *nlh; struct nlattr *nest; int err; event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event); nlh = nfnl_msg_put(skb, portid, seq, event, flags, ctx->family, NFNETLINK_V0, nft_base_seq(ctx->net)); if (!nlh) goto nla_put_failure; if (nla_put_string(skb, NFTA_SET_TABLE, ctx->table->name)) goto nla_put_failure; if (nla_put_string(skb, NFTA_SET_NAME, set->name)) goto nla_put_failure; nest = nla_nest_start_noflag(skb, NFTA_SET_ELEM_LIST_ELEMENTS); if (nest == NULL) goto nla_put_failure; err = nf_tables_fill_setelem(skb, set, elem_priv, reset); if (err < 0) goto nla_put_failure; nla_nest_end(skb, nest); nlmsg_end(skb, nlh); return 0; nla_put_failure: nlmsg_trim(skb, nlh); return -1; } static int nft_setelem_parse_flags(const struct nft_set *set, const struct nlattr *attr, u32 *flags) { if (attr == NULL) return 0; *flags = ntohl(nla_get_be32(attr)); if (*flags & ~(NFT_SET_ELEM_INTERVAL_END | NFT_SET_ELEM_CATCHALL)) return -EOPNOTSUPP; if (!(set->flags & NFT_SET_INTERVAL) && *flags & NFT_SET_ELEM_INTERVAL_END) return -EINVAL; if ((*flags & (NFT_SET_ELEM_INTERVAL_END | NFT_SET_ELEM_CATCHALL)) == (NFT_SET_ELEM_INTERVAL_END | NFT_SET_ELEM_CATCHALL)) return -EINVAL; return 0; } static int nft_setelem_parse_key(struct nft_ctx *ctx, const struct nft_set *set, struct nft_data *key, struct nlattr *attr) { struct nft_data_desc desc = { .type = NFT_DATA_VALUE, .size = NFT_DATA_VALUE_MAXLEN, .len = set->klen, }; return nft_data_init(ctx, key, &desc, attr); } static int nft_setelem_parse_data(struct nft_ctx *ctx, struct nft_set *set, struct nft_data_desc *desc, struct nft_data *data, struct nlattr *attr) { u32 dtype; if (set->dtype == NFT_DATA_VERDICT) dtype = NFT_DATA_VERDICT; else dtype = NFT_DATA_VALUE; desc->type = dtype; desc->size = NFT_DATA_VALUE_MAXLEN; desc->len = set->dlen; desc->flags = NFT_DATA_DESC_SETELEM; return nft_data_init(ctx, data, desc, attr); } static void *nft_setelem_catchall_get(const struct net *net, const struct nft_set *set) { struct nft_set_elem_catchall *catchall; u8 genmask = nft_genmask_cur(net); struct nft_set_ext *ext; void *priv = NULL; list_for_each_entry_rcu(catchall, &set->catchall_list, list) { ext = nft_set_elem_ext(set, catchall->elem); if (!nft_set_elem_active(ext, genmask) || nft_set_elem_expired(ext)) continue; priv = catchall->elem; break; } return priv; } static int nft_setelem_get(struct nft_ctx *ctx, const struct nft_set *set, struct nft_set_elem 
*elem, u32 flags) { void *priv; if (!(flags & NFT_SET_ELEM_CATCHALL)) { priv = set->ops->get(ctx->net, set, elem, flags); if (IS_ERR(priv)) return PTR_ERR(priv); } else { priv = nft_setelem_catchall_get(ctx->net, set); if (!priv) return -ENOENT; } elem->priv = priv; return 0; } static int nft_get_set_elem(struct nft_ctx *ctx, const struct nft_set *set, const struct nlattr *attr, bool reset) { struct nlattr *nla[NFTA_SET_ELEM_MAX + 1]; struct nft_set_elem elem; struct sk_buff *skb; uint32_t flags = 0; int err; err = nla_parse_nested_deprecated(nla, NFTA_SET_ELEM_MAX, attr, nft_set_elem_policy, NULL); if (err < 0) return err; err = nft_setelem_parse_flags(set, nla[NFTA_SET_ELEM_FLAGS], &flags); if (err < 0) return err; if (!nla[NFTA_SET_ELEM_KEY] && !(flags & NFT_SET_ELEM_CATCHALL)) return -EINVAL; if (nla[NFTA_SET_ELEM_KEY]) { err = nft_setelem_parse_key(ctx, set, &elem.key.val, nla[NFTA_SET_ELEM_KEY]); if (err < 0) return err; } if (nla[NFTA_SET_ELEM_KEY_END]) { err = nft_setelem_parse_key(ctx, set, &elem.key_end.val, nla[NFTA_SET_ELEM_KEY_END]); if (err < 0) return err; } err = nft_setelem_get(ctx, set, &elem, flags); if (err < 0) return err; err = -ENOMEM; skb = nlmsg_new(NLMSG_GOODSIZE, GFP_ATOMIC); if (skb == NULL) return err; err = nf_tables_fill_setelem_info(skb, ctx, ctx->seq, ctx->portid, NFT_MSG_NEWSETELEM, 0, set, elem.priv, reset); if (err < 0) goto err_fill_setelem; return nfnetlink_unicast(skb, ctx->net, ctx->portid); err_fill_setelem: kfree_skb(skb); return err; } static int nft_set_dump_ctx_init(struct nft_set_dump_ctx *dump_ctx, const struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nla[], bool reset) { struct netlink_ext_ack *extack = info->extack; u8 genmask = nft_genmask_cur(info->net); u8 family = info->nfmsg->nfgen_family; struct net *net = info->net; struct nft_table *table; struct nft_set *set; table = nft_table_lookup(net, nla[NFTA_SET_ELEM_LIST_TABLE], family, genmask, 0); if (IS_ERR(table)) { NL_SET_BAD_ATTR(extack, nla[NFTA_SET_ELEM_LIST_TABLE]); return PTR_ERR(table); } set = nft_set_lookup(net, table, nla[NFTA_SET_ELEM_LIST_SET], genmask); if (IS_ERR(set)) { NL_SET_BAD_ATTR(extack, nla[NFTA_SET_ELEM_LIST_SET]); return PTR_ERR(set); } nft_ctx_init(&dump_ctx->ctx, net, skb, info->nlh, family, table, NULL, nla); dump_ctx->set = set; dump_ctx->reset = reset; return 0; } /* called with rcu_read_lock held */ static int nf_tables_getsetelem(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nla[]) { struct netlink_ext_ack *extack = info->extack; struct nft_set_dump_ctx dump_ctx; struct nlattr *attr; int rem, err = 0; if (info->nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { .start = nf_tables_dump_set_start, .dump = nf_tables_dump_set, .done = nf_tables_dump_set_done, .module = THIS_MODULE, }; err = nft_set_dump_ctx_init(&dump_ctx, skb, info, nla, false); if (err) return err; c.data = &dump_ctx; return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c); } if (!nla[NFTA_SET_ELEM_LIST_ELEMENTS]) return -EINVAL; err = nft_set_dump_ctx_init(&dump_ctx, skb, info, nla, false); if (err) return err; nla_for_each_nested(attr, nla[NFTA_SET_ELEM_LIST_ELEMENTS], rem) { err = nft_get_set_elem(&dump_ctx.ctx, dump_ctx.set, attr, false); if (err < 0) { NL_SET_BAD_ATTR(extack, attr); break; } } return err; } static int nf_tables_getsetelem_reset(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nla[]) { struct nftables_pernet *nft_net = nft_pernet(info->net); struct 
netlink_ext_ack *extack = info->extack; struct nft_set_dump_ctx dump_ctx; int rem, err = 0, nelems = 0; struct nlattr *attr; if (info->nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { .start = nf_tables_dump_set_start, .dump = nf_tables_dumpreset_set, .done = nf_tables_dump_set_done, .module = THIS_MODULE, }; err = nft_set_dump_ctx_init(&dump_ctx, skb, info, nla, true); if (err) return err; c.data = &dump_ctx; return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c); } if (!nla[NFTA_SET_ELEM_LIST_ELEMENTS]) return -EINVAL; if (!try_module_get(THIS_MODULE)) return -EINVAL; rcu_read_unlock(); mutex_lock(&nft_net->commit_mutex); rcu_read_lock(); err = nft_set_dump_ctx_init(&dump_ctx, skb, info, nla, true); if (err) goto out_unlock; nla_for_each_nested(attr, nla[NFTA_SET_ELEM_LIST_ELEMENTS], rem) { err = nft_get_set_elem(&dump_ctx.ctx, dump_ctx.set, attr, true); if (err < 0) { NL_SET_BAD_ATTR(extack, attr); break; } nelems++; } audit_log_nft_set_reset(dump_ctx.ctx.table, nft_net->base_seq, nelems); out_unlock: rcu_read_unlock(); mutex_unlock(&nft_net->commit_mutex); rcu_read_lock(); module_put(THIS_MODULE); return err; } static void nf_tables_setelem_notify(const struct nft_ctx *ctx, const struct nft_set *set, const struct nft_elem_priv *elem_priv, int event) { struct nftables_pernet *nft_net; struct net *net = ctx->net; u32 portid = ctx->portid; struct sk_buff *skb; u16 flags = 0; int err; if (!ctx->report && !nfnetlink_has_listeners(net, NFNLGRP_NFTABLES)) return; skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); if (skb == NULL) goto err; if (ctx->flags & (NLM_F_CREATE | NLM_F_EXCL)) flags |= ctx->flags & (NLM_F_CREATE | NLM_F_EXCL); err = nf_tables_fill_setelem_info(skb, ctx, 0, portid, event, flags, set, elem_priv, false); if (err < 0) { kfree_skb(skb); goto err; } nft_net = nft_pernet(net); nft_notify_enqueue(skb, ctx->report, &nft_net->notify_list); return; err: nfnetlink_set_err(net, portid, NFNLGRP_NFTABLES, -ENOBUFS); } static struct nft_trans *nft_trans_elem_alloc(const struct nft_ctx *ctx, int msg_type, struct nft_set *set) { struct nft_trans_elem *te; struct nft_trans *trans; trans = nft_trans_alloc(ctx, msg_type, struct_size(te, elems, 1)); if (trans == NULL) return NULL; te = nft_trans_container_elem(trans); te->nelems = 1; te->set = set; return trans; } struct nft_expr *nft_set_elem_expr_alloc(const struct nft_ctx *ctx, const struct nft_set *set, const struct nlattr *attr) { struct nft_expr *expr; int err; expr = nft_expr_init(ctx, attr); if (IS_ERR(expr)) return expr; err = -EOPNOTSUPP; if (expr->ops->type->flags & NFT_EXPR_GC) { if (set->flags & NFT_SET_TIMEOUT) goto err_set_elem_expr; if (!set->ops->gc_init) goto err_set_elem_expr; set->ops->gc_init(set); } return expr; err_set_elem_expr: nft_expr_destroy(ctx, expr); return ERR_PTR(err); } static int nft_set_ext_check(const struct nft_set_ext_tmpl *tmpl, u8 id, u32 len) { len += nft_set_ext_types[id].len; if (len > tmpl->ext_len[id] || len > U8_MAX) return -1; return 0; } static int nft_set_ext_memcpy(const struct nft_set_ext_tmpl *tmpl, u8 id, void *to, const void *from, u32 len) { if (nft_set_ext_check(tmpl, id, len) < 0) return -1; memcpy(to, from, len); return 0; } struct nft_elem_priv *nft_set_elem_init(const struct nft_set *set, const struct nft_set_ext_tmpl *tmpl, const u32 *key, const u32 *key_end, const u32 *data, u64 timeout, u64 expiration, gfp_t gfp) { struct nft_set_ext *ext; void *elem; elem = kzalloc(set->ops->elemsize + tmpl->len, gfp); if (elem == NULL) return ERR_PTR(-ENOMEM); ext = 
nft_set_elem_ext(set, elem); nft_set_ext_init(ext, tmpl); if (nft_set_ext_exists(ext, NFT_SET_EXT_KEY) && nft_set_ext_memcpy(tmpl, NFT_SET_EXT_KEY, nft_set_ext_key(ext), key, set->klen) < 0) goto err_ext_check; if (nft_set_ext_exists(ext, NFT_SET_EXT_KEY_END) && nft_set_ext_memcpy(tmpl, NFT_SET_EXT_KEY_END, nft_set_ext_key_end(ext), key_end, set->klen) < 0) goto err_ext_check; if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA) && nft_set_ext_memcpy(tmpl, NFT_SET_EXT_DATA, nft_set_ext_data(ext), data, set->dlen) < 0) goto err_ext_check; if (nft_set_ext_exists(ext, NFT_SET_EXT_TIMEOUT)) { nft_set_ext_timeout(ext)->timeout = timeout; if (expiration == 0) expiration = timeout; nft_set_ext_timeout(ext)->expiration = get_jiffies_64() + expiration; } return elem; err_ext_check: kfree(elem); return ERR_PTR(-EINVAL); } static void __nft_set_elem_expr_destroy(const struct nft_ctx *ctx, struct nft_expr *expr) { if (expr->ops->destroy_clone) { expr->ops->destroy_clone(ctx, expr); module_put(expr->ops->type->owner); } else { nf_tables_expr_destroy(ctx, expr); } } static void nft_set_elem_expr_destroy(const struct nft_ctx *ctx, struct nft_set_elem_expr *elem_expr) { struct nft_expr *expr; u32 size; nft_setelem_expr_foreach(expr, elem_expr, size) __nft_set_elem_expr_destroy(ctx, expr); } /* Drop references and destroy. Called from gc, dynset and abort path. */ static void __nft_set_elem_destroy(const struct nft_ctx *ctx, const struct nft_set *set, const struct nft_elem_priv *elem_priv, bool destroy_expr) { struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv); nft_data_release(nft_set_ext_key(ext), NFT_DATA_VALUE); if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA)) nft_data_release(nft_set_ext_data(ext), set->dtype); if (destroy_expr && nft_set_ext_exists(ext, NFT_SET_EXT_EXPRESSIONS)) nft_set_elem_expr_destroy(ctx, nft_set_ext_expr(ext)); if (nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF)) nft_use_dec(&(*nft_set_ext_obj(ext))->use); kfree(elem_priv); } /* Drop references and destroy. Called from gc and dynset. */ void nft_set_elem_destroy(const struct nft_set *set, const struct nft_elem_priv *elem_priv, bool destroy_expr) { struct nft_ctx ctx = { .net = read_pnet(&set->net), .family = set->table->family, }; __nft_set_elem_destroy(&ctx, set, elem_priv, destroy_expr); } EXPORT_SYMBOL_GPL(nft_set_elem_destroy); /* Drop references and destroy. Called from abort path. */ static void nft_trans_set_elem_destroy(const struct nft_ctx *ctx, struct nft_trans_elem *te) { int i; for (i = 0; i < te->nelems; i++) { /* skip update request, see nft_trans_elems_new_abort() */ if (!te->elems[i].priv) continue; __nft_set_elem_destroy(ctx, te->set, te->elems[i].priv, true); } } /* Destroy element. References have been already dropped in the preparation * path via nft_setelem_data_deactivate(). 
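 *
 * Example -- a simplified, self-contained sketch (the demo_* names are
 * illustrative, not the kernel API): the preparation step drops the
 * references and the later destroy step only frees the memory.
 */
#if 0	/* illustrative userspace sketch, not built as part of this file */
#include <assert.h>
#include <stdbool.h>
#include <stdlib.h>

struct demo_obj { int use; };			/* shared object with a use counter */

struct demo_elem {
	struct demo_obj *obj;			/* reference taken at insert time */
	bool deactivated;			/* reference already dropped? */
};

/* preparation path: drop the reference, keep the memory around */
static void demo_elem_deactivate(struct demo_elem *e)
{
	if (e->obj)
		e->obj->use--;
	e->deactivated = true;
}

/* final destruction: memory only, the references are gone already */
static void demo_elem_destroy(struct demo_elem *e)
{
	assert(e->deactivated);
	free(e);
}

int main(void)
{
	struct demo_obj obj = { .use = 1 };
	struct demo_elem *e = calloc(1, sizeof(*e));

	if (!e)
		return 1;
	e->obj = &obj;
	obj.use++;				/* insert takes a reference */
	demo_elem_deactivate(e);		/* transaction preparation */
	demo_elem_destroy(e);			/* commit: free only */
	return obj.use == 1 ? 0 : 1;
}
#endif
/*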
*/ void nf_tables_set_elem_destroy(const struct nft_ctx *ctx, const struct nft_set *set, const struct nft_elem_priv *elem_priv) { struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv); if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPRESSIONS)) nft_set_elem_expr_destroy(ctx, nft_set_ext_expr(ext)); kfree(elem_priv); } static void nft_trans_elems_destroy(const struct nft_ctx *ctx, const struct nft_trans_elem *te) { int i; for (i = 0; i < te->nelems; i++) nf_tables_set_elem_destroy(ctx, te->set, te->elems[i].priv); } int nft_set_elem_expr_clone(const struct nft_ctx *ctx, struct nft_set *set, struct nft_expr *expr_array[]) { struct nft_expr *expr; int err, i, k; for (i = 0; i < set->num_exprs; i++) { expr = kzalloc(set->exprs[i]->ops->size, GFP_KERNEL_ACCOUNT); if (!expr) goto err_expr; err = nft_expr_clone(expr, set->exprs[i], GFP_KERNEL_ACCOUNT); if (err < 0) { kfree(expr); goto err_expr; } expr_array[i] = expr; } return 0; err_expr: for (k = i - 1; k >= 0; k--) nft_expr_destroy(ctx, expr_array[k]); return -ENOMEM; } static int nft_set_elem_expr_setup(struct nft_ctx *ctx, const struct nft_set_ext_tmpl *tmpl, const struct nft_set_ext *ext, struct nft_expr *expr_array[], u32 num_exprs) { struct nft_set_elem_expr *elem_expr = nft_set_ext_expr(ext); u32 len = sizeof(struct nft_set_elem_expr); struct nft_expr *expr; int i, err; if (num_exprs == 0) return 0; for (i = 0; i < num_exprs; i++) len += expr_array[i]->ops->size; if (nft_set_ext_check(tmpl, NFT_SET_EXT_EXPRESSIONS, len) < 0) return -EINVAL; for (i = 0; i < num_exprs; i++) { expr = nft_setelem_expr_at(elem_expr, elem_expr->size); err = nft_expr_clone(expr, expr_array[i], GFP_KERNEL_ACCOUNT); if (err < 0) goto err_elem_expr_setup; elem_expr->size += expr_array[i]->ops->size; nft_expr_destroy(ctx, expr_array[i]); expr_array[i] = NULL; } return 0; err_elem_expr_setup: for (; i < num_exprs; i++) { nft_expr_destroy(ctx, expr_array[i]); expr_array[i] = NULL; } return -ENOMEM; } struct nft_set_ext *nft_set_catchall_lookup(const struct net *net, const struct nft_set *set) { struct nft_set_elem_catchall *catchall; u8 genmask = nft_genmask_cur(net); struct nft_set_ext *ext; list_for_each_entry_rcu(catchall, &set->catchall_list, list) { ext = nft_set_elem_ext(set, catchall->elem); if (nft_set_elem_active(ext, genmask) && !nft_set_elem_expired(ext) && !nft_set_elem_is_dead(ext)) return ext; } return NULL; } EXPORT_SYMBOL_GPL(nft_set_catchall_lookup); static int nft_setelem_catchall_insert(const struct net *net, struct nft_set *set, const struct nft_set_elem *elem, struct nft_elem_priv **priv) { struct nft_set_elem_catchall *catchall; u8 genmask = nft_genmask_next(net); struct nft_set_ext *ext; list_for_each_entry(catchall, &set->catchall_list, list) { ext = nft_set_elem_ext(set, catchall->elem); if (nft_set_elem_active(ext, genmask)) { *priv = catchall->elem; return -EEXIST; } } catchall = kmalloc(sizeof(*catchall), GFP_KERNEL_ACCOUNT); if (!catchall) return -ENOMEM; catchall->elem = elem->priv; list_add_tail_rcu(&catchall->list, &set->catchall_list); return 0; } static int nft_setelem_insert(const struct net *net, struct nft_set *set, const struct nft_set_elem *elem, struct nft_elem_priv **elem_priv, unsigned int flags) { int ret; if (flags & NFT_SET_ELEM_CATCHALL) ret = nft_setelem_catchall_insert(net, set, elem, elem_priv); else ret = set->ops->insert(net, set, elem, elem_priv); return ret; } static bool nft_setelem_is_catchall(const struct nft_set *set, const struct nft_elem_priv *elem_priv) { struct nft_set_ext *ext = nft_set_elem_ext(set, 
elem_priv); if (nft_set_ext_exists(ext, NFT_SET_EXT_FLAGS) && *nft_set_ext_flags(ext) & NFT_SET_ELEM_CATCHALL) return true; return false; } static void nft_setelem_activate(struct net *net, struct nft_set *set, struct nft_elem_priv *elem_priv) { struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv); if (nft_setelem_is_catchall(set, elem_priv)) { nft_clear(net, ext); } else { set->ops->activate(net, set, elem_priv); } } static void nft_trans_elem_update(const struct nft_set *set, const struct nft_trans_one_elem *elem) { const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv); const struct nft_elem_update *update = elem->update; if (update->flags & NFT_TRANS_UPD_TIMEOUT) WRITE_ONCE(nft_set_ext_timeout(ext)->timeout, update->timeout); if (update->flags & NFT_TRANS_UPD_EXPIRATION) WRITE_ONCE(nft_set_ext_timeout(ext)->expiration, get_jiffies_64() + update->expiration); } static void nft_trans_elems_add(const struct nft_ctx *ctx, struct nft_trans_elem *te) { int i; for (i = 0; i < te->nelems; i++) { struct nft_trans_one_elem *elem = &te->elems[i]; if (elem->update) nft_trans_elem_update(te->set, elem); else nft_setelem_activate(ctx->net, te->set, elem->priv); nf_tables_setelem_notify(ctx, te->set, elem->priv, NFT_MSG_NEWSETELEM); kfree(elem->update); } } static int nft_setelem_catchall_deactivate(const struct net *net, struct nft_set *set, struct nft_set_elem *elem) { struct nft_set_elem_catchall *catchall; struct nft_set_ext *ext; list_for_each_entry(catchall, &set->catchall_list, list) { ext = nft_set_elem_ext(set, catchall->elem); if (!nft_is_active_next(net, ext)) continue; kfree(elem->priv); elem->priv = catchall->elem; nft_set_elem_change_active(net, set, ext); return 0; } return -ENOENT; } static int __nft_setelem_deactivate(const struct net *net, struct nft_set *set, struct nft_set_elem *elem) { void *priv; priv = set->ops->deactivate(net, set, elem); if (!priv) return -ENOENT; kfree(elem->priv); elem->priv = priv; set->ndeact++; return 0; } static int nft_setelem_deactivate(const struct net *net, struct nft_set *set, struct nft_set_elem *elem, u32 flags) { int ret; if (flags & NFT_SET_ELEM_CATCHALL) ret = nft_setelem_catchall_deactivate(net, set, elem); else ret = __nft_setelem_deactivate(net, set, elem); return ret; } static void nft_setelem_catchall_destroy(struct nft_set_elem_catchall *catchall) { list_del_rcu(&catchall->list); kfree_rcu(catchall, rcu); } static void nft_setelem_catchall_remove(const struct net *net, const struct nft_set *set, struct nft_elem_priv *elem_priv) { struct nft_set_elem_catchall *catchall, *next; list_for_each_entry_safe(catchall, next, &set->catchall_list, list) { if (catchall->elem == elem_priv) { nft_setelem_catchall_destroy(catchall); break; } } } static void nft_setelem_remove(const struct net *net, const struct nft_set *set, struct nft_elem_priv *elem_priv) { if (nft_setelem_is_catchall(set, elem_priv)) nft_setelem_catchall_remove(net, set, elem_priv); else set->ops->remove(net, set, elem_priv); } static void nft_trans_elems_remove(const struct nft_ctx *ctx, const struct nft_trans_elem *te) { int i; for (i = 0; i < te->nelems; i++) { WARN_ON_ONCE(te->elems[i].update); nf_tables_setelem_notify(ctx, te->set, te->elems[i].priv, te->nft_trans.msg_type); nft_setelem_remove(ctx->net, te->set, te->elems[i].priv); if (!nft_setelem_is_catchall(te->set, te->elems[i].priv)) { atomic_dec(&te->set->nelems); te->set->ndeact--; } } } static bool nft_setelem_valid_key_end(const struct nft_set *set, struct nlattr **nla, u32 flags) { if ((set->flags & 
(NFT_SET_CONCAT | NFT_SET_INTERVAL)) == (NFT_SET_CONCAT | NFT_SET_INTERVAL)) { if (flags & NFT_SET_ELEM_INTERVAL_END) return false; if (nla[NFTA_SET_ELEM_KEY_END] && flags & NFT_SET_ELEM_CATCHALL) return false; } else { if (nla[NFTA_SET_ELEM_KEY_END]) return false; } return true; } static u32 nft_set_maxsize(const struct nft_set *set) { u32 maxsize, delta; if (!set->size) return UINT_MAX; if (set->ops->adjust_maxsize) delta = set->ops->adjust_maxsize(set); else delta = 0; if (check_add_overflow(set->size, set->ndeact, &maxsize)) return UINT_MAX; if (check_add_overflow(maxsize, delta, &maxsize)) return UINT_MAX; return maxsize; } static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, const struct nlattr *attr, u32 nlmsg_flags) { struct nft_expr *expr_array[NFT_SET_EXPR_MAX] = {}; struct nlattr *nla[NFTA_SET_ELEM_MAX + 1]; u8 genmask = nft_genmask_next(ctx->net); u32 flags = 0, size = 0, num_exprs = 0; struct nft_set_ext_tmpl tmpl; struct nft_set_ext *ext, *ext2; struct nft_set_elem elem; struct nft_set_binding *binding; struct nft_elem_priv *elem_priv; struct nft_object *obj = NULL; struct nft_userdata *udata; struct nft_data_desc desc; enum nft_registers dreg; struct nft_trans *trans; u64 expiration; u64 timeout; int err, i; u8 ulen; err = nla_parse_nested_deprecated(nla, NFTA_SET_ELEM_MAX, attr, nft_set_elem_policy, NULL); if (err < 0) return err; nft_set_ext_prepare(&tmpl); err = nft_setelem_parse_flags(set, nla[NFTA_SET_ELEM_FLAGS], &flags); if (err < 0) return err; if (((flags & NFT_SET_ELEM_CATCHALL) && nla[NFTA_SET_ELEM_KEY]) || (!(flags & NFT_SET_ELEM_CATCHALL) && !nla[NFTA_SET_ELEM_KEY])) return -EINVAL; if (flags != 0) { err = nft_set_ext_add(&tmpl, NFT_SET_EXT_FLAGS); if (err < 0) return err; } if (set->flags & NFT_SET_MAP) { if (nla[NFTA_SET_ELEM_DATA] == NULL && !(flags & NFT_SET_ELEM_INTERVAL_END)) return -EINVAL; } else { if (nla[NFTA_SET_ELEM_DATA] != NULL) return -EINVAL; } if (set->flags & NFT_SET_OBJECT) { if (!nla[NFTA_SET_ELEM_OBJREF] && !(flags & NFT_SET_ELEM_INTERVAL_END)) return -EINVAL; } else { if (nla[NFTA_SET_ELEM_OBJREF]) return -EINVAL; } if (!nft_setelem_valid_key_end(set, nla, flags)) return -EINVAL; if ((flags & NFT_SET_ELEM_INTERVAL_END) && (nla[NFTA_SET_ELEM_DATA] || nla[NFTA_SET_ELEM_OBJREF] || nla[NFTA_SET_ELEM_TIMEOUT] || nla[NFTA_SET_ELEM_EXPIRATION] || nla[NFTA_SET_ELEM_USERDATA] || nla[NFTA_SET_ELEM_EXPR] || nla[NFTA_SET_ELEM_KEY_END] || nla[NFTA_SET_ELEM_EXPRESSIONS])) return -EINVAL; timeout = 0; if (nla[NFTA_SET_ELEM_TIMEOUT] != NULL) { if (!(set->flags & NFT_SET_TIMEOUT)) return -EINVAL; err = nf_msecs_to_jiffies64(nla[NFTA_SET_ELEM_TIMEOUT], &timeout); if (err) return err; } else if (set->flags & NFT_SET_TIMEOUT && !(flags & NFT_SET_ELEM_INTERVAL_END)) { timeout = set->timeout; } expiration = 0; if (nla[NFTA_SET_ELEM_EXPIRATION] != NULL) { if (!(set->flags & NFT_SET_TIMEOUT)) return -EINVAL; if (timeout == 0) return -EOPNOTSUPP; err = nf_msecs_to_jiffies64(nla[NFTA_SET_ELEM_EXPIRATION], &expiration); if (err) return err; if (expiration > timeout) return -ERANGE; } if (nla[NFTA_SET_ELEM_EXPR]) { struct nft_expr *expr; if (set->num_exprs && set->num_exprs != 1) return -EOPNOTSUPP; expr = nft_set_elem_expr_alloc(ctx, set, nla[NFTA_SET_ELEM_EXPR]); if (IS_ERR(expr)) return PTR_ERR(expr); expr_array[0] = expr; num_exprs = 1; if (set->num_exprs && set->exprs[0]->ops != expr->ops) { err = -EOPNOTSUPP; goto err_set_elem_expr; } } else if (nla[NFTA_SET_ELEM_EXPRESSIONS]) { struct nft_expr *expr; struct nlattr *tmp; int left; i = 0; 
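	/*
	 * Example -- a simplified, self-contained sketch (the demo_* names are
	 * illustrative, not the kernel API): nft_set_maxsize() above widens the
	 * configured size by the pending deactivations plus a backend delta and
	 * saturates to UINT_MAX if either addition overflows; the sketch uses
	 * the compiler builtin that the kernel's check_add_overflow() helper is
	 * built on.
	 */
#if 0	/* illustrative userspace sketch, not built as part of this file */
#include <limits.h>
#include <stdio.h>

static unsigned int demo_maxsize(unsigned int size, unsigned int ndeact,
				 unsigned int delta)
{
	unsigned int max;

	if (!size)
		return UINT_MAX;		/* set has no configured limit */
	if (__builtin_add_overflow(size, ndeact, &max))
		return UINT_MAX;
	if (__builtin_add_overflow(max, delta, &max))
		return UINT_MAX;
	return max;
}

int main(void)
{
	printf("%u\n", demo_maxsize(1000, 3, 0));	/* 1003 */
	printf("%u\n", demo_maxsize(UINT_MAX, 1, 0));	/* saturates to UINT_MAX */
	return 0;
}
#endif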
nla_for_each_nested(tmp, nla[NFTA_SET_ELEM_EXPRESSIONS], left) { if (i == NFT_SET_EXPR_MAX || (set->num_exprs && set->num_exprs == i)) { err = -E2BIG; goto err_set_elem_expr; } if (nla_type(tmp) != NFTA_LIST_ELEM) { err = -EINVAL; goto err_set_elem_expr; } expr = nft_set_elem_expr_alloc(ctx, set, tmp); if (IS_ERR(expr)) { err = PTR_ERR(expr); goto err_set_elem_expr; } expr_array[i] = expr; num_exprs++; if (set->num_exprs && expr->ops != set->exprs[i]->ops) { err = -EOPNOTSUPP; goto err_set_elem_expr; } i++; } if (set->num_exprs && set->num_exprs != i) { err = -EOPNOTSUPP; goto err_set_elem_expr; } } else if (set->num_exprs > 0 && !(flags & NFT_SET_ELEM_INTERVAL_END)) { err = nft_set_elem_expr_clone(ctx, set, expr_array); if (err < 0) goto err_set_elem_expr_clone; num_exprs = set->num_exprs; } if (nla[NFTA_SET_ELEM_KEY]) { err = nft_setelem_parse_key(ctx, set, &elem.key.val, nla[NFTA_SET_ELEM_KEY]); if (err < 0) goto err_set_elem_expr; err = nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY, set->klen); if (err < 0) goto err_parse_key; } if (nla[NFTA_SET_ELEM_KEY_END]) { err = nft_setelem_parse_key(ctx, set, &elem.key_end.val, nla[NFTA_SET_ELEM_KEY_END]); if (err < 0) goto err_parse_key; err = nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY_END, set->klen); if (err < 0) goto err_parse_key_end; } if (set->flags & NFT_SET_TIMEOUT) { err = nft_set_ext_add(&tmpl, NFT_SET_EXT_TIMEOUT); if (err < 0) goto err_parse_key_end; } if (num_exprs) { for (i = 0; i < num_exprs; i++) size += expr_array[i]->ops->size; err = nft_set_ext_add_length(&tmpl, NFT_SET_EXT_EXPRESSIONS, sizeof(struct nft_set_elem_expr) + size); if (err < 0) goto err_parse_key_end; } if (nla[NFTA_SET_ELEM_OBJREF] != NULL) { obj = nft_obj_lookup(ctx->net, ctx->table, nla[NFTA_SET_ELEM_OBJREF], set->objtype, genmask); if (IS_ERR(obj)) { err = PTR_ERR(obj); obj = NULL; goto err_parse_key_end; } if (!nft_use_inc(&obj->use)) { err = -EMFILE; obj = NULL; goto err_parse_key_end; } err = nft_set_ext_add(&tmpl, NFT_SET_EXT_OBJREF); if (err < 0) goto err_parse_key_end; } if (nla[NFTA_SET_ELEM_DATA] != NULL) { err = nft_setelem_parse_data(ctx, set, &desc, &elem.data.val, nla[NFTA_SET_ELEM_DATA]); if (err < 0) goto err_parse_key_end; dreg = nft_type_to_reg(set->dtype); list_for_each_entry(binding, &set->bindings, list) { struct nft_ctx bind_ctx = { .net = ctx->net, .family = ctx->family, .table = ctx->table, .chain = (struct nft_chain *)binding->chain, }; if (!(binding->flags & NFT_SET_MAP)) continue; err = nft_validate_register_store(&bind_ctx, dreg, &elem.data.val, desc.type, desc.len); if (err < 0) goto err_parse_data; if (desc.type == NFT_DATA_VERDICT && (elem.data.val.verdict.code == NFT_GOTO || elem.data.val.verdict.code == NFT_JUMP)) nft_validate_state_update(ctx->table, NFT_VALIDATE_NEED); } err = nft_set_ext_add_length(&tmpl, NFT_SET_EXT_DATA, desc.len); if (err < 0) goto err_parse_data; } /* The full maximum length of userdata can exceed the maximum * offset value (U8_MAX) for following extensions, therefor it * must be the last extension added. 
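 *
 * Example -- a simplified, self-contained sketch (the demo_* names are
 * illustrative, not the kernel layout): extension offsets are stored in
 * u8 slots, so once a large extension such as userdata has been added,
 * no further extension can be placed behind it.
 */
#if 0	/* illustrative userspace sketch, not built as part of this file */
#include <stdint.h>
#include <stdio.h>

#define DEMO_EXT_MAX	4

struct demo_tmpl {
	uint16_t len;				/* running total of the element area */
	uint8_t offset[DEMO_EXT_MAX];		/* where each extension starts */
};

static int demo_tmpl_add(struct demo_tmpl *t, unsigned int id, unsigned int len)
{
	if (t->len > UINT8_MAX)			/* the next offset would not fit */
		return -1;
	t->offset[id] = t->len;
	t->len += len;
	return 0;
}

int main(void)
{
	struct demo_tmpl t = { 0 };

	demo_tmpl_add(&t, 0, 16);			/* key */
	demo_tmpl_add(&t, 1, 16);			/* data */
	demo_tmpl_add(&t, 2, 256);			/* userdata, may pass offset 255 */
	printf("%d\n", demo_tmpl_add(&t, 3, 8));	/* -1: nothing fits after it */
	return 0;
}
#endif
/*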
*/ ulen = 0; if (nla[NFTA_SET_ELEM_USERDATA] != NULL) { ulen = nla_len(nla[NFTA_SET_ELEM_USERDATA]); if (ulen > 0) { err = nft_set_ext_add_length(&tmpl, NFT_SET_EXT_USERDATA, ulen); if (err < 0) goto err_parse_data; } } elem.priv = nft_set_elem_init(set, &tmpl, elem.key.val.data, elem.key_end.val.data, elem.data.val.data, timeout, expiration, GFP_KERNEL_ACCOUNT); if (IS_ERR(elem.priv)) { err = PTR_ERR(elem.priv); goto err_parse_data; } ext = nft_set_elem_ext(set, elem.priv); if (flags) *nft_set_ext_flags(ext) = flags; if (obj) *nft_set_ext_obj(ext) = obj; if (ulen > 0) { if (nft_set_ext_check(&tmpl, NFT_SET_EXT_USERDATA, ulen) < 0) { err = -EINVAL; goto err_elem_free; } udata = nft_set_ext_userdata(ext); udata->len = ulen - 1; nla_memcpy(&udata->data, nla[NFTA_SET_ELEM_USERDATA], ulen); } err = nft_set_elem_expr_setup(ctx, &tmpl, ext, expr_array, num_exprs); if (err < 0) goto err_elem_free; trans = nft_trans_elem_alloc(ctx, NFT_MSG_NEWSETELEM, set); if (trans == NULL) { err = -ENOMEM; goto err_elem_free; } ext->genmask = nft_genmask_cur(ctx->net); err = nft_setelem_insert(ctx->net, set, &elem, &elem_priv, flags); if (err) { if (err == -EEXIST) { ext2 = nft_set_elem_ext(set, elem_priv); if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA) ^ nft_set_ext_exists(ext2, NFT_SET_EXT_DATA) || nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF) ^ nft_set_ext_exists(ext2, NFT_SET_EXT_OBJREF)) goto err_element_clash; if ((nft_set_ext_exists(ext, NFT_SET_EXT_DATA) && nft_set_ext_exists(ext2, NFT_SET_EXT_DATA) && memcmp(nft_set_ext_data(ext), nft_set_ext_data(ext2), set->dlen) != 0) || (nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF) && nft_set_ext_exists(ext2, NFT_SET_EXT_OBJREF) && *nft_set_ext_obj(ext) != *nft_set_ext_obj(ext2))) goto err_element_clash; else if (!(nlmsg_flags & NLM_F_EXCL)) { err = 0; if (nft_set_ext_exists(ext2, NFT_SET_EXT_TIMEOUT)) { struct nft_elem_update update = { }; if (timeout != nft_set_ext_timeout(ext2)->timeout) { update.timeout = timeout; if (expiration == 0) expiration = timeout; update.flags |= NFT_TRANS_UPD_TIMEOUT; } if (expiration) { update.expiration = expiration; update.flags |= NFT_TRANS_UPD_EXPIRATION; } if (update.flags) { struct nft_trans_one_elem *ue; ue = &nft_trans_container_elem(trans)->elems[0]; ue->update = kmemdup(&update, sizeof(update), GFP_KERNEL); if (!ue->update) { err = -ENOMEM; goto err_element_clash; } ue->priv = elem_priv; nft_trans_commit_list_add_elem(ctx->net, trans, GFP_KERNEL); goto err_elem_free; } } } } else if (err == -ENOTEMPTY) { /* ENOTEMPTY reports overlapping between this element * and an existing one. 
*/ err = -EEXIST; } goto err_element_clash; } if (!(flags & NFT_SET_ELEM_CATCHALL)) { unsigned int max = nft_set_maxsize(set); if (!atomic_add_unless(&set->nelems, 1, max)) { err = -ENFILE; goto err_set_full; } } nft_trans_container_elem(trans)->elems[0].priv = elem.priv; nft_trans_commit_list_add_elem(ctx->net, trans, GFP_KERNEL); return 0; err_set_full: nft_setelem_remove(ctx->net, set, elem.priv); err_element_clash: kfree(trans); err_elem_free: nf_tables_set_elem_destroy(ctx, set, elem.priv); err_parse_data: if (nla[NFTA_SET_ELEM_DATA] != NULL) nft_data_release(&elem.data.val, desc.type); err_parse_key_end: if (obj) nft_use_dec_restore(&obj->use); nft_data_release(&elem.key_end.val, NFT_DATA_VALUE); err_parse_key: nft_data_release(&elem.key.val, NFT_DATA_VALUE); err_set_elem_expr: for (i = 0; i < num_exprs && expr_array[i]; i++) nft_expr_destroy(ctx, expr_array[i]); err_set_elem_expr_clone: return err; } static int nf_tables_newsetelem(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nla[]) { struct netlink_ext_ack *extack = info->extack; u8 genmask = nft_genmask_next(info->net); u8 family = info->nfmsg->nfgen_family; struct net *net = info->net; const struct nlattr *attr; struct nft_table *table; struct nft_set *set; struct nft_ctx ctx; int rem, err; if (nla[NFTA_SET_ELEM_LIST_ELEMENTS] == NULL) return -EINVAL; table = nft_table_lookup(net, nla[NFTA_SET_ELEM_LIST_TABLE], family, genmask, NETLINK_CB(skb).portid); if (IS_ERR(table)) { NL_SET_BAD_ATTR(extack, nla[NFTA_SET_ELEM_LIST_TABLE]); return PTR_ERR(table); } set = nft_set_lookup_global(net, table, nla[NFTA_SET_ELEM_LIST_SET], nla[NFTA_SET_ELEM_LIST_SET_ID], genmask); if (IS_ERR(set)) { NL_SET_BAD_ATTR(extack, nla[NFTA_SET_ELEM_LIST_SET]); return PTR_ERR(set); } if (!list_empty(&set->bindings) && (set->flags & (NFT_SET_CONSTANT | NFT_SET_ANONYMOUS))) return -EBUSY; nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla); nla_for_each_nested(attr, nla[NFTA_SET_ELEM_LIST_ELEMENTS], rem) { err = nft_add_set_elem(&ctx, set, attr, info->nlh->nlmsg_flags); if (err < 0) { NL_SET_BAD_ATTR(extack, attr); return err; } } if (table->validate_state == NFT_VALIDATE_DO) return nft_table_validate(net, table); return 0; } /** * nft_data_hold - hold a nft_data item * * @data: struct nft_data to release * @type: type of data * * Hold a nft_data item. NFT_DATA_VALUE types can be silently discarded, * NFT_DATA_VERDICT bumps the reference to chains in case of NFT_JUMP and * NFT_GOTO verdicts. This function must be called on active data objects * from the second phase of the commit protocol. 
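 *
 * Example -- a simplified, self-contained sketch (the demo_* names are
 * illustrative, not the kernel API): holding verdict data means bumping
 * the use counter of the chain a jump/goto points at, and hold/release
 * must stay balanced.
 */
#if 0	/* illustrative userspace sketch, not built as part of this file */
#include <stdio.h>

enum demo_verdict { DEMO_ACCEPT, DEMO_JUMP, DEMO_GOTO };

struct demo_chain { int use; };

struct demo_data {
	enum demo_verdict code;
	struct demo_chain *chain;		/* only meaningful for jump/goto */
};

static void demo_data_hold(const struct demo_data *d)
{
	if (d->code == DEMO_JUMP || d->code == DEMO_GOTO)
		d->chain->use++;
}

static void demo_data_release(const struct demo_data *d)
{
	if (d->code == DEMO_JUMP || d->code == DEMO_GOTO)
		d->chain->use--;
}

int main(void)
{
	struct demo_chain c = { .use = 1 };
	struct demo_data d = { .code = DEMO_JUMP, .chain = &c };

	demo_data_hold(&d);
	demo_data_release(&d);
	printf("%d\n", c.use);			/* 1: hold and release are balanced */
	return 0;
}
#endif
/*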
*/ void nft_data_hold(const struct nft_data *data, enum nft_data_types type) { struct nft_chain *chain; if (type == NFT_DATA_VERDICT) { switch (data->verdict.code) { case NFT_JUMP: case NFT_GOTO: chain = data->verdict.chain; nft_use_inc_restore(&chain->use); break; } } } static int nft_setelem_active_next(const struct net *net, const struct nft_set *set, struct nft_elem_priv *elem_priv) { const struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv); u8 genmask = nft_genmask_next(net); return nft_set_elem_active(ext, genmask); } static void nft_setelem_data_activate(const struct net *net, const struct nft_set *set, struct nft_elem_priv *elem_priv) { const struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv); if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA)) nft_data_hold(nft_set_ext_data(ext), set->dtype); if (nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF)) nft_use_inc_restore(&(*nft_set_ext_obj(ext))->use); } void nft_setelem_data_deactivate(const struct net *net, const struct nft_set *set, struct nft_elem_priv *elem_priv) { const struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv); if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA)) nft_data_release(nft_set_ext_data(ext), set->dtype); if (nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF)) nft_use_dec(&(*nft_set_ext_obj(ext))->use); } /* similar to nft_trans_elems_remove, but called from abort path to undo newsetelem. * No notifications and no ndeact changes. * * Returns true if set had been added to (i.e., elements need to be removed again). */ static bool nft_trans_elems_new_abort(const struct nft_ctx *ctx, struct nft_trans_elem *te) { bool removed = false; int i; for (i = 0; i < te->nelems; i++) { if (te->elems[i].update) { kfree(te->elems[i].update); te->elems[i].update = NULL; /* Update request, so do not release this element */ te->elems[i].priv = NULL; continue; } if (!te->set->ops->abort || nft_setelem_is_catchall(te->set, te->elems[i].priv)) nft_setelem_remove(ctx->net, te->set, te->elems[i].priv); if (!nft_setelem_is_catchall(te->set, te->elems[i].priv)) atomic_dec(&te->set->nelems); removed = true; } return removed; } /* Called from abort path to undo DELSETELEM/DESTROYSETELEM. 
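 *
 * Example -- a simplified, self-contained sketch (the demo_* names and the
 * plain "active bit per generation" model are illustrative, not the exact
 * kernel encoding): a pending delete only hides the element from the next
 * generation, so aborting the transaction just makes it visible again.
 */
#if 0	/* illustrative userspace sketch, not built as part of this file */
#include <stdio.h>

#define DEMO_GEN_CUR	0x1			/* generation packets see right now */
#define DEMO_GEN_NEXT	0x2			/* generation being prepared */

struct demo_elem { unsigned int active; };	/* one activity bit per generation */

static void demo_deactivate(struct demo_elem *e)
{
	e->active &= ~DEMO_GEN_NEXT;		/* delete takes effect on commit */
}

static void demo_abort_undo(struct demo_elem *e)
{
	if (!(e->active & DEMO_GEN_NEXT))
		e->active |= DEMO_GEN_NEXT;	/* the delete never happened */
}

int main(void)
{
	struct demo_elem e = { .active = DEMO_GEN_CUR | DEMO_GEN_NEXT };

	demo_deactivate(&e);
	printf("%d\n", !!(e.active & DEMO_GEN_NEXT));	/* 0: gone in the next gen */
	demo_abort_undo(&e);
	printf("%d\n", !!(e.active & DEMO_GEN_NEXT));	/* 1: visible again */
	return 0;
}
#endif
/*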
*/ static void nft_trans_elems_destroy_abort(const struct nft_ctx *ctx, const struct nft_trans_elem *te) { int i; for (i = 0; i < te->nelems; i++) { if (!nft_setelem_active_next(ctx->net, te->set, te->elems[i].priv)) { nft_setelem_data_activate(ctx->net, te->set, te->elems[i].priv); nft_setelem_activate(ctx->net, te->set, te->elems[i].priv); } if (!nft_setelem_is_catchall(te->set, te->elems[i].priv)) te->set->ndeact--; } } static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set, const struct nlattr *attr) { struct nlattr *nla[NFTA_SET_ELEM_MAX + 1]; struct nft_set_ext_tmpl tmpl; struct nft_set_elem elem; struct nft_set_ext *ext; struct nft_trans *trans; u32 flags = 0; int err; err = nla_parse_nested_deprecated(nla, NFTA_SET_ELEM_MAX, attr, nft_set_elem_policy, NULL); if (err < 0) return err; err = nft_setelem_parse_flags(set, nla[NFTA_SET_ELEM_FLAGS], &flags); if (err < 0) return err; if (!nla[NFTA_SET_ELEM_KEY] && !(flags & NFT_SET_ELEM_CATCHALL)) return -EINVAL; if (!nft_setelem_valid_key_end(set, nla, flags)) return -EINVAL; nft_set_ext_prepare(&tmpl); if (flags != 0) { err = nft_set_ext_add(&tmpl, NFT_SET_EXT_FLAGS); if (err < 0) return err; } if (nla[NFTA_SET_ELEM_KEY]) { err = nft_setelem_parse_key(ctx, set, &elem.key.val, nla[NFTA_SET_ELEM_KEY]); if (err < 0) return err; err = nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY, set->klen); if (err < 0) goto fail_elem; } if (nla[NFTA_SET_ELEM_KEY_END]) { err = nft_setelem_parse_key(ctx, set, &elem.key_end.val, nla[NFTA_SET_ELEM_KEY_END]); if (err < 0) goto fail_elem; err = nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY_END, set->klen); if (err < 0) goto fail_elem_key_end; } err = -ENOMEM; elem.priv = nft_set_elem_init(set, &tmpl, elem.key.val.data, elem.key_end.val.data, NULL, 0, 0, GFP_KERNEL_ACCOUNT); if (IS_ERR(elem.priv)) { err = PTR_ERR(elem.priv); goto fail_elem_key_end; } ext = nft_set_elem_ext(set, elem.priv); if (flags) *nft_set_ext_flags(ext) = flags; trans = nft_trans_elem_alloc(ctx, NFT_MSG_DELSETELEM, set); if (trans == NULL) goto fail_trans; err = nft_setelem_deactivate(ctx->net, set, &elem, flags); if (err < 0) goto fail_ops; nft_setelem_data_deactivate(ctx->net, set, elem.priv); nft_trans_container_elem(trans)->elems[0].priv = elem.priv; nft_trans_commit_list_add_elem(ctx->net, trans, GFP_KERNEL); return 0; fail_ops: kfree(trans); fail_trans: kfree(elem.priv); fail_elem_key_end: nft_data_release(&elem.key_end.val, NFT_DATA_VALUE); fail_elem: nft_data_release(&elem.key.val, NFT_DATA_VALUE); return err; } static int nft_setelem_flush(const struct nft_ctx *ctx, struct nft_set *set, const struct nft_set_iter *iter, struct nft_elem_priv *elem_priv) { const struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv); struct nft_trans *trans; if (!nft_set_elem_active(ext, iter->genmask)) return 0; trans = nft_trans_alloc_gfp(ctx, NFT_MSG_DELSETELEM, struct_size_t(struct nft_trans_elem, elems, 1), GFP_ATOMIC); if (!trans) return -ENOMEM; set->ops->flush(ctx->net, set, elem_priv); set->ndeact++; nft_setelem_data_deactivate(ctx->net, set, elem_priv); nft_trans_elem_set(trans) = set; nft_trans_container_elem(trans)->nelems = 1; nft_trans_container_elem(trans)->elems[0].priv = elem_priv; nft_trans_commit_list_add_elem(ctx->net, trans, GFP_ATOMIC); return 0; } static int __nft_set_catchall_flush(const struct nft_ctx *ctx, struct nft_set *set, struct nft_elem_priv *elem_priv) { struct nft_trans *trans; trans = nft_trans_elem_alloc(ctx, NFT_MSG_DELSETELEM, set); if (!trans) return -ENOMEM; nft_setelem_data_deactivate(ctx->net, 
set, elem_priv); nft_trans_container_elem(trans)->elems[0].priv = elem_priv; nft_trans_commit_list_add_elem(ctx->net, trans, GFP_KERNEL); return 0; } static int nft_set_catchall_flush(const struct nft_ctx *ctx, struct nft_set *set) { u8 genmask = nft_genmask_next(ctx->net); struct nft_set_elem_catchall *catchall; struct nft_set_ext *ext; int ret = 0; list_for_each_entry_rcu(catchall, &set->catchall_list, list, lockdep_commit_lock_is_held(ctx->net)) { ext = nft_set_elem_ext(set, catchall->elem); if (!nft_set_elem_active(ext, genmask)) continue; ret = __nft_set_catchall_flush(ctx, set, catchall->elem); if (ret < 0) break; nft_set_elem_change_active(ctx->net, set, ext); } return ret; } static int nft_set_flush(struct nft_ctx *ctx, struct nft_set *set, u8 genmask) { struct nft_set_iter iter = { .genmask = genmask, .type = NFT_ITER_UPDATE, .fn = nft_setelem_flush, }; set->ops->walk(ctx, set, &iter); if (!iter.err) iter.err = nft_set_catchall_flush(ctx, set); return iter.err; } static int nf_tables_delsetelem(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nla[]) { struct netlink_ext_ack *extack = info->extack; u8 genmask = nft_genmask_next(info->net); u8 family = info->nfmsg->nfgen_family; struct net *net = info->net; const struct nlattr *attr; struct nft_table *table; struct nft_set *set; struct nft_ctx ctx; int rem, err = 0; table = nft_table_lookup(net, nla[NFTA_SET_ELEM_LIST_TABLE], family, genmask, NETLINK_CB(skb).portid); if (IS_ERR(table)) { NL_SET_BAD_ATTR(extack, nla[NFTA_SET_ELEM_LIST_TABLE]); return PTR_ERR(table); } set = nft_set_lookup(net, table, nla[NFTA_SET_ELEM_LIST_SET], genmask); if (IS_ERR(set)) { NL_SET_BAD_ATTR(extack, nla[NFTA_SET_ELEM_LIST_SET]); return PTR_ERR(set); } if (nft_set_is_anonymous(set)) return -EOPNOTSUPP; if (!list_empty(&set->bindings) && (set->flags & NFT_SET_CONSTANT)) return -EBUSY; nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla); if (!nla[NFTA_SET_ELEM_LIST_ELEMENTS]) return nft_set_flush(&ctx, set, genmask); nla_for_each_nested(attr, nla[NFTA_SET_ELEM_LIST_ELEMENTS], rem) { err = nft_del_setelem(&ctx, set, attr); if (err == -ENOENT && NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_DESTROYSETELEM) continue; if (err < 0) { NL_SET_BAD_ATTR(extack, attr); return err; } } return 0; } /* * Stateful objects */ /** * nft_register_obj- register nf_tables stateful object type * @obj_type: object type * * Registers the object type for use with nf_tables. Returns zero on * success or a negative errno code otherwise. */ int nft_register_obj(struct nft_object_type *obj_type) { if (obj_type->type == NFT_OBJECT_UNSPEC) return -EINVAL; nfnl_lock(NFNL_SUBSYS_NFTABLES); list_add_rcu(&obj_type->list, &nf_tables_objects); nfnl_unlock(NFNL_SUBSYS_NFTABLES); return 0; } EXPORT_SYMBOL_GPL(nft_register_obj); /** * nft_unregister_obj - unregister nf_tables object type * @obj_type: object type * * Unregisters the object type for use with nf_tables. 
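 *
 * Example -- a simplified, self-contained sketch (the demo_* names are
 * illustrative, not the kernel API): a type registry is a plain list that
 * register/unregister edit and that lookups walk, treating an unspecified
 * family as a wildcard.
 */
#if 0	/* illustrative userspace sketch, not built as part of this file */
#include <stddef.h>
#include <stdio.h>

#define DEMO_FAMILY_UNSPEC	0		/* matches any family */

struct demo_obj_type {
	unsigned int type;
	unsigned int family;
	struct demo_obj_type *next;
};

static struct demo_obj_type *demo_types;	/* registry head */

static void demo_register_obj(struct demo_obj_type *t)
{
	t->next = demo_types;
	demo_types = t;
}

static void demo_unregister_obj(struct demo_obj_type *t)
{
	struct demo_obj_type **p;

	for (p = &demo_types; *p; p = &(*p)->next) {
		if (*p == t) {
			*p = t->next;
			break;
		}
	}
}

static struct demo_obj_type *demo_obj_type_get(unsigned int type, unsigned int family)
{
	struct demo_obj_type *t;

	for (t = demo_types; t; t = t->next) {
		if (t->family != DEMO_FAMILY_UNSPEC && t->family != family)
			continue;
		if (t->type == type)
			return t;
	}
	return NULL;
}

int main(void)
{
	struct demo_obj_type counter = { .type = 1, .family = DEMO_FAMILY_UNSPEC };

	demo_register_obj(&counter);
	printf("%d\n", demo_obj_type_get(1, 2) != NULL);	/* 1: found */
	demo_unregister_obj(&counter);
	printf("%d\n", demo_obj_type_get(1, 2) != NULL);	/* 0: gone */
	return 0;
}
#endif
/*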
*/ void nft_unregister_obj(struct nft_object_type *obj_type) { nfnl_lock(NFNL_SUBSYS_NFTABLES); list_del_rcu(&obj_type->list); nfnl_unlock(NFNL_SUBSYS_NFTABLES); } EXPORT_SYMBOL_GPL(nft_unregister_obj); struct nft_object *nft_obj_lookup(const struct net *net, const struct nft_table *table, const struct nlattr *nla, u32 objtype, u8 genmask) { struct nft_object_hash_key k = { .table = table }; char search[NFT_OBJ_MAXNAMELEN]; struct rhlist_head *tmp, *list; struct nft_object *obj; nla_strscpy(search, nla, sizeof(search)); k.name = search; WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_commit_lock_is_held(net)); rcu_read_lock(); list = rhltable_lookup(&nft_objname_ht, &k, nft_objname_ht_params); if (!list) goto out; rhl_for_each_entry_rcu(obj, tmp, list, rhlhead) { if (objtype == obj->ops->type->type && nft_active_genmask(obj, genmask)) { rcu_read_unlock(); return obj; } } out: rcu_read_unlock(); return ERR_PTR(-ENOENT); } EXPORT_SYMBOL_GPL(nft_obj_lookup); static struct nft_object *nft_obj_lookup_byhandle(const struct nft_table *table, const struct nlattr *nla, u32 objtype, u8 genmask) { struct nft_object *obj; list_for_each_entry(obj, &table->objects, list) { if (be64_to_cpu(nla_get_be64(nla)) == obj->handle && objtype == obj->ops->type->type && nft_active_genmask(obj, genmask)) return obj; } return ERR_PTR(-ENOENT); } static const struct nla_policy nft_obj_policy[NFTA_OBJ_MAX + 1] = { [NFTA_OBJ_TABLE] = { .type = NLA_STRING, .len = NFT_TABLE_MAXNAMELEN - 1 }, [NFTA_OBJ_NAME] = { .type = NLA_STRING, .len = NFT_OBJ_MAXNAMELEN - 1 }, [NFTA_OBJ_TYPE] = { .type = NLA_U32 }, [NFTA_OBJ_DATA] = { .type = NLA_NESTED }, [NFTA_OBJ_HANDLE] = { .type = NLA_U64}, [NFTA_OBJ_USERDATA] = { .type = NLA_BINARY, .len = NFT_USERDATA_MAXLEN }, }; static struct nft_object *nft_obj_init(const struct nft_ctx *ctx, const struct nft_object_type *type, const struct nlattr *attr) { struct nlattr **tb; const struct nft_object_ops *ops; struct nft_object *obj; int err = -ENOMEM; tb = kmalloc_array(type->maxattr + 1, sizeof(*tb), GFP_KERNEL); if (!tb) goto err1; if (attr) { err = nla_parse_nested_deprecated(tb, type->maxattr, attr, type->policy, NULL); if (err < 0) goto err2; } else { memset(tb, 0, sizeof(tb[0]) * (type->maxattr + 1)); } if (type->select_ops) { ops = type->select_ops(ctx, (const struct nlattr * const *)tb); if (IS_ERR(ops)) { err = PTR_ERR(ops); goto err2; } } else { ops = type->ops; } err = -ENOMEM; obj = kzalloc(sizeof(*obj) + ops->size, GFP_KERNEL_ACCOUNT); if (!obj) goto err2; err = ops->init(ctx, (const struct nlattr * const *)tb, obj); if (err < 0) goto err3; obj->ops = ops; kfree(tb); return obj; err3: kfree(obj); err2: kfree(tb); err1: return ERR_PTR(err); } static int nft_object_dump(struct sk_buff *skb, unsigned int attr, struct nft_object *obj, bool reset) { struct nlattr *nest; nest = nla_nest_start_noflag(skb, attr); if (!nest) goto nla_put_failure; if (obj->ops->dump(skb, obj, reset) < 0) goto nla_put_failure; nla_nest_end(skb, nest); return 0; nla_put_failure: return -1; } static const struct nft_object_type *__nft_obj_type_get(u32 objtype, u8 family) { const struct nft_object_type *type; list_for_each_entry_rcu(type, &nf_tables_objects, list) { if (type->family != NFPROTO_UNSPEC && type->family != family) continue; if (objtype == type->type) return type; } return NULL; } static const struct nft_object_type * nft_obj_type_get(struct net *net, u32 objtype, u8 family) { const struct nft_object_type *type; rcu_read_lock(); type = __nft_obj_type_get(objtype, family); if (type != NULL && 
try_module_get(type->owner)) { rcu_read_unlock(); return type; } rcu_read_unlock(); lockdep_nfnl_nft_mutex_not_held(); #ifdef CONFIG_MODULES if (type == NULL) { if (nft_request_module(net, "nft-obj-%u", objtype) == -EAGAIN) return ERR_PTR(-EAGAIN); } #endif return ERR_PTR(-ENOENT); } static int nf_tables_updobj(const struct nft_ctx *ctx, const struct nft_object_type *type, const struct nlattr *attr, struct nft_object *obj) { struct nft_object *newobj; struct nft_trans *trans; int err = -ENOMEM; /* caller must have obtained type->owner reference. */ trans = nft_trans_alloc(ctx, NFT_MSG_NEWOBJ, sizeof(struct nft_trans_obj)); if (!trans) goto err_trans; newobj = nft_obj_init(ctx, type, attr); if (IS_ERR(newobj)) { err = PTR_ERR(newobj); goto err_free_trans; } nft_trans_obj(trans) = obj; nft_trans_obj_update(trans) = true; nft_trans_obj_newobj(trans) = newobj; nft_trans_commit_list_add_tail(ctx->net, trans); return 0; err_free_trans: kfree(trans); err_trans: module_put(type->owner); return err; } static int nf_tables_newobj(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nla[]) { struct netlink_ext_ack *extack = info->extack; u8 genmask = nft_genmask_next(info->net); u8 family = info->nfmsg->nfgen_family; const struct nft_object_type *type; struct net *net = info->net; struct nft_table *table; struct nft_object *obj; struct nft_ctx ctx; u32 objtype; int err; if (!nla[NFTA_OBJ_TYPE] || !nla[NFTA_OBJ_NAME] || !nla[NFTA_OBJ_DATA]) return -EINVAL; table = nft_table_lookup(net, nla[NFTA_OBJ_TABLE], family, genmask, NETLINK_CB(skb).portid); if (IS_ERR(table)) { NL_SET_BAD_ATTR(extack, nla[NFTA_OBJ_TABLE]); return PTR_ERR(table); } objtype = ntohl(nla_get_be32(nla[NFTA_OBJ_TYPE])); obj = nft_obj_lookup(net, table, nla[NFTA_OBJ_NAME], objtype, genmask); if (IS_ERR(obj)) { err = PTR_ERR(obj); if (err != -ENOENT) { NL_SET_BAD_ATTR(extack, nla[NFTA_OBJ_NAME]); return err; } } else { if (info->nlh->nlmsg_flags & NLM_F_EXCL) { NL_SET_BAD_ATTR(extack, nla[NFTA_OBJ_NAME]); return -EEXIST; } if (info->nlh->nlmsg_flags & NLM_F_REPLACE) return -EOPNOTSUPP; if (!obj->ops->update) return 0; type = nft_obj_type_get(net, objtype, family); if (WARN_ON_ONCE(IS_ERR(type))) return PTR_ERR(type); nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla); /* type->owner reference is put when transaction object is released. 
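 *
 * Example -- a simplified, self-contained sketch (the demo_* names are
 * illustrative, not the kernel API): the reference taken by the getter is
 * either handed over to the queued transaction, which drops it when it is
 * released, or dropped immediately on the error path.
 */
#if 0	/* illustrative userspace sketch, not built as part of this file */
#include <stdbool.h>
#include <stdio.h>

static int demo_module_refs;			/* stand-in for module get/put */

static void demo_module_get(void) { demo_module_refs++; }
static void demo_module_put(void) { demo_module_refs--; }

struct demo_trans { bool owns_ref; };		/* queued update owns one reference */

static int demo_update(struct demo_trans *t, bool fail)
{
	demo_module_get();			/* reference taken up front */
	if (fail) {
		demo_module_put();		/* error path must drop it ... */
		return -1;
	}
	t->owns_ref = true;			/* ... otherwise ownership moves on */
	return 0;
}

static void demo_trans_release(struct demo_trans *t)
{
	if (t->owns_ref)
		demo_module_put();		/* dropped when the transaction dies */
}

int main(void)
{
	struct demo_trans t = { false };

	demo_update(&t, false);
	demo_trans_release(&t);
	printf("%d\n", demo_module_refs);	/* 0: every path stays balanced */
	return 0;
}
#endif
/*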
*/ return nf_tables_updobj(&ctx, type, nla[NFTA_OBJ_DATA], obj); } nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla); if (!nft_use_inc(&table->use)) return -EMFILE; type = nft_obj_type_get(net, objtype, family); if (IS_ERR(type)) { err = PTR_ERR(type); goto err_type; } obj = nft_obj_init(&ctx, type, nla[NFTA_OBJ_DATA]); if (IS_ERR(obj)) { err = PTR_ERR(obj); goto err_init; } obj->key.table = table; obj->handle = nf_tables_alloc_handle(table); obj->key.name = nla_strdup(nla[NFTA_OBJ_NAME], GFP_KERNEL_ACCOUNT); if (!obj->key.name) { err = -ENOMEM; goto err_strdup; } if (nla[NFTA_OBJ_USERDATA]) { obj->udata = nla_memdup(nla[NFTA_OBJ_USERDATA], GFP_KERNEL_ACCOUNT); if (obj->udata == NULL) goto err_userdata; obj->udlen = nla_len(nla[NFTA_OBJ_USERDATA]); } err = nft_trans_obj_add(&ctx, NFT_MSG_NEWOBJ, obj); if (err < 0) goto err_trans; err = rhltable_insert(&nft_objname_ht, &obj->rhlhead, nft_objname_ht_params); if (err < 0) goto err_obj_ht; list_add_tail_rcu(&obj->list, &table->objects); return 0; err_obj_ht: /* queued in transaction log */ INIT_LIST_HEAD(&obj->list); return err; err_trans: kfree(obj->udata); err_userdata: kfree(obj->key.name); err_strdup: if (obj->ops->destroy) obj->ops->destroy(&ctx, obj); kfree(obj); err_init: module_put(type->owner); err_type: nft_use_dec_restore(&table->use); return err; } static int nf_tables_fill_obj_info(struct sk_buff *skb, struct net *net, u32 portid, u32 seq, int event, u32 flags, int family, const struct nft_table *table, struct nft_object *obj, bool reset) { struct nlmsghdr *nlh; event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event); nlh = nfnl_msg_put(skb, portid, seq, event, flags, family, NFNETLINK_V0, nft_base_seq(net)); if (!nlh) goto nla_put_failure; if (nla_put_string(skb, NFTA_OBJ_TABLE, table->name) || nla_put_string(skb, NFTA_OBJ_NAME, obj->key.name) || nla_put_be64(skb, NFTA_OBJ_HANDLE, cpu_to_be64(obj->handle), NFTA_OBJ_PAD)) goto nla_put_failure; if (event == NFT_MSG_DELOBJ) { nlmsg_end(skb, nlh); return 0; } if (nla_put_be32(skb, NFTA_OBJ_TYPE, htonl(obj->ops->type->type)) || nla_put_be32(skb, NFTA_OBJ_USE, htonl(obj->use)) || nft_object_dump(skb, NFTA_OBJ_DATA, obj, reset)) goto nla_put_failure; if (obj->udata && nla_put(skb, NFTA_OBJ_USERDATA, obj->udlen, obj->udata)) goto nla_put_failure; nlmsg_end(skb, nlh); return 0; nla_put_failure: nlmsg_trim(skb, nlh); return -1; } static void audit_log_obj_reset(const struct nft_table *table, unsigned int base_seq, unsigned int nentries) { char *buf = kasprintf(GFP_ATOMIC, "%s:%u", table->name, base_seq); audit_log_nfcfg(buf, table->family, nentries, AUDIT_NFT_OP_OBJ_RESET, GFP_ATOMIC); kfree(buf); } struct nft_obj_dump_ctx { unsigned int s_idx; char *table; u32 type; bool reset; }; static int nf_tables_dump_obj(struct sk_buff *skb, struct netlink_callback *cb) { const struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh); struct nft_obj_dump_ctx *ctx = (void *)cb->ctx; struct net *net = sock_net(skb->sk); int family = nfmsg->nfgen_family; struct nftables_pernet *nft_net; const struct nft_table *table; unsigned int entries = 0; struct nft_object *obj; unsigned int idx = 0; int rc = 0; rcu_read_lock(); nft_net = nft_pernet(net); cb->seq = READ_ONCE(nft_net->base_seq); list_for_each_entry_rcu(table, &nft_net->tables, list) { if (family != NFPROTO_UNSPEC && family != table->family) continue; entries = 0; list_for_each_entry_rcu(obj, &table->objects, list) { if (!nft_is_active(net, obj)) goto cont; if (idx < ctx->s_idx) goto cont; if (ctx->table && strcmp(ctx->table, table->name)) goto cont; 
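	/*
	 * Example -- a simplified, self-contained sketch (the demo_* names are
	 * illustrative, not the netlink API): a dump that does not fit into one
	 * buffer remembers how far it got (here *cursor, above ctx->s_idx) and
	 * skips that many entries when it is called again.
	 */
#if 0	/* illustrative userspace sketch, not built as part of this file */
#include <stdio.h>

#define DEMO_NENTRIES	10
#define DEMO_PER_PASS	4			/* pretend only 4 records fit */

static int demo_dump(const int *entries, int n, int *cursor)
{
	int emitted = 0, idx;

	for (idx = 0; idx < n; idx++) {
		if (idx < *cursor)		/* sent on a previous pass */
			continue;
		if (emitted == DEMO_PER_PASS)	/* "skb" is full, stop here */
			break;
		printf("entry %d\n", entries[idx]);
		emitted++;
	}
	*cursor = idx;				/* resume point for the next pass */
	return emitted;
}

int main(void)
{
	int entries[DEMO_NENTRIES] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 };
	int cursor = 0;

	while (demo_dump(entries, DEMO_NENTRIES, &cursor) > 0)
		;				/* three passes: 4 + 4 + 2 entries */
	return 0;
}
#endif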
if (ctx->type != NFT_OBJECT_UNSPEC && obj->ops->type->type != ctx->type) goto cont; rc = nf_tables_fill_obj_info(skb, net, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NFT_MSG_NEWOBJ, NLM_F_MULTI | NLM_F_APPEND, table->family, table, obj, ctx->reset); if (rc < 0) break; entries++; nl_dump_check_consistent(cb, nlmsg_hdr(skb)); cont: idx++; } if (ctx->reset && entries) audit_log_obj_reset(table, nft_net->base_seq, entries); if (rc < 0) break; } rcu_read_unlock(); ctx->s_idx = idx; return skb->len; } static int nf_tables_dumpreset_obj(struct sk_buff *skb, struct netlink_callback *cb) { struct nftables_pernet *nft_net = nft_pernet(sock_net(skb->sk)); int ret; mutex_lock(&nft_net->commit_mutex); ret = nf_tables_dump_obj(skb, cb); mutex_unlock(&nft_net->commit_mutex); return ret; } static int nf_tables_dump_obj_start(struct netlink_callback *cb) { struct nft_obj_dump_ctx *ctx = (void *)cb->ctx; const struct nlattr * const *nla = cb->data; BUILD_BUG_ON(sizeof(*ctx) > sizeof(cb->ctx)); if (nla[NFTA_OBJ_TABLE]) { ctx->table = nla_strdup(nla[NFTA_OBJ_TABLE], GFP_ATOMIC); if (!ctx->table) return -ENOMEM; } if (nla[NFTA_OBJ_TYPE]) ctx->type = ntohl(nla_get_be32(nla[NFTA_OBJ_TYPE])); return 0; } static int nf_tables_dumpreset_obj_start(struct netlink_callback *cb) { struct nft_obj_dump_ctx *ctx = (void *)cb->ctx; ctx->reset = true; return nf_tables_dump_obj_start(cb); } static int nf_tables_dump_obj_done(struct netlink_callback *cb) { struct nft_obj_dump_ctx *ctx = (void *)cb->ctx; kfree(ctx->table); return 0; } /* Caller must hold rcu read lock or transaction mutex */ static struct sk_buff * nf_tables_getobj_single(u32 portid, const struct nfnl_info *info, const struct nlattr * const nla[], bool reset) { struct netlink_ext_ack *extack = info->extack; u8 genmask = nft_genmask_cur(info->net); u8 family = info->nfmsg->nfgen_family; const struct nft_table *table; struct net *net = info->net; struct nft_object *obj; struct sk_buff *skb2; u32 objtype; int err; if (!nla[NFTA_OBJ_NAME] || !nla[NFTA_OBJ_TYPE]) return ERR_PTR(-EINVAL); table = nft_table_lookup(net, nla[NFTA_OBJ_TABLE], family, genmask, 0); if (IS_ERR(table)) { NL_SET_BAD_ATTR(extack, nla[NFTA_OBJ_TABLE]); return ERR_CAST(table); } objtype = ntohl(nla_get_be32(nla[NFTA_OBJ_TYPE])); obj = nft_obj_lookup(net, table, nla[NFTA_OBJ_NAME], objtype, genmask); if (IS_ERR(obj)) { NL_SET_BAD_ATTR(extack, nla[NFTA_OBJ_NAME]); return ERR_CAST(obj); } skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC); if (!skb2) return ERR_PTR(-ENOMEM); err = nf_tables_fill_obj_info(skb2, net, portid, info->nlh->nlmsg_seq, NFT_MSG_NEWOBJ, 0, family, table, obj, reset); if (err < 0) { kfree_skb(skb2); return ERR_PTR(err); } return skb2; } static int nf_tables_getobj(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nla[]) { u32 portid = NETLINK_CB(skb).portid; struct sk_buff *skb2; if (info->nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { .start = nf_tables_dump_obj_start, .dump = nf_tables_dump_obj, .done = nf_tables_dump_obj_done, .module = THIS_MODULE, .data = (void *)nla, }; return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c); } skb2 = nf_tables_getobj_single(portid, info, nla, false); if (IS_ERR(skb2)) return PTR_ERR(skb2); return nfnetlink_unicast(skb2, info->net, portid); } static int nf_tables_getobj_reset(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nla[]) { struct nftables_pernet *nft_net = nft_pernet(info->net); u32 portid = NETLINK_CB(skb).portid; struct net *net = 
info->net; struct sk_buff *skb2; char *buf; if (info->nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { .start = nf_tables_dumpreset_obj_start, .dump = nf_tables_dumpreset_obj, .done = nf_tables_dump_obj_done, .module = THIS_MODULE, .data = (void *)nla, }; return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c); } if (!try_module_get(THIS_MODULE)) return -EINVAL; rcu_read_unlock(); mutex_lock(&nft_net->commit_mutex); skb2 = nf_tables_getobj_single(portid, info, nla, true); mutex_unlock(&nft_net->commit_mutex); rcu_read_lock(); module_put(THIS_MODULE); if (IS_ERR(skb2)) return PTR_ERR(skb2); buf = kasprintf(GFP_ATOMIC, "%.*s:%u", nla_len(nla[NFTA_OBJ_TABLE]), (char *)nla_data(nla[NFTA_OBJ_TABLE]), nft_net->base_seq); audit_log_nfcfg(buf, info->nfmsg->nfgen_family, 1, AUDIT_NFT_OP_OBJ_RESET, GFP_ATOMIC); kfree(buf); return nfnetlink_unicast(skb2, net, portid); } static void nft_obj_destroy(const struct nft_ctx *ctx, struct nft_object *obj) { if (obj->ops->destroy) obj->ops->destroy(ctx, obj); module_put(obj->ops->type->owner); kfree(obj->key.name); kfree(obj->udata); kfree(obj); } static int nf_tables_delobj(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nla[]) { struct netlink_ext_ack *extack = info->extack; u8 genmask = nft_genmask_next(info->net); u8 family = info->nfmsg->nfgen_family; struct net *net = info->net; const struct nlattr *attr; struct nft_table *table; struct nft_object *obj; struct nft_ctx ctx; u32 objtype; if (!nla[NFTA_OBJ_TYPE] || (!nla[NFTA_OBJ_NAME] && !nla[NFTA_OBJ_HANDLE])) return -EINVAL; table = nft_table_lookup(net, nla[NFTA_OBJ_TABLE], family, genmask, NETLINK_CB(skb).portid); if (IS_ERR(table)) { NL_SET_BAD_ATTR(extack, nla[NFTA_OBJ_TABLE]); return PTR_ERR(table); } objtype = ntohl(nla_get_be32(nla[NFTA_OBJ_TYPE])); if (nla[NFTA_OBJ_HANDLE]) { attr = nla[NFTA_OBJ_HANDLE]; obj = nft_obj_lookup_byhandle(table, attr, objtype, genmask); } else { attr = nla[NFTA_OBJ_NAME]; obj = nft_obj_lookup(net, table, attr, objtype, genmask); } if (IS_ERR(obj)) { if (PTR_ERR(obj) == -ENOENT && NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_DESTROYOBJ) return 0; NL_SET_BAD_ATTR(extack, attr); return PTR_ERR(obj); } if (obj->use > 0) { NL_SET_BAD_ATTR(extack, attr); return -EBUSY; } nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla); return nft_delobj(&ctx, obj); } static void __nft_obj_notify(struct net *net, const struct nft_table *table, struct nft_object *obj, u32 portid, u32 seq, int event, u16 flags, int family, int report, gfp_t gfp) { struct nftables_pernet *nft_net = nft_pernet(net); struct sk_buff *skb; int err; if (!report && !nfnetlink_has_listeners(net, NFNLGRP_NFTABLES)) return; skb = nlmsg_new(NLMSG_GOODSIZE, gfp); if (skb == NULL) goto err; err = nf_tables_fill_obj_info(skb, net, portid, seq, event, flags & (NLM_F_CREATE | NLM_F_EXCL), family, table, obj, false); if (err < 0) { kfree_skb(skb); goto err; } nft_notify_enqueue(skb, report, &nft_net->notify_list); return; err: nfnetlink_set_err(net, portid, NFNLGRP_NFTABLES, -ENOBUFS); } void nft_obj_notify(struct net *net, const struct nft_table *table, struct nft_object *obj, u32 portid, u32 seq, int event, u16 flags, int family, int report, gfp_t gfp) { struct nftables_pernet *nft_net = nft_pernet(net); char *buf = kasprintf(gfp, "%s:%u", table->name, nft_net->base_seq); audit_log_nfcfg(buf, family, obj->handle, event == NFT_MSG_NEWOBJ ? 
AUDIT_NFT_OP_OBJ_REGISTER : AUDIT_NFT_OP_OBJ_UNREGISTER, gfp); kfree(buf); __nft_obj_notify(net, table, obj, portid, seq, event, flags, family, report, gfp); } EXPORT_SYMBOL_GPL(nft_obj_notify); static void nf_tables_obj_notify(const struct nft_ctx *ctx, struct nft_object *obj, int event) { __nft_obj_notify(ctx->net, ctx->table, obj, ctx->portid, ctx->seq, event, ctx->flags, ctx->family, ctx->report, GFP_KERNEL); } /* * Flow tables */ void nft_register_flowtable_type(struct nf_flowtable_type *type) { nfnl_lock(NFNL_SUBSYS_NFTABLES); list_add_tail_rcu(&type->list, &nf_tables_flowtables); nfnl_unlock(NFNL_SUBSYS_NFTABLES); } EXPORT_SYMBOL_GPL(nft_register_flowtable_type); void nft_unregister_flowtable_type(struct nf_flowtable_type *type) { nfnl_lock(NFNL_SUBSYS_NFTABLES); list_del_rcu(&type->list); nfnl_unlock(NFNL_SUBSYS_NFTABLES); } EXPORT_SYMBOL_GPL(nft_unregister_flowtable_type); static const struct nla_policy nft_flowtable_policy[NFTA_FLOWTABLE_MAX + 1] = { [NFTA_FLOWTABLE_TABLE] = { .type = NLA_STRING, .len = NFT_NAME_MAXLEN - 1 }, [NFTA_FLOWTABLE_NAME] = { .type = NLA_STRING, .len = NFT_NAME_MAXLEN - 1 }, [NFTA_FLOWTABLE_HOOK] = { .type = NLA_NESTED }, [NFTA_FLOWTABLE_HANDLE] = { .type = NLA_U64 }, [NFTA_FLOWTABLE_FLAGS] = { .type = NLA_U32 }, }; struct nft_flowtable *nft_flowtable_lookup(const struct net *net, const struct nft_table *table, const struct nlattr *nla, u8 genmask) { struct nft_flowtable *flowtable; list_for_each_entry_rcu(flowtable, &table->flowtables, list, lockdep_commit_lock_is_held(net)) { if (!nla_strcmp(nla, flowtable->name) && nft_active_genmask(flowtable, genmask)) return flowtable; } return ERR_PTR(-ENOENT); } EXPORT_SYMBOL_GPL(nft_flowtable_lookup); void nf_tables_deactivate_flowtable(const struct nft_ctx *ctx, struct nft_flowtable *flowtable, enum nft_trans_phase phase) { switch (phase) { case NFT_TRANS_PREPARE_ERROR: case NFT_TRANS_PREPARE: case NFT_TRANS_ABORT: case NFT_TRANS_RELEASE: nft_use_dec(&flowtable->use); fallthrough; default: return; } } EXPORT_SYMBOL_GPL(nf_tables_deactivate_flowtable); static struct nft_flowtable * nft_flowtable_lookup_byhandle(const struct nft_table *table, const struct nlattr *nla, u8 genmask) { struct nft_flowtable *flowtable; list_for_each_entry(flowtable, &table->flowtables, list) { if (be64_to_cpu(nla_get_be64(nla)) == flowtable->handle && nft_active_genmask(flowtable, genmask)) return flowtable; } return ERR_PTR(-ENOENT); } struct nft_flowtable_hook { u32 num; int priority; struct list_head list; }; static const struct nla_policy nft_flowtable_hook_policy[NFTA_FLOWTABLE_HOOK_MAX + 1] = { [NFTA_FLOWTABLE_HOOK_NUM] = { .type = NLA_U32 }, [NFTA_FLOWTABLE_HOOK_PRIORITY] = { .type = NLA_U32 }, [NFTA_FLOWTABLE_HOOK_DEVS] = { .type = NLA_NESTED }, }; static int nft_flowtable_parse_hook(const struct nft_ctx *ctx, const struct nlattr * const nla[], struct nft_flowtable_hook *flowtable_hook, struct nft_flowtable *flowtable, struct netlink_ext_ack *extack, bool add) { struct nlattr *tb[NFTA_FLOWTABLE_HOOK_MAX + 1]; struct nft_hook *hook; int hooknum, priority; int err; INIT_LIST_HEAD(&flowtable_hook->list); err = nla_parse_nested_deprecated(tb, NFTA_FLOWTABLE_HOOK_MAX, nla[NFTA_FLOWTABLE_HOOK], nft_flowtable_hook_policy, NULL); if (err < 0) return err; if (add) { if (!tb[NFTA_FLOWTABLE_HOOK_NUM] || !tb[NFTA_FLOWTABLE_HOOK_PRIORITY]) { NL_SET_BAD_ATTR(extack, nla[NFTA_FLOWTABLE_NAME]); return -ENOENT; } hooknum = ntohl(nla_get_be32(tb[NFTA_FLOWTABLE_HOOK_NUM])); if (hooknum != NF_NETDEV_INGRESS) return -EOPNOTSUPP; priority = 
ntohl(nla_get_be32(tb[NFTA_FLOWTABLE_HOOK_PRIORITY])); flowtable_hook->priority = priority; flowtable_hook->num = hooknum; } else { if (tb[NFTA_FLOWTABLE_HOOK_NUM]) { hooknum = ntohl(nla_get_be32(tb[NFTA_FLOWTABLE_HOOK_NUM])); if (hooknum != flowtable->hooknum) return -EOPNOTSUPP; } if (tb[NFTA_FLOWTABLE_HOOK_PRIORITY]) { priority = ntohl(nla_get_be32(tb[NFTA_FLOWTABLE_HOOK_PRIORITY])); if (priority != flowtable->data.priority) return -EOPNOTSUPP; } flowtable_hook->priority = flowtable->data.priority; flowtable_hook->num = flowtable->hooknum; } if (tb[NFTA_FLOWTABLE_HOOK_DEVS]) { err = nf_tables_parse_netdev_hooks(ctx->net, tb[NFTA_FLOWTABLE_HOOK_DEVS], &flowtable_hook->list, extack); if (err < 0) return err; } list_for_each_entry(hook, &flowtable_hook->list, list) { hook->ops.pf = NFPROTO_NETDEV; hook->ops.hooknum = flowtable_hook->num; hook->ops.priority = flowtable_hook->priority; hook->ops.priv = &flowtable->data; hook->ops.hook = flowtable->data.type->hook; } return err; } /* call under rcu_read_lock */ static const struct nf_flowtable_type *__nft_flowtable_type_get(u8 family) { const struct nf_flowtable_type *type; list_for_each_entry_rcu(type, &nf_tables_flowtables, list) { if (family == type->family) return type; } return NULL; } static const struct nf_flowtable_type * nft_flowtable_type_get(struct net *net, u8 family) { const struct nf_flowtable_type *type; rcu_read_lock(); type = __nft_flowtable_type_get(family); if (type != NULL && try_module_get(type->owner)) { rcu_read_unlock(); return type; } rcu_read_unlock(); lockdep_nfnl_nft_mutex_not_held(); #ifdef CONFIG_MODULES if (type == NULL) { if (nft_request_module(net, "nf-flowtable-%u", family) == -EAGAIN) return ERR_PTR(-EAGAIN); } #endif return ERR_PTR(-ENOENT); } /* Only called from error and netdev event paths. 
*/ static void nft_unregister_flowtable_hook(struct net *net, struct nft_flowtable *flowtable, struct nft_hook *hook) { nf_unregister_net_hook(net, &hook->ops); flowtable->data.type->setup(&flowtable->data, hook->ops.dev, FLOW_BLOCK_UNBIND); } static void __nft_unregister_flowtable_net_hooks(struct net *net, struct nft_flowtable *flowtable, struct list_head *hook_list, bool release_netdev) { struct nft_hook *hook, *next; list_for_each_entry_safe(hook, next, hook_list, list) { nf_unregister_net_hook(net, &hook->ops); flowtable->data.type->setup(&flowtable->data, hook->ops.dev, FLOW_BLOCK_UNBIND); if (release_netdev) { list_del(&hook->list); kfree_rcu(hook, rcu); } } } static void nft_unregister_flowtable_net_hooks(struct net *net, struct nft_flowtable *flowtable, struct list_head *hook_list) { __nft_unregister_flowtable_net_hooks(net, flowtable, hook_list, false); } static int nft_register_flowtable_net_hooks(struct net *net, struct nft_table *table, struct list_head *hook_list, struct nft_flowtable *flowtable) { struct nft_hook *hook, *next; struct nft_flowtable *ft; int err, i = 0; list_for_each_entry(hook, hook_list, list) { list_for_each_entry(ft, &table->flowtables, list) { if (!nft_is_active_next(net, ft)) continue; if (nft_hook_list_find(&ft->hook_list, hook)) { err = -EEXIST; goto err_unregister_net_hooks; } } err = flowtable->data.type->setup(&flowtable->data, hook->ops.dev, FLOW_BLOCK_BIND); if (err < 0) goto err_unregister_net_hooks; err = nf_register_net_hook(net, &hook->ops); if (err < 0) { flowtable->data.type->setup(&flowtable->data, hook->ops.dev, FLOW_BLOCK_UNBIND); goto err_unregister_net_hooks; } i++; } return 0; err_unregister_net_hooks: list_for_each_entry_safe(hook, next, hook_list, list) { if (i-- <= 0) break; nft_unregister_flowtable_hook(net, flowtable, hook); list_del_rcu(&hook->list); kfree_rcu(hook, rcu); } return err; } static void nft_hooks_destroy(struct list_head *hook_list) { struct nft_hook *hook, *next; list_for_each_entry_safe(hook, next, hook_list, list) { list_del_rcu(&hook->list); kfree_rcu(hook, rcu); } } static int nft_flowtable_update(struct nft_ctx *ctx, const struct nlmsghdr *nlh, struct nft_flowtable *flowtable, struct netlink_ext_ack *extack) { const struct nlattr * const *nla = ctx->nla; struct nft_flowtable_hook flowtable_hook; struct nft_hook *hook, *next; struct nft_trans *trans; bool unregister = false; u32 flags; int err; err = nft_flowtable_parse_hook(ctx, nla, &flowtable_hook, flowtable, extack, false); if (err < 0) return err; list_for_each_entry_safe(hook, next, &flowtable_hook.list, list) { if (nft_hook_list_find(&flowtable->hook_list, hook)) { list_del(&hook->list); kfree(hook); } } if (nla[NFTA_FLOWTABLE_FLAGS]) { flags = ntohl(nla_get_be32(nla[NFTA_FLOWTABLE_FLAGS])); if (flags & ~NFT_FLOWTABLE_MASK) { err = -EOPNOTSUPP; goto err_flowtable_update_hook; } if ((flowtable->data.flags & NFT_FLOWTABLE_HW_OFFLOAD) ^ (flags & NFT_FLOWTABLE_HW_OFFLOAD)) { err = -EOPNOTSUPP; goto err_flowtable_update_hook; } } else { flags = flowtable->data.flags; } err = nft_register_flowtable_net_hooks(ctx->net, ctx->table, &flowtable_hook.list, flowtable); if (err < 0) goto err_flowtable_update_hook; trans = nft_trans_alloc(ctx, NFT_MSG_NEWFLOWTABLE, sizeof(struct nft_trans_flowtable)); if (!trans) { unregister = true; err = -ENOMEM; goto err_flowtable_update_hook; } nft_trans_flowtable_flags(trans) = flags; nft_trans_flowtable(trans) = flowtable; nft_trans_flowtable_update(trans) = true; INIT_LIST_HEAD(&nft_trans_flowtable_hooks(trans)); 
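	/*
	 * Example -- a simplified, self-contained sketch (the demo_* names are
	 * illustrative, not the kernel API): like
	 * nft_register_flowtable_net_hooks() above, register the hooks one by
	 * one and, on failure, unwind exactly the ones already registered.
	 */
#if 0	/* illustrative userspace sketch, not built as part of this file */
#include <stdio.h>

#define DEMO_NHOOKS	4

static int demo_register_one(int hook)
{
	if (hook == 2)				/* pretend the third hook fails */
		return -1;
	printf("registered %d\n", hook);
	return 0;
}

static void demo_unregister_one(int hook)
{
	printf("unregistered %d\n", hook);
}

static int demo_register_all(void)
{
	int err, i;

	for (i = 0; i < DEMO_NHOOKS; i++) {
		err = demo_register_one(i);
		if (err < 0)
			goto err_unwind;
	}
	return 0;

err_unwind:
	while (i-- > 0)
		demo_unregister_one(i);		/* hooks i-1 ... 0 */
	return err;
}

int main(void)
{
	return demo_register_all() == -1 ? 0 : 1;
}
#endif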
list_splice(&flowtable_hook.list, &nft_trans_flowtable_hooks(trans)); nft_trans_commit_list_add_tail(ctx->net, trans); return 0; err_flowtable_update_hook: list_for_each_entry_safe(hook, next, &flowtable_hook.list, list) { if (unregister) nft_unregister_flowtable_hook(ctx->net, flowtable, hook); list_del_rcu(&hook->list); kfree_rcu(hook, rcu); } return err; } static int nf_tables_newflowtable(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nla[]) { struct netlink_ext_ack *extack = info->extack; struct nft_flowtable_hook flowtable_hook; u8 genmask = nft_genmask_next(info->net); u8 family = info->nfmsg->nfgen_family; const struct nf_flowtable_type *type; struct nft_flowtable *flowtable; struct net *net = info->net; struct nft_table *table; struct nft_trans *trans; struct nft_ctx ctx; int err; if (!nla[NFTA_FLOWTABLE_TABLE] || !nla[NFTA_FLOWTABLE_NAME] || !nla[NFTA_FLOWTABLE_HOOK]) return -EINVAL; table = nft_table_lookup(net, nla[NFTA_FLOWTABLE_TABLE], family, genmask, NETLINK_CB(skb).portid); if (IS_ERR(table)) { NL_SET_BAD_ATTR(extack, nla[NFTA_FLOWTABLE_TABLE]); return PTR_ERR(table); } flowtable = nft_flowtable_lookup(net, table, nla[NFTA_FLOWTABLE_NAME], genmask); if (IS_ERR(flowtable)) { err = PTR_ERR(flowtable); if (err != -ENOENT) { NL_SET_BAD_ATTR(extack, nla[NFTA_FLOWTABLE_NAME]); return err; } } else { if (info->nlh->nlmsg_flags & NLM_F_EXCL) { NL_SET_BAD_ATTR(extack, nla[NFTA_FLOWTABLE_NAME]); return -EEXIST; } nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla); return nft_flowtable_update(&ctx, info->nlh, flowtable, extack); } nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla); if (!nft_use_inc(&table->use)) return -EMFILE; flowtable = kzalloc(sizeof(*flowtable), GFP_KERNEL_ACCOUNT); if (!flowtable) { err = -ENOMEM; goto flowtable_alloc; } flowtable->table = table; flowtable->handle = nf_tables_alloc_handle(table); INIT_LIST_HEAD(&flowtable->hook_list); flowtable->name = nla_strdup(nla[NFTA_FLOWTABLE_NAME], GFP_KERNEL_ACCOUNT); if (!flowtable->name) { err = -ENOMEM; goto err1; } type = nft_flowtable_type_get(net, family); if (IS_ERR(type)) { err = PTR_ERR(type); goto err2; } if (nla[NFTA_FLOWTABLE_FLAGS]) { flowtable->data.flags = ntohl(nla_get_be32(nla[NFTA_FLOWTABLE_FLAGS])); if (flowtable->data.flags & ~NFT_FLOWTABLE_MASK) { err = -EOPNOTSUPP; goto err3; } } write_pnet(&flowtable->data.net, net); flowtable->data.type = type; err = type->init(&flowtable->data); if (err < 0) goto err3; err = nft_flowtable_parse_hook(&ctx, nla, &flowtable_hook, flowtable, extack, true); if (err < 0) goto err_flowtable_parse_hooks; list_splice(&flowtable_hook.list, &flowtable->hook_list); flowtable->data.priority = flowtable_hook.priority; flowtable->hooknum = flowtable_hook.num; trans = nft_trans_flowtable_add(&ctx, NFT_MSG_NEWFLOWTABLE, flowtable); if (IS_ERR(trans)) { err = PTR_ERR(trans); goto err_flowtable_trans; } /* This must be LAST to ensure no packets are walking over this flowtable. 
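 * Once the hooks are registered the packet path can see the flowtable,
 * so every earlier failure point must be unwound before this step.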
*/ err = nft_register_flowtable_net_hooks(ctx.net, table, &flowtable->hook_list, flowtable); if (err < 0) goto err_flowtable_hooks; list_add_tail_rcu(&flowtable->list, &table->flowtables); return 0; err_flowtable_hooks: nft_trans_destroy(trans); err_flowtable_trans: nft_hooks_destroy(&flowtable->hook_list); err_flowtable_parse_hooks: flowtable->data.type->free(&flowtable->data); err3: module_put(type->owner); err2: kfree(flowtable->name); err1: kfree(flowtable); flowtable_alloc: nft_use_dec_restore(&table->use); return err; } static void nft_flowtable_hook_release(struct nft_flowtable_hook *flowtable_hook) { struct nft_hook *this, *next; list_for_each_entry_safe(this, next, &flowtable_hook->list, list) { list_del(&this->list); kfree(this); } } static int nft_delflowtable_hook(struct nft_ctx *ctx, struct nft_flowtable *flowtable, struct netlink_ext_ack *extack) { const struct nlattr * const *nla = ctx->nla; struct nft_flowtable_hook flowtable_hook; LIST_HEAD(flowtable_del_list); struct nft_hook *this, *hook; struct nft_trans *trans; int err; err = nft_flowtable_parse_hook(ctx, nla, &flowtable_hook, flowtable, extack, false); if (err < 0) return err; list_for_each_entry(this, &flowtable_hook.list, list) { hook = nft_hook_list_find(&flowtable->hook_list, this); if (!hook) { err = -ENOENT; goto err_flowtable_del_hook; } list_move(&hook->list, &flowtable_del_list); } trans = nft_trans_alloc(ctx, NFT_MSG_DELFLOWTABLE, sizeof(struct nft_trans_flowtable)); if (!trans) { err = -ENOMEM; goto err_flowtable_del_hook; } nft_trans_flowtable(trans) = flowtable; nft_trans_flowtable_update(trans) = true; INIT_LIST_HEAD(&nft_trans_flowtable_hooks(trans)); list_splice(&flowtable_del_list, &nft_trans_flowtable_hooks(trans)); nft_flowtable_hook_release(&flowtable_hook); nft_trans_commit_list_add_tail(ctx->net, trans); return 0; err_flowtable_del_hook: list_splice(&flowtable_del_list, &flowtable->hook_list); nft_flowtable_hook_release(&flowtable_hook); return err; } static int nf_tables_delflowtable(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nla[]) { struct netlink_ext_ack *extack = info->extack; u8 genmask = nft_genmask_next(info->net); u8 family = info->nfmsg->nfgen_family; struct nft_flowtable *flowtable; struct net *net = info->net; const struct nlattr *attr; struct nft_table *table; struct nft_ctx ctx; if (!nla[NFTA_FLOWTABLE_TABLE] || (!nla[NFTA_FLOWTABLE_NAME] && !nla[NFTA_FLOWTABLE_HANDLE])) return -EINVAL; table = nft_table_lookup(net, nla[NFTA_FLOWTABLE_TABLE], family, genmask, NETLINK_CB(skb).portid); if (IS_ERR(table)) { NL_SET_BAD_ATTR(extack, nla[NFTA_FLOWTABLE_TABLE]); return PTR_ERR(table); } if (nla[NFTA_FLOWTABLE_HANDLE]) { attr = nla[NFTA_FLOWTABLE_HANDLE]; flowtable = nft_flowtable_lookup_byhandle(table, attr, genmask); } else { attr = nla[NFTA_FLOWTABLE_NAME]; flowtable = nft_flowtable_lookup(net, table, attr, genmask); } if (IS_ERR(flowtable)) { if (PTR_ERR(flowtable) == -ENOENT && NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_DESTROYFLOWTABLE) return 0; NL_SET_BAD_ATTR(extack, attr); return PTR_ERR(flowtable); } nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla); if (nla[NFTA_FLOWTABLE_HOOK]) return nft_delflowtable_hook(&ctx, flowtable, extack); if (flowtable->use > 0) { NL_SET_BAD_ATTR(extack, attr); return -EBUSY; } return nft_delflowtable(&ctx, flowtable); } static int nf_tables_fill_flowtable_info(struct sk_buff *skb, struct net *net, u32 portid, u32 seq, int event, u32 flags, int family, struct nft_flowtable *flowtable, struct 
list_head *hook_list) { struct nlattr *nest, *nest_devs; struct nft_hook *hook; struct nlmsghdr *nlh; event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event); nlh = nfnl_msg_put(skb, portid, seq, event, flags, family, NFNETLINK_V0, nft_base_seq(net)); if (!nlh) goto nla_put_failure; if (nla_put_string(skb, NFTA_FLOWTABLE_TABLE, flowtable->table->name) || nla_put_string(skb, NFTA_FLOWTABLE_NAME, flowtable->name) || nla_put_be64(skb, NFTA_FLOWTABLE_HANDLE, cpu_to_be64(flowtable->handle), NFTA_FLOWTABLE_PAD)) goto nla_put_failure; if (event == NFT_MSG_DELFLOWTABLE && !hook_list) { nlmsg_end(skb, nlh); return 0; } if (nla_put_be32(skb, NFTA_FLOWTABLE_USE, htonl(flowtable->use)) || nla_put_be32(skb, NFTA_FLOWTABLE_FLAGS, htonl(flowtable->data.flags))) goto nla_put_failure; nest = nla_nest_start_noflag(skb, NFTA_FLOWTABLE_HOOK); if (!nest) goto nla_put_failure; if (nla_put_be32(skb, NFTA_FLOWTABLE_HOOK_NUM, htonl(flowtable->hooknum)) || nla_put_be32(skb, NFTA_FLOWTABLE_HOOK_PRIORITY, htonl(flowtable->data.priority))) goto nla_put_failure; nest_devs = nla_nest_start_noflag(skb, NFTA_FLOWTABLE_HOOK_DEVS); if (!nest_devs) goto nla_put_failure; if (!hook_list) hook_list = &flowtable->hook_list; list_for_each_entry_rcu(hook, hook_list, list, lockdep_commit_lock_is_held(net)) { if (nla_put(skb, NFTA_DEVICE_NAME, hook->ifnamelen, hook->ifname)) goto nla_put_failure; } nla_nest_end(skb, nest_devs); nla_nest_end(skb, nest); nlmsg_end(skb, nlh); return 0; nla_put_failure: nlmsg_trim(skb, nlh); return -1; } struct nft_flowtable_filter { char *table; }; static int nf_tables_dump_flowtable(struct sk_buff *skb, struct netlink_callback *cb) { const struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh); struct nft_flowtable_filter *filter = cb->data; unsigned int idx = 0, s_idx = cb->args[0]; struct net *net = sock_net(skb->sk); int family = nfmsg->nfgen_family; struct nft_flowtable *flowtable; struct nftables_pernet *nft_net; const struct nft_table *table; rcu_read_lock(); nft_net = nft_pernet(net); cb->seq = READ_ONCE(nft_net->base_seq); list_for_each_entry_rcu(table, &nft_net->tables, list) { if (family != NFPROTO_UNSPEC && family != table->family) continue; list_for_each_entry_rcu(flowtable, &table->flowtables, list) { if (!nft_is_active(net, flowtable)) goto cont; if (idx < s_idx) goto cont; if (idx > s_idx) memset(&cb->args[1], 0, sizeof(cb->args) - sizeof(cb->args[0])); if (filter && filter->table && strcmp(filter->table, table->name)) goto cont; if (nf_tables_fill_flowtable_info(skb, net, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NFT_MSG_NEWFLOWTABLE, NLM_F_MULTI | NLM_F_APPEND, table->family, flowtable, NULL) < 0) goto done; nl_dump_check_consistent(cb, nlmsg_hdr(skb)); cont: idx++; } } done: rcu_read_unlock(); cb->args[0] = idx; return skb->len; } static int nf_tables_dump_flowtable_start(struct netlink_callback *cb) { const struct nlattr * const *nla = cb->data; struct nft_flowtable_filter *filter = NULL; if (nla[NFTA_FLOWTABLE_TABLE]) { filter = kzalloc(sizeof(*filter), GFP_ATOMIC); if (!filter) return -ENOMEM; filter->table = nla_strdup(nla[NFTA_FLOWTABLE_TABLE], GFP_ATOMIC); if (!filter->table) { kfree(filter); return -ENOMEM; } } cb->data = filter; return 0; } static int nf_tables_dump_flowtable_done(struct netlink_callback *cb) { struct nft_flowtable_filter *filter = cb->data; if (!filter) return 0; kfree(filter->table); kfree(filter); return 0; } /* called with rcu_read_lock held */ static int nf_tables_getflowtable(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nla[]) { 
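/* NLM_F_DUMP requests are handed off to the netlink dump machinery;
 * otherwise a single flowtable is looked up by table and name and the
 * reply is unicast to the requesting socket. */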
struct netlink_ext_ack *extack = info->extack; u8 genmask = nft_genmask_cur(info->net); u8 family = info->nfmsg->nfgen_family; struct nft_flowtable *flowtable; const struct nft_table *table; struct net *net = info->net; struct sk_buff *skb2; int err; if (info->nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { .start = nf_tables_dump_flowtable_start, .dump = nf_tables_dump_flowtable, .done = nf_tables_dump_flowtable_done, .module = THIS_MODULE, .data = (void *)nla, }; return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c); } if (!nla[NFTA_FLOWTABLE_NAME]) return -EINVAL; table = nft_table_lookup(net, nla[NFTA_FLOWTABLE_TABLE], family, genmask, 0); if (IS_ERR(table)) { NL_SET_BAD_ATTR(extack, nla[NFTA_FLOWTABLE_TABLE]); return PTR_ERR(table); } flowtable = nft_flowtable_lookup(net, table, nla[NFTA_FLOWTABLE_NAME], genmask); if (IS_ERR(flowtable)) { NL_SET_BAD_ATTR(extack, nla[NFTA_FLOWTABLE_NAME]); return PTR_ERR(flowtable); } skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC); if (!skb2) return -ENOMEM; err = nf_tables_fill_flowtable_info(skb2, net, NETLINK_CB(skb).portid, info->nlh->nlmsg_seq, NFT_MSG_NEWFLOWTABLE, 0, family, flowtable, NULL); if (err < 0) goto err_fill_flowtable_info; return nfnetlink_unicast(skb2, net, NETLINK_CB(skb).portid); err_fill_flowtable_info: kfree_skb(skb2); return err; } static void nf_tables_flowtable_notify(struct nft_ctx *ctx, struct nft_flowtable *flowtable, struct list_head *hook_list, int event) { struct nftables_pernet *nft_net = nft_pernet(ctx->net); struct sk_buff *skb; u16 flags = 0; int err; if (!ctx->report && !nfnetlink_has_listeners(ctx->net, NFNLGRP_NFTABLES)) return; skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); if (skb == NULL) goto err; if (ctx->flags & (NLM_F_CREATE | NLM_F_EXCL)) flags |= ctx->flags & (NLM_F_CREATE | NLM_F_EXCL); err = nf_tables_fill_flowtable_info(skb, ctx->net, ctx->portid, ctx->seq, event, flags, ctx->family, flowtable, hook_list); if (err < 0) { kfree_skb(skb); goto err; } nft_notify_enqueue(skb, ctx->report, &nft_net->notify_list); return; err: nfnetlink_set_err(ctx->net, ctx->portid, NFNLGRP_NFTABLES, -ENOBUFS); } static void nf_tables_flowtable_destroy(struct nft_flowtable *flowtable) { struct nft_hook *hook, *next; flowtable->data.type->free(&flowtable->data); list_for_each_entry_safe(hook, next, &flowtable->hook_list, list) { list_del_rcu(&hook->list); kfree_rcu(hook, rcu); } kfree(flowtable->name); module_put(flowtable->data.type->owner); kfree(flowtable); } static int nf_tables_fill_gen_info(struct sk_buff *skb, struct net *net, u32 portid, u32 seq) { struct nftables_pernet *nft_net = nft_pernet(net); struct nlmsghdr *nlh; char buf[TASK_COMM_LEN]; int event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, NFT_MSG_NEWGEN); nlh = nfnl_msg_put(skb, portid, seq, event, 0, AF_UNSPEC, NFNETLINK_V0, nft_base_seq(net)); if (!nlh) goto nla_put_failure; if (nla_put_be32(skb, NFTA_GEN_ID, htonl(nft_net->base_seq)) || nla_put_be32(skb, NFTA_GEN_PROC_PID, htonl(task_pid_nr(current))) || nla_put_string(skb, NFTA_GEN_PROC_NAME, get_task_comm(buf, current))) goto nla_put_failure; nlmsg_end(skb, nlh); return 0; nla_put_failure: nlmsg_trim(skb, nlh); return -EMSGSIZE; } static void nft_flowtable_event(unsigned long event, struct net_device *dev, struct nft_flowtable *flowtable) { struct nft_hook *hook; list_for_each_entry(hook, &flowtable->hook_list, list) { if (hook->ops.dev != dev) continue; /* flow_offload_netdev_event() cleans up entries for us. 
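 * Only the hook attached to the vanishing device is unregistered and
 * freed here; the flowtable and its remaining hooks stay in place.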
*/ nft_unregister_flowtable_hook(dev_net(dev), flowtable, hook); list_del_rcu(&hook->list); kfree_rcu(hook, rcu); break; } } static int nf_tables_flowtable_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct nft_flowtable *flowtable; struct nftables_pernet *nft_net; struct nft_table *table; struct net *net; if (event != NETDEV_UNREGISTER) return 0; net = dev_net(dev); nft_net = nft_pernet(net); mutex_lock(&nft_net->commit_mutex); list_for_each_entry(table, &nft_net->tables, list) { list_for_each_entry(flowtable, &table->flowtables, list) { nft_flowtable_event(event, dev, flowtable); } } mutex_unlock(&nft_net->commit_mutex); return NOTIFY_DONE; } static struct notifier_block nf_tables_flowtable_notifier = { .notifier_call = nf_tables_flowtable_event, }; static void nf_tables_gen_notify(struct net *net, struct sk_buff *skb, int event) { struct nlmsghdr *nlh = nlmsg_hdr(skb); struct sk_buff *skb2; int err; if (!nlmsg_report(nlh) && !nfnetlink_has_listeners(net, NFNLGRP_NFTABLES)) return; skb2 = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); if (skb2 == NULL) goto err; err = nf_tables_fill_gen_info(skb2, net, NETLINK_CB(skb).portid, nlh->nlmsg_seq); if (err < 0) { kfree_skb(skb2); goto err; } nfnetlink_send(skb2, net, NETLINK_CB(skb).portid, NFNLGRP_NFTABLES, nlmsg_report(nlh), GFP_KERNEL); return; err: nfnetlink_set_err(net, NETLINK_CB(skb).portid, NFNLGRP_NFTABLES, -ENOBUFS); } static int nf_tables_getgen(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nla[]) { struct sk_buff *skb2; int err; skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC); if (skb2 == NULL) return -ENOMEM; err = nf_tables_fill_gen_info(skb2, info->net, NETLINK_CB(skb).portid, info->nlh->nlmsg_seq); if (err < 0) goto err_fill_gen_info; return nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid); err_fill_gen_info: kfree_skb(skb2); return err; } static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = { [NFT_MSG_NEWTABLE] = { .call = nf_tables_newtable, .type = NFNL_CB_BATCH, .attr_count = NFTA_TABLE_MAX, .policy = nft_table_policy, }, [NFT_MSG_GETTABLE] = { .call = nf_tables_gettable, .type = NFNL_CB_RCU, .attr_count = NFTA_TABLE_MAX, .policy = nft_table_policy, }, [NFT_MSG_DELTABLE] = { .call = nf_tables_deltable, .type = NFNL_CB_BATCH, .attr_count = NFTA_TABLE_MAX, .policy = nft_table_policy, }, [NFT_MSG_DESTROYTABLE] = { .call = nf_tables_deltable, .type = NFNL_CB_BATCH, .attr_count = NFTA_TABLE_MAX, .policy = nft_table_policy, }, [NFT_MSG_NEWCHAIN] = { .call = nf_tables_newchain, .type = NFNL_CB_BATCH, .attr_count = NFTA_CHAIN_MAX, .policy = nft_chain_policy, }, [NFT_MSG_GETCHAIN] = { .call = nf_tables_getchain, .type = NFNL_CB_RCU, .attr_count = NFTA_CHAIN_MAX, .policy = nft_chain_policy, }, [NFT_MSG_DELCHAIN] = { .call = nf_tables_delchain, .type = NFNL_CB_BATCH, .attr_count = NFTA_CHAIN_MAX, .policy = nft_chain_policy, }, [NFT_MSG_DESTROYCHAIN] = { .call = nf_tables_delchain, .type = NFNL_CB_BATCH, .attr_count = NFTA_CHAIN_MAX, .policy = nft_chain_policy, }, [NFT_MSG_NEWRULE] = { .call = nf_tables_newrule, .type = NFNL_CB_BATCH, .attr_count = NFTA_RULE_MAX, .policy = nft_rule_policy, }, [NFT_MSG_GETRULE] = { .call = nf_tables_getrule, .type = NFNL_CB_RCU, .attr_count = NFTA_RULE_MAX, .policy = nft_rule_policy, }, [NFT_MSG_GETRULE_RESET] = { .call = nf_tables_getrule_reset, .type = NFNL_CB_RCU, .attr_count = NFTA_RULE_MAX, .policy = nft_rule_policy, }, [NFT_MSG_DELRULE] = { .call = nf_tables_delrule, .type = 
NFNL_CB_BATCH, .attr_count = NFTA_RULE_MAX, .policy = nft_rule_policy, }, [NFT_MSG_DESTROYRULE] = { .call = nf_tables_delrule, .type = NFNL_CB_BATCH, .attr_count = NFTA_RULE_MAX, .policy = nft_rule_policy, }, [NFT_MSG_NEWSET] = { .call = nf_tables_newset, .type = NFNL_CB_BATCH, .attr_count = NFTA_SET_MAX, .policy = nft_set_policy, }, [NFT_MSG_GETSET] = { .call = nf_tables_getset, .type = NFNL_CB_RCU, .attr_count = NFTA_SET_MAX, .policy = nft_set_policy, }, [NFT_MSG_DELSET] = { .call = nf_tables_delset, .type = NFNL_CB_BATCH, .attr_count = NFTA_SET_MAX, .policy = nft_set_policy, }, [NFT_MSG_DESTROYSET] = { .call = nf_tables_delset, .type = NFNL_CB_BATCH, .attr_count = NFTA_SET_MAX, .policy = nft_set_policy, }, [NFT_MSG_NEWSETELEM] = { .call = nf_tables_newsetelem, .type = NFNL_CB_BATCH, .attr_count = NFTA_SET_ELEM_LIST_MAX, .policy = nft_set_elem_list_policy, }, [NFT_MSG_GETSETELEM] = { .call = nf_tables_getsetelem, .type = NFNL_CB_RCU, .attr_count = NFTA_SET_ELEM_LIST_MAX, .policy = nft_set_elem_list_policy, }, [NFT_MSG_GETSETELEM_RESET] = { .call = nf_tables_getsetelem_reset, .type = NFNL_CB_RCU, .attr_count = NFTA_SET_ELEM_LIST_MAX, .policy = nft_set_elem_list_policy, }, [NFT_MSG_DELSETELEM] = { .call = nf_tables_delsetelem, .type = NFNL_CB_BATCH, .attr_count = NFTA_SET_ELEM_LIST_MAX, .policy = nft_set_elem_list_policy, }, [NFT_MSG_DESTROYSETELEM] = { .call = nf_tables_delsetelem, .type = NFNL_CB_BATCH, .attr_count = NFTA_SET_ELEM_LIST_MAX, .policy = nft_set_elem_list_policy, }, [NFT_MSG_GETGEN] = { .call = nf_tables_getgen, .type = NFNL_CB_RCU, }, [NFT_MSG_NEWOBJ] = { .call = nf_tables_newobj, .type = NFNL_CB_BATCH, .attr_count = NFTA_OBJ_MAX, .policy = nft_obj_policy, }, [NFT_MSG_GETOBJ] = { .call = nf_tables_getobj, .type = NFNL_CB_RCU, .attr_count = NFTA_OBJ_MAX, .policy = nft_obj_policy, }, [NFT_MSG_DELOBJ] = { .call = nf_tables_delobj, .type = NFNL_CB_BATCH, .attr_count = NFTA_OBJ_MAX, .policy = nft_obj_policy, }, [NFT_MSG_DESTROYOBJ] = { .call = nf_tables_delobj, .type = NFNL_CB_BATCH, .attr_count = NFTA_OBJ_MAX, .policy = nft_obj_policy, }, [NFT_MSG_GETOBJ_RESET] = { .call = nf_tables_getobj_reset, .type = NFNL_CB_RCU, .attr_count = NFTA_OBJ_MAX, .policy = nft_obj_policy, }, [NFT_MSG_NEWFLOWTABLE] = { .call = nf_tables_newflowtable, .type = NFNL_CB_BATCH, .attr_count = NFTA_FLOWTABLE_MAX, .policy = nft_flowtable_policy, }, [NFT_MSG_GETFLOWTABLE] = { .call = nf_tables_getflowtable, .type = NFNL_CB_RCU, .attr_count = NFTA_FLOWTABLE_MAX, .policy = nft_flowtable_policy, }, [NFT_MSG_DELFLOWTABLE] = { .call = nf_tables_delflowtable, .type = NFNL_CB_BATCH, .attr_count = NFTA_FLOWTABLE_MAX, .policy = nft_flowtable_policy, }, [NFT_MSG_DESTROYFLOWTABLE] = { .call = nf_tables_delflowtable, .type = NFNL_CB_BATCH, .attr_count = NFTA_FLOWTABLE_MAX, .policy = nft_flowtable_policy, }, }; static int nf_tables_validate(struct net *net) { struct nftables_pernet *nft_net = nft_pernet(net); struct nft_table *table; list_for_each_entry(table, &nft_net->tables, list) { switch (table->validate_state) { case NFT_VALIDATE_SKIP: continue; case NFT_VALIDATE_NEED: nft_validate_state_update(table, NFT_VALIDATE_DO); fallthrough; case NFT_VALIDATE_DO: if (nft_table_validate(net, table) < 0) return -EAGAIN; nft_validate_state_update(table, NFT_VALIDATE_SKIP); break; } } return 0; } /* a drop policy has to be deferred until all rules have been activated, * otherwise a large ruleset that contains a drop-policy base chain will * cause all packets to get dropped until the full transaction has been * processed. 
* * We defer the drop policy until the transaction has been finalized. */ static void nft_chain_commit_drop_policy(struct nft_trans_chain *trans) { struct nft_base_chain *basechain; if (trans->policy != NF_DROP) return; if (!nft_is_base_chain(trans->chain)) return; basechain = nft_base_chain(trans->chain); basechain->policy = NF_DROP; } static void nft_chain_commit_update(struct nft_trans_chain *trans) { struct nft_table *table = trans->nft_trans_binding.nft_trans.table; struct nft_base_chain *basechain; if (trans->name) { rhltable_remove(&table->chains_ht, &trans->chain->rhlhead, nft_chain_ht_params); swap(trans->chain->name, trans->name); rhltable_insert_key(&table->chains_ht, trans->chain->name, &trans->chain->rhlhead, nft_chain_ht_params); } if (!nft_is_base_chain(trans->chain)) return; nft_chain_stats_replace(trans); basechain = nft_base_chain(trans->chain); switch (trans->policy) { case NF_DROP: case NF_ACCEPT: basechain->policy = trans->policy; break; } } static void nft_obj_commit_update(const struct nft_ctx *ctx, struct nft_trans *trans) { struct nft_object *newobj; struct nft_object *obj; obj = nft_trans_obj(trans); newobj = nft_trans_obj_newobj(trans); if (WARN_ON_ONCE(!obj->ops->update)) return; obj->ops->update(obj, newobj); nft_obj_destroy(ctx, newobj); } static void nft_commit_release(struct nft_trans *trans) { struct nft_ctx ctx = { .net = trans->net, }; nft_ctx_update(&ctx, trans); switch (trans->msg_type) { case NFT_MSG_DELTABLE: case NFT_MSG_DESTROYTABLE: nf_tables_table_destroy(trans->table); break; case NFT_MSG_NEWCHAIN: free_percpu(nft_trans_chain_stats(trans)); kfree(nft_trans_chain_name(trans)); break; case NFT_MSG_DELCHAIN: case NFT_MSG_DESTROYCHAIN: if (nft_trans_chain_update(trans)) nft_hooks_destroy(&nft_trans_chain_hooks(trans)); else nf_tables_chain_destroy(nft_trans_chain(trans)); break; case NFT_MSG_DELRULE: case NFT_MSG_DESTROYRULE: nf_tables_rule_destroy(&ctx, nft_trans_rule(trans)); break; case NFT_MSG_DELSET: case NFT_MSG_DESTROYSET: nft_set_destroy(&ctx, nft_trans_set(trans)); break; case NFT_MSG_DELSETELEM: case NFT_MSG_DESTROYSETELEM: nft_trans_elems_destroy(&ctx, nft_trans_container_elem(trans)); break; case NFT_MSG_DELOBJ: case NFT_MSG_DESTROYOBJ: nft_obj_destroy(&ctx, nft_trans_obj(trans)); break; case NFT_MSG_DELFLOWTABLE: case NFT_MSG_DESTROYFLOWTABLE: if (nft_trans_flowtable_update(trans)) nft_hooks_destroy(&nft_trans_flowtable_hooks(trans)); else nf_tables_flowtable_destroy(nft_trans_flowtable(trans)); break; } if (trans->put_net) put_net(trans->net); kfree(trans); } static void nf_tables_trans_destroy_work(struct work_struct *w) { struct nft_trans *trans, *next; LIST_HEAD(head); spin_lock(&nf_tables_destroy_list_lock); list_splice_init(&nf_tables_destroy_list, &head); spin_unlock(&nf_tables_destroy_list_lock); if (list_empty(&head)) return; synchronize_rcu(); list_for_each_entry_safe(trans, next, &head, list) { nft_trans_list_del(trans); nft_commit_release(trans); } } void nf_tables_trans_destroy_flush_work(void) { flush_work(&trans_destroy_work); } EXPORT_SYMBOL_GPL(nf_tables_trans_destroy_flush_work); static bool nft_expr_reduce(struct nft_regs_track *track, const struct nft_expr *expr) { return false; } static int nf_tables_commit_chain_prepare(struct net *net, struct nft_chain *chain) { const struct nft_expr *expr, *last; struct nft_regs_track track = {}; unsigned int size, data_size; void *data, *data_boundary; struct nft_rule_dp *prule; struct nft_rule *rule; /* already handled or inactive chain? 
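 * blob_next is populated at most once per chain and per transaction, so
 * a later rule change for the same chain can return early here.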
*/ if (chain->blob_next || !nft_is_active_next(net, chain)) return 0; data_size = 0; list_for_each_entry(rule, &chain->rules, list) { if (nft_is_active_next(net, rule)) { data_size += sizeof(*prule) + rule->dlen; if (data_size > INT_MAX) return -ENOMEM; } } chain->blob_next = nf_tables_chain_alloc_rules(chain, data_size); if (!chain->blob_next) return -ENOMEM; data = (void *)chain->blob_next->data; data_boundary = data + data_size; size = 0; list_for_each_entry(rule, &chain->rules, list) { if (!nft_is_active_next(net, rule)) continue; prule = (struct nft_rule_dp *)data; data += offsetof(struct nft_rule_dp, data); if (WARN_ON_ONCE(data > data_boundary)) return -ENOMEM; size = 0; track.last = nft_expr_last(rule); nft_rule_for_each_expr(expr, last, rule) { track.cur = expr; if (nft_expr_reduce(&track, expr)) { expr = track.cur; continue; } if (WARN_ON_ONCE(data + size + expr->ops->size > data_boundary)) return -ENOMEM; memcpy(data + size, expr, expr->ops->size); size += expr->ops->size; } if (WARN_ON_ONCE(size >= 1 << 12)) return -ENOMEM; prule->handle = rule->handle; prule->dlen = size; prule->is_last = 0; data += size; size = 0; chain->blob_next->size += (unsigned long)(data - (void *)prule); } if (WARN_ON_ONCE(data > data_boundary)) return -ENOMEM; prule = (struct nft_rule_dp *)data; nft_last_rule(chain, prule); return 0; } static void nf_tables_commit_chain_prepare_cancel(struct net *net) { struct nftables_pernet *nft_net = nft_pernet(net); struct nft_trans *trans, *next; list_for_each_entry_safe(trans, next, &nft_net->commit_list, list) { if (trans->msg_type == NFT_MSG_NEWRULE || trans->msg_type == NFT_MSG_DELRULE) { struct nft_chain *chain = nft_trans_rule_chain(trans); kvfree(chain->blob_next); chain->blob_next = NULL; } } } static void __nf_tables_commit_chain_free_rules(struct rcu_head *h) { struct nft_rule_dp_last *l = container_of(h, struct nft_rule_dp_last, h); kvfree(l->blob); } static void nf_tables_commit_chain_free_rules_old(struct nft_rule_blob *blob) { struct nft_rule_dp_last *last; /* last rule trailer is after end marker */ last = (void *)blob + sizeof(*blob) + blob->size; last->blob = blob; call_rcu(&last->h, __nf_tables_commit_chain_free_rules); } static void nf_tables_commit_chain(struct net *net, struct nft_chain *chain) { struct nft_rule_blob *g0, *g1; bool next_genbit; next_genbit = nft_gencursor_next(net); g0 = rcu_dereference_protected(chain->blob_gen_0, lockdep_commit_lock_is_held(net)); g1 = rcu_dereference_protected(chain->blob_gen_1, lockdep_commit_lock_is_held(net)); /* No changes to this chain? */ if (chain->blob_next == NULL) { /* chain had no change in last or next generation */ if (g0 == g1) return; /* * chain had no change in this generation; make sure next * one uses same rules as current generation. 
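 * The unused generation pointer is redirected to the live blob and the
 * stale copy is queued for release after an RCU grace period.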
*/ if (next_genbit) { rcu_assign_pointer(chain->blob_gen_1, g0); nf_tables_commit_chain_free_rules_old(g1); } else { rcu_assign_pointer(chain->blob_gen_0, g1); nf_tables_commit_chain_free_rules_old(g0); } return; } if (next_genbit) rcu_assign_pointer(chain->blob_gen_1, chain->blob_next); else rcu_assign_pointer(chain->blob_gen_0, chain->blob_next); chain->blob_next = NULL; if (g0 == g1) return; if (next_genbit) nf_tables_commit_chain_free_rules_old(g1); else nf_tables_commit_chain_free_rules_old(g0); } static void nft_obj_del(struct nft_object *obj) { rhltable_remove(&nft_objname_ht, &obj->rhlhead, nft_objname_ht_params); list_del_rcu(&obj->list); } void nft_chain_del(struct nft_chain *chain) { struct nft_table *table = chain->table; WARN_ON_ONCE(rhltable_remove(&table->chains_ht, &chain->rhlhead, nft_chain_ht_params)); list_del_rcu(&chain->list); } static void nft_trans_gc_setelem_remove(struct nft_ctx *ctx, struct nft_trans_gc *trans) { struct nft_elem_priv **priv = trans->priv; unsigned int i; for (i = 0; i < trans->count; i++) { nft_setelem_data_deactivate(ctx->net, trans->set, priv[i]); nft_setelem_remove(ctx->net, trans->set, priv[i]); } } void nft_trans_gc_destroy(struct nft_trans_gc *trans) { nft_set_put(trans->set); put_net(trans->net); kfree(trans); } static void nft_trans_gc_trans_free(struct rcu_head *rcu) { struct nft_elem_priv *elem_priv; struct nft_trans_gc *trans; struct nft_ctx ctx = {}; unsigned int i; trans = container_of(rcu, struct nft_trans_gc, rcu); ctx.net = read_pnet(&trans->set->net); for (i = 0; i < trans->count; i++) { elem_priv = trans->priv[i]; if (!nft_setelem_is_catchall(trans->set, elem_priv)) atomic_dec(&trans->set->nelems); nf_tables_set_elem_destroy(&ctx, trans->set, elem_priv); } nft_trans_gc_destroy(trans); } static bool nft_trans_gc_work_done(struct nft_trans_gc *trans) { struct nftables_pernet *nft_net; struct nft_ctx ctx = {}; nft_net = nft_pernet(trans->net); mutex_lock(&nft_net->commit_mutex); /* Check for race with transaction, otherwise this batch refers to * stale objects that might not be there anymore. Skip transaction if * set has been destroyed from control plane transaction in case gc * worker loses race. 
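 * The gc sequence was sampled when this batch was allocated; if the
 * commit path bumped it since then, or the set is already dead, the
 * batch is stale and must be dropped.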
*/ if (READ_ONCE(nft_net->gc_seq) != trans->seq || trans->set->dead) { mutex_unlock(&nft_net->commit_mutex); return false; } ctx.net = trans->net; ctx.table = trans->set->table; nft_trans_gc_setelem_remove(&ctx, trans); mutex_unlock(&nft_net->commit_mutex); return true; } static void nft_trans_gc_work(struct work_struct *work) { struct nft_trans_gc *trans, *next; LIST_HEAD(trans_gc_list); spin_lock(&nf_tables_gc_list_lock); list_splice_init(&nf_tables_gc_list, &trans_gc_list); spin_unlock(&nf_tables_gc_list_lock); list_for_each_entry_safe(trans, next, &trans_gc_list, list) { list_del(&trans->list); if (!nft_trans_gc_work_done(trans)) { nft_trans_gc_destroy(trans); continue; } call_rcu(&trans->rcu, nft_trans_gc_trans_free); } } struct nft_trans_gc *nft_trans_gc_alloc(struct nft_set *set, unsigned int gc_seq, gfp_t gfp) { struct net *net = read_pnet(&set->net); struct nft_trans_gc *trans; trans = kzalloc(sizeof(*trans), gfp); if (!trans) return NULL; trans->net = maybe_get_net(net); if (!trans->net) { kfree(trans); return NULL; } refcount_inc(&set->refs); trans->set = set; trans->seq = gc_seq; return trans; } void nft_trans_gc_elem_add(struct nft_trans_gc *trans, void *priv) { trans->priv[trans->count++] = priv; } static void nft_trans_gc_queue_work(struct nft_trans_gc *trans) { spin_lock(&nf_tables_gc_list_lock); list_add_tail(&trans->list, &nf_tables_gc_list); spin_unlock(&nf_tables_gc_list_lock); schedule_work(&trans_gc_work); } static int nft_trans_gc_space(struct nft_trans_gc *trans) { return NFT_TRANS_GC_BATCHCOUNT - trans->count; } struct nft_trans_gc *nft_trans_gc_queue_async(struct nft_trans_gc *gc, unsigned int gc_seq, gfp_t gfp) { struct nft_set *set; if (nft_trans_gc_space(gc)) return gc; set = gc->set; nft_trans_gc_queue_work(gc); return nft_trans_gc_alloc(set, gc_seq, gfp); } void nft_trans_gc_queue_async_done(struct nft_trans_gc *trans) { if (trans->count == 0) { nft_trans_gc_destroy(trans); return; } nft_trans_gc_queue_work(trans); } struct nft_trans_gc *nft_trans_gc_queue_sync(struct nft_trans_gc *gc, gfp_t gfp) { struct nft_set *set; if (WARN_ON_ONCE(!lockdep_commit_lock_is_held(gc->net))) return NULL; if (nft_trans_gc_space(gc)) return gc; set = gc->set; call_rcu(&gc->rcu, nft_trans_gc_trans_free); return nft_trans_gc_alloc(set, 0, gfp); } void nft_trans_gc_queue_sync_done(struct nft_trans_gc *trans) { WARN_ON_ONCE(!lockdep_commit_lock_is_held(trans->net)); if (trans->count == 0) { nft_trans_gc_destroy(trans); return; } call_rcu(&trans->rcu, nft_trans_gc_trans_free); } struct nft_trans_gc *nft_trans_gc_catchall_async(struct nft_trans_gc *gc, unsigned int gc_seq) { struct nft_set_elem_catchall *catchall; const struct nft_set *set = gc->set; struct nft_set_ext *ext; list_for_each_entry_rcu(catchall, &set->catchall_list, list) { ext = nft_set_elem_ext(set, catchall->elem); if (!nft_set_elem_expired(ext)) continue; if (nft_set_elem_is_dead(ext)) goto dead_elem; nft_set_elem_dead(ext); dead_elem: gc = nft_trans_gc_queue_async(gc, gc_seq, GFP_ATOMIC); if (!gc) return NULL; nft_trans_gc_elem_add(gc, catchall->elem); } return gc; } struct nft_trans_gc *nft_trans_gc_catchall_sync(struct nft_trans_gc *gc) { struct nft_set_elem_catchall *catchall, *next; u64 tstamp = nft_net_tstamp(gc->net); const struct nft_set *set = gc->set; struct nft_elem_priv *elem_priv; struct nft_set_ext *ext; WARN_ON_ONCE(!lockdep_commit_lock_is_held(gc->net)); list_for_each_entry_safe(catchall, next, &set->catchall_list, list) { ext = nft_set_elem_ext(set, catchall->elem); if (!__nft_set_elem_expired(ext, 
tstamp)) continue; gc = nft_trans_gc_queue_sync(gc, GFP_KERNEL); if (!gc) return NULL; elem_priv = catchall->elem; nft_setelem_data_deactivate(gc->net, gc->set, elem_priv); nft_setelem_catchall_destroy(catchall); nft_trans_gc_elem_add(gc, elem_priv); } return gc; } static void nf_tables_module_autoload_cleanup(struct net *net) { struct nftables_pernet *nft_net = nft_pernet(net); struct nft_module_request *req, *next; WARN_ON_ONCE(!list_empty(&nft_net->commit_list)); list_for_each_entry_safe(req, next, &nft_net->module_list, list) { WARN_ON_ONCE(!req->done); list_del(&req->list); kfree(req); } } static void nf_tables_commit_release(struct net *net) { struct nftables_pernet *nft_net = nft_pernet(net); struct nft_trans *trans; /* all side effects have to be made visible. * For example, if a chain named 'foo' has been deleted, a * new transaction must not find it anymore. * * Memory reclaim happens asynchronously from work queue * to prevent expensive synchronize_rcu() in commit phase. */ if (list_empty(&nft_net->commit_list)) { nf_tables_module_autoload_cleanup(net); mutex_unlock(&nft_net->commit_mutex); return; } trans = list_last_entry(&nft_net->commit_list, struct nft_trans, list); get_net(trans->net); WARN_ON_ONCE(trans->put_net); trans->put_net = true; spin_lock(&nf_tables_destroy_list_lock); list_splice_tail_init(&nft_net->commit_list, &nf_tables_destroy_list); spin_unlock(&nf_tables_destroy_list_lock); nf_tables_module_autoload_cleanup(net); schedule_work(&trans_destroy_work); mutex_unlock(&nft_net->commit_mutex); } static void nft_commit_notify(struct net *net, u32 portid) { struct nftables_pernet *nft_net = nft_pernet(net); struct sk_buff *batch_skb = NULL, *nskb, *skb; unsigned char *data; int len; list_for_each_entry_safe(skb, nskb, &nft_net->notify_list, list) { if (!batch_skb) { new_batch: batch_skb = skb; len = NLMSG_GOODSIZE - skb->len; list_del(&skb->list); continue; } len -= skb->len; if (len > 0 && NFT_CB(skb).report == NFT_CB(batch_skb).report) { data = skb_put(batch_skb, skb->len); memcpy(data, skb->data, skb->len); list_del(&skb->list); kfree_skb(skb); continue; } nfnetlink_send(batch_skb, net, portid, NFNLGRP_NFTABLES, NFT_CB(batch_skb).report, GFP_KERNEL); goto new_batch; } if (batch_skb) { nfnetlink_send(batch_skb, net, portid, NFNLGRP_NFTABLES, NFT_CB(batch_skb).report, GFP_KERNEL); } WARN_ON_ONCE(!list_empty(&nft_net->notify_list)); } static int nf_tables_commit_audit_alloc(struct list_head *adl, struct nft_table *table) { struct nft_audit_data *adp; list_for_each_entry(adp, adl, list) { if (adp->table == table) return 0; } adp = kzalloc(sizeof(*adp), GFP_KERNEL); if (!adp) return -ENOMEM; adp->table = table; list_add(&adp->list, adl); return 0; } static void nf_tables_commit_audit_free(struct list_head *adl) { struct nft_audit_data *adp, *adn; list_for_each_entry_safe(adp, adn, adl, list) { list_del(&adp->list); kfree(adp); } } /* nft audit emits the number of elements that get added/removed/updated, * so NEW/DELSETELEM needs to increment based on the total elem count. 
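 * A single element transaction can carry several elements, hence the
 * per-transaction nelems count below instead of a constant one.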
*/ static unsigned int nf_tables_commit_audit_entrycount(const struct nft_trans *trans) { switch (trans->msg_type) { case NFT_MSG_NEWSETELEM: case NFT_MSG_DELSETELEM: return nft_trans_container_elem(trans)->nelems; } return 1; } static void nf_tables_commit_audit_collect(struct list_head *adl, const struct nft_trans *trans, u32 op) { const struct nft_table *table = trans->table; struct nft_audit_data *adp; list_for_each_entry(adp, adl, list) { if (adp->table == table) goto found; } WARN_ONCE(1, "table=%s not expected in commit list", table->name); return; found: adp->entries += nf_tables_commit_audit_entrycount(trans); if (!adp->op || adp->op > op) adp->op = op; } #define AUNFTABLENAMELEN (NFT_TABLE_MAXNAMELEN + 22) static void nf_tables_commit_audit_log(struct list_head *adl, u32 generation) { struct nft_audit_data *adp, *adn; char aubuf[AUNFTABLENAMELEN]; list_for_each_entry_safe(adp, adn, adl, list) { snprintf(aubuf, AUNFTABLENAMELEN, "%s:%u", adp->table->name, generation); audit_log_nfcfg(aubuf, adp->table->family, adp->entries, nft2audit_op[adp->op], GFP_KERNEL); list_del(&adp->list); kfree(adp); } } static void nft_set_commit_update(struct list_head *set_update_list) { struct nft_set *set, *next; list_for_each_entry_safe(set, next, set_update_list, pending_update) { list_del_init(&set->pending_update); if (!set->ops->commit || set->dead) continue; set->ops->commit(set); } } static unsigned int nft_gc_seq_begin(struct nftables_pernet *nft_net) { unsigned int gc_seq; /* Bump gc counter, it becomes odd, this is the busy mark. */ gc_seq = READ_ONCE(nft_net->gc_seq); WRITE_ONCE(nft_net->gc_seq, ++gc_seq); return gc_seq; } static void nft_gc_seq_end(struct nftables_pernet *nft_net, unsigned int gc_seq) { WRITE_ONCE(nft_net->gc_seq, ++gc_seq); } static int nf_tables_commit(struct net *net, struct sk_buff *skb) { struct nftables_pernet *nft_net = nft_pernet(net); const struct nlmsghdr *nlh = nlmsg_hdr(skb); struct nft_trans_binding *trans_binding; struct nft_trans *trans, *next; unsigned int base_seq, gc_seq; LIST_HEAD(set_update_list); struct nft_trans_elem *te; struct nft_chain *chain; struct nft_table *table; struct nft_ctx ctx; LIST_HEAD(adl); int err; if (list_empty(&nft_net->commit_list)) { mutex_unlock(&nft_net->commit_mutex); return 0; } nft_ctx_init(&ctx, net, skb, nlh, NFPROTO_UNSPEC, NULL, NULL, NULL); list_for_each_entry(trans_binding, &nft_net->binding_list, binding_list) { trans = &trans_binding->nft_trans; switch (trans->msg_type) { case NFT_MSG_NEWSET: if (!nft_trans_set_update(trans) && nft_set_is_anonymous(nft_trans_set(trans)) && !nft_trans_set_bound(trans)) { pr_warn_once("nftables ruleset with unbound set\n"); return -EINVAL; } break; case NFT_MSG_NEWCHAIN: if (!nft_trans_chain_update(trans) && nft_chain_binding(nft_trans_chain(trans)) && !nft_trans_chain_bound(trans)) { pr_warn_once("nftables ruleset with unbound chain\n"); return -EINVAL; } break; default: WARN_ONCE(1, "Unhandled bind type %d", trans->msg_type); break; } } /* 0. Validate ruleset, otherwise roll back for error reporting. */ if (nf_tables_validate(net) < 0) { nft_net->validate_state = NFT_VALIDATE_DO; return -EAGAIN; } err = nft_flow_rule_offload_commit(net); if (err < 0) return err; /* 1. 
Allocate space for next generation rules_gen_X[] */ list_for_each_entry_safe(trans, next, &nft_net->commit_list, list) { struct nft_table *table = trans->table; int ret; ret = nf_tables_commit_audit_alloc(&adl, table); if (ret) { nf_tables_commit_chain_prepare_cancel(net); nf_tables_commit_audit_free(&adl); return ret; } if (trans->msg_type == NFT_MSG_NEWRULE || trans->msg_type == NFT_MSG_DELRULE) { chain = nft_trans_rule_chain(trans); ret = nf_tables_commit_chain_prepare(net, chain); if (ret < 0) { nf_tables_commit_chain_prepare_cancel(net); nf_tables_commit_audit_free(&adl); return ret; } } } /* step 2. Make rules_gen_X visible to packet path */ list_for_each_entry(table, &nft_net->tables, list) { list_for_each_entry(chain, &table->chains, list) nf_tables_commit_chain(net, chain); } /* * Bump generation counter, invalidate any dump in progress. * Cannot fail after this point. */ base_seq = READ_ONCE(nft_net->base_seq); while (++base_seq == 0) ; WRITE_ONCE(nft_net->base_seq, base_seq); gc_seq = nft_gc_seq_begin(nft_net); /* step 3. Start new generation, rules_gen_X now in use. */ net->nft.gencursor = nft_gencursor_next(net); list_for_each_entry_safe(trans, next, &nft_net->commit_list, list) { struct nft_table *table = trans->table; nft_ctx_update(&ctx, trans); nf_tables_commit_audit_collect(&adl, trans, trans->msg_type); switch (trans->msg_type) { case NFT_MSG_NEWTABLE: if (nft_trans_table_update(trans)) { if (!(table->flags & __NFT_TABLE_F_UPDATE)) { nft_trans_destroy(trans); break; } if (table->flags & NFT_TABLE_F_DORMANT) nf_tables_table_disable(net, table); table->flags &= ~__NFT_TABLE_F_UPDATE; } else { nft_clear(net, table); } nf_tables_table_notify(&ctx, NFT_MSG_NEWTABLE); nft_trans_destroy(trans); break; case NFT_MSG_DELTABLE: case NFT_MSG_DESTROYTABLE: list_del_rcu(&table->list); nf_tables_table_notify(&ctx, trans->msg_type); break; case NFT_MSG_NEWCHAIN: if (nft_trans_chain_update(trans)) { nft_chain_commit_update(nft_trans_container_chain(trans)); nf_tables_chain_notify(&ctx, NFT_MSG_NEWCHAIN, &nft_trans_chain_hooks(trans)); list_splice(&nft_trans_chain_hooks(trans), &nft_trans_basechain(trans)->hook_list); /* trans destroyed after rcu grace period */ } else { nft_chain_commit_drop_policy(nft_trans_container_chain(trans)); nft_clear(net, nft_trans_chain(trans)); nf_tables_chain_notify(&ctx, NFT_MSG_NEWCHAIN, NULL); nft_trans_destroy(trans); } break; case NFT_MSG_DELCHAIN: case NFT_MSG_DESTROYCHAIN: if (nft_trans_chain_update(trans)) { nf_tables_chain_notify(&ctx, NFT_MSG_DELCHAIN, &nft_trans_chain_hooks(trans)); if (!(table->flags & NFT_TABLE_F_DORMANT)) { nft_netdev_unregister_hooks(net, &nft_trans_chain_hooks(trans), true); } } else { nft_chain_del(nft_trans_chain(trans)); nf_tables_chain_notify(&ctx, NFT_MSG_DELCHAIN, NULL); nf_tables_unregister_hook(ctx.net, ctx.table, nft_trans_chain(trans)); } break; case NFT_MSG_NEWRULE: nft_clear(net, nft_trans_rule(trans)); nf_tables_rule_notify(&ctx, nft_trans_rule(trans), NFT_MSG_NEWRULE); if (nft_trans_rule_chain(trans)->flags & NFT_CHAIN_HW_OFFLOAD) nft_flow_rule_destroy(nft_trans_flow_rule(trans)); nft_trans_destroy(trans); break; case NFT_MSG_DELRULE: case NFT_MSG_DESTROYRULE: list_del_rcu(&nft_trans_rule(trans)->list); nf_tables_rule_notify(&ctx, nft_trans_rule(trans), trans->msg_type); nft_rule_expr_deactivate(&ctx, nft_trans_rule(trans), NFT_TRANS_COMMIT); if (nft_trans_rule_chain(trans)->flags & NFT_CHAIN_HW_OFFLOAD) nft_flow_rule_destroy(nft_trans_flow_rule(trans)); break; case NFT_MSG_NEWSET: 
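/* The set leaves the per-transaction newset list; set updates only
 * refresh timeout/gc_int/size, new sets become visible in the next
 * generation. */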
list_del(&nft_trans_container_set(trans)->list_trans_newset); if (nft_trans_set_update(trans)) { struct nft_set *set = nft_trans_set(trans); WRITE_ONCE(set->timeout, nft_trans_set_timeout(trans)); WRITE_ONCE(set->gc_int, nft_trans_set_gc_int(trans)); if (nft_trans_set_size(trans)) WRITE_ONCE(set->size, nft_trans_set_size(trans)); } else { nft_clear(net, nft_trans_set(trans)); /* This avoids hitting -EBUSY when deleting the table * from the transaction. */ if (nft_set_is_anonymous(nft_trans_set(trans)) && !list_empty(&nft_trans_set(trans)->bindings)) nft_use_dec(&table->use); } nf_tables_set_notify(&ctx, nft_trans_set(trans), NFT_MSG_NEWSET, GFP_KERNEL); nft_trans_destroy(trans); break; case NFT_MSG_DELSET: case NFT_MSG_DESTROYSET: nft_trans_set(trans)->dead = 1; list_del_rcu(&nft_trans_set(trans)->list); nf_tables_set_notify(&ctx, nft_trans_set(trans), trans->msg_type, GFP_KERNEL); break; case NFT_MSG_NEWSETELEM: te = nft_trans_container_elem(trans); nft_trans_elems_add(&ctx, te); if (te->set->ops->commit && list_empty(&te->set->pending_update)) { list_add_tail(&te->set->pending_update, &set_update_list); } nft_trans_destroy(trans); break; case NFT_MSG_DELSETELEM: case NFT_MSG_DESTROYSETELEM: te = nft_trans_container_elem(trans); nft_trans_elems_remove(&ctx, te); if (te->set->ops->commit && list_empty(&te->set->pending_update)) { list_add_tail(&te->set->pending_update, &set_update_list); } break; case NFT_MSG_NEWOBJ: if (nft_trans_obj_update(trans)) { nft_obj_commit_update(&ctx, trans); nf_tables_obj_notify(&ctx, nft_trans_obj(trans), NFT_MSG_NEWOBJ); } else { nft_clear(net, nft_trans_obj(trans)); nf_tables_obj_notify(&ctx, nft_trans_obj(trans), NFT_MSG_NEWOBJ); nft_trans_destroy(trans); } break; case NFT_MSG_DELOBJ: case NFT_MSG_DESTROYOBJ: nft_obj_del(nft_trans_obj(trans)); nf_tables_obj_notify(&ctx, nft_trans_obj(trans), trans->msg_type); break; case NFT_MSG_NEWFLOWTABLE: if (nft_trans_flowtable_update(trans)) { nft_trans_flowtable(trans)->data.flags = nft_trans_flowtable_flags(trans); nf_tables_flowtable_notify(&ctx, nft_trans_flowtable(trans), &nft_trans_flowtable_hooks(trans), NFT_MSG_NEWFLOWTABLE); list_splice(&nft_trans_flowtable_hooks(trans), &nft_trans_flowtable(trans)->hook_list); } else { nft_clear(net, nft_trans_flowtable(trans)); nf_tables_flowtable_notify(&ctx, nft_trans_flowtable(trans), NULL, NFT_MSG_NEWFLOWTABLE); } nft_trans_destroy(trans); break; case NFT_MSG_DELFLOWTABLE: case NFT_MSG_DESTROYFLOWTABLE: if (nft_trans_flowtable_update(trans)) { nf_tables_flowtable_notify(&ctx, nft_trans_flowtable(trans), &nft_trans_flowtable_hooks(trans), trans->msg_type); nft_unregister_flowtable_net_hooks(net, nft_trans_flowtable(trans), &nft_trans_flowtable_hooks(trans)); } else { list_del_rcu(&nft_trans_flowtable(trans)->list); nf_tables_flowtable_notify(&ctx, nft_trans_flowtable(trans), NULL, trans->msg_type); nft_unregister_flowtable_net_hooks(net, nft_trans_flowtable(trans), &nft_trans_flowtable(trans)->hook_list); } break; } } nft_set_commit_update(&set_update_list); nft_commit_notify(net, NETLINK_CB(skb).portid); nf_tables_gen_notify(net, skb, NFT_MSG_NEWGEN); nf_tables_commit_audit_log(&adl, nft_net->base_seq); nft_gc_seq_end(nft_net, gc_seq); nft_net->validate_state = NFT_VALIDATE_SKIP; nf_tables_commit_release(net); return 0; } static void nf_tables_module_autoload(struct net *net) { struct nftables_pernet *nft_net = nft_pernet(net); struct nft_module_request *req, *next; LIST_HEAD(module_list); list_splice_init(&nft_net->module_list, &module_list); 
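/* request_module() may sleep, so the pending requests are moved to a
 * private list and the commit mutex is dropped while the modules are
 * loaded, then re-acquired below. */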
mutex_unlock(&nft_net->commit_mutex); list_for_each_entry_safe(req, next, &module_list, list) { request_module("%s", req->module); req->done = true; } mutex_lock(&nft_net->commit_mutex); list_splice(&module_list, &nft_net->module_list); } static void nf_tables_abort_release(struct nft_trans *trans) { struct nft_ctx ctx = { }; nft_ctx_update(&ctx, trans); switch (trans->msg_type) { case NFT_MSG_NEWTABLE: nf_tables_table_destroy(trans->table); break; case NFT_MSG_NEWCHAIN: if (nft_trans_chain_update(trans)) nft_hooks_destroy(&nft_trans_chain_hooks(trans)); else nf_tables_chain_destroy(nft_trans_chain(trans)); break; case NFT_MSG_NEWRULE: nf_tables_rule_destroy(&ctx, nft_trans_rule(trans)); break; case NFT_MSG_NEWSET: nft_set_destroy(&ctx, nft_trans_set(trans)); break; case NFT_MSG_NEWSETELEM: nft_trans_set_elem_destroy(&ctx, nft_trans_container_elem(trans)); break; case NFT_MSG_NEWOBJ: nft_obj_destroy(&ctx, nft_trans_obj(trans)); break; case NFT_MSG_NEWFLOWTABLE: if (nft_trans_flowtable_update(trans)) nft_hooks_destroy(&nft_trans_flowtable_hooks(trans)); else nf_tables_flowtable_destroy(nft_trans_flowtable(trans)); break; } kfree(trans); } static void nft_set_abort_update(struct list_head *set_update_list) { struct nft_set *set, *next; list_for_each_entry_safe(set, next, set_update_list, pending_update) { list_del_init(&set->pending_update); if (!set->ops->abort) continue; set->ops->abort(set); } } static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) { struct nftables_pernet *nft_net = nft_pernet(net); struct nft_trans *trans, *next; LIST_HEAD(set_update_list); struct nft_trans_elem *te; struct nft_ctx ctx = { .net = net, }; int err = 0; if (action == NFNL_ABORT_VALIDATE && nf_tables_validate(net) < 0) err = -EAGAIN; list_for_each_entry_safe_reverse(trans, next, &nft_net->commit_list, list) { struct nft_table *table = trans->table; nft_ctx_update(&ctx, trans); switch (trans->msg_type) { case NFT_MSG_NEWTABLE: if (nft_trans_table_update(trans)) { if (!(table->flags & __NFT_TABLE_F_UPDATE)) { nft_trans_destroy(trans); break; } if (table->flags & __NFT_TABLE_F_WAS_DORMANT) { nf_tables_table_disable(net, table); table->flags |= NFT_TABLE_F_DORMANT; } else if (table->flags & __NFT_TABLE_F_WAS_AWAKEN) { table->flags &= ~NFT_TABLE_F_DORMANT; } if (table->flags & __NFT_TABLE_F_WAS_ORPHAN) { table->flags &= ~NFT_TABLE_F_OWNER; table->nlpid = 0; } table->flags &= ~__NFT_TABLE_F_UPDATE; nft_trans_destroy(trans); } else { list_del_rcu(&table->list); } break; case NFT_MSG_DELTABLE: case NFT_MSG_DESTROYTABLE: nft_clear(trans->net, table); nft_trans_destroy(trans); break; case NFT_MSG_NEWCHAIN: if (nft_trans_chain_update(trans)) { if (!(table->flags & NFT_TABLE_F_DORMANT)) { nft_netdev_unregister_hooks(net, &nft_trans_chain_hooks(trans), true); } free_percpu(nft_trans_chain_stats(trans)); kfree(nft_trans_chain_name(trans)); nft_trans_destroy(trans); } else { if (nft_trans_chain_bound(trans)) { nft_trans_destroy(trans); break; } nft_use_dec_restore(&table->use); nft_chain_del(nft_trans_chain(trans)); nf_tables_unregister_hook(trans->net, table, nft_trans_chain(trans)); } break; case NFT_MSG_DELCHAIN: case NFT_MSG_DESTROYCHAIN: if (nft_trans_chain_update(trans)) { list_splice(&nft_trans_chain_hooks(trans), &nft_trans_basechain(trans)->hook_list); } else { nft_use_inc_restore(&table->use); nft_clear(trans->net, nft_trans_chain(trans)); } nft_trans_destroy(trans); break; case NFT_MSG_NEWRULE: if (nft_trans_rule_bound(trans)) { nft_trans_destroy(trans); break; } 
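/* Undo the rule addition: give back the chain use count, unlink the
 * rule and deactivate its expressions. */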
nft_use_dec_restore(&nft_trans_rule_chain(trans)->use); list_del_rcu(&nft_trans_rule(trans)->list); nft_rule_expr_deactivate(&ctx, nft_trans_rule(trans), NFT_TRANS_ABORT); if (nft_trans_rule_chain(trans)->flags & NFT_CHAIN_HW_OFFLOAD) nft_flow_rule_destroy(nft_trans_flow_rule(trans)); break; case NFT_MSG_DELRULE: case NFT_MSG_DESTROYRULE: nft_use_inc_restore(&nft_trans_rule_chain(trans)->use); nft_clear(trans->net, nft_trans_rule(trans)); nft_rule_expr_activate(&ctx, nft_trans_rule(trans)); if (nft_trans_rule_chain(trans)->flags & NFT_CHAIN_HW_OFFLOAD) nft_flow_rule_destroy(nft_trans_flow_rule(trans)); nft_trans_destroy(trans); break; case NFT_MSG_NEWSET: list_del(&nft_trans_container_set(trans)->list_trans_newset); if (nft_trans_set_update(trans)) { nft_trans_destroy(trans); break; } nft_use_dec_restore(&table->use); if (nft_trans_set_bound(trans)) { nft_trans_destroy(trans); break; } nft_trans_set(trans)->dead = 1; list_del_rcu(&nft_trans_set(trans)->list); break; case NFT_MSG_DELSET: case NFT_MSG_DESTROYSET: nft_use_inc_restore(&table->use); nft_clear(trans->net, nft_trans_set(trans)); if (nft_trans_set(trans)->flags & (NFT_SET_MAP | NFT_SET_OBJECT)) nft_map_activate(&ctx, nft_trans_set(trans)); nft_trans_destroy(trans); break; case NFT_MSG_NEWSETELEM: if (nft_trans_elem_set_bound(trans)) { nft_trans_destroy(trans); break; } te = nft_trans_container_elem(trans); if (!nft_trans_elems_new_abort(&ctx, te)) { nft_trans_destroy(trans); break; } if (te->set->ops->abort && list_empty(&te->set->pending_update)) { list_add_tail(&te->set->pending_update, &set_update_list); } break; case NFT_MSG_DELSETELEM: case NFT_MSG_DESTROYSETELEM: te = nft_trans_container_elem(trans); nft_trans_elems_destroy_abort(&ctx, te); if (te->set->ops->abort && list_empty(&te->set->pending_update)) { list_add_tail(&te->set->pending_update, &set_update_list); } nft_trans_destroy(trans); break; case NFT_MSG_NEWOBJ: if (nft_trans_obj_update(trans)) { nft_obj_destroy(&ctx, nft_trans_obj_newobj(trans)); nft_trans_destroy(trans); } else { nft_use_dec_restore(&table->use); nft_obj_del(nft_trans_obj(trans)); } break; case NFT_MSG_DELOBJ: case NFT_MSG_DESTROYOBJ: nft_use_inc_restore(&table->use); nft_clear(trans->net, nft_trans_obj(trans)); nft_trans_destroy(trans); break; case NFT_MSG_NEWFLOWTABLE: if (nft_trans_flowtable_update(trans)) { nft_unregister_flowtable_net_hooks(net, nft_trans_flowtable(trans), &nft_trans_flowtable_hooks(trans)); } else { nft_use_dec_restore(&table->use); list_del_rcu(&nft_trans_flowtable(trans)->list); nft_unregister_flowtable_net_hooks(net, nft_trans_flowtable(trans), &nft_trans_flowtable(trans)->hook_list); } break; case NFT_MSG_DELFLOWTABLE: case NFT_MSG_DESTROYFLOWTABLE: if (nft_trans_flowtable_update(trans)) { list_splice(&nft_trans_flowtable_hooks(trans), &nft_trans_flowtable(trans)->hook_list); } else { nft_use_inc_restore(&table->use); nft_clear(trans->net, nft_trans_flowtable(trans)); } nft_trans_destroy(trans); break; } } WARN_ON_ONCE(!list_empty(&nft_net->commit_set_list)); nft_set_abort_update(&set_update_list); synchronize_rcu(); list_for_each_entry_safe_reverse(trans, next, &nft_net->commit_list, list) { nft_trans_list_del(trans); nf_tables_abort_release(trans); } return err; } static int nf_tables_abort(struct net *net, struct sk_buff *skb, enum nfnl_abort_action action) { struct nftables_pernet *nft_net = nft_pernet(net); unsigned int gc_seq; int ret; gc_seq = nft_gc_seq_begin(nft_net); ret = __nf_tables_abort(net, action); nft_gc_seq_end(nft_net, gc_seq); 
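/* __nf_tables_abort() has undone and released every pending transaction
 * at this point. */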
WARN_ON_ONCE(!list_empty(&nft_net->commit_list)); /* module autoload needs to happen after GC sequence update because it * temporarily releases and grabs mutex again. */ if (action == NFNL_ABORT_AUTOLOAD) nf_tables_module_autoload(net); else nf_tables_module_autoload_cleanup(net); mutex_unlock(&nft_net->commit_mutex); return ret; } static bool nf_tables_valid_genid(struct net *net, u32 genid) { struct nftables_pernet *nft_net = nft_pernet(net); bool genid_ok; mutex_lock(&nft_net->commit_mutex); nft_net->tstamp = get_jiffies_64(); genid_ok = genid == 0 || nft_net->base_seq == genid; if (!genid_ok) mutex_unlock(&nft_net->commit_mutex); /* else, commit mutex has to be released by commit or abort function */ return genid_ok; } static const struct nfnetlink_subsystem nf_tables_subsys = { .name = "nf_tables", .subsys_id = NFNL_SUBSYS_NFTABLES, .cb_count = NFT_MSG_MAX, .cb = nf_tables_cb, .commit = nf_tables_commit, .abort = nf_tables_abort, .valid_genid = nf_tables_valid_genid, .owner = THIS_MODULE, }; int nft_chain_validate_dependency(const struct nft_chain *chain, enum nft_chain_types type) { const struct nft_base_chain *basechain; if (nft_is_base_chain(chain)) { basechain = nft_base_chain(chain); if (basechain->type->type != type) return -EOPNOTSUPP; } return 0; } EXPORT_SYMBOL_GPL(nft_chain_validate_dependency); int nft_chain_validate_hooks(const struct nft_chain *chain, unsigned int hook_flags) { struct nft_base_chain *basechain; if (nft_is_base_chain(chain)) { basechain = nft_base_chain(chain); if ((1 << basechain->ops.hooknum) & hook_flags) return 0; return -EOPNOTSUPP; } return 0; } EXPORT_SYMBOL_GPL(nft_chain_validate_hooks); /** * nft_parse_u32_check - fetch u32 attribute and check for maximum value * * @attr: netlink attribute to fetch value from * @max: maximum value to be stored in dest * @dest: pointer to the variable * * Parse, check and store a given u32 netlink attribute into variable. * This function returns -ERANGE if the value goes over maximum value. * Otherwise a 0 is returned and the attribute value is stored in the * destination variable. */ int nft_parse_u32_check(const struct nlattr *attr, int max, u32 *dest) { u32 val; val = ntohl(nla_get_be32(attr)); if (val > max) return -ERANGE; *dest = val; return 0; } EXPORT_SYMBOL_GPL(nft_parse_u32_check); static int nft_parse_register(const struct nlattr *attr, u32 *preg) { unsigned int reg; reg = ntohl(nla_get_be32(attr)); switch (reg) { case NFT_REG_VERDICT...NFT_REG_4: *preg = reg * NFT_REG_SIZE / NFT_REG32_SIZE; break; case NFT_REG32_00...NFT_REG32_15: *preg = reg + NFT_REG_SIZE / NFT_REG32_SIZE - NFT_REG32_00; break; default: return -ERANGE; } return 0; } /** * nft_dump_register - dump a register value to a netlink attribute * * @skb: socket buffer * @attr: attribute number * @reg: register number * * Construct a netlink attribute containing the register number. For * compatibility reasons, register numbers being a multiple of 4 are * translated to the corresponding 128 bit register numbers. 
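 * For example, the 32 bit register that aliases the start of NFT_REG_1
 * is dumped as NFT_REG_1 rather than NFT_REG32_00.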
*/ int nft_dump_register(struct sk_buff *skb, unsigned int attr, unsigned int reg) { if (reg % (NFT_REG_SIZE / NFT_REG32_SIZE) == 0) reg = reg / (NFT_REG_SIZE / NFT_REG32_SIZE); else reg = reg - NFT_REG_SIZE / NFT_REG32_SIZE + NFT_REG32_00; return nla_put_be32(skb, attr, htonl(reg)); } EXPORT_SYMBOL_GPL(nft_dump_register); static int nft_validate_register_load(enum nft_registers reg, unsigned int len) { if (reg < NFT_REG_1 * NFT_REG_SIZE / NFT_REG32_SIZE) return -EINVAL; if (len == 0) return -EINVAL; if (reg * NFT_REG32_SIZE + len > sizeof_field(struct nft_regs, data)) return -ERANGE; return 0; } int nft_parse_register_load(const struct nft_ctx *ctx, const struct nlattr *attr, u8 *sreg, u32 len) { int err, invalid_reg; u32 reg, next_register; err = nft_parse_register(attr, &reg); if (err < 0) return err; err = nft_validate_register_load(reg, len); if (err < 0) return err; next_register = DIV_ROUND_UP(len, NFT_REG32_SIZE) + reg; /* Can't happen: nft_validate_register_load() should have failed */ if (WARN_ON_ONCE(next_register > NFT_REG32_NUM)) return -EINVAL; /* find first register that did not see an earlier store. */ invalid_reg = find_next_zero_bit(ctx->reg_inited, NFT_REG32_NUM, reg); /* invalid register within the range that we're loading from? */ if (invalid_reg < next_register) return -ENODATA; *sreg = reg; return 0; } EXPORT_SYMBOL_GPL(nft_parse_register_load); static void nft_saw_register_store(const struct nft_ctx *__ctx, int reg, unsigned int len) { unsigned int registers = DIV_ROUND_UP(len, NFT_REG32_SIZE); struct nft_ctx *ctx = (struct nft_ctx *)__ctx; if (WARN_ON_ONCE(len == 0 || reg < 0)) return; bitmap_set(ctx->reg_inited, reg, registers); } static int nft_validate_register_store(const struct nft_ctx *ctx, enum nft_registers reg, const struct nft_data *data, enum nft_data_types type, unsigned int len) { int err; switch (reg) { case NFT_REG_VERDICT: if (type != NFT_DATA_VERDICT) return -EINVAL; if (data != NULL && (data->verdict.code == NFT_GOTO || data->verdict.code == NFT_JUMP)) { err = nft_chain_validate(ctx, data->verdict.chain); if (err < 0) return err; } break; default: if (type != NFT_DATA_VALUE) return -EINVAL; if (reg < NFT_REG_1 * NFT_REG_SIZE / NFT_REG32_SIZE) return -EINVAL; if (len == 0) return -EINVAL; if (reg * NFT_REG32_SIZE + len > sizeof_field(struct nft_regs, data)) return -ERANGE; break; } nft_saw_register_store(ctx, reg, len); return 0; } int nft_parse_register_store(const struct nft_ctx *ctx, const struct nlattr *attr, u8 *dreg, const struct nft_data *data, enum nft_data_types type, unsigned int len) { int err; u32 reg; err = nft_parse_register(attr, &reg); if (err < 0) return err; err = nft_validate_register_store(ctx, reg, data, type, len); if (err < 0) return err; *dreg = reg; return 0; } EXPORT_SYMBOL_GPL(nft_parse_register_store); static const struct nla_policy nft_verdict_policy[NFTA_VERDICT_MAX + 1] = { [NFTA_VERDICT_CODE] = { .type = NLA_U32 }, [NFTA_VERDICT_CHAIN] = { .type = NLA_STRING, .len = NFT_CHAIN_MAXNAMELEN - 1 }, [NFTA_VERDICT_CHAIN_ID] = { .type = NLA_U32 }, }; static int nft_verdict_init(const struct nft_ctx *ctx, struct nft_data *data, struct nft_data_desc *desc, const struct nlattr *nla) { u8 genmask = nft_genmask_next(ctx->net); struct nlattr *tb[NFTA_VERDICT_MAX + 1]; struct nft_chain *chain; int err; err = nla_parse_nested_deprecated(tb, NFTA_VERDICT_MAX, nla, nft_verdict_policy, NULL); if (err < 0) return err; if (!tb[NFTA_VERDICT_CODE]) return -EINVAL; /* zero padding hole for memcmp */ memset(data, 0, sizeof(*data));
data->verdict.code = ntohl(nla_get_be32(tb[NFTA_VERDICT_CODE])); switch (data->verdict.code) { case NF_ACCEPT: case NF_DROP: case NF_QUEUE: break; case NFT_CONTINUE: case NFT_BREAK: case NFT_RETURN: break; case NFT_JUMP: case NFT_GOTO: if (tb[NFTA_VERDICT_CHAIN]) { chain = nft_chain_lookup(ctx->net, ctx->table, tb[NFTA_VERDICT_CHAIN], genmask); } else if (tb[NFTA_VERDICT_CHAIN_ID]) { chain = nft_chain_lookup_byid(ctx->net, ctx->table, tb[NFTA_VERDICT_CHAIN_ID], genmask); if (IS_ERR(chain)) return PTR_ERR(chain); } else { return -EINVAL; } if (IS_ERR(chain)) return PTR_ERR(chain); if (nft_is_base_chain(chain)) return -EOPNOTSUPP; if (nft_chain_is_bound(chain)) return -EINVAL; if (desc->flags & NFT_DATA_DESC_SETELEM && chain->flags & NFT_CHAIN_BINDING) return -EINVAL; if (!nft_use_inc(&chain->use)) return -EMFILE; data->verdict.chain = chain; break; default: return -EINVAL; } desc->len = sizeof(data->verdict); return 0; } static void nft_verdict_uninit(const struct nft_data *data) { struct nft_chain *chain; switch (data->verdict.code) { case NFT_JUMP: case NFT_GOTO: chain = data->verdict.chain; nft_use_dec(&chain->use); break; } } int nft_verdict_dump(struct sk_buff *skb, int type, const struct nft_verdict *v) { struct nlattr *nest; nest = nla_nest_start_noflag(skb, type); if (!nest) goto nla_put_failure; if (nla_put_be32(skb, NFTA_VERDICT_CODE, htonl(v->code))) goto nla_put_failure; switch (v->code) { case NFT_JUMP: case NFT_GOTO: if (nla_put_string(skb, NFTA_VERDICT_CHAIN, v->chain->name)) goto nla_put_failure; } nla_nest_end(skb, nest); return 0; nla_put_failure: return -1; } static int nft_value_init(const struct nft_ctx *ctx, struct nft_data *data, struct nft_data_desc *desc, const struct nlattr *nla) { unsigned int len; len = nla_len(nla); if (len == 0) return -EINVAL; if (len > desc->size) return -EOVERFLOW; if (desc->len) { if (len != desc->len) return -EINVAL; } else { desc->len = len; } nla_memcpy(data->data, nla, len); return 0; } static int nft_value_dump(struct sk_buff *skb, const struct nft_data *data, unsigned int len) { return nla_put(skb, NFTA_DATA_VALUE, len, data->data); } static const struct nla_policy nft_data_policy[NFTA_DATA_MAX + 1] = { [NFTA_DATA_VALUE] = { .type = NLA_BINARY }, [NFTA_DATA_VERDICT] = { .type = NLA_NESTED }, }; /** * nft_data_init - parse nf_tables data netlink attributes * * @ctx: context of the expression using the data * @data: destination struct nft_data * @desc: data description * @nla: netlink attribute containing data * * Parse the netlink data attributes and initialize a struct nft_data. * The type and length of data are returned in the data description. * * The caller can indicate that it only wants to accept data of type * NFT_DATA_VALUE by passing NULL for the ctx argument. 
*/ int nft_data_init(const struct nft_ctx *ctx, struct nft_data *data, struct nft_data_desc *desc, const struct nlattr *nla) { struct nlattr *tb[NFTA_DATA_MAX + 1]; int err; if (WARN_ON_ONCE(!desc->size)) return -EINVAL; err = nla_parse_nested_deprecated(tb, NFTA_DATA_MAX, nla, nft_data_policy, NULL); if (err < 0) return err; if (tb[NFTA_DATA_VALUE]) { if (desc->type != NFT_DATA_VALUE) return -EINVAL; err = nft_value_init(ctx, data, desc, tb[NFTA_DATA_VALUE]); } else if (tb[NFTA_DATA_VERDICT] && ctx != NULL) { if (desc->type != NFT_DATA_VERDICT) return -EINVAL; err = nft_verdict_init(ctx, data, desc, tb[NFTA_DATA_VERDICT]); } else { err = -EINVAL; } return err; } EXPORT_SYMBOL_GPL(nft_data_init); /** * nft_data_release - release a nft_data item * * @data: struct nft_data to release * @type: type of data * * Release a nft_data item. NFT_DATA_VALUE types can be silently discarded, * all others need to be released by calling this function. */ void nft_data_release(const struct nft_data *data, enum nft_data_types type) { if (type < NFT_DATA_VERDICT) return; switch (type) { case NFT_DATA_VERDICT: return nft_verdict_uninit(data); default: WARN_ON(1); } } EXPORT_SYMBOL_GPL(nft_data_release); int nft_data_dump(struct sk_buff *skb, int attr, const struct nft_data *data, enum nft_data_types type, unsigned int len) { struct nlattr *nest; int err; nest = nla_nest_start_noflag(skb, attr); if (nest == NULL) return -1; switch (type) { case NFT_DATA_VALUE: err = nft_value_dump(skb, data, len); break; case NFT_DATA_VERDICT: err = nft_verdict_dump(skb, NFTA_DATA_VERDICT, &data->verdict); break; default: err = -EINVAL; WARN_ON(1); } nla_nest_end(skb, nest); return err; } EXPORT_SYMBOL_GPL(nft_data_dump); static void __nft_release_hook(struct net *net, struct nft_table *table) { struct nft_flowtable *flowtable; struct nft_chain *chain; list_for_each_entry(chain, &table->chains, list) __nf_tables_unregister_hook(net, table, chain, true); list_for_each_entry(flowtable, &table->flowtables, list) __nft_unregister_flowtable_net_hooks(net, flowtable, &flowtable->hook_list, true); } static void __nft_release_hooks(struct net *net) { struct nftables_pernet *nft_net = nft_pernet(net); struct nft_table *table; list_for_each_entry(table, &nft_net->tables, list) { if (nft_table_has_owner(table)) continue; __nft_release_hook(net, table); } } static void __nft_release_table(struct net *net, struct nft_table *table) { struct nft_flowtable *flowtable, *nf; struct nft_chain *chain, *nc; struct nft_object *obj, *ne; struct nft_rule *rule, *nr; struct nft_set *set, *ns; struct nft_ctx ctx = { .net = net, .family = NFPROTO_NETDEV, }; ctx.family = table->family; ctx.table = table; list_for_each_entry(chain, &table->chains, list) { if (nft_chain_binding(chain)) continue; ctx.chain = chain; list_for_each_entry_safe(rule, nr, &chain->rules, list) { list_del(&rule->list); nft_use_dec(&chain->use); nf_tables_rule_release(&ctx, rule); } } list_for_each_entry_safe(flowtable, nf, &table->flowtables, list) { list_del(&flowtable->list); nft_use_dec(&table->use); nf_tables_flowtable_destroy(flowtable); } list_for_each_entry_safe(set, ns, &table->sets, list) { list_del(&set->list); nft_use_dec(&table->use); if (set->flags & (NFT_SET_MAP | NFT_SET_OBJECT)) nft_map_deactivate(&ctx, set); nft_set_destroy(&ctx, set); } list_for_each_entry_safe(obj, ne, &table->objects, list) { nft_obj_del(obj); nft_use_dec(&table->use); nft_obj_destroy(&ctx, obj); } list_for_each_entry_safe(chain, nc, &table->chains, list) { nft_chain_del(chain); 
nft_use_dec(&table->use); nf_tables_chain_destroy(chain); } nf_tables_table_destroy(table); } static void __nft_release_tables(struct net *net) { struct nftables_pernet *nft_net = nft_pernet(net); struct nft_table *table, *nt; list_for_each_entry_safe(table, nt, &nft_net->tables, list) { if (nft_table_has_owner(table)) continue; list_del(&table->list); __nft_release_table(net, table); } } static int nft_rcv_nl_event(struct notifier_block *this, unsigned long event, void *ptr) { struct nft_table *table, *to_delete[8]; struct nftables_pernet *nft_net; struct netlink_notify *n = ptr; struct net *net = n->net; unsigned int deleted; bool restart = false; unsigned int gc_seq; if (event != NETLINK_URELEASE || n->protocol != NETLINK_NETFILTER) return NOTIFY_DONE; nft_net = nft_pernet(net); deleted = 0; mutex_lock(&nft_net->commit_mutex); gc_seq = nft_gc_seq_begin(nft_net); nf_tables_trans_destroy_flush_work(); again: list_for_each_entry(table, &nft_net->tables, list) { if (nft_table_has_owner(table) && n->portid == table->nlpid) { if (table->flags & NFT_TABLE_F_PERSIST) { table->flags &= ~NFT_TABLE_F_OWNER; continue; } __nft_release_hook(net, table); list_del_rcu(&table->list); to_delete[deleted++] = table; if (deleted >= ARRAY_SIZE(to_delete)) break; } } if (deleted) { restart = deleted >= ARRAY_SIZE(to_delete); synchronize_rcu(); while (deleted) __nft_release_table(net, to_delete[--deleted]); if (restart) goto again; } nft_gc_seq_end(nft_net, gc_seq); mutex_unlock(&nft_net->commit_mutex); return NOTIFY_DONE; } static struct notifier_block nft_nl_notifier = { .notifier_call = nft_rcv_nl_event, }; static int __net_init nf_tables_init_net(struct net *net) { struct nftables_pernet *nft_net = nft_pernet(net); INIT_LIST_HEAD(&nft_net->tables); INIT_LIST_HEAD(&nft_net->commit_list); INIT_LIST_HEAD(&nft_net->commit_set_list); INIT_LIST_HEAD(&nft_net->binding_list); INIT_LIST_HEAD(&nft_net->module_list); INIT_LIST_HEAD(&nft_net->notify_list); mutex_init(&nft_net->commit_mutex); nft_net->base_seq = 1; nft_net->gc_seq = 0; nft_net->validate_state = NFT_VALIDATE_SKIP; return 0; } static void __net_exit nf_tables_pre_exit_net(struct net *net) { struct nftables_pernet *nft_net = nft_pernet(net); mutex_lock(&nft_net->commit_mutex); __nft_release_hooks(net); mutex_unlock(&nft_net->commit_mutex); } static void __net_exit nf_tables_exit_net(struct net *net) { struct nftables_pernet *nft_net = nft_pernet(net); unsigned int gc_seq; mutex_lock(&nft_net->commit_mutex); gc_seq = nft_gc_seq_begin(nft_net); WARN_ON_ONCE(!list_empty(&nft_net->commit_list)); WARN_ON_ONCE(!list_empty(&nft_net->commit_set_list)); if (!list_empty(&nft_net->module_list)) nf_tables_module_autoload_cleanup(net); __nft_release_tables(net); nft_gc_seq_end(nft_net, gc_seq); mutex_unlock(&nft_net->commit_mutex); WARN_ON_ONCE(!list_empty(&nft_net->tables)); WARN_ON_ONCE(!list_empty(&nft_net->module_list)); WARN_ON_ONCE(!list_empty(&nft_net->notify_list)); } static void nf_tables_exit_batch(struct list_head *net_exit_list) { flush_work(&trans_gc_work); } static struct pernet_operations nf_tables_net_ops = { .init = nf_tables_init_net, .pre_exit = nf_tables_pre_exit_net, .exit = nf_tables_exit_net, .exit_batch = nf_tables_exit_batch, .id = &nf_tables_net_id, .size = sizeof(struct nftables_pernet), }; static int __init nf_tables_module_init(void) { int err; BUILD_BUG_ON(offsetof(struct nft_trans_table, nft_trans) != 0); BUILD_BUG_ON(offsetof(struct nft_trans_chain, nft_trans_binding.nft_trans) != 0); BUILD_BUG_ON(offsetof(struct nft_trans_rule, 
nft_trans) != 0); BUILD_BUG_ON(offsetof(struct nft_trans_set, nft_trans_binding.nft_trans) != 0); BUILD_BUG_ON(offsetof(struct nft_trans_elem, nft_trans) != 0); BUILD_BUG_ON(offsetof(struct nft_trans_obj, nft_trans) != 0); BUILD_BUG_ON(offsetof(struct nft_trans_flowtable, nft_trans) != 0); err = register_pernet_subsys(&nf_tables_net_ops); if (err < 0) return err; err = nft_chain_filter_init(); if (err < 0) goto err_chain_filter; err = nf_tables_core_module_init(); if (err < 0) goto err_core_module; err = register_netdevice_notifier(&nf_tables_flowtable_notifier); if (err < 0) goto err_netdev_notifier; err = rhltable_init(&nft_objname_ht, &nft_objname_ht_params); if (err < 0) goto err_rht_objname; err = nft_offload_init(); if (err < 0) goto err_offload; err = netlink_register_notifier(&nft_nl_notifier); if (err < 0) goto err_netlink_notifier; /* must be last */ err = nfnetlink_subsys_register(&nf_tables_subsys); if (err < 0) goto err_nfnl_subsys; nft_chain_route_init(); return err; err_nfnl_subsys: netlink_unregister_notifier(&nft_nl_notifier); err_netlink_notifier: nft_offload_exit(); err_offload: rhltable_destroy(&nft_objname_ht); err_rht_objname: unregister_netdevice_notifier(&nf_tables_flowtable_notifier); err_netdev_notifier: nf_tables_core_module_exit(); err_core_module: nft_chain_filter_fini(); err_chain_filter: unregister_pernet_subsys(&nf_tables_net_ops); return err; } static void __exit nf_tables_module_exit(void) { nfnetlink_subsys_unregister(&nf_tables_subsys); netlink_unregister_notifier(&nft_nl_notifier); nft_offload_exit(); unregister_netdevice_notifier(&nf_tables_flowtable_notifier); nft_chain_filter_fini(); nft_chain_route_fini(); nf_tables_trans_destroy_flush_work(); unregister_pernet_subsys(&nf_tables_net_ops); cancel_work_sync(&trans_gc_work); cancel_work_sync(&trans_destroy_work); rcu_barrier(); rhltable_destroy(&nft_objname_ht); nf_tables_core_module_exit(); } module_init(nf_tables_module_init); module_exit(nf_tables_module_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); MODULE_DESCRIPTION("Framework for packet filtering and classification"); MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_NFTABLES); |
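/*
 * Editor's note: the sketch below is not part of nf_tables_api.c. It is a
 * minimal, self-contained userspace illustration of the register
 * renumbering performed by nft_parse_register() and nft_dump_register()
 * above, assuming the usual UAPI conventions (16-byte legacy registers,
 * 4-byte 32-bit registers, NFT_REG32_00 == 8). The ex_* names are
 * hypothetical stand-ins, and the -ERANGE validation of out-of-range
 * numbers is omitted.
 */
#include <stdio.h>

#define EX_REG_SIZE	16	/* bytes in a legacy 128-bit register */
#define EX_REG32_SIZE	4	/* bytes in a 32-bit register */
#define EX_REG32_00	8	/* netlink number of the first 32-bit register */

/* parse direction: netlink register number -> internal 32-bit index */
static unsigned int ex_parse_register(unsigned int reg)
{
	if (reg <= 4)	/* NFT_REG_VERDICT..NFT_REG_4 */
		return reg * EX_REG_SIZE / EX_REG32_SIZE;
	return reg + EX_REG_SIZE / EX_REG32_SIZE - EX_REG32_00;
}

/* dump direction: internal 32-bit index -> netlink register number */
static unsigned int ex_dump_register(unsigned int reg)
{
	if (reg % (EX_REG_SIZE / EX_REG32_SIZE) == 0)
		return reg / (EX_REG_SIZE / EX_REG32_SIZE);
	return reg - EX_REG_SIZE / EX_REG32_SIZE + EX_REG32_00;
}

int main(void)
{
	/* NFT_REG_1 (netlink 1) becomes internal index 4 and dumps back as 1. */
	printf("1  -> %u -> %u\n", ex_parse_register(1),
	       ex_dump_register(ex_parse_register(1)));
	/* NFT_REG32_05 (netlink 13) becomes internal index 9 and round-trips. */
	printf("13 -> %u -> %u\n", ex_parse_register(13),
	       ex_dump_register(ex_parse_register(13)));
	/*
	 * NFT_REG32_00 (netlink 8) also maps to internal index 4, so it dumps
	 * back as the legacy number 1: this is the "multiple of 4"
	 * compatibility rule mentioned in the nft_dump_register() comment.
	 */
	printf("8  -> %u -> %u\n", ex_parse_register(8),
	       ex_dump_register(ex_parse_register(8)));
	return 0;
}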
// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2012 ARM Ltd. * Author: Marc Zyngier <marc.zyngier@arm.com> */ #include <linux/cpu.h> #include <linux/kvm.h> #include <linux/kvm_host.h> #include <linux/interrupt.h> #include <linux/irq.h> #include <linux/irqdomain.h> #include <linux/uaccess.h> #include <clocksource/arm_arch_timer.h> #include <asm/arch_timer.h> #include <asm/kvm_emulate.h> #include <asm/kvm_hyp.h> #include <asm/kvm_nested.h> #include <kvm/arm_vgic.h> #include <kvm/arm_arch_timer.h> #include "trace.h" static struct timecounter *timecounter; static unsigned int host_vtimer_irq; static unsigned int host_ptimer_irq; static u32 host_vtimer_irq_flags; static u32 host_ptimer_irq_flags; static DEFINE_STATIC_KEY_FALSE(has_gic_active_state); DEFINE_STATIC_KEY_FALSE(broken_cntvoff_key); static const u8 default_ppi[] = { [TIMER_PTIMER] = 30, [TIMER_VTIMER] = 27, [TIMER_HPTIMER] = 26, [TIMER_HVTIMER] = 28, }; static bool kvm_timer_irq_can_fire(struct arch_timer_context *timer_ctx); static void kvm_timer_update_irq(struct kvm_vcpu *vcpu, bool new_level, struct arch_timer_context *timer_ctx); static bool kvm_timer_should_fire(struct arch_timer_context *timer_ctx); static void kvm_arm_timer_write(struct kvm_vcpu *vcpu, struct arch_timer_context *timer, enum kvm_arch_timer_regs treg, u64 val); static u64 kvm_arm_timer_read(struct kvm_vcpu *vcpu, struct arch_timer_context *timer, enum kvm_arch_timer_regs treg); static bool kvm_arch_timer_get_input_level(int vintid); static struct irq_ops arch_timer_irq_ops = { .get_input_level = kvm_arch_timer_get_input_level, }; static int nr_timers(struct kvm_vcpu *vcpu) { if (!vcpu_has_nv(vcpu)) return NR_KVM_EL0_TIMERS; return NR_KVM_TIMERS; } u32 timer_get_ctl(struct arch_timer_context *ctxt) { struct kvm_vcpu *vcpu = ctxt->vcpu; switch(arch_timer_ctx_index(ctxt)) { case TIMER_VTIMER: return __vcpu_sys_reg(vcpu, CNTV_CTL_EL0); case TIMER_PTIMER: return __vcpu_sys_reg(vcpu, CNTP_CTL_EL0); case TIMER_HVTIMER: return __vcpu_sys_reg(vcpu, CNTHV_CTL_EL2); case TIMER_HPTIMER: return __vcpu_sys_reg(vcpu, CNTHP_CTL_EL2); default: WARN_ON(1); return 0; } } u64 timer_get_cval(struct arch_timer_context *ctxt) { struct kvm_vcpu *vcpu = ctxt->vcpu; switch(arch_timer_ctx_index(ctxt)) { case TIMER_VTIMER: return __vcpu_sys_reg(vcpu, CNTV_CVAL_EL0); case TIMER_PTIMER: return __vcpu_sys_reg(vcpu, CNTP_CVAL_EL0); case TIMER_HVTIMER: return __vcpu_sys_reg(vcpu,
CNTHV_CVAL_EL2); case TIMER_HPTIMER: return __vcpu_sys_reg(vcpu, CNTHP_CVAL_EL2); default: WARN_ON(1); return 0; } } static void timer_set_ctl(struct arch_timer_context *ctxt, u32 ctl) { struct kvm_vcpu *vcpu = ctxt->vcpu; switch(arch_timer_ctx_index(ctxt)) { case TIMER_VTIMER: __vcpu_sys_reg(vcpu, CNTV_CTL_EL0) = ctl; break; case TIMER_PTIMER: __vcpu_sys_reg(vcpu, CNTP_CTL_EL0) = ctl; break; case TIMER_HVTIMER: __vcpu_sys_reg(vcpu, CNTHV_CTL_EL2) = ctl; break; case TIMER_HPTIMER: __vcpu_sys_reg(vcpu, CNTHP_CTL_EL2) = ctl; break; default: WARN_ON(1); } } static void timer_set_cval(struct arch_timer_context *ctxt, u64 cval) { struct kvm_vcpu *vcpu = ctxt->vcpu; switch(arch_timer_ctx_index(ctxt)) { case TIMER_VTIMER: __vcpu_sys_reg(vcpu, CNTV_CVAL_EL0) = cval; break; case TIMER_PTIMER: __vcpu_sys_reg(vcpu, CNTP_CVAL_EL0) = cval; break; case TIMER_HVTIMER: __vcpu_sys_reg(vcpu, CNTHV_CVAL_EL2) = cval; break; case TIMER_HPTIMER: __vcpu_sys_reg(vcpu, CNTHP_CVAL_EL2) = cval; break; default: WARN_ON(1); } } static void timer_set_offset(struct arch_timer_context *ctxt, u64 offset) { if (!ctxt->offset.vm_offset) { WARN(offset, "timer %ld\n", arch_timer_ctx_index(ctxt)); return; } WRITE_ONCE(*ctxt->offset.vm_offset, offset); } u64 kvm_phys_timer_read(void) { return timecounter->cc->read(timecounter->cc); } void get_timer_map(struct kvm_vcpu *vcpu, struct timer_map *map) { if (vcpu_has_nv(vcpu)) { if (is_hyp_ctxt(vcpu)) { map->direct_vtimer = vcpu_hvtimer(vcpu); map->direct_ptimer = vcpu_hptimer(vcpu); map->emul_vtimer = vcpu_vtimer(vcpu); map->emul_ptimer = vcpu_ptimer(vcpu); } else { map->direct_vtimer = vcpu_vtimer(vcpu); map->direct_ptimer = vcpu_ptimer(vcpu); map->emul_vtimer = vcpu_hvtimer(vcpu); map->emul_ptimer = vcpu_hptimer(vcpu); } } else if (has_vhe()) { map->direct_vtimer = vcpu_vtimer(vcpu); map->direct_ptimer = vcpu_ptimer(vcpu); map->emul_vtimer = NULL; map->emul_ptimer = NULL; } else { map->direct_vtimer = vcpu_vtimer(vcpu); map->direct_ptimer = NULL; map->emul_vtimer = NULL; map->emul_ptimer = vcpu_ptimer(vcpu); } trace_kvm_get_timer_map(vcpu->vcpu_id, map); } static inline bool userspace_irqchip(struct kvm *kvm) { return unlikely(!irqchip_in_kernel(kvm)); } static void soft_timer_start(struct hrtimer *hrt, u64 ns) { hrtimer_start(hrt, ktime_add_ns(ktime_get(), ns), HRTIMER_MODE_ABS_HARD); } static void soft_timer_cancel(struct hrtimer *hrt) { hrtimer_cancel(hrt); } static irqreturn_t kvm_arch_timer_handler(int irq, void *dev_id) { struct kvm_vcpu *vcpu = *(struct kvm_vcpu **)dev_id; struct arch_timer_context *ctx; struct timer_map map; /* * We may see a timer interrupt after vcpu_put() has been called which * sets the CPU's vcpu pointer to NULL, because even though the timer * has been disabled in timer_save_state(), the hardware interrupt * signal may not have been retired from the interrupt controller yet. 
*/ if (!vcpu) return IRQ_HANDLED; get_timer_map(vcpu, &map); if (irq == host_vtimer_irq) ctx = map.direct_vtimer; else ctx = map.direct_ptimer; if (kvm_timer_should_fire(ctx)) kvm_timer_update_irq(vcpu, true, ctx); if (userspace_irqchip(vcpu->kvm) && !static_branch_unlikely(&has_gic_active_state)) disable_percpu_irq(host_vtimer_irq); return IRQ_HANDLED; } static u64 kvm_counter_compute_delta(struct arch_timer_context *timer_ctx, u64 val) { u64 now = kvm_phys_timer_read() - timer_get_offset(timer_ctx); if (now < val) { u64 ns; ns = cyclecounter_cyc2ns(timecounter->cc, val - now, timecounter->mask, &timer_ctx->ns_frac); return ns; } return 0; } static u64 kvm_timer_compute_delta(struct arch_timer_context *timer_ctx) { return kvm_counter_compute_delta(timer_ctx, timer_get_cval(timer_ctx)); } static bool kvm_timer_irq_can_fire(struct arch_timer_context *timer_ctx) { WARN_ON(timer_ctx && timer_ctx->loaded); return timer_ctx && ((timer_get_ctl(timer_ctx) & (ARCH_TIMER_CTRL_IT_MASK | ARCH_TIMER_CTRL_ENABLE)) == ARCH_TIMER_CTRL_ENABLE); } static bool vcpu_has_wfit_active(struct kvm_vcpu *vcpu) { return (cpus_have_final_cap(ARM64_HAS_WFXT) && vcpu_get_flag(vcpu, IN_WFIT)); } static u64 wfit_delay_ns(struct kvm_vcpu *vcpu) { u64 val = vcpu_get_reg(vcpu, kvm_vcpu_sys_get_rt(vcpu)); struct arch_timer_context *ctx; ctx = is_hyp_ctxt(vcpu) ? vcpu_hvtimer(vcpu) : vcpu_vtimer(vcpu); return kvm_counter_compute_delta(ctx, val); } /* * Returns the earliest expiration time in ns among guest timers. * Note that it will return 0 if none of timers can fire. */ static u64 kvm_timer_earliest_exp(struct kvm_vcpu *vcpu) { u64 min_delta = ULLONG_MAX; int i; for (i = 0; i < nr_timers(vcpu); i++) { struct arch_timer_context *ctx = &vcpu->arch.timer_cpu.timers[i]; WARN(ctx->loaded, "timer %d loaded\n", i); if (kvm_timer_irq_can_fire(ctx)) min_delta = min(min_delta, kvm_timer_compute_delta(ctx)); } if (vcpu_has_wfit_active(vcpu)) min_delta = min(min_delta, wfit_delay_ns(vcpu)); /* If none of timers can fire, then return 0 */ if (min_delta == ULLONG_MAX) return 0; return min_delta; } static enum hrtimer_restart kvm_bg_timer_expire(struct hrtimer *hrt) { struct arch_timer_cpu *timer; struct kvm_vcpu *vcpu; u64 ns; timer = container_of(hrt, struct arch_timer_cpu, bg_timer); vcpu = container_of(timer, struct kvm_vcpu, arch.timer_cpu); /* * Check that the timer has really expired from the guest's * PoV (NTP on the host may have forced it to expire * early). If we should have slept longer, restart it. */ ns = kvm_timer_earliest_exp(vcpu); if (unlikely(ns)) { hrtimer_forward_now(hrt, ns_to_ktime(ns)); return HRTIMER_RESTART; } kvm_vcpu_wake_up(vcpu); return HRTIMER_NORESTART; } static enum hrtimer_restart kvm_hrtimer_expire(struct hrtimer *hrt) { struct arch_timer_context *ctx; struct kvm_vcpu *vcpu; u64 ns; ctx = container_of(hrt, struct arch_timer_context, hrtimer); vcpu = ctx->vcpu; trace_kvm_timer_hrtimer_expire(ctx); /* * Check that the timer has really expired from the guest's * PoV (NTP on the host may have forced it to expire * early). If not ready, schedule for a later time. 
*/ ns = kvm_timer_compute_delta(ctx); if (unlikely(ns)) { hrtimer_forward_now(hrt, ns_to_ktime(ns)); return HRTIMER_RESTART; } kvm_timer_update_irq(vcpu, true, ctx); return HRTIMER_NORESTART; } static bool kvm_timer_should_fire(struct arch_timer_context *timer_ctx) { enum kvm_arch_timers index; u64 cval, now; if (!timer_ctx) return false; index = arch_timer_ctx_index(timer_ctx); if (timer_ctx->loaded) { u32 cnt_ctl = 0; switch (index) { case TIMER_VTIMER: case TIMER_HVTIMER: cnt_ctl = read_sysreg_el0(SYS_CNTV_CTL); break; case TIMER_PTIMER: case TIMER_HPTIMER: cnt_ctl = read_sysreg_el0(SYS_CNTP_CTL); break; case NR_KVM_TIMERS: /* GCC is braindead */ cnt_ctl = 0; break; } return (cnt_ctl & ARCH_TIMER_CTRL_ENABLE) && (cnt_ctl & ARCH_TIMER_CTRL_IT_STAT) && !(cnt_ctl & ARCH_TIMER_CTRL_IT_MASK); } if (!kvm_timer_irq_can_fire(timer_ctx)) return false; cval = timer_get_cval(timer_ctx); now = kvm_phys_timer_read() - timer_get_offset(timer_ctx); return cval <= now; } int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu) { return vcpu_has_wfit_active(vcpu) && wfit_delay_ns(vcpu) == 0; } /* * Reflect the timer output level into the kvm_run structure */ void kvm_timer_update_run(struct kvm_vcpu *vcpu) { struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); struct arch_timer_context *ptimer = vcpu_ptimer(vcpu); struct kvm_sync_regs *regs = &vcpu->run->s.regs; /* Populate the device bitmap with the timer states */ regs->device_irq_level &= ~(KVM_ARM_DEV_EL1_VTIMER | KVM_ARM_DEV_EL1_PTIMER); if (kvm_timer_should_fire(vtimer)) regs->device_irq_level |= KVM_ARM_DEV_EL1_VTIMER; if (kvm_timer_should_fire(ptimer)) regs->device_irq_level |= KVM_ARM_DEV_EL1_PTIMER; } static void kvm_timer_update_status(struct arch_timer_context *ctx, bool level) { /* * Paper over NV2 brokenness by publishing the interrupt status * bit. This still results in a poor quality of emulation (guest * writes will have no effect until the next exit). * * But hey, it's fast, right? */ if (is_hyp_ctxt(ctx->vcpu) && (ctx == vcpu_vtimer(ctx->vcpu) || ctx == vcpu_ptimer(ctx->vcpu))) { unsigned long val = timer_get_ctl(ctx); __assign_bit(__ffs(ARCH_TIMER_CTRL_IT_STAT), &val, level); timer_set_ctl(ctx, val); } } static void kvm_timer_update_irq(struct kvm_vcpu *vcpu, bool new_level, struct arch_timer_context *timer_ctx) { kvm_timer_update_status(timer_ctx, new_level); timer_ctx->irq.level = new_level; trace_kvm_timer_update_irq(vcpu->vcpu_id, timer_irq(timer_ctx), timer_ctx->irq.level); if (userspace_irqchip(vcpu->kvm)) return; kvm_vgic_inject_irq(vcpu->kvm, vcpu, timer_irq(timer_ctx), timer_ctx->irq.level, timer_ctx); } /* Only called for a fully emulated timer */ static void timer_emulate(struct arch_timer_context *ctx) { bool should_fire = kvm_timer_should_fire(ctx); trace_kvm_timer_emulate(ctx, should_fire); if (should_fire != ctx->irq.level) kvm_timer_update_irq(ctx->vcpu, should_fire, ctx); kvm_timer_update_status(ctx, should_fire); /* * If the timer can fire now, we don't need to have a soft timer * scheduled for the future. If the timer cannot fire at all, * then we also don't need a soft timer. 
*/ if (should_fire || !kvm_timer_irq_can_fire(ctx)) return; soft_timer_start(&ctx->hrtimer, kvm_timer_compute_delta(ctx)); } static void set_cntvoff(u64 cntvoff) { kvm_call_hyp(__kvm_timer_set_cntvoff, cntvoff); } static void set_cntpoff(u64 cntpoff) { if (has_cntpoff()) write_sysreg_s(cntpoff, SYS_CNTPOFF_EL2); } static void timer_save_state(struct arch_timer_context *ctx) { struct arch_timer_cpu *timer = vcpu_timer(ctx->vcpu); enum kvm_arch_timers index = arch_timer_ctx_index(ctx); unsigned long flags; if (!timer->enabled) return; local_irq_save(flags); if (!ctx->loaded) goto out; switch (index) { u64 cval; case TIMER_VTIMER: case TIMER_HVTIMER: timer_set_ctl(ctx, read_sysreg_el0(SYS_CNTV_CTL)); cval = read_sysreg_el0(SYS_CNTV_CVAL); if (has_broken_cntvoff()) cval -= timer_get_offset(ctx); timer_set_cval(ctx, cval); /* Disable the timer */ write_sysreg_el0(0, SYS_CNTV_CTL); isb(); /* * The kernel may decide to run userspace after * calling vcpu_put, so we reset cntvoff to 0 to * ensure a consistent read between user accesses to * the virtual counter and kernel access to the * physical counter of non-VHE case. * * For VHE, the virtual counter uses a fixed virtual * offset of zero, so no need to zero CNTVOFF_EL2 * register, but this is actually useful when switching * between EL1/vEL2 with NV. * * Do it unconditionally, as this is either unavoidable * or dirt cheap. */ set_cntvoff(0); break; case TIMER_PTIMER: case TIMER_HPTIMER: timer_set_ctl(ctx, read_sysreg_el0(SYS_CNTP_CTL)); cval = read_sysreg_el0(SYS_CNTP_CVAL); cval -= timer_get_offset(ctx); timer_set_cval(ctx, cval); /* Disable the timer */ write_sysreg_el0(0, SYS_CNTP_CTL); isb(); set_cntpoff(0); break; case NR_KVM_TIMERS: BUG(); } trace_kvm_timer_save_state(ctx); ctx->loaded = false; out: local_irq_restore(flags); } /* * Schedule the background timer before calling kvm_vcpu_halt, so that this * thread is removed from its waitqueue and made runnable when there's a timer * interrupt to handle. */ static void kvm_timer_blocking(struct kvm_vcpu *vcpu) { struct arch_timer_cpu *timer = vcpu_timer(vcpu); struct timer_map map; get_timer_map(vcpu, &map); /* * If no timers are capable of raising interrupts (disabled or * masked), then there's no more work for us to do. */ if (!kvm_timer_irq_can_fire(map.direct_vtimer) && !kvm_timer_irq_can_fire(map.direct_ptimer) && !kvm_timer_irq_can_fire(map.emul_vtimer) && !kvm_timer_irq_can_fire(map.emul_ptimer) && !vcpu_has_wfit_active(vcpu)) return; /* * At least one guest time will expire. Schedule a background timer. * Set the earliest expiration time among the guest timers. 
*/ soft_timer_start(&timer->bg_timer, kvm_timer_earliest_exp(vcpu)); } static void kvm_timer_unblocking(struct kvm_vcpu *vcpu) { struct arch_timer_cpu *timer = vcpu_timer(vcpu); soft_timer_cancel(&timer->bg_timer); } static void timer_restore_state(struct arch_timer_context *ctx) { struct arch_timer_cpu *timer = vcpu_timer(ctx->vcpu); enum kvm_arch_timers index = arch_timer_ctx_index(ctx); unsigned long flags; if (!timer->enabled) return; local_irq_save(flags); if (ctx->loaded) goto out; switch (index) { u64 cval, offset; case TIMER_VTIMER: case TIMER_HVTIMER: cval = timer_get_cval(ctx); offset = timer_get_offset(ctx); if (has_broken_cntvoff()) { set_cntvoff(0); cval += offset; } else { set_cntvoff(offset); } write_sysreg_el0(cval, SYS_CNTV_CVAL); isb(); write_sysreg_el0(timer_get_ctl(ctx), SYS_CNTV_CTL); break; case TIMER_PTIMER: case TIMER_HPTIMER: cval = timer_get_cval(ctx); offset = timer_get_offset(ctx); set_cntpoff(offset); cval += offset; write_sysreg_el0(cval, SYS_CNTP_CVAL); isb(); write_sysreg_el0(timer_get_ctl(ctx), SYS_CNTP_CTL); break; case NR_KVM_TIMERS: BUG(); } trace_kvm_timer_restore_state(ctx); ctx->loaded = true; out: local_irq_restore(flags); } static inline void set_timer_irq_phys_active(struct arch_timer_context *ctx, bool active) { int r; r = irq_set_irqchip_state(ctx->host_timer_irq, IRQCHIP_STATE_ACTIVE, active); WARN_ON(r); } static void kvm_timer_vcpu_load_gic(struct arch_timer_context *ctx) { struct kvm_vcpu *vcpu = ctx->vcpu; bool phys_active = false; /* * Update the timer output so that it is likely to match the * state we're about to restore. If the timer expires between * this point and the register restoration, we'll take the * interrupt anyway. */ kvm_timer_update_irq(ctx->vcpu, kvm_timer_should_fire(ctx), ctx); if (irqchip_in_kernel(vcpu->kvm)) phys_active = kvm_vgic_map_is_active(vcpu, timer_irq(ctx)); phys_active |= ctx->irq.level; set_timer_irq_phys_active(ctx, phys_active); } static void kvm_timer_vcpu_load_nogic(struct kvm_vcpu *vcpu) { struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); /* * Update the timer output so that it is likely to match the * state we're about to restore. If the timer expires between * this point and the register restoration, we'll take the * interrupt anyway. */ kvm_timer_update_irq(vcpu, kvm_timer_should_fire(vtimer), vtimer); /* * When using a userspace irqchip with the architected timers and a * host interrupt controller that doesn't support an active state, we * must still prevent continuously exiting from the guest, and * therefore mask the physical interrupt by disabling it on the host * interrupt controller when the virtual level is high, such that the * guest can make forward progress. Once we detect the output level * being de-asserted, we unmask the interrupt again so that we exit * from the guest when the timer fires. */ if (vtimer->irq.level) disable_percpu_irq(host_vtimer_irq); else enable_percpu_irq(host_vtimer_irq, host_vtimer_irq_flags); } /* If _pred is true, set bit in _set, otherwise set it in _clr */ #define assign_clear_set_bit(_pred, _bit, _clr, _set) \ do { \ if (_pred) \ (_set) |= (_bit); \ else \ (_clr) |= (_bit); \ } while (0) static void kvm_timer_vcpu_load_nested_switch(struct kvm_vcpu *vcpu, struct timer_map *map) { int hw, ret; if (!irqchip_in_kernel(vcpu->kvm)) return; /* * We only ever unmap the vtimer irq on a VHE system that runs nested * virtualization, in which case we have both a valid emul_vtimer, * emul_ptimer, direct_vtimer, and direct_ptimer. 
* * Since this is called from kvm_timer_vcpu_load(), a change between * vEL2 and vEL1/0 will have just happened, and the timer_map will * represent this, and therefore we switch the emul/direct mappings * below. */ hw = kvm_vgic_get_map(vcpu, timer_irq(map->direct_vtimer)); if (hw < 0) { kvm_vgic_unmap_phys_irq(vcpu, timer_irq(map->emul_vtimer)); kvm_vgic_unmap_phys_irq(vcpu, timer_irq(map->emul_ptimer)); ret = kvm_vgic_map_phys_irq(vcpu, map->direct_vtimer->host_timer_irq, timer_irq(map->direct_vtimer), &arch_timer_irq_ops); WARN_ON_ONCE(ret); ret = kvm_vgic_map_phys_irq(vcpu, map->direct_ptimer->host_timer_irq, timer_irq(map->direct_ptimer), &arch_timer_irq_ops); WARN_ON_ONCE(ret); } } static void timer_set_traps(struct kvm_vcpu *vcpu, struct timer_map *map) { bool tvt, tpt, tvc, tpc, tvt02, tpt02; u64 clr, set; /* * No trapping gets configured here with nVHE. See * __timer_enable_traps(), which is where the stuff happens. */ if (!has_vhe()) return; /* * Our default policy is not to trap anything. As we progress * within this function, reality kicks in and we start adding * traps based on emulation requirements. */ tvt = tpt = tvc = tpc = false; tvt02 = tpt02 = false; /* * NV2 badly breaks the timer semantics by redirecting accesses to * the EL1 timer state to memory, so let's call ECV to the rescue if * available: we trap all CNT{P,V}_{CTL,CVAL,TVAL}_EL0 accesses. * * The treatment slightly varies depending whether we run a nVHE or * VHE guest: nVHE will use the _EL0 registers directly, while VHE * will use the _EL02 accessors. This translates in different trap * bits. * * None of the trapping is required when running in non-HYP context, * unless required by the L1 hypervisor settings once we advertise * ECV+NV in the guest, or that we need trapping for other reasons. */ if (cpus_have_final_cap(ARM64_HAS_ECV) && is_hyp_ctxt(vcpu)) { if (vcpu_el2_e2h_is_set(vcpu)) tvt02 = tpt02 = true; else tvt = tpt = true; } /* * We have two possibility to deal with a physical offset: * * - Either we have CNTPOFF (yay!) or the offset is 0: * we let the guest freely access the HW * * - or neither of these condition apply: * we trap accesses to the HW, but still use it * after correcting the physical offset */ if (!has_cntpoff() && timer_get_offset(map->direct_ptimer)) tpt = tpc = true; /* * For the poor sods that could not correctly substract one value * from another, trap the full virtual timer and counter. */ if (has_broken_cntvoff() && timer_get_offset(map->direct_vtimer)) tvt = tvc = true; /* * Apply the enable bits that the guest hypervisor has requested for * its own guest. We can only add traps that wouldn't have been set * above. * Implementation choices: we do not support NV when E2H=0 in the * guest, and we don't support configuration where E2H is writable * by the guest (either FEAT_VHE or FEAT_E2H0 is implemented, but * not both). This simplifies the handling of the EL1NV* bits. */ if (vcpu_has_nv(vcpu) && !is_hyp_ctxt(vcpu)) { u64 val = __vcpu_sys_reg(vcpu, CNTHCTL_EL2); /* Use the VHE format for mental sanity */ if (!vcpu_el2_e2h_is_set(vcpu)) val = (val & (CNTHCTL_EL1PCEN | CNTHCTL_EL1PCTEN)) << 10; tpt |= !(val & (CNTHCTL_EL1PCEN << 10)); tpc |= !(val & (CNTHCTL_EL1PCTEN << 10)); tpt02 |= (val & CNTHCTL_EL1NVPCT); tvt02 |= (val & CNTHCTL_EL1NVVCT); } /* * Now that we have collected our requirements, compute the * trap and enable bits. 
*/ set = 0; clr = 0; assign_clear_set_bit(tpt, CNTHCTL_EL1PCEN << 10, set, clr); assign_clear_set_bit(tpc, CNTHCTL_EL1PCTEN << 10, set, clr); assign_clear_set_bit(tvt, CNTHCTL_EL1TVT, clr, set); assign_clear_set_bit(tvc, CNTHCTL_EL1TVCT, clr, set); assign_clear_set_bit(tvt02, CNTHCTL_EL1NVVCT, clr, set); assign_clear_set_bit(tpt02, CNTHCTL_EL1NVPCT, clr, set); /* This only happens on VHE, so use the CNTHCTL_EL2 accessor. */ sysreg_clear_set(cnthctl_el2, clr, set); } void kvm_timer_vcpu_load(struct kvm_vcpu *vcpu) { struct arch_timer_cpu *timer = vcpu_timer(vcpu); struct timer_map map; if (unlikely(!timer->enabled)) return; get_timer_map(vcpu, &map); if (static_branch_likely(&has_gic_active_state)) { if (vcpu_has_nv(vcpu)) kvm_timer_vcpu_load_nested_switch(vcpu, &map); kvm_timer_vcpu_load_gic(map.direct_vtimer); if (map.direct_ptimer) kvm_timer_vcpu_load_gic(map.direct_ptimer); } else { kvm_timer_vcpu_load_nogic(vcpu); } kvm_timer_unblocking(vcpu); timer_restore_state(map.direct_vtimer); if (map.direct_ptimer) timer_restore_state(map.direct_ptimer); if (map.emul_vtimer) timer_emulate(map.emul_vtimer); if (map.emul_ptimer) timer_emulate(map.emul_ptimer); timer_set_traps(vcpu, &map); } bool kvm_timer_should_notify_user(struct kvm_vcpu *vcpu) { struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); struct arch_timer_context *ptimer = vcpu_ptimer(vcpu); struct kvm_sync_regs *sregs = &vcpu->run->s.regs; bool vlevel, plevel; if (likely(irqchip_in_kernel(vcpu->kvm))) return false; vlevel = sregs->device_irq_level & KVM_ARM_DEV_EL1_VTIMER; plevel = sregs->device_irq_level & KVM_ARM_DEV_EL1_PTIMER; return kvm_timer_should_fire(vtimer) != vlevel || kvm_timer_should_fire(ptimer) != plevel; } void kvm_timer_vcpu_put(struct kvm_vcpu *vcpu) { struct arch_timer_cpu *timer = vcpu_timer(vcpu); struct timer_map map; if (unlikely(!timer->enabled)) return; get_timer_map(vcpu, &map); timer_save_state(map.direct_vtimer); if (map.direct_ptimer) timer_save_state(map.direct_ptimer); /* * Cancel soft timer emulation, because the only case where we * need it after a vcpu_put is in the context of a sleeping VCPU, and * in that case we already factor in the deadline for the physical * timer when scheduling the bg_timer. * * In any case, we re-schedule the hrtimer for the physical timer when * coming back to the VCPU thread in kvm_timer_vcpu_load(). */ if (map.emul_vtimer) soft_timer_cancel(&map.emul_vtimer->hrtimer); if (map.emul_ptimer) soft_timer_cancel(&map.emul_ptimer->hrtimer); if (kvm_vcpu_is_blocking(vcpu)) kvm_timer_blocking(vcpu); } void kvm_timer_sync_nested(struct kvm_vcpu *vcpu) { /* * When NV2 is on, guest hypervisors have their EL1 timer register * accesses redirected to the VNCR page. Any guest action taken on * the timer is postponed until the next exit, leading to a very * poor quality of emulation. * * This is an unmitigated disaster, only papered over by FEAT_ECV, * which allows trapping of the timer registers even with NV2. * Still, this is still worse than FEAT_NV on its own. Meh. */ if (!cpus_have_final_cap(ARM64_HAS_ECV)) { /* * For a VHE guest hypervisor, the EL2 state is directly * stored in the host EL1 timers, while the emulated EL1 * state is stored in the VNCR page. The latter could have * been updated behind our back, and we must reset the * emulation of the timers. * * A non-VHE guest hypervisor doesn't have any direct access * to its timers: the EL2 registers trap despite being * notionally direct (we use the EL1 HW, as for VHE), while * the EL1 registers access memory. 
* * In both cases, process the emulated timers on each guest * exit. Boo. */ struct timer_map map; get_timer_map(vcpu, &map); soft_timer_cancel(&map.emul_vtimer->hrtimer); soft_timer_cancel(&map.emul_ptimer->hrtimer); timer_emulate(map.emul_vtimer); timer_emulate(map.emul_ptimer); } } /* * With a userspace irqchip we have to check if the guest de-asserted the * timer and if so, unmask the timer irq signal on the host interrupt * controller to ensure that we see future timer signals. */ static void unmask_vtimer_irq_user(struct kvm_vcpu *vcpu) { struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); if (!kvm_timer_should_fire(vtimer)) { kvm_timer_update_irq(vcpu, false, vtimer); if (static_branch_likely(&has_gic_active_state)) set_timer_irq_phys_active(vtimer, false); else enable_percpu_irq(host_vtimer_irq, host_vtimer_irq_flags); } } void kvm_timer_sync_user(struct kvm_vcpu *vcpu) { struct arch_timer_cpu *timer = vcpu_timer(vcpu); if (unlikely(!timer->enabled)) return; if (unlikely(!irqchip_in_kernel(vcpu->kvm))) unmask_vtimer_irq_user(vcpu); } void kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu) { struct arch_timer_cpu *timer = vcpu_timer(vcpu); struct timer_map map; get_timer_map(vcpu, &map); /* * The bits in CNTV_CTL are architecturally reset to UNKNOWN for ARMv8 * and to 0 for ARMv7. We provide an implementation that always * resets the timer to be disabled and unmasked and is compliant with * the ARMv7 architecture. */ for (int i = 0; i < nr_timers(vcpu); i++) timer_set_ctl(vcpu_get_timer(vcpu, i), 0); /* * A vcpu running at EL2 is in charge of the offset applied to * the virtual timer, so use the physical VM offset, and point * the vcpu offset to CNTVOFF_EL2. */ if (vcpu_has_nv(vcpu)) { struct arch_timer_offset *offs = &vcpu_vtimer(vcpu)->offset; offs->vcpu_offset = &__vcpu_sys_reg(vcpu, CNTVOFF_EL2); offs->vm_offset = &vcpu->kvm->arch.timer_data.poffset; } if (timer->enabled) { for (int i = 0; i < nr_timers(vcpu); i++) kvm_timer_update_irq(vcpu, false, vcpu_get_timer(vcpu, i)); if (irqchip_in_kernel(vcpu->kvm)) { kvm_vgic_reset_mapped_irq(vcpu, timer_irq(map.direct_vtimer)); if (map.direct_ptimer) kvm_vgic_reset_mapped_irq(vcpu, timer_irq(map.direct_ptimer)); } } if (map.emul_vtimer) soft_timer_cancel(&map.emul_vtimer->hrtimer); if (map.emul_ptimer) soft_timer_cancel(&map.emul_ptimer->hrtimer); } static void timer_context_init(struct kvm_vcpu *vcpu, int timerid) { struct arch_timer_context *ctxt = vcpu_get_timer(vcpu, timerid); struct kvm *kvm = vcpu->kvm; ctxt->vcpu = vcpu; if (timerid == TIMER_VTIMER) ctxt->offset.vm_offset = &kvm->arch.timer_data.voffset; else ctxt->offset.vm_offset = &kvm->arch.timer_data.poffset; hrtimer_init(&ctxt->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD); ctxt->hrtimer.function = kvm_hrtimer_expire; switch (timerid) { case TIMER_PTIMER: case TIMER_HPTIMER: ctxt->host_timer_irq = host_ptimer_irq; break; case TIMER_VTIMER: case TIMER_HVTIMER: ctxt->host_timer_irq = host_vtimer_irq; break; } } void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu) { struct arch_timer_cpu *timer = vcpu_timer(vcpu); for (int i = 0; i < NR_KVM_TIMERS; i++) timer_context_init(vcpu, i); /* Synchronize offsets across timers of a VM if not already provided */ if (!test_bit(KVM_ARCH_FLAG_VM_COUNTER_OFFSET, &vcpu->kvm->arch.flags)) { timer_set_offset(vcpu_vtimer(vcpu), kvm_phys_timer_read()); timer_set_offset(vcpu_ptimer(vcpu), 0); } hrtimer_init(&timer->bg_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD); timer->bg_timer.function = kvm_bg_timer_expire; } void kvm_timer_init_vm(struct kvm 
*kvm) { for (int i = 0; i < NR_KVM_TIMERS; i++) kvm->arch.timer_data.ppi[i] = default_ppi[i]; } void kvm_timer_cpu_up(void) { enable_percpu_irq(host_vtimer_irq, host_vtimer_irq_flags); if (host_ptimer_irq) enable_percpu_irq(host_ptimer_irq, host_ptimer_irq_flags); } void kvm_timer_cpu_down(void) { disable_percpu_irq(host_vtimer_irq); if (host_ptimer_irq) disable_percpu_irq(host_ptimer_irq); } int kvm_arm_timer_set_reg(struct kvm_vcpu *vcpu, u64 regid, u64 value) { struct arch_timer_context *timer; switch (regid) { case KVM_REG_ARM_TIMER_CTL: timer = vcpu_vtimer(vcpu); kvm_arm_timer_write(vcpu, timer, TIMER_REG_CTL, value); break; case KVM_REG_ARM_TIMER_CNT: if (!test_bit(KVM_ARCH_FLAG_VM_COUNTER_OFFSET, &vcpu->kvm->arch.flags)) { timer = vcpu_vtimer(vcpu); timer_set_offset(timer, kvm_phys_timer_read() - value); } break; case KVM_REG_ARM_TIMER_CVAL: timer = vcpu_vtimer(vcpu); kvm_arm_timer_write(vcpu, timer, TIMER_REG_CVAL, value); break; case KVM_REG_ARM_PTIMER_CTL: timer = vcpu_ptimer(vcpu); kvm_arm_timer_write(vcpu, timer, TIMER_REG_CTL, value); break; case KVM_REG_ARM_PTIMER_CNT: if (!test_bit(KVM_ARCH_FLAG_VM_COUNTER_OFFSET, &vcpu->kvm->arch.flags)) { timer = vcpu_ptimer(vcpu); timer_set_offset(timer, kvm_phys_timer_read() - value); } break; case KVM_REG_ARM_PTIMER_CVAL: timer = vcpu_ptimer(vcpu); kvm_arm_timer_write(vcpu, timer, TIMER_REG_CVAL, value); break; default: return -1; } return 0; } static u64 read_timer_ctl(struct arch_timer_context *timer) { /* * Set ISTATUS bit if it's expired. * Note that according to ARMv8 ARM Issue A.k, ISTATUS bit is * UNKNOWN when ENABLE bit is 0, so we chose to set ISTATUS bit * regardless of ENABLE bit for our implementation convenience. */ u32 ctl = timer_get_ctl(timer); if (!kvm_timer_compute_delta(timer)) ctl |= ARCH_TIMER_CTRL_IT_STAT; return ctl; } u64 kvm_arm_timer_get_reg(struct kvm_vcpu *vcpu, u64 regid) { switch (regid) { case KVM_REG_ARM_TIMER_CTL: return kvm_arm_timer_read(vcpu, vcpu_vtimer(vcpu), TIMER_REG_CTL); case KVM_REG_ARM_TIMER_CNT: return kvm_arm_timer_read(vcpu, vcpu_vtimer(vcpu), TIMER_REG_CNT); case KVM_REG_ARM_TIMER_CVAL: return kvm_arm_timer_read(vcpu, vcpu_vtimer(vcpu), TIMER_REG_CVAL); case KVM_REG_ARM_PTIMER_CTL: return kvm_arm_timer_read(vcpu, vcpu_ptimer(vcpu), TIMER_REG_CTL); case KVM_REG_ARM_PTIMER_CNT: return kvm_arm_timer_read(vcpu, vcpu_ptimer(vcpu), TIMER_REG_CNT); case KVM_REG_ARM_PTIMER_CVAL: return kvm_arm_timer_read(vcpu, vcpu_ptimer(vcpu), TIMER_REG_CVAL); } return (u64)-1; } static u64 kvm_arm_timer_read(struct kvm_vcpu *vcpu, struct arch_timer_context *timer, enum kvm_arch_timer_regs treg) { u64 val; switch (treg) { case TIMER_REG_TVAL: val = timer_get_cval(timer) - kvm_phys_timer_read() + timer_get_offset(timer); val = lower_32_bits(val); break; case TIMER_REG_CTL: val = read_timer_ctl(timer); break; case TIMER_REG_CVAL: val = timer_get_cval(timer); break; case TIMER_REG_CNT: val = kvm_phys_timer_read() - timer_get_offset(timer); break; case TIMER_REG_VOFF: val = *timer->offset.vcpu_offset; break; default: BUG(); } return val; } u64 kvm_arm_timer_read_sysreg(struct kvm_vcpu *vcpu, enum kvm_arch_timers tmr, enum kvm_arch_timer_regs treg) { struct arch_timer_context *timer; struct timer_map map; u64 val; get_timer_map(vcpu, &map); timer = vcpu_get_timer(vcpu, tmr); if (timer == map.emul_vtimer || timer == map.emul_ptimer) return kvm_arm_timer_read(vcpu, timer, treg); preempt_disable(); timer_save_state(timer); val = kvm_arm_timer_read(vcpu, timer, treg); timer_restore_state(timer); preempt_enable(); return 
val; } static void kvm_arm_timer_write(struct kvm_vcpu *vcpu, struct arch_timer_context *timer, enum kvm_arch_timer_regs treg, u64 val) { switch (treg) { case TIMER_REG_TVAL: timer_set_cval(timer, kvm_phys_timer_read() - timer_get_offset(timer) + (s32)val); break; case TIMER_REG_CTL: timer_set_ctl(timer, val & ~ARCH_TIMER_CTRL_IT_STAT); break; case TIMER_REG_CVAL: timer_set_cval(timer, val); break; case TIMER_REG_VOFF: *timer->offset.vcpu_offset = val; break; default: BUG(); } } void kvm_arm_timer_write_sysreg(struct kvm_vcpu *vcpu, enum kvm_arch_timers tmr, enum kvm_arch_timer_regs treg, u64 val) { struct arch_timer_context *timer; struct timer_map map; get_timer_map(vcpu, &map); timer = vcpu_get_timer(vcpu, tmr); if (timer == map.emul_vtimer || timer == map.emul_ptimer) { soft_timer_cancel(&timer->hrtimer); kvm_arm_timer_write(vcpu, timer, treg, val); timer_emulate(timer); } else { preempt_disable(); timer_save_state(timer); kvm_arm_timer_write(vcpu, timer, treg, val); timer_restore_state(timer); preempt_enable(); } } static int timer_irq_set_vcpu_affinity(struct irq_data *d, void *vcpu) { if (vcpu) irqd_set_forwarded_to_vcpu(d); else irqd_clr_forwarded_to_vcpu(d); return 0; } static int timer_irq_set_irqchip_state(struct irq_data *d, enum irqchip_irq_state which, bool val) { if (which != IRQCHIP_STATE_ACTIVE || !irqd_is_forwarded_to_vcpu(d)) return irq_chip_set_parent_state(d, which, val); if (val) irq_chip_mask_parent(d); else irq_chip_unmask_parent(d); return 0; } static void timer_irq_eoi(struct irq_data *d) { if (!irqd_is_forwarded_to_vcpu(d)) irq_chip_eoi_parent(d); } static void timer_irq_ack(struct irq_data *d) { d = d->parent_data; if (d->chip->irq_ack) d->chip->irq_ack(d); } static struct irq_chip timer_chip = { .name = "KVM", .irq_ack = timer_irq_ack, .irq_mask = irq_chip_mask_parent, .irq_unmask = irq_chip_unmask_parent, .irq_eoi = timer_irq_eoi, .irq_set_type = irq_chip_set_type_parent, .irq_set_vcpu_affinity = timer_irq_set_vcpu_affinity, .irq_set_irqchip_state = timer_irq_set_irqchip_state, }; static int timer_irq_domain_alloc(struct irq_domain *domain, unsigned int virq, unsigned int nr_irqs, void *arg) { irq_hw_number_t hwirq = (uintptr_t)arg; return irq_domain_set_hwirq_and_chip(domain, virq, hwirq, &timer_chip, NULL); } static void timer_irq_domain_free(struct irq_domain *domain, unsigned int virq, unsigned int nr_irqs) { } static const struct irq_domain_ops timer_domain_ops = { .alloc = timer_irq_domain_alloc, .free = timer_irq_domain_free, }; static void kvm_irq_fixup_flags(unsigned int virq, u32 *flags) { *flags = irq_get_trigger_type(virq); if (*flags != IRQF_TRIGGER_HIGH && *flags != IRQF_TRIGGER_LOW) { kvm_err("Invalid trigger for timer IRQ%d, assuming level low\n", virq); *flags = IRQF_TRIGGER_LOW; } } static int kvm_irq_init(struct arch_timer_kvm_info *info) { struct irq_domain *domain = NULL; if (info->virtual_irq <= 0) { kvm_err("kvm_arch_timer: invalid virtual timer IRQ: %d\n", info->virtual_irq); return -ENODEV; } host_vtimer_irq = info->virtual_irq; kvm_irq_fixup_flags(host_vtimer_irq, &host_vtimer_irq_flags); if (kvm_vgic_global_state.no_hw_deactivation) { struct fwnode_handle *fwnode; struct irq_data *data; fwnode = irq_domain_alloc_named_fwnode("kvm-timer"); if (!fwnode) return -ENOMEM; /* Assume both vtimer and ptimer in the same parent */ data = irq_get_irq_data(host_vtimer_irq); domain = irq_domain_create_hierarchy(data->domain, 0, NR_KVM_TIMERS, fwnode, &timer_domain_ops, NULL); if (!domain) { irq_domain_free_fwnode(fwnode); return -ENOMEM; } 
arch_timer_irq_ops.flags |= VGIC_IRQ_SW_RESAMPLE; WARN_ON(irq_domain_push_irq(domain, host_vtimer_irq, (void *)TIMER_VTIMER)); } if (info->physical_irq > 0) { host_ptimer_irq = info->physical_irq; kvm_irq_fixup_flags(host_ptimer_irq, &host_ptimer_irq_flags); if (domain) WARN_ON(irq_domain_push_irq(domain, host_ptimer_irq, (void *)TIMER_PTIMER)); } return 0; } static void kvm_timer_handle_errata(void) { u64 mmfr0, mmfr1, mmfr4; /* * CNTVOFF_EL2 is broken on some implementations. For those, we trap * all virtual timer/counter accesses, requiring FEAT_ECV. * * However, a hypervisor supporting nesting is likely to mitigate the * erratum at L0, and not require other levels to mitigate it (which * would otherwise be a terrible performance sink due to trap * amplification). * * Given that the affected HW implements both FEAT_VHE and FEAT_E2H0, * and that NV is likely not to (because of limitations of the * architecture), only enable the workaround when FEAT_VHE and * FEAT_E2H0 are both detected. Time will tell if this actually holds. */ mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1); mmfr1 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1); mmfr4 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR4_EL1); if (SYS_FIELD_GET(ID_AA64MMFR1_EL1, VH, mmfr1) && !SYS_FIELD_GET(ID_AA64MMFR4_EL1, E2H0, mmfr4) && SYS_FIELD_GET(ID_AA64MMFR0_EL1, ECV, mmfr0) && (has_vhe() || has_hvhe()) && cpus_have_final_cap(ARM64_WORKAROUND_QCOM_ORYON_CNTVOFF)) { static_branch_enable(&broken_cntvoff_key); kvm_info("Broken CNTVOFF_EL2, trapping virtual timer\n"); } } int __init kvm_timer_hyp_init(bool has_gic) { struct arch_timer_kvm_info *info; int err; info = arch_timer_get_kvm_info(); timecounter = &info->timecounter; if (!timecounter->cc) { kvm_err("kvm_arch_timer: uninitialized timecounter\n"); return -ENODEV; } err = kvm_irq_init(info); if (err) return err; /* First, do the virtual EL1 timer irq */ err = request_percpu_irq(host_vtimer_irq, kvm_arch_timer_handler, "kvm guest vtimer", kvm_get_running_vcpus()); if (err) { kvm_err("kvm_arch_timer: can't request vtimer interrupt %d (%d)\n", host_vtimer_irq, err); return err; } if (has_gic) { err = irq_set_vcpu_affinity(host_vtimer_irq, kvm_get_running_vcpus()); if (err) { kvm_err("kvm_arch_timer: error setting vcpu affinity\n"); goto out_free_vtimer_irq; } static_branch_enable(&has_gic_active_state); } kvm_debug("virtual timer IRQ%d\n", host_vtimer_irq); /* Now let's do the physical EL1 timer irq */ if (info->physical_irq > 0) { err = request_percpu_irq(host_ptimer_irq, kvm_arch_timer_handler, "kvm guest ptimer", kvm_get_running_vcpus()); if (err) { kvm_err("kvm_arch_timer: can't request ptimer interrupt %d (%d)\n", host_ptimer_irq, err); goto out_free_vtimer_irq; } if (has_gic) { err = irq_set_vcpu_affinity(host_ptimer_irq, kvm_get_running_vcpus()); if (err) { kvm_err("kvm_arch_timer: error setting vcpu affinity\n"); goto out_free_ptimer_irq; } } kvm_debug("physical timer IRQ%d\n", host_ptimer_irq); } else if (has_vhe()) { kvm_err("kvm_arch_timer: invalid physical timer IRQ: %d\n", info->physical_irq); err = -ENODEV; goto out_free_vtimer_irq; } kvm_timer_handle_errata(); return 0; out_free_ptimer_irq: if (info->physical_irq > 0) free_percpu_irq(host_ptimer_irq, kvm_get_running_vcpus()); out_free_vtimer_irq: free_percpu_irq(host_vtimer_irq, kvm_get_running_vcpus()); return err; } void kvm_timer_vcpu_terminate(struct kvm_vcpu *vcpu) { struct arch_timer_cpu *timer = vcpu_timer(vcpu); soft_timer_cancel(&timer->bg_timer); } static bool timer_irqs_are_valid(struct kvm_vcpu *vcpu) { u32 
ppis = 0; bool valid; mutex_lock(&vcpu->kvm->arch.config_lock); for (int i = 0; i < nr_timers(vcpu); i++) { struct arch_timer_context *ctx; int irq; ctx = vcpu_get_timer(vcpu, i); irq = timer_irq(ctx); if (kvm_vgic_set_owner(vcpu, irq, ctx)) break; /* * We know by construction that we only have PPIs, so * all values are less than 32. */ ppis |= BIT(irq); } valid = hweight32(ppis) == nr_timers(vcpu); if (valid) set_bit(KVM_ARCH_FLAG_TIMER_PPIS_IMMUTABLE, &vcpu->kvm->arch.flags); mutex_unlock(&vcpu->kvm->arch.config_lock); return valid; } static bool kvm_arch_timer_get_input_level(int vintid) { struct kvm_vcpu *vcpu = kvm_get_running_vcpu(); if (WARN(!vcpu, "No vcpu context!\n")) return false; for (int i = 0; i < nr_timers(vcpu); i++) { struct arch_timer_context *ctx; ctx = vcpu_get_timer(vcpu, i); if (timer_irq(ctx) == vintid) return kvm_timer_should_fire(ctx); } /* A timer IRQ has fired, but no matching timer was found? */ WARN_RATELIMIT(1, "timer INTID%d unknown\n", vintid); return false; } int kvm_timer_enable(struct kvm_vcpu *vcpu) { struct arch_timer_cpu *timer = vcpu_timer(vcpu); struct timer_map map; int ret; if (timer->enabled) return 0; /* Without a VGIC we do not map virtual IRQs to physical IRQs */ if (!irqchip_in_kernel(vcpu->kvm)) goto no_vgic; /* * At this stage, we have the guarantee that the vgic is both * available and initialized. */ if (!timer_irqs_are_valid(vcpu)) { kvm_debug("incorrectly configured timer irqs\n"); return -EINVAL; } get_timer_map(vcpu, &map); ret = kvm_vgic_map_phys_irq(vcpu, map.direct_vtimer->host_timer_irq, timer_irq(map.direct_vtimer), &arch_timer_irq_ops); if (ret) return ret; if (map.direct_ptimer) { ret = kvm_vgic_map_phys_irq(vcpu, map.direct_ptimer->host_timer_irq, timer_irq(map.direct_ptimer), &arch_timer_irq_ops); } if (ret) return ret; no_vgic: timer->enabled = 1; return 0; } /* If we have CNTPOFF, permanently set ECV to enable it */ void kvm_timer_init_vhe(void) { if (cpus_have_final_cap(ARM64_HAS_ECV_CNTPOFF)) sysreg_clear_set(cnthctl_el2, 0, CNTHCTL_ECV); } int kvm_arm_timer_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) { int __user *uaddr = (int __user *)(long)attr->addr; int irq, idx, ret = 0; if (!irqchip_in_kernel(vcpu->kvm)) return -EINVAL; if (get_user(irq, uaddr)) return -EFAULT; if (!(irq_is_ppi(irq))) return -EINVAL; mutex_lock(&vcpu->kvm->arch.config_lock); if (test_bit(KVM_ARCH_FLAG_TIMER_PPIS_IMMUTABLE, &vcpu->kvm->arch.flags)) { ret = -EBUSY; goto out; } switch (attr->attr) { case KVM_ARM_VCPU_TIMER_IRQ_VTIMER: idx = TIMER_VTIMER; break; case KVM_ARM_VCPU_TIMER_IRQ_PTIMER: idx = TIMER_PTIMER; break; case KVM_ARM_VCPU_TIMER_IRQ_HVTIMER: idx = TIMER_HVTIMER; break; case KVM_ARM_VCPU_TIMER_IRQ_HPTIMER: idx = TIMER_HPTIMER; break; default: ret = -ENXIO; goto out; } /* * We cannot validate the IRQ unicity before we run, so take it at * face value. The verdict will be given on first vcpu run, for each * vcpu. Yes this is late. Blame it on the stupid API. 
*/ vcpu->kvm->arch.timer_data.ppi[idx] = irq; out: mutex_unlock(&vcpu->kvm->arch.config_lock); return ret; } int kvm_arm_timer_get_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) { int __user *uaddr = (int __user *)(long)attr->addr; struct arch_timer_context *timer; int irq; switch (attr->attr) { case KVM_ARM_VCPU_TIMER_IRQ_VTIMER: timer = vcpu_vtimer(vcpu); break; case KVM_ARM_VCPU_TIMER_IRQ_PTIMER: timer = vcpu_ptimer(vcpu); break; case KVM_ARM_VCPU_TIMER_IRQ_HVTIMER: timer = vcpu_hvtimer(vcpu); break; case KVM_ARM_VCPU_TIMER_IRQ_HPTIMER: timer = vcpu_hptimer(vcpu); break; default: return -ENXIO; } irq = timer_irq(timer); return put_user(irq, uaddr); } int kvm_arm_timer_has_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) { switch (attr->attr) { case KVM_ARM_VCPU_TIMER_IRQ_VTIMER: case KVM_ARM_VCPU_TIMER_IRQ_PTIMER: case KVM_ARM_VCPU_TIMER_IRQ_HVTIMER: case KVM_ARM_VCPU_TIMER_IRQ_HPTIMER: return 0; } return -ENXIO; } int kvm_vm_ioctl_set_counter_offset(struct kvm *kvm, struct kvm_arm_counter_offset *offset) { int ret = 0; if (offset->reserved) return -EINVAL; mutex_lock(&kvm->lock); if (lock_all_vcpus(kvm)) { set_bit(KVM_ARCH_FLAG_VM_COUNTER_OFFSET, &kvm->arch.flags); /* * If userspace decides to set the offset using this * API rather than merely restoring the counter * values, the offset applies to both the virtual and * physical views. */ kvm->arch.timer_data.voffset = offset->counter_offset; kvm->arch.timer_data.poffset = offset->counter_offset; unlock_all_vcpus(kvm); } else { ret = -EBUSY; } mutex_unlock(&kvm->lock); return ret; } |
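/*
 * Illustrative sketch, not part of arch_timer.c: how userspace might pick the
 * guest vtimer PPI that kvm_arm_timer_set_attr() above validates. The
 * KVM_SET_DEVICE_ATTR ioctl, the KVM_ARM_VCPU_TIMER_CTRL group and the
 * KVM_ARM_VCPU_TIMER_IRQ_VTIMER attribute are documented KVM UAPI; the helper
 * name and the surrounding error handling are made up for the example.
 */
#include <linux/kvm.h>
#include <sys/ioctl.h>

static int example_set_vtimer_ppi(int vcpu_fd, int ppi)
{
	struct kvm_device_attr attr = {
		.group = KVM_ARM_VCPU_TIMER_CTRL,
		.attr  = KVM_ARM_VCPU_TIMER_IRQ_VTIMER,
		.addr  = (__u64)(unsigned long)&ppi,
	};

	/*
	 * Rejected with EBUSY once any vcpu has run, because
	 * timer_irqs_are_valid() sets KVM_ARCH_FLAG_TIMER_PPIS_IMMUTABLE.
	 */
	return ioctl(vcpu_fd, KVM_SET_DEVICE_ATTR, &attr);
}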
1227 1224 1227 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 | // SPDX-License-Identifier: GPL-2.0 /* * security/tomoyo/audit.c * * Copyright (C) 2005-2011 NTT DATA CORPORATION */ #include "common.h" #include <linux/slab.h> /** * tomoyo_print_bprm - Print "struct linux_binprm" for auditing. * * @bprm: Pointer to "struct linux_binprm". * @dump: Pointer to "struct tomoyo_page_dump". * * Returns the contents of @bprm on success, NULL otherwise. * * This function uses kzalloc(), so caller must kfree() if this function * didn't return NULL. */ static char *tomoyo_print_bprm(struct linux_binprm *bprm, struct tomoyo_page_dump *dump) { static const int tomoyo_buffer_len = 4096 * 2; char *buffer = kzalloc(tomoyo_buffer_len, GFP_NOFS); char *cp; char *last_start; int len; unsigned long pos = bprm->p; int offset = pos % PAGE_SIZE; int argv_count = bprm->argc; int envp_count = bprm->envc; bool truncated = false; if (!buffer) return NULL; len = snprintf(buffer, tomoyo_buffer_len - 1, "argv[]={ "); cp = buffer + len; if (!argv_count) { memmove(cp, "} envp[]={ ", 11); cp += 11; } last_start = cp; while (argv_count || envp_count) { if (!tomoyo_dump_page(bprm, pos, dump)) goto out; pos += PAGE_SIZE - offset; /* Read. */ while (offset < PAGE_SIZE) { const char *kaddr = dump->data; const unsigned char c = kaddr[offset++]; if (cp == last_start) *cp++ = '"'; if (cp >= buffer + tomoyo_buffer_len - 32) { /* Reserve some room for "..." string. 
*/ truncated = true; } else if (c == '\\') { *cp++ = '\\'; *cp++ = '\\'; } else if (c > ' ' && c < 127) { *cp++ = c; } else if (!c) { *cp++ = '"'; *cp++ = ' '; last_start = cp; } else { *cp++ = '\\'; *cp++ = (c >> 6) + '0'; *cp++ = ((c >> 3) & 7) + '0'; *cp++ = (c & 7) + '0'; } if (c) continue; if (argv_count) { if (--argv_count == 0) { if (truncated) { cp = last_start; memmove(cp, "... ", 4); cp += 4; } memmove(cp, "} envp[]={ ", 11); cp += 11; last_start = cp; truncated = false; } } else if (envp_count) { if (--envp_count == 0) { if (truncated) { cp = last_start; memmove(cp, "... ", 4); cp += 4; } } } if (!argv_count && !envp_count) break; } offset = 0; } *cp++ = '}'; *cp = '\0'; return buffer; out: snprintf(buffer, tomoyo_buffer_len - 1, "argv[]={ ... } envp[]= { ... }"); return buffer; } /** * tomoyo_filetype - Get string representation of file type. * * @mode: Mode value for stat(). * * Returns file type string. */ static inline const char *tomoyo_filetype(const umode_t mode) { switch (mode & S_IFMT) { case S_IFREG: case 0: return tomoyo_condition_keyword[TOMOYO_TYPE_IS_FILE]; case S_IFDIR: return tomoyo_condition_keyword[TOMOYO_TYPE_IS_DIRECTORY]; case S_IFLNK: return tomoyo_condition_keyword[TOMOYO_TYPE_IS_SYMLINK]; case S_IFIFO: return tomoyo_condition_keyword[TOMOYO_TYPE_IS_FIFO]; case S_IFSOCK: return tomoyo_condition_keyword[TOMOYO_TYPE_IS_SOCKET]; case S_IFBLK: return tomoyo_condition_keyword[TOMOYO_TYPE_IS_BLOCK_DEV]; case S_IFCHR: return tomoyo_condition_keyword[TOMOYO_TYPE_IS_CHAR_DEV]; } return "unknown"; /* This should not happen. */ } /** * tomoyo_print_header - Get header line of audit log. * * @r: Pointer to "struct tomoyo_request_info". * * Returns string representation. * * This function uses kmalloc(), so caller must kfree() if this function * didn't return NULL. 
*/ static char *tomoyo_print_header(struct tomoyo_request_info *r) { struct tomoyo_time stamp; const pid_t gpid = task_pid_nr(current); struct tomoyo_obj_info *obj = r->obj; static const int tomoyo_buffer_len = 4096; char *buffer = kmalloc(tomoyo_buffer_len, GFP_NOFS); int pos; u8 i; if (!buffer) return NULL; tomoyo_convert_time(ktime_get_real_seconds(), &stamp); pos = snprintf(buffer, tomoyo_buffer_len - 1, "#%04u/%02u/%02u %02u:%02u:%02u# profile=%u mode=%s granted=%s (global-pid=%u) task={ pid=%u ppid=%u uid=%u gid=%u euid=%u egid=%u suid=%u sgid=%u fsuid=%u fsgid=%u }", stamp.year, stamp.month, stamp.day, stamp.hour, stamp.min, stamp.sec, r->profile, tomoyo_mode[r->mode], str_yes_no(r->granted), gpid, tomoyo_sys_getpid(), tomoyo_sys_getppid(), from_kuid(&init_user_ns, current_uid()), from_kgid(&init_user_ns, current_gid()), from_kuid(&init_user_ns, current_euid()), from_kgid(&init_user_ns, current_egid()), from_kuid(&init_user_ns, current_suid()), from_kgid(&init_user_ns, current_sgid()), from_kuid(&init_user_ns, current_fsuid()), from_kgid(&init_user_ns, current_fsgid())); if (!obj) goto no_obj_info; if (!obj->validate_done) { tomoyo_get_attributes(obj); obj->validate_done = true; } for (i = 0; i < TOMOYO_MAX_PATH_STAT; i++) { struct tomoyo_mini_stat *stat; unsigned int dev; umode_t mode; if (!obj->stat_valid[i]) continue; stat = &obj->stat[i]; dev = stat->dev; mode = stat->mode; if (i & 1) { pos += snprintf(buffer + pos, tomoyo_buffer_len - 1 - pos, " path%u.parent={ uid=%u gid=%u ino=%lu perm=0%o }", (i >> 1) + 1, from_kuid(&init_user_ns, stat->uid), from_kgid(&init_user_ns, stat->gid), (unsigned long)stat->ino, stat->mode & S_IALLUGO); continue; } pos += snprintf(buffer + pos, tomoyo_buffer_len - 1 - pos, " path%u={ uid=%u gid=%u ino=%lu major=%u minor=%u perm=0%o type=%s", (i >> 1) + 1, from_kuid(&init_user_ns, stat->uid), from_kgid(&init_user_ns, stat->gid), (unsigned long)stat->ino, MAJOR(dev), MINOR(dev), mode & S_IALLUGO, tomoyo_filetype(mode)); if (S_ISCHR(mode) || S_ISBLK(mode)) { dev = stat->rdev; pos += snprintf(buffer + pos, tomoyo_buffer_len - 1 - pos, " dev_major=%u dev_minor=%u", MAJOR(dev), MINOR(dev)); } pos += snprintf(buffer + pos, tomoyo_buffer_len - 1 - pos, " }"); } no_obj_info: if (pos < tomoyo_buffer_len - 1) return buffer; kfree(buffer); return NULL; } /** * tomoyo_init_log - Allocate buffer for audit logs. * * @r: Pointer to "struct tomoyo_request_info". * @len: Buffer size needed for @fmt and @args. * @fmt: The printf()'s format string. * @args: va_list structure for @fmt. * * Returns pointer to allocated memory. * * This function uses kzalloc(), so caller must kfree() if this function * didn't return NULL. */ char *tomoyo_init_log(struct tomoyo_request_info *r, int len, const char *fmt, va_list args) { char *buf = NULL; char *bprm_info = NULL; const char *header = NULL; char *realpath = NULL; const char *symlink = NULL; int pos; const char *domainname = r->domain->domainname->name; header = tomoyo_print_header(r); if (!header) return NULL; /* +10 is for '\n' etc. and '\0'. 
*/ len += strlen(domainname) + strlen(header) + 10; if (r->ee) { struct file *file = r->ee->bprm->file; realpath = tomoyo_realpath_from_path(&file->f_path); bprm_info = tomoyo_print_bprm(r->ee->bprm, &r->ee->dump); if (!realpath || !bprm_info) goto out; /* +80 is for " exec={ realpath=\"%s\" argc=%d envc=%d %s }" */ len += strlen(realpath) + 80 + strlen(bprm_info); } else if (r->obj && r->obj->symlink_target) { symlink = r->obj->symlink_target->name; /* +18 is for " symlink.target=\"%s\"" */ len += 18 + strlen(symlink); } len = kmalloc_size_roundup(len); buf = kzalloc(len, GFP_NOFS); if (!buf) goto out; len--; pos = snprintf(buf, len, "%s", header); if (realpath) { struct linux_binprm *bprm = r->ee->bprm; pos += snprintf(buf + pos, len - pos, " exec={ realpath=\"%s\" argc=%d envc=%d %s }", realpath, bprm->argc, bprm->envc, bprm_info); } else if (symlink) pos += snprintf(buf + pos, len - pos, " symlink.target=\"%s\"", symlink); pos += snprintf(buf + pos, len - pos, "\n%s\n", domainname); vsnprintf(buf + pos, len - pos, fmt, args); out: kfree(realpath); kfree(bprm_info); kfree(header); return buf; } /* Wait queue for /sys/kernel/security/tomoyo/audit. */ static DECLARE_WAIT_QUEUE_HEAD(tomoyo_log_wait); /* Structure for audit log. */ struct tomoyo_log { struct list_head list; char *log; int size; }; /* The list for "struct tomoyo_log". */ static LIST_HEAD(tomoyo_log); /* Lock for "struct list_head tomoyo_log". */ static DEFINE_SPINLOCK(tomoyo_log_lock); /* Length of "struct list_head tomoyo_log". */ static unsigned int tomoyo_log_count; /** * tomoyo_get_audit - Get audit mode. * * @ns: Pointer to "struct tomoyo_policy_namespace". * @profile: Profile number. * @index: Index number of functionality. * @matched_acl: Pointer to "struct tomoyo_acl_info". * @is_granted: True if granted log, false otherwise. * * Returns true if this request should be audited, false otherwise. */ static bool tomoyo_get_audit(const struct tomoyo_policy_namespace *ns, const u8 profile, const u8 index, const struct tomoyo_acl_info *matched_acl, const bool is_granted) { u8 mode; const u8 category = tomoyo_index2category[index] + TOMOYO_MAX_MAC_INDEX; struct tomoyo_profile *p; if (!tomoyo_policy_loaded) return false; p = tomoyo_profile(ns, profile); if (tomoyo_log_count >= p->pref[TOMOYO_PREF_MAX_AUDIT_LOG]) return false; if (is_granted && matched_acl && matched_acl->cond && matched_acl->cond->grant_log != TOMOYO_GRANTLOG_AUTO) return matched_acl->cond->grant_log == TOMOYO_GRANTLOG_YES; mode = p->config[index]; if (mode == TOMOYO_CONFIG_USE_DEFAULT) mode = p->config[category]; if (mode == TOMOYO_CONFIG_USE_DEFAULT) mode = p->default_config; if (is_granted) return mode & TOMOYO_CONFIG_WANT_GRANT_LOG; return mode & TOMOYO_CONFIG_WANT_REJECT_LOG; } /** * tomoyo_write_log2 - Write an audit log. * * @r: Pointer to "struct tomoyo_request_info". * @len: Buffer size needed for @fmt and @args. * @fmt: The printf()'s format string. * @args: va_list structure for @fmt. * * Returns nothing. */ void tomoyo_write_log2(struct tomoyo_request_info *r, int len, const char *fmt, va_list args) { char *buf; struct tomoyo_log *entry; bool quota_exceeded = false; if (!tomoyo_get_audit(r->domain->ns, r->profile, r->type, r->matched_acl, r->granted)) goto out; buf = tomoyo_init_log(r, len, fmt, args); if (!buf) goto out; entry = kzalloc(sizeof(*entry), GFP_NOFS); if (!entry) { kfree(buf); goto out; } entry->log = buf; len = kmalloc_size_roundup(strlen(buf) + 1); /* * The entry->size is used for memory quota checks. 
* Don't go beyond strlen(entry->log). */ entry->size = len + kmalloc_size_roundup(sizeof(*entry)); spin_lock(&tomoyo_log_lock); if (tomoyo_memory_quota[TOMOYO_MEMORY_AUDIT] && tomoyo_memory_used[TOMOYO_MEMORY_AUDIT] + entry->size >= tomoyo_memory_quota[TOMOYO_MEMORY_AUDIT]) { quota_exceeded = true; } else { tomoyo_memory_used[TOMOYO_MEMORY_AUDIT] += entry->size; list_add_tail(&entry->list, &tomoyo_log); tomoyo_log_count++; } spin_unlock(&tomoyo_log_lock); if (quota_exceeded) { kfree(buf); kfree(entry); goto out; } wake_up(&tomoyo_log_wait); out: return; } /** * tomoyo_write_log - Write an audit log. * * @r: Pointer to "struct tomoyo_request_info". * @fmt: The printf()'s format string, followed by parameters. * * Returns nothing. */ void tomoyo_write_log(struct tomoyo_request_info *r, const char *fmt, ...) { va_list args; int len; va_start(args, fmt); len = vsnprintf(NULL, 0, fmt, args) + 1; va_end(args); va_start(args, fmt); tomoyo_write_log2(r, len, fmt, args); va_end(args); } /** * tomoyo_read_log - Read an audit log. * * @head: Pointer to "struct tomoyo_io_buffer". * * Returns nothing. */ void tomoyo_read_log(struct tomoyo_io_buffer *head) { struct tomoyo_log *ptr = NULL; if (head->r.w_pos) return; kfree(head->read_buf); head->read_buf = NULL; spin_lock(&tomoyo_log_lock); if (!list_empty(&tomoyo_log)) { ptr = list_entry(tomoyo_log.next, typeof(*ptr), list); list_del(&ptr->list); tomoyo_log_count--; tomoyo_memory_used[TOMOYO_MEMORY_AUDIT] -= ptr->size; } spin_unlock(&tomoyo_log_lock); if (ptr) { head->read_buf = ptr->log; head->r.w[head->r.w_pos++] = head->read_buf; kfree(ptr); } } /** * tomoyo_poll_log - Wait for an audit log. * * @file: Pointer to "struct file". * @wait: Pointer to "poll_table". Maybe NULL. * * Returns EPOLLIN | EPOLLRDNORM when ready to read an audit log. */ __poll_t tomoyo_poll_log(struct file *file, poll_table *wait) { if (tomoyo_log_count) return EPOLLIN | EPOLLRDNORM; poll_wait(file, &tomoyo_log_wait, wait); if (tomoyo_log_count) return EPOLLIN | EPOLLRDNORM; return 0; } |
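/*
 * Illustrative sketch, not part of audit.c: the two-pass vsnprintf() sizing
 * idiom that tomoyo_write_log() uses above -- one pass with a NULL buffer to
 * measure the formatted length, a second pass into a buffer of exactly that
 * size. The function name is hypothetical and plain userspace C is used so
 * the sketch stands alone.
 */
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>

static char *example_vasprintf(const char *fmt, ...)
{
	va_list args;
	char *buf;
	int len;

	va_start(args, fmt);
	len = vsnprintf(NULL, 0, fmt, args) + 1;	/* measure, incl. '\0' */
	va_end(args);

	buf = malloc(len);
	if (!buf)
		return NULL;

	va_start(args, fmt);				/* restart for the real pass */
	vsnprintf(buf, len, fmt, args);
	va_end(args);
	return buf;
}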
490 5 170 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 | /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM timestamp #if !defined(_TRACE_TIMESTAMP_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_TIMESTAMP_H #include <linux/tracepoint.h> #include <linux/fs.h> #define CTIME_QUERIED_FLAGS \ { I_CTIME_QUERIED, "Q" } DECLARE_EVENT_CLASS(ctime, TP_PROTO(struct inode *inode, struct timespec64 *ctime), TP_ARGS(inode, ctime), TP_STRUCT__entry( __field(dev_t, dev) __field(ino_t, ino) __field(time64_t, ctime_s) __field(u32, ctime_ns) __field(u32, gen) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->gen = inode->i_generation; __entry->ctime_s = ctime->tv_sec; __entry->ctime_ns = ctime->tv_nsec; ), TP_printk("ino=%d:%d:%ld:%u ctime=%lld.%u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->gen, __entry->ctime_s, __entry->ctime_ns ) ); DEFINE_EVENT(ctime, inode_set_ctime_to_ts, TP_PROTO(struct inode *inode, struct timespec64 *ctime), TP_ARGS(inode, ctime)); DEFINE_EVENT(ctime, ctime_xchg_skip, TP_PROTO(struct inode *inode, struct timespec64 *ctime), TP_ARGS(inode, ctime)); TRACE_EVENT(ctime_ns_xchg, TP_PROTO(struct inode *inode, u32 old, u32 new, u32 cur), TP_ARGS(inode, old, new, cur), TP_STRUCT__entry( __field(dev_t, dev) __field(ino_t, ino) __field(u32, gen) __field(u32, old) __field(u32, new) __field(u32, cur) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->gen = inode->i_generation; __entry->old = old; __entry->new = new; __entry->cur = cur; ), TP_printk("ino=%d:%d:%ld:%u old=%u:%s new=%u cur=%u:%s", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->gen, __entry->old & ~I_CTIME_QUERIED, __print_flags(__entry->old & I_CTIME_QUERIED, "|", CTIME_QUERIED_FLAGS), __entry->new, __entry->cur & ~I_CTIME_QUERIED, __print_flags(__entry->cur & I_CTIME_QUERIED, "|", CTIME_QUERIED_FLAGS) ) ); TRACE_EVENT(fill_mg_cmtime, TP_PROTO(struct inode *inode, struct timespec64 *ctime, struct timespec64 *mtime), TP_ARGS(inode, ctime, mtime), TP_STRUCT__entry( __field(dev_t, dev) __field(ino_t, ino) __field(time64_t, ctime_s) __field(time64_t, mtime_s) __field(u32, ctime_ns) __field(u32, mtime_ns) __field(u32, gen) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->gen = inode->i_generation; __entry->ctime_s = ctime->tv_sec; __entry->mtime_s = mtime->tv_sec; __entry->ctime_ns = ctime->tv_nsec; __entry->mtime_ns = mtime->tv_nsec; ), TP_printk("ino=%d:%d:%ld:%u ctime=%lld.%u mtime=%lld.%u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->gen, __entry->ctime_s, __entry->ctime_ns, __entry->mtime_s, __entry->mtime_ns ) ); #endif /* _TRACE_TIMESTAMP_H */ /* This part must be outside protection */ #include <trace/define_trace.h> |
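/*
 * Illustrative sketch, not part of timestamp.h: how a caller would emit one
 * of the events defined above. By convention, exactly one translation unit
 * defines CREATE_TRACE_POINTS before including the header so the tracepoints
 * are instantiated; trace_<event>() is the static inline generated from
 * DEFINE_EVENT()/TRACE_EVENT(). The caller below is hypothetical, not the
 * kernel's real call site.
 */
#define CREATE_TRACE_POINTS
#include <trace/events/timestamp.h>

static void example_record_ctime(struct inode *inode, struct timespec64 *ts)
{
	trace_inode_set_ctime_to_ts(inode, ts);
}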
723 688 211 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_PERCPU_COUNTER_H #define _LINUX_PERCPU_COUNTER_H /* * A simple "approximate counter" for use in ext2 and ext3 superblocks. * * WARNING: these things are HUGE. 4 kbytes per counter on 32-way P4. */ #include <linux/spinlock.h> #include <linux/smp.h> #include <linux/list.h> #include <linux/threads.h> #include <linux/percpu.h> #include <linux/types.h> /* percpu_counter batch for local add or sub */ #define PERCPU_COUNTER_LOCAL_BATCH INT_MAX #ifdef CONFIG_SMP struct percpu_counter { raw_spinlock_t lock; s64 count; #ifdef CONFIG_HOTPLUG_CPU struct list_head list; /* All percpu_counters are on a list */ #endif s32 __percpu *counters; }; extern int percpu_counter_batch; int __percpu_counter_init_many(struct percpu_counter *fbc, s64 amount, gfp_t gfp, u32 nr_counters, struct lock_class_key *key); #define percpu_counter_init_many(fbc, value, gfp, nr_counters) \ ({ \ static struct lock_class_key __key; \ \ __percpu_counter_init_many(fbc, value, gfp, nr_counters,\ &__key); \ }) #define percpu_counter_init(fbc, value, gfp) \ percpu_counter_init_many(fbc, value, gfp, 1) void percpu_counter_destroy_many(struct percpu_counter *fbc, u32 nr_counters); static inline void percpu_counter_destroy(struct percpu_counter *fbc) { percpu_counter_destroy_many(fbc, 1); } void percpu_counter_set(struct percpu_counter *fbc, s64 amount); void percpu_counter_add_batch(struct percpu_counter *fbc, s64 amount, s32 batch); s64 __percpu_counter_sum(struct percpu_counter *fbc); int __percpu_counter_compare(struct percpu_counter *fbc, s64 rhs, s32 batch); bool __percpu_counter_limited_add(struct percpu_counter *fbc, s64 limit, s64 amount, s32 batch); void percpu_counter_sync(struct percpu_counter *fbc); static inline int percpu_counter_compare(struct percpu_counter *fbc, s64 rhs) { return __percpu_counter_compare(fbc, rhs, percpu_counter_batch); } static inline void percpu_counter_add(struct percpu_counter *fbc, s64 amount) { percpu_counter_add_batch(fbc, amount, percpu_counter_batch); } static inline bool percpu_counter_limited_add(struct percpu_counter *fbc, s64 limit, s64 amount) { return __percpu_counter_limited_add(fbc, limit, amount, percpu_counter_batch); } /* * With percpu_counter_add_local() and percpu_counter_sub_local(), counts * are accumulated in local per cpu counter and not in fbc->count until * local count overflows PERCPU_COUNTER_LOCAL_BATCH. This makes counter * write efficient. 
* But percpu_counter_sum(), instead of percpu_counter_read(), needs to be * used to add up the counts from each CPU to account for all the local * counts. So percpu_counter_add_local() and percpu_counter_sub_local() * should be used when a counter is updated frequently and read rarely. */ static inline void percpu_counter_add_local(struct percpu_counter *fbc, s64 amount) { percpu_counter_add_batch(fbc, amount, PERCPU_COUNTER_LOCAL_BATCH); } static inline s64 percpu_counter_sum_positive(struct percpu_counter *fbc) { s64 ret = __percpu_counter_sum(fbc); return ret < 0 ? 0 : ret; } static inline s64 percpu_counter_sum(struct percpu_counter *fbc) { return __percpu_counter_sum(fbc); } static inline s64 percpu_counter_read(struct percpu_counter *fbc) { return fbc->count; } /* * It is possible for the percpu_counter_read() to return a small negative * number for some counter which should never be negative. * */ static inline s64 percpu_counter_read_positive(struct percpu_counter *fbc) { /* Prevent reloads of fbc->count */ s64 ret = READ_ONCE(fbc->count); if (ret >= 0) return ret; return 0; } static inline bool percpu_counter_initialized(struct percpu_counter *fbc) { return (fbc->counters != NULL); } #else /* !CONFIG_SMP */ struct percpu_counter { s64 count; }; static inline int percpu_counter_init_many(struct percpu_counter *fbc, s64 amount, gfp_t gfp, u32 nr_counters) { u32 i; for (i = 0; i < nr_counters; i++) fbc[i].count = amount; return 0; } static inline int percpu_counter_init(struct percpu_counter *fbc, s64 amount, gfp_t gfp) { return percpu_counter_init_many(fbc, amount, gfp, 1); } static inline void percpu_counter_destroy_many(struct percpu_counter *fbc, u32 nr_counters) { } static inline void percpu_counter_destroy(struct percpu_counter *fbc) { } static inline void percpu_counter_set(struct percpu_counter *fbc, s64 amount) { fbc->count = amount; } static inline int percpu_counter_compare(struct percpu_counter *fbc, s64 rhs) { if (fbc->count > rhs) return 1; else if (fbc->count < rhs) return -1; else return 0; } static inline int __percpu_counter_compare(struct percpu_counter *fbc, s64 rhs, s32 batch) { return percpu_counter_compare(fbc, rhs); } static inline void percpu_counter_add(struct percpu_counter *fbc, s64 amount) { unsigned long flags; local_irq_save(flags); fbc->count += amount; local_irq_restore(flags); } static inline bool percpu_counter_limited_add(struct percpu_counter *fbc, s64 limit, s64 amount) { unsigned long flags; bool good = false; s64 count; if (amount == 0) return true; local_irq_save(flags); count = fbc->count + amount; if ((amount > 0 && count <= limit) || (amount < 0 && count >= limit)) { fbc->count = count; good = true; } local_irq_restore(flags); return good; } /* non-SMP percpu_counter_add_local is the same with percpu_counter_add */ static inline void percpu_counter_add_local(struct percpu_counter *fbc, s64 amount) { percpu_counter_add(fbc, amount); } static inline void percpu_counter_add_batch(struct percpu_counter *fbc, s64 amount, s32 batch) { percpu_counter_add(fbc, amount); } static inline s64 percpu_counter_read(struct percpu_counter *fbc) { return fbc->count; } /* * percpu_counter is intended to track positive numbers. In the UP case the * number should never be negative. 
*/ static inline s64 percpu_counter_read_positive(struct percpu_counter *fbc) { return fbc->count; } static inline s64 percpu_counter_sum_positive(struct percpu_counter *fbc) { return percpu_counter_read_positive(fbc); } static inline s64 percpu_counter_sum(struct percpu_counter *fbc) { return percpu_counter_read(fbc); } static inline bool percpu_counter_initialized(struct percpu_counter *fbc) { return true; } static inline void percpu_counter_sync(struct percpu_counter *fbc) { } #endif /* CONFIG_SMP */ static inline void percpu_counter_inc(struct percpu_counter *fbc) { percpu_counter_add(fbc, 1); } static inline void percpu_counter_dec(struct percpu_counter *fbc) { percpu_counter_add(fbc, -1); } static inline void percpu_counter_sub(struct percpu_counter *fbc, s64 amount) { percpu_counter_add(fbc, -amount); } static inline void percpu_counter_sub_local(struct percpu_counter *fbc, s64 amount) { percpu_counter_add_local(fbc, -amount); } #endif /* _LINUX_PERCPU_COUNTER_H */ |
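/*
 * Illustrative sketch, not part of percpu_counter.h: typical use of the API
 * declared above. The counter and the helpers are hypothetical; the calls
 * themselves (init/inc/add/read/sum/destroy) are the interface shown in this
 * header.
 */
#include <linux/gfp.h>
#include <linux/init.h>
#include <linux/percpu_counter.h>
#include <linux/printk.h>

static struct percpu_counter example_nr_items;

static int __init example_counter_init(void)
{
	/* Start at 0; GFP_KERNEL covers the per-CPU allocation (SMP case). */
	return percpu_counter_init(&example_nr_items, 0, GFP_KERNEL);
}

static void example_counter_use(void)
{
	percpu_counter_inc(&example_nr_items);
	percpu_counter_add(&example_nr_items, 16);

	/* Cheap but approximate: ignores deltas still batched per CPU... */
	pr_info("approx: %lld\n",
		(long long)percpu_counter_read(&example_nr_items));
	/* ...versus exact but O(nr_cpus). */
	pr_info("exact:  %lld\n",
		(long long)percpu_counter_sum(&example_nr_items));
}

static void example_counter_exit(void)
{
	percpu_counter_destroy(&example_nr_items);
}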
332 333 334 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 | // SPDX-License-Identifier: GPL-2.0-only /* * Based on arch/arm/mm/mmap.c * * Copyright (C) 2012 ARM Ltd. */ #include <linux/io.h> #include <linux/memblock.h> #include <linux/mm.h> #include <linux/types.h> #include <asm/cpufeature.h> #include <asm/page.h> static pgprot_t protection_map[16] __ro_after_init = { [VM_NONE] = PAGE_NONE, [VM_READ] = PAGE_READONLY, [VM_WRITE] = PAGE_READONLY, [VM_WRITE | VM_READ] = PAGE_READONLY, /* PAGE_EXECONLY if Enhanced PAN */ [VM_EXEC] = PAGE_READONLY_EXEC, [VM_EXEC | VM_READ] = PAGE_READONLY_EXEC, [VM_EXEC | VM_WRITE] = PAGE_READONLY_EXEC, [VM_EXEC | VM_WRITE | VM_READ] = PAGE_READONLY_EXEC, [VM_SHARED] = PAGE_NONE, [VM_SHARED | VM_READ] = PAGE_READONLY, [VM_SHARED | VM_WRITE] = PAGE_SHARED, [VM_SHARED | VM_WRITE | VM_READ] = PAGE_SHARED, /* PAGE_EXECONLY if Enhanced PAN */ [VM_SHARED | VM_EXEC] = PAGE_READONLY_EXEC, [VM_SHARED | VM_EXEC | VM_READ] = PAGE_READONLY_EXEC, [VM_SHARED | VM_EXEC | VM_WRITE] = PAGE_SHARED_EXEC, [VM_SHARED | VM_EXEC | VM_WRITE | VM_READ] = PAGE_SHARED_EXEC }; /* * You really shouldn't be using read() or write() on /dev/mem. This might go * away in the future. */ int valid_phys_addr_range(phys_addr_t addr, size_t size) { /* * Check whether addr is covered by a memory region without the * MEMBLOCK_NOMAP attribute, and whether that region covers the * entire range. In theory, this could lead to false negatives * if the range is covered by distinct but adjacent memory regions * that only differ in other attributes. However, few of such * attributes have been defined, and it is debatable whether it * follows that /dev/mem read() calls should be able traverse * such boundaries. */ return memblock_is_region_memory(addr, size) && memblock_is_map_memory(addr); } /* * Do not allow /dev/mem mappings beyond the supported physical range. */ int valid_mmap_phys_addr_range(unsigned long pfn, size_t size) { return !(((pfn << PAGE_SHIFT) + size) & ~PHYS_MASK); } static int __init adjust_protection_map(void) { /* * With Enhanced PAN we can honour the execute-only permissions as * there is no PAN override with such mappings. */ if (cpus_have_cap(ARM64_HAS_EPAN)) { protection_map[VM_EXEC] = PAGE_EXECONLY; protection_map[VM_EXEC | VM_SHARED] = PAGE_EXECONLY; } if (lpa2_is_enabled()) for (int i = 0; i < ARRAY_SIZE(protection_map); i++) pgprot_val(protection_map[i]) &= ~PTE_SHARED; return 0; } arch_initcall(adjust_protection_map); pgprot_t vm_get_page_prot(unsigned long vm_flags) { pteval_t prot; /* Short circuit GCS to avoid bloating the table. */ if (system_supports_gcs() && (vm_flags & VM_SHADOW_STACK)) { prot = _PAGE_GCS_RO; } else { prot = pgprot_val(protection_map[vm_flags & (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)]); } if (vm_flags & VM_ARM64_BTI) prot |= PTE_GP; /* * There are two conditions required for returning a Normal Tagged * memory type: (1) the user requested it via PROT_MTE passed to * mmap() or mprotect() and (2) the corresponding vma supports MTE. We * register (1) as VM_MTE in the vma->vm_flags and (2) as * VM_MTE_ALLOWED. Note that the latter can only be set during the * mmap() call since mprotect() does not accept MAP_* flags. 
* Checking for VM_MTE only is sufficient since arch_validate_flags() * does not permit (VM_MTE & !VM_MTE_ALLOWED). */ if (vm_flags & VM_MTE) prot |= PTE_ATTRINDX(MT_NORMAL_TAGGED); #ifdef CONFIG_ARCH_HAS_PKEYS if (system_supports_poe()) { if (vm_flags & VM_PKEY_BIT0) prot |= PTE_PO_IDX_0; if (vm_flags & VM_PKEY_BIT1) prot |= PTE_PO_IDX_1; if (vm_flags & VM_PKEY_BIT2) prot |= PTE_PO_IDX_2; } #endif return __pgprot(prot); } EXPORT_SYMBOL(vm_get_page_prot); |
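/*
 * Illustrative sketch, not part of mmap.c: the way the table and fix-ups
 * above are consumed. Core mm derives a vma's hardware page protections from
 * its VM_* flags via vm_get_page_prot(); the helper below is hypothetical and
 * only demonstrates that contract.
 */
#include <linux/mm.h>

static void example_fill_page_prot(struct vm_area_struct *vma)
{
	/*
	 * Index bits are VM_READ | VM_WRITE | VM_EXEC | VM_SHARED; arch
	 * extras such as VM_ARM64_BTI, VM_MTE and the POE pkey bits are
	 * folded in by the arch implementation above.
	 */
	vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
}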
3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 | /* * Copyright (c) 2004 Topspin Communications. All rights reserved. * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include <linux/module.h> #include <linux/string.h> #include <linux/errno.h> #include <linux/kernel.h> #include <linux/slab.h> #include <linux/init.h> #include <linux/netdevice.h> #include <net/net_namespace.h> #include <linux/security.h> #include <linux/notifier.h> #include <linux/hashtable.h> #include <rdma/rdma_netlink.h> #include <rdma/ib_addr.h> #include <rdma/ib_cache.h> #include <rdma/rdma_counter.h> #include "core_priv.h" #include "restrack.h" MODULE_AUTHOR("Roland Dreier"); MODULE_DESCRIPTION("core kernel InfiniBand API"); MODULE_LICENSE("Dual BSD/GPL"); struct workqueue_struct *ib_comp_wq; struct workqueue_struct *ib_comp_unbound_wq; struct workqueue_struct *ib_wq; EXPORT_SYMBOL_GPL(ib_wq); static struct workqueue_struct *ib_unreg_wq; /* * Each of the three rwsem locks (devices, clients, client_data) protects the * xarray of the same name. Specifically it allows the caller to assert that * the MARK will/will not be changing under the lock, and for devices and * clients, that the value in the xarray is still a valid pointer. Change of * the MARK is linked to the object state, so holding the lock and testing the * MARK also asserts that the contained object is in a certain state. * * This is used to build a two stage register/unregister flow where objects * can continue to be in the xarray even though they are still in progress to * register/unregister. * * The xarray itself provides additional locking, and restartable iteration, * which is also relied on. * * Locks should not be nested, with the exception of client_data, which is * allowed to nest under the read side of the other two locks. * * The devices_rwsem also protects the device name list, any change or * assignment of device name must also hold the write side to guarantee unique * names. */ /* * devices contains devices that have had their names assigned. The * devices may not be registered. 
Users that care about the registration * status need to call ib_device_try_get() on the device to ensure it is * registered, and keep it registered, for the required duration. * */ static DEFINE_XARRAY_FLAGS(devices, XA_FLAGS_ALLOC); static DECLARE_RWSEM(devices_rwsem); #define DEVICE_REGISTERED XA_MARK_1 static u32 highest_client_id; #define CLIENT_REGISTERED XA_MARK_1 static DEFINE_XARRAY_FLAGS(clients, XA_FLAGS_ALLOC); static DECLARE_RWSEM(clients_rwsem); static void ib_client_put(struct ib_client *client) { if (refcount_dec_and_test(&client->uses)) complete(&client->uses_zero); } /* * If client_data is registered then the corresponding client must also still * be registered. */ #define CLIENT_DATA_REGISTERED XA_MARK_1 unsigned int rdma_dev_net_id; /* * A list of net namespaces is maintained in an xarray. This is necessary * because we can't get the locking right using the existing net ns list. We * would require a init_net callback after the list is updated. */ static DEFINE_XARRAY_FLAGS(rdma_nets, XA_FLAGS_ALLOC); /* * rwsem to protect accessing the rdma_nets xarray entries. */ static DECLARE_RWSEM(rdma_nets_rwsem); bool ib_devices_shared_netns = true; module_param_named(netns_mode, ib_devices_shared_netns, bool, 0444); MODULE_PARM_DESC(netns_mode, "Share device among net namespaces; default=1 (shared)"); /** * rdma_dev_access_netns() - Return whether an rdma device can be accessed * from a specified net namespace or not. * @dev: Pointer to rdma device which needs to be checked * @net: Pointer to net namesapce for which access to be checked * * When the rdma device is in shared mode, it ignores the net namespace. * When the rdma device is exclusive to a net namespace, rdma device net * namespace is checked against the specified one. */ bool rdma_dev_access_netns(const struct ib_device *dev, const struct net *net) { return (ib_devices_shared_netns || net_eq(read_pnet(&dev->coredev.rdma_net), net)); } EXPORT_SYMBOL(rdma_dev_access_netns); /* * xarray has this behavior where it won't iterate over NULL values stored in * allocated arrays. So we need our own iterator to see all values stored in * the array. This does the same thing as xa_for_each except that it also * returns NULL valued entries if the array is allocating. Simplified to only * work on simple xarrays. 
*/ static void *xan_find_marked(struct xarray *xa, unsigned long *indexp, xa_mark_t filter) { XA_STATE(xas, xa, *indexp); void *entry; rcu_read_lock(); do { entry = xas_find_marked(&xas, ULONG_MAX, filter); if (xa_is_zero(entry)) break; } while (xas_retry(&xas, entry)); rcu_read_unlock(); if (entry) { *indexp = xas.xa_index; if (xa_is_zero(entry)) return NULL; return entry; } return XA_ERROR(-ENOENT); } #define xan_for_each_marked(xa, index, entry, filter) \ for (index = 0, entry = xan_find_marked(xa, &(index), filter); \ !xa_is_err(entry); \ (index)++, entry = xan_find_marked(xa, &(index), filter)) /* RCU hash table mapping netdevice pointers to struct ib_port_data */ static DEFINE_SPINLOCK(ndev_hash_lock); static DECLARE_HASHTABLE(ndev_hash, 5); static void free_netdevs(struct ib_device *ib_dev); static void ib_unregister_work(struct work_struct *work); static void __ib_unregister_device(struct ib_device *device); static int ib_security_change(struct notifier_block *nb, unsigned long event, void *lsm_data); static void ib_policy_change_task(struct work_struct *work); static DECLARE_WORK(ib_policy_change_work, ib_policy_change_task); static void __ibdev_printk(const char *level, const struct ib_device *ibdev, struct va_format *vaf) { if (ibdev && ibdev->dev.parent) dev_printk_emit(level[1] - '0', ibdev->dev.parent, "%s %s %s: %pV", dev_driver_string(ibdev->dev.parent), dev_name(ibdev->dev.parent), dev_name(&ibdev->dev), vaf); else if (ibdev) printk("%s%s: %pV", level, dev_name(&ibdev->dev), vaf); else printk("%s(NULL ib_device): %pV", level, vaf); } #define define_ibdev_printk_level(func, level) \ void func(const struct ib_device *ibdev, const char *fmt, ...) \ { \ struct va_format vaf; \ va_list args; \ \ va_start(args, fmt); \ \ vaf.fmt = fmt; \ vaf.va = &args; \ \ __ibdev_printk(level, ibdev, &vaf); \ \ va_end(args); \ } \ EXPORT_SYMBOL(func); define_ibdev_printk_level(ibdev_emerg, KERN_EMERG); define_ibdev_printk_level(ibdev_alert, KERN_ALERT); define_ibdev_printk_level(ibdev_crit, KERN_CRIT); define_ibdev_printk_level(ibdev_err, KERN_ERR); define_ibdev_printk_level(ibdev_warn, KERN_WARNING); define_ibdev_printk_level(ibdev_notice, KERN_NOTICE); define_ibdev_printk_level(ibdev_info, KERN_INFO); static struct notifier_block ibdev_lsm_nb = { .notifier_call = ib_security_change, }; static int rdma_dev_change_netns(struct ib_device *device, struct net *cur_net, struct net *net); /* Pointer to the RCU head at the start of the ib_port_data array */ struct ib_port_data_rcu { struct rcu_head rcu_head; struct ib_port_data pdata[]; }; static void ib_device_check_mandatory(struct ib_device *device) { #define IB_MANDATORY_FUNC(x) { offsetof(struct ib_device_ops, x), #x } static const struct { size_t offset; char *name; } mandatory_table[] = { IB_MANDATORY_FUNC(query_device), IB_MANDATORY_FUNC(query_port), IB_MANDATORY_FUNC(alloc_pd), IB_MANDATORY_FUNC(dealloc_pd), IB_MANDATORY_FUNC(create_qp), IB_MANDATORY_FUNC(modify_qp), IB_MANDATORY_FUNC(destroy_qp), IB_MANDATORY_FUNC(post_send), IB_MANDATORY_FUNC(post_recv), IB_MANDATORY_FUNC(create_cq), IB_MANDATORY_FUNC(destroy_cq), IB_MANDATORY_FUNC(poll_cq), IB_MANDATORY_FUNC(req_notify_cq), IB_MANDATORY_FUNC(get_dma_mr), IB_MANDATORY_FUNC(reg_user_mr), IB_MANDATORY_FUNC(dereg_mr), IB_MANDATORY_FUNC(get_port_immutable) }; int i; device->kverbs_provider = true; for (i = 0; i < ARRAY_SIZE(mandatory_table); ++i) { if (!*(void **) ((void *) &device->ops + mandatory_table[i].offset)) { device->kverbs_provider = false; break; } } } /* * Caller must perform 
ib_device_put() to return the device reference count * when ib_device_get_by_index() returns valid device pointer. */ struct ib_device *ib_device_get_by_index(const struct net *net, u32 index) { struct ib_device *device; down_read(&devices_rwsem); device = xa_load(&devices, index); if (device) { if (!rdma_dev_access_netns(device, net)) { device = NULL; goto out; } if (!ib_device_try_get(device)) device = NULL; } out: up_read(&devices_rwsem); return device; } /** * ib_device_put - Release IB device reference * @device: device whose reference to be released * * ib_device_put() releases reference to the IB device to allow it to be * unregistered and eventually free. */ void ib_device_put(struct ib_device *device) { if (refcount_dec_and_test(&device->refcount)) complete(&device->unreg_completion); } EXPORT_SYMBOL(ib_device_put); static struct ib_device *__ib_device_get_by_name(const char *name) { struct ib_device *device; unsigned long index; xa_for_each (&devices, index, device) if (!strcmp(name, dev_name(&device->dev))) return device; return NULL; } /** * ib_device_get_by_name - Find an IB device by name * @name: The name to look for * @driver_id: The driver ID that must match (RDMA_DRIVER_UNKNOWN matches all) * * Find and hold an ib_device by its name. The caller must call * ib_device_put() on the returned pointer. */ struct ib_device *ib_device_get_by_name(const char *name, enum rdma_driver_id driver_id) { struct ib_device *device; down_read(&devices_rwsem); device = __ib_device_get_by_name(name); if (device && driver_id != RDMA_DRIVER_UNKNOWN && device->ops.driver_id != driver_id) device = NULL; if (device) { if (!ib_device_try_get(device)) device = NULL; } up_read(&devices_rwsem); return device; } EXPORT_SYMBOL(ib_device_get_by_name); static int rename_compat_devs(struct ib_device *device) { struct ib_core_device *cdev; unsigned long index; int ret = 0; mutex_lock(&device->compat_devs_mutex); xa_for_each (&device->compat_devs, index, cdev) { ret = device_rename(&cdev->dev, dev_name(&device->dev)); if (ret) { dev_warn(&cdev->dev, "Fail to rename compatdev to new name %s\n", dev_name(&device->dev)); break; } } mutex_unlock(&device->compat_devs_mutex); return ret; } int ib_device_rename(struct ib_device *ibdev, const char *name) { unsigned long index; void *client_data; int ret; down_write(&devices_rwsem); if (!strcmp(name, dev_name(&ibdev->dev))) { up_write(&devices_rwsem); return 0; } if (__ib_device_get_by_name(name)) { up_write(&devices_rwsem); return -EEXIST; } ret = device_rename(&ibdev->dev, name); if (ret) { up_write(&devices_rwsem); return ret; } strscpy(ibdev->name, name, IB_DEVICE_NAME_MAX); ret = rename_compat_devs(ibdev); downgrade_write(&devices_rwsem); down_read(&ibdev->client_data_rwsem); xan_for_each_marked(&ibdev->client_data, index, client_data, CLIENT_DATA_REGISTERED) { struct ib_client *client = xa_load(&clients, index); if (!client || !client->rename) continue; client->rename(ibdev, client_data); } up_read(&ibdev->client_data_rwsem); rdma_nl_notify_event(ibdev, 0, RDMA_RENAME_EVENT); up_read(&devices_rwsem); return 0; } int ib_device_set_dim(struct ib_device *ibdev, u8 use_dim) { if (use_dim > 1) return -EINVAL; ibdev->use_cq_dim = use_dim; return 0; } static int alloc_name(struct ib_device *ibdev, const char *name) { struct ib_device *device; unsigned long index; struct ida inuse; int rc; int i; lockdep_assert_held_write(&devices_rwsem); ida_init(&inuse); xa_for_each (&devices, index, device) { char buf[IB_DEVICE_NAME_MAX]; if (sscanf(dev_name(&device->dev), name, &i) 
!= 1) continue; if (i < 0 || i >= INT_MAX) continue; snprintf(buf, sizeof buf, name, i); if (strcmp(buf, dev_name(&device->dev)) != 0) continue; rc = ida_alloc_range(&inuse, i, i, GFP_KERNEL); if (rc < 0) goto out; } rc = ida_alloc(&inuse, GFP_KERNEL); if (rc < 0) goto out; rc = dev_set_name(&ibdev->dev, name, rc); out: ida_destroy(&inuse); return rc; } static void ib_device_release(struct device *device) { struct ib_device *dev = container_of(device, struct ib_device, dev); free_netdevs(dev); WARN_ON(refcount_read(&dev->refcount)); if (dev->hw_stats_data) ib_device_release_hw_stats(dev->hw_stats_data); if (dev->port_data) { ib_cache_release_one(dev); ib_security_release_port_pkey_list(dev); rdma_counter_release(dev); kfree_rcu(container_of(dev->port_data, struct ib_port_data_rcu, pdata[0]), rcu_head); } mutex_destroy(&dev->subdev_lock); mutex_destroy(&dev->unregistration_lock); mutex_destroy(&dev->compat_devs_mutex); xa_destroy(&dev->compat_devs); xa_destroy(&dev->client_data); kfree_rcu(dev, rcu_head); } static int ib_device_uevent(const struct device *device, struct kobj_uevent_env *env) { if (add_uevent_var(env, "NAME=%s", dev_name(device))) return -ENOMEM; /* * It would be nice to pass the node GUID with the event... */ return 0; } static const void *net_namespace(const struct device *d) { const struct ib_core_device *coredev = container_of(d, struct ib_core_device, dev); return read_pnet(&coredev->rdma_net); } static struct class ib_class = { .name = "infiniband", .dev_release = ib_device_release, .dev_uevent = ib_device_uevent, .ns_type = &net_ns_type_operations, .namespace = net_namespace, }; static void rdma_init_coredev(struct ib_core_device *coredev, struct ib_device *dev, struct net *net) { /* This BUILD_BUG_ON is intended to catch layout change * of union of ib_core_device and device. * dev must be the first element as ib_core and providers * driver uses it. Adding anything in ib_core_device before * device will break this assumption. */ BUILD_BUG_ON(offsetof(struct ib_device, coredev.dev) != offsetof(struct ib_device, dev)); coredev->dev.class = &ib_class; coredev->dev.groups = dev->groups; device_initialize(&coredev->dev); coredev->owner = dev; INIT_LIST_HEAD(&coredev->port_list); write_pnet(&coredev->rdma_net, net); } /** * _ib_alloc_device - allocate an IB device struct * @size:size of structure to allocate * * Low-level drivers should use ib_alloc_device() to allocate &struct * ib_device. @size is the size of the structure to be allocated, * including any private data used by the low-level driver. * ib_dealloc_device() must be used to free structures allocated with * ib_alloc_device(). */ struct ib_device *_ib_alloc_device(size_t size) { struct ib_device *device; unsigned int i; if (WARN_ON(size < sizeof(struct ib_device))) return NULL; device = kzalloc(size, GFP_KERNEL); if (!device) return NULL; if (rdma_restrack_init(device)) { kfree(device); return NULL; } rdma_init_coredev(&device->coredev, device, &init_net); INIT_LIST_HEAD(&device->event_handler_list); spin_lock_init(&device->qp_open_list_lock); init_rwsem(&device->event_handler_rwsem); mutex_init(&device->unregistration_lock); /* * client_data needs to be alloc because we don't want our mark to be * destroyed if the user stores NULL in the client data. 
*/ xa_init_flags(&device->client_data, XA_FLAGS_ALLOC); init_rwsem(&device->client_data_rwsem); xa_init_flags(&device->compat_devs, XA_FLAGS_ALLOC); mutex_init(&device->compat_devs_mutex); init_completion(&device->unreg_completion); INIT_WORK(&device->unregistration_work, ib_unregister_work); spin_lock_init(&device->cq_pools_lock); for (i = 0; i < ARRAY_SIZE(device->cq_pools); i++) INIT_LIST_HEAD(&device->cq_pools[i]); rwlock_init(&device->cache_lock); device->uverbs_cmd_mask = BIT_ULL(IB_USER_VERBS_CMD_ALLOC_MW) | BIT_ULL(IB_USER_VERBS_CMD_ALLOC_PD) | BIT_ULL(IB_USER_VERBS_CMD_ATTACH_MCAST) | BIT_ULL(IB_USER_VERBS_CMD_CLOSE_XRCD) | BIT_ULL(IB_USER_VERBS_CMD_CREATE_AH) | BIT_ULL(IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) | BIT_ULL(IB_USER_VERBS_CMD_CREATE_CQ) | BIT_ULL(IB_USER_VERBS_CMD_CREATE_QP) | BIT_ULL(IB_USER_VERBS_CMD_CREATE_SRQ) | BIT_ULL(IB_USER_VERBS_CMD_CREATE_XSRQ) | BIT_ULL(IB_USER_VERBS_CMD_DEALLOC_MW) | BIT_ULL(IB_USER_VERBS_CMD_DEALLOC_PD) | BIT_ULL(IB_USER_VERBS_CMD_DEREG_MR) | BIT_ULL(IB_USER_VERBS_CMD_DESTROY_AH) | BIT_ULL(IB_USER_VERBS_CMD_DESTROY_CQ) | BIT_ULL(IB_USER_VERBS_CMD_DESTROY_QP) | BIT_ULL(IB_USER_VERBS_CMD_DESTROY_SRQ) | BIT_ULL(IB_USER_VERBS_CMD_DETACH_MCAST) | BIT_ULL(IB_USER_VERBS_CMD_GET_CONTEXT) | BIT_ULL(IB_USER_VERBS_CMD_MODIFY_QP) | BIT_ULL(IB_USER_VERBS_CMD_MODIFY_SRQ) | BIT_ULL(IB_USER_VERBS_CMD_OPEN_QP) | BIT_ULL(IB_USER_VERBS_CMD_OPEN_XRCD) | BIT_ULL(IB_USER_VERBS_CMD_QUERY_DEVICE) | BIT_ULL(IB_USER_VERBS_CMD_QUERY_PORT) | BIT_ULL(IB_USER_VERBS_CMD_QUERY_QP) | BIT_ULL(IB_USER_VERBS_CMD_QUERY_SRQ) | BIT_ULL(IB_USER_VERBS_CMD_REG_MR) | BIT_ULL(IB_USER_VERBS_CMD_REREG_MR) | BIT_ULL(IB_USER_VERBS_CMD_RESIZE_CQ); mutex_init(&device->subdev_lock); INIT_LIST_HEAD(&device->subdev_list_head); INIT_LIST_HEAD(&device->subdev_list); return device; } EXPORT_SYMBOL(_ib_alloc_device); /** * ib_dealloc_device - free an IB device struct * @device:structure to free * * Free a structure allocated with ib_alloc_device(). */ void ib_dealloc_device(struct ib_device *device) { if (device->ops.dealloc_driver) device->ops.dealloc_driver(device); /* * ib_unregister_driver() requires all devices to remain in the xarray * while their ops are callable. The last op we call is dealloc_driver * above. This is needed to create a fence on op callbacks prior to * allowing the driver module to unload. */ down_write(&devices_rwsem); if (xa_load(&devices, device->index) == device) xa_erase(&devices, device->index); up_write(&devices_rwsem); /* Expedite releasing netdev references */ free_netdevs(device); WARN_ON(!xa_empty(&device->compat_devs)); WARN_ON(!xa_empty(&device->client_data)); WARN_ON(refcount_read(&device->refcount)); rdma_restrack_clean(device); /* Balances with device_initialize */ put_device(&device->dev); } EXPORT_SYMBOL(ib_dealloc_device); /* * add_client_context() and remove_client_context() must be safe against * parallel calls on the same device - registration/unregistration of both the * device and client can be occurring in parallel. * * The routines need to be a fence, any caller must not return until the add * or remove is fully completed. */ static int add_client_context(struct ib_device *device, struct ib_client *client) { int ret = 0; if (!device->kverbs_provider && !client->no_kverbs_req) return 0; down_write(&device->client_data_rwsem); /* * So long as the client is registered hold both the client and device * unregistration locks. 
 */
	if (!refcount_inc_not_zero(&client->uses))
		goto out_unlock;
	refcount_inc(&device->refcount);

	/*
	 * Another caller to add_client_context got here first and has already
	 * completely initialized context.
	 */
	if (xa_get_mark(&device->client_data, client->client_id,
			CLIENT_DATA_REGISTERED))
		goto out;

	ret = xa_err(xa_store(&device->client_data, client->client_id, NULL,
			      GFP_KERNEL));
	if (ret)
		goto out;
	downgrade_write(&device->client_data_rwsem);
	if (client->add) {
		if (client->add(device)) {
			/*
			 * If a client fails to add then the error code is
			 * ignored, but we won't call any more ops on this
			 * client.
			 */
			xa_erase(&device->client_data, client->client_id);
			up_read(&device->client_data_rwsem);
			ib_device_put(device);
			ib_client_put(client);
			return 0;
		}
	}

	/* Readers shall not see a client until add has been completed */
	xa_set_mark(&device->client_data, client->client_id,
		    CLIENT_DATA_REGISTERED);
	up_read(&device->client_data_rwsem);
	return 0;

out:
	ib_device_put(device);
	ib_client_put(client);
out_unlock:
	up_write(&device->client_data_rwsem);
	return ret;
}

static void remove_client_context(struct ib_device *device,
				  unsigned int client_id)
{
	struct ib_client *client;
	void *client_data;

	down_write(&device->client_data_rwsem);
	if (!xa_get_mark(&device->client_data, client_id,
			 CLIENT_DATA_REGISTERED)) {
		up_write(&device->client_data_rwsem);
		return;
	}
	client_data = xa_load(&device->client_data, client_id);
	xa_clear_mark(&device->client_data, client_id, CLIENT_DATA_REGISTERED);
	client = xa_load(&clients, client_id);
	up_write(&device->client_data_rwsem);

	/*
	 * Notice we cannot be holding any exclusive locks when calling the
	 * remove callback as the remove callback can recurse back into any
	 * public functions in this module and thus try for any locks those
	 * functions take.
	 *
	 * For this reason clients and drivers should not call the
	 * unregistration functions while holding any locks.
	 */
	if (client->remove)
		client->remove(device, client_data);

	xa_erase(&device->client_data, client_id);
	ib_device_put(device);
	ib_client_put(client);
}

static int alloc_port_data(struct ib_device *device)
{
	struct ib_port_data_rcu *pdata_rcu;
	u32 port;

	if (device->port_data)
		return 0;

	/* This can only be called once the physical port range is defined */
	if (WARN_ON(!device->phys_port_cnt))
		return -EINVAL;

	/* Reserve U32_MAX so the logic to go over all the ports is sane */
	if (WARN_ON(device->phys_port_cnt == U32_MAX))
		return -EINVAL;

	/*
	 * device->port_data is indexed directly by the port number to make
	 * access to this data as efficient as possible.
	 *
	 * Therefore port_data is declared as a 1 based array with potential
	 * empty slots at the beginning.
	 */
	pdata_rcu = kzalloc(struct_size(pdata_rcu, pdata,
					size_add(rdma_end_port(device), 1)),
			    GFP_KERNEL);
	if (!pdata_rcu)
		return -ENOMEM;
	/*
	 * The rcu_head is put in front of the port data array and the stored
	 * pointer is adjusted since we never need to see that member until
	 * kfree_rcu.
*/ device->port_data = pdata_rcu->pdata; rdma_for_each_port (device, port) { struct ib_port_data *pdata = &device->port_data[port]; pdata->ib_dev = device; spin_lock_init(&pdata->pkey_list_lock); INIT_LIST_HEAD(&pdata->pkey_list); spin_lock_init(&pdata->netdev_lock); INIT_HLIST_NODE(&pdata->ndev_hash_link); } return 0; } static int verify_immutable(const struct ib_device *dev, u32 port) { return WARN_ON(!rdma_cap_ib_mad(dev, port) && rdma_max_mad_size(dev, port) != 0); } static int setup_port_data(struct ib_device *device) { u32 port; int ret; ret = alloc_port_data(device); if (ret) return ret; rdma_for_each_port (device, port) { struct ib_port_data *pdata = &device->port_data[port]; ret = device->ops.get_port_immutable(device, port, &pdata->immutable); if (ret) return ret; if (verify_immutable(device, port)) return -EINVAL; } return 0; } /** * ib_port_immutable_read() - Read rdma port's immutable data * @dev: IB device * @port: port number whose immutable data to read. It starts with index 1 and * valid upto including rdma_end_port(). */ const struct ib_port_immutable* ib_port_immutable_read(struct ib_device *dev, unsigned int port) { WARN_ON(!rdma_is_port_valid(dev, port)); return &dev->port_data[port].immutable; } EXPORT_SYMBOL(ib_port_immutable_read); void ib_get_device_fw_str(struct ib_device *dev, char *str) { if (dev->ops.get_dev_fw_str) dev->ops.get_dev_fw_str(dev, str); else str[0] = '\0'; } EXPORT_SYMBOL(ib_get_device_fw_str); static void ib_policy_change_task(struct work_struct *work) { struct ib_device *dev; unsigned long index; down_read(&devices_rwsem); xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) { unsigned int i; rdma_for_each_port (dev, i) { u64 sp; ib_get_cached_subnet_prefix(dev, i, &sp); ib_security_cache_change(dev, i, sp); } } up_read(&devices_rwsem); } static int ib_security_change(struct notifier_block *nb, unsigned long event, void *lsm_data) { if (event != LSM_POLICY_CHANGE) return NOTIFY_DONE; schedule_work(&ib_policy_change_work); ib_mad_agent_security_change(); return NOTIFY_OK; } static void compatdev_release(struct device *dev) { struct ib_core_device *cdev = container_of(dev, struct ib_core_device, dev); kfree(cdev); } static int add_one_compat_dev(struct ib_device *device, struct rdma_dev_net *rnet) { struct ib_core_device *cdev; int ret; lockdep_assert_held(&rdma_nets_rwsem); if (!ib_devices_shared_netns) return 0; /* * Create and add compat device in all namespaces other than where it * is currently bound to. */ if (net_eq(read_pnet(&rnet->net), read_pnet(&device->coredev.rdma_net))) return 0; /* * The first of init_net() or ib_register_device() to take the * compat_devs_mutex wins and gets to add the device. Others will wait * for completion here. 
*/ mutex_lock(&device->compat_devs_mutex); cdev = xa_load(&device->compat_devs, rnet->id); if (cdev) { ret = 0; goto done; } ret = xa_reserve(&device->compat_devs, rnet->id, GFP_KERNEL); if (ret) goto done; cdev = kzalloc(sizeof(*cdev), GFP_KERNEL); if (!cdev) { ret = -ENOMEM; goto cdev_err; } cdev->dev.parent = device->dev.parent; rdma_init_coredev(cdev, device, read_pnet(&rnet->net)); cdev->dev.release = compatdev_release; ret = dev_set_name(&cdev->dev, "%s", dev_name(&device->dev)); if (ret) goto add_err; ret = device_add(&cdev->dev); if (ret) goto add_err; ret = ib_setup_port_attrs(cdev); if (ret) goto port_err; ret = xa_err(xa_store(&device->compat_devs, rnet->id, cdev, GFP_KERNEL)); if (ret) goto insert_err; mutex_unlock(&device->compat_devs_mutex); return 0; insert_err: ib_free_port_attrs(cdev); port_err: device_del(&cdev->dev); add_err: put_device(&cdev->dev); cdev_err: xa_release(&device->compat_devs, rnet->id); done: mutex_unlock(&device->compat_devs_mutex); return ret; } static void remove_one_compat_dev(struct ib_device *device, u32 id) { struct ib_core_device *cdev; mutex_lock(&device->compat_devs_mutex); cdev = xa_erase(&device->compat_devs, id); mutex_unlock(&device->compat_devs_mutex); if (cdev) { ib_free_port_attrs(cdev); device_del(&cdev->dev); put_device(&cdev->dev); } } static void remove_compat_devs(struct ib_device *device) { struct ib_core_device *cdev; unsigned long index; xa_for_each (&device->compat_devs, index, cdev) remove_one_compat_dev(device, index); } static int add_compat_devs(struct ib_device *device) { struct rdma_dev_net *rnet; unsigned long index; int ret = 0; lockdep_assert_held(&devices_rwsem); down_read(&rdma_nets_rwsem); xa_for_each (&rdma_nets, index, rnet) { ret = add_one_compat_dev(device, rnet); if (ret) break; } up_read(&rdma_nets_rwsem); return ret; } static void remove_all_compat_devs(void) { struct ib_compat_device *cdev; struct ib_device *dev; unsigned long index; down_read(&devices_rwsem); xa_for_each (&devices, index, dev) { unsigned long c_index = 0; /* Hold nets_rwsem so that any other thread modifying this * system param can sync with this thread. */ down_read(&rdma_nets_rwsem); xa_for_each (&dev->compat_devs, c_index, cdev) remove_one_compat_dev(dev, c_index); up_read(&rdma_nets_rwsem); } up_read(&devices_rwsem); } static int add_all_compat_devs(void) { struct rdma_dev_net *rnet; struct ib_device *dev; unsigned long index; int ret = 0; down_read(&devices_rwsem); xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) { unsigned long net_index = 0; /* Hold nets_rwsem so that any other thread modifying this * system param can sync with this thread. */ down_read(&rdma_nets_rwsem); xa_for_each (&rdma_nets, net_index, rnet) { ret = add_one_compat_dev(dev, rnet); if (ret) break; } up_read(&rdma_nets_rwsem); } up_read(&devices_rwsem); if (ret) remove_all_compat_devs(); return ret; } int rdma_compatdev_set(u8 enable) { struct rdma_dev_net *rnet; unsigned long index; int ret = 0; down_write(&rdma_nets_rwsem); if (ib_devices_shared_netns == enable) { up_write(&rdma_nets_rwsem); return 0; } /* enable/disable of compat devices is not supported * when more than default init_net exists. 
*/ xa_for_each (&rdma_nets, index, rnet) { ret++; break; } if (!ret) ib_devices_shared_netns = enable; up_write(&rdma_nets_rwsem); if (ret) return -EBUSY; if (enable) ret = add_all_compat_devs(); else remove_all_compat_devs(); return ret; } static void rdma_dev_exit_net(struct net *net) { struct rdma_dev_net *rnet = rdma_net_to_dev_net(net); struct ib_device *dev; unsigned long index; int ret; down_write(&rdma_nets_rwsem); /* * Prevent the ID from being re-used and hide the id from xa_for_each. */ ret = xa_err(xa_store(&rdma_nets, rnet->id, NULL, GFP_KERNEL)); WARN_ON(ret); up_write(&rdma_nets_rwsem); down_read(&devices_rwsem); xa_for_each (&devices, index, dev) { get_device(&dev->dev); /* * Release the devices_rwsem so that pontentially blocking * device_del, doesn't hold the devices_rwsem for too long. */ up_read(&devices_rwsem); remove_one_compat_dev(dev, rnet->id); /* * If the real device is in the NS then move it back to init. */ rdma_dev_change_netns(dev, net, &init_net); put_device(&dev->dev); down_read(&devices_rwsem); } up_read(&devices_rwsem); rdma_nl_net_exit(rnet); xa_erase(&rdma_nets, rnet->id); } static __net_init int rdma_dev_init_net(struct net *net) { struct rdma_dev_net *rnet = rdma_net_to_dev_net(net); unsigned long index; struct ib_device *dev; int ret; write_pnet(&rnet->net, net); ret = rdma_nl_net_init(rnet); if (ret) return ret; /* No need to create any compat devices in default init_net. */ if (net_eq(net, &init_net)) return 0; ret = xa_alloc(&rdma_nets, &rnet->id, rnet, xa_limit_32b, GFP_KERNEL); if (ret) { rdma_nl_net_exit(rnet); return ret; } down_read(&devices_rwsem); xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) { /* Hold nets_rwsem so that netlink command cannot change * system configuration for device sharing mode. */ down_read(&rdma_nets_rwsem); ret = add_one_compat_dev(dev, rnet); up_read(&rdma_nets_rwsem); if (ret) break; } up_read(&devices_rwsem); if (ret) rdma_dev_exit_net(net); return ret; } /* * Assign the unique string device name and the unique device index. This is * undone by ib_dealloc_device. */ static int assign_name(struct ib_device *device, const char *name) { static u32 last_id; int ret; down_write(&devices_rwsem); /* Assign a unique name to the device */ if (strchr(name, '%')) ret = alloc_name(device, name); else ret = dev_set_name(&device->dev, name); if (ret) goto out; if (__ib_device_get_by_name(dev_name(&device->dev))) { ret = -ENFILE; goto out; } strscpy(device->name, dev_name(&device->dev), IB_DEVICE_NAME_MAX); ret = xa_alloc_cyclic(&devices, &device->index, device, xa_limit_31b, &last_id, GFP_KERNEL); if (ret > 0) ret = 0; out: up_write(&devices_rwsem); return ret; } /* * setup_device() allocates memory and sets up data that requires calling the * device ops, this is the only reason these actions are not done during * ib_alloc_device. It is undone by ib_dealloc_device(). 
*/ static int setup_device(struct ib_device *device) { struct ib_udata uhw = {.outlen = 0, .inlen = 0}; int ret; ib_device_check_mandatory(device); ret = setup_port_data(device); if (ret) { dev_warn(&device->dev, "Couldn't create per-port data\n"); return ret; } memset(&device->attrs, 0, sizeof(device->attrs)); ret = device->ops.query_device(device, &device->attrs, &uhw); if (ret) { dev_warn(&device->dev, "Couldn't query the device attributes\n"); return ret; } return 0; } static void disable_device(struct ib_device *device) { u32 cid; WARN_ON(!refcount_read(&device->refcount)); down_write(&devices_rwsem); xa_clear_mark(&devices, device->index, DEVICE_REGISTERED); up_write(&devices_rwsem); /* * Remove clients in LIFO order, see assign_client_id. This could be * more efficient if xarray learns to reverse iterate. Since no new * clients can be added to this ib_device past this point we only need * the maximum possible client_id value here. */ down_read(&clients_rwsem); cid = highest_client_id; up_read(&clients_rwsem); while (cid) { cid--; remove_client_context(device, cid); } ib_cq_pool_cleanup(device); /* Pairs with refcount_set in enable_device */ ib_device_put(device); wait_for_completion(&device->unreg_completion); /* * compat devices must be removed after device refcount drops to zero. * Otherwise init_net() may add more compatdevs after removing compat * devices and before device is disabled. */ remove_compat_devs(device); } /* * An enabled device is visible to all clients and to all the public facing * APIs that return a device pointer. This always returns with a new get, even * if it fails. */ static int enable_device_and_get(struct ib_device *device) { struct ib_client *client; unsigned long index; int ret = 0; /* * One ref belongs to the xa and the other belongs to this * thread. This is needed to guard against parallel unregistration. */ refcount_set(&device->refcount, 2); down_write(&devices_rwsem); xa_set_mark(&devices, device->index, DEVICE_REGISTERED); /* * By using downgrade_write() we ensure that no other thread can clear * DEVICE_REGISTERED while we are completing the client setup. */ downgrade_write(&devices_rwsem); if (device->ops.enable_driver) { ret = device->ops.enable_driver(device); if (ret) goto out; } down_read(&clients_rwsem); xa_for_each_marked (&clients, index, client, CLIENT_REGISTERED) { ret = add_client_context(device, client); if (ret) break; } up_read(&clients_rwsem); if (!ret) ret = add_compat_devs(device); out: up_read(&devices_rwsem); return ret; } static void prevent_dealloc_device(struct ib_device *ib_dev) { } static void ib_device_notify_register(struct ib_device *device) { struct net_device *netdev; u32 port; int ret; ret = rdma_nl_notify_event(device, 0, RDMA_REGISTER_EVENT); if (ret) return; rdma_for_each_port(device, port) { netdev = ib_device_get_netdev(device, port); if (!netdev) continue; ret = rdma_nl_notify_event(device, port, RDMA_NETDEV_ATTACH_EVENT); dev_put(netdev); if (ret) return; } } /** * ib_register_device - Register an IB device with IB core * @device: Device to register * @name: unique string device name. This may include a '%' which will * cause a unique index to be added to the passed device name. * @dma_device: pointer to a DMA-capable device. If %NULL, then the IB * device will be used. In this case the caller should fully * setup the ibdev for DMA. This usually means using dma_virt_ops. * * Low-level drivers use ib_register_device() to register their * devices with the IB core. 
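 *
 * A minimal registration sketch as seen from a provider's probe path; the
 * "foo" device structure, its ops table and the parent PCI device below are
 * hypothetical and only illustrate the calling convention:
 *
 *	fdev = ib_alloc_device(foo_dev, ibdev);
 *	if (!fdev)
 *		return -ENOMEM;
 *	ib_set_device_ops(&fdev->ibdev, &foo_ib_ops);
 *	fdev->ibdev.phys_port_cnt = 1;
 *	ret = ib_register_device(&fdev->ibdev, "foo%d", &pdev->dev);
 *	if (ret)
 *		ib_dealloc_device(&fdev->ibdev);
 *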
 * All registered clients will receive a callback for each device that is
 * added. @device must be allocated with ib_alloc_device().
 *
 * If the driver uses ops.dealloc_driver and calls any ib_unregister_device()
 * asynchronously then the device pointer may become freed as soon as this
 * function returns.
 */
int ib_register_device(struct ib_device *device, const char *name,
		       struct device *dma_device)
{
	int ret;

	ret = assign_name(device, name);
	if (ret)
		return ret;

	/*
	 * If the caller does not provide a DMA capable device then the IB
	 * core will set up ib_sge and scatterlist structures that stash the
	 * kernel virtual address into the address field.
	 */
	WARN_ON(dma_device && !dma_device->dma_parms);
	device->dma_device = dma_device;

	ret = setup_device(device);
	if (ret)
		return ret;

	ret = ib_cache_setup_one(device);
	if (ret) {
		dev_warn(&device->dev,
			 "Couldn't set up InfiniBand P_Key/GID cache\n");
		return ret;
	}

	device->groups[0] = &ib_dev_attr_group;
	device->groups[1] = device->ops.device_group;
	ret = ib_setup_device_attrs(device);
	if (ret)
		goto cache_cleanup;

	ib_device_register_rdmacg(device);

	rdma_counter_init(device);

	/*
	 * Ensure that ADD uevent is not fired because it
	 * is too early and the device is not initialized yet.
	 */
	dev_set_uevent_suppress(&device->dev, true);
	ret = device_add(&device->dev);
	if (ret)
		goto cg_cleanup;

	ret = ib_setup_port_attrs(&device->coredev);
	if (ret) {
		dev_warn(&device->dev,
			 "Couldn't register device with driver model\n");
		goto dev_cleanup;
	}

	ret = enable_device_and_get(device);
	if (ret) {
		void (*dealloc_fn)(struct ib_device *);

		/*
		 * If we hit this error flow then we don't want to
		 * automatically dealloc the device since the caller is
		 * expected to call ib_dealloc_device() after
		 * ib_register_device() fails. This is tricky due to the
		 * possibility for a parallel unregistration along with this
		 * error flow. Since we have a refcount here we know any
		 * parallel flow is stopped in disable_device and will see the
		 * special dealloc_driver pointer, causing the responsibility
		 * to ib_dealloc_device() to revert back to this thread.
		 */
		dealloc_fn = device->ops.dealloc_driver;
		device->ops.dealloc_driver = prevent_dealloc_device;
		ib_device_put(device);
		__ib_unregister_device(device);
		device->ops.dealloc_driver = dealloc_fn;
		dev_set_uevent_suppress(&device->dev, false);
		return ret;
	}
	dev_set_uevent_suppress(&device->dev, false);
	/* Mark for userspace that device is ready */
	kobject_uevent(&device->dev.kobj, KOBJ_ADD);

	ib_device_notify_register(device);
	ib_device_put(device);

	return 0;

dev_cleanup:
	device_del(&device->dev);
cg_cleanup:
	dev_set_uevent_suppress(&device->dev, false);
	ib_device_unregister_rdmacg(device);
cache_cleanup:
	ib_cache_cleanup_one(device);
	return ret;
}
EXPORT_SYMBOL(ib_register_device);

/* Callers must hold a get on the device. */
static void __ib_unregister_device(struct ib_device *ib_dev)
{
	struct ib_device *sub, *tmp;

	mutex_lock(&ib_dev->subdev_lock);
	list_for_each_entry_safe_reverse(sub, tmp,
					 &ib_dev->subdev_list_head,
					 subdev_list) {
		list_del(&sub->subdev_list);
		ib_dev->ops.del_sub_dev(sub);
		ib_device_put(ib_dev);
	}
	mutex_unlock(&ib_dev->subdev_lock);

	/*
	 * We have a registration lock so that all the calls to unregister are
	 * fully fenced; once any unregister returns the device is truly
	 * unregistered even if multiple callers are unregistering it at the
	 * same time. This also interacts with the registration flow and
	 * provides sane semantics if register and unregister are racing.
*/ mutex_lock(&ib_dev->unregistration_lock); if (!refcount_read(&ib_dev->refcount)) goto out; disable_device(ib_dev); rdma_nl_notify_event(ib_dev, 0, RDMA_UNREGISTER_EVENT); /* Expedite removing unregistered pointers from the hash table */ free_netdevs(ib_dev); ib_free_port_attrs(&ib_dev->coredev); device_del(&ib_dev->dev); ib_device_unregister_rdmacg(ib_dev); ib_cache_cleanup_one(ib_dev); /* * Drivers using the new flow may not call ib_dealloc_device except * in error unwind prior to registration success. */ if (ib_dev->ops.dealloc_driver && ib_dev->ops.dealloc_driver != prevent_dealloc_device) { WARN_ON(kref_read(&ib_dev->dev.kobj.kref) <= 1); ib_dealloc_device(ib_dev); } out: mutex_unlock(&ib_dev->unregistration_lock); } /** * ib_unregister_device - Unregister an IB device * @ib_dev: The device to unregister * * Unregister an IB device. All clients will receive a remove callback. * * Callers should call this routine only once, and protect against races with * registration. Typically it should only be called as part of a remove * callback in an implementation of driver core's struct device_driver and * related. * * If ops.dealloc_driver is used then ib_dev will be freed upon return from * this function. */ void ib_unregister_device(struct ib_device *ib_dev) { get_device(&ib_dev->dev); __ib_unregister_device(ib_dev); put_device(&ib_dev->dev); } EXPORT_SYMBOL(ib_unregister_device); /** * ib_unregister_device_and_put - Unregister a device while holding a 'get' * @ib_dev: The device to unregister * * This is the same as ib_unregister_device(), except it includes an internal * ib_device_put() that should match a 'get' obtained by the caller. * * It is safe to call this routine concurrently from multiple threads while * holding the 'get'. When the function returns the device is fully * unregistered. * * Drivers using this flow MUST use the driver_unregister callback to clean up * their resources associated with the device and dealloc it. */ void ib_unregister_device_and_put(struct ib_device *ib_dev) { WARN_ON(!ib_dev->ops.dealloc_driver); get_device(&ib_dev->dev); ib_device_put(ib_dev); __ib_unregister_device(ib_dev); put_device(&ib_dev->dev); } EXPORT_SYMBOL(ib_unregister_device_and_put); /** * ib_unregister_driver - Unregister all IB devices for a driver * @driver_id: The driver to unregister * * This implements a fence for device unregistration. It only returns once all * devices associated with the driver_id have fully completed their * unregistration and returned from ib_unregister_device*(). * * If device's are not yet unregistered it goes ahead and starts unregistering * them. * * This does not block creation of new devices with the given driver_id, that * is the responsibility of the caller. 
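 *
 * A sketch of the usual call site, a driver's module exit path (the
 * RDMA_DRIVER_FOO id and the foo naming are hypothetical):
 *
 *	static void __exit foo_exit_module(void)
 *	{
 *		ib_unregister_driver(RDMA_DRIVER_FOO);
 *	}
 *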
*/ void ib_unregister_driver(enum rdma_driver_id driver_id) { struct ib_device *ib_dev; unsigned long index; down_read(&devices_rwsem); xa_for_each (&devices, index, ib_dev) { if (ib_dev->ops.driver_id != driver_id) continue; get_device(&ib_dev->dev); up_read(&devices_rwsem); WARN_ON(!ib_dev->ops.dealloc_driver); __ib_unregister_device(ib_dev); put_device(&ib_dev->dev); down_read(&devices_rwsem); } up_read(&devices_rwsem); } EXPORT_SYMBOL(ib_unregister_driver); static void ib_unregister_work(struct work_struct *work) { struct ib_device *ib_dev = container_of(work, struct ib_device, unregistration_work); __ib_unregister_device(ib_dev); put_device(&ib_dev->dev); } /** * ib_unregister_device_queued - Unregister a device using a work queue * @ib_dev: The device to unregister * * This schedules an asynchronous unregistration using a WQ for the device. A * driver should use this to avoid holding locks while doing unregistration, * such as holding the RTNL lock. * * Drivers using this API must use ib_unregister_driver before module unload * to ensure that all scheduled unregistrations have completed. */ void ib_unregister_device_queued(struct ib_device *ib_dev) { WARN_ON(!refcount_read(&ib_dev->refcount)); WARN_ON(!ib_dev->ops.dealloc_driver); get_device(&ib_dev->dev); if (!queue_work(ib_unreg_wq, &ib_dev->unregistration_work)) put_device(&ib_dev->dev); } EXPORT_SYMBOL(ib_unregister_device_queued); /* * The caller must pass in a device that has the kref held and the refcount * released. If the device is in cur_net and still registered then it is moved * into net. */ static int rdma_dev_change_netns(struct ib_device *device, struct net *cur_net, struct net *net) { int ret2 = -EINVAL; int ret; mutex_lock(&device->unregistration_lock); /* * If a device not under ib_device_get() or if the unregistration_lock * is not held, the namespace can be changed, or it can be unregistered. * Check again under the lock. */ if (refcount_read(&device->refcount) == 0 || !net_eq(cur_net, read_pnet(&device->coredev.rdma_net))) { ret = -ENODEV; goto out; } kobject_uevent(&device->dev.kobj, KOBJ_REMOVE); disable_device(device); /* * At this point no one can be using the device, so it is safe to * change the namespace. */ write_pnet(&device->coredev.rdma_net, net); down_read(&devices_rwsem); /* * Currently rdma devices are system wide unique. So the device name * is guaranteed free in the new namespace. Publish the new namespace * at the sysfs level. */ ret = device_rename(&device->dev, dev_name(&device->dev)); up_read(&devices_rwsem); if (ret) { dev_warn(&device->dev, "%s: Couldn't rename device after namespace change\n", __func__); /* Try and put things back and re-enable the device */ write_pnet(&device->coredev.rdma_net, cur_net); } ret2 = enable_device_and_get(device); if (ret2) { /* * This shouldn't really happen, but if it does, let the user * retry at later point. So don't disable the device. 
*/ dev_warn(&device->dev, "%s: Couldn't re-enable device after namespace change\n", __func__); } kobject_uevent(&device->dev.kobj, KOBJ_ADD); ib_device_put(device); out: mutex_unlock(&device->unregistration_lock); if (ret) return ret; return ret2; } int ib_device_set_netns_put(struct sk_buff *skb, struct ib_device *dev, u32 ns_fd) { struct net *net; int ret; net = get_net_ns_by_fd(ns_fd); if (IS_ERR(net)) { ret = PTR_ERR(net); goto net_err; } if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) { ret = -EPERM; goto ns_err; } /* * All the ib_clients, including uverbs, are reset when the namespace is * changed and this cannot be blocked waiting for userspace to do * something, so disassociation is mandatory. */ if (!dev->ops.disassociate_ucontext || ib_devices_shared_netns) { ret = -EOPNOTSUPP; goto ns_err; } get_device(&dev->dev); ib_device_put(dev); ret = rdma_dev_change_netns(dev, current->nsproxy->net_ns, net); put_device(&dev->dev); put_net(net); return ret; ns_err: put_net(net); net_err: ib_device_put(dev); return ret; } static struct pernet_operations rdma_dev_net_ops = { .init = rdma_dev_init_net, .exit = rdma_dev_exit_net, .id = &rdma_dev_net_id, .size = sizeof(struct rdma_dev_net), }; static int assign_client_id(struct ib_client *client) { int ret; lockdep_assert_held(&clients_rwsem); /* * The add/remove callbacks must be called in FIFO/LIFO order. To * achieve this we assign client_ids so they are sorted in * registration order. */ client->client_id = highest_client_id; ret = xa_insert(&clients, client->client_id, client, GFP_KERNEL); if (ret) return ret; highest_client_id++; xa_set_mark(&clients, client->client_id, CLIENT_REGISTERED); return 0; } static void remove_client_id(struct ib_client *client) { down_write(&clients_rwsem); xa_erase(&clients, client->client_id); for (; highest_client_id; highest_client_id--) if (xa_load(&clients, highest_client_id - 1)) break; up_write(&clients_rwsem); } /** * ib_register_client - Register an IB client * @client:Client to register * * Upper level users of the IB drivers can use ib_register_client() to * register callbacks for IB device addition and removal. When an IB * device is added, each registered client's add method will be called * (in the order the clients were registered), and when a device is * removed, each client's remove method will be called (in the reverse * order that clients were registered). In addition, when * ib_register_client() is called, the client will receive an add * callback for all devices already registered. */ int ib_register_client(struct ib_client *client) { struct ib_device *device; unsigned long index; bool need_unreg = false; int ret; refcount_set(&client->uses, 1); init_completion(&client->uses_zero); /* * The devices_rwsem is held in write mode to ensure that a racing * ib_register_device() sees a consisent view of clients and devices. */ down_write(&devices_rwsem); down_write(&clients_rwsem); ret = assign_client_id(client); if (ret) goto out; need_unreg = true; xa_for_each_marked (&devices, index, device, DEVICE_REGISTERED) { ret = add_client_context(device, client); if (ret) goto out; } ret = 0; out: up_write(&clients_rwsem); up_write(&devices_rwsem); if (need_unreg && ret) ib_unregister_client(client); return ret; } EXPORT_SYMBOL(ib_register_client); /** * ib_unregister_client - Unregister an IB client * @client:Client to unregister * * Upper level users use ib_unregister_client() to remove their client * registration. 
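 *
 * A sketch of the usual pairing with ib_register_client(); the "foo" client
 * and its callbacks are hypothetical:
 *
 *	static int foo_add_one(struct ib_device *device)
 *	{
 *		return 0;
 *	}
 *
 *	static void foo_remove_one(struct ib_device *device, void *client_data)
 *	{
 *	}
 *
 *	static struct ib_client foo_client = {
 *		.name	= "foo",
 *		.add	= foo_add_one,
 *		.remove	= foo_remove_one,
 *	};
 *
 *	ib_register_client(&foo_client);	(typically from module init)
 *	ib_unregister_client(&foo_client);	(typically from module exit)
 *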
When ib_unregister_client() is called, the client * will receive a remove callback for each IB device still registered. * * This is a full fence, once it returns no client callbacks will be called, * or are running in another thread. */ void ib_unregister_client(struct ib_client *client) { struct ib_device *device; unsigned long index; down_write(&clients_rwsem); ib_client_put(client); xa_clear_mark(&clients, client->client_id, CLIENT_REGISTERED); up_write(&clients_rwsem); /* We do not want to have locks while calling client->remove() */ rcu_read_lock(); xa_for_each (&devices, index, device) { if (!ib_device_try_get(device)) continue; rcu_read_unlock(); remove_client_context(device, client->client_id); ib_device_put(device); rcu_read_lock(); } rcu_read_unlock(); /* * remove_client_context() is not a fence, it can return even though a * removal is ongoing. Wait until all removals are completed. */ wait_for_completion(&client->uses_zero); remove_client_id(client); } EXPORT_SYMBOL(ib_unregister_client); static int __ib_get_global_client_nl_info(const char *client_name, struct ib_client_nl_info *res) { struct ib_client *client; unsigned long index; int ret = -ENOENT; down_read(&clients_rwsem); xa_for_each_marked (&clients, index, client, CLIENT_REGISTERED) { if (strcmp(client->name, client_name) != 0) continue; if (!client->get_global_nl_info) { ret = -EOPNOTSUPP; break; } ret = client->get_global_nl_info(res); if (WARN_ON(ret == -ENOENT)) ret = -EINVAL; if (!ret && res->cdev) get_device(res->cdev); break; } up_read(&clients_rwsem); return ret; } static int __ib_get_client_nl_info(struct ib_device *ibdev, const char *client_name, struct ib_client_nl_info *res) { unsigned long index; void *client_data; int ret = -ENOENT; down_read(&ibdev->client_data_rwsem); xan_for_each_marked (&ibdev->client_data, index, client_data, CLIENT_DATA_REGISTERED) { struct ib_client *client = xa_load(&clients, index); if (!client || strcmp(client->name, client_name) != 0) continue; if (!client->get_nl_info) { ret = -EOPNOTSUPP; break; } ret = client->get_nl_info(ibdev, client_data, res); if (WARN_ON(ret == -ENOENT)) ret = -EINVAL; /* * The cdev is guaranteed valid as long as we are inside the * client_data_rwsem as remove_one can't be called. Keep it * valid for the caller. */ if (!ret && res->cdev) get_device(res->cdev); break; } up_read(&ibdev->client_data_rwsem); return ret; } /** * ib_get_client_nl_info - Fetch the nl_info from a client * @ibdev: IB device * @client_name: Name of the client * @res: Result of the query */ int ib_get_client_nl_info(struct ib_device *ibdev, const char *client_name, struct ib_client_nl_info *res) { int ret; if (ibdev) ret = __ib_get_client_nl_info(ibdev, client_name, res); else ret = __ib_get_global_client_nl_info(client_name, res); #ifdef CONFIG_MODULES if (ret == -ENOENT) { request_module("rdma-client-%s", client_name); if (ibdev) ret = __ib_get_client_nl_info(ibdev, client_name, res); else ret = __ib_get_global_client_nl_info(client_name, res); } #endif if (ret) { if (ret == -ENOENT) return -EOPNOTSUPP; return ret; } if (WARN_ON(!res->cdev)) return -EINVAL; return 0; } /** * ib_set_client_data - Set IB client context * @device:Device to set context for * @client:Client to set context for * @data:Context to set * * ib_set_client_data() sets client context data that can be retrieved with * ib_get_client_data(). This can only be called while the client is * registered to the device, once the ib_client remove() callback returns this * cannot be called. 
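 *
 * A sketch of the intended pattern from a client's add() callback; the
 * foo_* names are hypothetical, and the stored pointer is later retrieved
 * with ib_get_client_data() and freed from the remove() callback:
 *
 *	static int foo_add_one(struct ib_device *device)
 *	{
 *		struct foo_context *ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
 *
 *		if (!ctx)
 *			return -ENOMEM;
 *		ib_set_client_data(device, &foo_client, ctx);
 *		return 0;
 *	}
 *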
*/ void ib_set_client_data(struct ib_device *device, struct ib_client *client, void *data) { void *rc; if (WARN_ON(IS_ERR(data))) data = NULL; rc = xa_store(&device->client_data, client->client_id, data, GFP_KERNEL); WARN_ON(xa_is_err(rc)); } EXPORT_SYMBOL(ib_set_client_data); /** * ib_register_event_handler - Register an IB event handler * @event_handler:Handler to register * * ib_register_event_handler() registers an event handler that will be * called back when asynchronous IB events occur (as defined in * chapter 11 of the InfiniBand Architecture Specification). This * callback occurs in workqueue context. */ void ib_register_event_handler(struct ib_event_handler *event_handler) { down_write(&event_handler->device->event_handler_rwsem); list_add_tail(&event_handler->list, &event_handler->device->event_handler_list); up_write(&event_handler->device->event_handler_rwsem); } EXPORT_SYMBOL(ib_register_event_handler); /** * ib_unregister_event_handler - Unregister an event handler * @event_handler:Handler to unregister * * Unregister an event handler registered with * ib_register_event_handler(). */ void ib_unregister_event_handler(struct ib_event_handler *event_handler) { down_write(&event_handler->device->event_handler_rwsem); list_del(&event_handler->list); up_write(&event_handler->device->event_handler_rwsem); } EXPORT_SYMBOL(ib_unregister_event_handler); void ib_dispatch_event_clients(struct ib_event *event) { struct ib_event_handler *handler; down_read(&event->device->event_handler_rwsem); list_for_each_entry(handler, &event->device->event_handler_list, list) handler->handler(handler, event); up_read(&event->device->event_handler_rwsem); } static int iw_query_port(struct ib_device *device, u32 port_num, struct ib_port_attr *port_attr) { struct in_device *inetdev; struct net_device *netdev; memset(port_attr, 0, sizeof(*port_attr)); netdev = ib_device_get_netdev(device, port_num); if (!netdev) return -ENODEV; port_attr->max_mtu = IB_MTU_4096; port_attr->active_mtu = ib_mtu_int_to_enum(netdev->mtu); if (!netif_carrier_ok(netdev)) { port_attr->state = IB_PORT_DOWN; port_attr->phys_state = IB_PORT_PHYS_STATE_DISABLED; } else { rcu_read_lock(); inetdev = __in_dev_get_rcu(netdev); if (inetdev && inetdev->ifa_list) { port_attr->state = IB_PORT_ACTIVE; port_attr->phys_state = IB_PORT_PHYS_STATE_LINK_UP; } else { port_attr->state = IB_PORT_INIT; port_attr->phys_state = IB_PORT_PHYS_STATE_PORT_CONFIGURATION_TRAINING; } rcu_read_unlock(); } dev_put(netdev); return device->ops.query_port(device, port_num, port_attr); } static int __ib_query_port(struct ib_device *device, u32 port_num, struct ib_port_attr *port_attr) { int err; memset(port_attr, 0, sizeof(*port_attr)); err = device->ops.query_port(device, port_num, port_attr); if (err || port_attr->subnet_prefix) return err; if (rdma_port_get_link_layer(device, port_num) != IB_LINK_LAYER_INFINIBAND) return 0; ib_get_cached_subnet_prefix(device, port_num, &port_attr->subnet_prefix); return 0; } /** * ib_query_port - Query IB port attributes * @device:Device to query * @port_num:Port number to query * @port_attr:Port attributes * * ib_query_port() returns the attributes of a port through the * @port_attr pointer. 
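 *
 * A small usage sketch that walks every port of a device (the printout is
 * only illustrative):
 *
 *	struct ib_port_attr attr;
 *	u32 port;
 *
 *	rdma_for_each_port(device, port) {
 *		if (ib_query_port(device, port, &attr))
 *			continue;
 *		pr_info("port %u: state %d, active_mtu %d\n",
 *			port, attr.state, attr.active_mtu);
 *	}
 *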
*/ int ib_query_port(struct ib_device *device, u32 port_num, struct ib_port_attr *port_attr) { if (!rdma_is_port_valid(device, port_num)) return -EINVAL; if (rdma_protocol_iwarp(device, port_num)) return iw_query_port(device, port_num, port_attr); else return __ib_query_port(device, port_num, port_attr); } EXPORT_SYMBOL(ib_query_port); static void add_ndev_hash(struct ib_port_data *pdata) { unsigned long flags; might_sleep(); spin_lock_irqsave(&ndev_hash_lock, flags); if (hash_hashed(&pdata->ndev_hash_link)) { hash_del_rcu(&pdata->ndev_hash_link); spin_unlock_irqrestore(&ndev_hash_lock, flags); /* * We cannot do hash_add_rcu after a hash_del_rcu until the * grace period */ synchronize_rcu(); spin_lock_irqsave(&ndev_hash_lock, flags); } if (pdata->netdev) hash_add_rcu(ndev_hash, &pdata->ndev_hash_link, (uintptr_t)pdata->netdev); spin_unlock_irqrestore(&ndev_hash_lock, flags); } /** * ib_device_set_netdev - Associate the ib_dev with an underlying net_device * @ib_dev: Device to modify * @ndev: net_device to affiliate, may be NULL * @port: IB port the net_device is connected to * * Drivers should use this to link the ib_device to a netdev so the netdev * shows up in interfaces like ib_enum_roce_netdev. Only one netdev may be * affiliated with any port. * * The caller must ensure that the given ndev is not unregistered or * unregistering, and that either the ib_device is unregistered or * ib_device_set_netdev() is called with NULL when the ndev sends a * NETDEV_UNREGISTER event. */ int ib_device_set_netdev(struct ib_device *ib_dev, struct net_device *ndev, u32 port) { enum rdma_nl_notify_event_type etype; struct net_device *old_ndev; struct ib_port_data *pdata; unsigned long flags; int ret; if (!rdma_is_port_valid(ib_dev, port)) return -EINVAL; /* * Drivers wish to call this before ib_register_driver, so we have to * setup the port data early. */ ret = alloc_port_data(ib_dev); if (ret) return ret; pdata = &ib_dev->port_data[port]; spin_lock_irqsave(&pdata->netdev_lock, flags); old_ndev = rcu_dereference_protected( pdata->netdev, lockdep_is_held(&pdata->netdev_lock)); if (old_ndev == ndev) { spin_unlock_irqrestore(&pdata->netdev_lock, flags); return 0; } rcu_assign_pointer(pdata->netdev, ndev); netdev_put(old_ndev, &pdata->netdev_tracker); netdev_hold(ndev, &pdata->netdev_tracker, GFP_ATOMIC); spin_unlock_irqrestore(&pdata->netdev_lock, flags); add_ndev_hash(pdata); /* Make sure that the device is registered before we send events */ if (xa_load(&devices, ib_dev->index) != ib_dev) return 0; etype = ndev ? 
RDMA_NETDEV_ATTACH_EVENT : RDMA_NETDEV_DETACH_EVENT; rdma_nl_notify_event(ib_dev, port, etype); return 0; } EXPORT_SYMBOL(ib_device_set_netdev); static void free_netdevs(struct ib_device *ib_dev) { unsigned long flags; u32 port; if (!ib_dev->port_data) return; rdma_for_each_port (ib_dev, port) { struct ib_port_data *pdata = &ib_dev->port_data[port]; struct net_device *ndev; spin_lock_irqsave(&pdata->netdev_lock, flags); ndev = rcu_dereference_protected( pdata->netdev, lockdep_is_held(&pdata->netdev_lock)); if (ndev) { spin_lock(&ndev_hash_lock); hash_del_rcu(&pdata->ndev_hash_link); spin_unlock(&ndev_hash_lock); /* * If this is the last dev_put there is still a * synchronize_rcu before the netdev is kfreed, so we * can continue to rely on unlocked pointer * comparisons after the put */ rcu_assign_pointer(pdata->netdev, NULL); netdev_put(ndev, &pdata->netdev_tracker); } spin_unlock_irqrestore(&pdata->netdev_lock, flags); } } struct net_device *ib_device_get_netdev(struct ib_device *ib_dev, u32 port) { struct ib_port_data *pdata; struct net_device *res; if (!rdma_is_port_valid(ib_dev, port)) return NULL; if (!ib_dev->port_data) return NULL; pdata = &ib_dev->port_data[port]; /* * New drivers should use ib_device_set_netdev() not the legacy * get_netdev(). */ if (ib_dev->ops.get_netdev) res = ib_dev->ops.get_netdev(ib_dev, port); else { spin_lock(&pdata->netdev_lock); res = rcu_dereference_protected( pdata->netdev, lockdep_is_held(&pdata->netdev_lock)); dev_hold(res); spin_unlock(&pdata->netdev_lock); } return res; } EXPORT_SYMBOL(ib_device_get_netdev); /** * ib_query_netdev_port - Query the port number of a net_device * associated with an ibdev * @ibdev: IB device * @ndev: Network device * @port: IB port the net_device is connected to */ int ib_query_netdev_port(struct ib_device *ibdev, struct net_device *ndev, u32 *port) { struct net_device *ib_ndev; u32 port_num; rdma_for_each_port(ibdev, port_num) { ib_ndev = ib_device_get_netdev(ibdev, port_num); if (ndev == ib_ndev) { *port = port_num; dev_put(ib_ndev); return 0; } dev_put(ib_ndev); } return -ENOENT; } EXPORT_SYMBOL(ib_query_netdev_port); /** * ib_device_get_by_netdev - Find an IB device associated with a netdev * @ndev: netdev to locate * @driver_id: The driver ID that must match (RDMA_DRIVER_UNKNOWN matches all) * * Find and hold an ib_device that is associated with a netdev via * ib_device_set_netdev(). The caller must call ib_device_put() on the * returned pointer. */ struct ib_device *ib_device_get_by_netdev(struct net_device *ndev, enum rdma_driver_id driver_id) { struct ib_device *res = NULL; struct ib_port_data *cur; rcu_read_lock(); hash_for_each_possible_rcu (ndev_hash, cur, ndev_hash_link, (uintptr_t)ndev) { if (rcu_access_pointer(cur->netdev) == ndev && (driver_id == RDMA_DRIVER_UNKNOWN || cur->ib_dev->ops.driver_id == driver_id) && ib_device_try_get(cur->ib_dev)) { res = cur->ib_dev; break; } } rcu_read_unlock(); return res; } EXPORT_SYMBOL(ib_device_get_by_netdev); /** * ib_enum_roce_netdev - enumerate all RoCE ports * @ib_dev : IB device we want to query * @filter: Should we call the callback? * @filter_cookie: Cookie passed to filter * @cb: Callback to call for each found RoCE ports * @cookie: Cookie passed back to the callback * * Enumerates all of the physical RoCE ports of ib_dev * which are related to netdevice and calls callback() on each * device for which filter() function returns non zero. 
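 *
 * A sketch of a filter/callback pair matching the signatures this function
 * invokes (the foo_* names and the cookie usage are hypothetical):
 *
 *	static int foo_filter(struct ib_device *device, u32 port,
 *			      struct net_device *ndev, void *cookie)
 *	{
 *		return ndev && ndev == cookie;
 *	}
 *
 *	static void foo_cb(struct ib_device *device, u32 port,
 *			   struct net_device *ndev, void *cookie)
 *	{
 *		pr_info("%s: RoCE port %u\n", dev_name(&device->dev), port);
 *	}
 *
 *	ib_enum_roce_netdev(ib_dev, foo_filter, target_ndev, foo_cb, NULL);
 *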
*/ void ib_enum_roce_netdev(struct ib_device *ib_dev, roce_netdev_filter filter, void *filter_cookie, roce_netdev_callback cb, void *cookie) { u32 port; rdma_for_each_port (ib_dev, port) if (rdma_protocol_roce(ib_dev, port)) { struct net_device *idev = ib_device_get_netdev(ib_dev, port); if (filter(ib_dev, port, idev, filter_cookie)) cb(ib_dev, port, idev, cookie); dev_put(idev); } } /** * ib_enum_all_roce_netdevs - enumerate all RoCE devices * @filter: Should we call the callback? * @filter_cookie: Cookie passed to filter * @cb: Callback to call for each found RoCE ports * @cookie: Cookie passed back to the callback * * Enumerates all RoCE devices' physical ports which are related * to netdevices and calls callback() on each device for which * filter() function returns non zero. */ void ib_enum_all_roce_netdevs(roce_netdev_filter filter, void *filter_cookie, roce_netdev_callback cb, void *cookie) { struct ib_device *dev; unsigned long index; down_read(&devices_rwsem); xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) ib_enum_roce_netdev(dev, filter, filter_cookie, cb, cookie); up_read(&devices_rwsem); } /* * ib_enum_all_devs - enumerate all ib_devices * @cb: Callback to call for each found ib_device * * Enumerates all ib_devices and calls callback() on each device. */ int ib_enum_all_devs(nldev_callback nldev_cb, struct sk_buff *skb, struct netlink_callback *cb) { unsigned long index; struct ib_device *dev; unsigned int idx = 0; int ret = 0; down_read(&devices_rwsem); xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) { if (!rdma_dev_access_netns(dev, sock_net(skb->sk))) continue; ret = nldev_cb(dev, skb, cb, idx); if (ret) break; idx++; } up_read(&devices_rwsem); return ret; } /** * ib_query_pkey - Get P_Key table entry * @device:Device to query * @port_num:Port number to query * @index:P_Key table index to query * @pkey:Returned P_Key * * ib_query_pkey() fetches the specified P_Key table entry. */ int ib_query_pkey(struct ib_device *device, u32 port_num, u16 index, u16 *pkey) { if (!rdma_is_port_valid(device, port_num)) return -EINVAL; if (!device->ops.query_pkey) return -EOPNOTSUPP; return device->ops.query_pkey(device, port_num, index, pkey); } EXPORT_SYMBOL(ib_query_pkey); /** * ib_modify_device - Change IB device attributes * @device:Device to modify * @device_modify_mask:Mask of attributes to change * @device_modify:New attribute values * * ib_modify_device() changes a device's attributes as specified by * the @device_modify_mask and @device_modify structure. */ int ib_modify_device(struct ib_device *device, int device_modify_mask, struct ib_device_modify *device_modify) { if (!device->ops.modify_device) return -EOPNOTSUPP; return device->ops.modify_device(device, device_modify_mask, device_modify); } EXPORT_SYMBOL(ib_modify_device); /** * ib_modify_port - Modifies the attributes for the specified port. * @device: The device to modify. * @port_num: The number of the port to modify. * @port_modify_mask: Mask used to specify which attributes of the port * to change. * @port_modify: New attribute values for the port. * * ib_modify_port() changes a port's attributes as specified by the * @port_modify_mask and @port_modify structure. 
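 *
 * A sketch of a common use, advertising CM support on a port (this mirrors
 * what the IB CM does; error handling is omitted):
 *
 *	struct ib_port_modify port_modify = {
 *		.set_port_cap_mask = IB_PORT_CM_SUP,
 *	};
 *
 *	ib_modify_port(device, port_num, 0, &port_modify);
 *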
*/ int ib_modify_port(struct ib_device *device, u32 port_num, int port_modify_mask, struct ib_port_modify *port_modify) { int rc; if (!rdma_is_port_valid(device, port_num)) return -EINVAL; if (device->ops.modify_port) rc = device->ops.modify_port(device, port_num, port_modify_mask, port_modify); else if (rdma_protocol_roce(device, port_num) && ((port_modify->set_port_cap_mask & ~IB_PORT_CM_SUP) == 0 || (port_modify->clr_port_cap_mask & ~IB_PORT_CM_SUP) == 0)) rc = 0; else rc = -EOPNOTSUPP; return rc; } EXPORT_SYMBOL(ib_modify_port); /** * ib_find_gid - Returns the port number and GID table index where * a specified GID value occurs. Its searches only for IB link layer. * @device: The device to query. * @gid: The GID value to search for. * @port_num: The port number of the device where the GID value was found. * @index: The index into the GID table where the GID was found. This * parameter may be NULL. */ int ib_find_gid(struct ib_device *device, union ib_gid *gid, u32 *port_num, u16 *index) { union ib_gid tmp_gid; u32 port; int ret, i; rdma_for_each_port (device, port) { if (!rdma_protocol_ib(device, port)) continue; for (i = 0; i < device->port_data[port].immutable.gid_tbl_len; ++i) { ret = rdma_query_gid(device, port, i, &tmp_gid); if (ret) continue; if (!memcmp(&tmp_gid, gid, sizeof *gid)) { *port_num = port; if (index) *index = i; return 0; } } } return -ENOENT; } EXPORT_SYMBOL(ib_find_gid); /** * ib_find_pkey - Returns the PKey table index where a specified * PKey value occurs. * @device: The device to query. * @port_num: The port number of the device to search for the PKey. * @pkey: The PKey value to search for. * @index: The index into the PKey table where the PKey was found. */ int ib_find_pkey(struct ib_device *device, u32 port_num, u16 pkey, u16 *index) { int ret, i; u16 tmp_pkey; int partial_ix = -1; for (i = 0; i < device->port_data[port_num].immutable.pkey_tbl_len; ++i) { ret = ib_query_pkey(device, port_num, i, &tmp_pkey); if (ret) return ret; if ((pkey & 0x7fff) == (tmp_pkey & 0x7fff)) { /* if there is full-member pkey take it.*/ if (tmp_pkey & 0x8000) { *index = i; return 0; } if (partial_ix < 0) partial_ix = i; } } /*no full-member, if exists take the limited*/ if (partial_ix >= 0) { *index = partial_ix; return 0; } return -ENOENT; } EXPORT_SYMBOL(ib_find_pkey); /** * ib_get_net_dev_by_params() - Return the appropriate net_dev * for a received CM request * @dev: An RDMA device on which the request has been received. * @port: Port number on the RDMA device. * @pkey: The Pkey the request came on. * @gid: A GID that the net_dev uses to communicate. * @addr: Contains the IP address that the request specified as its * destination. 
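 *
 * A calling sketch; the GID and destination address below are hypothetical,
 * and the returned net_device, when one is found, is assumed to be released
 * with dev_put() once the caller is done with it:
 *
 *	net_dev = ib_get_net_dev_by_params(dev, port, pkey, &gid,
 *					   (const struct sockaddr *)&dst_addr);
 *	if (net_dev) {
 *		pr_debug("matched %s\n", net_dev->name);
 *		dev_put(net_dev);
 *	}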
* */ struct net_device *ib_get_net_dev_by_params(struct ib_device *dev, u32 port, u16 pkey, const union ib_gid *gid, const struct sockaddr *addr) { struct net_device *net_dev = NULL; unsigned long index; void *client_data; if (!rdma_protocol_ib(dev, port)) return NULL; /* * Holding the read side guarantees that the client will not become * unregistered while we are calling get_net_dev_by_params() */ down_read(&dev->client_data_rwsem); xan_for_each_marked (&dev->client_data, index, client_data, CLIENT_DATA_REGISTERED) { struct ib_client *client = xa_load(&clients, index); if (!client || !client->get_net_dev_by_params) continue; net_dev = client->get_net_dev_by_params(dev, port, pkey, gid, addr, client_data); if (net_dev) break; } up_read(&dev->client_data_rwsem); return net_dev; } EXPORT_SYMBOL(ib_get_net_dev_by_params); void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops) { struct ib_device_ops *dev_ops = &dev->ops; #define SET_DEVICE_OP(ptr, name) \ do { \ if (ops->name) \ if (!((ptr)->name)) \ (ptr)->name = ops->name; \ } while (0) #define SET_OBJ_SIZE(ptr, name) SET_DEVICE_OP(ptr, size_##name) if (ops->driver_id != RDMA_DRIVER_UNKNOWN) { WARN_ON(dev_ops->driver_id != RDMA_DRIVER_UNKNOWN && dev_ops->driver_id != ops->driver_id); dev_ops->driver_id = ops->driver_id; } if (ops->owner) { WARN_ON(dev_ops->owner && dev_ops->owner != ops->owner); dev_ops->owner = ops->owner; } if (ops->uverbs_abi_ver) dev_ops->uverbs_abi_ver = ops->uverbs_abi_ver; dev_ops->uverbs_no_driver_id_binding |= ops->uverbs_no_driver_id_binding; SET_DEVICE_OP(dev_ops, add_gid); SET_DEVICE_OP(dev_ops, add_sub_dev); SET_DEVICE_OP(dev_ops, advise_mr); SET_DEVICE_OP(dev_ops, alloc_dm); SET_DEVICE_OP(dev_ops, alloc_hw_device_stats); SET_DEVICE_OP(dev_ops, alloc_hw_port_stats); SET_DEVICE_OP(dev_ops, alloc_mr); SET_DEVICE_OP(dev_ops, alloc_mr_integrity); SET_DEVICE_OP(dev_ops, alloc_mw); SET_DEVICE_OP(dev_ops, alloc_pd); SET_DEVICE_OP(dev_ops, alloc_rdma_netdev); SET_DEVICE_OP(dev_ops, alloc_ucontext); SET_DEVICE_OP(dev_ops, alloc_xrcd); SET_DEVICE_OP(dev_ops, attach_mcast); SET_DEVICE_OP(dev_ops, check_mr_status); SET_DEVICE_OP(dev_ops, counter_alloc_stats); SET_DEVICE_OP(dev_ops, counter_bind_qp); SET_DEVICE_OP(dev_ops, counter_dealloc); SET_DEVICE_OP(dev_ops, counter_unbind_qp); SET_DEVICE_OP(dev_ops, counter_update_stats); SET_DEVICE_OP(dev_ops, create_ah); SET_DEVICE_OP(dev_ops, create_counters); SET_DEVICE_OP(dev_ops, create_cq); SET_DEVICE_OP(dev_ops, create_flow); SET_DEVICE_OP(dev_ops, create_qp); SET_DEVICE_OP(dev_ops, create_rwq_ind_table); SET_DEVICE_OP(dev_ops, create_srq); SET_DEVICE_OP(dev_ops, create_user_ah); SET_DEVICE_OP(dev_ops, create_wq); SET_DEVICE_OP(dev_ops, dealloc_dm); SET_DEVICE_OP(dev_ops, dealloc_driver); SET_DEVICE_OP(dev_ops, dealloc_mw); SET_DEVICE_OP(dev_ops, dealloc_pd); SET_DEVICE_OP(dev_ops, dealloc_ucontext); SET_DEVICE_OP(dev_ops, dealloc_xrcd); SET_DEVICE_OP(dev_ops, del_gid); SET_DEVICE_OP(dev_ops, del_sub_dev); SET_DEVICE_OP(dev_ops, dereg_mr); SET_DEVICE_OP(dev_ops, destroy_ah); SET_DEVICE_OP(dev_ops, destroy_counters); SET_DEVICE_OP(dev_ops, destroy_cq); SET_DEVICE_OP(dev_ops, destroy_flow); SET_DEVICE_OP(dev_ops, destroy_flow_action); SET_DEVICE_OP(dev_ops, destroy_qp); SET_DEVICE_OP(dev_ops, destroy_rwq_ind_table); SET_DEVICE_OP(dev_ops, destroy_srq); SET_DEVICE_OP(dev_ops, destroy_wq); SET_DEVICE_OP(dev_ops, device_group); SET_DEVICE_OP(dev_ops, detach_mcast); SET_DEVICE_OP(dev_ops, disassociate_ucontext); SET_DEVICE_OP(dev_ops, drain_rq); 
SET_DEVICE_OP(dev_ops, drain_sq); SET_DEVICE_OP(dev_ops, enable_driver); SET_DEVICE_OP(dev_ops, fill_res_cm_id_entry); SET_DEVICE_OP(dev_ops, fill_res_cq_entry); SET_DEVICE_OP(dev_ops, fill_res_cq_entry_raw); SET_DEVICE_OP(dev_ops, fill_res_mr_entry); SET_DEVICE_OP(dev_ops, fill_res_mr_entry_raw); SET_DEVICE_OP(dev_ops, fill_res_qp_entry); SET_DEVICE_OP(dev_ops, fill_res_qp_entry_raw); SET_DEVICE_OP(dev_ops, fill_res_srq_entry); SET_DEVICE_OP(dev_ops, fill_res_srq_entry_raw); SET_DEVICE_OP(dev_ops, fill_stat_mr_entry); SET_DEVICE_OP(dev_ops, get_dev_fw_str); SET_DEVICE_OP(dev_ops, get_dma_mr); SET_DEVICE_OP(dev_ops, get_hw_stats); SET_DEVICE_OP(dev_ops, get_link_layer); SET_DEVICE_OP(dev_ops, get_netdev); SET_DEVICE_OP(dev_ops, get_numa_node); SET_DEVICE_OP(dev_ops, get_port_immutable); SET_DEVICE_OP(dev_ops, get_vector_affinity); SET_DEVICE_OP(dev_ops, get_vf_config); SET_DEVICE_OP(dev_ops, get_vf_guid); SET_DEVICE_OP(dev_ops, get_vf_stats); SET_DEVICE_OP(dev_ops, iw_accept); SET_DEVICE_OP(dev_ops, iw_add_ref); SET_DEVICE_OP(dev_ops, iw_connect); SET_DEVICE_OP(dev_ops, iw_create_listen); SET_DEVICE_OP(dev_ops, iw_destroy_listen); SET_DEVICE_OP(dev_ops, iw_get_qp); SET_DEVICE_OP(dev_ops, iw_reject); SET_DEVICE_OP(dev_ops, iw_rem_ref); SET_DEVICE_OP(dev_ops, map_mr_sg); SET_DEVICE_OP(dev_ops, map_mr_sg_pi); SET_DEVICE_OP(dev_ops, mmap); SET_DEVICE_OP(dev_ops, mmap_free); SET_DEVICE_OP(dev_ops, modify_ah); SET_DEVICE_OP(dev_ops, modify_cq); SET_DEVICE_OP(dev_ops, modify_device); SET_DEVICE_OP(dev_ops, modify_hw_stat); SET_DEVICE_OP(dev_ops, modify_port); SET_DEVICE_OP(dev_ops, modify_qp); SET_DEVICE_OP(dev_ops, modify_srq); SET_DEVICE_OP(dev_ops, modify_wq); SET_DEVICE_OP(dev_ops, peek_cq); SET_DEVICE_OP(dev_ops, poll_cq); SET_DEVICE_OP(dev_ops, port_groups); SET_DEVICE_OP(dev_ops, post_recv); SET_DEVICE_OP(dev_ops, post_send); SET_DEVICE_OP(dev_ops, post_srq_recv); SET_DEVICE_OP(dev_ops, process_mad); SET_DEVICE_OP(dev_ops, query_ah); SET_DEVICE_OP(dev_ops, query_device); SET_DEVICE_OP(dev_ops, query_gid); SET_DEVICE_OP(dev_ops, query_pkey); SET_DEVICE_OP(dev_ops, query_port); SET_DEVICE_OP(dev_ops, query_qp); SET_DEVICE_OP(dev_ops, query_srq); SET_DEVICE_OP(dev_ops, query_ucontext); SET_DEVICE_OP(dev_ops, rdma_netdev_get_params); SET_DEVICE_OP(dev_ops, read_counters); SET_DEVICE_OP(dev_ops, reg_dm_mr); SET_DEVICE_OP(dev_ops, reg_user_mr); SET_DEVICE_OP(dev_ops, reg_user_mr_dmabuf); SET_DEVICE_OP(dev_ops, req_notify_cq); SET_DEVICE_OP(dev_ops, rereg_user_mr); SET_DEVICE_OP(dev_ops, resize_cq); SET_DEVICE_OP(dev_ops, set_vf_guid); SET_DEVICE_OP(dev_ops, set_vf_link_state); SET_DEVICE_OP(dev_ops, ufile_hw_cleanup); SET_DEVICE_OP(dev_ops, report_port_event); SET_OBJ_SIZE(dev_ops, ib_ah); SET_OBJ_SIZE(dev_ops, ib_counters); SET_OBJ_SIZE(dev_ops, ib_cq); SET_OBJ_SIZE(dev_ops, ib_mw); SET_OBJ_SIZE(dev_ops, ib_pd); SET_OBJ_SIZE(dev_ops, ib_qp); SET_OBJ_SIZE(dev_ops, ib_rwq_ind_table); SET_OBJ_SIZE(dev_ops, ib_srq); SET_OBJ_SIZE(dev_ops, ib_ucontext); SET_OBJ_SIZE(dev_ops, ib_xrcd); } EXPORT_SYMBOL(ib_set_device_ops); int ib_add_sub_device(struct ib_device *parent, enum rdma_nl_dev_type type, const char *name) { struct ib_device *sub; int ret = 0; if (!parent->ops.add_sub_dev || !parent->ops.del_sub_dev) return -EOPNOTSUPP; if (!ib_device_try_get(parent)) return -EINVAL; sub = parent->ops.add_sub_dev(parent, type, name); if (IS_ERR(sub)) { ib_device_put(parent); return PTR_ERR(sub); } sub->type = type; sub->parent = parent; mutex_lock(&parent->subdev_lock); list_add_tail(&parent->subdev_list_head, 
&sub->subdev_list); mutex_unlock(&parent->subdev_lock); return ret; } EXPORT_SYMBOL(ib_add_sub_device); int ib_del_sub_device_and_put(struct ib_device *sub) { struct ib_device *parent = sub->parent; if (!parent) return -EOPNOTSUPP; mutex_lock(&parent->subdev_lock); list_del(&sub->subdev_list); mutex_unlock(&parent->subdev_lock); ib_device_put(sub); parent->ops.del_sub_dev(sub); ib_device_put(parent); return 0; } EXPORT_SYMBOL(ib_del_sub_device_and_put); #ifdef CONFIG_INFINIBAND_VIRT_DMA int ib_dma_virt_map_sg(struct ib_device *dev, struct scatterlist *sg, int nents) { struct scatterlist *s; int i; for_each_sg(sg, s, nents, i) { sg_dma_address(s) = (uintptr_t)sg_virt(s); sg_dma_len(s) = s->length; } return nents; } EXPORT_SYMBOL(ib_dma_virt_map_sg); #endif /* CONFIG_INFINIBAND_VIRT_DMA */ static const struct rdma_nl_cbs ibnl_ls_cb_table[RDMA_NL_LS_NUM_OPS] = { [RDMA_NL_LS_OP_RESOLVE] = { .doit = ib_nl_handle_resolve_resp, .flags = RDMA_NL_ADMIN_PERM, }, [RDMA_NL_LS_OP_SET_TIMEOUT] = { .doit = ib_nl_handle_set_timeout, .flags = RDMA_NL_ADMIN_PERM, }, [RDMA_NL_LS_OP_IP_RESOLVE] = { .doit = ib_nl_handle_ip_res_resp, .flags = RDMA_NL_ADMIN_PERM, }, }; void ib_dispatch_port_state_event(struct ib_device *ibdev, struct net_device *ndev) { enum ib_port_state curr_state; struct ib_event ibevent = {}; u32 port; if (ib_query_netdev_port(ibdev, ndev, &port)) return; curr_state = ib_get_curr_port_state(ndev); write_lock_irq(&ibdev->cache_lock); if (ibdev->port_data[port].cache.last_port_state == curr_state) { write_unlock_irq(&ibdev->cache_lock); return; } ibdev->port_data[port].cache.last_port_state = curr_state; write_unlock_irq(&ibdev->cache_lock); ibevent.event = (curr_state == IB_PORT_DOWN) ? IB_EVENT_PORT_ERR : IB_EVENT_PORT_ACTIVE; ibevent.device = ibdev; ibevent.element.port_num = port; ib_dispatch_event(&ibevent); } EXPORT_SYMBOL(ib_dispatch_port_state_event); static void handle_port_event(struct net_device *ndev, unsigned long event) { struct ib_device *ibdev; /* Currently, link events in bonding scenarios are still * reported by drivers that support bonding. 
*/ if (netif_is_lag_master(ndev) || netif_is_lag_port(ndev)) return; ibdev = ib_device_get_by_netdev(ndev, RDMA_DRIVER_UNKNOWN); if (!ibdev) return; if (ibdev->ops.report_port_event) { ibdev->ops.report_port_event(ibdev, ndev, event); goto put_ibdev; } ib_dispatch_port_state_event(ibdev, ndev); put_ibdev: ib_device_put(ibdev); }; static int ib_netdevice_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *ndev = netdev_notifier_info_to_dev(ptr); struct ib_device *ibdev; u32 port; switch (event) { case NETDEV_CHANGENAME: ibdev = ib_device_get_by_netdev(ndev, RDMA_DRIVER_UNKNOWN); if (!ibdev) return NOTIFY_DONE; if (ib_query_netdev_port(ibdev, ndev, &port)) { ib_device_put(ibdev); break; } rdma_nl_notify_event(ibdev, port, RDMA_NETDEV_RENAME_EVENT); ib_device_put(ibdev); break; case NETDEV_UP: case NETDEV_CHANGE: case NETDEV_DOWN: handle_port_event(ndev, event); break; default: break; } return NOTIFY_DONE; } static struct notifier_block nb_netdevice = { .notifier_call = ib_netdevice_event, }; static int __init ib_core_init(void) { int ret = -ENOMEM; ib_wq = alloc_workqueue("infiniband", 0, 0); if (!ib_wq) return -ENOMEM; ib_unreg_wq = alloc_workqueue("ib-unreg-wq", WQ_UNBOUND, WQ_UNBOUND_MAX_ACTIVE); if (!ib_unreg_wq) goto err; ib_comp_wq = alloc_workqueue("ib-comp-wq", WQ_HIGHPRI | WQ_MEM_RECLAIM | WQ_SYSFS, 0); if (!ib_comp_wq) goto err_unbound; ib_comp_unbound_wq = alloc_workqueue("ib-comp-unb-wq", WQ_UNBOUND | WQ_HIGHPRI | WQ_MEM_RECLAIM | WQ_SYSFS, WQ_UNBOUND_MAX_ACTIVE); if (!ib_comp_unbound_wq) goto err_comp; ret = class_register(&ib_class); if (ret) { pr_warn("Couldn't create InfiniBand device class\n"); goto err_comp_unbound; } rdma_nl_init(); ret = addr_init(); if (ret) { pr_warn("Couldn't init IB address resolution\n"); goto err_ibnl; } ret = ib_mad_init(); if (ret) { pr_warn("Couldn't init IB MAD\n"); goto err_addr; } ret = ib_sa_init(); if (ret) { pr_warn("Couldn't init SA\n"); goto err_mad; } ret = register_blocking_lsm_notifier(&ibdev_lsm_nb); if (ret) { pr_warn("Couldn't register LSM notifier. ret %d\n", ret); goto err_sa; } ret = register_pernet_device(&rdma_dev_net_ops); if (ret) { pr_warn("Couldn't init compat dev. ret %d\n", ret); goto err_compat; } nldev_init(); rdma_nl_register(RDMA_NL_LS, ibnl_ls_cb_table); ret = roce_gid_mgmt_init(); if (ret) { pr_warn("Couldn't init RoCE GID management\n"); goto err_parent; } register_netdevice_notifier(&nb_netdevice); return 0; err_parent: rdma_nl_unregister(RDMA_NL_LS); nldev_exit(); unregister_pernet_device(&rdma_dev_net_ops); err_compat: unregister_blocking_lsm_notifier(&ibdev_lsm_nb); err_sa: ib_sa_cleanup(); err_mad: ib_mad_cleanup(); err_addr: addr_cleanup(); err_ibnl: class_unregister(&ib_class); err_comp_unbound: destroy_workqueue(ib_comp_unbound_wq); err_comp: destroy_workqueue(ib_comp_wq); err_unbound: destroy_workqueue(ib_unreg_wq); err: destroy_workqueue(ib_wq); return ret; } static void __exit ib_core_cleanup(void) { unregister_netdevice_notifier(&nb_netdevice); roce_gid_mgmt_cleanup(); rdma_nl_unregister(RDMA_NL_LS); nldev_exit(); unregister_pernet_device(&rdma_dev_net_ops); unregister_blocking_lsm_notifier(&ibdev_lsm_nb); ib_sa_cleanup(); ib_mad_cleanup(); addr_cleanup(); rdma_nl_exit(); class_unregister(&ib_class); destroy_workqueue(ib_comp_unbound_wq); destroy_workqueue(ib_comp_wq); /* Make sure that any pending umem accounting work is done. 
*/ destroy_workqueue(ib_wq); destroy_workqueue(ib_unreg_wq); WARN_ON(!xa_empty(&clients)); WARN_ON(!xa_empty(&devices)); } MODULE_ALIAS_RDMA_NETLINK(RDMA_NL_LS, 4); /* ib core relies on netdev stack to first register net_ns_type_operations * ns kobject type before ib_core initialization. */ fs_initcall(ib_core_init); module_exit(ib_core_cleanup);
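/*
 * Editor's note (illustration only, not part of the file above): a minimal,
 * hedged sketch of how a driver is expected to consume ib_set_device_ops().
 * Everything prefixed "demo_" is invented for illustration; the real API
 * pieces used are struct ib_device_ops, INIT_RDMA_OBJ_SIZE(),
 * ib_set_device_ops() and ib_register_device(). Ops the driver does not
 * implement are simply left NULL, which is what the SET_DEVICE_OP()
 * copy-only-if-unset logic above relies on.
 */
#include <linux/module.h>
#include <rdma/ib_verbs.h>

struct demo_pd {
	struct ib_pd ibpd;	/* core object must be the first member */
	u32 pdn;		/* driver-private state */
};

static int demo_alloc_pd(struct ib_pd *ibpd, struct ib_udata *udata)
{
	struct demo_pd *pd = container_of(ibpd, struct demo_pd, ibpd);

	pd->pdn = 0;		/* a real driver would allocate a PD number here */
	return 0;
}

static const struct ib_device_ops demo_dev_ops = {
	.owner = THIS_MODULE,
	.driver_id = RDMA_DRIVER_UNKNOWN, /* a real driver uses its own enum rdma_driver_id */
	.alloc_pd = demo_alloc_pd,
	/* Picked up by SET_OBJ_SIZE() so the core allocates struct demo_pd. */
	INIT_RDMA_OBJ_SIZE(ib_pd, demo_pd, ibpd),
};

static int demo_register(struct ib_device *ibdev, struct device *dma_device)
{
	ib_set_device_ops(ibdev, &demo_dev_ops);
	return ib_register_device(ibdev, "demo%d", dma_device);
}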
16 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 | /* SPDX-License-Identifier: GPL-2.0 */ /* * Statically sized hash table implementation * (C) 2012 Sasha Levin <levinsasha928@gmail.com> */ #ifndef _LINUX_HASHTABLE_H #define _LINUX_HASHTABLE_H #include <linux/list.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/hash.h> #include <linux/rculist.h> #define DEFINE_HASHTABLE(name, bits) \ struct hlist_head name[1 << (bits)] = \ { [0 ... ((1 << (bits)) - 1)] = HLIST_HEAD_INIT } #define DEFINE_READ_MOSTLY_HASHTABLE(name, bits) \ struct hlist_head name[1 << (bits)] __read_mostly = \ { [0 ... ((1 << (bits)) - 1)] = HLIST_HEAD_INIT } #define DECLARE_HASHTABLE(name, bits) \ struct hlist_head name[1 << (bits)] #define HASH_SIZE(name) (ARRAY_SIZE(name)) #define HASH_BITS(name) ilog2(HASH_SIZE(name)) /* Use hash_32 when possible to allow for fast 32bit hashing in 64bit kernels. */ #define hash_min(val, bits) \ (sizeof(val) <= 4 ? hash_32(val, bits) : hash_long(val, bits)) static inline void __hash_init(struct hlist_head *ht, unsigned int sz) { unsigned int i; for (i = 0; i < sz; i++) INIT_HLIST_HEAD(&ht[i]); } /** * hash_init - initialize a hash table * @hashtable: hashtable to be initialized * * Calculates the size of the hashtable from the given parameter, otherwise * same as hash_init_size. * * This has to be a macro since HASH_BITS() will not work on pointers since * it calculates the size during preprocessing. */ #define hash_init(hashtable) __hash_init(hashtable, HASH_SIZE(hashtable)) /** * hash_add - add an object to a hashtable * @hashtable: hashtable to add to * @node: the &struct hlist_node of the object to be added * @key: the key of the object to be added */ #define hash_add(hashtable, node, key) \ hlist_add_head(node, &hashtable[hash_min(key, HASH_BITS(hashtable))]) /** * hash_add_rcu - add an object to a rcu enabled hashtable * @hashtable: hashtable to add to * @node: the &struct hlist_node of the object to be added * @key: the key of the object to be added */ #define hash_add_rcu(hashtable, node, key) \ hlist_add_head_rcu(node, &hashtable[hash_min(key, HASH_BITS(hashtable))]) /** * hash_hashed - check whether an object is in any hashtable * @node: the &struct hlist_node of the object to be checked */ static inline bool hash_hashed(struct hlist_node *node) { return !hlist_unhashed(node); } static inline bool __hash_empty(struct hlist_head *ht, unsigned int sz) { unsigned int i; for (i = 0; i < sz; i++) if (!hlist_empty(&ht[i])) return false; return true; } /** * hash_empty - check whether a hashtable is empty * @hashtable: hashtable to check * * This has to be a macro since HASH_BITS() will not work on pointers since * it calculates the size during preprocessing. 
*/ #define hash_empty(hashtable) __hash_empty(hashtable, HASH_SIZE(hashtable)) /** * hash_del - remove an object from a hashtable * @node: &struct hlist_node of the object to remove */ static inline void hash_del(struct hlist_node *node) { hlist_del_init(node); } /** * hash_del_rcu - remove an object from a rcu enabled hashtable * @node: &struct hlist_node of the object to remove */ static inline void hash_del_rcu(struct hlist_node *node) { hlist_del_init_rcu(node); } /** * hash_for_each - iterate over a hashtable * @name: hashtable to iterate * @bkt: integer to use as bucket loop cursor * @obj: the type * to use as a loop cursor for each entry * @member: the name of the hlist_node within the struct */ #define hash_for_each(name, bkt, obj, member) \ for ((bkt) = 0, obj = NULL; obj == NULL && (bkt) < HASH_SIZE(name);\ (bkt)++)\ hlist_for_each_entry(obj, &name[bkt], member) /** * hash_for_each_rcu - iterate over a rcu enabled hashtable * @name: hashtable to iterate * @bkt: integer to use as bucket loop cursor * @obj: the type * to use as a loop cursor for each entry * @member: the name of the hlist_node within the struct */ #define hash_for_each_rcu(name, bkt, obj, member) \ for ((bkt) = 0, obj = NULL; obj == NULL && (bkt) < HASH_SIZE(name);\ (bkt)++)\ hlist_for_each_entry_rcu(obj, &name[bkt], member) /** * hash_for_each_safe - iterate over a hashtable safe against removal of * hash entry * @name: hashtable to iterate * @bkt: integer to use as bucket loop cursor * @tmp: a &struct hlist_node used for temporary storage * @obj: the type * to use as a loop cursor for each entry * @member: the name of the hlist_node within the struct */ #define hash_for_each_safe(name, bkt, tmp, obj, member) \ for ((bkt) = 0, obj = NULL; obj == NULL && (bkt) < HASH_SIZE(name);\ (bkt)++)\ hlist_for_each_entry_safe(obj, tmp, &name[bkt], member) /** * hash_for_each_possible - iterate over all possible objects hashing to the * same bucket * @name: hashtable to iterate * @obj: the type * to use as a loop cursor for each entry * @member: the name of the hlist_node within the struct * @key: the key of the objects to iterate over */ #define hash_for_each_possible(name, obj, member, key) \ hlist_for_each_entry(obj, &name[hash_min(key, HASH_BITS(name))], member) /** * hash_for_each_possible_rcu - iterate over all possible objects hashing to the * same bucket in an rcu enabled hashtable * @name: hashtable to iterate * @obj: the type * to use as a loop cursor for each entry * @member: the name of the hlist_node within the struct * @key: the key of the objects to iterate over */ #define hash_for_each_possible_rcu(name, obj, member, key, cond...) \ hlist_for_each_entry_rcu(obj, &name[hash_min(key, HASH_BITS(name))],\ member, ## cond) /** * hash_for_each_possible_rcu_notrace - iterate over all possible objects hashing * to the same bucket in an rcu enabled hashtable in a rcu enabled hashtable * @name: hashtable to iterate * @obj: the type * to use as a loop cursor for each entry * @member: the name of the hlist_node within the struct * @key: the key of the objects to iterate over * * This is the same as hash_for_each_possible_rcu() except that it does * not do any RCU debugging or tracing. 
*/ #define hash_for_each_possible_rcu_notrace(name, obj, member, key) \ hlist_for_each_entry_rcu_notrace(obj, \ &name[hash_min(key, HASH_BITS(name))], member) /** * hash_for_each_possible_safe - iterate over all possible objects hashing to the * same bucket safe against removals * @name: hashtable to iterate * @obj: the type * to use as a loop cursor for each entry * @tmp: a &struct hlist_node used for temporary storage * @member: the name of the hlist_node within the struct * @key: the key of the objects to iterate over */ #define hash_for_each_possible_safe(name, obj, tmp, member, key) \ hlist_for_each_entry_safe(obj, tmp,\ &name[hash_min(key, HASH_BITS(name))], member) #endif
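/*
 * Editor's note (illustration only, not part of the header above): a short,
 * hedged usage sketch of the hashtable API. All "demo_*" names are invented.
 * The plain (non-_rcu) helpers shown here assume the caller provides its own
 * locking against concurrent writers.
 */
#include <linux/hashtable.h>
#include <linux/types.h>

static DEFINE_HASHTABLE(demo_table, 4);		/* 1 << 4 = 16 buckets */

struct demo_obj {
	int id;				/* the key */
	struct hlist_node node;		/* the member handed to the helpers */
};

static void demo_insert(struct demo_obj *obj)
{
	hash_add(demo_table, &obj->node, obj->id);
}

static struct demo_obj *demo_lookup(int id)
{
	struct demo_obj *obj;

	/* Walks only the bucket that "id" hashes to; different keys can
	 * still collide, so the key must be re-checked explicitly.
	 */
	hash_for_each_possible(demo_table, obj, node, id)
		if (obj->id == id)
			return obj;
	return NULL;
}

static void demo_remove(struct demo_obj *obj)
{
	hash_del(&obj->node);
}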
// SPDX-License-Identifier: GPL-2.0
/*
 * Based on arch/arm/mm/extable.c
 */

#include <linux/bitfield.h>
#include <linux/extable.h>
#include <linux/uaccess.h>

#include <asm/asm-extable.h>
#include <asm/ptrace.h>

static inline unsigned long
get_ex_fixup(const struct exception_table_entry *ex)
{
	return ((unsigned long)&ex->fixup + ex->fixup);
}

static bool ex_handler_uaccess_err_zero(const struct exception_table_entry *ex,
					struct pt_regs *regs)
{
	int reg_err = FIELD_GET(EX_DATA_REG_ERR, ex->data);
	int reg_zero = FIELD_GET(EX_DATA_REG_ZERO, ex->data);

	pt_regs_write_reg(regs, reg_err, -EFAULT);
	pt_regs_write_reg(regs, reg_zero, 0);

	regs->pc = get_ex_fixup(ex);
	return true;
}

static bool
ex_handler_load_unaligned_zeropad(const struct exception_table_entry *ex,
				  struct pt_regs *regs)
{
	int reg_data = FIELD_GET(EX_DATA_REG_DATA, ex->data);
	int reg_addr = FIELD_GET(EX_DATA_REG_ADDR, ex->data);
	unsigned long data, addr, offset;

	addr = pt_regs_read_reg(regs, reg_addr);

	offset = addr & 0x7UL;
	addr &= ~0x7UL;

	data = *(unsigned long *)addr;

#ifndef __AARCH64EB__
	data >>= 8 * offset;
#else
	data <<= 8 * offset;
#endif

	pt_regs_write_reg(regs, reg_data, data);

	regs->pc = get_ex_fixup(ex);
	return true;
}

bool fixup_exception(struct pt_regs *regs)
{
	const struct exception_table_entry *ex;

	ex = search_exception_tables(instruction_pointer(regs));
	if (!ex)
		return false;

	switch (ex->type) {
	case EX_TYPE_BPF:
		return ex_handler_bpf(ex, regs);
	case EX_TYPE_UACCESS_ERR_ZERO:
	case EX_TYPE_KACCESS_ERR_ZERO:
		return ex_handler_uaccess_err_zero(ex, regs);
	case EX_TYPE_LOAD_UNALIGNED_ZEROPAD:
		return ex_handler_load_unaligned_zeropad(ex, regs);
	}

	BUG();
}
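/*
 * Editor's note (illustration only; plain userspace C, not kernel code):
 * the arithmetic performed by ex_handler_load_unaligned_zeropad() above.
 * When an unaligned 8-byte load runs into an unreadable page, the fixup
 * re-reads the aligned word that contains the start of the access and, on
 * little-endian, shifts it right by 8 * offset so the unreadable bytes show
 * up as zeroes (big-endian uses the corresponding left shift). The buffer
 * below stands in for "a string ending just before an unmapped page".
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* Bytes 'A'..'H' followed by an all-zero word standing in for the
	 * page that could not be read.
	 */
	uint64_t pages[2] = { 0x4847464544434241ULL, 0 };
	uintptr_t addr = (uintptr_t)pages + 5;		/* unaligned load address */
	unsigned long offset = addr & 0x7UL;
	uint64_t data = *(uint64_t *)(addr & ~(uintptr_t)0x7);	/* aligned re-read */

	data >>= 8 * offset;	/* little-endian fixup */
	/* Prints 0x484746, i.e. "FGH" zero-padded: exactly what an 8-byte
	 * load at addr would return if the bytes past the boundary read
	 * as zero.
	 */
	printf("%#llx\n", (unsigned long long)data);
	return 0;
}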
3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 | // SPDX-License-Identifier: GPL-2.0+ /* * Driver for AMBA serial ports * * Based on drivers/char/serial.c, by Linus Torvalds, Theodore Ts'o. * * Copyright 1999 ARM Limited * Copyright (C) 2000 Deep Blue Solutions Ltd. * Copyright (C) 2010 ST-Ericsson SA * * This is a generic driver for ARM AMBA-type serial ports. They * have a lot of 16550-like features, but are not register compatible. * Note that although they do have CTS, DCD and DSR inputs, they do * not have an RI input, nor do they have DTR or RTS outputs. If * required, these have to be supplied via some other means (eg, GPIO) * and hooked into this driver. */ #include <linux/module.h> #include <linux/ioport.h> #include <linux/init.h> #include <linux/console.h> #include <linux/platform_device.h> #include <linux/sysrq.h> #include <linux/device.h> #include <linux/tty.h> #include <linux/tty_flip.h> #include <linux/serial_core.h> #include <linux/serial.h> #include <linux/amba/bus.h> #include <linux/amba/serial.h> #include <linux/clk.h> #include <linux/slab.h> #include <linux/dmaengine.h> #include <linux/dma-mapping.h> #include <linux/scatterlist.h> #include <linux/delay.h> #include <linux/types.h> #include <linux/of.h> #include <linux/pinctrl/consumer.h> #include <linux/sizes.h> #include <linux/io.h> #include <linux/acpi.h> #define UART_NR 14 #define SERIAL_AMBA_MAJOR 204 #define SERIAL_AMBA_MINOR 64 #define SERIAL_AMBA_NR UART_NR #define AMBA_ISR_PASS_LIMIT 256 #define UART_DR_ERROR (UART011_DR_OE | UART011_DR_BE | UART011_DR_PE | UART011_DR_FE) #define UART_DUMMY_DR_RX BIT(16) enum { REG_DR, REG_ST_DMAWM, REG_ST_TIMEOUT, REG_FR, REG_LCRH_RX, REG_LCRH_TX, REG_IBRD, REG_FBRD, REG_CR, REG_IFLS, REG_IMSC, REG_RIS, REG_MIS, REG_ICR, REG_DMACR, REG_ST_XFCR, REG_ST_XON1, REG_ST_XON2, REG_ST_XOFF1, REG_ST_XOFF2, REG_ST_ITCR, REG_ST_ITIP, REG_ST_ABCR, REG_ST_ABIMSC, /* The size of the array - must be last */ REG_ARRAY_SIZE, }; static u16 pl011_std_offsets[REG_ARRAY_SIZE] = { [REG_DR] = UART01x_DR, [REG_FR] = UART01x_FR, [REG_LCRH_RX] = UART011_LCRH, [REG_LCRH_TX] = UART011_LCRH, [REG_IBRD] = UART011_IBRD, [REG_FBRD] = UART011_FBRD, [REG_CR] = UART011_CR, [REG_IFLS] = UART011_IFLS, [REG_IMSC] = UART011_IMSC, [REG_RIS] = UART011_RIS, [REG_MIS] = UART011_MIS, [REG_ICR] = UART011_ICR, [REG_DMACR] = UART011_DMACR, }; /* There is by now at least one vendor with differing details, so handle it */ struct vendor_data { const u16 *reg_offset; unsigned int ifls; unsigned int fr_busy; unsigned int fr_dsr; unsigned int fr_cts; unsigned int fr_ri; unsigned int inv_fr; bool access_32b; bool oversampling; bool dma_threshold; bool cts_event_workaround; bool always_enabled; bool fixed_options; unsigned int (*get_fifosize)(struct amba_device *dev); }; static unsigned int get_fifosize_arm(struct amba_device *dev) { return amba_rev(dev) < 3 ? 
16 : 32; } static struct vendor_data vendor_arm = { .reg_offset = pl011_std_offsets, .ifls = UART011_IFLS_RX4_8 | UART011_IFLS_TX4_8, .fr_busy = UART01x_FR_BUSY, .fr_dsr = UART01x_FR_DSR, .fr_cts = UART01x_FR_CTS, .fr_ri = UART011_FR_RI, .oversampling = false, .dma_threshold = false, .cts_event_workaround = false, .always_enabled = false, .fixed_options = false, .get_fifosize = get_fifosize_arm, }; static const struct vendor_data vendor_sbsa = { .reg_offset = pl011_std_offsets, .fr_busy = UART01x_FR_BUSY, .fr_dsr = UART01x_FR_DSR, .fr_cts = UART01x_FR_CTS, .fr_ri = UART011_FR_RI, .access_32b = true, .oversampling = false, .dma_threshold = false, .cts_event_workaround = false, .always_enabled = true, .fixed_options = true, }; #ifdef CONFIG_ACPI_SPCR_TABLE static const struct vendor_data vendor_qdt_qdf2400_e44 = { .reg_offset = pl011_std_offsets, .fr_busy = UART011_FR_TXFE, .fr_dsr = UART01x_FR_DSR, .fr_cts = UART01x_FR_CTS, .fr_ri = UART011_FR_RI, .inv_fr = UART011_FR_TXFE, .access_32b = true, .oversampling = false, .dma_threshold = false, .cts_event_workaround = false, .always_enabled = true, .fixed_options = true, }; #endif static u16 pl011_st_offsets[REG_ARRAY_SIZE] = { [REG_DR] = UART01x_DR, [REG_ST_DMAWM] = ST_UART011_DMAWM, [REG_ST_TIMEOUT] = ST_UART011_TIMEOUT, [REG_FR] = UART01x_FR, [REG_LCRH_RX] = ST_UART011_LCRH_RX, [REG_LCRH_TX] = ST_UART011_LCRH_TX, [REG_IBRD] = UART011_IBRD, [REG_FBRD] = UART011_FBRD, [REG_CR] = UART011_CR, [REG_IFLS] = UART011_IFLS, [REG_IMSC] = UART011_IMSC, [REG_RIS] = UART011_RIS, [REG_MIS] = UART011_MIS, [REG_ICR] = UART011_ICR, [REG_DMACR] = UART011_DMACR, [REG_ST_XFCR] = ST_UART011_XFCR, [REG_ST_XON1] = ST_UART011_XON1, [REG_ST_XON2] = ST_UART011_XON2, [REG_ST_XOFF1] = ST_UART011_XOFF1, [REG_ST_XOFF2] = ST_UART011_XOFF2, [REG_ST_ITCR] = ST_UART011_ITCR, [REG_ST_ITIP] = ST_UART011_ITIP, [REG_ST_ABCR] = ST_UART011_ABCR, [REG_ST_ABIMSC] = ST_UART011_ABIMSC, }; static unsigned int get_fifosize_st(struct amba_device *dev) { return 64; } static struct vendor_data vendor_st = { .reg_offset = pl011_st_offsets, .ifls = UART011_IFLS_RX_HALF | UART011_IFLS_TX_HALF, .fr_busy = UART01x_FR_BUSY, .fr_dsr = UART01x_FR_DSR, .fr_cts = UART01x_FR_CTS, .fr_ri = UART011_FR_RI, .oversampling = true, .dma_threshold = true, .cts_event_workaround = true, .always_enabled = false, .fixed_options = false, .get_fifosize = get_fifosize_st, }; /* Deals with DMA transactions */ struct pl011_dmabuf { dma_addr_t dma; size_t len; char *buf; }; struct pl011_dmarx_data { struct dma_chan *chan; struct completion complete; bool use_buf_b; struct pl011_dmabuf dbuf_a; struct pl011_dmabuf dbuf_b; dma_cookie_t cookie; bool running; struct timer_list timer; unsigned int last_residue; unsigned long last_jiffies; bool auto_poll_rate; unsigned int poll_rate; unsigned int poll_timeout; }; struct pl011_dmatx_data { struct dma_chan *chan; dma_addr_t dma; size_t len; char *buf; bool queued; }; enum pl011_rs485_tx_state { OFF, WAIT_AFTER_RTS, SEND, WAIT_AFTER_SEND, }; /* * We wrap our port structure around the generic uart_port. 
*/ struct uart_amba_port { struct uart_port port; const u16 *reg_offset; struct clk *clk; const struct vendor_data *vendor; unsigned int im; /* interrupt mask */ unsigned int old_status; unsigned int fifosize; /* vendor-specific */ unsigned int fixed_baud; /* vendor-set fixed baud rate */ char type[12]; ktime_t rs485_tx_drain_interval; /* nano */ enum pl011_rs485_tx_state rs485_tx_state; struct hrtimer trigger_start_tx; struct hrtimer trigger_stop_tx; #ifdef CONFIG_DMA_ENGINE /* DMA stuff */ unsigned int dmacr; /* dma control reg */ bool using_tx_dma; bool using_rx_dma; struct pl011_dmarx_data dmarx; struct pl011_dmatx_data dmatx; bool dma_probed; #endif }; static unsigned int pl011_tx_empty(struct uart_port *port); static unsigned int pl011_reg_to_offset(const struct uart_amba_port *uap, unsigned int reg) { return uap->reg_offset[reg]; } static unsigned int pl011_read(const struct uart_amba_port *uap, unsigned int reg) { void __iomem *addr = uap->port.membase + pl011_reg_to_offset(uap, reg); return (uap->port.iotype == UPIO_MEM32) ? readl_relaxed(addr) : readw_relaxed(addr); } static void pl011_write(unsigned int val, const struct uart_amba_port *uap, unsigned int reg) { void __iomem *addr = uap->port.membase + pl011_reg_to_offset(uap, reg); if (uap->port.iotype == UPIO_MEM32) writel_relaxed(val, addr); else writew_relaxed(val, addr); } /* * Reads up to 256 characters from the FIFO or until it's empty and * inserts them into the TTY layer. Returns the number of characters * read from the FIFO. */ static int pl011_fifo_to_tty(struct uart_amba_port *uap) { unsigned int ch, fifotaken; int sysrq; u16 status; u8 flag; for (fifotaken = 0; fifotaken != 256; fifotaken++) { status = pl011_read(uap, REG_FR); if (status & UART01x_FR_RXFE) break; /* Take chars from the FIFO and update status */ ch = pl011_read(uap, REG_DR) | UART_DUMMY_DR_RX; flag = TTY_NORMAL; uap->port.icount.rx++; if (unlikely(ch & UART_DR_ERROR)) { if (ch & UART011_DR_BE) { ch &= ~(UART011_DR_FE | UART011_DR_PE); uap->port.icount.brk++; if (uart_handle_break(&uap->port)) continue; } else if (ch & UART011_DR_PE) { uap->port.icount.parity++; } else if (ch & UART011_DR_FE) { uap->port.icount.frame++; } if (ch & UART011_DR_OE) uap->port.icount.overrun++; ch &= uap->port.read_status_mask; if (ch & UART011_DR_BE) flag = TTY_BREAK; else if (ch & UART011_DR_PE) flag = TTY_PARITY; else if (ch & UART011_DR_FE) flag = TTY_FRAME; } sysrq = uart_prepare_sysrq_char(&uap->port, ch & 255); if (!sysrq) uart_insert_char(&uap->port, ch, UART011_DR_OE, ch, flag); } return fifotaken; } /* * All the DMA operation mode stuff goes inside this ifdef. * This assumes that you have a generic DMA device interface, * no custom DMA interfaces are supported. 
*/ #ifdef CONFIG_DMA_ENGINE #define PL011_DMA_BUFFER_SIZE PAGE_SIZE static int pl011_dmabuf_init(struct dma_chan *chan, struct pl011_dmabuf *db, enum dma_data_direction dir) { db->buf = dma_alloc_coherent(chan->device->dev, PL011_DMA_BUFFER_SIZE, &db->dma, GFP_KERNEL); if (!db->buf) return -ENOMEM; db->len = PL011_DMA_BUFFER_SIZE; return 0; } static void pl011_dmabuf_free(struct dma_chan *chan, struct pl011_dmabuf *db, enum dma_data_direction dir) { if (db->buf) { dma_free_coherent(chan->device->dev, PL011_DMA_BUFFER_SIZE, db->buf, db->dma); } } static void pl011_dma_probe(struct uart_amba_port *uap) { /* DMA is the sole user of the platform data right now */ struct amba_pl011_data *plat = dev_get_platdata(uap->port.dev); struct device *dev = uap->port.dev; struct dma_slave_config tx_conf = { .dst_addr = uap->port.mapbase + pl011_reg_to_offset(uap, REG_DR), .dst_addr_width = DMA_SLAVE_BUSWIDTH_1_BYTE, .direction = DMA_MEM_TO_DEV, .dst_maxburst = uap->fifosize >> 1, .device_fc = false, }; struct dma_chan *chan; dma_cap_mask_t mask; uap->dma_probed = true; chan = dma_request_chan(dev, "tx"); if (IS_ERR(chan)) { if (PTR_ERR(chan) == -EPROBE_DEFER) { uap->dma_probed = false; return; } /* We need platform data */ if (!plat || !plat->dma_filter) { dev_dbg(uap->port.dev, "no DMA platform data\n"); return; } /* Try to acquire a generic DMA engine slave TX channel */ dma_cap_zero(mask); dma_cap_set(DMA_SLAVE, mask); chan = dma_request_channel(mask, plat->dma_filter, plat->dma_tx_param); if (!chan) { dev_err(uap->port.dev, "no TX DMA channel!\n"); return; } } dmaengine_slave_config(chan, &tx_conf); uap->dmatx.chan = chan; dev_info(uap->port.dev, "DMA channel TX %s\n", dma_chan_name(uap->dmatx.chan)); /* Optionally make use of an RX channel as well */ chan = dma_request_chan(dev, "rx"); if (IS_ERR(chan) && plat && plat->dma_rx_param) { chan = dma_request_channel(mask, plat->dma_filter, plat->dma_rx_param); if (!chan) { dev_err(uap->port.dev, "no RX DMA channel!\n"); return; } } if (!IS_ERR(chan)) { struct dma_slave_config rx_conf = { .src_addr = uap->port.mapbase + pl011_reg_to_offset(uap, REG_DR), .src_addr_width = DMA_SLAVE_BUSWIDTH_1_BYTE, .direction = DMA_DEV_TO_MEM, .src_maxburst = uap->fifosize >> 2, .device_fc = false, }; struct dma_slave_caps caps; /* * Some DMA controllers provide information on their capabilities. * If the controller does, check for suitable residue processing * otherwise assime all is well. */ if (dma_get_slave_caps(chan, &caps) == 0) { if (caps.residue_granularity == DMA_RESIDUE_GRANULARITY_DESCRIPTOR) { dma_release_channel(chan); dev_info(uap->port.dev, "RX DMA disabled - no residue processing\n"); return; } } dmaengine_slave_config(chan, &rx_conf); uap->dmarx.chan = chan; uap->dmarx.auto_poll_rate = false; if (plat && plat->dma_rx_poll_enable) { /* Set poll rate if specified. */ if (plat->dma_rx_poll_rate) { uap->dmarx.auto_poll_rate = false; uap->dmarx.poll_rate = plat->dma_rx_poll_rate; } else { /* * 100 ms defaults to poll rate if not * specified. This will be adjusted with * the baud rate at set_termios. */ uap->dmarx.auto_poll_rate = true; uap->dmarx.poll_rate = 100; } /* 3 secs defaults poll_timeout if not specified. 
*/ if (plat->dma_rx_poll_timeout) uap->dmarx.poll_timeout = plat->dma_rx_poll_timeout; else uap->dmarx.poll_timeout = 3000; } else if (!plat && dev->of_node) { uap->dmarx.auto_poll_rate = of_property_read_bool(dev->of_node, "auto-poll"); if (uap->dmarx.auto_poll_rate) { u32 x; if (of_property_read_u32(dev->of_node, "poll-rate-ms", &x) == 0) uap->dmarx.poll_rate = x; else uap->dmarx.poll_rate = 100; if (of_property_read_u32(dev->of_node, "poll-timeout-ms", &x) == 0) uap->dmarx.poll_timeout = x; else uap->dmarx.poll_timeout = 3000; } } dev_info(uap->port.dev, "DMA channel RX %s\n", dma_chan_name(uap->dmarx.chan)); } } static void pl011_dma_remove(struct uart_amba_port *uap) { if (uap->dmatx.chan) dma_release_channel(uap->dmatx.chan); if (uap->dmarx.chan) dma_release_channel(uap->dmarx.chan); } /* Forward declare these for the refill routine */ static int pl011_dma_tx_refill(struct uart_amba_port *uap); static void pl011_start_tx_pio(struct uart_amba_port *uap); /* * The current DMA TX buffer has been sent. * Try to queue up another DMA buffer. */ static void pl011_dma_tx_callback(void *data) { struct uart_amba_port *uap = data; struct tty_port *tport = &uap->port.state->port; struct pl011_dmatx_data *dmatx = &uap->dmatx; unsigned long flags; u16 dmacr; uart_port_lock_irqsave(&uap->port, &flags); if (uap->dmatx.queued) dma_unmap_single(dmatx->chan->device->dev, dmatx->dma, dmatx->len, DMA_TO_DEVICE); dmacr = uap->dmacr; uap->dmacr = dmacr & ~UART011_TXDMAE; pl011_write(uap->dmacr, uap, REG_DMACR); /* * If TX DMA was disabled, it means that we've stopped the DMA for * some reason (eg, XOFF received, or we want to send an X-char.) * * Note: we need to be careful here of a potential race between DMA * and the rest of the driver - if the driver disables TX DMA while * a TX buffer completing, we must update the tx queued status to * get further refills (hence we check dmacr). */ if (!(dmacr & UART011_TXDMAE) || uart_tx_stopped(&uap->port) || kfifo_is_empty(&tport->xmit_fifo)) { uap->dmatx.queued = false; uart_port_unlock_irqrestore(&uap->port, flags); return; } if (pl011_dma_tx_refill(uap) <= 0) /* * We didn't queue a DMA buffer for some reason, but we * have data pending to be sent. Re-enable the TX IRQ. */ pl011_start_tx_pio(uap); uart_port_unlock_irqrestore(&uap->port, flags); } /* * Try to refill the TX DMA buffer. * Locking: called with port lock held and IRQs disabled. * Returns: * 1 if we queued up a TX DMA buffer. * 0 if we didn't want to handle this by DMA * <0 on error */ static int pl011_dma_tx_refill(struct uart_amba_port *uap) { struct pl011_dmatx_data *dmatx = &uap->dmatx; struct dma_chan *chan = dmatx->chan; struct dma_device *dma_dev = chan->device; struct dma_async_tx_descriptor *desc; struct tty_port *tport = &uap->port.state->port; unsigned int count; /* * Try to avoid the overhead involved in using DMA if the * transaction fits in the first half of the FIFO, by using * the standard interrupt handling. This ensures that we * issue a uart_write_wakeup() at the appropriate time. */ count = kfifo_len(&tport->xmit_fifo); if (count < (uap->fifosize >> 1)) { uap->dmatx.queued = false; return 0; } /* * Bodge: don't send the last character by DMA, as this * will prevent XON from notifying us to restart DMA. 
*/ count -= 1; /* Else proceed to copy the TX chars to the DMA buffer and fire DMA */ if (count > PL011_DMA_BUFFER_SIZE) count = PL011_DMA_BUFFER_SIZE; count = kfifo_out_peek(&tport->xmit_fifo, dmatx->buf, count); dmatx->len = count; dmatx->dma = dma_map_single(dma_dev->dev, dmatx->buf, count, DMA_TO_DEVICE); if (dmatx->dma == DMA_MAPPING_ERROR) { uap->dmatx.queued = false; dev_dbg(uap->port.dev, "unable to map TX DMA\n"); return -EBUSY; } desc = dmaengine_prep_slave_single(chan, dmatx->dma, dmatx->len, DMA_MEM_TO_DEV, DMA_PREP_INTERRUPT | DMA_CTRL_ACK); if (!desc) { dma_unmap_single(dma_dev->dev, dmatx->dma, dmatx->len, DMA_TO_DEVICE); uap->dmatx.queued = false; /* * If DMA cannot be used right now, we complete this * transaction via IRQ and let the TTY layer retry. */ dev_dbg(uap->port.dev, "TX DMA busy\n"); return -EBUSY; } /* Some data to go along to the callback */ desc->callback = pl011_dma_tx_callback; desc->callback_param = uap; /* All errors should happen at prepare time */ dmaengine_submit(desc); /* Fire the DMA transaction */ dma_dev->device_issue_pending(chan); uap->dmacr |= UART011_TXDMAE; pl011_write(uap->dmacr, uap, REG_DMACR); uap->dmatx.queued = true; /* * Now we know that DMA will fire, so advance the ring buffer * with the stuff we just dispatched. */ uart_xmit_advance(&uap->port, count); if (kfifo_len(&tport->xmit_fifo) < WAKEUP_CHARS) uart_write_wakeup(&uap->port); return 1; } /* * We received a transmit interrupt without a pending X-char but with * pending characters. * Locking: called with port lock held and IRQs disabled. * Returns: * false if we want to use PIO to transmit * true if we queued a DMA buffer */ static bool pl011_dma_tx_irq(struct uart_amba_port *uap) { if (!uap->using_tx_dma) return false; /* * If we already have a TX buffer queued, but received a * TX interrupt, it will be because we've just sent an X-char. * Ensure the TX DMA is enabled and the TX IRQ is disabled. */ if (uap->dmatx.queued) { uap->dmacr |= UART011_TXDMAE; pl011_write(uap->dmacr, uap, REG_DMACR); uap->im &= ~UART011_TXIM; pl011_write(uap->im, uap, REG_IMSC); return true; } /* * We don't have a TX buffer queued, so try to queue one. * If we successfully queued a buffer, mask the TX IRQ. */ if (pl011_dma_tx_refill(uap) > 0) { uap->im &= ~UART011_TXIM; pl011_write(uap->im, uap, REG_IMSC); return true; } return false; } /* * Stop the DMA transmit (eg, due to received XOFF). * Locking: called with port lock held and IRQs disabled. */ static inline void pl011_dma_tx_stop(struct uart_amba_port *uap) { if (uap->dmatx.queued) { uap->dmacr &= ~UART011_TXDMAE; pl011_write(uap->dmacr, uap, REG_DMACR); } } /* * Try to start a DMA transmit, or in the case of an XON/OFF * character queued for send, try to get that character out ASAP. * Locking: called with port lock held and IRQs disabled. * Returns: * false if we want the TX IRQ to be enabled * true if we have a buffer queued */ static inline bool pl011_dma_tx_start(struct uart_amba_port *uap) { u16 dmacr; if (!uap->using_tx_dma) return false; if (!uap->port.x_char) { /* no X-char, try to push chars out in DMA mode */ bool ret = true; if (!uap->dmatx.queued) { if (pl011_dma_tx_refill(uap) > 0) { uap->im &= ~UART011_TXIM; pl011_write(uap->im, uap, REG_IMSC); } else { ret = false; } } else if (!(uap->dmacr & UART011_TXDMAE)) { uap->dmacr |= UART011_TXDMAE; pl011_write(uap->dmacr, uap, REG_DMACR); } return ret; } /* * We have an X-char to send. Disable DMA to prevent it loading * the TX fifo, and then see if we can stuff it into the FIFO. 
*/ dmacr = uap->dmacr; uap->dmacr &= ~UART011_TXDMAE; pl011_write(uap->dmacr, uap, REG_DMACR); if (pl011_read(uap, REG_FR) & UART01x_FR_TXFF) { /* * No space in the FIFO, so enable the transmit interrupt * so we know when there is space. Note that once we've * loaded the character, we should just re-enable DMA. */ return false; } pl011_write(uap->port.x_char, uap, REG_DR); uap->port.icount.tx++; uap->port.x_char = 0; /* Success - restore the DMA state */ uap->dmacr = dmacr; pl011_write(dmacr, uap, REG_DMACR); return true; } /* * Flush the transmit buffer. * Locking: called with port lock held and IRQs disabled. */ static void pl011_dma_flush_buffer(struct uart_port *port) __releases(&uap->port.lock) __acquires(&uap->port.lock) { struct uart_amba_port *uap = container_of(port, struct uart_amba_port, port); if (!uap->using_tx_dma) return; dmaengine_terminate_async(uap->dmatx.chan); if (uap->dmatx.queued) { dma_unmap_single(uap->dmatx.chan->device->dev, uap->dmatx.dma, uap->dmatx.len, DMA_TO_DEVICE); uap->dmatx.queued = false; uap->dmacr &= ~UART011_TXDMAE; pl011_write(uap->dmacr, uap, REG_DMACR); } } static void pl011_dma_rx_callback(void *data); static int pl011_dma_rx_trigger_dma(struct uart_amba_port *uap) { struct dma_chan *rxchan = uap->dmarx.chan; struct pl011_dmarx_data *dmarx = &uap->dmarx; struct dma_async_tx_descriptor *desc; struct pl011_dmabuf *dbuf; if (!rxchan) return -EIO; /* Start the RX DMA job */ dbuf = uap->dmarx.use_buf_b ? &uap->dmarx.dbuf_b : &uap->dmarx.dbuf_a; desc = dmaengine_prep_slave_single(rxchan, dbuf->dma, dbuf->len, DMA_DEV_TO_MEM, DMA_PREP_INTERRUPT | DMA_CTRL_ACK); /* * If the DMA engine is busy and cannot prepare a * channel, no big deal, the driver will fall back * to interrupt mode as a result of this error code. */ if (!desc) { uap->dmarx.running = false; dmaengine_terminate_all(rxchan); return -EBUSY; } /* Some data to go along to the callback */ desc->callback = pl011_dma_rx_callback; desc->callback_param = uap; dmarx->cookie = dmaengine_submit(desc); dma_async_issue_pending(rxchan); uap->dmacr |= UART011_RXDMAE; pl011_write(uap->dmacr, uap, REG_DMACR); uap->dmarx.running = true; uap->im &= ~UART011_RXIM; pl011_write(uap->im, uap, REG_IMSC); return 0; } /* * This is called when either the DMA job is complete, or * the FIFO timeout interrupt occurred. This must be called * with the port spinlock uap->port.lock held. */ static void pl011_dma_rx_chars(struct uart_amba_port *uap, u32 pending, bool use_buf_b, bool readfifo) { struct tty_port *port = &uap->port.state->port; struct pl011_dmabuf *dbuf = use_buf_b ? &uap->dmarx.dbuf_b : &uap->dmarx.dbuf_a; int dma_count = 0; u32 fifotaken = 0; /* only used for vdbg() */ struct pl011_dmarx_data *dmarx = &uap->dmarx; int dmataken = 0; if (uap->dmarx.poll_rate) { /* The data can be taken by polling */ dmataken = dbuf->len - dmarx->last_residue; /* Recalculate the pending size */ if (pending >= dmataken) pending -= dmataken; } /* Pick the remain data from the DMA */ if (pending) { /* * First take all chars in the DMA pipe, then look in the FIFO. * Note that tty_insert_flip_buf() tries to take as many chars * as it can. 
*/ dma_count = tty_insert_flip_string(port, dbuf->buf + dmataken, pending); uap->port.icount.rx += dma_count; if (dma_count < pending) dev_warn(uap->port.dev, "couldn't insert all characters (TTY is full?)\n"); } /* Reset the last_residue for Rx DMA poll */ if (uap->dmarx.poll_rate) dmarx->last_residue = dbuf->len; /* * Only continue with trying to read the FIFO if all DMA chars have * been taken first. */ if (dma_count == pending && readfifo) { /* Clear any error flags */ pl011_write(UART011_OEIS | UART011_BEIS | UART011_PEIS | UART011_FEIS, uap, REG_ICR); /* * If we read all the DMA'd characters, and we had an * incomplete buffer, that could be due to an rx error, or * maybe we just timed out. Read any pending chars and check * the error status. * * Error conditions will only occur in the FIFO, these will * trigger an immediate interrupt and stop the DMA job, so we * will always find the error in the FIFO, never in the DMA * buffer. */ fifotaken = pl011_fifo_to_tty(uap); } dev_vdbg(uap->port.dev, "Took %d chars from DMA buffer and %d chars from the FIFO\n", dma_count, fifotaken); tty_flip_buffer_push(port); } static void pl011_dma_rx_irq(struct uart_amba_port *uap) { struct pl011_dmarx_data *dmarx = &uap->dmarx; struct dma_chan *rxchan = dmarx->chan; struct pl011_dmabuf *dbuf = dmarx->use_buf_b ? &dmarx->dbuf_b : &dmarx->dbuf_a; size_t pending; struct dma_tx_state state; enum dma_status dmastat; /* * Pause the transfer so we can trust the current counter, * do this before we pause the PL011 block, else we may * overflow the FIFO. */ if (dmaengine_pause(rxchan)) dev_err(uap->port.dev, "unable to pause DMA transfer\n"); dmastat = rxchan->device->device_tx_status(rxchan, dmarx->cookie, &state); if (dmastat != DMA_PAUSED) dev_err(uap->port.dev, "unable to pause DMA transfer\n"); /* Disable RX DMA - incoming data will wait in the FIFO */ uap->dmacr &= ~UART011_RXDMAE; pl011_write(uap->dmacr, uap, REG_DMACR); uap->dmarx.running = false; pending = dbuf->len - state.residue; BUG_ON(pending > PL011_DMA_BUFFER_SIZE); /* Then we terminate the transfer - we now know our residue */ dmaengine_terminate_all(rxchan); /* * This will take the chars we have so far and insert * into the framework. */ pl011_dma_rx_chars(uap, pending, dmarx->use_buf_b, true); /* Switch buffer & re-trigger DMA job */ dmarx->use_buf_b = !dmarx->use_buf_b; if (pl011_dma_rx_trigger_dma(uap)) { dev_dbg(uap->port.dev, "could not retrigger RX DMA job fall back to interrupt mode\n"); uap->im |= UART011_RXIM; pl011_write(uap->im, uap, REG_IMSC); } } static void pl011_dma_rx_callback(void *data) { struct uart_amba_port *uap = data; struct pl011_dmarx_data *dmarx = &uap->dmarx; struct dma_chan *rxchan = dmarx->chan; bool lastbuf = dmarx->use_buf_b; struct pl011_dmabuf *dbuf = dmarx->use_buf_b ? &dmarx->dbuf_b : &dmarx->dbuf_a; size_t pending; struct dma_tx_state state; int ret; /* * This completion interrupt occurs typically when the * RX buffer is totally stuffed but no timeout has yet * occurred. When that happens, we just want the RX * routine to flush out the secondary DMA buffer while * we immediately trigger the next DMA job. */ uart_port_lock_irq(&uap->port); /* * Rx data can be taken by the UART interrupts during * the DMA irq handler. So we check the residue here. 
*/ rxchan->device->device_tx_status(rxchan, dmarx->cookie, &state); pending = dbuf->len - state.residue; BUG_ON(pending > PL011_DMA_BUFFER_SIZE); /* Then we terminate the transfer - we now know our residue */ dmaengine_terminate_all(rxchan); uap->dmarx.running = false; dmarx->use_buf_b = !lastbuf; ret = pl011_dma_rx_trigger_dma(uap); pl011_dma_rx_chars(uap, pending, lastbuf, false); uart_unlock_and_check_sysrq(&uap->port); /* * Do this check after we picked the DMA chars so we don't * get some IRQ immediately from RX. */ if (ret) { dev_dbg(uap->port.dev, "could not retrigger RX DMA job fall back to interrupt mode\n"); uap->im |= UART011_RXIM; pl011_write(uap->im, uap, REG_IMSC); } } /* * Stop accepting received characters, when we're shutting down or * suspending this port. * Locking: called with port lock held and IRQs disabled. */ static inline void pl011_dma_rx_stop(struct uart_amba_port *uap) { if (!uap->using_rx_dma) return; /* FIXME. Just disable the DMA enable */ uap->dmacr &= ~UART011_RXDMAE; pl011_write(uap->dmacr, uap, REG_DMACR); } /* * Timer handler for Rx DMA polling. * Every polling, It checks the residue in the dma buffer and transfer * data to the tty. Also, last_residue is updated for the next polling. */ static void pl011_dma_rx_poll(struct timer_list *t) { struct uart_amba_port *uap = from_timer(uap, t, dmarx.timer); struct tty_port *port = &uap->port.state->port; struct pl011_dmarx_data *dmarx = &uap->dmarx; struct dma_chan *rxchan = uap->dmarx.chan; unsigned long flags; unsigned int dmataken = 0; unsigned int size = 0; struct pl011_dmabuf *dbuf; int dma_count; struct dma_tx_state state; dbuf = dmarx->use_buf_b ? &uap->dmarx.dbuf_b : &uap->dmarx.dbuf_a; rxchan->device->device_tx_status(rxchan, dmarx->cookie, &state); if (likely(state.residue < dmarx->last_residue)) { dmataken = dbuf->len - dmarx->last_residue; size = dmarx->last_residue - state.residue; dma_count = tty_insert_flip_string(port, dbuf->buf + dmataken, size); if (dma_count == size) dmarx->last_residue = state.residue; dmarx->last_jiffies = jiffies; } tty_flip_buffer_push(port); /* * If no data is received in poll_timeout, the driver will fall back * to interrupt mode. We will retrigger DMA at the first interrupt. 
*/ if (jiffies_to_msecs(jiffies - dmarx->last_jiffies) > uap->dmarx.poll_timeout) { uart_port_lock_irqsave(&uap->port, &flags); pl011_dma_rx_stop(uap); uap->im |= UART011_RXIM; pl011_write(uap->im, uap, REG_IMSC); uart_port_unlock_irqrestore(&uap->port, flags); uap->dmarx.running = false; dmaengine_terminate_all(rxchan); del_timer(&uap->dmarx.timer); } else { mod_timer(&uap->dmarx.timer, jiffies + msecs_to_jiffies(uap->dmarx.poll_rate)); } } static void pl011_dma_startup(struct uart_amba_port *uap) { int ret; if (!uap->dma_probed) pl011_dma_probe(uap); if (!uap->dmatx.chan) return; uap->dmatx.buf = kmalloc(PL011_DMA_BUFFER_SIZE, GFP_KERNEL | __GFP_DMA); if (!uap->dmatx.buf) { uap->port.fifosize = uap->fifosize; return; } uap->dmatx.len = PL011_DMA_BUFFER_SIZE; /* The DMA buffer is now the FIFO the TTY subsystem can use */ uap->port.fifosize = PL011_DMA_BUFFER_SIZE; uap->using_tx_dma = true; if (!uap->dmarx.chan) goto skip_rx; /* Allocate and map DMA RX buffers */ ret = pl011_dmabuf_init(uap->dmarx.chan, &uap->dmarx.dbuf_a, DMA_FROM_DEVICE); if (ret) { dev_err(uap->port.dev, "failed to init DMA %s: %d\n", "RX buffer A", ret); goto skip_rx; } ret = pl011_dmabuf_init(uap->dmarx.chan, &uap->dmarx.dbuf_b, DMA_FROM_DEVICE); if (ret) { dev_err(uap->port.dev, "failed to init DMA %s: %d\n", "RX buffer B", ret); pl011_dmabuf_free(uap->dmarx.chan, &uap->dmarx.dbuf_a, DMA_FROM_DEVICE); goto skip_rx; } uap->using_rx_dma = true; skip_rx: /* Turn on DMA error (RX/TX will be enabled on demand) */ uap->dmacr |= UART011_DMAONERR; pl011_write(uap->dmacr, uap, REG_DMACR); /* * ST Micro variants has some specific dma burst threshold * compensation. Set this to 16 bytes, so burst will only * be issued above/below 16 bytes. */ if (uap->vendor->dma_threshold) pl011_write(ST_UART011_DMAWM_RX_16 | ST_UART011_DMAWM_TX_16, uap, REG_ST_DMAWM); if (uap->using_rx_dma) { if (pl011_dma_rx_trigger_dma(uap)) dev_dbg(uap->port.dev, "could not trigger initial RX DMA job, fall back to interrupt mode\n"); if (uap->dmarx.poll_rate) { timer_setup(&uap->dmarx.timer, pl011_dma_rx_poll, 0); mod_timer(&uap->dmarx.timer, jiffies + msecs_to_jiffies(uap->dmarx.poll_rate)); uap->dmarx.last_residue = PL011_DMA_BUFFER_SIZE; uap->dmarx.last_jiffies = jiffies; } } } static void pl011_dma_shutdown(struct uart_amba_port *uap) { if (!(uap->using_tx_dma || uap->using_rx_dma)) return; /* Disable RX and TX DMA */ while (pl011_read(uap, REG_FR) & uap->vendor->fr_busy) cpu_relax(); uart_port_lock_irq(&uap->port); uap->dmacr &= ~(UART011_DMAONERR | UART011_RXDMAE | UART011_TXDMAE); pl011_write(uap->dmacr, uap, REG_DMACR); uart_port_unlock_irq(&uap->port); if (uap->using_tx_dma) { /* In theory, this should already be done by pl011_dma_flush_buffer */ dmaengine_terminate_all(uap->dmatx.chan); if (uap->dmatx.queued) { dma_unmap_single(uap->dmatx.chan->device->dev, uap->dmatx.dma, uap->dmatx.len, DMA_TO_DEVICE); uap->dmatx.queued = false; } kfree(uap->dmatx.buf); uap->using_tx_dma = false; } if (uap->using_rx_dma) { dmaengine_terminate_all(uap->dmarx.chan); /* Clean up the RX DMA */ pl011_dmabuf_free(uap->dmarx.chan, &uap->dmarx.dbuf_a, DMA_FROM_DEVICE); pl011_dmabuf_free(uap->dmarx.chan, &uap->dmarx.dbuf_b, DMA_FROM_DEVICE); if (uap->dmarx.poll_rate) del_timer_sync(&uap->dmarx.timer); uap->using_rx_dma = false; } } static inline bool pl011_dma_rx_available(struct uart_amba_port *uap) { return uap->using_rx_dma; } static inline bool pl011_dma_rx_running(struct uart_amba_port *uap) { return uap->using_rx_dma && uap->dmarx.running; } #else /* Blank 
functions if the DMA engine is not available */ static inline void pl011_dma_remove(struct uart_amba_port *uap) { } static inline void pl011_dma_startup(struct uart_amba_port *uap) { } static inline void pl011_dma_shutdown(struct uart_amba_port *uap) { } static inline bool pl011_dma_tx_irq(struct uart_amba_port *uap) { return false; } static inline void pl011_dma_tx_stop(struct uart_amba_port *uap) { } static inline bool pl011_dma_tx_start(struct uart_amba_port *uap) { return false; } static inline void pl011_dma_rx_irq(struct uart_amba_port *uap) { } static inline void pl011_dma_rx_stop(struct uart_amba_port *uap) { } static inline int pl011_dma_rx_trigger_dma(struct uart_amba_port *uap) { return -EIO; } static inline bool pl011_dma_rx_available(struct uart_amba_port *uap) { return false; } static inline bool pl011_dma_rx_running(struct uart_amba_port *uap) { return false; } #define pl011_dma_flush_buffer NULL #endif static void pl011_rs485_tx_stop(struct uart_amba_port *uap) { struct uart_port *port = &uap->port; u32 cr; if (uap->rs485_tx_state == SEND) uap->rs485_tx_state = WAIT_AFTER_SEND; if (uap->rs485_tx_state == WAIT_AFTER_SEND) { /* Schedule hrtimer if tx queue not empty */ if (!pl011_tx_empty(port)) { hrtimer_start(&uap->trigger_stop_tx, uap->rs485_tx_drain_interval, HRTIMER_MODE_REL); return; } if (port->rs485.delay_rts_after_send > 0) { hrtimer_start(&uap->trigger_stop_tx, ms_to_ktime(port->rs485.delay_rts_after_send), HRTIMER_MODE_REL); return; } /* Continue without any delay */ } else if (uap->rs485_tx_state == WAIT_AFTER_RTS) { hrtimer_try_to_cancel(&uap->trigger_start_tx); } cr = pl011_read(uap, REG_CR); if (port->rs485.flags & SER_RS485_RTS_AFTER_SEND) cr &= ~UART011_CR_RTS; else cr |= UART011_CR_RTS; /* Disable the transmitter and reenable the transceiver */ cr &= ~UART011_CR_TXE; cr |= UART011_CR_RXE; pl011_write(cr, uap, REG_CR); uap->rs485_tx_state = OFF; } static void pl011_stop_tx(struct uart_port *port) { struct uart_amba_port *uap = container_of(port, struct uart_amba_port, port); if (port->rs485.flags & SER_RS485_ENABLED && uap->rs485_tx_state == WAIT_AFTER_RTS) { pl011_rs485_tx_stop(uap); return; } uap->im &= ~UART011_TXIM; pl011_write(uap->im, uap, REG_IMSC); pl011_dma_tx_stop(uap); if (port->rs485.flags & SER_RS485_ENABLED && uap->rs485_tx_state != OFF) pl011_rs485_tx_stop(uap); } static bool pl011_tx_chars(struct uart_amba_port *uap, bool from_irq); /* Start TX with programmed I/O only (no DMA) */ static void pl011_start_tx_pio(struct uart_amba_port *uap) { if (pl011_tx_chars(uap, false)) { uap->im |= UART011_TXIM; pl011_write(uap->im, uap, REG_IMSC); } } static void pl011_rs485_tx_start(struct uart_amba_port *uap) { struct uart_port *port = &uap->port; u32 cr; if (uap->rs485_tx_state == WAIT_AFTER_RTS) { uap->rs485_tx_state = SEND; return; } if (uap->rs485_tx_state == WAIT_AFTER_SEND) { hrtimer_try_to_cancel(&uap->trigger_stop_tx); uap->rs485_tx_state = SEND; return; } /* uap->rs485_tx_state == OFF */ /* Enable transmitter */ cr = pl011_read(uap, REG_CR); cr |= UART011_CR_TXE; /* Disable receiver if half-duplex */ if (!(port->rs485.flags & SER_RS485_RX_DURING_TX)) cr &= ~UART011_CR_RXE; if (port->rs485.flags & SER_RS485_RTS_ON_SEND) cr &= ~UART011_CR_RTS; else cr |= UART011_CR_RTS; pl011_write(cr, uap, REG_CR); if (port->rs485.delay_rts_before_send > 0) { uap->rs485_tx_state = WAIT_AFTER_RTS; hrtimer_start(&uap->trigger_start_tx, ms_to_ktime(port->rs485.delay_rts_before_send), HRTIMER_MODE_REL); } else { uap->rs485_tx_state = SEND; } } static void 
pl011_start_tx(struct uart_port *port) { struct uart_amba_port *uap = container_of(port, struct uart_amba_port, port); if ((uap->port.rs485.flags & SER_RS485_ENABLED) && uap->rs485_tx_state != SEND) { pl011_rs485_tx_start(uap); if (uap->rs485_tx_state == WAIT_AFTER_RTS) return; } if (!pl011_dma_tx_start(uap)) pl011_start_tx_pio(uap); } static enum hrtimer_restart pl011_trigger_start_tx(struct hrtimer *t) { struct uart_amba_port *uap = container_of(t, struct uart_amba_port, trigger_start_tx); unsigned long flags; uart_port_lock_irqsave(&uap->port, &flags); if (uap->rs485_tx_state == WAIT_AFTER_RTS) pl011_start_tx(&uap->port); uart_port_unlock_irqrestore(&uap->port, flags); return HRTIMER_NORESTART; } static enum hrtimer_restart pl011_trigger_stop_tx(struct hrtimer *t) { struct uart_amba_port *uap = container_of(t, struct uart_amba_port, trigger_stop_tx); unsigned long flags; uart_port_lock_irqsave(&uap->port, &flags); if (uap->rs485_tx_state == WAIT_AFTER_SEND) pl011_rs485_tx_stop(uap); uart_port_unlock_irqrestore(&uap->port, flags); return HRTIMER_NORESTART; } static void pl011_stop_rx(struct uart_port *port) { struct uart_amba_port *uap = container_of(port, struct uart_amba_port, port); uap->im &= ~(UART011_RXIM | UART011_RTIM | UART011_FEIM | UART011_PEIM | UART011_BEIM | UART011_OEIM); pl011_write(uap->im, uap, REG_IMSC); pl011_dma_rx_stop(uap); } static void pl011_throttle_rx(struct uart_port *port) { unsigned long flags; uart_port_lock_irqsave(port, &flags); pl011_stop_rx(port); uart_port_unlock_irqrestore(port, flags); } static void pl011_enable_ms(struct uart_port *port) { struct uart_amba_port *uap = container_of(port, struct uart_amba_port, port); uap->im |= UART011_RIMIM | UART011_CTSMIM | UART011_DCDMIM | UART011_DSRMIM; pl011_write(uap->im, uap, REG_IMSC); } static void pl011_rx_chars(struct uart_amba_port *uap) __releases(&uap->port.lock) __acquires(&uap->port.lock) { pl011_fifo_to_tty(uap); uart_port_unlock(&uap->port); tty_flip_buffer_push(&uap->port.state->port); /* * If we were temporarily out of DMA mode for a while, * attempt to switch back to DMA mode again. */ if (pl011_dma_rx_available(uap)) { if (pl011_dma_rx_trigger_dma(uap)) { dev_dbg(uap->port.dev, "could not trigger RX DMA job fall back to interrupt mode again\n"); uap->im |= UART011_RXIM; pl011_write(uap->im, uap, REG_IMSC); } else { #ifdef CONFIG_DMA_ENGINE /* Start Rx DMA poll */ if (uap->dmarx.poll_rate) { uap->dmarx.last_jiffies = jiffies; uap->dmarx.last_residue = PL011_DMA_BUFFER_SIZE; mod_timer(&uap->dmarx.timer, jiffies + msecs_to_jiffies(uap->dmarx.poll_rate)); } #endif } } uart_port_lock(&uap->port); } static bool pl011_tx_char(struct uart_amba_port *uap, unsigned char c, bool from_irq) { if (unlikely(!from_irq) && pl011_read(uap, REG_FR) & UART01x_FR_TXFF) return false; /* unable to transmit character */ pl011_write(c, uap, REG_DR); uap->port.icount.tx++; return true; } /* Returns true if tx interrupts have to be (kept) enabled */ static bool pl011_tx_chars(struct uart_amba_port *uap, bool from_irq) { struct tty_port *tport = &uap->port.state->port; int count = uap->fifosize >> 1; if (uap->port.x_char) { if (!pl011_tx_char(uap, uap->port.x_char, from_irq)) return true; uap->port.x_char = 0; --count; } if (kfifo_is_empty(&tport->xmit_fifo) || uart_tx_stopped(&uap->port)) { pl011_stop_tx(&uap->port); return false; } /* If we are using DMA mode, try to send some characters. 
*/ if (pl011_dma_tx_irq(uap)) return true; while (1) { unsigned char c; if (likely(from_irq) && count-- == 0) break; if (!kfifo_peek(&tport->xmit_fifo, &c)) break; if (!pl011_tx_char(uap, c, from_irq)) break; kfifo_skip(&tport->xmit_fifo); } if (kfifo_len(&tport->xmit_fifo) < WAKEUP_CHARS) uart_write_wakeup(&uap->port); if (kfifo_is_empty(&tport->xmit_fifo)) { pl011_stop_tx(&uap->port); return false; } return true; } static void pl011_modem_status(struct uart_amba_port *uap) { unsigned int status, delta; status = pl011_read(uap, REG_FR) & UART01x_FR_MODEM_ANY; delta = status ^ uap->old_status; uap->old_status = status; if (!delta) return; if (delta & UART01x_FR_DCD) uart_handle_dcd_change(&uap->port, status & UART01x_FR_DCD); if (delta & uap->vendor->fr_dsr) uap->port.icount.dsr++; if (delta & uap->vendor->fr_cts) uart_handle_cts_change(&uap->port, status & uap->vendor->fr_cts); wake_up_interruptible(&uap->port.state->port.delta_msr_wait); } static void check_apply_cts_event_workaround(struct uart_amba_port *uap) { if (!uap->vendor->cts_event_workaround) return; /* workaround to make sure that all bits are unlocked.. */ pl011_write(0x00, uap, REG_ICR); /* * WA: introduce 26ns(1 uart clk) delay before W1C; * single apb access will incur 2 pclk(133.12Mhz) delay, * so add 2 dummy reads */ pl011_read(uap, REG_ICR); pl011_read(uap, REG_ICR); } static irqreturn_t pl011_int(int irq, void *dev_id) { struct uart_amba_port *uap = dev_id; unsigned int status, pass_counter = AMBA_ISR_PASS_LIMIT; int handled = 0; uart_port_lock(&uap->port); status = pl011_read(uap, REG_RIS) & uap->im; if (status) { do { check_apply_cts_event_workaround(uap); pl011_write(status & ~(UART011_TXIS | UART011_RTIS | UART011_RXIS), uap, REG_ICR); if (status & (UART011_RTIS | UART011_RXIS)) { if (pl011_dma_rx_running(uap)) pl011_dma_rx_irq(uap); else pl011_rx_chars(uap); } if (status & (UART011_DSRMIS | UART011_DCDMIS | UART011_CTSMIS | UART011_RIMIS)) pl011_modem_status(uap); if (status & UART011_TXIS) pl011_tx_chars(uap, true); if (pass_counter-- == 0) break; status = pl011_read(uap, REG_RIS) & uap->im; } while (status != 0); handled = 1; } uart_unlock_and_check_sysrq(&uap->port); return IRQ_RETVAL(handled); } static unsigned int pl011_tx_empty(struct uart_port *port) { struct uart_amba_port *uap = container_of(port, struct uart_amba_port, port); /* Allow feature register bits to be inverted to work around errata */ unsigned int status = pl011_read(uap, REG_FR) ^ uap->vendor->inv_fr; return status & (uap->vendor->fr_busy | UART01x_FR_TXFF) ? 
0 : TIOCSER_TEMT; } static void pl011_maybe_set_bit(bool cond, unsigned int *ptr, unsigned int mask) { if (cond) *ptr |= mask; } static unsigned int pl011_get_mctrl(struct uart_port *port) { struct uart_amba_port *uap = container_of(port, struct uart_amba_port, port); unsigned int result = 0; unsigned int status = pl011_read(uap, REG_FR); pl011_maybe_set_bit(status & UART01x_FR_DCD, &result, TIOCM_CAR); pl011_maybe_set_bit(status & uap->vendor->fr_dsr, &result, TIOCM_DSR); pl011_maybe_set_bit(status & uap->vendor->fr_cts, &result, TIOCM_CTS); pl011_maybe_set_bit(status & uap->vendor->fr_ri, &result, TIOCM_RNG); return result; } static void pl011_assign_bit(bool cond, unsigned int *ptr, unsigned int mask) { if (cond) *ptr |= mask; else *ptr &= ~mask; } static void pl011_set_mctrl(struct uart_port *port, unsigned int mctrl) { struct uart_amba_port *uap = container_of(port, struct uart_amba_port, port); unsigned int cr; cr = pl011_read(uap, REG_CR); pl011_assign_bit(mctrl & TIOCM_RTS, &cr, UART011_CR_RTS); pl011_assign_bit(mctrl & TIOCM_DTR, &cr, UART011_CR_DTR); pl011_assign_bit(mctrl & TIOCM_OUT1, &cr, UART011_CR_OUT1); pl011_assign_bit(mctrl & TIOCM_OUT2, &cr, UART011_CR_OUT2); pl011_assign_bit(mctrl & TIOCM_LOOP, &cr, UART011_CR_LBE); if (port->status & UPSTAT_AUTORTS) { /* We need to disable auto-RTS if we want to turn RTS off */ pl011_assign_bit(mctrl & TIOCM_RTS, &cr, UART011_CR_RTSEN); } pl011_write(cr, uap, REG_CR); } static void pl011_break_ctl(struct uart_port *port, int break_state) { struct uart_amba_port *uap = container_of(port, struct uart_amba_port, port); unsigned long flags; unsigned int lcr_h; uart_port_lock_irqsave(&uap->port, &flags); lcr_h = pl011_read(uap, REG_LCRH_TX); if (break_state == -1) lcr_h |= UART01x_LCRH_BRK; else lcr_h &= ~UART01x_LCRH_BRK; pl011_write(lcr_h, uap, REG_LCRH_TX); uart_port_unlock_irqrestore(&uap->port, flags); } #ifdef CONFIG_CONSOLE_POLL static void pl011_quiesce_irqs(struct uart_port *port) { struct uart_amba_port *uap = container_of(port, struct uart_amba_port, port); pl011_write(pl011_read(uap, REG_MIS), uap, REG_ICR); /* * There is no way to clear TXIM as this is "ready to transmit IRQ", so * we simply mask it. start_tx() will unmask it. * * Note we can race with start_tx(), and if the race happens, the * polling user might get another interrupt just after we clear it. * But it should be OK and can happen even w/o the race, e.g. * controller immediately got some new data and raised the IRQ. * * And whoever uses polling routines assumes that it manages the device * (including tx queue), so we're also fine with start_tx()'s caller * side. */ pl011_write(pl011_read(uap, REG_IMSC) & ~UART011_TXIM, uap, REG_IMSC); } static int pl011_get_poll_char(struct uart_port *port) { struct uart_amba_port *uap = container_of(port, struct uart_amba_port, port); unsigned int status; /* * The caller might need IRQs lowered, e.g. if used with KDB NMI * debugger. 
*/ pl011_quiesce_irqs(port); status = pl011_read(uap, REG_FR); if (status & UART01x_FR_RXFE) return NO_POLL_CHAR; return pl011_read(uap, REG_DR); } static void pl011_put_poll_char(struct uart_port *port, unsigned char ch) { struct uart_amba_port *uap = container_of(port, struct uart_amba_port, port); while (pl011_read(uap, REG_FR) & UART01x_FR_TXFF) cpu_relax(); pl011_write(ch, uap, REG_DR); } #endif /* CONFIG_CONSOLE_POLL */ static int pl011_hwinit(struct uart_port *port) { struct uart_amba_port *uap = container_of(port, struct uart_amba_port, port); int retval; /* Optionaly enable pins to be muxed in and configured */ pinctrl_pm_select_default_state(port->dev); /* * Try to enable the clock producer. */ retval = clk_prepare_enable(uap->clk); if (retval) return retval; uap->port.uartclk = clk_get_rate(uap->clk); /* Clear pending error and receive interrupts */ pl011_write(UART011_OEIS | UART011_BEIS | UART011_PEIS | UART011_FEIS | UART011_RTIS | UART011_RXIS, uap, REG_ICR); /* * Save interrupts enable mask, and enable RX interrupts in case if * the interrupt is used for NMI entry. */ uap->im = pl011_read(uap, REG_IMSC); pl011_write(UART011_RTIM | UART011_RXIM, uap, REG_IMSC); if (dev_get_platdata(uap->port.dev)) { struct amba_pl011_data *plat; plat = dev_get_platdata(uap->port.dev); if (plat->init) plat->init(); } return 0; } static bool pl011_split_lcrh(const struct uart_amba_port *uap) { return pl011_reg_to_offset(uap, REG_LCRH_RX) != pl011_reg_to_offset(uap, REG_LCRH_TX); } static void pl011_write_lcr_h(struct uart_amba_port *uap, unsigned int lcr_h) { pl011_write(lcr_h, uap, REG_LCRH_RX); if (pl011_split_lcrh(uap)) { int i; /* * Wait 10 PCLKs before writing LCRH_TX register, * to get this delay write read only register 10 times */ for (i = 0; i < 10; ++i) pl011_write(0xff, uap, REG_MIS); pl011_write(lcr_h, uap, REG_LCRH_TX); } } static int pl011_allocate_irq(struct uart_amba_port *uap) { pl011_write(uap->im, uap, REG_IMSC); return request_irq(uap->port.irq, pl011_int, IRQF_SHARED, "uart-pl011", uap); } /* * Enable interrupts, only timeouts when using DMA * if initial RX DMA job failed, start in interrupt mode * as well. */ static void pl011_enable_interrupts(struct uart_amba_port *uap) { unsigned long flags; unsigned int i; uart_port_lock_irqsave(&uap->port, &flags); /* Clear out any spuriously appearing RX interrupts */ pl011_write(UART011_RTIS | UART011_RXIS, uap, REG_ICR); /* * RXIS is asserted only when the RX FIFO transitions from below * to above the trigger threshold. If the RX FIFO is already * full to the threshold this can't happen and RXIS will now be * stuck off. 
Drain the RX FIFO explicitly to fix this: */ for (i = 0; i < uap->fifosize * 2; ++i) { if (pl011_read(uap, REG_FR) & UART01x_FR_RXFE) break; pl011_read(uap, REG_DR); } uap->im = UART011_RTIM; if (!pl011_dma_rx_running(uap)) uap->im |= UART011_RXIM; pl011_write(uap->im, uap, REG_IMSC); uart_port_unlock_irqrestore(&uap->port, flags); } static void pl011_unthrottle_rx(struct uart_port *port) { struct uart_amba_port *uap = container_of(port, struct uart_amba_port, port); unsigned long flags; uart_port_lock_irqsave(&uap->port, &flags); uap->im = UART011_RTIM; if (!pl011_dma_rx_running(uap)) uap->im |= UART011_RXIM; pl011_write(uap->im, uap, REG_IMSC); #ifdef CONFIG_DMA_ENGINE if (uap->using_rx_dma) { uap->dmacr |= UART011_RXDMAE; pl011_write(uap->dmacr, uap, REG_DMACR); } #endif uart_port_unlock_irqrestore(&uap->port, flags); } static int pl011_startup(struct uart_port *port) { struct uart_amba_port *uap = container_of(port, struct uart_amba_port, port); unsigned int cr; int retval; retval = pl011_hwinit(port); if (retval) goto clk_dis; retval = pl011_allocate_irq(uap); if (retval) goto clk_dis; pl011_write(uap->vendor->ifls, uap, REG_IFLS); uart_port_lock_irq(&uap->port); cr = pl011_read(uap, REG_CR); cr &= UART011_CR_RTS | UART011_CR_DTR; cr |= UART01x_CR_UARTEN | UART011_CR_RXE; if (!(port->rs485.flags & SER_RS485_ENABLED)) cr |= UART011_CR_TXE; pl011_write(cr, uap, REG_CR); uart_port_unlock_irq(&uap->port); /* * initialise the old status of the modem signals */ uap->old_status = pl011_read(uap, REG_FR) & UART01x_FR_MODEM_ANY; /* Startup DMA */ pl011_dma_startup(uap); pl011_enable_interrupts(uap); return 0; clk_dis: clk_disable_unprepare(uap->clk); return retval; } static int sbsa_uart_startup(struct uart_port *port) { struct uart_amba_port *uap = container_of(port, struct uart_amba_port, port); int retval; retval = pl011_hwinit(port); if (retval) return retval; retval = pl011_allocate_irq(uap); if (retval) return retval; /* The SBSA UART does not support any modem status lines. */ uap->old_status = 0; pl011_enable_interrupts(uap); return 0; } static void pl011_shutdown_channel(struct uart_amba_port *uap, unsigned int lcrh) { unsigned long val; val = pl011_read(uap, lcrh); val &= ~(UART01x_LCRH_BRK | UART01x_LCRH_FEN); pl011_write(val, uap, lcrh); } /* * disable the port. It should not disable RTS and DTR. * Also RTS and DTR state should be preserved to restore * it during startup(). 
*/ static void pl011_disable_uart(struct uart_amba_port *uap) { unsigned int cr; uap->port.status &= ~(UPSTAT_AUTOCTS | UPSTAT_AUTORTS); uart_port_lock_irq(&uap->port); cr = pl011_read(uap, REG_CR); cr &= UART011_CR_RTS | UART011_CR_DTR; cr |= UART01x_CR_UARTEN | UART011_CR_TXE; pl011_write(cr, uap, REG_CR); uart_port_unlock_irq(&uap->port); /* * disable break condition and fifos */ pl011_shutdown_channel(uap, REG_LCRH_RX); if (pl011_split_lcrh(uap)) pl011_shutdown_channel(uap, REG_LCRH_TX); } static void pl011_disable_interrupts(struct uart_amba_port *uap) { uart_port_lock_irq(&uap->port); /* mask all interrupts and clear all pending ones */ uap->im = 0; pl011_write(uap->im, uap, REG_IMSC); pl011_write(0xffff, uap, REG_ICR); uart_port_unlock_irq(&uap->port); } static void pl011_shutdown(struct uart_port *port) { struct uart_amba_port *uap = container_of(port, struct uart_amba_port, port); pl011_disable_interrupts(uap); pl011_dma_shutdown(uap); if ((port->rs485.flags & SER_RS485_ENABLED && uap->rs485_tx_state != OFF)) pl011_rs485_tx_stop(uap); free_irq(uap->port.irq, uap); pl011_disable_uart(uap); /* * Shut down the clock producer */ clk_disable_unprepare(uap->clk); /* Optionally let pins go into sleep states */ pinctrl_pm_select_sleep_state(port->dev); if (dev_get_platdata(uap->port.dev)) { struct amba_pl011_data *plat; plat = dev_get_platdata(uap->port.dev); if (plat->exit) plat->exit(); } if (uap->port.ops->flush_buffer) uap->port.ops->flush_buffer(port); } static void sbsa_uart_shutdown(struct uart_port *port) { struct uart_amba_port *uap = container_of(port, struct uart_amba_port, port); pl011_disable_interrupts(uap); free_irq(uap->port.irq, uap); if (uap->port.ops->flush_buffer) uap->port.ops->flush_buffer(port); } static void pl011_setup_status_masks(struct uart_port *port, struct ktermios *termios) { port->read_status_mask = UART011_DR_OE | 255; if (termios->c_iflag & INPCK) port->read_status_mask |= UART011_DR_FE | UART011_DR_PE; if (termios->c_iflag & (IGNBRK | BRKINT | PARMRK)) port->read_status_mask |= UART011_DR_BE; /* * Characters to ignore */ port->ignore_status_mask = 0; if (termios->c_iflag & IGNPAR) port->ignore_status_mask |= UART011_DR_FE | UART011_DR_PE; if (termios->c_iflag & IGNBRK) { port->ignore_status_mask |= UART011_DR_BE; /* * If we're ignoring parity and break indicators, * ignore overruns too (for real raw support). */ if (termios->c_iflag & IGNPAR) port->ignore_status_mask |= UART011_DR_OE; } /* * Ignore all characters if CREAD is not set. */ if ((termios->c_cflag & CREAD) == 0) port->ignore_status_mask |= UART_DUMMY_DR_RX; } static void pl011_set_termios(struct uart_port *port, struct ktermios *termios, const struct ktermios *old) { struct uart_amba_port *uap = container_of(port, struct uart_amba_port, port); unsigned int lcr_h, old_cr; unsigned long flags; unsigned int baud, quot, clkdiv; unsigned int bits; if (uap->vendor->oversampling) clkdiv = 8; else clkdiv = 16; /* * Ask the core to calculate the divisor for us. */ baud = uart_get_baud_rate(port, termios, old, 0, port->uartclk / clkdiv); #ifdef CONFIG_DMA_ENGINE /* * Adjust RX DMA polling rate with baud rate if not specified. 
*/ if (uap->dmarx.auto_poll_rate) uap->dmarx.poll_rate = DIV_ROUND_UP(10000000, baud); #endif if (baud > port->uartclk / 16) quot = DIV_ROUND_CLOSEST(port->uartclk * 8, baud); else quot = DIV_ROUND_CLOSEST(port->uartclk * 4, baud); switch (termios->c_cflag & CSIZE) { case CS5: lcr_h = UART01x_LCRH_WLEN_5; break; case CS6: lcr_h = UART01x_LCRH_WLEN_6; break; case CS7: lcr_h = UART01x_LCRH_WLEN_7; break; default: // CS8 lcr_h = UART01x_LCRH_WLEN_8; break; } if (termios->c_cflag & CSTOPB) lcr_h |= UART01x_LCRH_STP2; if (termios->c_cflag & PARENB) { lcr_h |= UART01x_LCRH_PEN; if (!(termios->c_cflag & PARODD)) lcr_h |= UART01x_LCRH_EPS; if (termios->c_cflag & CMSPAR) lcr_h |= UART011_LCRH_SPS; } if (uap->fifosize > 1) lcr_h |= UART01x_LCRH_FEN; bits = tty_get_frame_size(termios->c_cflag); uart_port_lock_irqsave(port, &flags); /* * Update the per-port timeout. */ uart_update_timeout(port, termios->c_cflag, baud); /* * Calculate the approximated time it takes to transmit one character * with the given baud rate. We use this as the poll interval when we * wait for the tx queue to empty. */ uap->rs485_tx_drain_interval = ns_to_ktime(DIV_ROUND_UP(bits * NSEC_PER_SEC, baud)); pl011_setup_status_masks(port, termios); if (UART_ENABLE_MS(port, termios->c_cflag)) pl011_enable_ms(port); if (port->rs485.flags & SER_RS485_ENABLED) termios->c_cflag &= ~CRTSCTS; old_cr = pl011_read(uap, REG_CR); if (termios->c_cflag & CRTSCTS) { if (old_cr & UART011_CR_RTS) old_cr |= UART011_CR_RTSEN; old_cr |= UART011_CR_CTSEN; port->status |= UPSTAT_AUTOCTS | UPSTAT_AUTORTS; } else { old_cr &= ~(UART011_CR_CTSEN | UART011_CR_RTSEN); port->status &= ~(UPSTAT_AUTOCTS | UPSTAT_AUTORTS); } if (uap->vendor->oversampling) { if (baud > port->uartclk / 16) old_cr |= ST_UART011_CR_OVSFACT; else old_cr &= ~ST_UART011_CR_OVSFACT; } /* * Workaround for the ST Micro oversampling variants to * increase the bitrate slightly, by lowering the divisor, * to avoid delayed sampling of start bit at high speeds, * else we see data corruption. */ if (uap->vendor->oversampling) { if (baud >= 3000000 && baud < 3250000 && quot > 1) quot -= 1; else if (baud > 3250000 && quot > 2) quot -= 2; } /* Set baud rate */ pl011_write(quot & 0x3f, uap, REG_FBRD); pl011_write(quot >> 6, uap, REG_IBRD); /* * ----------v----------v----------v----------v----- * NOTE: REG_LCRH_TX and REG_LCRH_RX MUST BE WRITTEN AFTER * REG_FBRD & REG_IBRD. * ----------^----------^----------^----------^----- */ pl011_write_lcr_h(uap, lcr_h); /* * Receive was disabled by pl011_disable_uart during shutdown. * Need to reenable receive if you need to use a tty_driver * returns from tty_find_polling_driver() after a port shutdown. */ old_cr |= UART011_CR_RXE; pl011_write(old_cr, uap, REG_CR); uart_port_unlock_irqrestore(port, flags); } static void sbsa_uart_set_termios(struct uart_port *port, struct ktermios *termios, const struct ktermios *old) { struct uart_amba_port *uap = container_of(port, struct uart_amba_port, port); unsigned long flags; tty_termios_encode_baud_rate(termios, uap->fixed_baud, uap->fixed_baud); /* The SBSA UART only supports 8n1 without hardware flow control. 
*/ termios->c_cflag &= ~(CSIZE | CSTOPB | PARENB | PARODD); termios->c_cflag &= ~(CMSPAR | CRTSCTS); termios->c_cflag |= CS8 | CLOCAL; uart_port_lock_irqsave(port, &flags); uart_update_timeout(port, CS8, uap->fixed_baud); pl011_setup_status_masks(port, termios); uart_port_unlock_irqrestore(port, flags); } static const char *pl011_type(struct uart_port *port) { struct uart_amba_port *uap = container_of(port, struct uart_amba_port, port); return uap->port.type == PORT_AMBA ? uap->type : NULL; } /* * Configure/autoconfigure the port. */ static void pl011_config_port(struct uart_port *port, int flags) { if (flags & UART_CONFIG_TYPE) port->type = PORT_AMBA; } /* * verify the new serial_struct (for TIOCSSERIAL). */ static int pl011_verify_port(struct uart_port *port, struct serial_struct *ser) { int ret = 0; if (ser->type != PORT_UNKNOWN && ser->type != PORT_AMBA) ret = -EINVAL; if (ser->irq < 0 || ser->irq >= irq_get_nr_irqs()) ret = -EINVAL; if (ser->baud_base < 9600) ret = -EINVAL; if (port->mapbase != (unsigned long)ser->iomem_base) ret = -EINVAL; return ret; } static int pl011_rs485_config(struct uart_port *port, struct ktermios *termios, struct serial_rs485 *rs485) { struct uart_amba_port *uap = container_of(port, struct uart_amba_port, port); if (port->rs485.flags & SER_RS485_ENABLED) pl011_rs485_tx_stop(uap); /* Make sure auto RTS is disabled */ if (rs485->flags & SER_RS485_ENABLED) { u32 cr = pl011_read(uap, REG_CR); cr &= ~UART011_CR_RTSEN; pl011_write(cr, uap, REG_CR); port->status &= ~UPSTAT_AUTORTS; } return 0; } static const struct uart_ops amba_pl011_pops = { .tx_empty = pl011_tx_empty, .set_mctrl = pl011_set_mctrl, .get_mctrl = pl011_get_mctrl, .stop_tx = pl011_stop_tx, .start_tx = pl011_start_tx, .stop_rx = pl011_stop_rx, .throttle = pl011_throttle_rx, .unthrottle = pl011_unthrottle_rx, .enable_ms = pl011_enable_ms, .break_ctl = pl011_break_ctl, .startup = pl011_startup, .shutdown = pl011_shutdown, .flush_buffer = pl011_dma_flush_buffer, .set_termios = pl011_set_termios, .type = pl011_type, .config_port = pl011_config_port, .verify_port = pl011_verify_port, #ifdef CONFIG_CONSOLE_POLL .poll_init = pl011_hwinit, .poll_get_char = pl011_get_poll_char, .poll_put_char = pl011_put_poll_char, #endif }; static void sbsa_uart_set_mctrl(struct uart_port *port, unsigned int mctrl) { } static unsigned int sbsa_uart_get_mctrl(struct uart_port *port) { return 0; } static const struct uart_ops sbsa_uart_pops = { .tx_empty = pl011_tx_empty, .set_mctrl = sbsa_uart_set_mctrl, .get_mctrl = sbsa_uart_get_mctrl, .stop_tx = pl011_stop_tx, .start_tx = pl011_start_tx, .stop_rx = pl011_stop_rx, .startup = sbsa_uart_startup, .shutdown = sbsa_uart_shutdown, .set_termios = sbsa_uart_set_termios, .type = pl011_type, .config_port = pl011_config_port, .verify_port = pl011_verify_port, #ifdef CONFIG_CONSOLE_POLL .poll_init = pl011_hwinit, .poll_get_char = pl011_get_poll_char, .poll_put_char = pl011_put_poll_char, #endif }; static struct uart_amba_port *amba_ports[UART_NR]; #ifdef CONFIG_SERIAL_AMBA_PL011_CONSOLE static void pl011_console_putchar(struct uart_port *port, unsigned char ch) { struct uart_amba_port *uap = container_of(port, struct uart_amba_port, port); while (pl011_read(uap, REG_FR) & UART01x_FR_TXFF) cpu_relax(); pl011_write(ch, uap, REG_DR); } static void pl011_console_write(struct console *co, const char *s, unsigned int count) { struct uart_amba_port *uap = amba_ports[co->index]; unsigned int old_cr = 0, new_cr; unsigned long flags; int locked = 1; clk_enable(uap->clk); if (oops_in_progress) 
locked = uart_port_trylock_irqsave(&uap->port, &flags); else uart_port_lock_irqsave(&uap->port, &flags); /* * First save the CR then disable the interrupts */ if (!uap->vendor->always_enabled) { old_cr = pl011_read(uap, REG_CR); new_cr = old_cr & ~UART011_CR_CTSEN; new_cr |= UART01x_CR_UARTEN | UART011_CR_TXE; pl011_write(new_cr, uap, REG_CR); } uart_console_write(&uap->port, s, count, pl011_console_putchar); /* * Finally, wait for transmitter to become empty and restore the * TCR. Allow feature register bits to be inverted to work around * errata. */ while ((pl011_read(uap, REG_FR) ^ uap->vendor->inv_fr) & uap->vendor->fr_busy) cpu_relax(); if (!uap->vendor->always_enabled) pl011_write(old_cr, uap, REG_CR); if (locked) uart_port_unlock_irqrestore(&uap->port, flags); clk_disable(uap->clk); } static void pl011_console_get_options(struct uart_amba_port *uap, int *baud, int *parity, int *bits) { unsigned int lcr_h, ibrd, fbrd; if (!(pl011_read(uap, REG_CR) & UART01x_CR_UARTEN)) return; lcr_h = pl011_read(uap, REG_LCRH_TX); *parity = 'n'; if (lcr_h & UART01x_LCRH_PEN) { if (lcr_h & UART01x_LCRH_EPS) *parity = 'e'; else *parity = 'o'; } if ((lcr_h & 0x60) == UART01x_LCRH_WLEN_7) *bits = 7; else *bits = 8; ibrd = pl011_read(uap, REG_IBRD); fbrd = pl011_read(uap, REG_FBRD); *baud = uap->port.uartclk * 4 / (64 * ibrd + fbrd); if (uap->vendor->oversampling && (pl011_read(uap, REG_CR) & ST_UART011_CR_OVSFACT)) *baud *= 2; } static int pl011_console_setup(struct console *co, char *options) { struct uart_amba_port *uap; int baud = 38400; int bits = 8; int parity = 'n'; int flow = 'n'; int ret; /* * Check whether an invalid uart number has been specified, and * if so, search for the first available port that does have * console support. */ if (co->index >= UART_NR) co->index = 0; uap = amba_ports[co->index]; if (!uap) return -ENODEV; /* Allow pins to be muxed in and configured */ pinctrl_pm_select_default_state(uap->port.dev); ret = clk_prepare(uap->clk); if (ret) return ret; if (dev_get_platdata(uap->port.dev)) { struct amba_pl011_data *plat; plat = dev_get_platdata(uap->port.dev); if (plat->init) plat->init(); } uap->port.uartclk = clk_get_rate(uap->clk); if (uap->vendor->fixed_options) { baud = uap->fixed_baud; } else { if (options) uart_parse_options(options, &baud, &parity, &bits, &flow); else pl011_console_get_options(uap, &baud, &parity, &bits); } return uart_set_options(&uap->port, co, baud, parity, bits, flow); } /** * pl011_console_match - non-standard console matching * @co: registering console * @name: name from console command line * @idx: index from console command line * @options: ptr to option string from console command line * * Only attempts to match console command lines of the form: * console=pl011,mmio|mmio32,<addr>[,<options>] * console=pl011,0x<addr>[,<options>] * This form is used to register an initial earlycon boot console and * replace it with the amba_console at pl011 driver init. * * Performs console setup for a match (as required by interface) * If no <options> are specified, then assume the h/w is already setup. * * Returns 0 if console matches; otherwise non-zero to use default matching */ static int pl011_console_match(struct console *co, char *name, int idx, char *options) { unsigned char iotype; resource_size_t addr; int i; /* * Systems affected by the Qualcomm Technologies QDF2400 E44 erratum * have a distinct console name, so make sure we check for that. * The actual implementation of the erratum occurs in the probe * function. 
*/ if ((strcmp(name, "qdf2400_e44") != 0) && (strcmp(name, "pl011") != 0)) return -ENODEV; if (uart_parse_earlycon(options, &iotype, &addr, &options)) return -ENODEV; if (iotype != UPIO_MEM && iotype != UPIO_MEM32) return -ENODEV; /* try to match the port specified on the command line */ for (i = 0; i < ARRAY_SIZE(amba_ports); i++) { struct uart_port *port; if (!amba_ports[i]) continue; port = &amba_ports[i]->port; if (port->mapbase != addr) continue; co->index = i; uart_port_set_cons(port, co); return pl011_console_setup(co, options); } return -ENODEV; } static struct uart_driver amba_reg; static struct console amba_console = { .name = "ttyAMA", .write = pl011_console_write, .device = uart_console_device, .setup = pl011_console_setup, .match = pl011_console_match, .flags = CON_PRINTBUFFER | CON_ANYTIME, .index = -1, .data = &amba_reg, }; #define AMBA_CONSOLE (&amba_console) static void qdf2400_e44_putc(struct uart_port *port, unsigned char c) { while (readl(port->membase + UART01x_FR) & UART01x_FR_TXFF) cpu_relax(); writel(c, port->membase + UART01x_DR); while (!(readl(port->membase + UART01x_FR) & UART011_FR_TXFE)) cpu_relax(); } static void qdf2400_e44_early_write(struct console *con, const char *s, unsigned int n) { struct earlycon_device *dev = con->data; uart_console_write(&dev->port, s, n, qdf2400_e44_putc); } static void pl011_putc(struct uart_port *port, unsigned char c) { while (readl(port->membase + UART01x_FR) & UART01x_FR_TXFF) cpu_relax(); if (port->iotype == UPIO_MEM32) writel(c, port->membase + UART01x_DR); else writeb(c, port->membase + UART01x_DR); while (readl(port->membase + UART01x_FR) & UART01x_FR_BUSY) cpu_relax(); } static void pl011_early_write(struct console *con, const char *s, unsigned int n) { struct earlycon_device *dev = con->data; uart_console_write(&dev->port, s, n, pl011_putc); } #ifdef CONFIG_CONSOLE_POLL static int pl011_getc(struct uart_port *port) { if (readl(port->membase + UART01x_FR) & UART01x_FR_RXFE) return NO_POLL_CHAR; if (port->iotype == UPIO_MEM32) return readl(port->membase + UART01x_DR); else return readb(port->membase + UART01x_DR); } static int pl011_early_read(struct console *con, char *s, unsigned int n) { struct earlycon_device *dev = con->data; int ch, num_read = 0; while (num_read < n) { ch = pl011_getc(&dev->port); if (ch == NO_POLL_CHAR) break; s[num_read++] = ch; } return num_read; } #else #define pl011_early_read NULL #endif /* * On non-ACPI systems, earlycon is enabled by specifying * "earlycon=pl011,<address>" on the kernel command line. * * On ACPI ARM64 systems, an "early" console is enabled via the SPCR table, * by specifying only "earlycon" on the command line. Because it requires * SPCR, the console starts after ACPI is parsed, which is later than a * traditional early console. * * To get the traditional early console that starts before ACPI is parsed, * specify the full "earlycon=pl011,<address>" option. */ static int __init pl011_early_console_setup(struct earlycon_device *device, const char *opt) { if (!device->port.membase) return -ENODEV; device->con->write = pl011_early_write; device->con->read = pl011_early_read; return 0; } OF_EARLYCON_DECLARE(pl011, "arm,pl011", pl011_early_console_setup); OF_EARLYCON_DECLARE(pl011, "arm,sbsa-uart", pl011_early_console_setup); /* * On Qualcomm Datacenter Technologies QDF2400 SOCs affected by * Erratum 44, traditional earlycon can be enabled by specifying * "earlycon=qdf2400_e44,<address>". Any options are ignored. 
* * Alternatively, you can just specify "earlycon", and the early console * will be enabled with the information from the SPCR table. In this * case, the SPCR code will detect the need for the E44 work-around, * and set the console name to "qdf2400_e44". */ static int __init qdf2400_e44_early_console_setup(struct earlycon_device *device, const char *opt) { if (!device->port.membase) return -ENODEV; device->con->write = qdf2400_e44_early_write; return 0; } EARLYCON_DECLARE(qdf2400_e44, qdf2400_e44_early_console_setup); #else #define AMBA_CONSOLE NULL #endif static struct uart_driver amba_reg = { .owner = THIS_MODULE, .driver_name = "ttyAMA", .dev_name = "ttyAMA", .major = SERIAL_AMBA_MAJOR, .minor = SERIAL_AMBA_MINOR, .nr = UART_NR, .cons = AMBA_CONSOLE, }; static int pl011_probe_dt_alias(int index, struct device *dev) { struct device_node *np; static bool seen_dev_with_alias; static bool seen_dev_without_alias; int ret = index; if (!IS_ENABLED(CONFIG_OF)) return ret; np = dev->of_node; if (!np) return ret; ret = of_alias_get_id(np, "serial"); if (ret < 0) { seen_dev_without_alias = true; ret = index; } else { seen_dev_with_alias = true; if (ret >= ARRAY_SIZE(amba_ports) || amba_ports[ret]) { dev_warn(dev, "requested serial port %d not available.\n", ret); ret = index; } } if (seen_dev_with_alias && seen_dev_without_alias) dev_warn(dev, "aliased and non-aliased serial devices found in device tree. Serial port enumeration may be unpredictable.\n"); return ret; } /* unregisters the driver also if no more ports are left */ static void pl011_unregister_port(struct uart_amba_port *uap) { int i; bool busy = false; for (i = 0; i < ARRAY_SIZE(amba_ports); i++) { if (amba_ports[i] == uap) amba_ports[i] = NULL; else if (amba_ports[i]) busy = true; } pl011_dma_remove(uap); if (!busy) uart_unregister_driver(&amba_reg); } static int pl011_find_free_port(void) { int i; for (i = 0; i < ARRAY_SIZE(amba_ports); i++) if (!amba_ports[i]) return i; return -EBUSY; } static int pl011_setup_port(struct device *dev, struct uart_amba_port *uap, struct resource *mmiobase, int index) { void __iomem *base; int ret; base = devm_ioremap_resource(dev, mmiobase); if (IS_ERR(base)) return PTR_ERR(base); index = pl011_probe_dt_alias(index, dev); uap->port.dev = dev; uap->port.mapbase = mmiobase->start; uap->port.membase = base; uap->port.fifosize = uap->fifosize; uap->port.has_sysrq = IS_ENABLED(CONFIG_SERIAL_AMBA_PL011_CONSOLE); uap->port.flags = UPF_BOOT_AUTOCONF; uap->port.line = index; ret = uart_get_rs485_mode(&uap->port); if (ret) return ret; amba_ports[index] = uap; return 0; } static int pl011_register_port(struct uart_amba_port *uap) { int ret, i; /* Ensure interrupts from this UART are masked and cleared */ pl011_write(0, uap, REG_IMSC); pl011_write(0xffff, uap, REG_ICR); if (!amba_reg.state) { ret = uart_register_driver(&amba_reg); if (ret < 0) { dev_err(uap->port.dev, "Failed to register AMBA-PL011 driver\n"); for (i = 0; i < ARRAY_SIZE(amba_ports); i++) if (amba_ports[i] == uap) amba_ports[i] = NULL; return ret; } } ret = uart_add_one_port(&amba_reg, &uap->port); if (ret) pl011_unregister_port(uap); return ret; } static const struct serial_rs485 pl011_rs485_supported = { .flags = SER_RS485_ENABLED | SER_RS485_RTS_ON_SEND | SER_RS485_RTS_AFTER_SEND | SER_RS485_RX_DURING_TX, .delay_rts_before_send = 1, .delay_rts_after_send = 1, }; static int pl011_probe(struct amba_device *dev, const struct amba_id *id) { struct uart_amba_port *uap; struct vendor_data *vendor = id->data; int portnr, ret; u32 val; portnr = 
pl011_find_free_port(); if (portnr < 0) return portnr; uap = devm_kzalloc(&dev->dev, sizeof(struct uart_amba_port), GFP_KERNEL); if (!uap) return -ENOMEM; uap->clk = devm_clk_get(&dev->dev, NULL); if (IS_ERR(uap->clk)) return PTR_ERR(uap->clk); uap->reg_offset = vendor->reg_offset; uap->vendor = vendor; uap->fifosize = vendor->get_fifosize(dev); uap->port.iotype = vendor->access_32b ? UPIO_MEM32 : UPIO_MEM; uap->port.irq = dev->irq[0]; uap->port.ops = &amba_pl011_pops; uap->port.rs485_config = pl011_rs485_config; uap->port.rs485_supported = pl011_rs485_supported; snprintf(uap->type, sizeof(uap->type), "PL011 rev%u", amba_rev(dev)); if (device_property_read_u32(&dev->dev, "reg-io-width", &val) == 0) { switch (val) { case 1: uap->port.iotype = UPIO_MEM; break; case 4: uap->port.iotype = UPIO_MEM32; break; default: dev_warn(&dev->dev, "unsupported reg-io-width (%d)\n", val); return -EINVAL; } } hrtimer_init(&uap->trigger_start_tx, CLOCK_MONOTONIC, HRTIMER_MODE_REL); hrtimer_init(&uap->trigger_stop_tx, CLOCK_MONOTONIC, HRTIMER_MODE_REL); uap->trigger_start_tx.function = pl011_trigger_start_tx; uap->trigger_stop_tx.function = pl011_trigger_stop_tx; ret = pl011_setup_port(&dev->dev, uap, &dev->res, portnr); if (ret) return ret; amba_set_drvdata(dev, uap); return pl011_register_port(uap); } static void pl011_remove(struct amba_device *dev) { struct uart_amba_port *uap = amba_get_drvdata(dev); uart_remove_one_port(&amba_reg, &uap->port); pl011_unregister_port(uap); } #ifdef CONFIG_PM_SLEEP static int pl011_suspend(struct device *dev) { struct uart_amba_port *uap = dev_get_drvdata(dev); if (!uap) return -EINVAL; return uart_suspend_port(&amba_reg, &uap->port); } static int pl011_resume(struct device *dev) { struct uart_amba_port *uap = dev_get_drvdata(dev); if (!uap) return -EINVAL; return uart_resume_port(&amba_reg, &uap->port); } #endif static SIMPLE_DEV_PM_OPS(pl011_dev_pm_ops, pl011_suspend, pl011_resume); #ifdef CONFIG_ACPI_SPCR_TABLE static void qpdf2400_erratum44_workaround(struct device *dev, struct uart_amba_port *uap) { if (!qdf2400_e44_present) return; dev_info(dev, "working around QDF2400 SoC erratum 44\n"); uap->vendor = &vendor_qdt_qdf2400_e44; } #else static void qpdf2400_erratum44_workaround(struct device *dev, struct uart_amba_port *uap) { /* empty */ } #endif static int sbsa_uart_probe(struct platform_device *pdev) { struct uart_amba_port *uap; struct resource *r; int portnr, ret; int baudrate; /* * Check the mandatory baud rate parameter in the DT node early * so that we can easily exit with the error. */ if (pdev->dev.of_node) { struct device_node *np = pdev->dev.of_node; ret = of_property_read_u32(np, "current-speed", &baudrate); if (ret) return ret; } else { baudrate = 115200; } portnr = pl011_find_free_port(); if (portnr < 0) return portnr; uap = devm_kzalloc(&pdev->dev, sizeof(struct uart_amba_port), GFP_KERNEL); if (!uap) return -ENOMEM; ret = platform_get_irq(pdev, 0); if (ret < 0) return ret; uap->port.irq = ret; uap->vendor = &vendor_sbsa; qpdf2400_erratum44_workaround(&pdev->dev, uap); uap->reg_offset = uap->vendor->reg_offset; uap->fifosize = 32; uap->port.iotype = uap->vendor->access_32b ? 
UPIO_MEM32 : UPIO_MEM; uap->port.ops = &sbsa_uart_pops; uap->fixed_baud = baudrate; snprintf(uap->type, sizeof(uap->type), "SBSA"); r = platform_get_resource(pdev, IORESOURCE_MEM, 0); ret = pl011_setup_port(&pdev->dev, uap, r, portnr); if (ret) return ret; platform_set_drvdata(pdev, uap); return pl011_register_port(uap); } static void sbsa_uart_remove(struct platform_device *pdev) { struct uart_amba_port *uap = platform_get_drvdata(pdev); uart_remove_one_port(&amba_reg, &uap->port); pl011_unregister_port(uap); } static const struct of_device_id sbsa_uart_of_match[] = { { .compatible = "arm,sbsa-uart", }, {}, }; MODULE_DEVICE_TABLE(of, sbsa_uart_of_match); static const struct acpi_device_id __maybe_unused sbsa_uart_acpi_match[] = { { "ARMH0011", 0 }, { "ARMHB000", 0 }, {}, }; MODULE_DEVICE_TABLE(acpi, sbsa_uart_acpi_match); static struct platform_driver arm_sbsa_uart_platform_driver = { .probe = sbsa_uart_probe, .remove = sbsa_uart_remove, .driver = { .name = "sbsa-uart", .pm = &pl011_dev_pm_ops, .of_match_table = of_match_ptr(sbsa_uart_of_match), .acpi_match_table = ACPI_PTR(sbsa_uart_acpi_match), .suppress_bind_attrs = IS_BUILTIN(CONFIG_SERIAL_AMBA_PL011), }, }; static const struct amba_id pl011_ids[] = { { .id = 0x00041011, .mask = 0x000fffff, .data = &vendor_arm, }, { .id = 0x00380802, .mask = 0x00ffffff, .data = &vendor_st, }, { 0, 0 }, }; MODULE_DEVICE_TABLE(amba, pl011_ids); static struct amba_driver pl011_driver = { .drv = { .name = "uart-pl011", .pm = &pl011_dev_pm_ops, .suppress_bind_attrs = IS_BUILTIN(CONFIG_SERIAL_AMBA_PL011), }, .id_table = pl011_ids, .probe = pl011_probe, .remove = pl011_remove, }; static int __init pl011_init(void) { pr_info("Serial: AMBA PL011 UART driver\n"); if (platform_driver_register(&arm_sbsa_uart_platform_driver)) pr_warn("could not register SBSA UART platform driver\n"); return amba_driver_register(&pl011_driver); } static void __exit pl011_exit(void) { platform_driver_unregister(&arm_sbsa_uart_platform_driver); amba_driver_unregister(&pl011_driver); } /* * While this can be a module, if builtin it's most likely the console * So let's leave module_exit but move module_init to an earlier place */ arch_initcall(pl011_init); module_exit(pl011_exit); MODULE_AUTHOR("ARM Ltd/Deep Blue Solutions Ltd"); MODULE_DESCRIPTION("ARM AMBA serial port driver"); MODULE_LICENSE("GPL"); |
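The termios path above packs the baud-rate divisor into REG_IBRD/REG_FBRD as a 16.6 fixed-point value; the short sketch below reworks that arithmetic for one concrete case. It is an illustrative aside, not part of the driver: the helper name pl011_example_divisor and the 24 MHz clock figure are assumptions chosen only to make the numbers easy to follow.

/*
 * Illustrative sketch (assumed values, not upstream code): on the standard
 * 16x oversampling path, pl011_set_termios() computes
 *
 *	quot = DIV_ROUND_CLOSEST(port->uartclk * 4, baud)
 *
 * which is 64 * (uartclk / (16 * baud)), and then programs
 * FBRD = quot & 0x3f and IBRD = quot >> 6.
 *
 * Worked example, assuming uartclk = 24 MHz and baud = 115200:
 *	quot = DIV_ROUND_CLOSEST(24000000 * 4, 115200) = 833
 *	IBRD = 833 >> 6   = 13
 *	FBRD = 833 & 0x3f = 1
 *	effective rate = 24000000 / (16 * (13 + 1/64)) ~= 115246 baud
 */
static inline unsigned int pl011_example_divisor(unsigned int uartclk,
						 unsigned int baud)
{
	/* 64 times the classic divisor, matching the non-oversampling path */
	return DIV_ROUND_CLOSEST(uartclk * 4, baud);
}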
/*
 * Copyright (c) 2015, Mellanox Technologies inc.  All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "core_priv.h"

#include <linux/in.h>
#include <linux/in6.h>

/* For in6_dev_get/in6_dev_put */
#include <net/addrconf.h>
#include <net/bonding.h>

#include <rdma/ib_cache.h>
#include <rdma/ib_addr.h>

static struct workqueue_struct *gid_cache_wq;

enum gid_op_type {
	GID_DEL = 0,
	GID_ADD
};

struct update_gid_event_work {
	struct work_struct work;
	union ib_gid gid;
	struct ib_gid_attr gid_attr;
	enum gid_op_type gid_op;
};

#define ROCE_NETDEV_CALLBACK_SZ		3
struct netdev_event_work_cmd {
	roce_netdev_callback	cb;
	roce_netdev_filter	filter;
	struct net_device	*ndev;
	struct net_device	*filter_ndev;
};

struct netdev_event_work {
	struct work_struct		work;
	struct netdev_event_work_cmd	cmds[ROCE_NETDEV_CALLBACK_SZ];
};

static const struct {
	bool (*is_supported)(const struct ib_device *device, u32 port_num);
	enum ib_gid_type gid_type;
} PORT_CAP_TO_GID_TYPE[] = {
	{rdma_protocol_roce_eth_encap, IB_GID_TYPE_ROCE},
	{rdma_protocol_roce_udp_encap, IB_GID_TYPE_ROCE_UDP_ENCAP},
};

#define CAP_TO_GID_TABLE_SIZE	ARRAY_SIZE(PORT_CAP_TO_GID_TYPE)

unsigned long roce_gid_type_mask_support(struct ib_device *ib_dev, u32 port)
{
	int i;
	unsigned int ret_flags = 0;

	if (!rdma_protocol_roce(ib_dev, port))
		return 1UL << IB_GID_TYPE_IB;

	for (i = 0; i < CAP_TO_GID_TABLE_SIZE; i++)
		if (PORT_CAP_TO_GID_TYPE[i].is_supported(ib_dev, port))
			ret_flags |= 1UL << PORT_CAP_TO_GID_TYPE[i].gid_type;

	return ret_flags;
}
EXPORT_SYMBOL(roce_gid_type_mask_support);

static void update_gid(enum gid_op_type gid_op, struct ib_device *ib_dev,
		       u32 port, union ib_gid *gid,
		       struct ib_gid_attr *gid_attr)
{
	int i;
	unsigned long gid_type_mask = roce_gid_type_mask_support(ib_dev, port);

	for (i = 0; i < IB_GID_TYPE_SIZE; i++) {
		if ((1UL << i) & gid_type_mask) {
			gid_attr->gid_type = i;
			switch (gid_op) {
			case GID_ADD:
				ib_cache_gid_add(ib_dev, port, gid, gid_attr);
				break;
			case GID_DEL:
				ib_cache_gid_del(ib_dev, port, gid, gid_attr);
				break;
			}
		}
	}
}

enum bonding_slave_state {
	BONDING_SLAVE_STATE_ACTIVE	= 1UL << 0,
	BONDING_SLAVE_STATE_INACTIVE	= 1UL << 1,
	/* No primary slave or the device
isn't a slave in bonding */ BONDING_SLAVE_STATE_NA = 1UL << 2, }; static enum bonding_slave_state is_eth_active_slave_of_bonding_rcu(struct net_device *dev, struct net_device *upper) { if (upper && netif_is_bond_master(upper)) { struct net_device *pdev = bond_option_active_slave_get_rcu(netdev_priv(upper)); if (pdev) return dev == pdev ? BONDING_SLAVE_STATE_ACTIVE : BONDING_SLAVE_STATE_INACTIVE; } return BONDING_SLAVE_STATE_NA; } #define REQUIRED_BOND_STATES (BONDING_SLAVE_STATE_ACTIVE | \ BONDING_SLAVE_STATE_NA) static bool is_eth_port_of_netdev_filter(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { struct net_device *real_dev; bool res; if (!rdma_ndev) return false; rcu_read_lock(); real_dev = rdma_vlan_dev_real_dev(cookie); if (!real_dev) real_dev = cookie; res = ((rdma_is_upper_dev_rcu(rdma_ndev, cookie) && (is_eth_active_slave_of_bonding_rcu(rdma_ndev, real_dev) & REQUIRED_BOND_STATES)) || real_dev == rdma_ndev); rcu_read_unlock(); return res; } static bool is_eth_port_inactive_slave_filter(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { struct net_device *master_dev; bool res; if (!rdma_ndev) return false; rcu_read_lock(); master_dev = netdev_master_upper_dev_get_rcu(rdma_ndev); res = is_eth_active_slave_of_bonding_rcu(rdma_ndev, master_dev) == BONDING_SLAVE_STATE_INACTIVE; rcu_read_unlock(); return res; } /** * is_ndev_for_default_gid_filter - Check if a given netdevice * can be considered for default GIDs or not. * @ib_dev: IB device to check * @port: Port to consider for adding default GID * @rdma_ndev: rdma netdevice pointer * @cookie: Netdevice to consider to form a default GID * * is_ndev_for_default_gid_filter() returns true if a given netdevice can be * considered for deriving default RoCE GID, returns false otherwise. */ static bool is_ndev_for_default_gid_filter(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { struct net_device *cookie_ndev = cookie; bool res; if (!rdma_ndev) return false; rcu_read_lock(); /* * When rdma netdevice is used in bonding, bonding master netdevice * should be considered for default GIDs. Therefore, ignore slave rdma * netdevices when bonding is considered. * Additionally when event(cookie) netdevice is bond master device, * make sure that it the upper netdevice of rdma netdevice. */ res = ((cookie_ndev == rdma_ndev && !netif_is_bond_slave(rdma_ndev)) || (netif_is_bond_master(cookie_ndev) && rdma_is_upper_dev_rcu(rdma_ndev, cookie_ndev))); rcu_read_unlock(); return res; } static bool pass_all_filter(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { return true; } static bool upper_device_filter(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { bool res; if (!rdma_ndev) return false; if (rdma_ndev == cookie) return true; rcu_read_lock(); res = rdma_is_upper_dev_rcu(rdma_ndev, cookie); rcu_read_unlock(); return res; } /** * is_upper_ndev_bond_master_filter - Check if a given netdevice * is bond master device of netdevice of the RDMA device of port. * @ib_dev: IB device to check * @port: Port to consider for adding default GID * @rdma_ndev: Pointer to rdma netdevice * @cookie: Netdevice to consider to form a default GID * * is_upper_ndev_bond_master_filter() returns true if a cookie_netdev * is bond master device and rdma_ndev is its lower netdevice. It might * not have been established as slave device yet. 
*/ static bool is_upper_ndev_bond_master_filter(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { struct net_device *cookie_ndev = cookie; bool match = false; if (!rdma_ndev) return false; rcu_read_lock(); if (netif_is_bond_master(cookie_ndev) && rdma_is_upper_dev_rcu(rdma_ndev, cookie_ndev)) match = true; rcu_read_unlock(); return match; } static void update_gid_ip(enum gid_op_type gid_op, struct ib_device *ib_dev, u32 port, struct net_device *ndev, struct sockaddr *addr) { union ib_gid gid; struct ib_gid_attr gid_attr; rdma_ip2gid(addr, &gid); memset(&gid_attr, 0, sizeof(gid_attr)); gid_attr.ndev = ndev; update_gid(gid_op, ib_dev, port, &gid, &gid_attr); } static void bond_delete_netdev_default_gids(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, struct net_device *event_ndev) { struct net_device *real_dev = rdma_vlan_dev_real_dev(event_ndev); unsigned long gid_type_mask; if (!rdma_ndev) return; if (!real_dev) real_dev = event_ndev; rcu_read_lock(); if (((rdma_ndev != event_ndev && !rdma_is_upper_dev_rcu(rdma_ndev, event_ndev)) || is_eth_active_slave_of_bonding_rcu(rdma_ndev, real_dev) == BONDING_SLAVE_STATE_INACTIVE)) { rcu_read_unlock(); return; } rcu_read_unlock(); gid_type_mask = roce_gid_type_mask_support(ib_dev, port); ib_cache_gid_set_default_gid(ib_dev, port, rdma_ndev, gid_type_mask, IB_CACHE_GID_DEFAULT_MODE_DELETE); } static void enum_netdev_ipv4_ips(struct ib_device *ib_dev, u32 port, struct net_device *ndev) { const struct in_ifaddr *ifa; struct in_device *in_dev; struct sin_list { struct list_head list; struct sockaddr_in ip; }; struct sin_list *sin_iter; struct sin_list *sin_temp; LIST_HEAD(sin_list); if (ndev->reg_state >= NETREG_UNREGISTERING) return; rcu_read_lock(); in_dev = __in_dev_get_rcu(ndev); if (!in_dev) { rcu_read_unlock(); return; } in_dev_for_each_ifa_rcu(ifa, in_dev) { struct sin_list *entry = kzalloc(sizeof(*entry), GFP_ATOMIC); if (!entry) continue; entry->ip.sin_family = AF_INET; entry->ip.sin_addr.s_addr = ifa->ifa_address; list_add_tail(&entry->list, &sin_list); } rcu_read_unlock(); list_for_each_entry_safe(sin_iter, sin_temp, &sin_list, list) { update_gid_ip(GID_ADD, ib_dev, port, ndev, (struct sockaddr *)&sin_iter->ip); list_del(&sin_iter->list); kfree(sin_iter); } } static void enum_netdev_ipv6_ips(struct ib_device *ib_dev, u32 port, struct net_device *ndev) { struct inet6_ifaddr *ifp; struct inet6_dev *in6_dev; struct sin6_list { struct list_head list; struct sockaddr_in6 sin6; }; struct sin6_list *sin6_iter; struct sin6_list *sin6_temp; struct ib_gid_attr gid_attr = {.ndev = ndev}; LIST_HEAD(sin6_list); if (ndev->reg_state >= NETREG_UNREGISTERING) return; in6_dev = in6_dev_get(ndev); if (!in6_dev) return; read_lock_bh(&in6_dev->lock); list_for_each_entry(ifp, &in6_dev->addr_list, if_list) { struct sin6_list *entry = kzalloc(sizeof(*entry), GFP_ATOMIC); if (!entry) continue; entry->sin6.sin6_family = AF_INET6; entry->sin6.sin6_addr = ifp->addr; list_add_tail(&entry->list, &sin6_list); } read_unlock_bh(&in6_dev->lock); in6_dev_put(in6_dev); list_for_each_entry_safe(sin6_iter, sin6_temp, &sin6_list, list) { union ib_gid gid; rdma_ip2gid((struct sockaddr *)&sin6_iter->sin6, &gid); update_gid(GID_ADD, ib_dev, port, &gid, &gid_attr); list_del(&sin6_iter->list); kfree(sin6_iter); } } static void _add_netdev_ips(struct ib_device *ib_dev, u32 port, struct net_device *ndev) { enum_netdev_ipv4_ips(ib_dev, port, ndev); if (IS_ENABLED(CONFIG_IPV6)) enum_netdev_ipv6_ips(ib_dev, port, ndev); } static void 
add_netdev_ips(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { _add_netdev_ips(ib_dev, port, cookie); } static void del_netdev_ips(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { ib_cache_gid_del_all_netdev_gids(ib_dev, port, cookie); } /** * del_default_gids - Delete default GIDs of the event/cookie netdevice * @ib_dev: RDMA device pointer * @port: Port of the RDMA device whose GID table to consider * @rdma_ndev: Unused rdma netdevice * @cookie: Pointer to event netdevice * * del_default_gids() deletes the default GIDs of the event/cookie netdevice. */ static void del_default_gids(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { struct net_device *cookie_ndev = cookie; unsigned long gid_type_mask; gid_type_mask = roce_gid_type_mask_support(ib_dev, port); ib_cache_gid_set_default_gid(ib_dev, port, cookie_ndev, gid_type_mask, IB_CACHE_GID_DEFAULT_MODE_DELETE); } static void add_default_gids(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { struct net_device *event_ndev = cookie; unsigned long gid_type_mask; gid_type_mask = roce_gid_type_mask_support(ib_dev, port); ib_cache_gid_set_default_gid(ib_dev, port, event_ndev, gid_type_mask, IB_CACHE_GID_DEFAULT_MODE_SET); } static void enum_all_gids_of_dev_cb(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { struct net *net; struct net_device *ndev; /* Lock the rtnl to make sure the netdevs does not move under * our feet */ rtnl_lock(); down_read(&net_rwsem); for_each_net(net) for_each_netdev(net, ndev) { /* * Filter and add default GIDs of the primary netdevice * when not in bonding mode, or add default GIDs * of bond master device, when in bonding mode. */ if (is_ndev_for_default_gid_filter(ib_dev, port, rdma_ndev, ndev)) add_default_gids(ib_dev, port, rdma_ndev, ndev); if (is_eth_port_of_netdev_filter(ib_dev, port, rdma_ndev, ndev)) _add_netdev_ips(ib_dev, port, ndev); } up_read(&net_rwsem); rtnl_unlock(); } /** * rdma_roce_rescan_device - Rescan all of the network devices in the system * and add their gids, as needed, to the relevant RoCE devices. * * @ib_dev: the rdma device */ void rdma_roce_rescan_device(struct ib_device *ib_dev) { ib_enum_roce_netdev(ib_dev, pass_all_filter, NULL, enum_all_gids_of_dev_cb, NULL); } EXPORT_SYMBOL(rdma_roce_rescan_device); /** * rdma_roce_rescan_port - Rescan all of the network devices in the system * and add their gids if relevant to the port of the RoCE device. 
* * @ib_dev: IB device * @port: Port number */ void rdma_roce_rescan_port(struct ib_device *ib_dev, u32 port) { struct net_device *ndev = NULL; if (rdma_protocol_roce(ib_dev, port)) { ndev = ib_device_get_netdev(ib_dev, port); if (!ndev) return; enum_all_gids_of_dev_cb(ib_dev, port, ndev, ndev); dev_put(ndev); } } EXPORT_SYMBOL(rdma_roce_rescan_port); static void callback_for_addr_gid_device_scan(struct ib_device *device, u32 port, struct net_device *rdma_ndev, void *cookie) { struct update_gid_event_work *parsed = cookie; return update_gid(parsed->gid_op, device, port, &parsed->gid, &parsed->gid_attr); } struct upper_list { struct list_head list; struct net_device *upper; }; static int netdev_upper_walk(struct net_device *upper, struct netdev_nested_priv *priv) { struct upper_list *entry = kmalloc(sizeof(*entry), GFP_ATOMIC); struct list_head *upper_list = (struct list_head *)priv->data; if (!entry) return 0; list_add_tail(&entry->list, upper_list); dev_hold(upper); entry->upper = upper; return 0; } static void handle_netdev_upper(struct ib_device *ib_dev, u32 port, void *cookie, void (*handle_netdev)(struct ib_device *ib_dev, u32 port, struct net_device *ndev)) { struct net_device *ndev = cookie; struct netdev_nested_priv priv; struct upper_list *upper_iter; struct upper_list *upper_temp; LIST_HEAD(upper_list); priv.data = &upper_list; rcu_read_lock(); netdev_walk_all_upper_dev_rcu(ndev, netdev_upper_walk, &priv); rcu_read_unlock(); handle_netdev(ib_dev, port, ndev); list_for_each_entry_safe(upper_iter, upper_temp, &upper_list, list) { handle_netdev(ib_dev, port, upper_iter->upper); dev_put(upper_iter->upper); list_del(&upper_iter->list); kfree(upper_iter); } } void roce_del_all_netdev_gids(struct ib_device *ib_dev, u32 port, struct net_device *ndev) { ib_cache_gid_del_all_netdev_gids(ib_dev, port, ndev); } EXPORT_SYMBOL(roce_del_all_netdev_gids); static void del_netdev_upper_ips(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { handle_netdev_upper(ib_dev, port, cookie, roce_del_all_netdev_gids); } static void add_netdev_upper_ips(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { handle_netdev_upper(ib_dev, port, cookie, _add_netdev_ips); } static void del_netdev_default_ips_join(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { struct net_device *master_ndev; rcu_read_lock(); master_ndev = netdev_master_upper_dev_get_rcu(rdma_ndev); dev_hold(master_ndev); rcu_read_unlock(); if (master_ndev) { bond_delete_netdev_default_gids(ib_dev, port, rdma_ndev, master_ndev); dev_put(master_ndev); } } /* The following functions operate on all IB devices. netdevice_event and * addr_event execute ib_enum_all_roce_netdevs through a work. * ib_enum_all_roce_netdevs iterates through all IB devices. 
*/ static void netdevice_event_work_handler(struct work_struct *_work) { struct netdev_event_work *work = container_of(_work, struct netdev_event_work, work); unsigned int i; for (i = 0; i < ARRAY_SIZE(work->cmds) && work->cmds[i].cb; i++) { ib_enum_all_roce_netdevs(work->cmds[i].filter, work->cmds[i].filter_ndev, work->cmds[i].cb, work->cmds[i].ndev); dev_put(work->cmds[i].ndev); dev_put(work->cmds[i].filter_ndev); } kfree(work); } static int netdevice_queue_work(struct netdev_event_work_cmd *cmds, struct net_device *ndev) { unsigned int i; struct netdev_event_work *ndev_work = kmalloc(sizeof(*ndev_work), GFP_KERNEL); if (!ndev_work) return NOTIFY_DONE; memcpy(ndev_work->cmds, cmds, sizeof(ndev_work->cmds)); for (i = 0; i < ARRAY_SIZE(ndev_work->cmds) && ndev_work->cmds[i].cb; i++) { if (!ndev_work->cmds[i].ndev) ndev_work->cmds[i].ndev = ndev; if (!ndev_work->cmds[i].filter_ndev) ndev_work->cmds[i].filter_ndev = ndev; dev_hold(ndev_work->cmds[i].ndev); dev_hold(ndev_work->cmds[i].filter_ndev); } INIT_WORK(&ndev_work->work, netdevice_event_work_handler); queue_work(gid_cache_wq, &ndev_work->work); return NOTIFY_DONE; } static const struct netdev_event_work_cmd add_cmd = { .cb = add_netdev_ips, .filter = is_eth_port_of_netdev_filter }; static const struct netdev_event_work_cmd add_cmd_upper_ips = { .cb = add_netdev_upper_ips, .filter = is_eth_port_of_netdev_filter }; static void ndev_event_unlink(struct netdev_notifier_changeupper_info *changeupper_info, struct netdev_event_work_cmd *cmds) { static const struct netdev_event_work_cmd upper_ips_del_cmd = { .cb = del_netdev_upper_ips, .filter = upper_device_filter }; cmds[0] = upper_ips_del_cmd; cmds[0].ndev = changeupper_info->upper_dev; cmds[1] = add_cmd; } static const struct netdev_event_work_cmd bonding_default_add_cmd = { .cb = add_default_gids, .filter = is_upper_ndev_bond_master_filter }; static void ndev_event_link(struct net_device *event_ndev, struct netdev_notifier_changeupper_info *changeupper_info, struct netdev_event_work_cmd *cmds) { static const struct netdev_event_work_cmd bonding_default_del_cmd = { .cb = del_default_gids, .filter = is_upper_ndev_bond_master_filter }; /* * When a lower netdev is linked to its upper bonding * netdev, delete lower slave netdev's default GIDs. 
*/ cmds[0] = bonding_default_del_cmd; cmds[0].ndev = event_ndev; cmds[0].filter_ndev = changeupper_info->upper_dev; /* Now add bonding upper device default GIDs */ cmds[1] = bonding_default_add_cmd; cmds[1].ndev = changeupper_info->upper_dev; cmds[1].filter_ndev = changeupper_info->upper_dev; /* Now add bonding upper device IP based GIDs */ cmds[2] = add_cmd_upper_ips; cmds[2].ndev = changeupper_info->upper_dev; cmds[2].filter_ndev = changeupper_info->upper_dev; } static void netdevice_event_changeupper(struct net_device *event_ndev, struct netdev_notifier_changeupper_info *changeupper_info, struct netdev_event_work_cmd *cmds) { if (changeupper_info->linking) ndev_event_link(event_ndev, changeupper_info, cmds); else ndev_event_unlink(changeupper_info, cmds); } static const struct netdev_event_work_cmd add_default_gid_cmd = { .cb = add_default_gids, .filter = is_ndev_for_default_gid_filter, }; static int netdevice_event(struct notifier_block *this, unsigned long event, void *ptr) { static const struct netdev_event_work_cmd del_cmd = { .cb = del_netdev_ips, .filter = pass_all_filter}; static const struct netdev_event_work_cmd bonding_default_del_cmd_join = { .cb = del_netdev_default_ips_join, .filter = is_eth_port_inactive_slave_filter }; static const struct netdev_event_work_cmd netdev_del_cmd = { .cb = del_netdev_ips, .filter = is_eth_port_of_netdev_filter }; static const struct netdev_event_work_cmd bonding_event_ips_del_cmd = { .cb = del_netdev_upper_ips, .filter = upper_device_filter}; struct net_device *ndev = netdev_notifier_info_to_dev(ptr); struct netdev_event_work_cmd cmds[ROCE_NETDEV_CALLBACK_SZ] = { {NULL} }; if (ndev->type != ARPHRD_ETHER) return NOTIFY_DONE; switch (event) { case NETDEV_REGISTER: case NETDEV_UP: cmds[0] = bonding_default_del_cmd_join; cmds[1] = add_default_gid_cmd; cmds[2] = add_cmd; break; case NETDEV_UNREGISTER: if (ndev->reg_state < NETREG_UNREGISTERED) cmds[0] = del_cmd; else return NOTIFY_DONE; break; case NETDEV_CHANGEADDR: cmds[0] = netdev_del_cmd; if (ndev->reg_state == NETREG_REGISTERED) { cmds[1] = add_default_gid_cmd; cmds[2] = add_cmd; } break; case NETDEV_CHANGEUPPER: netdevice_event_changeupper(ndev, container_of(ptr, struct netdev_notifier_changeupper_info, info), cmds); break; case NETDEV_BONDING_FAILOVER: cmds[0] = bonding_event_ips_del_cmd; /* Add default GIDs of the bond device */ cmds[1] = bonding_default_add_cmd; /* Add IP based GIDs of the bond device */ cmds[2] = add_cmd_upper_ips; break; default: return NOTIFY_DONE; } return netdevice_queue_work(cmds, ndev); } static void update_gid_event_work_handler(struct work_struct *_work) { struct update_gid_event_work *work = container_of(_work, struct update_gid_event_work, work); ib_enum_all_roce_netdevs(is_eth_port_of_netdev_filter, work->gid_attr.ndev, callback_for_addr_gid_device_scan, work); dev_put(work->gid_attr.ndev); kfree(work); } static int addr_event(struct notifier_block *this, unsigned long event, struct sockaddr *sa, struct net_device *ndev) { struct update_gid_event_work *work; enum gid_op_type gid_op; if (ndev->type != ARPHRD_ETHER) return NOTIFY_DONE; switch (event) { case NETDEV_UP: gid_op = GID_ADD; break; case NETDEV_DOWN: gid_op = GID_DEL; break; default: return NOTIFY_DONE; } work = kmalloc(sizeof(*work), GFP_ATOMIC); if (!work) return NOTIFY_DONE; INIT_WORK(&work->work, update_gid_event_work_handler); rdma_ip2gid(sa, &work->gid); work->gid_op = gid_op; memset(&work->gid_attr, 0, sizeof(work->gid_attr)); dev_hold(ndev); work->gid_attr.ndev = ndev; queue_work(gid_cache_wq, 
&work->work); return NOTIFY_DONE; } static int inetaddr_event(struct notifier_block *this, unsigned long event, void *ptr) { struct sockaddr_in in; struct net_device *ndev; struct in_ifaddr *ifa = ptr; in.sin_family = AF_INET; in.sin_addr.s_addr = ifa->ifa_address; ndev = ifa->ifa_dev->dev; return addr_event(this, event, (struct sockaddr *)&in, ndev); } static int inet6addr_event(struct notifier_block *this, unsigned long event, void *ptr) { struct sockaddr_in6 in6; struct net_device *ndev; struct inet6_ifaddr *ifa6 = ptr; in6.sin6_family = AF_INET6; in6.sin6_addr = ifa6->addr; ndev = ifa6->idev->dev; return addr_event(this, event, (struct sockaddr *)&in6, ndev); } static struct notifier_block nb_netdevice = { .notifier_call = netdevice_event }; static struct notifier_block nb_inetaddr = { .notifier_call = inetaddr_event }; static struct notifier_block nb_inet6addr = { .notifier_call = inet6addr_event }; int __init roce_gid_mgmt_init(void) { gid_cache_wq = alloc_ordered_workqueue("gid-cache-wq", 0); if (!gid_cache_wq) return -ENOMEM; register_inetaddr_notifier(&nb_inetaddr); if (IS_ENABLED(CONFIG_IPV6)) register_inet6addr_notifier(&nb_inet6addr); /* We rely on the netdevice notifier to enumerate all * existing devices in the system. Register to this notifier * last to make sure we will not miss any IP add/del * callbacks. */ register_netdevice_notifier(&nb_netdevice); return 0; } void __exit roce_gid_mgmt_cleanup(void) { if (IS_ENABLED(CONFIG_IPV6)) unregister_inet6addr_notifier(&nb_inet6addr); unregister_inetaddr_notifier(&nb_inetaddr); unregister_netdevice_notifier(&nb_netdevice); /* Ensure all gid deletion tasks complete before we go down, * to avoid any reference to freed memory. By the time * ib-core is removed, all physical devices have been removed, * so no issue with remaining hardware contexts. */ destroy_workqueue(gid_cache_wq); }
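/*
 * The init/cleanup pair above follows the standard kernel notifier pattern:
 * set up any deferred-work resources first, then register the netdevice
 * notifier last so that no NETDEV_* event (including the replay of
 * NETDEV_REGISTER/NETDEV_UP for already-existing devices) is missed.
 * The following is a minimal, self-contained sketch of that pattern only;
 * it is not part of roce_gid_mgmt.c, and the module name, function names
 * and log strings are hypothetical.
 */
#include <linux/module.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>

static int demo_netdev_event(struct notifier_block *nb, unsigned long event,
			     void *ptr)
{
	struct net_device *ndev = netdev_notifier_info_to_dev(ptr);

	/* Mirror the filter used above: only Ethernet devices are relevant. */
	if (ndev->type != ARPHRD_ETHER)
		return NOTIFY_DONE;

	switch (event) {
	case NETDEV_UP:
		pr_info("demo: %s is up\n", ndev->name);
		break;
	case NETDEV_DOWN:
		pr_info("demo: %s is down\n", ndev->name);
		break;
	default:
		break;
	}
	return NOTIFY_DONE;
}

static struct notifier_block demo_nb = {
	.notifier_call = demo_netdev_event,
};

static int __init demo_init(void)
{
	/*
	 * Registration replays events for devices that already exist, which
	 * is why roce_gid_mgmt_init() registers its netdevice notifier last.
	 */
	return register_netdevice_notifier(&demo_nb);
}

static void __exit demo_exit(void)
{
	unregister_netdevice_notifier(&demo_nb);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");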
// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB /* * Copyright (c) 2005 Voltaire Inc. All rights reserved. * Copyright (c) 2002-2005, Network Appliance, Inc. All rights reserved. * Copyright (c) 1999-2019, Mellanox Technologies, Inc. All rights reserved. * Copyright (c) 2005-2006 Intel Corporation. All rights reserved.
*/ #include <linux/completion.h> #include <linux/in.h> #include <linux/in6.h> #include <linux/mutex.h> #include <linux/random.h> #include <linux/rbtree.h> #include <linux/igmp.h> #include <linux/xarray.h> #include <linux/inetdevice.h> #include <linux/slab.h> #include <linux/module.h> #include <net/route.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <net/netevent.h> #include <net/tcp.h> #include <net/ipv6.h> #include <net/ip_fib.h> #include <net/ip6_route.h> #include <rdma/rdma_cm.h> #include <rdma/rdma_cm_ib.h> #include <rdma/rdma_netlink.h> #include <rdma/ib.h> #include <rdma/ib_cache.h> #include <rdma/ib_cm.h> #include <rdma/ib_sa.h> #include <rdma/iw_cm.h> #include "core_priv.h" #include "cma_priv.h" #include "cma_trace.h" MODULE_AUTHOR("Sean Hefty"); MODULE_DESCRIPTION("Generic RDMA CM Agent"); MODULE_LICENSE("Dual BSD/GPL"); #define CMA_CM_RESPONSE_TIMEOUT 20 #define CMA_MAX_CM_RETRIES 15 #define CMA_CM_MRA_SETTING (IB_CM_MRA_FLAG_DELAY | 24) #define CMA_IBOE_PACKET_LIFETIME 16 #define CMA_PREFERRED_ROCE_GID_TYPE IB_GID_TYPE_ROCE_UDP_ENCAP static const char * const cma_events[] = { [RDMA_CM_EVENT_ADDR_RESOLVED] = "address resolved", [RDMA_CM_EVENT_ADDR_ERROR] = "address error", [RDMA_CM_EVENT_ROUTE_RESOLVED] = "route resolved ", [RDMA_CM_EVENT_ROUTE_ERROR] = "route error", [RDMA_CM_EVENT_CONNECT_REQUEST] = "connect request", [RDMA_CM_EVENT_CONNECT_RESPONSE] = "connect response", [RDMA_CM_EVENT_CONNECT_ERROR] = "connect error", [RDMA_CM_EVENT_UNREACHABLE] = "unreachable", [RDMA_CM_EVENT_REJECTED] = "rejected", [RDMA_CM_EVENT_ESTABLISHED] = "established", [RDMA_CM_EVENT_DISCONNECTED] = "disconnected", [RDMA_CM_EVENT_DEVICE_REMOVAL] = "device removal", [RDMA_CM_EVENT_MULTICAST_JOIN] = "multicast join", [RDMA_CM_EVENT_MULTICAST_ERROR] = "multicast error", [RDMA_CM_EVENT_ADDR_CHANGE] = "address change", [RDMA_CM_EVENT_TIMEWAIT_EXIT] = "timewait exit", }; static void cma_iboe_set_mgid(struct sockaddr *addr, union ib_gid *mgid, enum ib_gid_type gid_type); const char *__attribute_const__ rdma_event_msg(enum rdma_cm_event_type event) { size_t index = event; return (index < ARRAY_SIZE(cma_events) && cma_events[index]) ? cma_events[index] : "unrecognized event"; } EXPORT_SYMBOL(rdma_event_msg); const char *__attribute_const__ rdma_reject_msg(struct rdma_cm_id *id, int reason) { if (rdma_ib_or_roce(id->device, id->port_num)) return ibcm_reject_msg(reason); if (rdma_protocol_iwarp(id->device, id->port_num)) return iwcm_reject_msg(reason); WARN_ON_ONCE(1); return "unrecognized transport"; } EXPORT_SYMBOL(rdma_reject_msg); /** * rdma_is_consumer_reject - return true if the consumer rejected the connect * request. * @id: Communication identifier that received the REJECT event. * @reason: Value returned in the REJECT event status field. */ static bool rdma_is_consumer_reject(struct rdma_cm_id *id, int reason) { if (rdma_ib_or_roce(id->device, id->port_num)) return reason == IB_CM_REJ_CONSUMER_DEFINED; if (rdma_protocol_iwarp(id->device, id->port_num)) return reason == -ECONNREFUSED; WARN_ON_ONCE(1); return false; } const void *rdma_consumer_reject_data(struct rdma_cm_id *id, struct rdma_cm_event *ev, u8 *data_len) { const void *p; if (rdma_is_consumer_reject(id, ev->status)) { *data_len = ev->param.conn.private_data_len; p = ev->param.conn.private_data; } else { *data_len = 0; p = NULL; } return p; } EXPORT_SYMBOL(rdma_consumer_reject_data); /** * rdma_iw_cm_id() - return the iw_cm_id pointer for this cm_id. 
* @id: Communication Identifier */ struct iw_cm_id *rdma_iw_cm_id(struct rdma_cm_id *id) { struct rdma_id_private *id_priv; id_priv = container_of(id, struct rdma_id_private, id); if (id->device->node_type == RDMA_NODE_RNIC) return id_priv->cm_id.iw; return NULL; } EXPORT_SYMBOL(rdma_iw_cm_id); /** * rdma_res_to_id() - return the rdma_cm_id pointer for this restrack. * @res: rdma resource tracking entry pointer */ struct rdma_cm_id *rdma_res_to_id(struct rdma_restrack_entry *res) { struct rdma_id_private *id_priv = container_of(res, struct rdma_id_private, res); return &id_priv->id; } EXPORT_SYMBOL(rdma_res_to_id); static int cma_add_one(struct ib_device *device); static void cma_remove_one(struct ib_device *device, void *client_data); static struct ib_client cma_client = { .name = "cma", .add = cma_add_one, .remove = cma_remove_one }; static struct ib_sa_client sa_client; static LIST_HEAD(dev_list); static LIST_HEAD(listen_any_list); static DEFINE_MUTEX(lock); static struct rb_root id_table = RB_ROOT; /* Serialize operations of id_table tree */ static DEFINE_SPINLOCK(id_table_lock); static struct workqueue_struct *cma_wq; static unsigned int cma_pernet_id; struct cma_pernet { struct xarray tcp_ps; struct xarray udp_ps; struct xarray ipoib_ps; struct xarray ib_ps; }; static struct cma_pernet *cma_pernet(struct net *net) { return net_generic(net, cma_pernet_id); } static struct xarray *cma_pernet_xa(struct net *net, enum rdma_ucm_port_space ps) { struct cma_pernet *pernet = cma_pernet(net); switch (ps) { case RDMA_PS_TCP: return &pernet->tcp_ps; case RDMA_PS_UDP: return &pernet->udp_ps; case RDMA_PS_IPOIB: return &pernet->ipoib_ps; case RDMA_PS_IB: return &pernet->ib_ps; default: return NULL; } } struct id_table_entry { struct list_head id_list; struct rb_node rb_node; }; struct cma_device { struct list_head list; struct ib_device *device; struct completion comp; refcount_t refcount; struct list_head id_list; enum ib_gid_type *default_gid_type; u8 *default_roce_tos; }; struct rdma_bind_list { enum rdma_ucm_port_space ps; struct hlist_head owners; unsigned short port; }; static int cma_ps_alloc(struct net *net, enum rdma_ucm_port_space ps, struct rdma_bind_list *bind_list, int snum) { struct xarray *xa = cma_pernet_xa(net, ps); return xa_insert(xa, snum, bind_list, GFP_KERNEL); } static struct rdma_bind_list *cma_ps_find(struct net *net, enum rdma_ucm_port_space ps, int snum) { struct xarray *xa = cma_pernet_xa(net, ps); return xa_load(xa, snum); } static void cma_ps_remove(struct net *net, enum rdma_ucm_port_space ps, int snum) { struct xarray *xa = cma_pernet_xa(net, ps); xa_erase(xa, snum); } enum { CMA_OPTION_AFONLY, }; void cma_dev_get(struct cma_device *cma_dev) { refcount_inc(&cma_dev->refcount); } void cma_dev_put(struct cma_device *cma_dev) { if (refcount_dec_and_test(&cma_dev->refcount)) complete(&cma_dev->comp); } struct cma_device *cma_enum_devices_by_ibdev(cma_device_filter filter, void *cookie) { struct cma_device *cma_dev; struct cma_device *found_cma_dev = NULL; mutex_lock(&lock); list_for_each_entry(cma_dev, &dev_list, list) if (filter(cma_dev->device, cookie)) { found_cma_dev = cma_dev; break; } if (found_cma_dev) cma_dev_get(found_cma_dev); mutex_unlock(&lock); return found_cma_dev; } int cma_get_default_gid_type(struct cma_device *cma_dev, u32 port) { if (!rdma_is_port_valid(cma_dev->device, port)) return -EINVAL; return cma_dev->default_gid_type[port - rdma_start_port(cma_dev->device)]; } int cma_set_default_gid_type(struct cma_device *cma_dev, u32 port, enum ib_gid_type 
default_gid_type) { unsigned long supported_gids; if (!rdma_is_port_valid(cma_dev->device, port)) return -EINVAL; if (default_gid_type == IB_GID_TYPE_IB && rdma_protocol_roce_eth_encap(cma_dev->device, port)) default_gid_type = IB_GID_TYPE_ROCE; supported_gids = roce_gid_type_mask_support(cma_dev->device, port); if (!(supported_gids & 1 << default_gid_type)) return -EINVAL; cma_dev->default_gid_type[port - rdma_start_port(cma_dev->device)] = default_gid_type; return 0; } int cma_get_default_roce_tos(struct cma_device *cma_dev, u32 port) { if (!rdma_is_port_valid(cma_dev->device, port)) return -EINVAL; return cma_dev->default_roce_tos[port - rdma_start_port(cma_dev->device)]; } int cma_set_default_roce_tos(struct cma_device *cma_dev, u32 port, u8 default_roce_tos) { if (!rdma_is_port_valid(cma_dev->device, port)) return -EINVAL; cma_dev->default_roce_tos[port - rdma_start_port(cma_dev->device)] = default_roce_tos; return 0; } struct ib_device *cma_get_ib_dev(struct cma_device *cma_dev) { return cma_dev->device; } /* * Device removal can occur at anytime, so we need extra handling to * serialize notifying the user of device removal with other callbacks. * We do this by disabling removal notification while a callback is in process, * and reporting it after the callback completes. */ struct cma_multicast { struct rdma_id_private *id_priv; union { struct ib_sa_multicast *sa_mc; struct { struct work_struct work; struct rdma_cm_event event; } iboe_join; }; struct list_head list; void *context; struct sockaddr_storage addr; u8 join_state; }; struct cma_work { struct work_struct work; struct rdma_id_private *id; enum rdma_cm_state old_state; enum rdma_cm_state new_state; struct rdma_cm_event event; }; union cma_ip_addr { struct in6_addr ip6; struct { __be32 pad[3]; __be32 addr; } ip4; }; struct cma_hdr { u8 cma_version; u8 ip_version; /* IP version: 7:4 */ __be16 port; union cma_ip_addr src_addr; union cma_ip_addr dst_addr; }; #define CMA_VERSION 0x00 struct cma_req_info { struct sockaddr_storage listen_addr_storage; struct sockaddr_storage src_addr_storage; struct ib_device *device; union ib_gid local_gid; __be64 service_id; int port; bool has_gid; u16 pkey; }; static int cma_comp_exch(struct rdma_id_private *id_priv, enum rdma_cm_state comp, enum rdma_cm_state exch) { unsigned long flags; int ret; /* * The FSM uses a funny double locking where state is protected by both * the handler_mutex and the spinlock. State is not allowed to change * to/from a handler_mutex protected value without also holding * handler_mutex. 
*/ if (comp == RDMA_CM_CONNECT || exch == RDMA_CM_CONNECT) lockdep_assert_held(&id_priv->handler_mutex); spin_lock_irqsave(&id_priv->lock, flags); if ((ret = (id_priv->state == comp))) id_priv->state = exch; spin_unlock_irqrestore(&id_priv->lock, flags); return ret; } static inline u8 cma_get_ip_ver(const struct cma_hdr *hdr) { return hdr->ip_version >> 4; } static void cma_set_ip_ver(struct cma_hdr *hdr, u8 ip_ver) { hdr->ip_version = (ip_ver << 4) | (hdr->ip_version & 0xF); } static struct sockaddr *cma_src_addr(struct rdma_id_private *id_priv) { return (struct sockaddr *)&id_priv->id.route.addr.src_addr; } static inline struct sockaddr *cma_dst_addr(struct rdma_id_private *id_priv) { return (struct sockaddr *)&id_priv->id.route.addr.dst_addr; } static int cma_igmp_send(struct net_device *ndev, union ib_gid *mgid, bool join) { struct in_device *in_dev = NULL; if (ndev) { rtnl_lock(); in_dev = __in_dev_get_rtnl(ndev); if (in_dev) { if (join) ip_mc_inc_group(in_dev, *(__be32 *)(mgid->raw + 12)); else ip_mc_dec_group(in_dev, *(__be32 *)(mgid->raw + 12)); } rtnl_unlock(); } return (in_dev) ? 0 : -ENODEV; } static int compare_netdev_and_ip(int ifindex_a, struct sockaddr *sa, struct id_table_entry *entry_b) { struct rdma_id_private *id_priv = list_first_entry( &entry_b->id_list, struct rdma_id_private, id_list_entry); int ifindex_b = id_priv->id.route.addr.dev_addr.bound_dev_if; struct sockaddr *sb = cma_dst_addr(id_priv); if (ifindex_a != ifindex_b) return (ifindex_a > ifindex_b) ? 1 : -1; if (sa->sa_family != sb->sa_family) return sa->sa_family - sb->sa_family; if (sa->sa_family == AF_INET && __builtin_object_size(sa, 0) >= sizeof(struct sockaddr_in)) { return memcmp(&((struct sockaddr_in *)sa)->sin_addr, &((struct sockaddr_in *)sb)->sin_addr, sizeof(((struct sockaddr_in *)sa)->sin_addr)); } if (sa->sa_family == AF_INET6 && __builtin_object_size(sa, 0) >= sizeof(struct sockaddr_in6)) { return ipv6_addr_cmp(&((struct sockaddr_in6 *)sa)->sin6_addr, &((struct sockaddr_in6 *)sb)->sin6_addr); } return -1; } static int cma_add_id_to_tree(struct rdma_id_private *node_id_priv) { struct rb_node **new, *parent = NULL; struct id_table_entry *this, *node; unsigned long flags; int result; node = kzalloc(sizeof(*node), GFP_KERNEL); if (!node) return -ENOMEM; spin_lock_irqsave(&id_table_lock, flags); new = &id_table.rb_node; while (*new) { this = container_of(*new, struct id_table_entry, rb_node); result = compare_netdev_and_ip( node_id_priv->id.route.addr.dev_addr.bound_dev_if, cma_dst_addr(node_id_priv), this); parent = *new; if (result < 0) new = &((*new)->rb_left); else if (result > 0) new = &((*new)->rb_right); else { list_add_tail(&node_id_priv->id_list_entry, &this->id_list); kfree(node); goto unlock; } } INIT_LIST_HEAD(&node->id_list); list_add_tail(&node_id_priv->id_list_entry, &node->id_list); rb_link_node(&node->rb_node, parent, new); rb_insert_color(&node->rb_node, &id_table); unlock: spin_unlock_irqrestore(&id_table_lock, flags); return 0; } static struct id_table_entry * node_from_ndev_ip(struct rb_root *root, int ifindex, struct sockaddr *sa) { struct rb_node *node = root->rb_node; struct id_table_entry *data; int result; while (node) { data = container_of(node, struct id_table_entry, rb_node); result = compare_netdev_and_ip(ifindex, sa, data); if (result < 0) node = node->rb_left; else if (result > 0) node = node->rb_right; else return data; } return NULL; } static void cma_remove_id_from_tree(struct rdma_id_private *id_priv) { struct id_table_entry *data; unsigned long flags; 
spin_lock_irqsave(&id_table_lock, flags); if (list_empty(&id_priv->id_list_entry)) goto out; data = node_from_ndev_ip(&id_table, id_priv->id.route.addr.dev_addr.bound_dev_if, cma_dst_addr(id_priv)); if (!data) goto out; list_del_init(&id_priv->id_list_entry); if (list_empty(&data->id_list)) { rb_erase(&data->rb_node, &id_table); kfree(data); } out: spin_unlock_irqrestore(&id_table_lock, flags); } static void _cma_attach_to_dev(struct rdma_id_private *id_priv, struct cma_device *cma_dev) { cma_dev_get(cma_dev); id_priv->cma_dev = cma_dev; id_priv->id.device = cma_dev->device; id_priv->id.route.addr.dev_addr.transport = rdma_node_get_transport(cma_dev->device->node_type); list_add_tail(&id_priv->device_item, &cma_dev->id_list); trace_cm_id_attach(id_priv, cma_dev->device); } static void cma_attach_to_dev(struct rdma_id_private *id_priv, struct cma_device *cma_dev) { _cma_attach_to_dev(id_priv, cma_dev); id_priv->gid_type = cma_dev->default_gid_type[id_priv->id.port_num - rdma_start_port(cma_dev->device)]; } static void cma_release_dev(struct rdma_id_private *id_priv) { mutex_lock(&lock); list_del_init(&id_priv->device_item); cma_dev_put(id_priv->cma_dev); id_priv->cma_dev = NULL; id_priv->id.device = NULL; if (id_priv->id.route.addr.dev_addr.sgid_attr) { rdma_put_gid_attr(id_priv->id.route.addr.dev_addr.sgid_attr); id_priv->id.route.addr.dev_addr.sgid_attr = NULL; } mutex_unlock(&lock); } static inline unsigned short cma_family(struct rdma_id_private *id_priv) { return id_priv->id.route.addr.src_addr.ss_family; } static int cma_set_default_qkey(struct rdma_id_private *id_priv) { struct ib_sa_mcmember_rec rec; int ret = 0; switch (id_priv->id.ps) { case RDMA_PS_UDP: case RDMA_PS_IB: id_priv->qkey = RDMA_UDP_QKEY; break; case RDMA_PS_IPOIB: ib_addr_get_mgid(&id_priv->id.route.addr.dev_addr, &rec.mgid); ret = ib_sa_get_mcmember_rec(id_priv->id.device, id_priv->id.port_num, &rec.mgid, &rec); if (!ret) id_priv->qkey = be32_to_cpu(rec.qkey); break; default: break; } return ret; } static int cma_set_qkey(struct rdma_id_private *id_priv, u32 qkey) { if (!qkey || (id_priv->qkey && (id_priv->qkey != qkey))) return -EINVAL; id_priv->qkey = qkey; return 0; } static void cma_translate_ib(struct sockaddr_ib *sib, struct rdma_dev_addr *dev_addr) { dev_addr->dev_type = ARPHRD_INFINIBAND; rdma_addr_set_sgid(dev_addr, (union ib_gid *) &sib->sib_addr); ib_addr_set_pkey(dev_addr, ntohs(sib->sib_pkey)); } static int cma_translate_addr(struct sockaddr *addr, struct rdma_dev_addr *dev_addr) { int ret; if (addr->sa_family != AF_IB) { ret = rdma_translate_ip(addr, dev_addr); } else { cma_translate_ib((struct sockaddr_ib *) addr, dev_addr); ret = 0; } return ret; } static const struct ib_gid_attr * cma_validate_port(struct ib_device *device, u32 port, enum ib_gid_type gid_type, union ib_gid *gid, struct rdma_id_private *id_priv) { struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; const struct ib_gid_attr *sgid_attr = ERR_PTR(-ENODEV); int bound_if_index = dev_addr->bound_dev_if; int dev_type = dev_addr->dev_type; struct net_device *ndev = NULL; struct net_device *pdev = NULL; if (!rdma_dev_access_netns(device, id_priv->id.route.addr.dev_addr.net)) goto out; if ((dev_type == ARPHRD_INFINIBAND) && !rdma_protocol_ib(device, port)) goto out; if ((dev_type != ARPHRD_INFINIBAND) && rdma_protocol_ib(device, port)) goto out; /* * For drivers that do not associate more than one net device with * their gid tables, such as iWARP drivers, it is sufficient to * return the first table entry. 
* * Other driver classes might be included in the future. */ if (rdma_protocol_iwarp(device, port)) { sgid_attr = rdma_get_gid_attr(device, port, 0); if (IS_ERR(sgid_attr)) goto out; rcu_read_lock(); ndev = rcu_dereference(sgid_attr->ndev); if (ndev->ifindex != bound_if_index) { pdev = dev_get_by_index_rcu(dev_addr->net, bound_if_index); if (pdev) { if (is_vlan_dev(pdev)) { pdev = vlan_dev_real_dev(pdev); if (ndev->ifindex == pdev->ifindex) bound_if_index = pdev->ifindex; } if (is_vlan_dev(ndev)) { pdev = vlan_dev_real_dev(ndev); if (bound_if_index == pdev->ifindex) bound_if_index = ndev->ifindex; } } } if (!net_eq(dev_net(ndev), dev_addr->net) || ndev->ifindex != bound_if_index) { rdma_put_gid_attr(sgid_attr); sgid_attr = ERR_PTR(-ENODEV); } rcu_read_unlock(); goto out; } if (dev_type == ARPHRD_ETHER && rdma_protocol_roce(device, port)) { ndev = dev_get_by_index(dev_addr->net, bound_if_index); if (!ndev) goto out; } else { gid_type = IB_GID_TYPE_IB; } sgid_attr = rdma_find_gid_by_port(device, gid, gid_type, port, ndev); dev_put(ndev); out: return sgid_attr; } static void cma_bind_sgid_attr(struct rdma_id_private *id_priv, const struct ib_gid_attr *sgid_attr) { WARN_ON(id_priv->id.route.addr.dev_addr.sgid_attr); id_priv->id.route.addr.dev_addr.sgid_attr = sgid_attr; } /** * cma_acquire_dev_by_src_ip - Acquire cma device, port, gid attribute * based on source ip address. * @id_priv: cm_id which should be bound to cma device * * cma_acquire_dev_by_src_ip() binds cm id to cma device, port and GID attribute * based on source IP address. It returns 0 on success or error code otherwise. * It is applicable to active and passive side cm_id. */ static int cma_acquire_dev_by_src_ip(struct rdma_id_private *id_priv) { struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; const struct ib_gid_attr *sgid_attr; union ib_gid gid, iboe_gid, *gidp; struct cma_device *cma_dev; enum ib_gid_type gid_type; int ret = -ENODEV; u32 port; if (dev_addr->dev_type != ARPHRD_INFINIBAND && id_priv->id.ps == RDMA_PS_IPOIB) return -EINVAL; rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.src_addr, &iboe_gid); memcpy(&gid, dev_addr->src_dev_addr + rdma_addr_gid_offset(dev_addr), sizeof(gid)); mutex_lock(&lock); list_for_each_entry(cma_dev, &dev_list, list) { rdma_for_each_port (cma_dev->device, port) { gidp = rdma_protocol_roce(cma_dev->device, port) ? &iboe_gid : &gid; gid_type = cma_dev->default_gid_type[port - 1]; sgid_attr = cma_validate_port(cma_dev->device, port, gid_type, gidp, id_priv); if (!IS_ERR(sgid_attr)) { id_priv->id.port_num = port; cma_bind_sgid_attr(id_priv, sgid_attr); cma_attach_to_dev(id_priv, cma_dev); ret = 0; goto out; } } } out: mutex_unlock(&lock); return ret; } /** * cma_ib_acquire_dev - Acquire cma device, port and SGID attribute * @id_priv: cm id to bind to cma device * @listen_id_priv: listener cm id to match against * @req: Pointer to req structure containaining incoming * request information * cma_ib_acquire_dev() acquires cma device, port and SGID attribute when * rdma device matches for listen_id and incoming request. It also verifies * that a GID table entry is present for the source address. * Returns 0 on success, or returns error code otherwise. 
*/ static int cma_ib_acquire_dev(struct rdma_id_private *id_priv, const struct rdma_id_private *listen_id_priv, struct cma_req_info *req) { struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; const struct ib_gid_attr *sgid_attr; enum ib_gid_type gid_type; union ib_gid gid; if (dev_addr->dev_type != ARPHRD_INFINIBAND && id_priv->id.ps == RDMA_PS_IPOIB) return -EINVAL; if (rdma_protocol_roce(req->device, req->port)) rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.src_addr, &gid); else memcpy(&gid, dev_addr->src_dev_addr + rdma_addr_gid_offset(dev_addr), sizeof(gid)); gid_type = listen_id_priv->cma_dev->default_gid_type[req->port - 1]; sgid_attr = cma_validate_port(req->device, req->port, gid_type, &gid, id_priv); if (IS_ERR(sgid_attr)) return PTR_ERR(sgid_attr); id_priv->id.port_num = req->port; cma_bind_sgid_attr(id_priv, sgid_attr); /* Need to acquire lock to protect against reader * of cma_dev->id_list such as cma_netdev_callback() and * cma_process_remove(). */ mutex_lock(&lock); cma_attach_to_dev(id_priv, listen_id_priv->cma_dev); mutex_unlock(&lock); rdma_restrack_add(&id_priv->res); return 0; } static int cma_iw_acquire_dev(struct rdma_id_private *id_priv, const struct rdma_id_private *listen_id_priv) { struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; const struct ib_gid_attr *sgid_attr; struct cma_device *cma_dev; enum ib_gid_type gid_type; int ret = -ENODEV; union ib_gid gid; u32 port; if (dev_addr->dev_type != ARPHRD_INFINIBAND && id_priv->id.ps == RDMA_PS_IPOIB) return -EINVAL; memcpy(&gid, dev_addr->src_dev_addr + rdma_addr_gid_offset(dev_addr), sizeof(gid)); mutex_lock(&lock); cma_dev = listen_id_priv->cma_dev; port = listen_id_priv->id.port_num; gid_type = listen_id_priv->gid_type; sgid_attr = cma_validate_port(cma_dev->device, port, gid_type, &gid, id_priv); if (!IS_ERR(sgid_attr)) { id_priv->id.port_num = port; cma_bind_sgid_attr(id_priv, sgid_attr); ret = 0; goto out; } list_for_each_entry(cma_dev, &dev_list, list) { rdma_for_each_port (cma_dev->device, port) { if (listen_id_priv->cma_dev == cma_dev && listen_id_priv->id.port_num == port) continue; gid_type = cma_dev->default_gid_type[port - 1]; sgid_attr = cma_validate_port(cma_dev->device, port, gid_type, &gid, id_priv); if (!IS_ERR(sgid_attr)) { id_priv->id.port_num = port; cma_bind_sgid_attr(id_priv, sgid_attr); ret = 0; goto out; } } } out: if (!ret) { cma_attach_to_dev(id_priv, cma_dev); rdma_restrack_add(&id_priv->res); } mutex_unlock(&lock); return ret; } /* * Select the source IB device and address to reach the destination IB address. 
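 * (A sketch of the policy implemented below: a port whose GID table holds the
 * exact destination GID is preferred; otherwise the first active, AF_IB-capable
 * port on the same subnet prefix that also has the requested P_Key cached is
 * chosen, and the matched GID becomes the source address.)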
*/ static int cma_resolve_ib_dev(struct rdma_id_private *id_priv) { struct cma_device *cma_dev, *cur_dev; struct sockaddr_ib *addr; union ib_gid gid, sgid, *dgid; unsigned int p; u16 pkey, index; enum ib_port_state port_state; int ret; int i; cma_dev = NULL; addr = (struct sockaddr_ib *) cma_dst_addr(id_priv); dgid = (union ib_gid *) &addr->sib_addr; pkey = ntohs(addr->sib_pkey); mutex_lock(&lock); list_for_each_entry(cur_dev, &dev_list, list) { rdma_for_each_port (cur_dev->device, p) { if (!rdma_cap_af_ib(cur_dev->device, p)) continue; if (ib_find_cached_pkey(cur_dev->device, p, pkey, &index)) continue; if (ib_get_cached_port_state(cur_dev->device, p, &port_state)) continue; for (i = 0; i < cur_dev->device->port_data[p].immutable.gid_tbl_len; ++i) { ret = rdma_query_gid(cur_dev->device, p, i, &gid); if (ret) continue; if (!memcmp(&gid, dgid, sizeof(gid))) { cma_dev = cur_dev; sgid = gid; id_priv->id.port_num = p; goto found; } if (!cma_dev && (gid.global.subnet_prefix == dgid->global.subnet_prefix) && port_state == IB_PORT_ACTIVE) { cma_dev = cur_dev; sgid = gid; id_priv->id.port_num = p; goto found; } } } } mutex_unlock(&lock); return -ENODEV; found: cma_attach_to_dev(id_priv, cma_dev); rdma_restrack_add(&id_priv->res); mutex_unlock(&lock); addr = (struct sockaddr_ib *)cma_src_addr(id_priv); memcpy(&addr->sib_addr, &sgid, sizeof(sgid)); cma_translate_ib(addr, &id_priv->id.route.addr.dev_addr); return 0; } static void cma_id_get(struct rdma_id_private *id_priv) { refcount_inc(&id_priv->refcount); } static void cma_id_put(struct rdma_id_private *id_priv) { if (refcount_dec_and_test(&id_priv->refcount)) complete(&id_priv->comp); } static struct rdma_id_private * __rdma_create_id(struct net *net, rdma_cm_event_handler event_handler, void *context, enum rdma_ucm_port_space ps, enum ib_qp_type qp_type, const struct rdma_id_private *parent) { struct rdma_id_private *id_priv; id_priv = kzalloc(sizeof *id_priv, GFP_KERNEL); if (!id_priv) return ERR_PTR(-ENOMEM); id_priv->state = RDMA_CM_IDLE; id_priv->id.context = context; id_priv->id.event_handler = event_handler; id_priv->id.ps = ps; id_priv->id.qp_type = qp_type; id_priv->tos_set = false; id_priv->timeout_set = false; id_priv->min_rnr_timer_set = false; id_priv->gid_type = IB_GID_TYPE_IB; spin_lock_init(&id_priv->lock); mutex_init(&id_priv->qp_mutex); init_completion(&id_priv->comp); refcount_set(&id_priv->refcount, 1); mutex_init(&id_priv->handler_mutex); INIT_LIST_HEAD(&id_priv->device_item); INIT_LIST_HEAD(&id_priv->id_list_entry); INIT_LIST_HEAD(&id_priv->listen_list); INIT_LIST_HEAD(&id_priv->mc_list); get_random_bytes(&id_priv->seq_num, sizeof id_priv->seq_num); id_priv->id.route.addr.dev_addr.net = get_net(net); id_priv->seq_num &= 0x00ffffff; rdma_restrack_new(&id_priv->res, RDMA_RESTRACK_CM_ID); if (parent) rdma_restrack_parent_name(&id_priv->res, &parent->res); return id_priv; } struct rdma_cm_id * __rdma_create_kernel_id(struct net *net, rdma_cm_event_handler event_handler, void *context, enum rdma_ucm_port_space ps, enum ib_qp_type qp_type, const char *caller) { struct rdma_id_private *ret; ret = __rdma_create_id(net, event_handler, context, ps, qp_type, NULL); if (IS_ERR(ret)) return ERR_CAST(ret); rdma_restrack_set_name(&ret->res, caller); return &ret->id; } EXPORT_SYMBOL(__rdma_create_kernel_id); struct rdma_cm_id *rdma_create_user_id(rdma_cm_event_handler event_handler, void *context, enum rdma_ucm_port_space ps, enum ib_qp_type qp_type) { struct rdma_id_private *ret; ret = __rdma_create_id(current->nsproxy->net_ns, 
event_handler, context, ps, qp_type, NULL); if (IS_ERR(ret)) return ERR_CAST(ret); rdma_restrack_set_name(&ret->res, NULL); return &ret->id; } EXPORT_SYMBOL(rdma_create_user_id); static int cma_init_ud_qp(struct rdma_id_private *id_priv, struct ib_qp *qp) { struct ib_qp_attr qp_attr; int qp_attr_mask, ret; qp_attr.qp_state = IB_QPS_INIT; ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask); if (ret) return ret; ret = ib_modify_qp(qp, &qp_attr, qp_attr_mask); if (ret) return ret; qp_attr.qp_state = IB_QPS_RTR; ret = ib_modify_qp(qp, &qp_attr, IB_QP_STATE); if (ret) return ret; qp_attr.qp_state = IB_QPS_RTS; qp_attr.sq_psn = 0; ret = ib_modify_qp(qp, &qp_attr, IB_QP_STATE | IB_QP_SQ_PSN); return ret; } static int cma_init_conn_qp(struct rdma_id_private *id_priv, struct ib_qp *qp) { struct ib_qp_attr qp_attr; int qp_attr_mask, ret; qp_attr.qp_state = IB_QPS_INIT; ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask); if (ret) return ret; return ib_modify_qp(qp, &qp_attr, qp_attr_mask); } int rdma_create_qp(struct rdma_cm_id *id, struct ib_pd *pd, struct ib_qp_init_attr *qp_init_attr) { struct rdma_id_private *id_priv; struct ib_qp *qp; int ret; id_priv = container_of(id, struct rdma_id_private, id); if (id->device != pd->device) { ret = -EINVAL; goto out_err; } qp_init_attr->port_num = id->port_num; qp = ib_create_qp(pd, qp_init_attr); if (IS_ERR(qp)) { ret = PTR_ERR(qp); goto out_err; } if (id->qp_type == IB_QPT_UD) ret = cma_init_ud_qp(id_priv, qp); else ret = cma_init_conn_qp(id_priv, qp); if (ret) goto out_destroy; id->qp = qp; id_priv->qp_num = qp->qp_num; id_priv->srq = (qp->srq != NULL); trace_cm_qp_create(id_priv, pd, qp_init_attr, 0); return 0; out_destroy: ib_destroy_qp(qp); out_err: trace_cm_qp_create(id_priv, pd, qp_init_attr, ret); return ret; } EXPORT_SYMBOL(rdma_create_qp); void rdma_destroy_qp(struct rdma_cm_id *id) { struct rdma_id_private *id_priv; id_priv = container_of(id, struct rdma_id_private, id); trace_cm_qp_destroy(id_priv); mutex_lock(&id_priv->qp_mutex); ib_destroy_qp(id_priv->id.qp); id_priv->id.qp = NULL; mutex_unlock(&id_priv->qp_mutex); } EXPORT_SYMBOL(rdma_destroy_qp); static int cma_modify_qp_rtr(struct rdma_id_private *id_priv, struct rdma_conn_param *conn_param) { struct ib_qp_attr qp_attr; int qp_attr_mask, ret; mutex_lock(&id_priv->qp_mutex); if (!id_priv->id.qp) { ret = 0; goto out; } /* Need to update QP attributes from default values. 
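 * (Sketch of the transition driven here for a QP created through
 * rdma_create_qp(): the QP is first moved to INIT with the attributes reported
 * by rdma_init_qp_attr(), then to RTR, taking max_dest_rd_atomic from
 * conn_param when one is given; cma_modify_qp_rts() later completes the
 * RTR -> RTS step.)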
*/ qp_attr.qp_state = IB_QPS_INIT; ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask); if (ret) goto out; ret = ib_modify_qp(id_priv->id.qp, &qp_attr, qp_attr_mask); if (ret) goto out; qp_attr.qp_state = IB_QPS_RTR; ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask); if (ret) goto out; BUG_ON(id_priv->cma_dev->device != id_priv->id.device); if (conn_param) qp_attr.max_dest_rd_atomic = conn_param->responder_resources; ret = ib_modify_qp(id_priv->id.qp, &qp_attr, qp_attr_mask); out: mutex_unlock(&id_priv->qp_mutex); return ret; } static int cma_modify_qp_rts(struct rdma_id_private *id_priv, struct rdma_conn_param *conn_param) { struct ib_qp_attr qp_attr; int qp_attr_mask, ret; mutex_lock(&id_priv->qp_mutex); if (!id_priv->id.qp) { ret = 0; goto out; } qp_attr.qp_state = IB_QPS_RTS; ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask); if (ret) goto out; if (conn_param) qp_attr.max_rd_atomic = conn_param->initiator_depth; ret = ib_modify_qp(id_priv->id.qp, &qp_attr, qp_attr_mask); out: mutex_unlock(&id_priv->qp_mutex); return ret; } static int cma_modify_qp_err(struct rdma_id_private *id_priv) { struct ib_qp_attr qp_attr; int ret; mutex_lock(&id_priv->qp_mutex); if (!id_priv->id.qp) { ret = 0; goto out; } qp_attr.qp_state = IB_QPS_ERR; ret = ib_modify_qp(id_priv->id.qp, &qp_attr, IB_QP_STATE); out: mutex_unlock(&id_priv->qp_mutex); return ret; } static int cma_ib_init_qp_attr(struct rdma_id_private *id_priv, struct ib_qp_attr *qp_attr, int *qp_attr_mask) { struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; int ret; u16 pkey; if (rdma_cap_eth_ah(id_priv->id.device, id_priv->id.port_num)) pkey = 0xffff; else pkey = ib_addr_get_pkey(dev_addr); ret = ib_find_cached_pkey(id_priv->id.device, id_priv->id.port_num, pkey, &qp_attr->pkey_index); if (ret) return ret; qp_attr->port_num = id_priv->id.port_num; *qp_attr_mask = IB_QP_STATE | IB_QP_PKEY_INDEX | IB_QP_PORT; if (id_priv->id.qp_type == IB_QPT_UD) { ret = cma_set_default_qkey(id_priv); if (ret) return ret; qp_attr->qkey = id_priv->qkey; *qp_attr_mask |= IB_QP_QKEY; } else { qp_attr->qp_access_flags = 0; *qp_attr_mask |= IB_QP_ACCESS_FLAGS; } return 0; } int rdma_init_qp_attr(struct rdma_cm_id *id, struct ib_qp_attr *qp_attr, int *qp_attr_mask) { struct rdma_id_private *id_priv; int ret = 0; id_priv = container_of(id, struct rdma_id_private, id); if (rdma_cap_ib_cm(id->device, id->port_num)) { if (!id_priv->cm_id.ib || (id_priv->id.qp_type == IB_QPT_UD)) ret = cma_ib_init_qp_attr(id_priv, qp_attr, qp_attr_mask); else ret = ib_cm_init_qp_attr(id_priv->cm_id.ib, qp_attr, qp_attr_mask); if (qp_attr->qp_state == IB_QPS_RTR) qp_attr->rq_psn = id_priv->seq_num; } else if (rdma_cap_iw_cm(id->device, id->port_num)) { if (!id_priv->cm_id.iw) { qp_attr->qp_access_flags = 0; *qp_attr_mask = IB_QP_STATE | IB_QP_ACCESS_FLAGS; } else ret = iw_cm_init_qp_attr(id_priv->cm_id.iw, qp_attr, qp_attr_mask); qp_attr->port_num = id_priv->id.port_num; *qp_attr_mask |= IB_QP_PORT; } else { ret = -ENOSYS; } if ((*qp_attr_mask & IB_QP_TIMEOUT) && id_priv->timeout_set) qp_attr->timeout = id_priv->timeout; if ((*qp_attr_mask & IB_QP_MIN_RNR_TIMER) && id_priv->min_rnr_timer_set) qp_attr->min_rnr_timer = id_priv->min_rnr_timer; return ret; } EXPORT_SYMBOL(rdma_init_qp_attr); static inline bool cma_zero_addr(const struct sockaddr *addr) { switch (addr->sa_family) { case AF_INET: return ipv4_is_zeronet(((struct sockaddr_in *)addr)->sin_addr.s_addr); case AF_INET6: return ipv6_addr_any(&((struct sockaddr_in6 *)addr)->sin6_addr); 
case AF_IB: return ib_addr_any(&((struct sockaddr_ib *)addr)->sib_addr); default: return false; } } static inline bool cma_loopback_addr(const struct sockaddr *addr) { switch (addr->sa_family) { case AF_INET: return ipv4_is_loopback( ((struct sockaddr_in *)addr)->sin_addr.s_addr); case AF_INET6: return ipv6_addr_loopback( &((struct sockaddr_in6 *)addr)->sin6_addr); case AF_IB: return ib_addr_loopback( &((struct sockaddr_ib *)addr)->sib_addr); default: return false; } } static inline bool cma_any_addr(const struct sockaddr *addr) { return cma_zero_addr(addr) || cma_loopback_addr(addr); } static int cma_addr_cmp(const struct sockaddr *src, const struct sockaddr *dst) { if (src->sa_family != dst->sa_family) return -1; switch (src->sa_family) { case AF_INET: return ((struct sockaddr_in *)src)->sin_addr.s_addr != ((struct sockaddr_in *)dst)->sin_addr.s_addr; case AF_INET6: { struct sockaddr_in6 *src_addr6 = (struct sockaddr_in6 *)src; struct sockaddr_in6 *dst_addr6 = (struct sockaddr_in6 *)dst; bool link_local; if (ipv6_addr_cmp(&src_addr6->sin6_addr, &dst_addr6->sin6_addr)) return 1; link_local = ipv6_addr_type(&dst_addr6->sin6_addr) & IPV6_ADDR_LINKLOCAL; /* Link local must match their scope_ids */ return link_local ? (src_addr6->sin6_scope_id != dst_addr6->sin6_scope_id) : 0; } default: return ib_addr_cmp(&((struct sockaddr_ib *) src)->sib_addr, &((struct sockaddr_ib *) dst)->sib_addr); } } static __be16 cma_port(const struct sockaddr *addr) { struct sockaddr_ib *sib; switch (addr->sa_family) { case AF_INET: return ((struct sockaddr_in *) addr)->sin_port; case AF_INET6: return ((struct sockaddr_in6 *) addr)->sin6_port; case AF_IB: sib = (struct sockaddr_ib *) addr; return htons((u16) (be64_to_cpu(sib->sib_sid) & be64_to_cpu(sib->sib_sid_mask))); default: return 0; } } static inline int cma_any_port(const struct sockaddr *addr) { return !cma_port(addr); } static void cma_save_ib_info(struct sockaddr *src_addr, struct sockaddr *dst_addr, const struct rdma_cm_id *listen_id, const struct sa_path_rec *path) { struct sockaddr_ib *listen_ib, *ib; listen_ib = (struct sockaddr_ib *) &listen_id->route.addr.src_addr; if (src_addr) { ib = (struct sockaddr_ib *)src_addr; ib->sib_family = AF_IB; if (path) { ib->sib_pkey = path->pkey; ib->sib_flowinfo = path->flow_label; memcpy(&ib->sib_addr, &path->sgid, 16); ib->sib_sid = path->service_id; ib->sib_scope_id = 0; } else { ib->sib_pkey = listen_ib->sib_pkey; ib->sib_flowinfo = listen_ib->sib_flowinfo; ib->sib_addr = listen_ib->sib_addr; ib->sib_sid = listen_ib->sib_sid; ib->sib_scope_id = listen_ib->sib_scope_id; } ib->sib_sid_mask = cpu_to_be64(0xffffffffffffffffULL); } if (dst_addr) { ib = (struct sockaddr_ib *)dst_addr; ib->sib_family = AF_IB; if (path) { ib->sib_pkey = path->pkey; ib->sib_flowinfo = path->flow_label; memcpy(&ib->sib_addr, &path->dgid, 16); } } } static void cma_save_ip4_info(struct sockaddr_in *src_addr, struct sockaddr_in *dst_addr, struct cma_hdr *hdr, __be16 local_port) { if (src_addr) { *src_addr = (struct sockaddr_in) { .sin_family = AF_INET, .sin_addr.s_addr = hdr->dst_addr.ip4.addr, .sin_port = local_port, }; } if (dst_addr) { *dst_addr = (struct sockaddr_in) { .sin_family = AF_INET, .sin_addr.s_addr = hdr->src_addr.ip4.addr, .sin_port = hdr->port, }; } } static void cma_save_ip6_info(struct sockaddr_in6 *src_addr, struct sockaddr_in6 *dst_addr, struct cma_hdr *hdr, __be16 local_port) { if (src_addr) { *src_addr = (struct sockaddr_in6) { .sin6_family = AF_INET6, .sin6_addr = hdr->dst_addr.ip6, .sin6_port = local_port, }; } if 
(dst_addr) { *dst_addr = (struct sockaddr_in6) { .sin6_family = AF_INET6, .sin6_addr = hdr->src_addr.ip6, .sin6_port = hdr->port, }; } } static u16 cma_port_from_service_id(__be64 service_id) { return (u16)be64_to_cpu(service_id); } static int cma_save_ip_info(struct sockaddr *src_addr, struct sockaddr *dst_addr, const struct ib_cm_event *ib_event, __be64 service_id) { struct cma_hdr *hdr; __be16 port; hdr = ib_event->private_data; if (hdr->cma_version != CMA_VERSION) return -EINVAL; port = htons(cma_port_from_service_id(service_id)); switch (cma_get_ip_ver(hdr)) { case 4: cma_save_ip4_info((struct sockaddr_in *)src_addr, (struct sockaddr_in *)dst_addr, hdr, port); break; case 6: cma_save_ip6_info((struct sockaddr_in6 *)src_addr, (struct sockaddr_in6 *)dst_addr, hdr, port); break; default: return -EAFNOSUPPORT; } return 0; } static int cma_save_net_info(struct sockaddr *src_addr, struct sockaddr *dst_addr, const struct rdma_cm_id *listen_id, const struct ib_cm_event *ib_event, sa_family_t sa_family, __be64 service_id) { if (sa_family == AF_IB) { if (ib_event->event == IB_CM_REQ_RECEIVED) cma_save_ib_info(src_addr, dst_addr, listen_id, ib_event->param.req_rcvd.primary_path); else if (ib_event->event == IB_CM_SIDR_REQ_RECEIVED) cma_save_ib_info(src_addr, dst_addr, listen_id, NULL); return 0; } return cma_save_ip_info(src_addr, dst_addr, ib_event, service_id); } static int cma_save_req_info(const struct ib_cm_event *ib_event, struct cma_req_info *req) { const struct ib_cm_req_event_param *req_param = &ib_event->param.req_rcvd; const struct ib_cm_sidr_req_event_param *sidr_param = &ib_event->param.sidr_req_rcvd; switch (ib_event->event) { case IB_CM_REQ_RECEIVED: req->device = req_param->listen_id->device; req->port = req_param->port; memcpy(&req->local_gid, &req_param->primary_path->sgid, sizeof(req->local_gid)); req->has_gid = true; req->service_id = req_param->primary_path->service_id; req->pkey = be16_to_cpu(req_param->primary_path->pkey); if (req->pkey != req_param->bth_pkey) pr_warn_ratelimited("RDMA CMA: got different BTH P_Key (0x%x) and primary path P_Key (0x%x)\n" "RDMA CMA: in the future this may cause the request to be dropped\n", req_param->bth_pkey, req->pkey); break; case IB_CM_SIDR_REQ_RECEIVED: req->device = sidr_param->listen_id->device; req->port = sidr_param->port; req->has_gid = false; req->service_id = sidr_param->service_id; req->pkey = sidr_param->pkey; if (req->pkey != sidr_param->bth_pkey) pr_warn_ratelimited("RDMA CMA: got different BTH P_Key (0x%x) and SIDR request payload P_Key (0x%x)\n" "RDMA CMA: in the future this may cause the request to be dropped\n", sidr_param->bth_pkey, req->pkey); break; default: return -EINVAL; } return 0; } static bool validate_ipv4_net_dev(struct net_device *net_dev, const struct sockaddr_in *dst_addr, const struct sockaddr_in *src_addr) { __be32 daddr = dst_addr->sin_addr.s_addr, saddr = src_addr->sin_addr.s_addr; struct fib_result res; struct flowi4 fl4; int err; bool ret; if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) || ipv4_is_lbcast(daddr) || ipv4_is_zeronet(saddr) || ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr) || ipv4_is_loopback(saddr)) return false; memset(&fl4, 0, sizeof(fl4)); fl4.flowi4_oif = net_dev->ifindex; fl4.daddr = daddr; fl4.saddr = saddr; rcu_read_lock(); err = fib_lookup(dev_net(net_dev), &fl4, &res, 0); ret = err == 0 && FIB_RES_DEV(res) == net_dev; rcu_read_unlock(); return ret; } static bool validate_ipv6_net_dev(struct net_device *net_dev, const struct sockaddr_in6 *dst_addr, const struct 
sockaddr_in6 *src_addr) { #if IS_ENABLED(CONFIG_IPV6) const int strict = ipv6_addr_type(&dst_addr->sin6_addr) & IPV6_ADDR_LINKLOCAL; struct rt6_info *rt = rt6_lookup(dev_net(net_dev), &dst_addr->sin6_addr, &src_addr->sin6_addr, net_dev->ifindex, NULL, strict); bool ret; if (!rt) return false; ret = rt->rt6i_idev->dev == net_dev; ip6_rt_put(rt); return ret; #else return false; #endif } static bool validate_net_dev(struct net_device *net_dev, const struct sockaddr *daddr, const struct sockaddr *saddr) { const struct sockaddr_in *daddr4 = (const struct sockaddr_in *)daddr; const struct sockaddr_in *saddr4 = (const struct sockaddr_in *)saddr; const struct sockaddr_in6 *daddr6 = (const struct sockaddr_in6 *)daddr; const struct sockaddr_in6 *saddr6 = (const struct sockaddr_in6 *)saddr; switch (daddr->sa_family) { case AF_INET: return saddr->sa_family == AF_INET && validate_ipv4_net_dev(net_dev, daddr4, saddr4); case AF_INET6: return saddr->sa_family == AF_INET6 && validate_ipv6_net_dev(net_dev, daddr6, saddr6); default: return false; } } static struct net_device * roce_get_net_dev_by_cm_event(const struct ib_cm_event *ib_event) { const struct ib_gid_attr *sgid_attr = NULL; struct net_device *ndev; if (ib_event->event == IB_CM_REQ_RECEIVED) sgid_attr = ib_event->param.req_rcvd.ppath_sgid_attr; else if (ib_event->event == IB_CM_SIDR_REQ_RECEIVED) sgid_attr = ib_event->param.sidr_req_rcvd.sgid_attr; if (!sgid_attr) return NULL; rcu_read_lock(); ndev = rdma_read_gid_attr_ndev_rcu(sgid_attr); if (IS_ERR(ndev)) ndev = NULL; else dev_hold(ndev); rcu_read_unlock(); return ndev; } static struct net_device *cma_get_net_dev(const struct ib_cm_event *ib_event, struct cma_req_info *req) { struct sockaddr *listen_addr = (struct sockaddr *)&req->listen_addr_storage; struct sockaddr *src_addr = (struct sockaddr *)&req->src_addr_storage; struct net_device *net_dev; const union ib_gid *gid = req->has_gid ? 
&req->local_gid : NULL; int err; err = cma_save_ip_info(listen_addr, src_addr, ib_event, req->service_id); if (err) return ERR_PTR(err); if (rdma_protocol_roce(req->device, req->port)) net_dev = roce_get_net_dev_by_cm_event(ib_event); else net_dev = ib_get_net_dev_by_params(req->device, req->port, req->pkey, gid, listen_addr); if (!net_dev) return ERR_PTR(-ENODEV); return net_dev; } static enum rdma_ucm_port_space rdma_ps_from_service_id(__be64 service_id) { return (be64_to_cpu(service_id) >> 16) & 0xffff; } static bool cma_match_private_data(struct rdma_id_private *id_priv, const struct cma_hdr *hdr) { struct sockaddr *addr = cma_src_addr(id_priv); __be32 ip4_addr; struct in6_addr ip6_addr; if (cma_any_addr(addr) && !id_priv->afonly) return true; switch (addr->sa_family) { case AF_INET: ip4_addr = ((struct sockaddr_in *)addr)->sin_addr.s_addr; if (cma_get_ip_ver(hdr) != 4) return false; if (!cma_any_addr(addr) && hdr->dst_addr.ip4.addr != ip4_addr) return false; break; case AF_INET6: ip6_addr = ((struct sockaddr_in6 *)addr)->sin6_addr; if (cma_get_ip_ver(hdr) != 6) return false; if (!cma_any_addr(addr) && memcmp(&hdr->dst_addr.ip6, &ip6_addr, sizeof(ip6_addr))) return false; break; case AF_IB: return true; default: return false; } return true; } static bool cma_protocol_roce(const struct rdma_cm_id *id) { struct ib_device *device = id->device; const u32 port_num = id->port_num ?: rdma_start_port(device); return rdma_protocol_roce(device, port_num); } static bool cma_is_req_ipv6_ll(const struct cma_req_info *req) { const struct sockaddr *daddr = (const struct sockaddr *)&req->listen_addr_storage; const struct sockaddr_in6 *daddr6 = (const struct sockaddr_in6 *)daddr; /* Returns true if the req is for IPv6 link local */ return (daddr->sa_family == AF_INET6 && (ipv6_addr_type(&daddr6->sin6_addr) & IPV6_ADDR_LINKLOCAL)); } static bool cma_match_net_dev(const struct rdma_cm_id *id, const struct net_device *net_dev, const struct cma_req_info *req) { const struct rdma_addr *addr = &id->route.addr; if (!net_dev) /* This request is an AF_IB request */ return (!id->port_num || id->port_num == req->port) && (addr->src_addr.ss_family == AF_IB); /* * If the request is not for IPv6 link local, allow the * request to match any netdevice of a single- or multi-port rdma device. */ if (!cma_is_req_ipv6_ll(req)) return true; /* * Net namespaces must match, and if the listener is listening * on a specific netdevice then the netdevice must match as well.
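 * (The !!bound_dev_if == (bound_dev_if == ifindex) test below reads as:
 * either the id is not bound to any netdevice at all, or it is bound to
 * exactly the netdevice the request arrived on.)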
*/ if (net_eq(dev_net(net_dev), addr->dev_addr.net) && (!!addr->dev_addr.bound_dev_if == (addr->dev_addr.bound_dev_if == net_dev->ifindex))) return true; else return false; } static struct rdma_id_private *cma_find_listener( const struct rdma_bind_list *bind_list, const struct ib_cm_id *cm_id, const struct ib_cm_event *ib_event, const struct cma_req_info *req, const struct net_device *net_dev) { struct rdma_id_private *id_priv, *id_priv_dev; lockdep_assert_held(&lock); if (!bind_list) return ERR_PTR(-EINVAL); hlist_for_each_entry(id_priv, &bind_list->owners, node) { if (cma_match_private_data(id_priv, ib_event->private_data)) { if (id_priv->id.device == cm_id->device && cma_match_net_dev(&id_priv->id, net_dev, req)) return id_priv; list_for_each_entry(id_priv_dev, &id_priv->listen_list, listen_item) { if (id_priv_dev->id.device == cm_id->device && cma_match_net_dev(&id_priv_dev->id, net_dev, req)) return id_priv_dev; } } } return ERR_PTR(-EINVAL); } static struct rdma_id_private * cma_ib_id_from_event(struct ib_cm_id *cm_id, const struct ib_cm_event *ib_event, struct cma_req_info *req, struct net_device **net_dev) { struct rdma_bind_list *bind_list; struct rdma_id_private *id_priv; int err; err = cma_save_req_info(ib_event, req); if (err) return ERR_PTR(err); *net_dev = cma_get_net_dev(ib_event, req); if (IS_ERR(*net_dev)) { if (PTR_ERR(*net_dev) == -EAFNOSUPPORT) { /* Assuming the protocol is AF_IB */ *net_dev = NULL; } else { return ERR_CAST(*net_dev); } } mutex_lock(&lock); /* * Net namespace might be getting deleted while route lookup, * cm_id lookup is in progress. Therefore, perform netdevice * validation, cm_id lookup under rcu lock. * RCU lock along with netdevice state check, synchronizes with * netdevice migrating to different net namespace and also avoids * case where net namespace doesn't get deleted while lookup is in * progress. * If the device state is not IFF_UP, its properties such as ifindex * and nd_net cannot be trusted to remain valid without rcu lock. * net/core/dev.c change_net_namespace() ensures to synchronize with * ongoing operations on net device after device is closed using * synchronize_net(). */ rcu_read_lock(); if (*net_dev) { /* * If netdevice is down, it is likely that it is administratively * down or it might be migrating to different namespace. * In that case avoid further processing, as the net namespace * or ifindex may change. */ if (((*net_dev)->flags & IFF_UP) == 0) { id_priv = ERR_PTR(-EHOSTUNREACH); goto err; } if (!validate_net_dev(*net_dev, (struct sockaddr *)&req->src_addr_storage, (struct sockaddr *)&req->listen_addr_storage)) { id_priv = ERR_PTR(-EHOSTUNREACH); goto err; } } bind_list = cma_ps_find(*net_dev ? dev_net(*net_dev) : &init_net, rdma_ps_from_service_id(req->service_id), cma_port_from_service_id(req->service_id)); id_priv = cma_find_listener(bind_list, cm_id, ib_event, req, *net_dev); err: rcu_read_unlock(); mutex_unlock(&lock); if (IS_ERR(id_priv) && *net_dev) { dev_put(*net_dev); *net_dev = NULL; } return id_priv; } static inline u8 cma_user_data_offset(struct rdma_id_private *id_priv) { return cma_family(id_priv) == AF_IB ? 
0 : sizeof(struct cma_hdr); } static void cma_cancel_route(struct rdma_id_private *id_priv) { if (rdma_cap_ib_sa(id_priv->id.device, id_priv->id.port_num)) { if (id_priv->query) ib_sa_cancel_query(id_priv->query_id, id_priv->query); } } static void _cma_cancel_listens(struct rdma_id_private *id_priv) { struct rdma_id_private *dev_id_priv; lockdep_assert_held(&lock); /* * Remove from listen_any_list to prevent added devices from spawning * additional listen requests. */ list_del_init(&id_priv->listen_any_item); while (!list_empty(&id_priv->listen_list)) { dev_id_priv = list_first_entry(&id_priv->listen_list, struct rdma_id_private, listen_item); /* sync with device removal to avoid duplicate destruction */ list_del_init(&dev_id_priv->device_item); list_del_init(&dev_id_priv->listen_item); mutex_unlock(&lock); rdma_destroy_id(&dev_id_priv->id); mutex_lock(&lock); } } static void cma_cancel_listens(struct rdma_id_private *id_priv) { mutex_lock(&lock); _cma_cancel_listens(id_priv); mutex_unlock(&lock); } static void cma_cancel_operation(struct rdma_id_private *id_priv, enum rdma_cm_state state) { switch (state) { case RDMA_CM_ADDR_QUERY: /* * We can avoid doing the rdma_addr_cancel() based on state, * only RDMA_CM_ADDR_QUERY has a work that could still execute. * Notice that the addr_handler work could still be exiting * outside this state, however due to the interaction with the * handler_mutex the work is guaranteed not to touch id_priv * during exit. */ rdma_addr_cancel(&id_priv->id.route.addr.dev_addr); break; case RDMA_CM_ROUTE_QUERY: cma_cancel_route(id_priv); break; case RDMA_CM_LISTEN: if (cma_any_addr(cma_src_addr(id_priv)) && !id_priv->cma_dev) cma_cancel_listens(id_priv); break; default: break; } } static void cma_release_port(struct rdma_id_private *id_priv) { struct rdma_bind_list *bind_list = id_priv->bind_list; struct net *net = id_priv->id.route.addr.dev_addr.net; if (!bind_list) return; mutex_lock(&lock); hlist_del(&id_priv->node); if (hlist_empty(&bind_list->owners)) { cma_ps_remove(net, bind_list->ps, bind_list->port); kfree(bind_list); } mutex_unlock(&lock); } static void destroy_mc(struct rdma_id_private *id_priv, struct cma_multicast *mc) { bool send_only = mc->join_state == BIT(SENDONLY_FULLMEMBER_JOIN); if (rdma_cap_ib_mcast(id_priv->id.device, id_priv->id.port_num)) ib_sa_free_multicast(mc->sa_mc); if (rdma_protocol_roce(id_priv->id.device, id_priv->id.port_num)) { struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; struct net_device *ndev = NULL; if (dev_addr->bound_dev_if) ndev = dev_get_by_index(dev_addr->net, dev_addr->bound_dev_if); if (ndev && !send_only) { enum ib_gid_type gid_type; union ib_gid mgid; gid_type = id_priv->cma_dev->default_gid_type [id_priv->id.port_num - rdma_start_port( id_priv->cma_dev->device)]; cma_iboe_set_mgid((struct sockaddr *)&mc->addr, &mgid, gid_type); cma_igmp_send(ndev, &mgid, false); } dev_put(ndev); cancel_work_sync(&mc->iboe_join.work); } kfree(mc); } static void cma_leave_mc_groups(struct rdma_id_private *id_priv) { struct cma_multicast *mc; while (!list_empty(&id_priv->mc_list)) { mc = list_first_entry(&id_priv->mc_list, struct cma_multicast, list); list_del(&mc->list); destroy_mc(id_priv, mc); } } static void _destroy_id(struct rdma_id_private *id_priv, enum rdma_cm_state state) { cma_cancel_operation(id_priv, state); rdma_restrack_del(&id_priv->res); cma_remove_id_from_tree(id_priv); if (id_priv->cma_dev) { if (rdma_cap_ib_cm(id_priv->id.device, 1)) { if (id_priv->cm_id.ib) ib_destroy_cm_id(id_priv->cm_id.ib); } 
else if (rdma_cap_iw_cm(id_priv->id.device, 1)) { if (id_priv->cm_id.iw) iw_destroy_cm_id(id_priv->cm_id.iw); } cma_leave_mc_groups(id_priv); cma_release_dev(id_priv); } cma_release_port(id_priv); cma_id_put(id_priv); wait_for_completion(&id_priv->comp); if (id_priv->internal_id) cma_id_put(id_priv->id.context); kfree(id_priv->id.route.path_rec); kfree(id_priv->id.route.path_rec_inbound); kfree(id_priv->id.route.path_rec_outbound); put_net(id_priv->id.route.addr.dev_addr.net); kfree(id_priv); } /* * Destroy an ID from within the handler_mutex. This ensures that no other * handlers can start running concurrently. */ static void destroy_id_handler_unlock(struct rdma_id_private *id_priv) __releases(&id_priv->handler_mutex) { enum rdma_cm_state state; unsigned long flags; trace_cm_id_destroy(id_priv); /* * Setting the state to destroyed under the handler mutex provides a * fence against calling handler callbacks. If this is invoked due to * the failure of a handler callback then it guarantees that no future * handlers will be called. */ lockdep_assert_held(&id_priv->handler_mutex); spin_lock_irqsave(&id_priv->lock, flags); state = id_priv->state; id_priv->state = RDMA_CM_DESTROYING; spin_unlock_irqrestore(&id_priv->lock, flags); mutex_unlock(&id_priv->handler_mutex); _destroy_id(id_priv, state); } void rdma_destroy_id(struct rdma_cm_id *id) { struct rdma_id_private *id_priv = container_of(id, struct rdma_id_private, id); mutex_lock(&id_priv->handler_mutex); destroy_id_handler_unlock(id_priv); } EXPORT_SYMBOL(rdma_destroy_id); static int cma_rep_recv(struct rdma_id_private *id_priv) { int ret; ret = cma_modify_qp_rtr(id_priv, NULL); if (ret) goto reject; ret = cma_modify_qp_rts(id_priv, NULL); if (ret) goto reject; trace_cm_send_rtu(id_priv); ret = ib_send_cm_rtu(id_priv->cm_id.ib, NULL, 0); if (ret) goto reject; return 0; reject: pr_debug_ratelimited("RDMA CM: CONNECT_ERROR: failed to handle reply. 
status %d\n", ret); cma_modify_qp_err(id_priv); trace_cm_send_rej(id_priv); ib_send_cm_rej(id_priv->cm_id.ib, IB_CM_REJ_CONSUMER_DEFINED, NULL, 0, NULL, 0); return ret; } static void cma_set_rep_event_data(struct rdma_cm_event *event, const struct ib_cm_rep_event_param *rep_data, void *private_data) { event->param.conn.private_data = private_data; event->param.conn.private_data_len = IB_CM_REP_PRIVATE_DATA_SIZE; event->param.conn.responder_resources = rep_data->responder_resources; event->param.conn.initiator_depth = rep_data->initiator_depth; event->param.conn.flow_control = rep_data->flow_control; event->param.conn.rnr_retry_count = rep_data->rnr_retry_count; event->param.conn.srq = rep_data->srq; event->param.conn.qp_num = rep_data->remote_qpn; event->ece.vendor_id = rep_data->ece.vendor_id; event->ece.attr_mod = rep_data->ece.attr_mod; } static int cma_cm_event_handler(struct rdma_id_private *id_priv, struct rdma_cm_event *event) { int ret; lockdep_assert_held(&id_priv->handler_mutex); trace_cm_event_handler(id_priv, event); ret = id_priv->id.event_handler(&id_priv->id, event); trace_cm_event_done(id_priv, event, ret); return ret; } static int cma_ib_handler(struct ib_cm_id *cm_id, const struct ib_cm_event *ib_event) { struct rdma_id_private *id_priv = cm_id->context; struct rdma_cm_event event = {}; enum rdma_cm_state state; int ret; mutex_lock(&id_priv->handler_mutex); state = READ_ONCE(id_priv->state); if ((ib_event->event != IB_CM_TIMEWAIT_EXIT && state != RDMA_CM_CONNECT) || (ib_event->event == IB_CM_TIMEWAIT_EXIT && state != RDMA_CM_DISCONNECT)) goto out; switch (ib_event->event) { case IB_CM_REQ_ERROR: case IB_CM_REP_ERROR: event.event = RDMA_CM_EVENT_UNREACHABLE; event.status = -ETIMEDOUT; break; case IB_CM_REP_RECEIVED: if (state == RDMA_CM_CONNECT && (id_priv->id.qp_type != IB_QPT_UD)) { trace_cm_send_mra(id_priv); ib_send_cm_mra(cm_id, CMA_CM_MRA_SETTING, NULL, 0); } if (id_priv->id.qp) { event.status = cma_rep_recv(id_priv); event.event = event.status ? RDMA_CM_EVENT_CONNECT_ERROR : RDMA_CM_EVENT_ESTABLISHED; } else { event.event = RDMA_CM_EVENT_CONNECT_RESPONSE; } cma_set_rep_event_data(&event, &ib_event->param.rep_rcvd, ib_event->private_data); break; case IB_CM_RTU_RECEIVED: case IB_CM_USER_ESTABLISHED: event.event = RDMA_CM_EVENT_ESTABLISHED; break; case IB_CM_DREQ_ERROR: event.status = -ETIMEDOUT; fallthrough; case IB_CM_DREQ_RECEIVED: case IB_CM_DREP_RECEIVED: if (!cma_comp_exch(id_priv, RDMA_CM_CONNECT, RDMA_CM_DISCONNECT)) goto out; event.event = RDMA_CM_EVENT_DISCONNECTED; break; case IB_CM_TIMEWAIT_EXIT: event.event = RDMA_CM_EVENT_TIMEWAIT_EXIT; break; case IB_CM_MRA_RECEIVED: /* ignore event */ goto out; case IB_CM_REJ_RECEIVED: pr_debug_ratelimited("RDMA CM: REJECTED: %s\n", rdma_reject_msg(&id_priv->id, ib_event->param.rej_rcvd.reason)); cma_modify_qp_err(id_priv); event.status = ib_event->param.rej_rcvd.reason; event.event = RDMA_CM_EVENT_REJECTED; event.param.conn.private_data = ib_event->private_data; event.param.conn.private_data_len = IB_CM_REJ_PRIVATE_DATA_SIZE; break; default: pr_err("RDMA CMA: unexpected IB CM event: %d\n", ib_event->event); goto out; } ret = cma_cm_event_handler(id_priv, &event); if (ret) { /* Destroy the CM ID by returning a non-zero value. 
*/ id_priv->cm_id.ib = NULL; destroy_id_handler_unlock(id_priv); return ret; } out: mutex_unlock(&id_priv->handler_mutex); return 0; } static struct rdma_id_private * cma_ib_new_conn_id(const struct rdma_cm_id *listen_id, const struct ib_cm_event *ib_event, struct net_device *net_dev) { struct rdma_id_private *listen_id_priv; struct rdma_id_private *id_priv; struct rdma_cm_id *id; struct rdma_route *rt; const sa_family_t ss_family = listen_id->route.addr.src_addr.ss_family; struct sa_path_rec *path = ib_event->param.req_rcvd.primary_path; const __be64 service_id = ib_event->param.req_rcvd.primary_path->service_id; int ret; listen_id_priv = container_of(listen_id, struct rdma_id_private, id); id_priv = __rdma_create_id(listen_id->route.addr.dev_addr.net, listen_id->event_handler, listen_id->context, listen_id->ps, ib_event->param.req_rcvd.qp_type, listen_id_priv); if (IS_ERR(id_priv)) return NULL; id = &id_priv->id; if (cma_save_net_info((struct sockaddr *)&id->route.addr.src_addr, (struct sockaddr *)&id->route.addr.dst_addr, listen_id, ib_event, ss_family, service_id)) goto err; rt = &id->route; rt->num_pri_alt_paths = ib_event->param.req_rcvd.alternate_path ? 2 : 1; rt->path_rec = kmalloc_array(rt->num_pri_alt_paths, sizeof(*rt->path_rec), GFP_KERNEL); if (!rt->path_rec) goto err; rt->path_rec[0] = *path; if (rt->num_pri_alt_paths == 2) rt->path_rec[1] = *ib_event->param.req_rcvd.alternate_path; if (net_dev) { rdma_copy_src_l2_addr(&rt->addr.dev_addr, net_dev); } else { if (!cma_protocol_roce(listen_id) && cma_any_addr(cma_src_addr(id_priv))) { rt->addr.dev_addr.dev_type = ARPHRD_INFINIBAND; rdma_addr_set_sgid(&rt->addr.dev_addr, &rt->path_rec[0].sgid); ib_addr_set_pkey(&rt->addr.dev_addr, be16_to_cpu(rt->path_rec[0].pkey)); } else if (!cma_any_addr(cma_src_addr(id_priv))) { ret = cma_translate_addr(cma_src_addr(id_priv), &rt->addr.dev_addr); if (ret) goto err; } } rdma_addr_set_dgid(&rt->addr.dev_addr, &rt->path_rec[0].dgid); id_priv->state = RDMA_CM_CONNECT; return id_priv; err: rdma_destroy_id(id); return NULL; } static struct rdma_id_private * cma_ib_new_udp_id(const struct rdma_cm_id *listen_id, const struct ib_cm_event *ib_event, struct net_device *net_dev) { const struct rdma_id_private *listen_id_priv; struct rdma_id_private *id_priv; struct rdma_cm_id *id; const sa_family_t ss_family = listen_id->route.addr.src_addr.ss_family; struct net *net = listen_id->route.addr.dev_addr.net; int ret; listen_id_priv = container_of(listen_id, struct rdma_id_private, id); id_priv = __rdma_create_id(net, listen_id->event_handler, listen_id->context, listen_id->ps, IB_QPT_UD, listen_id_priv); if (IS_ERR(id_priv)) return NULL; id = &id_priv->id; if (cma_save_net_info((struct sockaddr *)&id->route.addr.src_addr, (struct sockaddr *)&id->route.addr.dst_addr, listen_id, ib_event, ss_family, ib_event->param.sidr_req_rcvd.service_id)) goto err; if (net_dev) { rdma_copy_src_l2_addr(&id->route.addr.dev_addr, net_dev); } else { if (!cma_any_addr(cma_src_addr(id_priv))) { ret = cma_translate_addr(cma_src_addr(id_priv), &id->route.addr.dev_addr); if (ret) goto err; } } id_priv->state = RDMA_CM_CONNECT; return id_priv; err: rdma_destroy_id(id); return NULL; } static void cma_set_req_event_data(struct rdma_cm_event *event, const struct ib_cm_req_event_param *req_data, void *private_data, int offset) { event->param.conn.private_data = private_data + offset; event->param.conn.private_data_len = IB_CM_REQ_PRIVATE_DATA_SIZE - offset; event->param.conn.responder_resources = req_data->responder_resources; 
event->param.conn.initiator_depth = req_data->initiator_depth; event->param.conn.flow_control = req_data->flow_control; event->param.conn.retry_count = req_data->retry_count; event->param.conn.rnr_retry_count = req_data->rnr_retry_count; event->param.conn.srq = req_data->srq; event->param.conn.qp_num = req_data->remote_qpn; event->ece.vendor_id = req_data->ece.vendor_id; event->ece.attr_mod = req_data->ece.attr_mod; } static int cma_ib_check_req_qp_type(const struct rdma_cm_id *id, const struct ib_cm_event *ib_event) { return (((ib_event->event == IB_CM_REQ_RECEIVED) && (ib_event->param.req_rcvd.qp_type == id->qp_type)) || ((ib_event->event == IB_CM_SIDR_REQ_RECEIVED) && (id->qp_type == IB_QPT_UD)) || (!id->qp_type)); } static int cma_ib_req_handler(struct ib_cm_id *cm_id, const struct ib_cm_event *ib_event) { struct rdma_id_private *listen_id, *conn_id = NULL; struct rdma_cm_event event = {}; struct cma_req_info req = {}; struct net_device *net_dev; u8 offset; int ret; listen_id = cma_ib_id_from_event(cm_id, ib_event, &req, &net_dev); if (IS_ERR(listen_id)) return PTR_ERR(listen_id); trace_cm_req_handler(listen_id, ib_event->event); if (!cma_ib_check_req_qp_type(&listen_id->id, ib_event)) { ret = -EINVAL; goto net_dev_put; } mutex_lock(&listen_id->handler_mutex); if (READ_ONCE(listen_id->state) != RDMA_CM_LISTEN) { ret = -ECONNABORTED; goto err_unlock; } offset = cma_user_data_offset(listen_id); event.event = RDMA_CM_EVENT_CONNECT_REQUEST; if (ib_event->event == IB_CM_SIDR_REQ_RECEIVED) { conn_id = cma_ib_new_udp_id(&listen_id->id, ib_event, net_dev); event.param.ud.private_data = ib_event->private_data + offset; event.param.ud.private_data_len = IB_CM_SIDR_REQ_PRIVATE_DATA_SIZE - offset; } else { conn_id = cma_ib_new_conn_id(&listen_id->id, ib_event, net_dev); cma_set_req_event_data(&event, &ib_event->param.req_rcvd, ib_event->private_data, offset); } if (!conn_id) { ret = -ENOMEM; goto err_unlock; } mutex_lock_nested(&conn_id->handler_mutex, SINGLE_DEPTH_NESTING); ret = cma_ib_acquire_dev(conn_id, listen_id, &req); if (ret) { destroy_id_handler_unlock(conn_id); goto err_unlock; } conn_id->cm_id.ib = cm_id; cm_id->context = conn_id; cm_id->cm_handler = cma_ib_handler; ret = cma_cm_event_handler(conn_id, &event); if (ret) { /* Destroy the CM ID by returning a non-zero value. 
*/ conn_id->cm_id.ib = NULL; mutex_unlock(&listen_id->handler_mutex); destroy_id_handler_unlock(conn_id); goto net_dev_put; } if (READ_ONCE(conn_id->state) == RDMA_CM_CONNECT && conn_id->id.qp_type != IB_QPT_UD) { trace_cm_send_mra(cm_id->context); ib_send_cm_mra(cm_id, CMA_CM_MRA_SETTING, NULL, 0); } mutex_unlock(&conn_id->handler_mutex); err_unlock: mutex_unlock(&listen_id->handler_mutex); net_dev_put: dev_put(net_dev); return ret; } __be64 rdma_get_service_id(struct rdma_cm_id *id, struct sockaddr *addr) { if (addr->sa_family == AF_IB) return ((struct sockaddr_ib *) addr)->sib_sid; return cpu_to_be64(((u64)id->ps << 16) + be16_to_cpu(cma_port(addr))); } EXPORT_SYMBOL(rdma_get_service_id); void rdma_read_gids(struct rdma_cm_id *cm_id, union ib_gid *sgid, union ib_gid *dgid) { struct rdma_addr *addr = &cm_id->route.addr; if (!cm_id->device) { if (sgid) memset(sgid, 0, sizeof(*sgid)); if (dgid) memset(dgid, 0, sizeof(*dgid)); return; } if (rdma_protocol_roce(cm_id->device, cm_id->port_num)) { if (sgid) rdma_ip2gid((struct sockaddr *)&addr->src_addr, sgid); if (dgid) rdma_ip2gid((struct sockaddr *)&addr->dst_addr, dgid); } else { if (sgid) rdma_addr_get_sgid(&addr->dev_addr, sgid); if (dgid) rdma_addr_get_dgid(&addr->dev_addr, dgid); } } EXPORT_SYMBOL(rdma_read_gids); static int cma_iw_handler(struct iw_cm_id *iw_id, struct iw_cm_event *iw_event) { struct rdma_id_private *id_priv = iw_id->context; struct rdma_cm_event event = {}; int ret = 0; struct sockaddr *laddr = (struct sockaddr *)&iw_event->local_addr; struct sockaddr *raddr = (struct sockaddr *)&iw_event->remote_addr; mutex_lock(&id_priv->handler_mutex); if (READ_ONCE(id_priv->state) != RDMA_CM_CONNECT) goto out; switch (iw_event->event) { case IW_CM_EVENT_CLOSE: event.event = RDMA_CM_EVENT_DISCONNECTED; break; case IW_CM_EVENT_CONNECT_REPLY: memcpy(cma_src_addr(id_priv), laddr, rdma_addr_size(laddr)); memcpy(cma_dst_addr(id_priv), raddr, rdma_addr_size(raddr)); switch (iw_event->status) { case 0: event.event = RDMA_CM_EVENT_ESTABLISHED; event.param.conn.initiator_depth = iw_event->ird; event.param.conn.responder_resources = iw_event->ord; break; case -ECONNRESET: case -ECONNREFUSED: event.event = RDMA_CM_EVENT_REJECTED; break; case -ETIMEDOUT: event.event = RDMA_CM_EVENT_UNREACHABLE; break; default: event.event = RDMA_CM_EVENT_CONNECT_ERROR; break; } break; case IW_CM_EVENT_ESTABLISHED: event.event = RDMA_CM_EVENT_ESTABLISHED; event.param.conn.initiator_depth = iw_event->ird; event.param.conn.responder_resources = iw_event->ord; break; default: goto out; } event.status = iw_event->status; event.param.conn.private_data = iw_event->private_data; event.param.conn.private_data_len = iw_event->private_data_len; ret = cma_cm_event_handler(id_priv, &event); if (ret) { /* Destroy the CM ID by returning a non-zero value. 
*/ id_priv->cm_id.iw = NULL; destroy_id_handler_unlock(id_priv); return ret; } out: mutex_unlock(&id_priv->handler_mutex); return ret; } static int iw_conn_req_handler(struct iw_cm_id *cm_id, struct iw_cm_event *iw_event) { struct rdma_id_private *listen_id, *conn_id; struct rdma_cm_event event = {}; int ret = -ECONNABORTED; struct sockaddr *laddr = (struct sockaddr *)&iw_event->local_addr; struct sockaddr *raddr = (struct sockaddr *)&iw_event->remote_addr; event.event = RDMA_CM_EVENT_CONNECT_REQUEST; event.param.conn.private_data = iw_event->private_data; event.param.conn.private_data_len = iw_event->private_data_len; event.param.conn.initiator_depth = iw_event->ird; event.param.conn.responder_resources = iw_event->ord; listen_id = cm_id->context; mutex_lock(&listen_id->handler_mutex); if (READ_ONCE(listen_id->state) != RDMA_CM_LISTEN) goto out; /* Create a new RDMA id for the new IW CM ID */ conn_id = __rdma_create_id(listen_id->id.route.addr.dev_addr.net, listen_id->id.event_handler, listen_id->id.context, RDMA_PS_TCP, IB_QPT_RC, listen_id); if (IS_ERR(conn_id)) { ret = -ENOMEM; goto out; } mutex_lock_nested(&conn_id->handler_mutex, SINGLE_DEPTH_NESTING); conn_id->state = RDMA_CM_CONNECT; ret = rdma_translate_ip(laddr, &conn_id->id.route.addr.dev_addr); if (ret) { mutex_unlock(&listen_id->handler_mutex); destroy_id_handler_unlock(conn_id); return ret; } ret = cma_iw_acquire_dev(conn_id, listen_id); if (ret) { mutex_unlock(&listen_id->handler_mutex); destroy_id_handler_unlock(conn_id); return ret; } conn_id->cm_id.iw = cm_id; cm_id->context = conn_id; cm_id->cm_handler = cma_iw_handler; memcpy(cma_src_addr(conn_id), laddr, rdma_addr_size(laddr)); memcpy(cma_dst_addr(conn_id), raddr, rdma_addr_size(raddr)); ret = cma_cm_event_handler(conn_id, &event); if (ret) { /* User wants to destroy the CM ID */ conn_id->cm_id.iw = NULL; mutex_unlock(&listen_id->handler_mutex); destroy_id_handler_unlock(conn_id); return ret; } mutex_unlock(&conn_id->handler_mutex); out: mutex_unlock(&listen_id->handler_mutex); return ret; } static int cma_ib_listen(struct rdma_id_private *id_priv) { struct sockaddr *addr; struct ib_cm_id *id; __be64 svc_id; addr = cma_src_addr(id_priv); svc_id = rdma_get_service_id(&id_priv->id, addr); id = ib_cm_insert_listen(id_priv->id.device, cma_ib_req_handler, svc_id); if (IS_ERR(id)) return PTR_ERR(id); id_priv->cm_id.ib = id; return 0; } static int cma_iw_listen(struct rdma_id_private *id_priv, int backlog) { int ret; struct iw_cm_id *id; id = iw_create_cm_id(id_priv->id.device, iw_conn_req_handler, id_priv); if (IS_ERR(id)) return PTR_ERR(id); mutex_lock(&id_priv->qp_mutex); id->tos = id_priv->tos; id->tos_set = id_priv->tos_set; mutex_unlock(&id_priv->qp_mutex); id->afonly = id_priv->afonly; id_priv->cm_id.iw = id; memcpy(&id_priv->cm_id.iw->local_addr, cma_src_addr(id_priv), rdma_addr_size(cma_src_addr(id_priv))); ret = iw_cm_listen(id_priv->cm_id.iw, backlog); if (ret) { iw_destroy_cm_id(id_priv->cm_id.iw); id_priv->cm_id.iw = NULL; } return ret; } static int cma_listen_handler(struct rdma_cm_id *id, struct rdma_cm_event *event) { struct rdma_id_private *id_priv = id->context; /* Listening IDs are always destroyed on removal */ if (event->event == RDMA_CM_EVENT_DEVICE_REMOVAL) return -1; id->context = id_priv->id.context; id->event_handler = id_priv->id.event_handler; trace_cm_event_handler(id_priv, event); return id_priv->id.event_handler(id, event); } static int cma_listen_on_dev(struct rdma_id_private *id_priv, struct cma_device *cma_dev, struct rdma_id_private 
**to_destroy) { struct rdma_id_private *dev_id_priv; struct net *net = id_priv->id.route.addr.dev_addr.net; int ret; lockdep_assert_held(&lock); *to_destroy = NULL; if (cma_family(id_priv) == AF_IB && !rdma_cap_ib_cm(cma_dev->device, 1)) return 0; dev_id_priv = __rdma_create_id(net, cma_listen_handler, id_priv, id_priv->id.ps, id_priv->id.qp_type, id_priv); if (IS_ERR(dev_id_priv)) return PTR_ERR(dev_id_priv); dev_id_priv->state = RDMA_CM_ADDR_BOUND; memcpy(cma_src_addr(dev_id_priv), cma_src_addr(id_priv), rdma_addr_size(cma_src_addr(id_priv))); _cma_attach_to_dev(dev_id_priv, cma_dev); rdma_restrack_add(&dev_id_priv->res); cma_id_get(id_priv); dev_id_priv->internal_id = 1; dev_id_priv->afonly = id_priv->afonly; mutex_lock(&id_priv->qp_mutex); dev_id_priv->tos_set = id_priv->tos_set; dev_id_priv->tos = id_priv->tos; mutex_unlock(&id_priv->qp_mutex); ret = rdma_listen(&dev_id_priv->id, id_priv->backlog); if (ret) goto err_listen; list_add_tail(&dev_id_priv->listen_item, &id_priv->listen_list); return 0; err_listen: /* Caller must destroy this after releasing lock */ *to_destroy = dev_id_priv; dev_warn(&cma_dev->device->dev, "RDMA CMA: %s, error %d\n", __func__, ret); return ret; } static int cma_listen_on_all(struct rdma_id_private *id_priv) { struct rdma_id_private *to_destroy; struct cma_device *cma_dev; int ret; mutex_lock(&lock); list_add_tail(&id_priv->listen_any_item, &listen_any_list); list_for_each_entry(cma_dev, &dev_list, list) { ret = cma_listen_on_dev(id_priv, cma_dev, &to_destroy); if (ret) { /* Prevent racing with cma_process_remove() */ if (to_destroy) list_del_init(&to_destroy->device_item); goto err_listen; } } mutex_unlock(&lock); return 0; err_listen: _cma_cancel_listens(id_priv); mutex_unlock(&lock); if (to_destroy) rdma_destroy_id(&to_destroy->id); return ret; } void rdma_set_service_type(struct rdma_cm_id *id, int tos) { struct rdma_id_private *id_priv; id_priv = container_of(id, struct rdma_id_private, id); mutex_lock(&id_priv->qp_mutex); id_priv->tos = (u8) tos; id_priv->tos_set = true; mutex_unlock(&id_priv->qp_mutex); } EXPORT_SYMBOL(rdma_set_service_type); /** * rdma_set_ack_timeout() - Set the ack timeout of QP associated * with a connection identifier. * @id: Communication identifier to associated with service type. * @timeout: Ack timeout to set a QP, expressed as 4.096 * 2^(timeout) usec. * * This function should be called before rdma_connect() on active side, * and on passive side before rdma_accept(). It is applicable to primary * path only. The timeout will affect the local side of the QP, it is not * negotiated with remote side and zero disables the timer. In case it is * set before rdma_resolve_route, the value will also be used to determine * PacketLifeTime for RoCE. * * Return: 0 for success */ int rdma_set_ack_timeout(struct rdma_cm_id *id, u8 timeout) { struct rdma_id_private *id_priv; if (id->qp_type != IB_QPT_RC && id->qp_type != IB_QPT_XRC_INI) return -EINVAL; id_priv = container_of(id, struct rdma_id_private, id); mutex_lock(&id_priv->qp_mutex); id_priv->timeout = timeout; id_priv->timeout_set = true; mutex_unlock(&id_priv->qp_mutex); return 0; } EXPORT_SYMBOL(rdma_set_ack_timeout); /** * rdma_set_min_rnr_timer() - Set the minimum RNR Retry timer of the * QP associated with a connection identifier. * @id: Communication identifier to associated with service type. * @min_rnr_timer: 5-bit value encoded as Table 45: "Encoding for RNR NAK * Timer Field" in the IBTA specification. 
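 * (For illustration, a few encodings from that table: 0 -> 655.36 ms, which
 * is the default, 1 -> 0.01 ms and 31 -> 491.52 ms; the encoding is not
 * linear in the field value.)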
 * * This function should be called before rdma_connect() on active * side, and on passive side before rdma_accept(). The timer value * will be associated with the local QP. When it receives a send it is * not ready to handle, typically because the receive queue is empty, an RNR * Retry NAK is returned to the requester with the min_rnr_timer * encoded. The requester will then wait at least the time specified * in the NAK before retrying. The default is zero, which translates * to a minimum RNR Timer value of 655 ms. * * Return: 0 for success */ int rdma_set_min_rnr_timer(struct rdma_cm_id *id, u8 min_rnr_timer) { struct rdma_id_private *id_priv; /* It is a five-bit value */ if (min_rnr_timer & 0xe0) return -EINVAL; if (WARN_ON(id->qp_type != IB_QPT_RC && id->qp_type != IB_QPT_XRC_TGT)) return -EINVAL; id_priv = container_of(id, struct rdma_id_private, id); mutex_lock(&id_priv->qp_mutex); id_priv->min_rnr_timer = min_rnr_timer; id_priv->min_rnr_timer_set = true; mutex_unlock(&id_priv->qp_mutex); return 0; } EXPORT_SYMBOL(rdma_set_min_rnr_timer); static int route_set_path_rec_inbound(struct cma_work *work, struct sa_path_rec *path_rec) { struct rdma_route *route = &work->id->id.route; if (!route->path_rec_inbound) { route->path_rec_inbound = kzalloc(sizeof(*route->path_rec_inbound), GFP_KERNEL); if (!route->path_rec_inbound) return -ENOMEM; } *route->path_rec_inbound = *path_rec; return 0; } static int route_set_path_rec_outbound(struct cma_work *work, struct sa_path_rec *path_rec) { struct rdma_route *route = &work->id->id.route; if (!route->path_rec_outbound) { route->path_rec_outbound = kzalloc(sizeof(*route->path_rec_outbound), GFP_KERNEL); if (!route->path_rec_outbound) return -ENOMEM; } *route->path_rec_outbound = *path_rec; return 0; } static void cma_query_handler(int status, struct sa_path_rec *path_rec, unsigned int num_prs, void *context) { struct cma_work *work = context; struct rdma_route *route; int i; route = &work->id->id.route; if (status) goto fail; for (i = 0; i < num_prs; i++) { if (!path_rec[i].flags || (path_rec[i].flags & IB_PATH_GMP)) *route->path_rec = path_rec[i]; else if (path_rec[i].flags & IB_PATH_INBOUND) status = route_set_path_rec_inbound(work, &path_rec[i]); else if (path_rec[i].flags & IB_PATH_OUTBOUND) status = route_set_path_rec_outbound(work, &path_rec[i]); else status = -EINVAL; if (status) goto fail; } route->num_pri_alt_paths = 1; queue_work(cma_wq, &work->work); return; fail: work->old_state = RDMA_CM_ROUTE_QUERY; work->new_state = RDMA_CM_ADDR_RESOLVED; work->event.event = RDMA_CM_EVENT_ROUTE_ERROR; work->event.status = status; pr_debug_ratelimited("RDMA CM: ROUTE_ERROR: failed to query path. 
status %d\n", status); queue_work(cma_wq, &work->work); } static int cma_query_ib_route(struct rdma_id_private *id_priv, unsigned long timeout_ms, struct cma_work *work) { struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; struct sa_path_rec path_rec; ib_sa_comp_mask comp_mask; struct sockaddr_in6 *sin6; struct sockaddr_ib *sib; memset(&path_rec, 0, sizeof path_rec); if (rdma_cap_opa_ah(id_priv->id.device, id_priv->id.port_num)) path_rec.rec_type = SA_PATH_REC_TYPE_OPA; else path_rec.rec_type = SA_PATH_REC_TYPE_IB; rdma_addr_get_sgid(dev_addr, &path_rec.sgid); rdma_addr_get_dgid(dev_addr, &path_rec.dgid); path_rec.pkey = cpu_to_be16(ib_addr_get_pkey(dev_addr)); path_rec.numb_path = 1; path_rec.reversible = 1; path_rec.service_id = rdma_get_service_id(&id_priv->id, cma_dst_addr(id_priv)); comp_mask = IB_SA_PATH_REC_DGID | IB_SA_PATH_REC_SGID | IB_SA_PATH_REC_PKEY | IB_SA_PATH_REC_NUMB_PATH | IB_SA_PATH_REC_REVERSIBLE | IB_SA_PATH_REC_SERVICE_ID; switch (cma_family(id_priv)) { case AF_INET: path_rec.qos_class = cpu_to_be16((u16) id_priv->tos); comp_mask |= IB_SA_PATH_REC_QOS_CLASS; break; case AF_INET6: sin6 = (struct sockaddr_in6 *) cma_src_addr(id_priv); path_rec.traffic_class = (u8) (be32_to_cpu(sin6->sin6_flowinfo) >> 20); comp_mask |= IB_SA_PATH_REC_TRAFFIC_CLASS; break; case AF_IB: sib = (struct sockaddr_ib *) cma_src_addr(id_priv); path_rec.traffic_class = (u8) (be32_to_cpu(sib->sib_flowinfo) >> 20); comp_mask |= IB_SA_PATH_REC_TRAFFIC_CLASS; break; } id_priv->query_id = ib_sa_path_rec_get(&sa_client, id_priv->id.device, id_priv->id.port_num, &path_rec, comp_mask, timeout_ms, GFP_KERNEL, cma_query_handler, work, &id_priv->query); return (id_priv->query_id < 0) ? id_priv->query_id : 0; } static void cma_iboe_join_work_handler(struct work_struct *work) { struct cma_multicast *mc = container_of(work, struct cma_multicast, iboe_join.work); struct rdma_cm_event *event = &mc->iboe_join.event; struct rdma_id_private *id_priv = mc->id_priv; int ret; mutex_lock(&id_priv->handler_mutex); if (READ_ONCE(id_priv->state) == RDMA_CM_DESTROYING || READ_ONCE(id_priv->state) == RDMA_CM_DEVICE_REMOVAL) goto out_unlock; ret = cma_cm_event_handler(id_priv, event); WARN_ON(ret); out_unlock: mutex_unlock(&id_priv->handler_mutex); if (event->event == RDMA_CM_EVENT_MULTICAST_JOIN) rdma_destroy_ah_attr(&event->param.ud.ah_attr); } static void cma_work_handler(struct work_struct *_work) { struct cma_work *work = container_of(_work, struct cma_work, work); struct rdma_id_private *id_priv = work->id; mutex_lock(&id_priv->handler_mutex); if (READ_ONCE(id_priv->state) == RDMA_CM_DESTROYING || READ_ONCE(id_priv->state) == RDMA_CM_DEVICE_REMOVAL) goto out_unlock; if (work->old_state != 0 || work->new_state != 0) { if (!cma_comp_exch(id_priv, work->old_state, work->new_state)) goto out_unlock; } if (cma_cm_event_handler(id_priv, &work->event)) { cma_id_put(id_priv); destroy_id_handler_unlock(id_priv); goto out_free; } out_unlock: mutex_unlock(&id_priv->handler_mutex); cma_id_put(id_priv); out_free: if (work->event.event == RDMA_CM_EVENT_MULTICAST_JOIN) rdma_destroy_ah_attr(&work->event.param.ud.ah_attr); kfree(work); } static void cma_init_resolve_route_work(struct cma_work *work, struct rdma_id_private *id_priv) { work->id = id_priv; INIT_WORK(&work->work, cma_work_handler); work->old_state = RDMA_CM_ROUTE_QUERY; work->new_state = RDMA_CM_ROUTE_RESOLVED; work->event.event = RDMA_CM_EVENT_ROUTE_RESOLVED; } static void enqueue_resolve_addr_work(struct cma_work *work, struct rdma_id_private *id_priv) { 
/* Balances with cma_id_put() in cma_work_handler */ cma_id_get(id_priv); work->id = id_priv; INIT_WORK(&work->work, cma_work_handler); work->old_state = RDMA_CM_ADDR_QUERY; work->new_state = RDMA_CM_ADDR_RESOLVED; work->event.event = RDMA_CM_EVENT_ADDR_RESOLVED; queue_work(cma_wq, &work->work); } static int cma_resolve_ib_route(struct rdma_id_private *id_priv, unsigned long timeout_ms) { struct rdma_route *route = &id_priv->id.route; struct cma_work *work; int ret; work = kzalloc(sizeof *work, GFP_KERNEL); if (!work) return -ENOMEM; cma_init_resolve_route_work(work, id_priv); if (!route->path_rec) route->path_rec = kmalloc(sizeof *route->path_rec, GFP_KERNEL); if (!route->path_rec) { ret = -ENOMEM; goto err1; } ret = cma_query_ib_route(id_priv, timeout_ms, work); if (ret) goto err2; return 0; err2: kfree(route->path_rec); route->path_rec = NULL; err1: kfree(work); return ret; } static enum ib_gid_type cma_route_gid_type(enum rdma_network_type network_type, unsigned long supported_gids, enum ib_gid_type default_gid) { if ((network_type == RDMA_NETWORK_IPV4 || network_type == RDMA_NETWORK_IPV6) && test_bit(IB_GID_TYPE_ROCE_UDP_ENCAP, &supported_gids)) return IB_GID_TYPE_ROCE_UDP_ENCAP; return default_gid; } /* * cma_iboe_set_path_rec_l2_fields() is helper function which sets * path record type based on GID type. * It also sets up other L2 fields which includes destination mac address * netdev ifindex, of the path record. * It returns the netdev of the bound interface for this path record entry. */ static struct net_device * cma_iboe_set_path_rec_l2_fields(struct rdma_id_private *id_priv) { struct rdma_route *route = &id_priv->id.route; enum ib_gid_type gid_type = IB_GID_TYPE_ROCE; struct rdma_addr *addr = &route->addr; unsigned long supported_gids; struct net_device *ndev; if (!addr->dev_addr.bound_dev_if) return NULL; ndev = dev_get_by_index(addr->dev_addr.net, addr->dev_addr.bound_dev_if); if (!ndev) return NULL; supported_gids = roce_gid_type_mask_support(id_priv->id.device, id_priv->id.port_num); gid_type = cma_route_gid_type(addr->dev_addr.network, supported_gids, id_priv->gid_type); /* Use the hint from IP Stack to select GID Type */ if (gid_type < ib_network_to_gid_type(addr->dev_addr.network)) gid_type = ib_network_to_gid_type(addr->dev_addr.network); route->path_rec->rec_type = sa_conv_gid_to_pathrec_type(gid_type); route->path_rec->roce.route_resolved = true; sa_path_set_dmac(route->path_rec, addr->dev_addr.dst_dev_addr); return ndev; } int rdma_set_ib_path(struct rdma_cm_id *id, struct sa_path_rec *path_rec) { struct rdma_id_private *id_priv; struct net_device *ndev; int ret; id_priv = container_of(id, struct rdma_id_private, id); if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_RESOLVED, RDMA_CM_ROUTE_RESOLVED)) return -EINVAL; id->route.path_rec = kmemdup(path_rec, sizeof(*path_rec), GFP_KERNEL); if (!id->route.path_rec) { ret = -ENOMEM; goto err; } if (rdma_protocol_roce(id->device, id->port_num)) { ndev = cma_iboe_set_path_rec_l2_fields(id_priv); if (!ndev) { ret = -ENODEV; goto err_free; } dev_put(ndev); } id->route.num_pri_alt_paths = 1; return 0; err_free: kfree(id->route.path_rec); id->route.path_rec = NULL; err: cma_comp_exch(id_priv, RDMA_CM_ROUTE_RESOLVED, RDMA_CM_ADDR_RESOLVED); return ret; } EXPORT_SYMBOL(rdma_set_ib_path); static int cma_resolve_iw_route(struct rdma_id_private *id_priv) { struct cma_work *work; work = kzalloc(sizeof *work, GFP_KERNEL); if (!work) return -ENOMEM; cma_init_resolve_route_work(work, id_priv); queue_work(cma_wq, &work->work); return 0; } static 
int get_vlan_ndev_tc(struct net_device *vlan_ndev, int prio) { struct net_device *dev; dev = vlan_dev_real_dev(vlan_ndev); if (dev->num_tc) return netdev_get_prio_tc_map(dev, prio); return (vlan_dev_get_egress_qos_mask(vlan_ndev, prio) & VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT; } struct iboe_prio_tc_map { int input_prio; int output_tc; bool found; }; static int get_lower_vlan_dev_tc(struct net_device *dev, struct netdev_nested_priv *priv) { struct iboe_prio_tc_map *map = (struct iboe_prio_tc_map *)priv->data; if (is_vlan_dev(dev)) map->output_tc = get_vlan_ndev_tc(dev, map->input_prio); else if (dev->num_tc) map->output_tc = netdev_get_prio_tc_map(dev, map->input_prio); else map->output_tc = 0; /* We are interested only in first level VLAN device, so always * return 1 to stop iterating over next level devices. */ map->found = true; return 1; } static int iboe_tos_to_sl(struct net_device *ndev, int tos) { struct iboe_prio_tc_map prio_tc_map = {}; int prio = rt_tos2priority(tos); struct netdev_nested_priv priv; /* If VLAN device, get it directly from the VLAN netdev */ if (is_vlan_dev(ndev)) return get_vlan_ndev_tc(ndev, prio); prio_tc_map.input_prio = prio; priv.data = (void *)&prio_tc_map; rcu_read_lock(); netdev_walk_all_lower_dev_rcu(ndev, get_lower_vlan_dev_tc, &priv); rcu_read_unlock(); /* If map is found from lower device, use it; Otherwise * continue with the current netdevice to get priority to tc map. */ if (prio_tc_map.found) return prio_tc_map.output_tc; else if (ndev->num_tc) return netdev_get_prio_tc_map(ndev, prio); else return 0; } static __be32 cma_get_roce_udp_flow_label(struct rdma_id_private *id_priv) { struct sockaddr_in6 *addr6; u16 dport, sport; u32 hash, fl; addr6 = (struct sockaddr_in6 *)cma_src_addr(id_priv); fl = be32_to_cpu(addr6->sin6_flowinfo) & IB_GRH_FLOWLABEL_MASK; if ((cma_family(id_priv) != AF_INET6) || !fl) { dport = be16_to_cpu(cma_port(cma_dst_addr(id_priv))); sport = be16_to_cpu(cma_port(cma_src_addr(id_priv))); hash = (u32)sport * 31 + dport; fl = hash & IB_GRH_FLOWLABEL_MASK; } return cpu_to_be32(fl); } static int cma_resolve_iboe_route(struct rdma_id_private *id_priv) { struct rdma_route *route = &id_priv->id.route; struct rdma_addr *addr = &route->addr; struct cma_work *work; int ret; struct net_device *ndev; u8 default_roce_tos = id_priv->cma_dev->default_roce_tos[id_priv->id.port_num - rdma_start_port(id_priv->cma_dev->device)]; u8 tos; mutex_lock(&id_priv->qp_mutex); tos = id_priv->tos_set ? 
id_priv->tos : default_roce_tos; mutex_unlock(&id_priv->qp_mutex); work = kzalloc(sizeof *work, GFP_KERNEL); if (!work) return -ENOMEM; route->path_rec = kzalloc(sizeof *route->path_rec, GFP_KERNEL); if (!route->path_rec) { ret = -ENOMEM; goto err1; } route->num_pri_alt_paths = 1; ndev = cma_iboe_set_path_rec_l2_fields(id_priv); if (!ndev) { ret = -ENODEV; goto err2; } rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.src_addr, &route->path_rec->sgid); rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.dst_addr, &route->path_rec->dgid); if (((struct sockaddr *)&id_priv->id.route.addr.dst_addr)->sa_family != AF_IB) /* TODO: get the hoplimit from the inet/inet6 device */ route->path_rec->hop_limit = addr->dev_addr.hoplimit; else route->path_rec->hop_limit = 1; route->path_rec->reversible = 1; route->path_rec->pkey = cpu_to_be16(0xffff); route->path_rec->mtu_selector = IB_SA_EQ; route->path_rec->sl = iboe_tos_to_sl(ndev, tos); route->path_rec->traffic_class = tos; route->path_rec->mtu = iboe_get_mtu(ndev->mtu); route->path_rec->rate_selector = IB_SA_EQ; route->path_rec->rate = IB_RATE_PORT_CURRENT; dev_put(ndev); route->path_rec->packet_life_time_selector = IB_SA_EQ; /* In case ACK timeout is set, use this value to calculate * PacketLifeTime. As per IBTA 12.7.34, * local ACK timeout = (2 * PacketLifeTime + Local CA’s ACK delay). * Assuming a negligible local ACK delay, we can use * PacketLifeTime = local ACK timeout/2 * as a reasonable approximation for RoCE networks. */ mutex_lock(&id_priv->qp_mutex); if (id_priv->timeout_set && id_priv->timeout) route->path_rec->packet_life_time = id_priv->timeout - 1; else route->path_rec->packet_life_time = CMA_IBOE_PACKET_LIFETIME; mutex_unlock(&id_priv->qp_mutex); if (!route->path_rec->mtu) { ret = -EINVAL; goto err2; } if (rdma_protocol_roce_udp_encap(id_priv->id.device, id_priv->id.port_num)) route->path_rec->flow_label = cma_get_roce_udp_flow_label(id_priv); cma_init_resolve_route_work(work, id_priv); queue_work(cma_wq, &work->work); return 0; err2: kfree(route->path_rec); route->path_rec = NULL; route->num_pri_alt_paths = 0; err1: kfree(work); return ret; } int rdma_resolve_route(struct rdma_cm_id *id, unsigned long timeout_ms) { struct rdma_id_private *id_priv; int ret; if (!timeout_ms) return -EINVAL; id_priv = container_of(id, struct rdma_id_private, id); if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_RESOLVED, RDMA_CM_ROUTE_QUERY)) return -EINVAL; cma_id_get(id_priv); if (rdma_cap_ib_sa(id->device, id->port_num)) ret = cma_resolve_ib_route(id_priv, timeout_ms); else if (rdma_protocol_roce(id->device, id->port_num)) { ret = cma_resolve_iboe_route(id_priv); if (!ret) cma_add_id_to_tree(id_priv); } else if (rdma_protocol_iwarp(id->device, id->port_num)) ret = cma_resolve_iw_route(id_priv); else ret = -ENOSYS; if (ret) goto err; return 0; err: cma_comp_exch(id_priv, RDMA_CM_ROUTE_QUERY, RDMA_CM_ADDR_RESOLVED); cma_id_put(id_priv); return ret; } EXPORT_SYMBOL(rdma_resolve_route); static void cma_set_loopback(struct sockaddr *addr) { switch (addr->sa_family) { case AF_INET: ((struct sockaddr_in *) addr)->sin_addr.s_addr = htonl(INADDR_LOOPBACK); break; case AF_INET6: ipv6_addr_set(&((struct sockaddr_in6 *) addr)->sin6_addr, 0, 0, 0, htonl(1)); break; default: ib_addr_set(&((struct sockaddr_ib *) addr)->sib_addr, 0, 0, 0, htonl(1)); break; } } static int cma_bind_loopback(struct rdma_id_private *id_priv) { struct cma_device *cma_dev, *cur_dev; union ib_gid gid; enum ib_port_state port_state; unsigned int p; u16 pkey; int ret; cma_dev = NULL; 
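/* Scan the global device list under "lock": prefer the first CM-capable device that has an ACTIVE port, otherwise fall back to port 1 of the first usable device. */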
mutex_lock(&lock); list_for_each_entry(cur_dev, &dev_list, list) { if (cma_family(id_priv) == AF_IB && !rdma_cap_ib_cm(cur_dev->device, 1)) continue; if (!cma_dev) cma_dev = cur_dev; rdma_for_each_port (cur_dev->device, p) { if (!ib_get_cached_port_state(cur_dev->device, p, &port_state) && port_state == IB_PORT_ACTIVE) { cma_dev = cur_dev; goto port_found; } } } if (!cma_dev) { ret = -ENODEV; goto out; } p = 1; port_found: ret = rdma_query_gid(cma_dev->device, p, 0, &gid); if (ret) goto out; ret = ib_get_cached_pkey(cma_dev->device, p, 0, &pkey); if (ret) goto out; id_priv->id.route.addr.dev_addr.dev_type = (rdma_protocol_ib(cma_dev->device, p)) ? ARPHRD_INFINIBAND : ARPHRD_ETHER; rdma_addr_set_sgid(&id_priv->id.route.addr.dev_addr, &gid); ib_addr_set_pkey(&id_priv->id.route.addr.dev_addr, pkey); id_priv->id.port_num = p; cma_attach_to_dev(id_priv, cma_dev); rdma_restrack_add(&id_priv->res); cma_set_loopback(cma_src_addr(id_priv)); out: mutex_unlock(&lock); return ret; } static void addr_handler(int status, struct sockaddr *src_addr, struct rdma_dev_addr *dev_addr, void *context) { struct rdma_id_private *id_priv = context; struct rdma_cm_event event = {}; struct sockaddr *addr; struct sockaddr_storage old_addr; mutex_lock(&id_priv->handler_mutex); if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_QUERY, RDMA_CM_ADDR_RESOLVED)) goto out; /* * Store the previous src address, so that if we fail to acquire * matching rdma device, old address can be restored back, which helps * to cancel the cma listen operation correctly. */ addr = cma_src_addr(id_priv); memcpy(&old_addr, addr, rdma_addr_size(addr)); memcpy(addr, src_addr, rdma_addr_size(src_addr)); if (!status && !id_priv->cma_dev) { status = cma_acquire_dev_by_src_ip(id_priv); if (status) pr_debug_ratelimited("RDMA CM: ADDR_ERROR: failed to acquire device. status %d\n", status); rdma_restrack_add(&id_priv->res); } else if (status) { pr_debug_ratelimited("RDMA CM: ADDR_ERROR: failed to resolve IP. 
status %d\n", status); } if (status) { memcpy(addr, &old_addr, rdma_addr_size((struct sockaddr *)&old_addr)); if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_RESOLVED, RDMA_CM_ADDR_BOUND)) goto out; event.event = RDMA_CM_EVENT_ADDR_ERROR; event.status = status; } else event.event = RDMA_CM_EVENT_ADDR_RESOLVED; if (cma_cm_event_handler(id_priv, &event)) { destroy_id_handler_unlock(id_priv); return; } out: mutex_unlock(&id_priv->handler_mutex); } static int cma_resolve_loopback(struct rdma_id_private *id_priv) { struct cma_work *work; union ib_gid gid; int ret; work = kzalloc(sizeof *work, GFP_KERNEL); if (!work) return -ENOMEM; if (!id_priv->cma_dev) { ret = cma_bind_loopback(id_priv); if (ret) goto err; } rdma_addr_get_sgid(&id_priv->id.route.addr.dev_addr, &gid); rdma_addr_set_dgid(&id_priv->id.route.addr.dev_addr, &gid); enqueue_resolve_addr_work(work, id_priv); return 0; err: kfree(work); return ret; } static int cma_resolve_ib_addr(struct rdma_id_private *id_priv) { struct cma_work *work; int ret; work = kzalloc(sizeof *work, GFP_KERNEL); if (!work) return -ENOMEM; if (!id_priv->cma_dev) { ret = cma_resolve_ib_dev(id_priv); if (ret) goto err; } rdma_addr_set_dgid(&id_priv->id.route.addr.dev_addr, (union ib_gid *) &(((struct sockaddr_ib *) &id_priv->id.route.addr.dst_addr)->sib_addr)); enqueue_resolve_addr_work(work, id_priv); return 0; err: kfree(work); return ret; } int rdma_set_reuseaddr(struct rdma_cm_id *id, int reuse) { struct rdma_id_private *id_priv; unsigned long flags; int ret; id_priv = container_of(id, struct rdma_id_private, id); spin_lock_irqsave(&id_priv->lock, flags); if ((reuse && id_priv->state != RDMA_CM_LISTEN) || id_priv->state == RDMA_CM_IDLE) { id_priv->reuseaddr = reuse; ret = 0; } else { ret = -EINVAL; } spin_unlock_irqrestore(&id_priv->lock, flags); return ret; } EXPORT_SYMBOL(rdma_set_reuseaddr); int rdma_set_afonly(struct rdma_cm_id *id, int afonly) { struct rdma_id_private *id_priv; unsigned long flags; int ret; id_priv = container_of(id, struct rdma_id_private, id); spin_lock_irqsave(&id_priv->lock, flags); if (id_priv->state == RDMA_CM_IDLE || id_priv->state == RDMA_CM_ADDR_BOUND) { id_priv->options |= (1 << CMA_OPTION_AFONLY); id_priv->afonly = afonly; ret = 0; } else { ret = -EINVAL; } spin_unlock_irqrestore(&id_priv->lock, flags); return ret; } EXPORT_SYMBOL(rdma_set_afonly); static void cma_bind_port(struct rdma_bind_list *bind_list, struct rdma_id_private *id_priv) { struct sockaddr *addr; struct sockaddr_ib *sib; u64 sid, mask; __be16 port; lockdep_assert_held(&lock); addr = cma_src_addr(id_priv); port = htons(bind_list->port); switch (addr->sa_family) { case AF_INET: ((struct sockaddr_in *) addr)->sin_port = port; break; case AF_INET6: ((struct sockaddr_in6 *) addr)->sin6_port = port; break; case AF_IB: sib = (struct sockaddr_ib *) addr; sid = be64_to_cpu(sib->sib_sid); mask = be64_to_cpu(sib->sib_sid_mask); sib->sib_sid = cpu_to_be64((sid & mask) | (u64) ntohs(port)); sib->sib_sid_mask = cpu_to_be64(~0ULL); break; } id_priv->bind_list = bind_list; hlist_add_head(&id_priv->node, &bind_list->owners); } static int cma_alloc_port(enum rdma_ucm_port_space ps, struct rdma_id_private *id_priv, unsigned short snum) { struct rdma_bind_list *bind_list; int ret; lockdep_assert_held(&lock); bind_list = kzalloc(sizeof *bind_list, GFP_KERNEL); if (!bind_list) return -ENOMEM; ret = cma_ps_alloc(id_priv->id.route.addr.dev_addr.net, ps, bind_list, snum); if (ret < 0) goto err; bind_list->ps = ps; bind_list->port = snum; cma_bind_port(bind_list, id_priv); return 0; err: 
kfree(bind_list); return ret == -ENOSPC ? -EADDRNOTAVAIL : ret; } static int cma_port_is_unique(struct rdma_bind_list *bind_list, struct rdma_id_private *id_priv) { struct rdma_id_private *cur_id; struct sockaddr *daddr = cma_dst_addr(id_priv); struct sockaddr *saddr = cma_src_addr(id_priv); __be16 dport = cma_port(daddr); lockdep_assert_held(&lock); hlist_for_each_entry(cur_id, &bind_list->owners, node) { struct sockaddr *cur_daddr = cma_dst_addr(cur_id); struct sockaddr *cur_saddr = cma_src_addr(cur_id); __be16 cur_dport = cma_port(cur_daddr); if (id_priv == cur_id) continue; /* different dest port -> unique */ if (!cma_any_port(daddr) && !cma_any_port(cur_daddr) && (dport != cur_dport)) continue; /* different src address -> unique */ if (!cma_any_addr(saddr) && !cma_any_addr(cur_saddr) && cma_addr_cmp(saddr, cur_saddr)) continue; /* different dst address -> unique */ if (!cma_any_addr(daddr) && !cma_any_addr(cur_daddr) && cma_addr_cmp(daddr, cur_daddr)) continue; return -EADDRNOTAVAIL; } return 0; } static int cma_alloc_any_port(enum rdma_ucm_port_space ps, struct rdma_id_private *id_priv) { static unsigned int last_used_port; int low, high, remaining; unsigned int rover; struct net *net = id_priv->id.route.addr.dev_addr.net; lockdep_assert_held(&lock); inet_get_local_port_range(net, &low, &high); remaining = (high - low) + 1; rover = get_random_u32_inclusive(low, remaining + low - 1); retry: if (last_used_port != rover) { struct rdma_bind_list *bind_list; int ret; bind_list = cma_ps_find(net, ps, (unsigned short)rover); if (!bind_list) { ret = cma_alloc_port(ps, id_priv, rover); } else { ret = cma_port_is_unique(bind_list, id_priv); if (!ret) cma_bind_port(bind_list, id_priv); } /* * Remember previously used port number in order to avoid * re-using same port immediately after it is closed. */ if (!ret) last_used_port = rover; if (ret != -EADDRNOTAVAIL) return ret; } if (--remaining) { rover++; if ((rover < low) || (rover > high)) rover = low; goto retry; } return -EADDRNOTAVAIL; } /* * Check that the requested port is available. This is called when trying to * bind to a specific port, or when trying to listen on a bound port. In * the latter case, the provided id_priv may already be on the bind_list, but * we still need to check that it's okay to start listening. 
*/ static int cma_check_port(struct rdma_bind_list *bind_list, struct rdma_id_private *id_priv, uint8_t reuseaddr) { struct rdma_id_private *cur_id; struct sockaddr *addr, *cur_addr; lockdep_assert_held(&lock); addr = cma_src_addr(id_priv); hlist_for_each_entry(cur_id, &bind_list->owners, node) { if (id_priv == cur_id) continue; if (reuseaddr && cur_id->reuseaddr) continue; cur_addr = cma_src_addr(cur_id); if (id_priv->afonly && cur_id->afonly && (addr->sa_family != cur_addr->sa_family)) continue; if (cma_any_addr(addr) || cma_any_addr(cur_addr)) return -EADDRNOTAVAIL; if (!cma_addr_cmp(addr, cur_addr)) return -EADDRINUSE; } return 0; } static int cma_use_port(enum rdma_ucm_port_space ps, struct rdma_id_private *id_priv) { struct rdma_bind_list *bind_list; unsigned short snum; int ret; lockdep_assert_held(&lock); snum = ntohs(cma_port(cma_src_addr(id_priv))); if (snum < PROT_SOCK && !capable(CAP_NET_BIND_SERVICE)) return -EACCES; bind_list = cma_ps_find(id_priv->id.route.addr.dev_addr.net, ps, snum); if (!bind_list) { ret = cma_alloc_port(ps, id_priv, snum); } else { ret = cma_check_port(bind_list, id_priv, id_priv->reuseaddr); if (!ret) cma_bind_port(bind_list, id_priv); } return ret; } static enum rdma_ucm_port_space cma_select_inet_ps(struct rdma_id_private *id_priv) { switch (id_priv->id.ps) { case RDMA_PS_TCP: case RDMA_PS_UDP: case RDMA_PS_IPOIB: case RDMA_PS_IB: return id_priv->id.ps; default: return 0; } } static enum rdma_ucm_port_space cma_select_ib_ps(struct rdma_id_private *id_priv) { enum rdma_ucm_port_space ps = 0; struct sockaddr_ib *sib; u64 sid_ps, mask, sid; sib = (struct sockaddr_ib *) cma_src_addr(id_priv); mask = be64_to_cpu(sib->sib_sid_mask) & RDMA_IB_IP_PS_MASK; sid = be64_to_cpu(sib->sib_sid) & mask; if ((id_priv->id.ps == RDMA_PS_IB) && (sid == (RDMA_IB_IP_PS_IB & mask))) { sid_ps = RDMA_IB_IP_PS_IB; ps = RDMA_PS_IB; } else if (((id_priv->id.ps == RDMA_PS_IB) || (id_priv->id.ps == RDMA_PS_TCP)) && (sid == (RDMA_IB_IP_PS_TCP & mask))) { sid_ps = RDMA_IB_IP_PS_TCP; ps = RDMA_PS_TCP; } else if (((id_priv->id.ps == RDMA_PS_IB) || (id_priv->id.ps == RDMA_PS_UDP)) && (sid == (RDMA_IB_IP_PS_UDP & mask))) { sid_ps = RDMA_IB_IP_PS_UDP; ps = RDMA_PS_UDP; } if (ps) { sib->sib_sid = cpu_to_be64(sid_ps | ntohs(cma_port((struct sockaddr *) sib))); sib->sib_sid_mask = cpu_to_be64(RDMA_IB_IP_PS_MASK | be64_to_cpu(sib->sib_sid_mask)); } return ps; } static int cma_get_port(struct rdma_id_private *id_priv) { enum rdma_ucm_port_space ps; int ret; if (cma_family(id_priv) != AF_IB) ps = cma_select_inet_ps(id_priv); else ps = cma_select_ib_ps(id_priv); if (!ps) return -EPROTONOSUPPORT; mutex_lock(&lock); if (cma_any_port(cma_src_addr(id_priv))) ret = cma_alloc_any_port(ps, id_priv); else ret = cma_use_port(ps, id_priv); mutex_unlock(&lock); return ret; } static int cma_check_linklocal(struct rdma_dev_addr *dev_addr, struct sockaddr *addr) { #if IS_ENABLED(CONFIG_IPV6) struct sockaddr_in6 *sin6; if (addr->sa_family != AF_INET6) return 0; sin6 = (struct sockaddr_in6 *) addr; if (!(ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL)) return 0; if (!sin6->sin6_scope_id) return -EINVAL; dev_addr->bound_dev_if = sin6->sin6_scope_id; #endif return 0; } int rdma_listen(struct rdma_cm_id *id, int backlog) { struct rdma_id_private *id_priv = container_of(id, struct rdma_id_private, id); int ret; if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, RDMA_CM_LISTEN)) { struct sockaddr_in any_in = { .sin_family = AF_INET, .sin_addr.s_addr = htonl(INADDR_ANY), }; /* For a well behaved ULP state will 
be RDMA_CM_IDLE */ ret = rdma_bind_addr(id, (struct sockaddr *)&any_in); if (ret) return ret; if (WARN_ON(!cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, RDMA_CM_LISTEN))) return -EINVAL; } /* * Once the ID reaches RDMA_CM_LISTEN it is not allowed to be reusable * any more, and has to be unique in the bind list. */ if (id_priv->reuseaddr) { mutex_lock(&lock); ret = cma_check_port(id_priv->bind_list, id_priv, 0); if (!ret) id_priv->reuseaddr = 0; mutex_unlock(&lock); if (ret) goto err; } id_priv->backlog = backlog; if (id_priv->cma_dev) { if (rdma_cap_ib_cm(id->device, 1)) { ret = cma_ib_listen(id_priv); if (ret) goto err; } else if (rdma_cap_iw_cm(id->device, 1)) { ret = cma_iw_listen(id_priv, backlog); if (ret) goto err; } else { ret = -ENOSYS; goto err; } } else { ret = cma_listen_on_all(id_priv); if (ret) goto err; } return 0; err: id_priv->backlog = 0; /* * All the failure paths that lead here will not allow the req_handler's * to have run. */ cma_comp_exch(id_priv, RDMA_CM_LISTEN, RDMA_CM_ADDR_BOUND); return ret; } EXPORT_SYMBOL(rdma_listen); static int rdma_bind_addr_dst(struct rdma_id_private *id_priv, struct sockaddr *addr, const struct sockaddr *daddr) { struct sockaddr *id_daddr; int ret; if (addr->sa_family != AF_INET && addr->sa_family != AF_INET6 && addr->sa_family != AF_IB) return -EAFNOSUPPORT; if (!cma_comp_exch(id_priv, RDMA_CM_IDLE, RDMA_CM_ADDR_BOUND)) return -EINVAL; ret = cma_check_linklocal(&id_priv->id.route.addr.dev_addr, addr); if (ret) goto err1; memcpy(cma_src_addr(id_priv), addr, rdma_addr_size(addr)); if (!cma_any_addr(addr)) { ret = cma_translate_addr(addr, &id_priv->id.route.addr.dev_addr); if (ret) goto err1; ret = cma_acquire_dev_by_src_ip(id_priv); if (ret) goto err1; } if (!(id_priv->options & (1 << CMA_OPTION_AFONLY))) { if (addr->sa_family == AF_INET) id_priv->afonly = 1; #if IS_ENABLED(CONFIG_IPV6) else if (addr->sa_family == AF_INET6) { struct net *net = id_priv->id.route.addr.dev_addr.net; id_priv->afonly = net->ipv6.sysctl.bindv6only; } #endif } id_daddr = cma_dst_addr(id_priv); if (daddr != id_daddr) memcpy(id_daddr, daddr, rdma_addr_size(addr)); id_daddr->sa_family = addr->sa_family; ret = cma_get_port(id_priv); if (ret) goto err2; if (!cma_any_addr(addr)) rdma_restrack_add(&id_priv->res); return 0; err2: if (id_priv->cma_dev) cma_release_dev(id_priv); err1: cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, RDMA_CM_IDLE); return ret; } static int cma_bind_addr(struct rdma_cm_id *id, struct sockaddr *src_addr, const struct sockaddr *dst_addr) { struct rdma_id_private *id_priv = container_of(id, struct rdma_id_private, id); struct sockaddr_storage zero_sock = {}; if (src_addr && src_addr->sa_family) return rdma_bind_addr_dst(id_priv, src_addr, dst_addr); /* * When the src_addr is not specified, automatically supply an any addr */ zero_sock.ss_family = dst_addr->sa_family; if (IS_ENABLED(CONFIG_IPV6) && dst_addr->sa_family == AF_INET6) { struct sockaddr_in6 *src_addr6 = (struct sockaddr_in6 *)&zero_sock; struct sockaddr_in6 *dst_addr6 = (struct sockaddr_in6 *)dst_addr; src_addr6->sin6_scope_id = dst_addr6->sin6_scope_id; if (ipv6_addr_type(&dst_addr6->sin6_addr) & IPV6_ADDR_LINKLOCAL) id->route.addr.dev_addr.bound_dev_if = dst_addr6->sin6_scope_id; } else if (dst_addr->sa_family == AF_IB) { ((struct sockaddr_ib *)&zero_sock)->sib_pkey = ((struct sockaddr_ib *)dst_addr)->sib_pkey; } return rdma_bind_addr_dst(id_priv, (struct sockaddr *)&zero_sock, dst_addr); } /* * If required, resolve the source address for bind and leave the id_priv in * state 
RDMA_CM_ADDR_BOUND. This oddly uses the state to determine the prior * calls made by ULP, a previously bound ID will not be re-bound and src_addr is * ignored. */ static int resolve_prepare_src(struct rdma_id_private *id_priv, struct sockaddr *src_addr, const struct sockaddr *dst_addr) { int ret; if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, RDMA_CM_ADDR_QUERY)) { /* For a well behaved ULP state will be RDMA_CM_IDLE */ ret = cma_bind_addr(&id_priv->id, src_addr, dst_addr); if (ret) return ret; if (WARN_ON(!cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, RDMA_CM_ADDR_QUERY))) return -EINVAL; } else { memcpy(cma_dst_addr(id_priv), dst_addr, rdma_addr_size(dst_addr)); } if (cma_family(id_priv) != dst_addr->sa_family) { ret = -EINVAL; goto err_state; } return 0; err_state: cma_comp_exch(id_priv, RDMA_CM_ADDR_QUERY, RDMA_CM_ADDR_BOUND); return ret; } int rdma_resolve_addr(struct rdma_cm_id *id, struct sockaddr *src_addr, const struct sockaddr *dst_addr, unsigned long timeout_ms) { struct rdma_id_private *id_priv = container_of(id, struct rdma_id_private, id); int ret; ret = resolve_prepare_src(id_priv, src_addr, dst_addr); if (ret) return ret; if (cma_any_addr(dst_addr)) { ret = cma_resolve_loopback(id_priv); } else { if (dst_addr->sa_family == AF_IB) { ret = cma_resolve_ib_addr(id_priv); } else { /* * The FSM can return back to RDMA_CM_ADDR_BOUND after * rdma_resolve_ip() is called, eg through the error * path in addr_handler(). If this happens the existing * request must be canceled before issuing a new one. * Since canceling a request is a bit slow and this * oddball path is rare, keep track once a request has * been issued. The track turns out to be a permanent * state since this is the only cancel as it is * immediately before rdma_resolve_ip(). */ if (id_priv->used_resolve_ip) rdma_addr_cancel(&id->route.addr.dev_addr); else id_priv->used_resolve_ip = 1; ret = rdma_resolve_ip(cma_src_addr(id_priv), dst_addr, &id->route.addr.dev_addr, timeout_ms, addr_handler, false, id_priv); } } if (ret) goto err; return 0; err: cma_comp_exch(id_priv, RDMA_CM_ADDR_QUERY, RDMA_CM_ADDR_BOUND); return ret; } EXPORT_SYMBOL(rdma_resolve_addr); int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr *addr) { struct rdma_id_private *id_priv = container_of(id, struct rdma_id_private, id); return rdma_bind_addr_dst(id_priv, addr, cma_dst_addr(id_priv)); } EXPORT_SYMBOL(rdma_bind_addr); static int cma_format_hdr(void *hdr, struct rdma_id_private *id_priv) { struct cma_hdr *cma_hdr; cma_hdr = hdr; cma_hdr->cma_version = CMA_VERSION; if (cma_family(id_priv) == AF_INET) { struct sockaddr_in *src4, *dst4; src4 = (struct sockaddr_in *) cma_src_addr(id_priv); dst4 = (struct sockaddr_in *) cma_dst_addr(id_priv); cma_set_ip_ver(cma_hdr, 4); cma_hdr->src_addr.ip4.addr = src4->sin_addr.s_addr; cma_hdr->dst_addr.ip4.addr = dst4->sin_addr.s_addr; cma_hdr->port = src4->sin_port; } else if (cma_family(id_priv) == AF_INET6) { struct sockaddr_in6 *src6, *dst6; src6 = (struct sockaddr_in6 *) cma_src_addr(id_priv); dst6 = (struct sockaddr_in6 *) cma_dst_addr(id_priv); cma_set_ip_ver(cma_hdr, 6); cma_hdr->src_addr.ip6 = src6->sin6_addr; cma_hdr->dst_addr.ip6 = dst6->sin6_addr; cma_hdr->port = src6->sin6_port; } return 0; } static int cma_sidr_rep_handler(struct ib_cm_id *cm_id, const struct ib_cm_event *ib_event) { struct rdma_id_private *id_priv = cm_id->context; struct rdma_cm_event event = {}; const struct ib_cm_sidr_rep_event_param *rep = &ib_event->param.sidr_rep_rcvd; int ret; mutex_lock(&id_priv->handler_mutex); if 
(READ_ONCE(id_priv->state) != RDMA_CM_CONNECT) goto out; switch (ib_event->event) { case IB_CM_SIDR_REQ_ERROR: event.event = RDMA_CM_EVENT_UNREACHABLE; event.status = -ETIMEDOUT; break; case IB_CM_SIDR_REP_RECEIVED: event.param.ud.private_data = ib_event->private_data; event.param.ud.private_data_len = IB_CM_SIDR_REP_PRIVATE_DATA_SIZE; if (rep->status != IB_SIDR_SUCCESS) { event.event = RDMA_CM_EVENT_UNREACHABLE; event.status = ib_event->param.sidr_rep_rcvd.status; pr_debug_ratelimited("RDMA CM: UNREACHABLE: bad SIDR reply. status %d\n", event.status); break; } ret = cma_set_qkey(id_priv, rep->qkey); if (ret) { pr_debug_ratelimited("RDMA CM: ADDR_ERROR: failed to set qkey. status %d\n", ret); event.event = RDMA_CM_EVENT_ADDR_ERROR; event.status = ret; break; } ib_init_ah_attr_from_path(id_priv->id.device, id_priv->id.port_num, id_priv->id.route.path_rec, &event.param.ud.ah_attr, rep->sgid_attr); event.param.ud.qp_num = rep->qpn; event.param.ud.qkey = rep->qkey; event.event = RDMA_CM_EVENT_ESTABLISHED; event.status = 0; break; default: pr_err("RDMA CMA: unexpected IB CM event: %d\n", ib_event->event); goto out; } ret = cma_cm_event_handler(id_priv, &event); rdma_destroy_ah_attr(&event.param.ud.ah_attr); if (ret) { /* Destroy the CM ID by returning a non-zero value. */ id_priv->cm_id.ib = NULL; destroy_id_handler_unlock(id_priv); return ret; } out: mutex_unlock(&id_priv->handler_mutex); return 0; } static int cma_resolve_ib_udp(struct rdma_id_private *id_priv, struct rdma_conn_param *conn_param) { struct ib_cm_sidr_req_param req; struct ib_cm_id *id; void *private_data; u8 offset; int ret; memset(&req, 0, sizeof req); offset = cma_user_data_offset(id_priv); if (check_add_overflow(offset, conn_param->private_data_len, &req.private_data_len)) return -EINVAL; if (req.private_data_len) { private_data = kzalloc(req.private_data_len, GFP_ATOMIC); if (!private_data) return -ENOMEM; } else { private_data = NULL; } if (conn_param->private_data && conn_param->private_data_len) memcpy(private_data + offset, conn_param->private_data, conn_param->private_data_len); if (private_data) { ret = cma_format_hdr(private_data, id_priv); if (ret) goto out; req.private_data = private_data; } id = ib_create_cm_id(id_priv->id.device, cma_sidr_rep_handler, id_priv); if (IS_ERR(id)) { ret = PTR_ERR(id); goto out; } id_priv->cm_id.ib = id; req.path = id_priv->id.route.path_rec; req.sgid_attr = id_priv->id.route.addr.dev_addr.sgid_attr; req.service_id = rdma_get_service_id(&id_priv->id, cma_dst_addr(id_priv)); req.timeout_ms = 1 << (CMA_CM_RESPONSE_TIMEOUT - 8); req.max_cm_retries = CMA_MAX_CM_RETRIES; trace_cm_send_sidr_req(id_priv); ret = ib_send_cm_sidr_req(id_priv->cm_id.ib, &req); if (ret) { ib_destroy_cm_id(id_priv->cm_id.ib); id_priv->cm_id.ib = NULL; } out: kfree(private_data); return ret; } static int cma_connect_ib(struct rdma_id_private *id_priv, struct rdma_conn_param *conn_param) { struct ib_cm_req_param req; struct rdma_route *route; void *private_data; struct ib_cm_id *id; u8 offset; int ret; memset(&req, 0, sizeof req); offset = cma_user_data_offset(id_priv); if (check_add_overflow(offset, conn_param->private_data_len, &req.private_data_len)) return -EINVAL; if (req.private_data_len) { private_data = kzalloc(req.private_data_len, GFP_ATOMIC); if (!private_data) return -ENOMEM; } else { private_data = NULL; } if (conn_param->private_data && conn_param->private_data_len) memcpy(private_data + offset, conn_param->private_data, conn_param->private_data_len); id = ib_create_cm_id(id_priv->id.device, 
cma_ib_handler, id_priv); if (IS_ERR(id)) { ret = PTR_ERR(id); goto out; } id_priv->cm_id.ib = id; route = &id_priv->id.route; if (private_data) { ret = cma_format_hdr(private_data, id_priv); if (ret) goto out; req.private_data = private_data; } req.primary_path = &route->path_rec[0]; req.primary_path_inbound = route->path_rec_inbound; req.primary_path_outbound = route->path_rec_outbound; if (route->num_pri_alt_paths == 2) req.alternate_path = &route->path_rec[1]; req.ppath_sgid_attr = id_priv->id.route.addr.dev_addr.sgid_attr; /* Alternate path SGID attribute currently unsupported */ req.service_id = rdma_get_service_id(&id_priv->id, cma_dst_addr(id_priv)); req.qp_num = id_priv->qp_num; req.qp_type = id_priv->id.qp_type; req.starting_psn = id_priv->seq_num; req.responder_resources = conn_param->responder_resources; req.initiator_depth = conn_param->initiator_depth; req.flow_control = conn_param->flow_control; req.retry_count = min_t(u8, 7, conn_param->retry_count); req.rnr_retry_count = min_t(u8, 7, conn_param->rnr_retry_count); req.remote_cm_response_timeout = CMA_CM_RESPONSE_TIMEOUT; req.local_cm_response_timeout = CMA_CM_RESPONSE_TIMEOUT; req.max_cm_retries = CMA_MAX_CM_RETRIES; req.srq = id_priv->srq ? 1 : 0; req.ece.vendor_id = id_priv->ece.vendor_id; req.ece.attr_mod = id_priv->ece.attr_mod; trace_cm_send_req(id_priv); ret = ib_send_cm_req(id_priv->cm_id.ib, &req); out: if (ret && !IS_ERR(id)) { ib_destroy_cm_id(id); id_priv->cm_id.ib = NULL; } kfree(private_data); return ret; } static int cma_connect_iw(struct rdma_id_private *id_priv, struct rdma_conn_param *conn_param) { struct iw_cm_id *cm_id; int ret; struct iw_cm_conn_param iw_param; cm_id = iw_create_cm_id(id_priv->id.device, cma_iw_handler, id_priv); if (IS_ERR(cm_id)) return PTR_ERR(cm_id); mutex_lock(&id_priv->qp_mutex); cm_id->tos = id_priv->tos; cm_id->tos_set = id_priv->tos_set; mutex_unlock(&id_priv->qp_mutex); id_priv->cm_id.iw = cm_id; memcpy(&cm_id->local_addr, cma_src_addr(id_priv), rdma_addr_size(cma_src_addr(id_priv))); memcpy(&cm_id->remote_addr, cma_dst_addr(id_priv), rdma_addr_size(cma_dst_addr(id_priv))); ret = cma_modify_qp_rtr(id_priv, conn_param); if (ret) goto out; if (conn_param) { iw_param.ord = conn_param->initiator_depth; iw_param.ird = conn_param->responder_resources; iw_param.private_data = conn_param->private_data; iw_param.private_data_len = conn_param->private_data_len; iw_param.qpn = id_priv->id.qp ? id_priv->qp_num : conn_param->qp_num; } else { memset(&iw_param, 0, sizeof iw_param); iw_param.qpn = id_priv->qp_num; } ret = iw_cm_connect(cm_id, &iw_param); out: if (ret) { iw_destroy_cm_id(cm_id); id_priv->cm_id.iw = NULL; } return ret; } /** * rdma_connect_locked - Initiate an active connection request. * @id: Connection identifier to connect. * @conn_param: Connection information used for connected QPs. * * Same as rdma_connect() but can only be called from the * RDMA_CM_EVENT_ROUTE_RESOLVED handler callback. 
*/ int rdma_connect_locked(struct rdma_cm_id *id, struct rdma_conn_param *conn_param) { struct rdma_id_private *id_priv = container_of(id, struct rdma_id_private, id); int ret; if (!cma_comp_exch(id_priv, RDMA_CM_ROUTE_RESOLVED, RDMA_CM_CONNECT)) return -EINVAL; if (!id->qp) { id_priv->qp_num = conn_param->qp_num; id_priv->srq = conn_param->srq; } if (rdma_cap_ib_cm(id->device, id->port_num)) { if (id->qp_type == IB_QPT_UD) ret = cma_resolve_ib_udp(id_priv, conn_param); else ret = cma_connect_ib(id_priv, conn_param); } else if (rdma_cap_iw_cm(id->device, id->port_num)) { ret = cma_connect_iw(id_priv, conn_param); } else { ret = -ENOSYS; } if (ret) goto err_state; return 0; err_state: cma_comp_exch(id_priv, RDMA_CM_CONNECT, RDMA_CM_ROUTE_RESOLVED); return ret; } EXPORT_SYMBOL(rdma_connect_locked); /** * rdma_connect - Initiate an active connection request. * @id: Connection identifier to connect. * @conn_param: Connection information used for connected QPs. * * Users must have resolved a route for the rdma_cm_id to connect with by having * called rdma_resolve_route before calling this routine. * * This call will either connect to a remote QP or obtain remote QP information * for unconnected rdma_cm_id's. The actual operation is based on the * rdma_cm_id's port space. */ int rdma_connect(struct rdma_cm_id *id, struct rdma_conn_param *conn_param) { struct rdma_id_private *id_priv = container_of(id, struct rdma_id_private, id); int ret; mutex_lock(&id_priv->handler_mutex); ret = rdma_connect_locked(id, conn_param); mutex_unlock(&id_priv->handler_mutex); return ret; } EXPORT_SYMBOL(rdma_connect); /** * rdma_connect_ece - Initiate an active connection request with ECE data. * @id: Connection identifier to connect. * @conn_param: Connection information used for connected QPs. * @ece: ECE parameters * * See rdma_connect() explanation. */ int rdma_connect_ece(struct rdma_cm_id *id, struct rdma_conn_param *conn_param, struct rdma_ucm_ece *ece) { struct rdma_id_private *id_priv = container_of(id, struct rdma_id_private, id); id_priv->ece.vendor_id = ece->vendor_id; id_priv->ece.attr_mod = ece->attr_mod; return rdma_connect(id, conn_param); } EXPORT_SYMBOL(rdma_connect_ece); static int cma_accept_ib(struct rdma_id_private *id_priv, struct rdma_conn_param *conn_param) { struct ib_cm_rep_param rep; int ret; ret = cma_modify_qp_rtr(id_priv, conn_param); if (ret) goto out; ret = cma_modify_qp_rts(id_priv, conn_param); if (ret) goto out; memset(&rep, 0, sizeof rep); rep.qp_num = id_priv->qp_num; rep.starting_psn = id_priv->seq_num; rep.private_data = conn_param->private_data; rep.private_data_len = conn_param->private_data_len; rep.responder_resources = conn_param->responder_resources; rep.initiator_depth = conn_param->initiator_depth; rep.failover_accepted = 0; rep.flow_control = conn_param->flow_control; rep.rnr_retry_count = min_t(u8, 7, conn_param->rnr_retry_count); rep.srq = id_priv->srq ? 
1 : 0; rep.ece.vendor_id = id_priv->ece.vendor_id; rep.ece.attr_mod = id_priv->ece.attr_mod; trace_cm_send_rep(id_priv); ret = ib_send_cm_rep(id_priv->cm_id.ib, &rep); out: return ret; } static int cma_accept_iw(struct rdma_id_private *id_priv, struct rdma_conn_param *conn_param) { struct iw_cm_conn_param iw_param; int ret; if (!conn_param) return -EINVAL; ret = cma_modify_qp_rtr(id_priv, conn_param); if (ret) return ret; iw_param.ord = conn_param->initiator_depth; iw_param.ird = conn_param->responder_resources; iw_param.private_data = conn_param->private_data; iw_param.private_data_len = conn_param->private_data_len; if (id_priv->id.qp) iw_param.qpn = id_priv->qp_num; else iw_param.qpn = conn_param->qp_num; return iw_cm_accept(id_priv->cm_id.iw, &iw_param); } static int cma_send_sidr_rep(struct rdma_id_private *id_priv, enum ib_cm_sidr_status status, u32 qkey, const void *private_data, int private_data_len) { struct ib_cm_sidr_rep_param rep; int ret; memset(&rep, 0, sizeof rep); rep.status = status; if (status == IB_SIDR_SUCCESS) { if (qkey) ret = cma_set_qkey(id_priv, qkey); else ret = cma_set_default_qkey(id_priv); if (ret) return ret; rep.qp_num = id_priv->qp_num; rep.qkey = id_priv->qkey; rep.ece.vendor_id = id_priv->ece.vendor_id; rep.ece.attr_mod = id_priv->ece.attr_mod; } rep.private_data = private_data; rep.private_data_len = private_data_len; trace_cm_send_sidr_rep(id_priv); return ib_send_cm_sidr_rep(id_priv->cm_id.ib, &rep); } /** * rdma_accept - Called to accept a connection request or response. * @id: Connection identifier associated with the request. * @conn_param: Information needed to establish the connection. This must be * provided if accepting a connection request. If accepting a connection * response, this parameter must be NULL. * * Typically, this routine is only called by the listener to accept a connection * request. It must also be called on the active side of a connection if the * user is performing their own QP transitions. * * In the case of error, a reject message is sent to the remote side and the * state of the qp associated with the id is modified to error, such that any * previously posted receive buffers would be flushed. * * This function is for use by kernel ULPs and must be called from under the * handler callback. 
*/ int rdma_accept(struct rdma_cm_id *id, struct rdma_conn_param *conn_param) { struct rdma_id_private *id_priv = container_of(id, struct rdma_id_private, id); int ret; lockdep_assert_held(&id_priv->handler_mutex); if (READ_ONCE(id_priv->state) != RDMA_CM_CONNECT) return -EINVAL; if (!id->qp && conn_param) { id_priv->qp_num = conn_param->qp_num; id_priv->srq = conn_param->srq; } if (rdma_cap_ib_cm(id->device, id->port_num)) { if (id->qp_type == IB_QPT_UD) { if (conn_param) ret = cma_send_sidr_rep(id_priv, IB_SIDR_SUCCESS, conn_param->qkey, conn_param->private_data, conn_param->private_data_len); else ret = cma_send_sidr_rep(id_priv, IB_SIDR_SUCCESS, 0, NULL, 0); } else { if (conn_param) ret = cma_accept_ib(id_priv, conn_param); else ret = cma_rep_recv(id_priv); } } else if (rdma_cap_iw_cm(id->device, id->port_num)) { ret = cma_accept_iw(id_priv, conn_param); } else { ret = -ENOSYS; } if (ret) goto reject; return 0; reject: cma_modify_qp_err(id_priv); rdma_reject(id, NULL, 0, IB_CM_REJ_CONSUMER_DEFINED); return ret; } EXPORT_SYMBOL(rdma_accept); int rdma_accept_ece(struct rdma_cm_id *id, struct rdma_conn_param *conn_param, struct rdma_ucm_ece *ece) { struct rdma_id_private *id_priv = container_of(id, struct rdma_id_private, id); id_priv->ece.vendor_id = ece->vendor_id; id_priv->ece.attr_mod = ece->attr_mod; return rdma_accept(id, conn_param); } EXPORT_SYMBOL(rdma_accept_ece); void rdma_lock_handler(struct rdma_cm_id *id) { struct rdma_id_private *id_priv = container_of(id, struct rdma_id_private, id); mutex_lock(&id_priv->handler_mutex); } EXPORT_SYMBOL(rdma_lock_handler); void rdma_unlock_handler(struct rdma_cm_id *id) { struct rdma_id_private *id_priv = container_of(id, struct rdma_id_private, id); mutex_unlock(&id_priv->handler_mutex); } EXPORT_SYMBOL(rdma_unlock_handler); int rdma_notify(struct rdma_cm_id *id, enum ib_event_type event) { struct rdma_id_private *id_priv; int ret; id_priv = container_of(id, struct rdma_id_private, id); if (!id_priv->cm_id.ib) return -EINVAL; switch (id->device->node_type) { case RDMA_NODE_IB_CA: ret = ib_cm_notify(id_priv->cm_id.ib, event); break; default: ret = 0; break; } return ret; } EXPORT_SYMBOL(rdma_notify); int rdma_reject(struct rdma_cm_id *id, const void *private_data, u8 private_data_len, u8 reason) { struct rdma_id_private *id_priv; int ret; id_priv = container_of(id, struct rdma_id_private, id); if (!id_priv->cm_id.ib) return -EINVAL; if (rdma_cap_ib_cm(id->device, id->port_num)) { if (id->qp_type == IB_QPT_UD) { ret = cma_send_sidr_rep(id_priv, IB_SIDR_REJECT, 0, private_data, private_data_len); } else { trace_cm_send_rej(id_priv); ret = ib_send_cm_rej(id_priv->cm_id.ib, reason, NULL, 0, private_data, private_data_len); } } else if (rdma_cap_iw_cm(id->device, id->port_num)) { ret = iw_cm_reject(id_priv->cm_id.iw, private_data, private_data_len); } else { ret = -ENOSYS; } return ret; } EXPORT_SYMBOL(rdma_reject); int rdma_disconnect(struct rdma_cm_id *id) { struct rdma_id_private *id_priv; int ret; id_priv = container_of(id, struct rdma_id_private, id); if (!id_priv->cm_id.ib) return -EINVAL; if (rdma_cap_ib_cm(id->device, id->port_num)) { ret = cma_modify_qp_err(id_priv); if (ret) goto out; /* Initiate or respond to a disconnect. 
*/ trace_cm_disconnect(id_priv); if (ib_send_cm_dreq(id_priv->cm_id.ib, NULL, 0)) { if (!ib_send_cm_drep(id_priv->cm_id.ib, NULL, 0)) trace_cm_sent_drep(id_priv); } else { trace_cm_sent_dreq(id_priv); } } else if (rdma_cap_iw_cm(id->device, id->port_num)) { ret = iw_cm_disconnect(id_priv->cm_id.iw, 0); } else ret = -EINVAL; out: return ret; } EXPORT_SYMBOL(rdma_disconnect); static void cma_make_mc_event(int status, struct rdma_id_private *id_priv, struct ib_sa_multicast *multicast, struct rdma_cm_event *event, struct cma_multicast *mc) { struct rdma_dev_addr *dev_addr; enum ib_gid_type gid_type; struct net_device *ndev; if (status) pr_debug_ratelimited("RDMA CM: MULTICAST_ERROR: failed to join multicast. status %d\n", status); event->status = status; event->param.ud.private_data = mc->context; if (status) { event->event = RDMA_CM_EVENT_MULTICAST_ERROR; return; } dev_addr = &id_priv->id.route.addr.dev_addr; ndev = dev_get_by_index(dev_addr->net, dev_addr->bound_dev_if); gid_type = id_priv->cma_dev ->default_gid_type[id_priv->id.port_num - rdma_start_port( id_priv->cma_dev->device)]; event->event = RDMA_CM_EVENT_MULTICAST_JOIN; if (ib_init_ah_from_mcmember(id_priv->id.device, id_priv->id.port_num, &multicast->rec, ndev, gid_type, &event->param.ud.ah_attr)) { event->event = RDMA_CM_EVENT_MULTICAST_ERROR; goto out; } event->param.ud.qp_num = 0xFFFFFF; event->param.ud.qkey = id_priv->qkey; out: dev_put(ndev); } static int cma_ib_mc_handler(int status, struct ib_sa_multicast *multicast) { struct cma_multicast *mc = multicast->context; struct rdma_id_private *id_priv = mc->id_priv; struct rdma_cm_event event = {}; int ret = 0; mutex_lock(&id_priv->handler_mutex); if (READ_ONCE(id_priv->state) == RDMA_CM_DEVICE_REMOVAL || READ_ONCE(id_priv->state) == RDMA_CM_DESTROYING) goto out; ret = cma_set_qkey(id_priv, be32_to_cpu(multicast->rec.qkey)); if (!ret) { cma_make_mc_event(status, id_priv, multicast, &event, mc); ret = cma_cm_event_handler(id_priv, &event); } rdma_destroy_ah_attr(&event.param.ud.ah_attr); WARN_ON(ret); out: mutex_unlock(&id_priv->handler_mutex); return 0; } static void cma_set_mgid(struct rdma_id_private *id_priv, struct sockaddr *addr, union ib_gid *mgid) { unsigned char mc_map[MAX_ADDR_LEN]; struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; struct sockaddr_in *sin = (struct sockaddr_in *) addr; struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) addr; if (cma_any_addr(addr)) { memset(mgid, 0, sizeof *mgid); } else if ((addr->sa_family == AF_INET6) && ((be32_to_cpu(sin6->sin6_addr.s6_addr32[0]) & 0xFFF0FFFF) == 0xFF10A01B)) { /* IPv6 address is an SA assigned MGID. 
*/ memcpy(mgid, &sin6->sin6_addr, sizeof *mgid); } else if (addr->sa_family == AF_IB) { memcpy(mgid, &((struct sockaddr_ib *) addr)->sib_addr, sizeof *mgid); } else if (addr->sa_family == AF_INET6) { ipv6_ib_mc_map(&sin6->sin6_addr, dev_addr->broadcast, mc_map); if (id_priv->id.ps == RDMA_PS_UDP) mc_map[7] = 0x01; /* Use RDMA CM signature */ *mgid = *(union ib_gid *) (mc_map + 4); } else { ip_ib_mc_map(sin->sin_addr.s_addr, dev_addr->broadcast, mc_map); if (id_priv->id.ps == RDMA_PS_UDP) mc_map[7] = 0x01; /* Use RDMA CM signature */ *mgid = *(union ib_gid *) (mc_map + 4); } } static int cma_join_ib_multicast(struct rdma_id_private *id_priv, struct cma_multicast *mc) { struct ib_sa_mcmember_rec rec; struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; ib_sa_comp_mask comp_mask; int ret; ib_addr_get_mgid(dev_addr, &rec.mgid); ret = ib_sa_get_mcmember_rec(id_priv->id.device, id_priv->id.port_num, &rec.mgid, &rec); if (ret) return ret; if (!id_priv->qkey) { ret = cma_set_default_qkey(id_priv); if (ret) return ret; } cma_set_mgid(id_priv, (struct sockaddr *) &mc->addr, &rec.mgid); rec.qkey = cpu_to_be32(id_priv->qkey); rdma_addr_get_sgid(dev_addr, &rec.port_gid); rec.pkey = cpu_to_be16(ib_addr_get_pkey(dev_addr)); rec.join_state = mc->join_state; comp_mask = IB_SA_MCMEMBER_REC_MGID | IB_SA_MCMEMBER_REC_PORT_GID | IB_SA_MCMEMBER_REC_PKEY | IB_SA_MCMEMBER_REC_JOIN_STATE | IB_SA_MCMEMBER_REC_QKEY | IB_SA_MCMEMBER_REC_SL | IB_SA_MCMEMBER_REC_FLOW_LABEL | IB_SA_MCMEMBER_REC_TRAFFIC_CLASS; if (id_priv->id.ps == RDMA_PS_IPOIB) comp_mask |= IB_SA_MCMEMBER_REC_RATE | IB_SA_MCMEMBER_REC_RATE_SELECTOR | IB_SA_MCMEMBER_REC_MTU_SELECTOR | IB_SA_MCMEMBER_REC_MTU | IB_SA_MCMEMBER_REC_HOP_LIMIT; mc->sa_mc = ib_sa_join_multicast(&sa_client, id_priv->id.device, id_priv->id.port_num, &rec, comp_mask, GFP_KERNEL, cma_ib_mc_handler, mc); return PTR_ERR_OR_ZERO(mc->sa_mc); } static void cma_iboe_set_mgid(struct sockaddr *addr, union ib_gid *mgid, enum ib_gid_type gid_type) { struct sockaddr_in *sin = (struct sockaddr_in *)addr; struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)addr; if (cma_any_addr(addr)) { memset(mgid, 0, sizeof *mgid); } else if (addr->sa_family == AF_INET6) { memcpy(mgid, &sin6->sin6_addr, sizeof *mgid); } else { mgid->raw[0] = (gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) ? 0 : 0xff; mgid->raw[1] = (gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) ? 
0 : 0x0e; mgid->raw[2] = 0; mgid->raw[3] = 0; mgid->raw[4] = 0; mgid->raw[5] = 0; mgid->raw[6] = 0; mgid->raw[7] = 0; mgid->raw[8] = 0; mgid->raw[9] = 0; mgid->raw[10] = 0xff; mgid->raw[11] = 0xff; *(__be32 *)(&mgid->raw[12]) = sin->sin_addr.s_addr; } } static int cma_iboe_join_multicast(struct rdma_id_private *id_priv, struct cma_multicast *mc) { struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; int err = 0; struct sockaddr *addr = (struct sockaddr *)&mc->addr; struct net_device *ndev = NULL; struct ib_sa_multicast ib = {}; enum ib_gid_type gid_type; bool send_only; send_only = mc->join_state == BIT(SENDONLY_FULLMEMBER_JOIN); if (cma_zero_addr(addr)) return -EINVAL; gid_type = id_priv->cma_dev->default_gid_type[id_priv->id.port_num - rdma_start_port(id_priv->cma_dev->device)]; cma_iboe_set_mgid(addr, &ib.rec.mgid, gid_type); ib.rec.pkey = cpu_to_be16(0xffff); if (dev_addr->bound_dev_if) ndev = dev_get_by_index(dev_addr->net, dev_addr->bound_dev_if); if (!ndev) return -ENODEV; ib.rec.rate = IB_RATE_PORT_CURRENT; ib.rec.hop_limit = 1; ib.rec.mtu = iboe_get_mtu(ndev->mtu); if (addr->sa_family == AF_INET) { if (gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) { ib.rec.hop_limit = IPV6_DEFAULT_HOPLIMIT; if (!send_only) { err = cma_igmp_send(ndev, &ib.rec.mgid, true); } } } else { if (gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) err = -ENOTSUPP; } dev_put(ndev); if (err || !ib.rec.mtu) return err ?: -EINVAL; if (!id_priv->qkey) cma_set_default_qkey(id_priv); rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.src_addr, &ib.rec.port_gid); INIT_WORK(&mc->iboe_join.work, cma_iboe_join_work_handler); cma_make_mc_event(0, id_priv, &ib, &mc->iboe_join.event, mc); queue_work(cma_wq, &mc->iboe_join.work); return 0; } int rdma_join_multicast(struct rdma_cm_id *id, struct sockaddr *addr, u8 join_state, void *context) { struct rdma_id_private *id_priv = container_of(id, struct rdma_id_private, id); struct cma_multicast *mc; int ret; /* Not supported for kernel QPs */ if (WARN_ON(id->qp)) return -EINVAL; /* ULP is calling this wrong. 
*/ if (!id->device || (READ_ONCE(id_priv->state) != RDMA_CM_ADDR_BOUND && READ_ONCE(id_priv->state) != RDMA_CM_ADDR_RESOLVED)) return -EINVAL; if (id_priv->id.qp_type != IB_QPT_UD) return -EINVAL; mc = kzalloc(sizeof(*mc), GFP_KERNEL); if (!mc) return -ENOMEM; memcpy(&mc->addr, addr, rdma_addr_size(addr)); mc->context = context; mc->id_priv = id_priv; mc->join_state = join_state; if (rdma_protocol_roce(id->device, id->port_num)) { ret = cma_iboe_join_multicast(id_priv, mc); if (ret) goto out_err; } else if (rdma_cap_ib_mcast(id->device, id->port_num)) { ret = cma_join_ib_multicast(id_priv, mc); if (ret) goto out_err; } else { ret = -ENOSYS; goto out_err; } spin_lock(&id_priv->lock); list_add(&mc->list, &id_priv->mc_list); spin_unlock(&id_priv->lock); return 0; out_err: kfree(mc); return ret; } EXPORT_SYMBOL(rdma_join_multicast); void rdma_leave_multicast(struct rdma_cm_id *id, struct sockaddr *addr) { struct rdma_id_private *id_priv; struct cma_multicast *mc; id_priv = container_of(id, struct rdma_id_private, id); spin_lock_irq(&id_priv->lock); list_for_each_entry(mc, &id_priv->mc_list, list) { if (memcmp(&mc->addr, addr, rdma_addr_size(addr)) != 0) continue; list_del(&mc->list); spin_unlock_irq(&id_priv->lock); WARN_ON(id_priv->cma_dev->device != id->device); destroy_mc(id_priv, mc); return; } spin_unlock_irq(&id_priv->lock); } EXPORT_SYMBOL(rdma_leave_multicast); static int cma_netdev_change(struct net_device *ndev, struct rdma_id_private *id_priv) { struct rdma_dev_addr *dev_addr; struct cma_work *work; dev_addr = &id_priv->id.route.addr.dev_addr; if ((dev_addr->bound_dev_if == ndev->ifindex) && (net_eq(dev_net(ndev), dev_addr->net)) && memcmp(dev_addr->src_dev_addr, ndev->dev_addr, ndev->addr_len)) { pr_info("RDMA CM addr change for ndev %s used by id %p\n", ndev->name, &id_priv->id); work = kzalloc(sizeof *work, GFP_KERNEL); if (!work) return -ENOMEM; INIT_WORK(&work->work, cma_work_handler); work->id = id_priv; work->event.event = RDMA_CM_EVENT_ADDR_CHANGE; cma_id_get(id_priv); queue_work(cma_wq, &work->work); } return 0; } static int cma_netdev_callback(struct notifier_block *self, unsigned long event, void *ptr) { struct net_device *ndev = netdev_notifier_info_to_dev(ptr); struct cma_device *cma_dev; struct rdma_id_private *id_priv; int ret = NOTIFY_DONE; if (event != NETDEV_BONDING_FAILOVER) return NOTIFY_DONE; if (!netif_is_bond_master(ndev)) return NOTIFY_DONE; mutex_lock(&lock); list_for_each_entry(cma_dev, &dev_list, list) list_for_each_entry(id_priv, &cma_dev->id_list, device_item) { ret = cma_netdev_change(ndev, id_priv); if (ret) goto out; } out: mutex_unlock(&lock); return ret; } static void cma_netevent_work_handler(struct work_struct *_work) { struct rdma_id_private *id_priv = container_of(_work, struct rdma_id_private, id.net_work); struct rdma_cm_event event = {}; mutex_lock(&id_priv->handler_mutex); if (READ_ONCE(id_priv->state) == RDMA_CM_DESTROYING || READ_ONCE(id_priv->state) == RDMA_CM_DEVICE_REMOVAL) goto out_unlock; event.event = RDMA_CM_EVENT_UNREACHABLE; event.status = -ETIMEDOUT; if (cma_cm_event_handler(id_priv, &event)) { __acquire(&id_priv->handler_mutex); id_priv->cm_id.ib = NULL; cma_id_put(id_priv); destroy_id_handler_unlock(id_priv); return; } out_unlock: mutex_unlock(&id_priv->handler_mutex); cma_id_put(id_priv); } static int cma_netevent_callback(struct notifier_block *self, unsigned long event, void *ctx) { struct id_table_entry *ips_node = NULL; struct rdma_id_private *current_id; struct neighbour *neigh = ctx; unsigned long flags; if (event != 
NETEVENT_NEIGH_UPDATE) return NOTIFY_DONE; spin_lock_irqsave(&id_table_lock, flags); if (neigh->tbl->family == AF_INET6) { struct sockaddr_in6 neigh_sock_6; neigh_sock_6.sin6_family = AF_INET6; neigh_sock_6.sin6_addr = *(struct in6_addr *)neigh->primary_key; ips_node = node_from_ndev_ip(&id_table, neigh->dev->ifindex, (struct sockaddr *)&neigh_sock_6); } else if (neigh->tbl->family == AF_INET) { struct sockaddr_in neigh_sock_4; neigh_sock_4.sin_family = AF_INET; neigh_sock_4.sin_addr.s_addr = *(__be32 *)(neigh->primary_key); ips_node = node_from_ndev_ip(&id_table, neigh->dev->ifindex, (struct sockaddr *)&neigh_sock_4); } else goto out; if (!ips_node) goto out; list_for_each_entry(current_id, &ips_node->id_list, id_list_entry) { if (!memcmp(current_id->id.route.addr.dev_addr.dst_dev_addr, neigh->ha, ETH_ALEN)) continue; INIT_WORK(&current_id->id.net_work, cma_netevent_work_handler); cma_id_get(current_id); queue_work(cma_wq, &current_id->id.net_work); } out: spin_unlock_irqrestore(&id_table_lock, flags); return NOTIFY_DONE; } static struct notifier_block cma_nb = { .notifier_call = cma_netdev_callback }; static struct notifier_block cma_netevent_cb = { .notifier_call = cma_netevent_callback }; static void cma_send_device_removal_put(struct rdma_id_private *id_priv) { struct rdma_cm_event event = { .event = RDMA_CM_EVENT_DEVICE_REMOVAL }; enum rdma_cm_state state; unsigned long flags; mutex_lock(&id_priv->handler_mutex); /* Record that we want to remove the device */ spin_lock_irqsave(&id_priv->lock, flags); state = id_priv->state; if (state == RDMA_CM_DESTROYING || state == RDMA_CM_DEVICE_REMOVAL) { spin_unlock_irqrestore(&id_priv->lock, flags); mutex_unlock(&id_priv->handler_mutex); cma_id_put(id_priv); return; } id_priv->state = RDMA_CM_DEVICE_REMOVAL; spin_unlock_irqrestore(&id_priv->lock, flags); if (cma_cm_event_handler(id_priv, &event)) { /* * At this point the ULP promises it won't call * rdma_destroy_id() concurrently */ cma_id_put(id_priv); mutex_unlock(&id_priv->handler_mutex); trace_cm_id_destroy(id_priv); _destroy_id(id_priv, state); return; } mutex_unlock(&id_priv->handler_mutex); /* * If this races with destroy then the thread that first assigns state * to a destroying does the cancel. 
*/ cma_cancel_operation(id_priv, state); cma_id_put(id_priv); } static void cma_process_remove(struct cma_device *cma_dev) { mutex_lock(&lock); while (!list_empty(&cma_dev->id_list)) { struct rdma_id_private *id_priv = list_first_entry( &cma_dev->id_list, struct rdma_id_private, device_item); list_del_init(&id_priv->listen_item); list_del_init(&id_priv->device_item); cma_id_get(id_priv); mutex_unlock(&lock); cma_send_device_removal_put(id_priv); mutex_lock(&lock); } mutex_unlock(&lock); cma_dev_put(cma_dev); wait_for_completion(&cma_dev->comp); } static bool cma_supported(struct ib_device *device) { u32 i; rdma_for_each_port(device, i) { if (rdma_cap_ib_cm(device, i) || rdma_cap_iw_cm(device, i)) return true; } return false; } static int cma_add_one(struct ib_device *device) { struct rdma_id_private *to_destroy; struct cma_device *cma_dev; struct rdma_id_private *id_priv; unsigned long supported_gids = 0; int ret; u32 i; if (!cma_supported(device)) return -EOPNOTSUPP; cma_dev = kmalloc(sizeof(*cma_dev), GFP_KERNEL); if (!cma_dev) return -ENOMEM; cma_dev->device = device; cma_dev->default_gid_type = kcalloc(device->phys_port_cnt, sizeof(*cma_dev->default_gid_type), GFP_KERNEL); if (!cma_dev->default_gid_type) { ret = -ENOMEM; goto free_cma_dev; } cma_dev->default_roce_tos = kcalloc(device->phys_port_cnt, sizeof(*cma_dev->default_roce_tos), GFP_KERNEL); if (!cma_dev->default_roce_tos) { ret = -ENOMEM; goto free_gid_type; } rdma_for_each_port (device, i) { supported_gids = roce_gid_type_mask_support(device, i); WARN_ON(!supported_gids); if (supported_gids & (1 << CMA_PREFERRED_ROCE_GID_TYPE)) cma_dev->default_gid_type[i - rdma_start_port(device)] = CMA_PREFERRED_ROCE_GID_TYPE; else cma_dev->default_gid_type[i - rdma_start_port(device)] = find_first_bit(&supported_gids, BITS_PER_LONG); cma_dev->default_roce_tos[i - rdma_start_port(device)] = 0; } init_completion(&cma_dev->comp); refcount_set(&cma_dev->refcount, 1); INIT_LIST_HEAD(&cma_dev->id_list); ib_set_client_data(device, &cma_client, cma_dev); mutex_lock(&lock); list_add_tail(&cma_dev->list, &dev_list); list_for_each_entry(id_priv, &listen_any_list, listen_any_item) { ret = cma_listen_on_dev(id_priv, cma_dev, &to_destroy); if (ret) goto free_listen; } mutex_unlock(&lock); trace_cm_add_one(device); return 0; free_listen: list_del(&cma_dev->list); mutex_unlock(&lock); /* cma_process_remove() will delete to_destroy */ cma_process_remove(cma_dev); kfree(cma_dev->default_roce_tos); free_gid_type: kfree(cma_dev->default_gid_type); free_cma_dev: kfree(cma_dev); return ret; } static void cma_remove_one(struct ib_device *device, void *client_data) { struct cma_device *cma_dev = client_data; trace_cm_remove_one(device); mutex_lock(&lock); list_del(&cma_dev->list); mutex_unlock(&lock); cma_process_remove(cma_dev); kfree(cma_dev->default_roce_tos); kfree(cma_dev->default_gid_type); kfree(cma_dev); } static int cma_init_net(struct net *net) { struct cma_pernet *pernet = cma_pernet(net); xa_init(&pernet->tcp_ps); xa_init(&pernet->udp_ps); xa_init(&pernet->ipoib_ps); xa_init(&pernet->ib_ps); return 0; } static void cma_exit_net(struct net *net) { struct cma_pernet *pernet = cma_pernet(net); WARN_ON(!xa_empty(&pernet->tcp_ps)); WARN_ON(!xa_empty(&pernet->udp_ps)); WARN_ON(!xa_empty(&pernet->ipoib_ps)); WARN_ON(!xa_empty(&pernet->ib_ps)); } static struct pernet_operations cma_pernet_operations = { .init = cma_init_net, .exit = cma_exit_net, .id = &cma_pernet_id, .size = sizeof(struct cma_pernet), }; static int __init cma_init(void) { int ret; /* * There 
	/*
	 * There is a rare lock ordering dependency in cma_netdev_callback()
	 * that only happens when bonding is enabled. Teach lockdep that rtnl
	 * must never be nested under lock so it can find these without having
	 * to test with bonding.
	 */
	if (IS_ENABLED(CONFIG_LOCKDEP)) {
		rtnl_lock();
		mutex_lock(&lock);
		mutex_unlock(&lock);
		rtnl_unlock();
	}

	cma_wq = alloc_ordered_workqueue("rdma_cm", WQ_MEM_RECLAIM);
	if (!cma_wq)
		return -ENOMEM;

	ret = register_pernet_subsys(&cma_pernet_operations);
	if (ret)
		goto err_wq;

	ib_sa_register_client(&sa_client);
	register_netdevice_notifier(&cma_nb);
	register_netevent_notifier(&cma_netevent_cb);

	ret = ib_register_client(&cma_client);
	if (ret)
		goto err;

	ret = cma_configfs_init();
	if (ret)
		goto err_ib;

	return 0;

err_ib:
	ib_unregister_client(&cma_client);
err:
	unregister_netevent_notifier(&cma_netevent_cb);
	unregister_netdevice_notifier(&cma_nb);
	ib_sa_unregister_client(&sa_client);
	unregister_pernet_subsys(&cma_pernet_operations);
err_wq:
	destroy_workqueue(cma_wq);
	return ret;
}

static void __exit cma_cleanup(void)
{
	cma_configfs_exit();
	ib_unregister_client(&cma_client);
	unregister_netevent_notifier(&cma_netevent_cb);
	unregister_netdevice_notifier(&cma_nb);
	ib_sa_unregister_client(&sa_client);
	unregister_pernet_subsys(&cma_pernet_operations);
	destroy_workqueue(cma_wq);
}

module_init(cma_init);
module_exit(cma_cleanup);
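/*
 * Editor's sketch (not part of cma.c, illustration only): the lockdep
 * "priming" idiom used by cma_init() above, shown in isolation. Taking the
 * two locks once at init time, in the only order that is ever legal, records
 * that dependency with lockdep, so a later attempt to nest them the other
 * way round is reported even on configurations (e.g. without bonding) that
 * never exercise the problematic nesting at runtime. The names
 * example_inner and example_module_init are hypothetical.
 */
#if 0
#include <linux/kconfig.h>
#include <linux/mutex.h>
#include <linux/rtnetlink.h>

static DEFINE_MUTEX(example_inner);

static int __init example_module_init(void)
{
	if (IS_ENABLED(CONFIG_LOCKDEP)) {
		/* Teach lockdep: rtnl must never be taken under example_inner. */
		rtnl_lock();
		mutex_lock(&example_inner);
		mutex_unlock(&example_inner);
		rtnl_unlock();
	}
	return 0;
}
#endif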
// SPDX-License-Identifier: GPL-2.0
/* Generic nexthop implementation
 *
 * Copyright (c) 2017-19 Cumulus Networks
 * Copyright (c) 2017-19 David Ahern <dsa@cumulusnetworks.com>
 */

#include <linux/nexthop.h>
#include <linux/rtnetlink.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <net/arp.h>
#include <net/ipv6_stubs.h>
#include <net/lwtunnel.h>
#include <net/ndisc.h>
#include <net/nexthop.h>
#include <net/route.h>
#include <net/sock.h>

#define NH_RES_DEFAULT_IDLE_TIMER	(120 * HZ)
#define NH_RES_DEFAULT_UNBALANCED_TIMER	0	/* No forced rebalancing.
*/ static void remove_nexthop(struct net *net, struct nexthop *nh, struct nl_info *nlinfo); #define NH_DEV_HASHBITS 8 #define NH_DEV_HASHSIZE (1U << NH_DEV_HASHBITS) #define NHA_OP_FLAGS_DUMP_ALL (NHA_OP_FLAG_DUMP_STATS | \ NHA_OP_FLAG_DUMP_HW_STATS) static const struct nla_policy rtm_nh_policy_new[] = { [NHA_ID] = { .type = NLA_U32 }, [NHA_GROUP] = { .type = NLA_BINARY }, [NHA_GROUP_TYPE] = { .type = NLA_U16 }, [NHA_BLACKHOLE] = { .type = NLA_FLAG }, [NHA_OIF] = { .type = NLA_U32 }, [NHA_GATEWAY] = { .type = NLA_BINARY }, [NHA_ENCAP_TYPE] = { .type = NLA_U16 }, [NHA_ENCAP] = { .type = NLA_NESTED }, [NHA_FDB] = { .type = NLA_FLAG }, [NHA_RES_GROUP] = { .type = NLA_NESTED }, [NHA_HW_STATS_ENABLE] = NLA_POLICY_MAX(NLA_U32, true), }; static const struct nla_policy rtm_nh_policy_get[] = { [NHA_ID] = { .type = NLA_U32 }, [NHA_OP_FLAGS] = NLA_POLICY_MASK(NLA_U32, NHA_OP_FLAGS_DUMP_ALL), }; static const struct nla_policy rtm_nh_policy_del[] = { [NHA_ID] = { .type = NLA_U32 }, }; static const struct nla_policy rtm_nh_policy_dump[] = { [NHA_OIF] = { .type = NLA_U32 }, [NHA_GROUPS] = { .type = NLA_FLAG }, [NHA_MASTER] = { .type = NLA_U32 }, [NHA_FDB] = { .type = NLA_FLAG }, [NHA_OP_FLAGS] = NLA_POLICY_MASK(NLA_U32, NHA_OP_FLAGS_DUMP_ALL), }; static const struct nla_policy rtm_nh_res_policy_new[] = { [NHA_RES_GROUP_BUCKETS] = { .type = NLA_U16 }, [NHA_RES_GROUP_IDLE_TIMER] = { .type = NLA_U32 }, [NHA_RES_GROUP_UNBALANCED_TIMER] = { .type = NLA_U32 }, }; static const struct nla_policy rtm_nh_policy_dump_bucket[] = { [NHA_ID] = { .type = NLA_U32 }, [NHA_OIF] = { .type = NLA_U32 }, [NHA_MASTER] = { .type = NLA_U32 }, [NHA_RES_BUCKET] = { .type = NLA_NESTED }, }; static const struct nla_policy rtm_nh_res_bucket_policy_dump[] = { [NHA_RES_BUCKET_NH_ID] = { .type = NLA_U32 }, }; static const struct nla_policy rtm_nh_policy_get_bucket[] = { [NHA_ID] = { .type = NLA_U32 }, [NHA_RES_BUCKET] = { .type = NLA_NESTED }, }; static const struct nla_policy rtm_nh_res_bucket_policy_get[] = { [NHA_RES_BUCKET_INDEX] = { .type = NLA_U16 }, }; static bool nexthop_notifiers_is_empty(struct net *net) { return !net->nexthop.notifier_chain.head; } static void __nh_notifier_single_info_init(struct nh_notifier_single_info *nh_info, const struct nh_info *nhi) { nh_info->dev = nhi->fib_nhc.nhc_dev; nh_info->gw_family = nhi->fib_nhc.nhc_gw_family; if (nh_info->gw_family == AF_INET) nh_info->ipv4 = nhi->fib_nhc.nhc_gw.ipv4; else if (nh_info->gw_family == AF_INET6) nh_info->ipv6 = nhi->fib_nhc.nhc_gw.ipv6; nh_info->id = nhi->nh_parent->id; nh_info->is_reject = nhi->reject_nh; nh_info->is_fdb = nhi->fdb_nh; nh_info->has_encap = !!nhi->fib_nhc.nhc_lwtstate; } static int nh_notifier_single_info_init(struct nh_notifier_info *info, const struct nexthop *nh) { struct nh_info *nhi = rtnl_dereference(nh->nh_info); info->type = NH_NOTIFIER_INFO_TYPE_SINGLE; info->nh = kzalloc(sizeof(*info->nh), GFP_KERNEL); if (!info->nh) return -ENOMEM; __nh_notifier_single_info_init(info->nh, nhi); return 0; } static void nh_notifier_single_info_fini(struct nh_notifier_info *info) { kfree(info->nh); } static int nh_notifier_mpath_info_init(struct nh_notifier_info *info, struct nh_group *nhg) { u16 num_nh = nhg->num_nh; int i; info->type = NH_NOTIFIER_INFO_TYPE_GRP; info->nh_grp = kzalloc(struct_size(info->nh_grp, nh_entries, num_nh), GFP_KERNEL); if (!info->nh_grp) return -ENOMEM; info->nh_grp->num_nh = num_nh; info->nh_grp->is_fdb = nhg->fdb_nh; info->nh_grp->hw_stats = nhg->hw_stats; for (i = 0; i < num_nh; i++) { struct nh_grp_entry *nhge = 
&nhg->nh_entries[i]; struct nh_info *nhi; nhi = rtnl_dereference(nhge->nh->nh_info); info->nh_grp->nh_entries[i].weight = nhge->weight; __nh_notifier_single_info_init(&info->nh_grp->nh_entries[i].nh, nhi); } return 0; } static int nh_notifier_res_table_info_init(struct nh_notifier_info *info, struct nh_group *nhg) { struct nh_res_table *res_table = rtnl_dereference(nhg->res_table); u16 num_nh_buckets = res_table->num_nh_buckets; unsigned long size; u16 i; info->type = NH_NOTIFIER_INFO_TYPE_RES_TABLE; size = struct_size(info->nh_res_table, nhs, num_nh_buckets); info->nh_res_table = __vmalloc(size, GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN); if (!info->nh_res_table) return -ENOMEM; info->nh_res_table->num_nh_buckets = num_nh_buckets; info->nh_res_table->hw_stats = nhg->hw_stats; for (i = 0; i < num_nh_buckets; i++) { struct nh_res_bucket *bucket = &res_table->nh_buckets[i]; struct nh_grp_entry *nhge; struct nh_info *nhi; nhge = rtnl_dereference(bucket->nh_entry); nhi = rtnl_dereference(nhge->nh->nh_info); __nh_notifier_single_info_init(&info->nh_res_table->nhs[i], nhi); } return 0; } static int nh_notifier_grp_info_init(struct nh_notifier_info *info, const struct nexthop *nh) { struct nh_group *nhg = rtnl_dereference(nh->nh_grp); if (nhg->hash_threshold) return nh_notifier_mpath_info_init(info, nhg); else if (nhg->resilient) return nh_notifier_res_table_info_init(info, nhg); return -EINVAL; } static void nh_notifier_grp_info_fini(struct nh_notifier_info *info, const struct nexthop *nh) { struct nh_group *nhg = rtnl_dereference(nh->nh_grp); if (nhg->hash_threshold) kfree(info->nh_grp); else if (nhg->resilient) vfree(info->nh_res_table); } static int nh_notifier_info_init(struct nh_notifier_info *info, const struct nexthop *nh) { info->id = nh->id; if (nh->is_group) return nh_notifier_grp_info_init(info, nh); else return nh_notifier_single_info_init(info, nh); } static void nh_notifier_info_fini(struct nh_notifier_info *info, const struct nexthop *nh) { if (nh->is_group) nh_notifier_grp_info_fini(info, nh); else nh_notifier_single_info_fini(info); } static int call_nexthop_notifiers(struct net *net, enum nexthop_event_type event_type, struct nexthop *nh, struct netlink_ext_ack *extack) { struct nh_notifier_info info = { .net = net, .extack = extack, }; int err; ASSERT_RTNL(); if (nexthop_notifiers_is_empty(net)) return 0; err = nh_notifier_info_init(&info, nh); if (err) { NL_SET_ERR_MSG(extack, "Failed to initialize nexthop notifier info"); return err; } err = blocking_notifier_call_chain(&net->nexthop.notifier_chain, event_type, &info); nh_notifier_info_fini(&info, nh); return notifier_to_errno(err); } static int nh_notifier_res_bucket_idle_timer_get(const struct nh_notifier_info *info, bool force, unsigned int *p_idle_timer_ms) { struct nh_res_table *res_table; struct nh_group *nhg; struct nexthop *nh; int err = 0; /* When 'force' is false, nexthop bucket replacement is performed * because the bucket was deemed to be idle. In this case, capable * listeners can choose to perform an atomic replacement: The bucket is * only replaced if it is inactive. However, if the idle timer interval * is smaller than the interval in which a listener is querying * buckets' activity from the device, then atomic replacement should * not be tried. Pass the idle timer value to listeners, so that they * could determine which type of replacement to perform. 
*/ if (force) { *p_idle_timer_ms = 0; return 0; } rcu_read_lock(); nh = nexthop_find_by_id(info->net, info->id); if (!nh) { err = -EINVAL; goto out; } nhg = rcu_dereference(nh->nh_grp); res_table = rcu_dereference(nhg->res_table); *p_idle_timer_ms = jiffies_to_msecs(res_table->idle_timer); out: rcu_read_unlock(); return err; } static int nh_notifier_res_bucket_info_init(struct nh_notifier_info *info, u16 bucket_index, bool force, struct nh_info *oldi, struct nh_info *newi) { unsigned int idle_timer_ms; int err; err = nh_notifier_res_bucket_idle_timer_get(info, force, &idle_timer_ms); if (err) return err; info->type = NH_NOTIFIER_INFO_TYPE_RES_BUCKET; info->nh_res_bucket = kzalloc(sizeof(*info->nh_res_bucket), GFP_KERNEL); if (!info->nh_res_bucket) return -ENOMEM; info->nh_res_bucket->bucket_index = bucket_index; info->nh_res_bucket->idle_timer_ms = idle_timer_ms; info->nh_res_bucket->force = force; __nh_notifier_single_info_init(&info->nh_res_bucket->old_nh, oldi); __nh_notifier_single_info_init(&info->nh_res_bucket->new_nh, newi); return 0; } static void nh_notifier_res_bucket_info_fini(struct nh_notifier_info *info) { kfree(info->nh_res_bucket); } static int __call_nexthop_res_bucket_notifiers(struct net *net, u32 nhg_id, u16 bucket_index, bool force, struct nh_info *oldi, struct nh_info *newi, struct netlink_ext_ack *extack) { struct nh_notifier_info info = { .net = net, .extack = extack, .id = nhg_id, }; int err; if (nexthop_notifiers_is_empty(net)) return 0; err = nh_notifier_res_bucket_info_init(&info, bucket_index, force, oldi, newi); if (err) return err; err = blocking_notifier_call_chain(&net->nexthop.notifier_chain, NEXTHOP_EVENT_BUCKET_REPLACE, &info); nh_notifier_res_bucket_info_fini(&info); return notifier_to_errno(err); } /* There are three users of RES_TABLE, and NHs etc. referenced from there: * * 1) a collection of callbacks for NH maintenance. This operates under * RTNL, * 2) the delayed work that gradually balances the resilient table, * 3) and nexthop_select_path(), operating under RCU. * * Both the delayed work and the RTNL block are writers, and need to * maintain mutual exclusion. Since there are only two and well-known * writers for each table, the RTNL code can make sure it has exclusive * access thus: * * - Have the DW operate without locking; * - synchronously cancel the DW; * - do the writing; * - if the write was not actually a delete, call upkeep, which schedules * DW again if necessary. * * The functions that are always called from the RTNL context use * rtnl_dereference(). The functions that can also be called from the DW do * a raw dereference and rely on the above mutual exclusion scheme. */ #define nh_res_dereference(p) (rcu_dereference_raw(p)) static int call_nexthop_res_bucket_notifiers(struct net *net, u32 nhg_id, u16 bucket_index, bool force, struct nexthop *old_nh, struct nexthop *new_nh, struct netlink_ext_ack *extack) { struct nh_info *oldi = nh_res_dereference(old_nh->nh_info); struct nh_info *newi = nh_res_dereference(new_nh->nh_info); return __call_nexthop_res_bucket_notifiers(net, nhg_id, bucket_index, force, oldi, newi, extack); } static int call_nexthop_res_table_notifiers(struct net *net, struct nexthop *nh, struct netlink_ext_ack *extack) { struct nh_notifier_info info = { .net = net, .extack = extack, .id = nh->id, }; struct nh_group *nhg; int err; ASSERT_RTNL(); if (nexthop_notifiers_is_empty(net)) return 0; /* At this point, the nexthop buckets are still not populated. 
Only * emit a notification with the logical nexthops, so that a listener * could potentially veto it in case of unsupported configuration. */ nhg = rtnl_dereference(nh->nh_grp); err = nh_notifier_mpath_info_init(&info, nhg); if (err) { NL_SET_ERR_MSG(extack, "Failed to initialize nexthop notifier info"); return err; } err = blocking_notifier_call_chain(&net->nexthop.notifier_chain, NEXTHOP_EVENT_RES_TABLE_PRE_REPLACE, &info); kfree(info.nh_grp); return notifier_to_errno(err); } static int call_nexthop_notifier(struct notifier_block *nb, struct net *net, enum nexthop_event_type event_type, struct nexthop *nh, struct netlink_ext_ack *extack) { struct nh_notifier_info info = { .net = net, .extack = extack, }; int err; err = nh_notifier_info_init(&info, nh); if (err) return err; err = nb->notifier_call(nb, event_type, &info); nh_notifier_info_fini(&info, nh); return notifier_to_errno(err); } static unsigned int nh_dev_hashfn(unsigned int val) { unsigned int mask = NH_DEV_HASHSIZE - 1; return (val ^ (val >> NH_DEV_HASHBITS) ^ (val >> (NH_DEV_HASHBITS * 2))) & mask; } static void nexthop_devhash_add(struct net *net, struct nh_info *nhi) { struct net_device *dev = nhi->fib_nhc.nhc_dev; struct hlist_head *head; unsigned int hash; WARN_ON(!dev); hash = nh_dev_hashfn(dev->ifindex); head = &net->nexthop.devhash[hash]; hlist_add_head(&nhi->dev_hash, head); } static void nexthop_free_group(struct nexthop *nh) { struct nh_group *nhg; int i; nhg = rcu_dereference_raw(nh->nh_grp); for (i = 0; i < nhg->num_nh; ++i) { struct nh_grp_entry *nhge = &nhg->nh_entries[i]; WARN_ON(!list_empty(&nhge->nh_list)); free_percpu(nhge->stats); nexthop_put(nhge->nh); } WARN_ON(nhg->spare == nhg); if (nhg->resilient) vfree(rcu_dereference_raw(nhg->res_table)); kfree(nhg->spare); kfree(nhg); } static void nexthop_free_single(struct nexthop *nh) { struct nh_info *nhi; nhi = rcu_dereference_raw(nh->nh_info); switch (nhi->family) { case AF_INET: fib_nh_release(nh->net, &nhi->fib_nh); break; case AF_INET6: ipv6_stub->fib6_nh_release(&nhi->fib6_nh); break; } kfree(nhi); } void nexthop_free_rcu(struct rcu_head *head) { struct nexthop *nh = container_of(head, struct nexthop, rcu); if (nh->is_group) nexthop_free_group(nh); else nexthop_free_single(nh); kfree(nh); } EXPORT_SYMBOL_GPL(nexthop_free_rcu); static struct nexthop *nexthop_alloc(void) { struct nexthop *nh; nh = kzalloc(sizeof(struct nexthop), GFP_KERNEL); if (nh) { INIT_LIST_HEAD(&nh->fi_list); INIT_LIST_HEAD(&nh->f6i_list); INIT_LIST_HEAD(&nh->grp_list); INIT_LIST_HEAD(&nh->fdb_list); } return nh; } static struct nh_group *nexthop_grp_alloc(u16 num_nh) { struct nh_group *nhg; nhg = kzalloc(struct_size(nhg, nh_entries, num_nh), GFP_KERNEL); if (nhg) nhg->num_nh = num_nh; return nhg; } static void nh_res_table_upkeep_dw(struct work_struct *work); static struct nh_res_table * nexthop_res_table_alloc(struct net *net, u32 nhg_id, struct nh_config *cfg) { const u16 num_nh_buckets = cfg->nh_grp_res_num_buckets; struct nh_res_table *res_table; unsigned long size; size = struct_size(res_table, nh_buckets, num_nh_buckets); res_table = __vmalloc(size, GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN); if (!res_table) return NULL; res_table->net = net; res_table->nhg_id = nhg_id; INIT_DELAYED_WORK(&res_table->upkeep_dw, &nh_res_table_upkeep_dw); INIT_LIST_HEAD(&res_table->uw_nh_entries); res_table->idle_timer = cfg->nh_grp_res_idle_timer; res_table->unbalanced_timer = cfg->nh_grp_res_unbalanced_timer; res_table->num_nh_buckets = num_nh_buckets; return res_table; } static void 
nh_base_seq_inc(struct net *net) { while (++net->nexthop.seq == 0) ; } /* no reference taken; rcu lock or rtnl must be held */ struct nexthop *nexthop_find_by_id(struct net *net, u32 id) { struct rb_node **pp, *parent = NULL, *next; pp = &net->nexthop.rb_root.rb_node; while (1) { struct nexthop *nh; next = rcu_dereference_raw(*pp); if (!next) break; parent = next; nh = rb_entry(parent, struct nexthop, rb_node); if (id < nh->id) pp = &next->rb_left; else if (id > nh->id) pp = &next->rb_right; else return nh; } return NULL; } EXPORT_SYMBOL_GPL(nexthop_find_by_id); /* used for auto id allocation; called with rtnl held */ static u32 nh_find_unused_id(struct net *net) { u32 id_start = net->nexthop.last_id_allocated; while (1) { net->nexthop.last_id_allocated++; if (net->nexthop.last_id_allocated == id_start) break; if (!nexthop_find_by_id(net, net->nexthop.last_id_allocated)) return net->nexthop.last_id_allocated; } return 0; } static void nh_res_time_set_deadline(unsigned long next_time, unsigned long *deadline) { if (time_before(next_time, *deadline)) *deadline = next_time; } static clock_t nh_res_table_unbalanced_time(struct nh_res_table *res_table) { if (list_empty(&res_table->uw_nh_entries)) return 0; return jiffies_delta_to_clock_t(jiffies - res_table->unbalanced_since); } static int nla_put_nh_group_res(struct sk_buff *skb, struct nh_group *nhg) { struct nh_res_table *res_table = rtnl_dereference(nhg->res_table); struct nlattr *nest; nest = nla_nest_start(skb, NHA_RES_GROUP); if (!nest) return -EMSGSIZE; if (nla_put_u16(skb, NHA_RES_GROUP_BUCKETS, res_table->num_nh_buckets) || nla_put_u32(skb, NHA_RES_GROUP_IDLE_TIMER, jiffies_to_clock_t(res_table->idle_timer)) || nla_put_u32(skb, NHA_RES_GROUP_UNBALANCED_TIMER, jiffies_to_clock_t(res_table->unbalanced_timer)) || nla_put_u64_64bit(skb, NHA_RES_GROUP_UNBALANCED_TIME, nh_res_table_unbalanced_time(res_table), NHA_RES_GROUP_PAD)) goto nla_put_failure; nla_nest_end(skb, nest); return 0; nla_put_failure: nla_nest_cancel(skb, nest); return -EMSGSIZE; } static void nh_grp_entry_stats_inc(struct nh_grp_entry *nhge) { struct nh_grp_entry_stats *cpu_stats; cpu_stats = get_cpu_ptr(nhge->stats); u64_stats_update_begin(&cpu_stats->syncp); u64_stats_inc(&cpu_stats->packets); u64_stats_update_end(&cpu_stats->syncp); put_cpu_ptr(cpu_stats); } static void nh_grp_entry_stats_read(struct nh_grp_entry *nhge, u64 *ret_packets) { int i; *ret_packets = 0; for_each_possible_cpu(i) { struct nh_grp_entry_stats *cpu_stats; unsigned int start; u64 packets; cpu_stats = per_cpu_ptr(nhge->stats, i); do { start = u64_stats_fetch_begin(&cpu_stats->syncp); packets = u64_stats_read(&cpu_stats->packets); } while (u64_stats_fetch_retry(&cpu_stats->syncp, start)); *ret_packets += packets; } } static int nh_notifier_grp_hw_stats_init(struct nh_notifier_info *info, const struct nexthop *nh) { struct nh_group *nhg; int i; ASSERT_RTNL(); nhg = rtnl_dereference(nh->nh_grp); info->id = nh->id; info->type = NH_NOTIFIER_INFO_TYPE_GRP_HW_STATS; info->nh_grp_hw_stats = kzalloc(struct_size(info->nh_grp_hw_stats, stats, nhg->num_nh), GFP_KERNEL); if (!info->nh_grp_hw_stats) return -ENOMEM; info->nh_grp_hw_stats->num_nh = nhg->num_nh; for (i = 0; i < nhg->num_nh; i++) { struct nh_grp_entry *nhge = &nhg->nh_entries[i]; info->nh_grp_hw_stats->stats[i].id = nhge->nh->id; } return 0; } static void nh_notifier_grp_hw_stats_fini(struct nh_notifier_info *info) { kfree(info->nh_grp_hw_stats); } void nh_grp_hw_stats_report_delta(struct nh_notifier_grp_hw_stats_info *info, unsigned int nh_idx, u64 
delta_packets) { info->hw_stats_used = true; info->stats[nh_idx].packets += delta_packets; } EXPORT_SYMBOL(nh_grp_hw_stats_report_delta); static void nh_grp_hw_stats_apply_update(struct nexthop *nh, struct nh_notifier_info *info) { struct nh_group *nhg; int i; ASSERT_RTNL(); nhg = rtnl_dereference(nh->nh_grp); for (i = 0; i < nhg->num_nh; i++) { struct nh_grp_entry *nhge = &nhg->nh_entries[i]; nhge->packets_hw += info->nh_grp_hw_stats->stats[i].packets; } } static int nh_grp_hw_stats_update(struct nexthop *nh, bool *hw_stats_used) { struct nh_notifier_info info = { .net = nh->net, }; struct net *net = nh->net; int err; if (nexthop_notifiers_is_empty(net)) { *hw_stats_used = false; return 0; } err = nh_notifier_grp_hw_stats_init(&info, nh); if (err) return err; err = blocking_notifier_call_chain(&net->nexthop.notifier_chain, NEXTHOP_EVENT_HW_STATS_REPORT_DELTA, &info); /* Cache whatever we got, even if there was an error, otherwise the * successful stats retrievals would get lost. */ nh_grp_hw_stats_apply_update(nh, &info); *hw_stats_used = info.nh_grp_hw_stats->hw_stats_used; nh_notifier_grp_hw_stats_fini(&info); return notifier_to_errno(err); } static int nla_put_nh_group_stats_entry(struct sk_buff *skb, struct nh_grp_entry *nhge, u32 op_flags) { struct nlattr *nest; u64 packets; nh_grp_entry_stats_read(nhge, &packets); nest = nla_nest_start(skb, NHA_GROUP_STATS_ENTRY); if (!nest) return -EMSGSIZE; if (nla_put_u32(skb, NHA_GROUP_STATS_ENTRY_ID, nhge->nh->id) || nla_put_uint(skb, NHA_GROUP_STATS_ENTRY_PACKETS, packets + nhge->packets_hw)) goto nla_put_failure; if (op_flags & NHA_OP_FLAG_DUMP_HW_STATS && nla_put_uint(skb, NHA_GROUP_STATS_ENTRY_PACKETS_HW, nhge->packets_hw)) goto nla_put_failure; nla_nest_end(skb, nest); return 0; nla_put_failure: nla_nest_cancel(skb, nest); return -EMSGSIZE; } static int nla_put_nh_group_stats(struct sk_buff *skb, struct nexthop *nh, u32 op_flags) { struct nh_group *nhg = rtnl_dereference(nh->nh_grp); struct nlattr *nest; bool hw_stats_used; int err; int i; if (nla_put_u32(skb, NHA_HW_STATS_ENABLE, nhg->hw_stats)) goto err_out; if (op_flags & NHA_OP_FLAG_DUMP_HW_STATS && nhg->hw_stats) { err = nh_grp_hw_stats_update(nh, &hw_stats_used); if (err) goto out; if (nla_put_u32(skb, NHA_HW_STATS_USED, hw_stats_used)) goto err_out; } nest = nla_nest_start(skb, NHA_GROUP_STATS); if (!nest) goto err_out; for (i = 0; i < nhg->num_nh; i++) if (nla_put_nh_group_stats_entry(skb, &nhg->nh_entries[i], op_flags)) goto cancel_out; nla_nest_end(skb, nest); return 0; cancel_out: nla_nest_cancel(skb, nest); err_out: err = -EMSGSIZE; out: return err; } static int nla_put_nh_group(struct sk_buff *skb, struct nexthop *nh, u32 op_flags, u32 *resp_op_flags) { struct nh_group *nhg = rtnl_dereference(nh->nh_grp); struct nexthop_grp *p; size_t len = nhg->num_nh * sizeof(*p); struct nlattr *nla; u16 group_type = 0; u16 weight; int i; *resp_op_flags |= NHA_OP_FLAG_RESP_GRP_RESVD_0; if (nhg->hash_threshold) group_type = NEXTHOP_GRP_TYPE_MPATH; else if (nhg->resilient) group_type = NEXTHOP_GRP_TYPE_RES; if (nla_put_u16(skb, NHA_GROUP_TYPE, group_type)) goto nla_put_failure; nla = nla_reserve(skb, NHA_GROUP, len); if (!nla) goto nla_put_failure; p = nla_data(nla); for (i = 0; i < nhg->num_nh; ++i) { weight = nhg->nh_entries[i].weight - 1; *p++ = (struct nexthop_grp) { .id = nhg->nh_entries[i].nh->id, .weight = weight, .weight_high = weight >> 8, }; } if (nhg->resilient && nla_put_nh_group_res(skb, nhg)) goto nla_put_failure; if (op_flags & NHA_OP_FLAG_DUMP_STATS && (nla_put_u32(skb, 
NHA_HW_STATS_ENABLE, nhg->hw_stats) || nla_put_nh_group_stats(skb, nh, op_flags))) goto nla_put_failure; return 0; nla_put_failure: return -EMSGSIZE; } static int nh_fill_node(struct sk_buff *skb, struct nexthop *nh, int event, u32 portid, u32 seq, unsigned int nlflags, u32 op_flags) { struct fib6_nh *fib6_nh; struct fib_nh *fib_nh; struct nlmsghdr *nlh; struct nh_info *nhi; struct nhmsg *nhm; nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nhm), nlflags); if (!nlh) return -EMSGSIZE; nhm = nlmsg_data(nlh); nhm->nh_family = AF_UNSPEC; nhm->nh_flags = nh->nh_flags; nhm->nh_protocol = nh->protocol; nhm->nh_scope = 0; nhm->resvd = 0; if (nla_put_u32(skb, NHA_ID, nh->id)) goto nla_put_failure; if (nh->is_group) { struct nh_group *nhg = rtnl_dereference(nh->nh_grp); u32 resp_op_flags = 0; if (nhg->fdb_nh && nla_put_flag(skb, NHA_FDB)) goto nla_put_failure; if (nla_put_nh_group(skb, nh, op_flags, &resp_op_flags) || nla_put_u32(skb, NHA_OP_FLAGS, resp_op_flags)) goto nla_put_failure; goto out; } nhi = rtnl_dereference(nh->nh_info); nhm->nh_family = nhi->family; if (nhi->reject_nh) { if (nla_put_flag(skb, NHA_BLACKHOLE)) goto nla_put_failure; goto out; } else if (nhi->fdb_nh) { if (nla_put_flag(skb, NHA_FDB)) goto nla_put_failure; } else { const struct net_device *dev; dev = nhi->fib_nhc.nhc_dev; if (dev && nla_put_u32(skb, NHA_OIF, dev->ifindex)) goto nla_put_failure; } nhm->nh_scope = nhi->fib_nhc.nhc_scope; switch (nhi->family) { case AF_INET: fib_nh = &nhi->fib_nh; if (fib_nh->fib_nh_gw_family && nla_put_be32(skb, NHA_GATEWAY, fib_nh->fib_nh_gw4)) goto nla_put_failure; break; case AF_INET6: fib6_nh = &nhi->fib6_nh; if (fib6_nh->fib_nh_gw_family && nla_put_in6_addr(skb, NHA_GATEWAY, &fib6_nh->fib_nh_gw6)) goto nla_put_failure; break; } if (nhi->fib_nhc.nhc_lwtstate && lwtunnel_fill_encap(skb, nhi->fib_nhc.nhc_lwtstate, NHA_ENCAP, NHA_ENCAP_TYPE) < 0) goto nla_put_failure; out: nlmsg_end(skb, nlh); return 0; nla_put_failure: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } static size_t nh_nlmsg_size_grp_res(struct nh_group *nhg) { return nla_total_size(0) + /* NHA_RES_GROUP */ nla_total_size(2) + /* NHA_RES_GROUP_BUCKETS */ nla_total_size(4) + /* NHA_RES_GROUP_IDLE_TIMER */ nla_total_size(4) + /* NHA_RES_GROUP_UNBALANCED_TIMER */ nla_total_size_64bit(8);/* NHA_RES_GROUP_UNBALANCED_TIME */ } static size_t nh_nlmsg_size_grp(struct nexthop *nh) { struct nh_group *nhg = rtnl_dereference(nh->nh_grp); size_t sz = sizeof(struct nexthop_grp) * nhg->num_nh; size_t tot = nla_total_size(sz) + nla_total_size(2); /* NHA_GROUP_TYPE */ if (nhg->resilient) tot += nh_nlmsg_size_grp_res(nhg); return tot; } static size_t nh_nlmsg_size_single(struct nexthop *nh) { struct nh_info *nhi = rtnl_dereference(nh->nh_info); size_t sz; /* covers NHA_BLACKHOLE since NHA_OIF and BLACKHOLE * are mutually exclusive */ sz = nla_total_size(4); /* NHA_OIF */ switch (nhi->family) { case AF_INET: if (nhi->fib_nh.fib_nh_gw_family) sz += nla_total_size(4); /* NHA_GATEWAY */ break; case AF_INET6: /* NHA_GATEWAY */ if (nhi->fib6_nh.fib_nh_gw_family) sz += nla_total_size(sizeof(const struct in6_addr)); break; } if (nhi->fib_nhc.nhc_lwtstate) { sz += lwtunnel_get_encap_size(nhi->fib_nhc.nhc_lwtstate); sz += nla_total_size(2); /* NHA_ENCAP_TYPE */ } return sz; } static size_t nh_nlmsg_size(struct nexthop *nh) { size_t sz = NLMSG_ALIGN(sizeof(struct nhmsg)); sz += nla_total_size(4); /* NHA_ID */ if (nh->is_group) sz += nh_nlmsg_size_grp(nh) + nla_total_size(4) + /* NHA_OP_FLAGS */ 0; else sz += nh_nlmsg_size_single(nh); return sz; } static void 
nexthop_notify(int event, struct nexthop *nh, struct nl_info *info) { unsigned int nlflags = info->nlh ? info->nlh->nlmsg_flags : 0; u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0; struct sk_buff *skb; int err = -ENOBUFS; skb = nlmsg_new(nh_nlmsg_size(nh), gfp_any()); if (!skb) goto errout; err = nh_fill_node(skb, nh, event, info->portid, seq, nlflags, 0); if (err < 0) { /* -EMSGSIZE implies BUG in nh_nlmsg_size() */ WARN_ON(err == -EMSGSIZE); kfree_skb(skb); goto errout; } rtnl_notify(skb, info->nl_net, info->portid, RTNLGRP_NEXTHOP, info->nlh, gfp_any()); return; errout: rtnl_set_sk_err(info->nl_net, RTNLGRP_NEXTHOP, err); } static unsigned long nh_res_bucket_used_time(const struct nh_res_bucket *bucket) { return (unsigned long)atomic_long_read(&bucket->used_time); } static unsigned long nh_res_bucket_idle_point(const struct nh_res_table *res_table, const struct nh_res_bucket *bucket, unsigned long now) { unsigned long time = nh_res_bucket_used_time(bucket); /* Bucket was not used since it was migrated. The idle time is now. */ if (time == bucket->migrated_time) return now; return time + res_table->idle_timer; } static unsigned long nh_res_table_unb_point(const struct nh_res_table *res_table) { return res_table->unbalanced_since + res_table->unbalanced_timer; } static void nh_res_bucket_set_idle(const struct nh_res_table *res_table, struct nh_res_bucket *bucket) { unsigned long now = jiffies; atomic_long_set(&bucket->used_time, (long)now); bucket->migrated_time = now; } static void nh_res_bucket_set_busy(struct nh_res_bucket *bucket) { atomic_long_set(&bucket->used_time, (long)jiffies); } static clock_t nh_res_bucket_idle_time(const struct nh_res_bucket *bucket) { unsigned long used_time = nh_res_bucket_used_time(bucket); return jiffies_delta_to_clock_t(jiffies - used_time); } static int nh_fill_res_bucket(struct sk_buff *skb, struct nexthop *nh, struct nh_res_bucket *bucket, u16 bucket_index, int event, u32 portid, u32 seq, unsigned int nlflags, struct netlink_ext_ack *extack) { struct nh_grp_entry *nhge = nh_res_dereference(bucket->nh_entry); struct nlmsghdr *nlh; struct nlattr *nest; struct nhmsg *nhm; nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nhm), nlflags); if (!nlh) return -EMSGSIZE; nhm = nlmsg_data(nlh); nhm->nh_family = AF_UNSPEC; nhm->nh_flags = bucket->nh_flags; nhm->nh_protocol = nh->protocol; nhm->nh_scope = 0; nhm->resvd = 0; if (nla_put_u32(skb, NHA_ID, nh->id)) goto nla_put_failure; nest = nla_nest_start(skb, NHA_RES_BUCKET); if (!nest) goto nla_put_failure; if (nla_put_u16(skb, NHA_RES_BUCKET_INDEX, bucket_index) || nla_put_u32(skb, NHA_RES_BUCKET_NH_ID, nhge->nh->id) || nla_put_u64_64bit(skb, NHA_RES_BUCKET_IDLE_TIME, nh_res_bucket_idle_time(bucket), NHA_RES_BUCKET_PAD)) goto nla_put_failure_nest; nla_nest_end(skb, nest); nlmsg_end(skb, nlh); return 0; nla_put_failure_nest: nla_nest_cancel(skb, nest); nla_put_failure: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } static void nexthop_bucket_notify(struct nh_res_table *res_table, u16 bucket_index) { struct nh_res_bucket *bucket = &res_table->nh_buckets[bucket_index]; struct nh_grp_entry *nhge = nh_res_dereference(bucket->nh_entry); struct nexthop *nh = nhge->nh_parent; struct sk_buff *skb; int err = -ENOBUFS; skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb) goto errout; err = nh_fill_res_bucket(skb, nh, bucket, bucket_index, RTM_NEWNEXTHOPBUCKET, 0, 0, NLM_F_REPLACE, NULL); if (err < 0) { kfree_skb(skb); goto errout; } rtnl_notify(skb, nh->net, 0, RTNLGRP_NEXTHOP, NULL, GFP_KERNEL); return; errout: 
rtnl_set_sk_err(nh->net, RTNLGRP_NEXTHOP, err); } static bool valid_group_nh(struct nexthop *nh, unsigned int npaths, bool *is_fdb, struct netlink_ext_ack *extack) { if (nh->is_group) { struct nh_group *nhg = rtnl_dereference(nh->nh_grp); /* Nesting groups within groups is not supported. */ if (nhg->hash_threshold) { NL_SET_ERR_MSG(extack, "Hash-threshold group can not be a nexthop within a group"); return false; } if (nhg->resilient) { NL_SET_ERR_MSG(extack, "Resilient group can not be a nexthop within a group"); return false; } *is_fdb = nhg->fdb_nh; } else { struct nh_info *nhi = rtnl_dereference(nh->nh_info); if (nhi->reject_nh && npaths > 1) { NL_SET_ERR_MSG(extack, "Blackhole nexthop can not be used in a group with more than 1 path"); return false; } *is_fdb = nhi->fdb_nh; } return true; } static int nh_check_attr_fdb_group(struct nexthop *nh, u8 *nh_family, struct netlink_ext_ack *extack) { struct nh_info *nhi; nhi = rtnl_dereference(nh->nh_info); if (!nhi->fdb_nh) { NL_SET_ERR_MSG(extack, "FDB nexthop group can only have fdb nexthops"); return -EINVAL; } if (*nh_family == AF_UNSPEC) { *nh_family = nhi->family; } else if (*nh_family != nhi->family) { NL_SET_ERR_MSG(extack, "FDB nexthop group cannot have mixed family nexthops"); return -EINVAL; } return 0; } static int nh_check_attr_group(struct net *net, struct nlattr *tb[], size_t tb_size, u16 nh_grp_type, struct netlink_ext_ack *extack) { unsigned int len = nla_len(tb[NHA_GROUP]); u8 nh_family = AF_UNSPEC; struct nexthop_grp *nhg; unsigned int i, j; u8 nhg_fdb = 0; if (!len || len & (sizeof(struct nexthop_grp) - 1)) { NL_SET_ERR_MSG(extack, "Invalid length for nexthop group attribute"); return -EINVAL; } /* convert len to number of nexthop ids */ len /= sizeof(*nhg); nhg = nla_data(tb[NHA_GROUP]); for (i = 0; i < len; ++i) { if (nhg[i].resvd2) { NL_SET_ERR_MSG(extack, "Reserved field in nexthop_grp must be 0"); return -EINVAL; } if (nexthop_grp_weight(&nhg[i]) == 0) { /* 0xffff got passed in, representing weight of 0x10000, * which is too heavy. 
*/ NL_SET_ERR_MSG(extack, "Invalid value for weight"); return -EINVAL; } for (j = i + 1; j < len; ++j) { if (nhg[i].id == nhg[j].id) { NL_SET_ERR_MSG(extack, "Nexthop id can not be used twice in a group"); return -EINVAL; } } } if (tb[NHA_FDB]) nhg_fdb = 1; nhg = nla_data(tb[NHA_GROUP]); for (i = 0; i < len; ++i) { struct nexthop *nh; bool is_fdb_nh; nh = nexthop_find_by_id(net, nhg[i].id); if (!nh) { NL_SET_ERR_MSG(extack, "Invalid nexthop id"); return -EINVAL; } if (!valid_group_nh(nh, len, &is_fdb_nh, extack)) return -EINVAL; if (nhg_fdb && nh_check_attr_fdb_group(nh, &nh_family, extack)) return -EINVAL; if (!nhg_fdb && is_fdb_nh) { NL_SET_ERR_MSG(extack, "Non FDB nexthop group cannot have fdb nexthops"); return -EINVAL; } } for (i = NHA_GROUP_TYPE + 1; i < tb_size; ++i) { if (!tb[i]) continue; switch (i) { case NHA_HW_STATS_ENABLE: case NHA_FDB: continue; case NHA_RES_GROUP: if (nh_grp_type == NEXTHOP_GRP_TYPE_RES) continue; break; } NL_SET_ERR_MSG(extack, "No other attributes can be set in nexthop groups"); return -EINVAL; } return 0; } static bool ipv6_good_nh(const struct fib6_nh *nh) { int state = NUD_REACHABLE; struct neighbour *n; rcu_read_lock(); n = __ipv6_neigh_lookup_noref_stub(nh->fib_nh_dev, &nh->fib_nh_gw6); if (n) state = READ_ONCE(n->nud_state); rcu_read_unlock(); return !!(state & NUD_VALID); } static bool ipv4_good_nh(const struct fib_nh *nh) { int state = NUD_REACHABLE; struct neighbour *n; rcu_read_lock(); n = __ipv4_neigh_lookup_noref(nh->fib_nh_dev, (__force u32)nh->fib_nh_gw4); if (n) state = READ_ONCE(n->nud_state); rcu_read_unlock(); return !!(state & NUD_VALID); } static bool nexthop_is_good_nh(const struct nexthop *nh) { struct nh_info *nhi = rcu_dereference(nh->nh_info); switch (nhi->family) { case AF_INET: return ipv4_good_nh(&nhi->fib_nh); case AF_INET6: return ipv6_good_nh(&nhi->fib6_nh); } return false; } static struct nexthop *nexthop_select_path_fdb(struct nh_group *nhg, int hash) { int i; for (i = 0; i < nhg->num_nh; i++) { struct nh_grp_entry *nhge = &nhg->nh_entries[i]; if (hash > atomic_read(&nhge->hthr.upper_bound)) continue; nh_grp_entry_stats_inc(nhge); return nhge->nh; } WARN_ON_ONCE(1); return NULL; } static struct nexthop *nexthop_select_path_hthr(struct nh_group *nhg, int hash) { struct nh_grp_entry *nhge0 = NULL; int i; if (nhg->fdb_nh) return nexthop_select_path_fdb(nhg, hash); for (i = 0; i < nhg->num_nh; ++i) { struct nh_grp_entry *nhge = &nhg->nh_entries[i]; /* nexthops always check if it is good and does * not rely on a sysctl for this behavior */ if (!nexthop_is_good_nh(nhge->nh)) continue; if (!nhge0) nhge0 = nhge; if (hash > atomic_read(&nhge->hthr.upper_bound)) continue; nh_grp_entry_stats_inc(nhge); return nhge->nh; } if (!nhge0) nhge0 = &nhg->nh_entries[0]; nh_grp_entry_stats_inc(nhge0); return nhge0->nh; } static struct nexthop *nexthop_select_path_res(struct nh_group *nhg, int hash) { struct nh_res_table *res_table = rcu_dereference(nhg->res_table); u16 bucket_index = hash % res_table->num_nh_buckets; struct nh_res_bucket *bucket; struct nh_grp_entry *nhge; /* nexthop_select_path() is expected to return a non-NULL value, so * skip protocol validation and just hand out whatever there is. 
*/ bucket = &res_table->nh_buckets[bucket_index]; nh_res_bucket_set_busy(bucket); nhge = rcu_dereference(bucket->nh_entry); nh_grp_entry_stats_inc(nhge); return nhge->nh; } struct nexthop *nexthop_select_path(struct nexthop *nh, int hash) { struct nh_group *nhg; if (!nh->is_group) return nh; nhg = rcu_dereference(nh->nh_grp); if (nhg->hash_threshold) return nexthop_select_path_hthr(nhg, hash); else if (nhg->resilient) return nexthop_select_path_res(nhg, hash); /* Unreachable. */ return NULL; } EXPORT_SYMBOL_GPL(nexthop_select_path); int nexthop_for_each_fib6_nh(struct nexthop *nh, int (*cb)(struct fib6_nh *nh, void *arg), void *arg) { struct nh_info *nhi; int err; if (nh->is_group) { struct nh_group *nhg; int i; nhg = rcu_dereference_rtnl(nh->nh_grp); for (i = 0; i < nhg->num_nh; i++) { struct nh_grp_entry *nhge = &nhg->nh_entries[i]; nhi = rcu_dereference_rtnl(nhge->nh->nh_info); err = cb(&nhi->fib6_nh, arg); if (err) return err; } } else { nhi = rcu_dereference_rtnl(nh->nh_info); err = cb(&nhi->fib6_nh, arg); if (err) return err; } return 0; } EXPORT_SYMBOL_GPL(nexthop_for_each_fib6_nh); static int check_src_addr(const struct in6_addr *saddr, struct netlink_ext_ack *extack) { if (!ipv6_addr_any(saddr)) { NL_SET_ERR_MSG(extack, "IPv6 routes using source address can not use nexthop objects"); return -EINVAL; } return 0; } int fib6_check_nexthop(struct nexthop *nh, struct fib6_config *cfg, struct netlink_ext_ack *extack) { struct nh_info *nhi; bool is_fdb_nh; /* fib6_src is unique to a fib6_info and limits the ability to cache * routes in fib6_nh within a nexthop that is potentially shared * across multiple fib entries. If the config wants to use source * routing it can not use nexthop objects. mlxsw also does not allow * fib6_src on routes. */ if (cfg && check_src_addr(&cfg->fc_src, extack) < 0) return -EINVAL; if (nh->is_group) { struct nh_group *nhg; nhg = rtnl_dereference(nh->nh_grp); if (nhg->has_v4) goto no_v4_nh; is_fdb_nh = nhg->fdb_nh; } else { nhi = rtnl_dereference(nh->nh_info); if (nhi->family == AF_INET) goto no_v4_nh; is_fdb_nh = nhi->fdb_nh; } if (is_fdb_nh) { NL_SET_ERR_MSG(extack, "Route cannot point to a fdb nexthop"); return -EINVAL; } return 0; no_v4_nh: NL_SET_ERR_MSG(extack, "IPv6 routes can not use an IPv4 nexthop"); return -EINVAL; } EXPORT_SYMBOL_GPL(fib6_check_nexthop); /* if existing nexthop has ipv6 routes linked to it, need * to verify this new spec works with ipv6 */ static int fib6_check_nh_list(struct nexthop *old, struct nexthop *new, struct netlink_ext_ack *extack) { struct fib6_info *f6i; if (list_empty(&old->f6i_list)) return 0; list_for_each_entry(f6i, &old->f6i_list, nh_list) { if (check_src_addr(&f6i->fib6_src.addr, extack) < 0) return -EINVAL; } return fib6_check_nexthop(new, NULL, extack); } static int nexthop_check_scope(struct nh_info *nhi, u8 scope, struct netlink_ext_ack *extack) { if (scope == RT_SCOPE_HOST && nhi->fib_nhc.nhc_gw_family) { NL_SET_ERR_MSG(extack, "Route with host scope can not have a gateway"); return -EINVAL; } if (nhi->fib_nhc.nhc_flags & RTNH_F_ONLINK && scope >= RT_SCOPE_LINK) { NL_SET_ERR_MSG(extack, "Scope mismatch with nexthop"); return -EINVAL; } return 0; } /* Invoked by fib add code to verify nexthop by id is ok with * config for prefix; parts of fib_check_nh not done when nexthop * object is used. 
*/ int fib_check_nexthop(struct nexthop *nh, u8 scope, struct netlink_ext_ack *extack) { struct nh_info *nhi; int err = 0; if (nh->is_group) { struct nh_group *nhg; nhg = rtnl_dereference(nh->nh_grp); if (nhg->fdb_nh) { NL_SET_ERR_MSG(extack, "Route cannot point to a fdb nexthop"); err = -EINVAL; goto out; } if (scope == RT_SCOPE_HOST) { NL_SET_ERR_MSG(extack, "Route with host scope can not have multiple nexthops"); err = -EINVAL; goto out; } /* all nexthops in a group have the same scope */ nhi = rtnl_dereference(nhg->nh_entries[0].nh->nh_info); err = nexthop_check_scope(nhi, scope, extack); } else { nhi = rtnl_dereference(nh->nh_info); if (nhi->fdb_nh) { NL_SET_ERR_MSG(extack, "Route cannot point to a fdb nexthop"); err = -EINVAL; goto out; } err = nexthop_check_scope(nhi, scope, extack); } out: return err; } static int fib_check_nh_list(struct nexthop *old, struct nexthop *new, struct netlink_ext_ack *extack) { struct fib_info *fi; list_for_each_entry(fi, &old->fi_list, nh_list) { int err; err = fib_check_nexthop(new, fi->fib_scope, extack); if (err) return err; } return 0; } static bool nh_res_nhge_is_balanced(const struct nh_grp_entry *nhge) { return nhge->res.count_buckets == nhge->res.wants_buckets; } static bool nh_res_nhge_is_ow(const struct nh_grp_entry *nhge) { return nhge->res.count_buckets > nhge->res.wants_buckets; } static bool nh_res_nhge_is_uw(const struct nh_grp_entry *nhge) { return nhge->res.count_buckets < nhge->res.wants_buckets; } static bool nh_res_table_is_balanced(const struct nh_res_table *res_table) { return list_empty(&res_table->uw_nh_entries); } static void nh_res_bucket_unset_nh(struct nh_res_bucket *bucket) { struct nh_grp_entry *nhge; if (bucket->occupied) { nhge = nh_res_dereference(bucket->nh_entry); nhge->res.count_buckets--; bucket->occupied = false; } } static void nh_res_bucket_set_nh(struct nh_res_bucket *bucket, struct nh_grp_entry *nhge) { nh_res_bucket_unset_nh(bucket); bucket->occupied = true; rcu_assign_pointer(bucket->nh_entry, nhge); nhge->res.count_buckets++; } static bool nh_res_bucket_should_migrate(struct nh_res_table *res_table, struct nh_res_bucket *bucket, unsigned long *deadline, bool *force) { unsigned long now = jiffies; struct nh_grp_entry *nhge; unsigned long idle_point; if (!bucket->occupied) { /* The bucket is not occupied, its NHGE pointer is either * NULL or obsolete. We _have to_ migrate: set force. */ *force = true; return true; } nhge = nh_res_dereference(bucket->nh_entry); /* If the bucket is populated by an underweight or balanced * nexthop, do not migrate. */ if (!nh_res_nhge_is_ow(nhge)) return false; /* At this point we know that the bucket is populated with an * overweight nexthop. It needs to be migrated to a new nexthop if * the idle timer of unbalanced timer expired. */ idle_point = nh_res_bucket_idle_point(res_table, bucket, now); if (time_after_eq(now, idle_point)) { /* The bucket is idle. We _can_ migrate: unset force. */ *force = false; return true; } /* Unbalanced timer of 0 means "never force". */ if (res_table->unbalanced_timer) { unsigned long unb_point; unb_point = nh_res_table_unb_point(res_table); if (time_after(now, unb_point)) { /* The bucket is not idle, but the unbalanced timer * expired. We _can_ migrate, but set force anyway, * so that drivers know to ignore activity reports * from the HW. 
*/ *force = true; return true; } nh_res_time_set_deadline(unb_point, deadline); } nh_res_time_set_deadline(idle_point, deadline); return false; } static bool nh_res_bucket_migrate(struct nh_res_table *res_table, u16 bucket_index, bool notify, bool notify_nl, bool force) { struct nh_res_bucket *bucket = &res_table->nh_buckets[bucket_index]; struct nh_grp_entry *new_nhge; struct netlink_ext_ack extack; int err; new_nhge = list_first_entry_or_null(&res_table->uw_nh_entries, struct nh_grp_entry, res.uw_nh_entry); if (WARN_ON_ONCE(!new_nhge)) /* If this function is called, "bucket" is either not * occupied, or it belongs to a next hop that is * overweight. In either case, there ought to be a * corresponding underweight next hop. */ return false; if (notify) { struct nh_grp_entry *old_nhge; old_nhge = nh_res_dereference(bucket->nh_entry); err = call_nexthop_res_bucket_notifiers(res_table->net, res_table->nhg_id, bucket_index, force, old_nhge->nh, new_nhge->nh, &extack); if (err) { pr_err_ratelimited("%s\n", extack._msg); if (!force) return false; /* It is not possible to veto a forced replacement, so * just clear the hardware flags from the nexthop * bucket to indicate to user space that this bucket is * not correctly populated in hardware. */ bucket->nh_flags &= ~(RTNH_F_OFFLOAD | RTNH_F_TRAP); } } nh_res_bucket_set_nh(bucket, new_nhge); nh_res_bucket_set_idle(res_table, bucket); if (notify_nl) nexthop_bucket_notify(res_table, bucket_index); if (nh_res_nhge_is_balanced(new_nhge)) list_del(&new_nhge->res.uw_nh_entry); return true; } #define NH_RES_UPKEEP_DW_MINIMUM_INTERVAL (HZ / 2) static void nh_res_table_upkeep(struct nh_res_table *res_table, bool notify, bool notify_nl) { unsigned long now = jiffies; unsigned long deadline; u16 i; /* Deadline is the next time that upkeep should be run. It is the * earliest time at which one of the buckets might be migrated. * Start at the most pessimistic estimate: either unbalanced_timer * from now, or if there is none, idle_timer from now. For each * encountered time point, call nh_res_time_set_deadline() to * refine the estimate. */ if (res_table->unbalanced_timer) deadline = now + res_table->unbalanced_timer; else deadline = now + res_table->idle_timer; for (i = 0; i < res_table->num_nh_buckets; i++) { struct nh_res_bucket *bucket = &res_table->nh_buckets[i]; bool force; if (nh_res_bucket_should_migrate(res_table, bucket, &deadline, &force)) { if (!nh_res_bucket_migrate(res_table, i, notify, notify_nl, force)) { unsigned long idle_point; /* A driver can override the migration * decision if the HW reports that the * bucket is actually not idle. Therefore * remark the bucket as busy again and * update the deadline. */ nh_res_bucket_set_busy(bucket); idle_point = nh_res_bucket_idle_point(res_table, bucket, now); nh_res_time_set_deadline(idle_point, &deadline); } } } /* If the group is still unbalanced, schedule the next upkeep to * either the deadline computed above, or the minimum deadline, * whichever comes later. 
*/ if (!nh_res_table_is_balanced(res_table)) { unsigned long now = jiffies; unsigned long min_deadline; min_deadline = now + NH_RES_UPKEEP_DW_MINIMUM_INTERVAL; if (time_before(deadline, min_deadline)) deadline = min_deadline; queue_delayed_work(system_power_efficient_wq, &res_table->upkeep_dw, deadline - now); } } static void nh_res_table_upkeep_dw(struct work_struct *work) { struct delayed_work *dw = to_delayed_work(work); struct nh_res_table *res_table; res_table = container_of(dw, struct nh_res_table, upkeep_dw); nh_res_table_upkeep(res_table, true, true); } static void nh_res_table_cancel_upkeep(struct nh_res_table *res_table) { cancel_delayed_work_sync(&res_table->upkeep_dw); } static void nh_res_group_rebalance(struct nh_group *nhg, struct nh_res_table *res_table) { u16 prev_upper_bound = 0; u32 total = 0; u32 w = 0; int i; INIT_LIST_HEAD(&res_table->uw_nh_entries); for (i = 0; i < nhg->num_nh; ++i) total += nhg->nh_entries[i].weight; for (i = 0; i < nhg->num_nh; ++i) { struct nh_grp_entry *nhge = &nhg->nh_entries[i]; u16 upper_bound; u64 btw; w += nhge->weight; btw = ((u64)res_table->num_nh_buckets) * w; upper_bound = DIV_ROUND_CLOSEST_ULL(btw, total); nhge->res.wants_buckets = upper_bound - prev_upper_bound; prev_upper_bound = upper_bound; if (nh_res_nhge_is_uw(nhge)) { if (list_empty(&res_table->uw_nh_entries)) res_table->unbalanced_since = jiffies; list_add(&nhge->res.uw_nh_entry, &res_table->uw_nh_entries); } } } /* Migrate buckets in res_table so that they reference NHGE's from NHG with * the right NH ID. Set those buckets that do not have a corresponding NHGE * entry in NHG as not occupied. */ static void nh_res_table_migrate_buckets(struct nh_res_table *res_table, struct nh_group *nhg) { u16 i; for (i = 0; i < res_table->num_nh_buckets; i++) { struct nh_res_bucket *bucket = &res_table->nh_buckets[i]; u32 id = rtnl_dereference(bucket->nh_entry)->nh->id; bool found = false; int j; for (j = 0; j < nhg->num_nh; j++) { struct nh_grp_entry *nhge = &nhg->nh_entries[j]; if (nhge->nh->id == id) { nh_res_bucket_set_nh(bucket, nhge); found = true; break; } } if (!found) nh_res_bucket_unset_nh(bucket); } } static void replace_nexthop_grp_res(struct nh_group *oldg, struct nh_group *newg) { /* For NH group replacement, the new NHG might only have a stub * hash table with 0 buckets, because the number of buckets was not * specified. For NH removal, oldg and newg both reference the same * res_table. So in any case, in the following, we want to work * with oldg->res_table. 
*/ struct nh_res_table *old_res_table = rtnl_dereference(oldg->res_table); unsigned long prev_unbalanced_since = old_res_table->unbalanced_since; bool prev_has_uw = !list_empty(&old_res_table->uw_nh_entries); nh_res_table_cancel_upkeep(old_res_table); nh_res_table_migrate_buckets(old_res_table, newg); nh_res_group_rebalance(newg, old_res_table); if (prev_has_uw && !list_empty(&old_res_table->uw_nh_entries)) old_res_table->unbalanced_since = prev_unbalanced_since; nh_res_table_upkeep(old_res_table, true, false); } static void nh_hthr_group_rebalance(struct nh_group *nhg) { u32 total = 0; u32 w = 0; int i; for (i = 0; i < nhg->num_nh; ++i) total += nhg->nh_entries[i].weight; for (i = 0; i < nhg->num_nh; ++i) { struct nh_grp_entry *nhge = &nhg->nh_entries[i]; u32 upper_bound; w += nhge->weight; upper_bound = DIV_ROUND_CLOSEST_ULL((u64)w << 31, total) - 1; atomic_set(&nhge->hthr.upper_bound, upper_bound); } } static void remove_nh_grp_entry(struct net *net, struct nh_grp_entry *nhge, struct nl_info *nlinfo) { struct nh_grp_entry *nhges, *new_nhges; struct nexthop *nhp = nhge->nh_parent; struct netlink_ext_ack extack; struct nexthop *nh = nhge->nh; struct nh_group *nhg, *newg; int i, j, err; WARN_ON(!nh); nhg = rtnl_dereference(nhp->nh_grp); newg = nhg->spare; /* last entry, keep it visible and remove the parent */ if (nhg->num_nh == 1) { remove_nexthop(net, nhp, nlinfo); return; } newg->has_v4 = false; newg->is_multipath = nhg->is_multipath; newg->hash_threshold = nhg->hash_threshold; newg->resilient = nhg->resilient; newg->fdb_nh = nhg->fdb_nh; newg->num_nh = nhg->num_nh; /* copy old entries to new except the one getting removed */ nhges = nhg->nh_entries; new_nhges = newg->nh_entries; for (i = 0, j = 0; i < nhg->num_nh; ++i) { struct nh_info *nhi; /* current nexthop getting removed */ if (nhg->nh_entries[i].nh == nh) { newg->num_nh--; continue; } nhi = rtnl_dereference(nhges[i].nh->nh_info); if (nhi->family == AF_INET) newg->has_v4 = true; list_del(&nhges[i].nh_list); new_nhges[j].stats = nhges[i].stats; new_nhges[j].nh_parent = nhges[i].nh_parent; new_nhges[j].nh = nhges[i].nh; new_nhges[j].weight = nhges[i].weight; list_add(&new_nhges[j].nh_list, &new_nhges[j].nh->grp_list); j++; } if (newg->hash_threshold) nh_hthr_group_rebalance(newg); else if (newg->resilient) replace_nexthop_grp_res(nhg, newg); rcu_assign_pointer(nhp->nh_grp, newg); list_del(&nhge->nh_list); free_percpu(nhge->stats); nexthop_put(nhge->nh); /* Removal of a NH from a resilient group is notified through * bucket notifications. 
*/ if (newg->hash_threshold) { err = call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, nhp, &extack); if (err) pr_err("%s\n", extack._msg); } if (nlinfo) nexthop_notify(RTM_NEWNEXTHOP, nhp, nlinfo); } static void remove_nexthop_from_groups(struct net *net, struct nexthop *nh, struct nl_info *nlinfo) { struct nh_grp_entry *nhge, *tmp; list_for_each_entry_safe(nhge, tmp, &nh->grp_list, nh_list) remove_nh_grp_entry(net, nhge, nlinfo); /* make sure all see the newly published array before releasing rtnl */ synchronize_net(); } static void remove_nexthop_group(struct nexthop *nh, struct nl_info *nlinfo) { struct nh_group *nhg = rcu_dereference_rtnl(nh->nh_grp); struct nh_res_table *res_table; int i, num_nh = nhg->num_nh; for (i = 0; i < num_nh; ++i) { struct nh_grp_entry *nhge = &nhg->nh_entries[i]; if (WARN_ON(!nhge->nh)) continue; list_del_init(&nhge->nh_list); } if (nhg->resilient) { res_table = rtnl_dereference(nhg->res_table); nh_res_table_cancel_upkeep(res_table); } } /* not called for nexthop replace */ static void __remove_nexthop_fib(struct net *net, struct nexthop *nh) { struct fib6_info *f6i, *tmp; bool do_flush = false; struct fib_info *fi; list_for_each_entry(fi, &nh->fi_list, nh_list) { fi->fib_flags |= RTNH_F_DEAD; do_flush = true; } if (do_flush) fib_flush(net); /* ip6_del_rt removes the entry from this list hence the _safe */ list_for_each_entry_safe(f6i, tmp, &nh->f6i_list, nh_list) { /* __ip6_del_rt does a release, so do a hold here */ fib6_info_hold(f6i); ipv6_stub->ip6_del_rt(net, f6i, !READ_ONCE(net->ipv4.sysctl_nexthop_compat_mode)); } } static void __remove_nexthop(struct net *net, struct nexthop *nh, struct nl_info *nlinfo) { __remove_nexthop_fib(net, nh); if (nh->is_group) { remove_nexthop_group(nh, nlinfo); } else { struct nh_info *nhi; nhi = rtnl_dereference(nh->nh_info); if (nhi->fib_nhc.nhc_dev) hlist_del(&nhi->dev_hash); remove_nexthop_from_groups(net, nh, nlinfo); } } static void remove_nexthop(struct net *net, struct nexthop *nh, struct nl_info *nlinfo) { call_nexthop_notifiers(net, NEXTHOP_EVENT_DEL, nh, NULL); /* remove from the tree */ rb_erase(&nh->rb_node, &net->nexthop.rb_root); if (nlinfo) nexthop_notify(RTM_DELNEXTHOP, nh, nlinfo); __remove_nexthop(net, nh, nlinfo); nh_base_seq_inc(net); nexthop_put(nh); } /* if any FIB entries reference this nexthop, any dst entries * need to be regenerated */ static void nh_rt_cache_flush(struct net *net, struct nexthop *nh, struct nexthop *replaced_nh) { struct fib6_info *f6i; struct nh_group *nhg; int i; if (!list_empty(&nh->fi_list)) rt_cache_flush(net); list_for_each_entry(f6i, &nh->f6i_list, nh_list) ipv6_stub->fib6_update_sernum(net, f6i); /* if an IPv6 group was replaced, we have to release all old * dsts to make sure all refcounts are released */ if (!replaced_nh->is_group) return; nhg = rtnl_dereference(replaced_nh->nh_grp); for (i = 0; i < nhg->num_nh; i++) { struct nh_grp_entry *nhge = &nhg->nh_entries[i]; struct nh_info *nhi = rtnl_dereference(nhge->nh->nh_info); if (nhi->family == AF_INET6) ipv6_stub->fib6_nh_release_dsts(&nhi->fib6_nh); } } static int replace_nexthop_grp(struct net *net, struct nexthop *old, struct nexthop *new, const struct nh_config *cfg, struct netlink_ext_ack *extack) { struct nh_res_table *tmp_table = NULL; struct nh_res_table *new_res_table; struct nh_res_table *old_res_table; struct nh_group *oldg, *newg; int i, err; if (!new->is_group) { NL_SET_ERR_MSG(extack, "Can not replace a nexthop group with a nexthop."); return -EINVAL; } oldg = rtnl_dereference(old->nh_grp); newg = 
rtnl_dereference(new->nh_grp); if (newg->hash_threshold != oldg->hash_threshold) { NL_SET_ERR_MSG(extack, "Can not replace a nexthop group with one of a different type."); return -EINVAL; } if (newg->hash_threshold) { err = call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, new, extack); if (err) return err; } else if (newg->resilient) { new_res_table = rtnl_dereference(newg->res_table); old_res_table = rtnl_dereference(oldg->res_table); /* Accept if num_nh_buckets was not given, but if it was * given, demand that the value be correct. */ if (cfg->nh_grp_res_has_num_buckets && cfg->nh_grp_res_num_buckets != old_res_table->num_nh_buckets) { NL_SET_ERR_MSG(extack, "Can not change number of buckets of a resilient nexthop group."); return -EINVAL; } /* Emit a pre-replace notification so that listeners could veto * a potentially unsupported configuration. Otherwise, * individual bucket replacement notifications would need to be * vetoed, which is something that should only happen if the * bucket is currently active. */ err = call_nexthop_res_table_notifiers(net, new, extack); if (err) return err; if (cfg->nh_grp_res_has_idle_timer) old_res_table->idle_timer = cfg->nh_grp_res_idle_timer; if (cfg->nh_grp_res_has_unbalanced_timer) old_res_table->unbalanced_timer = cfg->nh_grp_res_unbalanced_timer; replace_nexthop_grp_res(oldg, newg); tmp_table = new_res_table; rcu_assign_pointer(newg->res_table, old_res_table); rcu_assign_pointer(newg->spare->res_table, old_res_table); } /* update parents - used by nexthop code for cleanup */ for (i = 0; i < newg->num_nh; i++) newg->nh_entries[i].nh_parent = old; rcu_assign_pointer(old->nh_grp, newg); /* Make sure concurrent readers are not using 'oldg' anymore. */ synchronize_net(); if (newg->resilient) { rcu_assign_pointer(oldg->res_table, tmp_table); rcu_assign_pointer(oldg->spare->res_table, tmp_table); } for (i = 0; i < oldg->num_nh; i++) oldg->nh_entries[i].nh_parent = new; rcu_assign_pointer(new->nh_grp, oldg); return 0; } static void nh_group_v4_update(struct nh_group *nhg) { struct nh_grp_entry *nhges; bool has_v4 = false; int i; nhges = nhg->nh_entries; for (i = 0; i < nhg->num_nh; i++) { struct nh_info *nhi; nhi = rtnl_dereference(nhges[i].nh->nh_info); if (nhi->family == AF_INET) has_v4 = true; } nhg->has_v4 = has_v4; } static int replace_nexthop_single_notify_res(struct net *net, struct nh_res_table *res_table, struct nexthop *old, struct nh_info *oldi, struct nh_info *newi, struct netlink_ext_ack *extack) { u32 nhg_id = res_table->nhg_id; int err; u16 i; for (i = 0; i < res_table->num_nh_buckets; i++) { struct nh_res_bucket *bucket = &res_table->nh_buckets[i]; struct nh_grp_entry *nhge; nhge = rtnl_dereference(bucket->nh_entry); if (nhge->nh == old) { err = __call_nexthop_res_bucket_notifiers(net, nhg_id, i, true, oldi, newi, extack); if (err) goto err_notify; } } return 0; err_notify: while (i-- > 0) { struct nh_res_bucket *bucket = &res_table->nh_buckets[i]; struct nh_grp_entry *nhge; nhge = rtnl_dereference(bucket->nh_entry); if (nhge->nh == old) __call_nexthop_res_bucket_notifiers(net, nhg_id, i, true, newi, oldi, extack); } return err; } static int replace_nexthop_single_notify(struct net *net, struct nexthop *group_nh, struct nexthop *old, struct nh_info *oldi, struct nh_info *newi, struct netlink_ext_ack *extack) { struct nh_group *nhg = rtnl_dereference(group_nh->nh_grp); struct nh_res_table *res_table; if (nhg->hash_threshold) { return call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, group_nh, extack); } else if (nhg->resilient) { 
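		/* Resilient groups have no single group-level replace
		 * notification; instead, every bucket that still points at
		 * the old nexthop gets its own bucket notification below.
		 */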
res_table = rtnl_dereference(nhg->res_table); return replace_nexthop_single_notify_res(net, res_table, old, oldi, newi, extack); } return -EINVAL; } static int replace_nexthop_single(struct net *net, struct nexthop *old, struct nexthop *new, struct netlink_ext_ack *extack) { u8 old_protocol, old_nh_flags; struct nh_info *oldi, *newi; struct nh_grp_entry *nhge; int err; if (new->is_group) { NL_SET_ERR_MSG(extack, "Can not replace a nexthop with a nexthop group."); return -EINVAL; } err = call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, new, extack); if (err) return err; /* Hardware flags were set on 'old' as 'new' is not in the red-black * tree. Therefore, inherit the flags from 'old' to 'new'. */ new->nh_flags |= old->nh_flags & (RTNH_F_OFFLOAD | RTNH_F_TRAP); oldi = rtnl_dereference(old->nh_info); newi = rtnl_dereference(new->nh_info); newi->nh_parent = old; oldi->nh_parent = new; old_protocol = old->protocol; old_nh_flags = old->nh_flags; old->protocol = new->protocol; old->nh_flags = new->nh_flags; rcu_assign_pointer(old->nh_info, newi); rcu_assign_pointer(new->nh_info, oldi); /* Send a replace notification for all the groups using the nexthop. */ list_for_each_entry(nhge, &old->grp_list, nh_list) { struct nexthop *nhp = nhge->nh_parent; err = replace_nexthop_single_notify(net, nhp, old, oldi, newi, extack); if (err) goto err_notify; } /* When replacing an IPv4 nexthop with an IPv6 nexthop, potentially * update IPv4 indication in all the groups using the nexthop. */ if (oldi->family == AF_INET && newi->family == AF_INET6) { list_for_each_entry(nhge, &old->grp_list, nh_list) { struct nexthop *nhp = nhge->nh_parent; struct nh_group *nhg; nhg = rtnl_dereference(nhp->nh_grp); nh_group_v4_update(nhg); } } return 0; err_notify: rcu_assign_pointer(new->nh_info, newi); rcu_assign_pointer(old->nh_info, oldi); old->nh_flags = old_nh_flags; old->protocol = old_protocol; oldi->nh_parent = old; newi->nh_parent = new; list_for_each_entry_continue_reverse(nhge, &old->grp_list, nh_list) { struct nexthop *nhp = nhge->nh_parent; replace_nexthop_single_notify(net, nhp, old, newi, oldi, NULL); } call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, old, extack); return err; } static void __nexthop_replace_notify(struct net *net, struct nexthop *nh, struct nl_info *info) { struct fib6_info *f6i; if (!list_empty(&nh->fi_list)) { struct fib_info *fi; /* expectation is a few fib_info per nexthop and then * a lot of routes per fib_info. 
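 * (Illustratively: a handful of fib_info entries, each backing hundreds
 * of routes, so notifying route-by-route from here would be wasteful.)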
So mark the fib_info * and then walk the fib tables once */ list_for_each_entry(fi, &nh->fi_list, nh_list) fi->nh_updated = true; fib_info_notify_update(net, info); list_for_each_entry(fi, &nh->fi_list, nh_list) fi->nh_updated = false; } list_for_each_entry(f6i, &nh->f6i_list, nh_list) ipv6_stub->fib6_rt_update(net, f6i, info); } /* send RTM_NEWROUTE with REPLACE flag set for all FIB entries * linked to this nexthop and for all groups that the nexthop * is a member of */ static void nexthop_replace_notify(struct net *net, struct nexthop *nh, struct nl_info *info) { struct nh_grp_entry *nhge; __nexthop_replace_notify(net, nh, info); list_for_each_entry(nhge, &nh->grp_list, nh_list) __nexthop_replace_notify(net, nhge->nh_parent, info); } static int replace_nexthop(struct net *net, struct nexthop *old, struct nexthop *new, const struct nh_config *cfg, struct netlink_ext_ack *extack) { bool new_is_reject = false; struct nh_grp_entry *nhge; int err; /* check that existing FIB entries are ok with the * new nexthop definition */ err = fib_check_nh_list(old, new, extack); if (err) return err; err = fib6_check_nh_list(old, new, extack); if (err) return err; if (!new->is_group) { struct nh_info *nhi = rtnl_dereference(new->nh_info); new_is_reject = nhi->reject_nh; } list_for_each_entry(nhge, &old->grp_list, nh_list) { /* if new nexthop is a blackhole, any groups using this * nexthop cannot have more than 1 path */ if (new_is_reject && nexthop_num_path(nhge->nh_parent) > 1) { NL_SET_ERR_MSG(extack, "Blackhole nexthop can not be a member of a group with more than one path"); return -EINVAL; } err = fib_check_nh_list(nhge->nh_parent, new, extack); if (err) return err; err = fib6_check_nh_list(nhge->nh_parent, new, extack); if (err) return err; } if (old->is_group) err = replace_nexthop_grp(net, old, new, cfg, extack); else err = replace_nexthop_single(net, old, new, extack); if (!err) { nh_rt_cache_flush(net, old, new); __remove_nexthop(net, new, NULL); nexthop_put(new); } return err; } /* called with rtnl_lock held */ static int insert_nexthop(struct net *net, struct nexthop *new_nh, struct nh_config *cfg, struct netlink_ext_ack *extack) { struct rb_node **pp, *parent = NULL, *next; struct rb_root *root = &net->nexthop.rb_root; bool replace = !!(cfg->nlflags & NLM_F_REPLACE); bool create = !!(cfg->nlflags & NLM_F_CREATE); u32 new_id = new_nh->id; int replace_notify = 0; int rc = -EEXIST; pp = &root->rb_node; while (1) { struct nexthop *nh; next = *pp; if (!next) break; parent = next; nh = rb_entry(parent, struct nexthop, rb_node); if (new_id < nh->id) { pp = &next->rb_left; } else if (new_id > nh->id) { pp = &next->rb_right; } else if (replace) { rc = replace_nexthop(net, nh, new_nh, cfg, extack); if (!rc) { new_nh = nh; /* send notification with old nh */ replace_notify = 1; } goto out; } else { /* id already exists and not a replace */ goto out; } } if (replace && !create) { NL_SET_ERR_MSG(extack, "Replace specified without create and no entry exists"); rc = -ENOENT; goto out; } if (new_nh->is_group) { struct nh_group *nhg = rtnl_dereference(new_nh->nh_grp); struct nh_res_table *res_table; if (nhg->resilient) { res_table = rtnl_dereference(nhg->res_table); /* Not passing the number of buckets is OK when * replacing, but not when creating a new group. 
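 *
 * For example (hypothetical iproute2 invocations, shown only for
 * illustration):
 *   ip nexthop add id 10 group 1/2 type resilient buckets 32
 * must give a bucket count, whereas a later
 *   ip nexthop replace id 10 group 1/2/3 type resilient
 * may omit it and keeps the existing 32 buckets.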
*/ if (!cfg->nh_grp_res_has_num_buckets) { NL_SET_ERR_MSG(extack, "Number of buckets not specified for nexthop group insertion"); rc = -EINVAL; goto out; } nh_res_group_rebalance(nhg, res_table); /* Do not send bucket notifications, we do full * notification below. */ nh_res_table_upkeep(res_table, false, false); } } rb_link_node_rcu(&new_nh->rb_node, parent, pp); rb_insert_color(&new_nh->rb_node, root); /* The initial insertion is a full notification for hash-threshold as * well as resilient groups. */ rc = call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, new_nh, extack); if (rc) rb_erase(&new_nh->rb_node, &net->nexthop.rb_root); out: if (!rc) { nh_base_seq_inc(net); nexthop_notify(RTM_NEWNEXTHOP, new_nh, &cfg->nlinfo); if (replace_notify && READ_ONCE(net->ipv4.sysctl_nexthop_compat_mode)) nexthop_replace_notify(net, new_nh, &cfg->nlinfo); } return rc; } /* rtnl */ /* remove all nexthops tied to a device being deleted */ static void nexthop_flush_dev(struct net_device *dev, unsigned long event) { unsigned int hash = nh_dev_hashfn(dev->ifindex); struct net *net = dev_net(dev); struct hlist_head *head = &net->nexthop.devhash[hash]; struct hlist_node *n; struct nh_info *nhi; hlist_for_each_entry_safe(nhi, n, head, dev_hash) { if (nhi->fib_nhc.nhc_dev != dev) continue; if (nhi->reject_nh && (event == NETDEV_DOWN || event == NETDEV_CHANGE)) continue; remove_nexthop(net, nhi->nh_parent, NULL); } } /* rtnl; called when net namespace is deleted */ static void flush_all_nexthops(struct net *net) { struct rb_root *root = &net->nexthop.rb_root; struct rb_node *node; struct nexthop *nh; while ((node = rb_first(root))) { nh = rb_entry(node, struct nexthop, rb_node); remove_nexthop(net, nh, NULL); cond_resched(); } } static struct nexthop *nexthop_create_group(struct net *net, struct nh_config *cfg) { struct nlattr *grps_attr = cfg->nh_grp; struct nexthop_grp *entry = nla_data(grps_attr); u16 num_nh = nla_len(grps_attr) / sizeof(*entry); struct nh_group *nhg; struct nexthop *nh; int err; int i; if (WARN_ON(!num_nh)) return ERR_PTR(-EINVAL); nh = nexthop_alloc(); if (!nh) return ERR_PTR(-ENOMEM); nh->is_group = 1; nhg = nexthop_grp_alloc(num_nh); if (!nhg) { kfree(nh); return ERR_PTR(-ENOMEM); } /* spare group used for removals */ nhg->spare = nexthop_grp_alloc(num_nh); if (!nhg->spare) { kfree(nhg); kfree(nh); return ERR_PTR(-ENOMEM); } nhg->spare->spare = nhg; for (i = 0; i < nhg->num_nh; ++i) { struct nexthop *nhe; struct nh_info *nhi; nhe = nexthop_find_by_id(net, entry[i].id); if (!nexthop_get(nhe)) { err = -ENOENT; goto out_no_nh; } nhi = rtnl_dereference(nhe->nh_info); if (nhi->family == AF_INET) nhg->has_v4 = true; nhg->nh_entries[i].stats = netdev_alloc_pcpu_stats(struct nh_grp_entry_stats); if (!nhg->nh_entries[i].stats) { err = -ENOMEM; nexthop_put(nhe); goto out_no_nh; } nhg->nh_entries[i].nh = nhe; nhg->nh_entries[i].weight = nexthop_grp_weight(&entry[i]); list_add(&nhg->nh_entries[i].nh_list, &nhe->grp_list); nhg->nh_entries[i].nh_parent = nh; } if (cfg->nh_grp_type == NEXTHOP_GRP_TYPE_MPATH) { nhg->hash_threshold = 1; nhg->is_multipath = true; } else if (cfg->nh_grp_type == NEXTHOP_GRP_TYPE_RES) { struct nh_res_table *res_table; res_table = nexthop_res_table_alloc(net, cfg->nh_id, cfg); if (!res_table) { err = -ENOMEM; goto out_no_nh; } rcu_assign_pointer(nhg->spare->res_table, res_table); rcu_assign_pointer(nhg->res_table, res_table); nhg->resilient = true; nhg->is_multipath = true; } WARN_ON_ONCE(nhg->hash_threshold + nhg->resilient != 1); if (nhg->hash_threshold) 
nh_hthr_group_rebalance(nhg); if (cfg->nh_fdb) nhg->fdb_nh = 1; if (cfg->nh_hw_stats) nhg->hw_stats = true; rcu_assign_pointer(nh->nh_grp, nhg); return nh; out_no_nh: for (i--; i >= 0; --i) { list_del(&nhg->nh_entries[i].nh_list); free_percpu(nhg->nh_entries[i].stats); nexthop_put(nhg->nh_entries[i].nh); } kfree(nhg->spare); kfree(nhg); kfree(nh); return ERR_PTR(err); } static int nh_create_ipv4(struct net *net, struct nexthop *nh, struct nh_info *nhi, struct nh_config *cfg, struct netlink_ext_ack *extack) { struct fib_nh *fib_nh = &nhi->fib_nh; struct fib_config fib_cfg = { .fc_oif = cfg->nh_ifindex, .fc_gw4 = cfg->gw.ipv4, .fc_gw_family = cfg->gw.ipv4 ? AF_INET : 0, .fc_flags = cfg->nh_flags, .fc_nlinfo = cfg->nlinfo, .fc_encap = cfg->nh_encap, .fc_encap_type = cfg->nh_encap_type, }; u32 tb_id = (cfg->dev ? l3mdev_fib_table(cfg->dev) : RT_TABLE_MAIN); int err; err = fib_nh_init(net, fib_nh, &fib_cfg, 1, extack); if (err) { fib_nh_release(net, fib_nh); goto out; } if (nhi->fdb_nh) goto out; /* sets nh_dev if successful */ err = fib_check_nh(net, fib_nh, tb_id, 0, extack); if (!err) { nh->nh_flags = fib_nh->fib_nh_flags; fib_info_update_nhc_saddr(net, &fib_nh->nh_common, !fib_nh->fib_nh_scope ? 0 : fib_nh->fib_nh_scope - 1); } else { fib_nh_release(net, fib_nh); } out: return err; } static int nh_create_ipv6(struct net *net, struct nexthop *nh, struct nh_info *nhi, struct nh_config *cfg, struct netlink_ext_ack *extack) { struct fib6_nh *fib6_nh = &nhi->fib6_nh; struct fib6_config fib6_cfg = { .fc_table = l3mdev_fib_table(cfg->dev), .fc_ifindex = cfg->nh_ifindex, .fc_gateway = cfg->gw.ipv6, .fc_flags = cfg->nh_flags, .fc_nlinfo = cfg->nlinfo, .fc_encap = cfg->nh_encap, .fc_encap_type = cfg->nh_encap_type, .fc_is_fdb = cfg->nh_fdb, }; int err; if (!ipv6_addr_any(&cfg->gw.ipv6)) fib6_cfg.fc_flags |= RTF_GATEWAY; /* sets nh_dev if successful */ err = ipv6_stub->fib6_nh_init(net, fib6_nh, &fib6_cfg, GFP_KERNEL, extack); if (err) { /* IPv6 is not enabled, don't call fib6_nh_release */ if (err == -EAFNOSUPPORT) goto out; ipv6_stub->fib6_nh_release(fib6_nh); } else { nh->nh_flags = fib6_nh->fib_nh_flags; } out: return err; } static struct nexthop *nexthop_create(struct net *net, struct nh_config *cfg, struct netlink_ext_ack *extack) { struct nh_info *nhi; struct nexthop *nh; int err = 0; nh = nexthop_alloc(); if (!nh) return ERR_PTR(-ENOMEM); nhi = kzalloc(sizeof(*nhi), GFP_KERNEL); if (!nhi) { kfree(nh); return ERR_PTR(-ENOMEM); } nh->nh_flags = cfg->nh_flags; nh->net = net; nhi->nh_parent = nh; nhi->family = cfg->nh_family; nhi->fib_nhc.nhc_scope = RT_SCOPE_LINK; if (cfg->nh_fdb) nhi->fdb_nh = 1; if (cfg->nh_blackhole) { nhi->reject_nh = 1; cfg->nh_ifindex = net->loopback_dev->ifindex; } switch (cfg->nh_family) { case AF_INET: err = nh_create_ipv4(net, nh, nhi, cfg, extack); break; case AF_INET6: err = nh_create_ipv6(net, nh, nhi, cfg, extack); break; } if (err) { kfree(nhi); kfree(nh); return ERR_PTR(err); } /* add the entry to the device based hash */ if (!nhi->fdb_nh) nexthop_devhash_add(net, nhi); rcu_assign_pointer(nh->nh_info, nhi); return nh; } /* called with rtnl lock held */ static struct nexthop *nexthop_add(struct net *net, struct nh_config *cfg, struct netlink_ext_ack *extack) { struct nexthop *nh; int err; if (cfg->nlflags & NLM_F_REPLACE && !cfg->nh_id) { NL_SET_ERR_MSG(extack, "Replace requires nexthop id"); return ERR_PTR(-EINVAL); } if (!cfg->nh_id) { cfg->nh_id = nh_find_unused_id(net); if (!cfg->nh_id) { NL_SET_ERR_MSG(extack, "No unused id"); return ERR_PTR(-EINVAL); } } if 
(cfg->nh_grp) nh = nexthop_create_group(net, cfg); else nh = nexthop_create(net, cfg, extack); if (IS_ERR(nh)) return nh; refcount_set(&nh->refcnt, 1); nh->id = cfg->nh_id; nh->protocol = cfg->nh_protocol; nh->net = net; err = insert_nexthop(net, nh, cfg, extack); if (err) { __remove_nexthop(net, nh, NULL); nexthop_put(nh); nh = ERR_PTR(err); } return nh; } static int rtm_nh_get_timer(struct nlattr *attr, unsigned long fallback, unsigned long *timer_p, bool *has_p, struct netlink_ext_ack *extack) { unsigned long timer; u32 value; if (!attr) { *timer_p = fallback; *has_p = false; return 0; } value = nla_get_u32(attr); timer = clock_t_to_jiffies(value); if (timer == ~0UL) { NL_SET_ERR_MSG(extack, "Timer value too large"); return -EINVAL; } *timer_p = timer; *has_p = true; return 0; } static int rtm_to_nh_config_grp_res(struct nlattr *res, struct nh_config *cfg, struct netlink_ext_ack *extack) { struct nlattr *tb[ARRAY_SIZE(rtm_nh_res_policy_new)] = {}; int err; if (res) { err = nla_parse_nested(tb, ARRAY_SIZE(rtm_nh_res_policy_new) - 1, res, rtm_nh_res_policy_new, extack); if (err < 0) return err; } if (tb[NHA_RES_GROUP_BUCKETS]) { cfg->nh_grp_res_num_buckets = nla_get_u16(tb[NHA_RES_GROUP_BUCKETS]); cfg->nh_grp_res_has_num_buckets = true; if (!cfg->nh_grp_res_num_buckets) { NL_SET_ERR_MSG(extack, "Number of buckets needs to be non-0"); return -EINVAL; } } err = rtm_nh_get_timer(tb[NHA_RES_GROUP_IDLE_TIMER], NH_RES_DEFAULT_IDLE_TIMER, &cfg->nh_grp_res_idle_timer, &cfg->nh_grp_res_has_idle_timer, extack); if (err) return err; return rtm_nh_get_timer(tb[NHA_RES_GROUP_UNBALANCED_TIMER], NH_RES_DEFAULT_UNBALANCED_TIMER, &cfg->nh_grp_res_unbalanced_timer, &cfg->nh_grp_res_has_unbalanced_timer, extack); } static int rtm_to_nh_config(struct net *net, struct sk_buff *skb, struct nlmsghdr *nlh, struct nh_config *cfg, struct netlink_ext_ack *extack) { struct nhmsg *nhm = nlmsg_data(nlh); struct nlattr *tb[ARRAY_SIZE(rtm_nh_policy_new)]; int err; err = nlmsg_parse(nlh, sizeof(*nhm), tb, ARRAY_SIZE(rtm_nh_policy_new) - 1, rtm_nh_policy_new, extack); if (err < 0) return err; err = -EINVAL; if (nhm->resvd || nhm->nh_scope) { NL_SET_ERR_MSG(extack, "Invalid values in ancillary header"); goto out; } if (nhm->nh_flags & ~NEXTHOP_VALID_USER_FLAGS) { NL_SET_ERR_MSG(extack, "Invalid nexthop flags in ancillary header"); goto out; } switch (nhm->nh_family) { case AF_INET: case AF_INET6: break; case AF_UNSPEC: if (tb[NHA_GROUP]) break; fallthrough; default: NL_SET_ERR_MSG(extack, "Invalid address family"); goto out; } memset(cfg, 0, sizeof(*cfg)); cfg->nlflags = nlh->nlmsg_flags; cfg->nlinfo.portid = NETLINK_CB(skb).portid; cfg->nlinfo.nlh = nlh; cfg->nlinfo.nl_net = net; cfg->nh_family = nhm->nh_family; cfg->nh_protocol = nhm->nh_protocol; cfg->nh_flags = nhm->nh_flags; if (tb[NHA_ID]) cfg->nh_id = nla_get_u32(tb[NHA_ID]); if (tb[NHA_FDB]) { if (tb[NHA_OIF] || tb[NHA_BLACKHOLE] || tb[NHA_ENCAP] || tb[NHA_ENCAP_TYPE]) { NL_SET_ERR_MSG(extack, "Fdb attribute can not be used with encap, oif or blackhole"); goto out; } if (nhm->nh_flags) { NL_SET_ERR_MSG(extack, "Unsupported nexthop flags in ancillary header"); goto out; } cfg->nh_fdb = nla_get_flag(tb[NHA_FDB]); } if (tb[NHA_GROUP]) { if (nhm->nh_family != AF_UNSPEC) { NL_SET_ERR_MSG(extack, "Invalid family for group"); goto out; } cfg->nh_grp = tb[NHA_GROUP]; cfg->nh_grp_type = NEXTHOP_GRP_TYPE_MPATH; if (tb[NHA_GROUP_TYPE]) cfg->nh_grp_type = nla_get_u16(tb[NHA_GROUP_TYPE]); if (cfg->nh_grp_type > NEXTHOP_GRP_TYPE_MAX) { NL_SET_ERR_MSG(extack, "Invalid group 
type"); goto out; } err = nh_check_attr_group(net, tb, ARRAY_SIZE(tb), cfg->nh_grp_type, extack); if (err) goto out; if (cfg->nh_grp_type == NEXTHOP_GRP_TYPE_RES) err = rtm_to_nh_config_grp_res(tb[NHA_RES_GROUP], cfg, extack); if (tb[NHA_HW_STATS_ENABLE]) cfg->nh_hw_stats = nla_get_u32(tb[NHA_HW_STATS_ENABLE]); /* no other attributes should be set */ goto out; } if (tb[NHA_BLACKHOLE]) { if (tb[NHA_GATEWAY] || tb[NHA_OIF] || tb[NHA_ENCAP] || tb[NHA_ENCAP_TYPE] || tb[NHA_FDB]) { NL_SET_ERR_MSG(extack, "Blackhole attribute can not be used with gateway, oif, encap or fdb"); goto out; } cfg->nh_blackhole = 1; err = 0; goto out; } if (!cfg->nh_fdb && !tb[NHA_OIF]) { NL_SET_ERR_MSG(extack, "Device attribute required for non-blackhole and non-fdb nexthops"); goto out; } if (!cfg->nh_fdb && tb[NHA_OIF]) { cfg->nh_ifindex = nla_get_u32(tb[NHA_OIF]); if (cfg->nh_ifindex) cfg->dev = __dev_get_by_index(net, cfg->nh_ifindex); if (!cfg->dev) { NL_SET_ERR_MSG(extack, "Invalid device index"); goto out; } else if (!(cfg->dev->flags & IFF_UP)) { NL_SET_ERR_MSG(extack, "Nexthop device is not up"); err = -ENETDOWN; goto out; } else if (!netif_carrier_ok(cfg->dev)) { NL_SET_ERR_MSG(extack, "Carrier for nexthop device is down"); err = -ENETDOWN; goto out; } } err = -EINVAL; if (tb[NHA_GATEWAY]) { struct nlattr *gwa = tb[NHA_GATEWAY]; switch (cfg->nh_family) { case AF_INET: if (nla_len(gwa) != sizeof(u32)) { NL_SET_ERR_MSG(extack, "Invalid gateway"); goto out; } cfg->gw.ipv4 = nla_get_be32(gwa); break; case AF_INET6: if (nla_len(gwa) != sizeof(struct in6_addr)) { NL_SET_ERR_MSG(extack, "Invalid gateway"); goto out; } cfg->gw.ipv6 = nla_get_in6_addr(gwa); break; default: NL_SET_ERR_MSG(extack, "Unknown address family for gateway"); goto out; } } else { /* device only nexthop (no gateway) */ if (cfg->nh_flags & RTNH_F_ONLINK) { NL_SET_ERR_MSG(extack, "ONLINK flag can not be set for nexthop without a gateway"); goto out; } } if (tb[NHA_ENCAP]) { cfg->nh_encap = tb[NHA_ENCAP]; if (!tb[NHA_ENCAP_TYPE]) { NL_SET_ERR_MSG(extack, "LWT encapsulation type is missing"); goto out; } cfg->nh_encap_type = nla_get_u16(tb[NHA_ENCAP_TYPE]); err = lwtunnel_valid_encap_type(cfg->nh_encap_type, extack); if (err < 0) goto out; } else if (tb[NHA_ENCAP_TYPE]) { NL_SET_ERR_MSG(extack, "LWT encapsulation attribute is missing"); goto out; } if (tb[NHA_HW_STATS_ENABLE]) { NL_SET_ERR_MSG(extack, "Cannot enable nexthop hardware statistics for non-group nexthops"); goto out; } err = 0; out: return err; } /* rtnl */ static int rtm_new_nexthop(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct nh_config cfg; struct nexthop *nh; int err; err = rtm_to_nh_config(net, skb, nlh, &cfg, extack); if (!err) { nh = nexthop_add(net, &cfg, extack); if (IS_ERR(nh)) err = PTR_ERR(nh); } return err; } static int nh_valid_get_del_req(const struct nlmsghdr *nlh, struct nlattr **tb, u32 *id, u32 *op_flags, struct netlink_ext_ack *extack) { struct nhmsg *nhm = nlmsg_data(nlh); if (nhm->nh_protocol || nhm->resvd || nhm->nh_scope || nhm->nh_flags) { NL_SET_ERR_MSG(extack, "Invalid values in header"); return -EINVAL; } if (!tb[NHA_ID]) { NL_SET_ERR_MSG(extack, "Nexthop id is missing"); return -EINVAL; } *id = nla_get_u32(tb[NHA_ID]); if (!(*id)) { NL_SET_ERR_MSG(extack, "Invalid nexthop id"); return -EINVAL; } if (op_flags) *op_flags = nla_get_u32_default(tb[NHA_OP_FLAGS], 0); return 0; } /* rtnl */ static int rtm_del_nexthop(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack 
*extack) { struct nlattr *tb[ARRAY_SIZE(rtm_nh_policy_del)]; struct net *net = sock_net(skb->sk); struct nl_info nlinfo = { .nlh = nlh, .nl_net = net, .portid = NETLINK_CB(skb).portid, }; struct nexthop *nh; int err; u32 id; err = nlmsg_parse(nlh, sizeof(struct nhmsg), tb, ARRAY_SIZE(rtm_nh_policy_del) - 1, rtm_nh_policy_del, extack); if (err < 0) return err; err = nh_valid_get_del_req(nlh, tb, &id, NULL, extack); if (err) return err; nh = nexthop_find_by_id(net, id); if (!nh) return -ENOENT; remove_nexthop(net, nh, &nlinfo); return 0; } /* rtnl */ static int rtm_get_nexthop(struct sk_buff *in_skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct nlattr *tb[ARRAY_SIZE(rtm_nh_policy_get)]; struct net *net = sock_net(in_skb->sk); struct sk_buff *skb = NULL; struct nexthop *nh; u32 op_flags; int err; u32 id; err = nlmsg_parse(nlh, sizeof(struct nhmsg), tb, ARRAY_SIZE(rtm_nh_policy_get) - 1, rtm_nh_policy_get, extack); if (err < 0) return err; err = nh_valid_get_del_req(nlh, tb, &id, &op_flags, extack); if (err) return err; err = -ENOBUFS; skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb) goto out; err = -ENOENT; nh = nexthop_find_by_id(net, id); if (!nh) goto errout_free; err = nh_fill_node(skb, nh, RTM_NEWNEXTHOP, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, 0, op_flags); if (err < 0) { WARN_ON(err == -EMSGSIZE); goto errout_free; } err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); out: return err; errout_free: kfree_skb(skb); goto out; } struct nh_dump_filter { u32 nh_id; int dev_idx; int master_idx; bool group_filter; bool fdb_filter; u32 res_bucket_nh_id; u32 op_flags; }; static bool nh_dump_filtered(struct nexthop *nh, struct nh_dump_filter *filter, u8 family) { const struct net_device *dev; const struct nh_info *nhi; if (filter->group_filter && !nh->is_group) return true; if (!filter->dev_idx && !filter->master_idx && !family) return false; if (nh->is_group) return true; nhi = rtnl_dereference(nh->nh_info); if (family && nhi->family != family) return true; dev = nhi->fib_nhc.nhc_dev; if (filter->dev_idx && (!dev || dev->ifindex != filter->dev_idx)) return true; if (filter->master_idx) { struct net_device *master; if (!dev) return true; master = netdev_master_upper_dev_get((struct net_device *)dev); if (!master || master->ifindex != filter->master_idx) return true; } return false; } static int __nh_valid_dump_req(const struct nlmsghdr *nlh, struct nlattr **tb, struct nh_dump_filter *filter, struct netlink_ext_ack *extack) { struct nhmsg *nhm; u32 idx; if (tb[NHA_OIF]) { idx = nla_get_u32(tb[NHA_OIF]); if (idx > INT_MAX) { NL_SET_ERR_MSG(extack, "Invalid device index"); return -EINVAL; } filter->dev_idx = idx; } if (tb[NHA_MASTER]) { idx = nla_get_u32(tb[NHA_MASTER]); if (idx > INT_MAX) { NL_SET_ERR_MSG(extack, "Invalid master device index"); return -EINVAL; } filter->master_idx = idx; } filter->group_filter = nla_get_flag(tb[NHA_GROUPS]); filter->fdb_filter = nla_get_flag(tb[NHA_FDB]); nhm = nlmsg_data(nlh); if (nhm->nh_protocol || nhm->resvd || nhm->nh_scope || nhm->nh_flags) { NL_SET_ERR_MSG(extack, "Invalid values in header for nexthop dump request"); return -EINVAL; } return 0; } static int nh_valid_dump_req(const struct nlmsghdr *nlh, struct nh_dump_filter *filter, struct netlink_callback *cb) { struct nlattr *tb[ARRAY_SIZE(rtm_nh_policy_dump)]; int err; err = nlmsg_parse(nlh, sizeof(struct nhmsg), tb, ARRAY_SIZE(rtm_nh_policy_dump) - 1, rtm_nh_policy_dump, cb->extack); if (err < 0) return err; filter->op_flags = nla_get_u32_default(tb[NHA_OP_FLAGS], 0); 
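	/* NHA_OP_FLAGS is optional for dump requests; when userspace does
	 * not send it, op_flags stays 0 and no optional extras (such as
	 * statistics) are requested.
	 */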
return __nh_valid_dump_req(nlh, tb, filter, cb->extack); } struct rtm_dump_nh_ctx { u32 idx; }; static struct rtm_dump_nh_ctx * rtm_dump_nh_ctx(struct netlink_callback *cb) { struct rtm_dump_nh_ctx *ctx = (void *)cb->ctx; BUILD_BUG_ON(sizeof(*ctx) > sizeof(cb->ctx)); return ctx; } static int rtm_dump_walk_nexthops(struct sk_buff *skb, struct netlink_callback *cb, struct rb_root *root, struct rtm_dump_nh_ctx *ctx, int (*nh_cb)(struct sk_buff *skb, struct netlink_callback *cb, struct nexthop *nh, void *data), void *data) { struct rb_node *node; int s_idx; int err; s_idx = ctx->idx; for (node = rb_first(root); node; node = rb_next(node)) { struct nexthop *nh; nh = rb_entry(node, struct nexthop, rb_node); if (nh->id < s_idx) continue; ctx->idx = nh->id; err = nh_cb(skb, cb, nh, data); if (err) return err; } return 0; } static int rtm_dump_nexthop_cb(struct sk_buff *skb, struct netlink_callback *cb, struct nexthop *nh, void *data) { struct nhmsg *nhm = nlmsg_data(cb->nlh); struct nh_dump_filter *filter = data; if (nh_dump_filtered(nh, filter, nhm->nh_family)) return 0; return nh_fill_node(skb, nh, RTM_NEWNEXTHOP, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, filter->op_flags); } /* rtnl */ static int rtm_dump_nexthop(struct sk_buff *skb, struct netlink_callback *cb) { struct rtm_dump_nh_ctx *ctx = rtm_dump_nh_ctx(cb); struct net *net = sock_net(skb->sk); struct rb_root *root = &net->nexthop.rb_root; struct nh_dump_filter filter = {}; int err; err = nh_valid_dump_req(cb->nlh, &filter, cb); if (err < 0) return err; err = rtm_dump_walk_nexthops(skb, cb, root, ctx, &rtm_dump_nexthop_cb, &filter); cb->seq = net->nexthop.seq; nl_dump_check_consistent(cb, nlmsg_hdr(skb)); return err; } static struct nexthop * nexthop_find_group_resilient(struct net *net, u32 id, struct netlink_ext_ack *extack) { struct nh_group *nhg; struct nexthop *nh; nh = nexthop_find_by_id(net, id); if (!nh) return ERR_PTR(-ENOENT); if (!nh->is_group) { NL_SET_ERR_MSG(extack, "Not a nexthop group"); return ERR_PTR(-EINVAL); } nhg = rtnl_dereference(nh->nh_grp); if (!nhg->resilient) { NL_SET_ERR_MSG(extack, "Nexthop group not of type resilient"); return ERR_PTR(-EINVAL); } return nh; } static int nh_valid_dump_nhid(struct nlattr *attr, u32 *nh_id_p, struct netlink_ext_ack *extack) { u32 idx; if (attr) { idx = nla_get_u32(attr); if (!idx) { NL_SET_ERR_MSG(extack, "Invalid nexthop id"); return -EINVAL; } *nh_id_p = idx; } else { *nh_id_p = 0; } return 0; } static int nh_valid_dump_bucket_req(const struct nlmsghdr *nlh, struct nh_dump_filter *filter, struct netlink_callback *cb) { struct nlattr *res_tb[ARRAY_SIZE(rtm_nh_res_bucket_policy_dump)]; struct nlattr *tb[ARRAY_SIZE(rtm_nh_policy_dump_bucket)]; int err; err = nlmsg_parse(nlh, sizeof(struct nhmsg), tb, ARRAY_SIZE(rtm_nh_policy_dump_bucket) - 1, rtm_nh_policy_dump_bucket, NULL); if (err < 0) return err; err = nh_valid_dump_nhid(tb[NHA_ID], &filter->nh_id, cb->extack); if (err) return err; if (tb[NHA_RES_BUCKET]) { size_t max = ARRAY_SIZE(rtm_nh_res_bucket_policy_dump) - 1; err = nla_parse_nested(res_tb, max, tb[NHA_RES_BUCKET], rtm_nh_res_bucket_policy_dump, cb->extack); if (err < 0) return err; err = nh_valid_dump_nhid(res_tb[NHA_RES_BUCKET_NH_ID], &filter->res_bucket_nh_id, cb->extack); if (err) return err; } return __nh_valid_dump_req(nlh, tb, filter, cb->extack); } struct rtm_dump_res_bucket_ctx { struct rtm_dump_nh_ctx nh; u16 bucket_index; }; static struct rtm_dump_res_bucket_ctx * rtm_dump_res_bucket_ctx(struct netlink_callback *cb) { struct 
rtm_dump_res_bucket_ctx *ctx = (void *)cb->ctx; BUILD_BUG_ON(sizeof(*ctx) > sizeof(cb->ctx)); return ctx; } struct rtm_dump_nexthop_bucket_data { struct rtm_dump_res_bucket_ctx *ctx; struct nh_dump_filter filter; }; static int rtm_dump_nexthop_bucket_nh(struct sk_buff *skb, struct netlink_callback *cb, struct nexthop *nh, struct rtm_dump_nexthop_bucket_data *dd) { u32 portid = NETLINK_CB(cb->skb).portid; struct nhmsg *nhm = nlmsg_data(cb->nlh); struct nh_res_table *res_table; struct nh_group *nhg; u16 bucket_index; int err; nhg = rtnl_dereference(nh->nh_grp); res_table = rtnl_dereference(nhg->res_table); for (bucket_index = dd->ctx->bucket_index; bucket_index < res_table->num_nh_buckets; bucket_index++) { struct nh_res_bucket *bucket; struct nh_grp_entry *nhge; bucket = &res_table->nh_buckets[bucket_index]; nhge = rtnl_dereference(bucket->nh_entry); if (nh_dump_filtered(nhge->nh, &dd->filter, nhm->nh_family)) continue; if (dd->filter.res_bucket_nh_id && dd->filter.res_bucket_nh_id != nhge->nh->id) continue; dd->ctx->bucket_index = bucket_index; err = nh_fill_res_bucket(skb, nh, bucket, bucket_index, RTM_NEWNEXTHOPBUCKET, portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->extack); if (err) return err; } dd->ctx->bucket_index = 0; return 0; } static int rtm_dump_nexthop_bucket_cb(struct sk_buff *skb, struct netlink_callback *cb, struct nexthop *nh, void *data) { struct rtm_dump_nexthop_bucket_data *dd = data; struct nh_group *nhg; if (!nh->is_group) return 0; nhg = rtnl_dereference(nh->nh_grp); if (!nhg->resilient) return 0; return rtm_dump_nexthop_bucket_nh(skb, cb, nh, dd); } /* rtnl */ static int rtm_dump_nexthop_bucket(struct sk_buff *skb, struct netlink_callback *cb) { struct rtm_dump_res_bucket_ctx *ctx = rtm_dump_res_bucket_ctx(cb); struct rtm_dump_nexthop_bucket_data dd = { .ctx = ctx }; struct net *net = sock_net(skb->sk); struct nexthop *nh; int err; err = nh_valid_dump_bucket_req(cb->nlh, &dd.filter, cb); if (err) return err; if (dd.filter.nh_id) { nh = nexthop_find_group_resilient(net, dd.filter.nh_id, cb->extack); if (IS_ERR(nh)) return PTR_ERR(nh); err = rtm_dump_nexthop_bucket_nh(skb, cb, nh, &dd); } else { struct rb_root *root = &net->nexthop.rb_root; err = rtm_dump_walk_nexthops(skb, cb, root, &ctx->nh, &rtm_dump_nexthop_bucket_cb, &dd); } cb->seq = net->nexthop.seq; nl_dump_check_consistent(cb, nlmsg_hdr(skb)); return err; } static int nh_valid_get_bucket_req_res_bucket(struct nlattr *res, u16 *bucket_index, struct netlink_ext_ack *extack) { struct nlattr *tb[ARRAY_SIZE(rtm_nh_res_bucket_policy_get)]; int err; err = nla_parse_nested(tb, ARRAY_SIZE(rtm_nh_res_bucket_policy_get) - 1, res, rtm_nh_res_bucket_policy_get, extack); if (err < 0) return err; if (!tb[NHA_RES_BUCKET_INDEX]) { NL_SET_ERR_MSG(extack, "Bucket index is missing"); return -EINVAL; } *bucket_index = nla_get_u16(tb[NHA_RES_BUCKET_INDEX]); return 0; } static int nh_valid_get_bucket_req(const struct nlmsghdr *nlh, u32 *id, u16 *bucket_index, struct netlink_ext_ack *extack) { struct nlattr *tb[ARRAY_SIZE(rtm_nh_policy_get_bucket)]; int err; err = nlmsg_parse(nlh, sizeof(struct nhmsg), tb, ARRAY_SIZE(rtm_nh_policy_get_bucket) - 1, rtm_nh_policy_get_bucket, extack); if (err < 0) return err; err = nh_valid_get_del_req(nlh, tb, id, NULL, extack); if (err) return err; if (!tb[NHA_RES_BUCKET]) { NL_SET_ERR_MSG(extack, "Bucket information is missing"); return -EINVAL; } err = nh_valid_get_bucket_req_res_bucket(tb[NHA_RES_BUCKET], bucket_index, extack); if (err) return err; return 0; } /* rtnl */ static int 
rtm_get_nexthop_bucket(struct sk_buff *in_skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(in_skb->sk); struct nh_res_table *res_table; struct sk_buff *skb = NULL; struct nh_group *nhg; struct nexthop *nh; u16 bucket_index; int err; u32 id; err = nh_valid_get_bucket_req(nlh, &id, &bucket_index, extack); if (err) return err; nh = nexthop_find_group_resilient(net, id, extack); if (IS_ERR(nh)) return PTR_ERR(nh); nhg = rtnl_dereference(nh->nh_grp); res_table = rtnl_dereference(nhg->res_table); if (bucket_index >= res_table->num_nh_buckets) { NL_SET_ERR_MSG(extack, "Bucket index out of bounds"); return -ENOENT; } skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb) return -ENOBUFS; err = nh_fill_res_bucket(skb, nh, &res_table->nh_buckets[bucket_index], bucket_index, RTM_NEWNEXTHOPBUCKET, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, 0, extack); if (err < 0) { WARN_ON(err == -EMSGSIZE); goto errout_free; } return rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); errout_free: kfree_skb(skb); return err; } static void nexthop_sync_mtu(struct net_device *dev, u32 orig_mtu) { unsigned int hash = nh_dev_hashfn(dev->ifindex); struct net *net = dev_net(dev); struct hlist_head *head = &net->nexthop.devhash[hash]; struct hlist_node *n; struct nh_info *nhi; hlist_for_each_entry_safe(nhi, n, head, dev_hash) { if (nhi->fib_nhc.nhc_dev == dev) { if (nhi->family == AF_INET) fib_nhc_update_mtu(&nhi->fib_nhc, dev->mtu, orig_mtu); } } } /* rtnl */ static int nh_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct netdev_notifier_info_ext *info_ext; switch (event) { case NETDEV_DOWN: case NETDEV_UNREGISTER: nexthop_flush_dev(dev, event); break; case NETDEV_CHANGE: if (!(dev_get_flags(dev) & (IFF_RUNNING | IFF_LOWER_UP))) nexthop_flush_dev(dev, event); break; case NETDEV_CHANGEMTU: info_ext = ptr; nexthop_sync_mtu(dev, info_ext->ext.mtu); rt_cache_flush(dev_net(dev)); break; } return NOTIFY_DONE; } static struct notifier_block nh_netdev_notifier = { .notifier_call = nh_netdev_event, }; static int nexthops_dump(struct net *net, struct notifier_block *nb, enum nexthop_event_type event_type, struct netlink_ext_ack *extack) { struct rb_root *root = &net->nexthop.rb_root; struct rb_node *node; int err = 0; for (node = rb_first(root); node; node = rb_next(node)) { struct nexthop *nh; nh = rb_entry(node, struct nexthop, rb_node); err = call_nexthop_notifier(nb, net, event_type, nh, extack); if (err) break; } return err; } int register_nexthop_notifier(struct net *net, struct notifier_block *nb, struct netlink_ext_ack *extack) { int err; rtnl_lock(); err = nexthops_dump(net, nb, NEXTHOP_EVENT_REPLACE, extack); if (err) goto unlock; err = blocking_notifier_chain_register(&net->nexthop.notifier_chain, nb); unlock: rtnl_unlock(); return err; } EXPORT_SYMBOL(register_nexthop_notifier); int __unregister_nexthop_notifier(struct net *net, struct notifier_block *nb) { int err; err = blocking_notifier_chain_unregister(&net->nexthop.notifier_chain, nb); if (!err) nexthops_dump(net, nb, NEXTHOP_EVENT_DEL, NULL); return err; } EXPORT_SYMBOL(__unregister_nexthop_notifier); int unregister_nexthop_notifier(struct net *net, struct notifier_block *nb) { int err; rtnl_lock(); err = __unregister_nexthop_notifier(net, nb); rtnl_unlock(); return err; } EXPORT_SYMBOL(unregister_nexthop_notifier); void nexthop_set_hw_flags(struct net *net, u32 id, bool offload, bool trap) { struct nexthop *nexthop; rcu_read_lock(); 
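	/* Look the nexthop up under RCU and rewrite its hardware flags:
	 * both RTNH_F_OFFLOAD and RTNH_F_TRAP are cleared first, then
	 * whichever state the driver reported is set again.
	 */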
	nexthop = nexthop_find_by_id(net, id);
	if (!nexthop)
		goto out;

	nexthop->nh_flags &= ~(RTNH_F_OFFLOAD | RTNH_F_TRAP);
	if (offload)
		nexthop->nh_flags |= RTNH_F_OFFLOAD;
	if (trap)
		nexthop->nh_flags |= RTNH_F_TRAP;

out:
	rcu_read_unlock();
}
EXPORT_SYMBOL(nexthop_set_hw_flags);

void nexthop_bucket_set_hw_flags(struct net *net, u32 id, u16 bucket_index,
				 bool offload, bool trap)
{
	struct nh_res_table *res_table;
	struct nh_res_bucket *bucket;
	struct nexthop *nexthop;
	struct nh_group *nhg;

	rcu_read_lock();

	nexthop = nexthop_find_by_id(net, id);
	if (!nexthop || !nexthop->is_group)
		goto out;

	nhg = rcu_dereference(nexthop->nh_grp);
	if (!nhg->resilient)
		goto out;

	/* res_table is an __rcu pointer; dereference it before using it
	 * for the bounds check.
	 */
	res_table = rcu_dereference(nhg->res_table);
	if (bucket_index >= res_table->num_nh_buckets)
		goto out;

	bucket = &res_table->nh_buckets[bucket_index];
	bucket->nh_flags &= ~(RTNH_F_OFFLOAD | RTNH_F_TRAP);
	if (offload)
		bucket->nh_flags |= RTNH_F_OFFLOAD;
	if (trap)
		bucket->nh_flags |= RTNH_F_TRAP;

out:
	rcu_read_unlock();
}
EXPORT_SYMBOL(nexthop_bucket_set_hw_flags);

void nexthop_res_grp_activity_update(struct net *net, u32 id, u16 num_buckets,
				     unsigned long *activity)
{
	struct nh_res_table *res_table;
	struct nexthop *nexthop;
	struct nh_group *nhg;
	u16 i;

	rcu_read_lock();

	nexthop = nexthop_find_by_id(net, id);
	if (!nexthop || !nexthop->is_group)
		goto out;

	nhg = rcu_dereference(nexthop->nh_grp);
	if (!nhg->resilient)
		goto out;

	/* Instead of silently ignoring some buckets, demand that the sizes
	 * be the same.
	 */
	res_table = rcu_dereference(nhg->res_table);
	if (num_buckets != res_table->num_nh_buckets)
		goto out;

	for (i = 0; i < num_buckets; i++) {
		if (test_bit(i, activity))
			nh_res_bucket_set_busy(&res_table->nh_buckets[i]);
	}

out:
	rcu_read_unlock();
}
EXPORT_SYMBOL(nexthop_res_grp_activity_update);

static void __net_exit nexthop_net_exit_batch_rtnl(struct list_head *net_list,
						   struct list_head *dev_to_kill)
{
	struct net *net;

	ASSERT_RTNL();
	list_for_each_entry(net, net_list, exit_list)
		flush_all_nexthops(net);
}

static void __net_exit nexthop_net_exit(struct net *net)
{
	kfree(net->nexthop.devhash);
	net->nexthop.devhash = NULL;
}

static int __net_init nexthop_net_init(struct net *net)
{
	size_t sz = sizeof(struct hlist_head) * NH_DEV_HASHSIZE;

	net->nexthop.rb_root = RB_ROOT;
	net->nexthop.devhash = kzalloc(sz, GFP_KERNEL);
	if (!net->nexthop.devhash)
		return -ENOMEM;
	BLOCKING_INIT_NOTIFIER_HEAD(&net->nexthop.notifier_chain);
	return 0;
}

static struct pernet_operations nexthop_net_ops = {
	.init = nexthop_net_init,
	.exit = nexthop_net_exit,
	.exit_batch_rtnl = nexthop_net_exit_batch_rtnl,
};

static const struct rtnl_msg_handler nexthop_rtnl_msg_handlers[] __initconst = {
	{.msgtype = RTM_NEWNEXTHOP, .doit = rtm_new_nexthop},
	{.msgtype = RTM_DELNEXTHOP, .doit = rtm_del_nexthop},
	{.msgtype = RTM_GETNEXTHOP, .doit = rtm_get_nexthop,
	 .dumpit = rtm_dump_nexthop},
	{.msgtype = RTM_GETNEXTHOPBUCKET, .doit = rtm_get_nexthop_bucket,
	 .dumpit = rtm_dump_nexthop_bucket},
	{.protocol = PF_INET, .msgtype = RTM_NEWNEXTHOP,
	 .doit = rtm_new_nexthop},
	{.protocol = PF_INET, .msgtype = RTM_GETNEXTHOP,
	 .dumpit = rtm_dump_nexthop},
	{.protocol = PF_INET6, .msgtype = RTM_NEWNEXTHOP,
	 .doit = rtm_new_nexthop},
	{.protocol = PF_INET6, .msgtype = RTM_GETNEXTHOP,
	 .dumpit = rtm_dump_nexthop},
};

static int __init nexthop_init(void)
{
	register_pernet_subsys(&nexthop_net_ops);
	register_netdevice_notifier(&nh_netdev_notifier);

	rtnl_register_many(nexthop_rtnl_msg_handlers);

	return 0;
}
subsys_initcall(nexthop_init);
// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause)
/*
 * bcm.c - Broadcast Manager to filter/send (cyclic) CAN content
 *
 * Copyright (c) 2002-2017 Volkswagen Group Electronic Research
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of Volkswagen nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * Alternatively, provided that this notice is retained in full, this
 * software may be distributed under the terms of the GNU General
 * Public License ("GPL") version 2, in which case the provisions of the
 * GPL apply INSTEAD OF those given above.
 *
 * The provided data structures and external interfaces from this code
 * are not restricted to be used by modules with a GPL compatible license.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
 * DAMAGE.
* */ #include <linux/module.h> #include <linux/init.h> #include <linux/interrupt.h> #include <linux/hrtimer.h> #include <linux/list.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/uio.h> #include <linux/net.h> #include <linux/netdevice.h> #include <linux/socket.h> #include <linux/if_arp.h> #include <linux/skbuff.h> #include <linux/can.h> #include <linux/can/core.h> #include <linux/can/skb.h> #include <linux/can/bcm.h> #include <linux/slab.h> #include <net/sock.h> #include <net/net_namespace.h> /* * To send multiple CAN frame content within TX_SETUP or to filter * CAN messages with multiplex index within RX_SETUP, the number of * different filters is limited to 256 due to the one byte index value. */ #define MAX_NFRAMES 256 /* limit timers to 400 days for sending/timeouts */ #define BCM_TIMER_SEC_MAX (400 * 24 * 60 * 60) /* use of last_frames[index].flags */ #define RX_LOCAL 0x10 /* frame was created on the local host */ #define RX_OWN 0x20 /* frame was sent via the socket it was received on */ #define RX_RECV 0x40 /* received data for this element */ #define RX_THR 0x80 /* element not been sent due to throttle feature */ #define BCM_CAN_FLAGS_MASK 0x0F /* to clean private flags after usage */ /* get best masking value for can_rx_register() for a given single can_id */ #define REGMASK(id) ((id & CAN_EFF_FLAG) ? \ (CAN_EFF_MASK | CAN_EFF_FLAG | CAN_RTR_FLAG) : \ (CAN_SFF_MASK | CAN_EFF_FLAG | CAN_RTR_FLAG)) MODULE_DESCRIPTION("PF_CAN broadcast manager protocol"); MODULE_LICENSE("Dual BSD/GPL"); MODULE_AUTHOR("Oliver Hartkopp <oliver.hartkopp@volkswagen.de>"); MODULE_ALIAS("can-proto-2"); #define BCM_MIN_NAMELEN CAN_REQUIRED_SIZE(struct sockaddr_can, can_ifindex) /* * easy access to the first 64 bit of can(fd)_frame payload. cp->data is * 64 bit aligned so the offset has to be multiples of 8 which is ensured * by the only callers in bcm_rx_cmp_to_index() bcm_rx_handler(). */ static inline u64 get_u64(const struct canfd_frame *cp, int offset) { return *(u64 *)(cp->data + offset); } struct bcm_op { struct list_head list; struct rcu_head rcu; int ifindex; canid_t can_id; u32 flags; unsigned long frames_abs, frames_filtered; struct bcm_timeval ival1, ival2; struct hrtimer timer, thrtimer; ktime_t rx_stamp, kt_ival1, kt_ival2, kt_lastmsg; int rx_ifindex; int cfsiz; u32 count; u32 nframes; u32 currframe; /* void pointers to arrays of struct can[fd]_frame */ void *frames; void *last_frames; struct canfd_frame sframe; struct canfd_frame last_sframe; struct sock *sk; struct net_device *rx_reg_dev; }; struct bcm_sock { struct sock sk; int bound; int ifindex; struct list_head notifier; struct list_head rx_ops; struct list_head tx_ops; unsigned long dropped_usr_msgs; struct proc_dir_entry *bcm_proc_read; char procname [32]; /* inode number in decimal with \0 */ }; static LIST_HEAD(bcm_notifier_list); static DEFINE_SPINLOCK(bcm_notifier_lock); static struct bcm_sock *bcm_busy_notifier; /* Return pointer to store the extra msg flags for bcm_recvmsg(). * We use the space of one unsigned int beyond the 'struct sockaddr_can' * in skb->cb. 
*/ static inline unsigned int *bcm_flags(struct sk_buff *skb) { /* return pointer after struct sockaddr_can */ return (unsigned int *)(&((struct sockaddr_can *)skb->cb)[1]); } static inline struct bcm_sock *bcm_sk(const struct sock *sk) { return (struct bcm_sock *)sk; } static inline ktime_t bcm_timeval_to_ktime(struct bcm_timeval tv) { return ktime_set(tv.tv_sec, tv.tv_usec * NSEC_PER_USEC); } /* check limitations for timeval provided by user */ static bool bcm_is_invalid_tv(struct bcm_msg_head *msg_head) { if ((msg_head->ival1.tv_sec < 0) || (msg_head->ival1.tv_sec > BCM_TIMER_SEC_MAX) || (msg_head->ival1.tv_usec < 0) || (msg_head->ival1.tv_usec >= USEC_PER_SEC) || (msg_head->ival2.tv_sec < 0) || (msg_head->ival2.tv_sec > BCM_TIMER_SEC_MAX) || (msg_head->ival2.tv_usec < 0) || (msg_head->ival2.tv_usec >= USEC_PER_SEC)) return true; return false; } #define CFSIZ(flags) ((flags & CAN_FD_FRAME) ? CANFD_MTU : CAN_MTU) #define OPSIZ sizeof(struct bcm_op) #define MHSIZ sizeof(struct bcm_msg_head) /* * procfs functions */ #if IS_ENABLED(CONFIG_PROC_FS) static char *bcm_proc_getifname(struct net *net, char *result, int ifindex) { struct net_device *dev; if (!ifindex) return "any"; rcu_read_lock(); dev = dev_get_by_index_rcu(net, ifindex); if (dev) strcpy(result, dev->name); else strcpy(result, "???"); rcu_read_unlock(); return result; } static int bcm_proc_show(struct seq_file *m, void *v) { char ifname[IFNAMSIZ]; struct net *net = m->private; struct sock *sk = (struct sock *)pde_data(m->file->f_inode); struct bcm_sock *bo = bcm_sk(sk); struct bcm_op *op; seq_printf(m, ">>> socket %pK", sk->sk_socket); seq_printf(m, " / sk %pK", sk); seq_printf(m, " / bo %pK", bo); seq_printf(m, " / dropped %lu", bo->dropped_usr_msgs); seq_printf(m, " / bound %s", bcm_proc_getifname(net, ifname, bo->ifindex)); seq_printf(m, " <<<\n"); list_for_each_entry(op, &bo->rx_ops, list) { unsigned long reduction; /* print only active entries & prevent division by zero */ if (!op->frames_abs) continue; seq_printf(m, "rx_op: %03X %-5s ", op->can_id, bcm_proc_getifname(net, ifname, op->ifindex)); if (op->flags & CAN_FD_FRAME) seq_printf(m, "(%u)", op->nframes); else seq_printf(m, "[%u]", op->nframes); seq_printf(m, "%c ", (op->flags & RX_CHECK_DLC) ? 'd' : ' '); if (op->kt_ival1) seq_printf(m, "timeo=%lld ", (long long)ktime_to_us(op->kt_ival1)); if (op->kt_ival2) seq_printf(m, "thr=%lld ", (long long)ktime_to_us(op->kt_ival2)); seq_printf(m, "# recv %ld (%ld) => reduction: ", op->frames_filtered, op->frames_abs); reduction = 100 - (op->frames_filtered * 100) / op->frames_abs; seq_printf(m, "%s%ld%%\n", (reduction == 100) ? "near " : "", reduction); } list_for_each_entry(op, &bo->tx_ops, list) { seq_printf(m, "tx_op: %03X %s ", op->can_id, bcm_proc_getifname(net, ifname, op->ifindex)); if (op->flags & CAN_FD_FRAME) seq_printf(m, "(%u) ", op->nframes); else seq_printf(m, "[%u] ", op->nframes); if (op->kt_ival1) seq_printf(m, "t1=%lld ", (long long)ktime_to_us(op->kt_ival1)); if (op->kt_ival2) seq_printf(m, "t2=%lld ", (long long)ktime_to_us(op->kt_ival2)); seq_printf(m, "# sent %ld\n", op->frames_abs); } seq_putc(m, '\n'); return 0; } #endif /* CONFIG_PROC_FS */ /* * bcm_can_tx - send the (next) CAN frame to the appropriate CAN interface * of the given bcm tx op */ static void bcm_can_tx(struct bcm_op *op) { struct sk_buff *skb; struct net_device *dev; struct canfd_frame *cf = op->frames + op->cfsiz * op->currframe; int err; /* no target device? 
=> exit */ if (!op->ifindex) return; dev = dev_get_by_index(sock_net(op->sk), op->ifindex); if (!dev) { /* RFC: should this bcm_op remove itself here? */ return; } skb = alloc_skb(op->cfsiz + sizeof(struct can_skb_priv), gfp_any()); if (!skb) goto out; can_skb_reserve(skb); can_skb_prv(skb)->ifindex = dev->ifindex; can_skb_prv(skb)->skbcnt = 0; skb_put_data(skb, cf, op->cfsiz); /* send with loopback */ skb->dev = dev; can_skb_set_owner(skb, op->sk); err = can_send(skb, 1); if (!err) op->frames_abs++; op->currframe++; /* reached last frame? */ if (op->currframe >= op->nframes) op->currframe = 0; out: dev_put(dev); } /* * bcm_send_to_user - send a BCM message to the userspace * (consisting of bcm_msg_head + x CAN frames) */ static void bcm_send_to_user(struct bcm_op *op, struct bcm_msg_head *head, struct canfd_frame *frames, int has_timestamp) { struct sk_buff *skb; struct canfd_frame *firstframe; struct sockaddr_can *addr; struct sock *sk = op->sk; unsigned int datalen = head->nframes * op->cfsiz; int err; unsigned int *pflags; skb = alloc_skb(sizeof(*head) + datalen, gfp_any()); if (!skb) return; skb_put_data(skb, head, sizeof(*head)); /* ensure space for sockaddr_can and msg flags */ sock_skb_cb_check_size(sizeof(struct sockaddr_can) + sizeof(unsigned int)); /* initialize msg flags */ pflags = bcm_flags(skb); *pflags = 0; if (head->nframes) { /* CAN frames starting here */ firstframe = (struct canfd_frame *)skb_tail_pointer(skb); skb_put_data(skb, frames, datalen); /* * the BCM uses the flags-element of the canfd_frame * structure for internal purposes. This is only * relevant for updates that are generated by the * BCM, where nframes is 1 */ if (head->nframes == 1) { if (firstframe->flags & RX_LOCAL) *pflags |= MSG_DONTROUTE; if (firstframe->flags & RX_OWN) *pflags |= MSG_CONFIRM; firstframe->flags &= BCM_CAN_FLAGS_MASK; } } if (has_timestamp) { /* restore rx timestamp */ skb->tstamp = op->rx_stamp; } /* * Put the datagram to the queue so that bcm_recvmsg() can * get it from there. We need to pass the interface index to * bcm_recvmsg(). We pass a whole struct sockaddr_can in skb->cb * containing the interface index. 
*/ addr = (struct sockaddr_can *)skb->cb; memset(addr, 0, sizeof(*addr)); addr->can_family = AF_CAN; addr->can_ifindex = op->rx_ifindex; err = sock_queue_rcv_skb(sk, skb); if (err < 0) { struct bcm_sock *bo = bcm_sk(sk); kfree_skb(skb); /* don't care about overflows in this statistic */ bo->dropped_usr_msgs++; } } static bool bcm_tx_set_expiry(struct bcm_op *op, struct hrtimer *hrt) { ktime_t ival; if (op->kt_ival1 && op->count) ival = op->kt_ival1; else if (op->kt_ival2) ival = op->kt_ival2; else return false; hrtimer_set_expires(hrt, ktime_add(ktime_get(), ival)); return true; } static void bcm_tx_start_timer(struct bcm_op *op) { if (bcm_tx_set_expiry(op, &op->timer)) hrtimer_start_expires(&op->timer, HRTIMER_MODE_ABS_SOFT); } /* bcm_tx_timeout_handler - performs cyclic CAN frame transmissions */ static enum hrtimer_restart bcm_tx_timeout_handler(struct hrtimer *hrtimer) { struct bcm_op *op = container_of(hrtimer, struct bcm_op, timer); struct bcm_msg_head msg_head; if (op->kt_ival1 && (op->count > 0)) { op->count--; if (!op->count && (op->flags & TX_COUNTEVT)) { /* create notification to user */ memset(&msg_head, 0, sizeof(msg_head)); msg_head.opcode = TX_EXPIRED; msg_head.flags = op->flags; msg_head.count = op->count; msg_head.ival1 = op->ival1; msg_head.ival2 = op->ival2; msg_head.can_id = op->can_id; msg_head.nframes = 0; bcm_send_to_user(op, &msg_head, NULL, 0); } bcm_can_tx(op); } else if (op->kt_ival2) { bcm_can_tx(op); } return bcm_tx_set_expiry(op, &op->timer) ? HRTIMER_RESTART : HRTIMER_NORESTART; } /* * bcm_rx_changed - create a RX_CHANGED notification due to changed content */ static void bcm_rx_changed(struct bcm_op *op, struct canfd_frame *data) { struct bcm_msg_head head; /* update statistics */ op->frames_filtered++; /* prevent statistics overflow */ if (op->frames_filtered > ULONG_MAX/100) op->frames_filtered = op->frames_abs = 0; /* this element is not throttled anymore */ data->flags &= ~RX_THR; memset(&head, 0, sizeof(head)); head.opcode = RX_CHANGED; head.flags = op->flags; head.count = op->count; head.ival1 = op->ival1; head.ival2 = op->ival2; head.can_id = op->can_id; head.nframes = 1; bcm_send_to_user(op, &head, data, 1); } /* * bcm_rx_update_and_send - process a detected relevant receive content change * 1. update the last received data * 2. send a notification to the user (if possible) */ static void bcm_rx_update_and_send(struct bcm_op *op, struct canfd_frame *lastdata, const struct canfd_frame *rxdata, unsigned char traffic_flags) { memcpy(lastdata, rxdata, op->cfsiz); /* mark as used and throttled by default */ lastdata->flags |= (RX_RECV|RX_THR); /* add own/local/remote traffic flags */ lastdata->flags |= traffic_flags; /* throttling mode inactive ? */ if (!op->kt_ival2) { /* send RX_CHANGED to the user immediately */ bcm_rx_changed(op, lastdata); return; } /* with active throttling timer we are just done here */ if (hrtimer_active(&op->thrtimer)) return; /* first reception with enabled throttling mode */ if (!op->kt_lastmsg) goto rx_changed_settime; /* got a second frame inside a potential throttle period? 
*/ if (ktime_us_delta(ktime_get(), op->kt_lastmsg) < ktime_to_us(op->kt_ival2)) { /* do not send the saved data - only start throttle timer */ hrtimer_start(&op->thrtimer, ktime_add(op->kt_lastmsg, op->kt_ival2), HRTIMER_MODE_ABS_SOFT); return; } /* the gap was that big, that throttling was not needed here */ rx_changed_settime: bcm_rx_changed(op, lastdata); op->kt_lastmsg = ktime_get(); } /* * bcm_rx_cmp_to_index - (bit)compares the currently received data to formerly * received data stored in op->last_frames[] */ static void bcm_rx_cmp_to_index(struct bcm_op *op, unsigned int index, const struct canfd_frame *rxdata, unsigned char traffic_flags) { struct canfd_frame *cf = op->frames + op->cfsiz * index; struct canfd_frame *lcf = op->last_frames + op->cfsiz * index; int i; /* * no one uses the MSBs of flags for comparison, * so we use it here to detect the first time of reception */ if (!(lcf->flags & RX_RECV)) { /* received data for the first time => send update to user */ bcm_rx_update_and_send(op, lcf, rxdata, traffic_flags); return; } /* do a real check in CAN frame data section */ for (i = 0; i < rxdata->len; i += 8) { if ((get_u64(cf, i) & get_u64(rxdata, i)) != (get_u64(cf, i) & get_u64(lcf, i))) { bcm_rx_update_and_send(op, lcf, rxdata, traffic_flags); return; } } if (op->flags & RX_CHECK_DLC) { /* do a real check in CAN frame length */ if (rxdata->len != lcf->len) { bcm_rx_update_and_send(op, lcf, rxdata, traffic_flags); return; } } } /* * bcm_rx_starttimer - enable timeout monitoring for CAN frame reception */ static void bcm_rx_starttimer(struct bcm_op *op) { if (op->flags & RX_NO_AUTOTIMER) return; if (op->kt_ival1) hrtimer_start(&op->timer, op->kt_ival1, HRTIMER_MODE_REL_SOFT); } /* bcm_rx_timeout_handler - when the (cyclic) CAN frame reception timed out */ static enum hrtimer_restart bcm_rx_timeout_handler(struct hrtimer *hrtimer) { struct bcm_op *op = container_of(hrtimer, struct bcm_op, timer); struct bcm_msg_head msg_head; /* if user wants to be informed, when cyclic CAN-Messages come back */ if ((op->flags & RX_ANNOUNCE_RESUME) && op->last_frames) { /* clear received CAN frames to indicate 'nothing received' */ memset(op->last_frames, 0, op->nframes * op->cfsiz); } /* create notification to user */ memset(&msg_head, 0, sizeof(msg_head)); msg_head.opcode = RX_TIMEOUT; msg_head.flags = op->flags; msg_head.count = op->count; msg_head.ival1 = op->ival1; msg_head.ival2 = op->ival2; msg_head.can_id = op->can_id; msg_head.nframes = 0; bcm_send_to_user(op, &msg_head, NULL, 0); return HRTIMER_NORESTART; } /* * bcm_rx_do_flush - helper for bcm_rx_thr_flush */ static inline int bcm_rx_do_flush(struct bcm_op *op, unsigned int index) { struct canfd_frame *lcf = op->last_frames + op->cfsiz * index; if ((op->last_frames) && (lcf->flags & RX_THR)) { bcm_rx_changed(op, lcf); return 1; } return 0; } /* * bcm_rx_thr_flush - Check for throttled data and send it to the userspace */ static int bcm_rx_thr_flush(struct bcm_op *op) { int updated = 0; if (op->nframes > 1) { unsigned int i; /* for MUX filter we start at index 1 */ for (i = 1; i < op->nframes; i++) updated += bcm_rx_do_flush(op, i); } else { /* for RX_FILTER_ID and simple filter */ updated += bcm_rx_do_flush(op, 0); } return updated; } /* * bcm_rx_thr_handler - the time for blocked content updates is over now: * Check for throttled data and send it to the userspace */ static enum hrtimer_restart bcm_rx_thr_handler(struct hrtimer *hrtimer) { struct bcm_op *op = container_of(hrtimer, struct bcm_op, thrtimer); if (bcm_rx_thr_flush(op)) 
{ hrtimer_forward_now(hrtimer, op->kt_ival2); return HRTIMER_RESTART; } else { /* rearm throttle handling */ op->kt_lastmsg = 0; return HRTIMER_NORESTART; } } /* * bcm_rx_handler - handle a CAN frame reception */ static void bcm_rx_handler(struct sk_buff *skb, void *data) { struct bcm_op *op = (struct bcm_op *)data; const struct canfd_frame *rxframe = (struct canfd_frame *)skb->data; unsigned int i; unsigned char traffic_flags; if (op->can_id != rxframe->can_id) return; /* make sure to handle the correct frame type (CAN / CAN FD) */ if (op->flags & CAN_FD_FRAME) { if (!can_is_canfd_skb(skb)) return; } else { if (!can_is_can_skb(skb)) return; } /* disable timeout */ hrtimer_cancel(&op->timer); /* save rx timestamp */ op->rx_stamp = skb->tstamp; /* save originator for recvfrom() */ op->rx_ifindex = skb->dev->ifindex; /* update statistics */ op->frames_abs++; if (op->flags & RX_RTR_FRAME) { /* send reply for RTR-request (placed in op->frames[0]) */ bcm_can_tx(op); return; } /* compute flags to distinguish between own/local/remote CAN traffic */ traffic_flags = 0; if (skb->sk) { traffic_flags |= RX_LOCAL; if (skb->sk == op->sk) traffic_flags |= RX_OWN; } if (op->flags & RX_FILTER_ID) { /* the easiest case */ bcm_rx_update_and_send(op, op->last_frames, rxframe, traffic_flags); goto rx_starttimer; } if (op->nframes == 1) { /* simple compare with index 0 */ bcm_rx_cmp_to_index(op, 0, rxframe, traffic_flags); goto rx_starttimer; } if (op->nframes > 1) { /* * multiplex compare * * find the first multiplex mask that fits. * Remark: The MUX-mask is stored in index 0 - but only the * first 64 bits of the frame data[] are relevant (CAN FD) */ for (i = 1; i < op->nframes; i++) { if ((get_u64(op->frames, 0) & get_u64(rxframe, 0)) == (get_u64(op->frames, 0) & get_u64(op->frames + op->cfsiz * i, 0))) { bcm_rx_cmp_to_index(op, i, rxframe, traffic_flags); break; } } } rx_starttimer: bcm_rx_starttimer(op); } /* * helpers for bcm_op handling: find & delete bcm [rx|tx] op elements */ static struct bcm_op *bcm_find_op(struct list_head *ops, struct bcm_msg_head *mh, int ifindex) { struct bcm_op *op; list_for_each_entry(op, ops, list) { if ((op->can_id == mh->can_id) && (op->ifindex == ifindex) && (op->flags & CAN_FD_FRAME) == (mh->flags & CAN_FD_FRAME)) return op; } return NULL; } static void bcm_free_op_rcu(struct rcu_head *rcu_head) { struct bcm_op *op = container_of(rcu_head, struct bcm_op, rcu); if ((op->frames) && (op->frames != &op->sframe)) kfree(op->frames); if ((op->last_frames) && (op->last_frames != &op->last_sframe)) kfree(op->last_frames); kfree(op); } static void bcm_remove_op(struct bcm_op *op) { hrtimer_cancel(&op->timer); hrtimer_cancel(&op->thrtimer); call_rcu(&op->rcu, bcm_free_op_rcu); } static void bcm_rx_unreg(struct net_device *dev, struct bcm_op *op) { if (op->rx_reg_dev == dev) { can_rx_unregister(dev_net(dev), dev, op->can_id, REGMASK(op->can_id), bcm_rx_handler, op); /* mark as removed subscription */ op->rx_reg_dev = NULL; } else printk(KERN_ERR "can-bcm: bcm_rx_unreg: registered device " "mismatch %p %p\n", op->rx_reg_dev, dev); } /* * bcm_delete_rx_op - find and remove a rx op (returns number of removed ops) */ static int bcm_delete_rx_op(struct list_head *ops, struct bcm_msg_head *mh, int ifindex) { struct bcm_op *op, *n; list_for_each_entry_safe(op, n, ops, list) { if ((op->can_id == mh->can_id) && (op->ifindex == ifindex) && (op->flags & CAN_FD_FRAME) == (mh->flags & CAN_FD_FRAME)) { /* disable automatic timer on frame reception */ op->flags |= RX_NO_AUTOTIMER; /* * Don't care if 
we're bound or not (due to netdev * problems) can_rx_unregister() is always a save * thing to do here. */ if (op->ifindex) { /* * Only remove subscriptions that had not * been removed due to NETDEV_UNREGISTER * in bcm_notifier() */ if (op->rx_reg_dev) { struct net_device *dev; dev = dev_get_by_index(sock_net(op->sk), op->ifindex); if (dev) { bcm_rx_unreg(dev, op); dev_put(dev); } } } else can_rx_unregister(sock_net(op->sk), NULL, op->can_id, REGMASK(op->can_id), bcm_rx_handler, op); list_del(&op->list); bcm_remove_op(op); return 1; /* done */ } } return 0; /* not found */ } /* * bcm_delete_tx_op - find and remove a tx op (returns number of removed ops) */ static int bcm_delete_tx_op(struct list_head *ops, struct bcm_msg_head *mh, int ifindex) { struct bcm_op *op, *n; list_for_each_entry_safe(op, n, ops, list) { if ((op->can_id == mh->can_id) && (op->ifindex == ifindex) && (op->flags & CAN_FD_FRAME) == (mh->flags & CAN_FD_FRAME)) { list_del(&op->list); bcm_remove_op(op); return 1; /* done */ } } return 0; /* not found */ } /* * bcm_read_op - read out a bcm_op and send it to the user (for bcm_sendmsg) */ static int bcm_read_op(struct list_head *ops, struct bcm_msg_head *msg_head, int ifindex) { struct bcm_op *op = bcm_find_op(ops, msg_head, ifindex); if (!op) return -EINVAL; /* put current values into msg_head */ msg_head->flags = op->flags; msg_head->count = op->count; msg_head->ival1 = op->ival1; msg_head->ival2 = op->ival2; msg_head->nframes = op->nframes; bcm_send_to_user(op, msg_head, op->frames, 0); return MHSIZ; } /* * bcm_tx_setup - create or update a bcm tx op (for bcm_sendmsg) */ static int bcm_tx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg, int ifindex, struct sock *sk) { struct bcm_sock *bo = bcm_sk(sk); struct bcm_op *op; struct canfd_frame *cf; unsigned int i; int err; /* we need a real device to send frames */ if (!ifindex) return -ENODEV; /* check nframes boundaries - we need at least one CAN frame */ if (msg_head->nframes < 1 || msg_head->nframes > MAX_NFRAMES) return -EINVAL; /* check timeval limitations */ if ((msg_head->flags & SETTIMER) && bcm_is_invalid_tv(msg_head)) return -EINVAL; /* check the given can_id */ op = bcm_find_op(&bo->tx_ops, msg_head, ifindex); if (op) { /* update existing BCM operation */ /* * Do we need more space for the CAN frames than currently * allocated? -> This is a _really_ unusual use-case and * therefore (complexity / locking) it is not supported. 
*/ if (msg_head->nframes > op->nframes) return -E2BIG; /* update CAN frames content */ for (i = 0; i < msg_head->nframes; i++) { cf = op->frames + op->cfsiz * i; err = memcpy_from_msg((u8 *)cf, msg, op->cfsiz); if (op->flags & CAN_FD_FRAME) { if (cf->len > 64) err = -EINVAL; } else { if (cf->len > 8) err = -EINVAL; } if (err < 0) return err; if (msg_head->flags & TX_CP_CAN_ID) { /* copy can_id into frame */ cf->can_id = msg_head->can_id; } } op->flags = msg_head->flags; } else { /* insert new BCM operation for the given can_id */ op = kzalloc(OPSIZ, GFP_KERNEL); if (!op) return -ENOMEM; op->can_id = msg_head->can_id; op->cfsiz = CFSIZ(msg_head->flags); op->flags = msg_head->flags; /* create array for CAN frames and copy the data */ if (msg_head->nframes > 1) { op->frames = kmalloc_array(msg_head->nframes, op->cfsiz, GFP_KERNEL); if (!op->frames) { kfree(op); return -ENOMEM; } } else op->frames = &op->sframe; for (i = 0; i < msg_head->nframes; i++) { cf = op->frames + op->cfsiz * i; err = memcpy_from_msg((u8 *)cf, msg, op->cfsiz); if (err < 0) goto free_op; if (op->flags & CAN_FD_FRAME) { if (cf->len > 64) err = -EINVAL; } else { if (cf->len > 8) err = -EINVAL; } if (err < 0) goto free_op; if (msg_head->flags & TX_CP_CAN_ID) { /* copy can_id into frame */ cf->can_id = msg_head->can_id; } } /* tx_ops never compare with previous received messages */ op->last_frames = NULL; /* bcm_can_tx / bcm_tx_timeout_handler needs this */ op->sk = sk; op->ifindex = ifindex; /* initialize uninitialized (kzalloc) structure */ hrtimer_init(&op->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT); op->timer.function = bcm_tx_timeout_handler; /* currently unused in tx_ops */ hrtimer_init(&op->thrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT); /* add this bcm_op to the list of the tx_ops */ list_add(&op->list, &bo->tx_ops); } /* if ((op = bcm_find_op(&bo->tx_ops, msg_head->can_id, ifindex))) */ if (op->nframes != msg_head->nframes) { op->nframes = msg_head->nframes; /* start multiple frame transmission with index 0 */ op->currframe = 0; } /* check flags */ if (op->flags & TX_RESET_MULTI_IDX) { /* start multiple frame transmission with index 0 */ op->currframe = 0; } if (op->flags & SETTIMER) { /* set timer values */ op->count = msg_head->count; op->ival1 = msg_head->ival1; op->ival2 = msg_head->ival2; op->kt_ival1 = bcm_timeval_to_ktime(msg_head->ival1); op->kt_ival2 = bcm_timeval_to_ktime(msg_head->ival2); /* disable an active timer due to zero values? */ if (!op->kt_ival1 && !op->kt_ival2) hrtimer_cancel(&op->timer); } if (op->flags & STARTTIMER) { hrtimer_cancel(&op->timer); /* spec: send CAN frame when starting timer */ op->flags |= TX_ANNOUNCE; } if (op->flags & TX_ANNOUNCE) { bcm_can_tx(op); if (op->count) op->count--; } if (op->flags & STARTTIMER) bcm_tx_start_timer(op); return msg_head->nframes * op->cfsiz + MHSIZ; free_op: if (op->frames != &op->sframe) kfree(op->frames); kfree(op); return err; } /* * bcm_rx_setup - create or update a bcm rx op (for bcm_sendmsg) */ static int bcm_rx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg, int ifindex, struct sock *sk) { struct bcm_sock *bo = bcm_sk(sk); struct bcm_op *op; int do_rx_register; int err = 0; if ((msg_head->flags & RX_FILTER_ID) || (!(msg_head->nframes))) { /* be robust against wrong usage ... 
*/ msg_head->flags |= RX_FILTER_ID; /* ignore trailing garbage */ msg_head->nframes = 0; } /* the first element contains the mux-mask => MAX_NFRAMES + 1 */ if (msg_head->nframes > MAX_NFRAMES + 1) return -EINVAL; if ((msg_head->flags & RX_RTR_FRAME) && ((msg_head->nframes != 1) || (!(msg_head->can_id & CAN_RTR_FLAG)))) return -EINVAL; /* check timeval limitations */ if ((msg_head->flags & SETTIMER) && bcm_is_invalid_tv(msg_head)) return -EINVAL; /* check the given can_id */ op = bcm_find_op(&bo->rx_ops, msg_head, ifindex); if (op) { /* update existing BCM operation */ /* * Do we need more space for the CAN frames than currently * allocated? -> This is a _really_ unusual use-case and * therefore (complexity / locking) it is not supported. */ if (msg_head->nframes > op->nframes) return -E2BIG; if (msg_head->nframes) { /* update CAN frames content */ err = memcpy_from_msg(op->frames, msg, msg_head->nframes * op->cfsiz); if (err < 0) return err; /* clear last_frames to indicate 'nothing received' */ memset(op->last_frames, 0, msg_head->nframes * op->cfsiz); } op->nframes = msg_head->nframes; op->flags = msg_head->flags; /* Only an update -> do not call can_rx_register() */ do_rx_register = 0; } else { /* insert new BCM operation for the given can_id */ op = kzalloc(OPSIZ, GFP_KERNEL); if (!op) return -ENOMEM; op->can_id = msg_head->can_id; op->nframes = msg_head->nframes; op->cfsiz = CFSIZ(msg_head->flags); op->flags = msg_head->flags; if (msg_head->nframes > 1) { /* create array for CAN frames and copy the data */ op->frames = kmalloc_array(msg_head->nframes, op->cfsiz, GFP_KERNEL); if (!op->frames) { kfree(op); return -ENOMEM; } /* create and init array for received CAN frames */ op->last_frames = kcalloc(msg_head->nframes, op->cfsiz, GFP_KERNEL); if (!op->last_frames) { kfree(op->frames); kfree(op); return -ENOMEM; } } else { op->frames = &op->sframe; op->last_frames = &op->last_sframe; } if (msg_head->nframes) { err = memcpy_from_msg(op->frames, msg, msg_head->nframes * op->cfsiz); if (err < 0) { if (op->frames != &op->sframe) kfree(op->frames); if (op->last_frames != &op->last_sframe) kfree(op->last_frames); kfree(op); return err; } } /* bcm_can_tx / bcm_tx_timeout_handler needs this */ op->sk = sk; op->ifindex = ifindex; /* ifindex for timeout events w/o previous frame reception */ op->rx_ifindex = ifindex; /* initialize uninitialized (kzalloc) structure */ hrtimer_init(&op->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT); op->timer.function = bcm_rx_timeout_handler; hrtimer_init(&op->thrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT); op->thrtimer.function = bcm_rx_thr_handler; /* add this bcm_op to the list of the rx_ops */ list_add(&op->list, &bo->rx_ops); /* call can_rx_register() */ do_rx_register = 1; } /* if ((op = bcm_find_op(&bo->rx_ops, msg_head->can_id, ifindex))) */ /* check flags */ if (op->flags & RX_RTR_FRAME) { struct canfd_frame *frame0 = op->frames; /* no timers in RTR-mode */ hrtimer_cancel(&op->thrtimer); hrtimer_cancel(&op->timer); /* * funny feature in RX(!)_SETUP only for RTR-mode: * copy can_id into frame BUT without RTR-flag to * prevent a full-load-loopback-test ... ;-] */ if ((op->flags & TX_CP_CAN_ID) || (frame0->can_id == op->can_id)) frame0->can_id = op->can_id & ~CAN_RTR_FLAG; } else { if (op->flags & SETTIMER) { /* set timer value */ op->ival1 = msg_head->ival1; op->ival2 = msg_head->ival2; op->kt_ival1 = bcm_timeval_to_ktime(msg_head->ival1); op->kt_ival2 = bcm_timeval_to_ktime(msg_head->ival2); /* disable an active timer due to zero value? 
*/ if (!op->kt_ival1) hrtimer_cancel(&op->timer); /* * In any case cancel the throttle timer, flush * potentially blocked msgs and reset throttle handling */ op->kt_lastmsg = 0; hrtimer_cancel(&op->thrtimer); bcm_rx_thr_flush(op); } if ((op->flags & STARTTIMER) && op->kt_ival1) hrtimer_start(&op->timer, op->kt_ival1, HRTIMER_MODE_REL_SOFT); } /* now we can register for can_ids, if we added a new bcm_op */ if (do_rx_register) { if (ifindex) { struct net_device *dev; dev = dev_get_by_index(sock_net(sk), ifindex); if (dev) { err = can_rx_register(sock_net(sk), dev, op->can_id, REGMASK(op->can_id), bcm_rx_handler, op, "bcm", sk); op->rx_reg_dev = dev; dev_put(dev); } } else err = can_rx_register(sock_net(sk), NULL, op->can_id, REGMASK(op->can_id), bcm_rx_handler, op, "bcm", sk); if (err) { /* this bcm rx op is broken -> remove it */ list_del(&op->list); bcm_remove_op(op); return err; } } return msg_head->nframes * op->cfsiz + MHSIZ; } /* * bcm_tx_send - send a single CAN frame to the CAN interface (for bcm_sendmsg) */ static int bcm_tx_send(struct msghdr *msg, int ifindex, struct sock *sk, int cfsiz) { struct sk_buff *skb; struct net_device *dev; int err; /* we need a real device to send frames */ if (!ifindex) return -ENODEV; skb = alloc_skb(cfsiz + sizeof(struct can_skb_priv), GFP_KERNEL); if (!skb) return -ENOMEM; can_skb_reserve(skb); err = memcpy_from_msg(skb_put(skb, cfsiz), msg, cfsiz); if (err < 0) { kfree_skb(skb); return err; } dev = dev_get_by_index(sock_net(sk), ifindex); if (!dev) { kfree_skb(skb); return -ENODEV; } can_skb_prv(skb)->ifindex = dev->ifindex; can_skb_prv(skb)->skbcnt = 0; skb->dev = dev; can_skb_set_owner(skb, sk); err = can_send(skb, 1); /* send with loopback */ dev_put(dev); if (err) return err; return cfsiz + MHSIZ; } /* * bcm_sendmsg - process BCM commands (opcodes) from the userspace */ static int bcm_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) { struct sock *sk = sock->sk; struct bcm_sock *bo = bcm_sk(sk); int ifindex = bo->ifindex; /* default ifindex for this bcm_op */ struct bcm_msg_head msg_head; int cfsiz; int ret; /* read bytes or error codes as return value */ if (!bo->bound) return -ENOTCONN; /* check for valid message length from userspace */ if (size < MHSIZ) return -EINVAL; /* read message head information */ ret = memcpy_from_msg((u8 *)&msg_head, msg, MHSIZ); if (ret < 0) return ret; cfsiz = CFSIZ(msg_head.flags); if ((size - MHSIZ) % cfsiz) return -EINVAL; /* check for alternative ifindex for this bcm_op */ if (!ifindex && msg->msg_name) { /* no bound device as default => check msg_name */ DECLARE_SOCKADDR(struct sockaddr_can *, addr, msg->msg_name); if (msg->msg_namelen < BCM_MIN_NAMELEN) return -EINVAL; if (addr->can_family != AF_CAN) return -EINVAL; /* ifindex from sendto() */ ifindex = addr->can_ifindex; if (ifindex) { struct net_device *dev; dev = dev_get_by_index(sock_net(sk), ifindex); if (!dev) return -ENODEV; if (dev->type != ARPHRD_CAN) { dev_put(dev); return -ENODEV; } dev_put(dev); } } lock_sock(sk); switch (msg_head.opcode) { case TX_SETUP: ret = bcm_tx_setup(&msg_head, msg, ifindex, sk); break; case RX_SETUP: ret = bcm_rx_setup(&msg_head, msg, ifindex, sk); break; case TX_DELETE: if (bcm_delete_tx_op(&bo->tx_ops, &msg_head, ifindex)) ret = MHSIZ; else ret = -EINVAL; break; case RX_DELETE: if (bcm_delete_rx_op(&bo->rx_ops, &msg_head, ifindex)) ret = MHSIZ; else ret = -EINVAL; break; case TX_READ: /* reuse msg_head for the reply to TX_READ */ msg_head.opcode = TX_STATUS; ret = bcm_read_op(&bo->tx_ops, &msg_head, 
ifindex); break; case RX_READ: /* reuse msg_head for the reply to RX_READ */ msg_head.opcode = RX_STATUS; ret = bcm_read_op(&bo->rx_ops, &msg_head, ifindex); break; case TX_SEND: /* we need exactly one CAN frame behind the msg head */ if ((msg_head.nframes != 1) || (size != cfsiz + MHSIZ)) ret = -EINVAL; else ret = bcm_tx_send(msg, ifindex, sk, cfsiz); break; default: ret = -EINVAL; break; } release_sock(sk); return ret; } /* * notification handler for netdevice status changes */ static void bcm_notify(struct bcm_sock *bo, unsigned long msg, struct net_device *dev) { struct sock *sk = &bo->sk; struct bcm_op *op; int notify_enodev = 0; if (!net_eq(dev_net(dev), sock_net(sk))) return; switch (msg) { case NETDEV_UNREGISTER: lock_sock(sk); /* remove device specific receive entries */ list_for_each_entry(op, &bo->rx_ops, list) if (op->rx_reg_dev == dev) bcm_rx_unreg(dev, op); /* remove device reference, if this is our bound device */ if (bo->bound && bo->ifindex == dev->ifindex) { #if IS_ENABLED(CONFIG_PROC_FS) if (sock_net(sk)->can.bcmproc_dir && bo->bcm_proc_read) { remove_proc_entry(bo->procname, sock_net(sk)->can.bcmproc_dir); bo->bcm_proc_read = NULL; } #endif bo->bound = 0; bo->ifindex = 0; notify_enodev = 1; } release_sock(sk); if (notify_enodev) { sk->sk_err = ENODEV; if (!sock_flag(sk, SOCK_DEAD)) sk_error_report(sk); } break; case NETDEV_DOWN: if (bo->bound && bo->ifindex == dev->ifindex) { sk->sk_err = ENETDOWN; if (!sock_flag(sk, SOCK_DEAD)) sk_error_report(sk); } } } static int bcm_notifier(struct notifier_block *nb, unsigned long msg, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); if (dev->type != ARPHRD_CAN) return NOTIFY_DONE; if (msg != NETDEV_UNREGISTER && msg != NETDEV_DOWN) return NOTIFY_DONE; if (unlikely(bcm_busy_notifier)) /* Check for reentrant bug. */ return NOTIFY_DONE; spin_lock(&bcm_notifier_lock); list_for_each_entry(bcm_busy_notifier, &bcm_notifier_list, notifier) { spin_unlock(&bcm_notifier_lock); bcm_notify(bcm_busy_notifier, msg, dev); spin_lock(&bcm_notifier_lock); } bcm_busy_notifier = NULL; spin_unlock(&bcm_notifier_lock); return NOTIFY_DONE; } /* * initial settings for all BCM sockets to be set at socket creation time */ static int bcm_init(struct sock *sk) { struct bcm_sock *bo = bcm_sk(sk); bo->bound = 0; bo->ifindex = 0; bo->dropped_usr_msgs = 0; bo->bcm_proc_read = NULL; INIT_LIST_HEAD(&bo->tx_ops); INIT_LIST_HEAD(&bo->rx_ops); /* set notifier */ spin_lock(&bcm_notifier_lock); list_add_tail(&bo->notifier, &bcm_notifier_list); spin_unlock(&bcm_notifier_lock); return 0; } /* * standard socket functions */ static int bcm_release(struct socket *sock) { struct sock *sk = sock->sk; struct net *net; struct bcm_sock *bo; struct bcm_op *op, *next; if (!sk) return 0; net = sock_net(sk); bo = bcm_sk(sk); /* remove bcm_ops, timer, rx_unregister(), etc. */ spin_lock(&bcm_notifier_lock); while (bcm_busy_notifier == bo) { spin_unlock(&bcm_notifier_lock); schedule_timeout_uninterruptible(1); spin_lock(&bcm_notifier_lock); } list_del(&bo->notifier); spin_unlock(&bcm_notifier_lock); lock_sock(sk); #if IS_ENABLED(CONFIG_PROC_FS) /* remove procfs entry */ if (net->can.bcmproc_dir && bo->bcm_proc_read) remove_proc_entry(bo->procname, net->can.bcmproc_dir); #endif /* CONFIG_PROC_FS */ list_for_each_entry_safe(op, next, &bo->tx_ops, list) bcm_remove_op(op); list_for_each_entry_safe(op, next, &bo->rx_ops, list) { /* * Don't care if we're bound or not (due to netdev problems) * can_rx_unregister() is always a save thing to do here. 
*/ if (op->ifindex) { /* * Only remove subscriptions that had not * been removed due to NETDEV_UNREGISTER * in bcm_notifier() */ if (op->rx_reg_dev) { struct net_device *dev; dev = dev_get_by_index(net, op->ifindex); if (dev) { bcm_rx_unreg(dev, op); dev_put(dev); } } } else can_rx_unregister(net, NULL, op->can_id, REGMASK(op->can_id), bcm_rx_handler, op); } synchronize_rcu(); list_for_each_entry_safe(op, next, &bo->rx_ops, list) bcm_remove_op(op); /* remove device reference */ if (bo->bound) { bo->bound = 0; bo->ifindex = 0; } sock_orphan(sk); sock->sk = NULL; release_sock(sk); sock_put(sk); return 0; } static int bcm_connect(struct socket *sock, struct sockaddr *uaddr, int len, int flags) { struct sockaddr_can *addr = (struct sockaddr_can *)uaddr; struct sock *sk = sock->sk; struct bcm_sock *bo = bcm_sk(sk); struct net *net = sock_net(sk); int ret = 0; if (len < BCM_MIN_NAMELEN) return -EINVAL; lock_sock(sk); if (bo->bound) { ret = -EISCONN; goto fail; } /* bind a device to this socket */ if (addr->can_ifindex) { struct net_device *dev; dev = dev_get_by_index(net, addr->can_ifindex); if (!dev) { ret = -ENODEV; goto fail; } if (dev->type != ARPHRD_CAN) { dev_put(dev); ret = -ENODEV; goto fail; } bo->ifindex = dev->ifindex; dev_put(dev); } else { /* no interface reference for ifindex = 0 ('any' CAN device) */ bo->ifindex = 0; } #if IS_ENABLED(CONFIG_PROC_FS) if (net->can.bcmproc_dir) { /* unique socket address as filename */ sprintf(bo->procname, "%lu", sock_i_ino(sk)); bo->bcm_proc_read = proc_create_net_single(bo->procname, 0644, net->can.bcmproc_dir, bcm_proc_show, sk); if (!bo->bcm_proc_read) { ret = -ENOMEM; goto fail; } } #endif /* CONFIG_PROC_FS */ bo->bound = 1; fail: release_sock(sk); return ret; } static int bcm_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int flags) { struct sock *sk = sock->sk; struct sk_buff *skb; int error = 0; int err; skb = skb_recv_datagram(sk, flags, &error); if (!skb) return error; if (skb->len < size) size = skb->len; err = memcpy_to_msg(msg, skb->data, size); if (err < 0) { skb_free_datagram(sk, skb); return err; } sock_recv_cmsgs(msg, sk, skb); if (msg->msg_name) { __sockaddr_check_size(BCM_MIN_NAMELEN); msg->msg_namelen = BCM_MIN_NAMELEN; memcpy(msg->msg_name, skb->cb, msg->msg_namelen); } /* assign the flags that have been recorded in bcm_send_to_user() */ msg->msg_flags |= *(bcm_flags(skb)); skb_free_datagram(sk, skb); return size; } static int bcm_sock_no_ioctlcmd(struct socket *sock, unsigned int cmd, unsigned long arg) { /* no ioctls for socket layer -> hand it down to NIC layer */ return -ENOIOCTLCMD; } static const struct proto_ops bcm_ops = { .family = PF_CAN, .release = bcm_release, .bind = sock_no_bind, .connect = bcm_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = sock_no_getname, .poll = datagram_poll, .ioctl = bcm_sock_no_ioctlcmd, .gettstamp = sock_gettstamp, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .sendmsg = bcm_sendmsg, .recvmsg = bcm_recvmsg, .mmap = sock_no_mmap, }; static struct proto bcm_proto __read_mostly = { .name = "CAN_BCM", .owner = THIS_MODULE, .obj_size = sizeof(struct bcm_sock), .init = bcm_init, }; static const struct can_proto bcm_can_proto = { .type = SOCK_DGRAM, .protocol = CAN_BCM, .ops = &bcm_ops, .prot = &bcm_proto, }; static int canbcm_pernet_init(struct net *net) { #if IS_ENABLED(CONFIG_PROC_FS) /* create /proc/net/can-bcm directory */ net->can.bcmproc_dir = proc_net_mkdir(net, "can-bcm", net->proc_net); #endif /* CONFIG_PROC_FS */ return 0; } 
static void canbcm_pernet_exit(struct net *net) { #if IS_ENABLED(CONFIG_PROC_FS) /* remove /proc/net/can-bcm directory */ if (net->can.bcmproc_dir) remove_proc_entry("can-bcm", net->proc_net); #endif /* CONFIG_PROC_FS */ } static struct pernet_operations canbcm_pernet_ops __read_mostly = { .init = canbcm_pernet_init, .exit = canbcm_pernet_exit, }; static struct notifier_block canbcm_notifier = { .notifier_call = bcm_notifier }; static int __init bcm_module_init(void) { int err; pr_info("can: broadcast manager protocol\n"); err = register_pernet_subsys(&canbcm_pernet_ops); if (err) return err; err = register_netdevice_notifier(&canbcm_notifier); if (err) goto register_notifier_failed; err = can_proto_register(&bcm_can_proto); if (err < 0) { printk(KERN_ERR "can: registration of bcm protocol failed\n"); goto register_proto_failed; } return 0; register_proto_failed: unregister_netdevice_notifier(&canbcm_notifier); register_notifier_failed: unregister_pernet_subsys(&canbcm_pernet_ops); return err; } static void __exit bcm_module_exit(void) { can_proto_unregister(&bcm_can_proto); unregister_netdevice_notifier(&canbcm_notifier); unregister_pernet_subsys(&canbcm_pernet_ops); } module_init(bcm_module_init); module_exit(bcm_module_exit); |
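The opcodes dispatched in bcm_sendmsg() above (TX_SETUP, RX_SETUP, TX_SEND, TX_DELETE, ...) are driven from userspace through a CAN_BCM datagram socket. The sketch below shows a minimal cyclic-transmission setup built only from the uapi definitions in <linux/can.h> and <linux/can/bcm.h>; it is an illustration, not part of bcm.c. The interface name "can0", the CAN ID 0x123 and the 100 ms period are arbitrary placeholders, and error handling is reduced to perror().

/*
 * Illustrative userspace sketch (not part of bcm.c): set up a cyclic
 * transmission via TX_SETUP. "can0", CAN ID 0x123 and the 100 ms period
 * are placeholders chosen for the example.
 */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <net/if.h>
#include <sys/socket.h>
#include <linux/can.h>
#include <linux/can/bcm.h>

int main(void)
{
	/* message layout expected by bcm_sendmsg(): bcm_msg_head + nframes CAN frames */
	struct {
		struct bcm_msg_head msg_head;
		struct can_frame frame[1];
	} msg;

	struct sockaddr_can addr = { .can_family = AF_CAN };
	int s = socket(PF_CAN, SOCK_DGRAM, CAN_BCM);

	if (s < 0) {
		perror("socket");
		return 1;
	}

	/* bind the BCM socket to one interface (handled by bcm_connect() above) */
	addr.can_ifindex = if_nametoindex("can0");
	if (connect(s, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
		perror("connect");
		return 1;
	}

	/* TX_SETUP: send frame[0] every 100 ms, forever (count == 0, only ival2 used) */
	memset(&msg, 0, sizeof(msg));
	msg.msg_head.opcode  = TX_SETUP;
	msg.msg_head.can_id  = 0x123;
	msg.msg_head.flags   = SETTIMER | STARTTIMER | TX_CP_CAN_ID;
	msg.msg_head.nframes = 1;
	msg.msg_head.ival2.tv_usec = 100000;

	msg.frame[0].can_dlc = 2;
	msg.frame[0].data[0] = 0xde;
	msg.frame[0].data[1] = 0xad;

	if (write(s, &msg, sizeof(msg)) < 0)
		perror("write");

	pause();	/* the kernel keeps cycling the frame until TX_DELETE or close() */
	return 0;
}

An RX_SETUP message is built the same way: the same bcm_msg_head is written with opcode RX_SETUP (optionally with RX_FILTER_ID or a multiplex mask in frame[0]), after which the socket delivers RX_CHANGED and RX_TIMEOUT notifications that bcm_recvmsg() above copies back to the caller.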
// SPDX-License-Identifier: GPL-2.0 /* * Kernel timekeeping code and accessor functions. Based on code from * timer.c, moved in commit 8524070b7982. */ #include <linux/timekeeper_internal.h> #include <linux/module.h> #include <linux/interrupt.h> #include <linux/percpu.h> #include <linux/init.h> #include <linux/mm.h> #include <linux/nmi.h> #include <linux/sched.h> #include <linux/sched/loadavg.h> #include <linux/sched/clock.h> #include <linux/syscore_ops.h> #include <linux/clocksource.h> #include <linux/jiffies.h> #include <linux/time.h> #include <linux/timex.h> #include <linux/tick.h> #include <linux/stop_machine.h> #include <linux/pvclock_gtod.h> #include <linux/compiler.h> #include <linux/audit.h> #include <linux/random.h> #include "tick-internal.h" #include "ntp_internal.h" #include "timekeeping_internal.h" #define TK_CLEAR_NTP (1 << 0) #define TK_CLOCK_WAS_SET (1 << 1) #define TK_UPDATE_ALL (TK_CLEAR_NTP | TK_CLOCK_WAS_SET) enum timekeeping_adv_mode { /* Update timekeeper when a tick has passed */ TK_ADV_TICK, /* Update timekeeper on a direct frequency change */ TK_ADV_FREQ }; /* * The most important data for readout fits into a single 64 byte * cache line. */ struct tk_data { seqcount_raw_spinlock_t seq; struct timekeeper timekeeper; struct timekeeper shadow_timekeeper; raw_spinlock_t lock; } ____cacheline_aligned; static struct tk_data tk_core; /* flag for if timekeeping is suspended */ int __read_mostly timekeeping_suspended; /** * struct tk_fast - NMI safe timekeeper * @seq: Sequence counter for protecting updates. The lowest bit * is the index for the tk_read_base array * @base: tk_read_base array. Access is indexed by the lowest bit of * @seq. * * See @update_fast_timekeeper() below. */ struct tk_fast { seqcount_latch_t seq; struct tk_read_base base[2]; }; /* Suspend-time cycles value for halted fast timekeeper. */ static u64 cycles_at_suspend; static u64 dummy_clock_read(struct clocksource *cs) { if (timekeeping_suspended) return cycles_at_suspend; return local_clock(); } static struct clocksource dummy_clock = { .read = dummy_clock_read, }; /* * Boot time initialization which allows local_clock() to be utilized * during early boot when clocksources are not available. local_clock() * returns nanoseconds already so no conversion is required, hence mult=1 * and shift=0.
When the first proper clocksource is installed then * the fast time keepers are updated with the correct values. */ #define FAST_TK_INIT \ { \ .clock = &dummy_clock, \ .mask = CLOCKSOURCE_MASK(64), \ .mult = 1, \ .shift = 0, \ } static struct tk_fast tk_fast_mono ____cacheline_aligned = { .seq = SEQCNT_LATCH_ZERO(tk_fast_mono.seq), .base[0] = FAST_TK_INIT, .base[1] = FAST_TK_INIT, }; static struct tk_fast tk_fast_raw ____cacheline_aligned = { .seq = SEQCNT_LATCH_ZERO(tk_fast_raw.seq), .base[0] = FAST_TK_INIT, .base[1] = FAST_TK_INIT, }; unsigned long timekeeper_lock_irqsave(void) { unsigned long flags; raw_spin_lock_irqsave(&tk_core.lock, flags); return flags; } void timekeeper_unlock_irqrestore(unsigned long flags) { raw_spin_unlock_irqrestore(&tk_core.lock, flags); } /* * Multigrain timestamps require tracking the latest fine-grained timestamp * that has been issued, and never returning a coarse-grained timestamp that is * earlier than that value. * * mg_floor represents the latest fine-grained time that has been handed out as * a file timestamp on the system. This is tracked as a monotonic ktime_t, and * converted to a realtime clock value on an as-needed basis. * * Maintaining mg_floor ensures the multigrain interfaces never issue a * timestamp earlier than one that has been previously issued. * * The exception to this rule is when there is a backward realtime clock jump. If * such an event occurs, a timestamp can appear to be earlier than a previous one. */ static __cacheline_aligned_in_smp atomic64_t mg_floor; static inline void tk_normalize_xtime(struct timekeeper *tk) { while (tk->tkr_mono.xtime_nsec >= ((u64)NSEC_PER_SEC << tk->tkr_mono.shift)) { tk->tkr_mono.xtime_nsec -= (u64)NSEC_PER_SEC << tk->tkr_mono.shift; tk->xtime_sec++; } while (tk->tkr_raw.xtime_nsec >= ((u64)NSEC_PER_SEC << tk->tkr_raw.shift)) { tk->tkr_raw.xtime_nsec -= (u64)NSEC_PER_SEC << tk->tkr_raw.shift; tk->raw_sec++; } } static inline struct timespec64 tk_xtime(const struct timekeeper *tk) { struct timespec64 ts; ts.tv_sec = tk->xtime_sec; ts.tv_nsec = (long)(tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift); return ts; } static void tk_set_xtime(struct timekeeper *tk, const struct timespec64 *ts) { tk->xtime_sec = ts->tv_sec; tk->tkr_mono.xtime_nsec = (u64)ts->tv_nsec << tk->tkr_mono.shift; } static void tk_xtime_add(struct timekeeper *tk, const struct timespec64 *ts) { tk->xtime_sec += ts->tv_sec; tk->tkr_mono.xtime_nsec += (u64)ts->tv_nsec << tk->tkr_mono.shift; tk_normalize_xtime(tk); } static void tk_set_wall_to_mono(struct timekeeper *tk, struct timespec64 wtm) { struct timespec64 tmp; /* * Verify consistency of: offset_real = -wall_to_monotonic * before modifying anything */ set_normalized_timespec64(&tmp, -tk->wall_to_monotonic.tv_sec, -tk->wall_to_monotonic.tv_nsec); WARN_ON_ONCE(tk->offs_real != timespec64_to_ktime(tmp)); tk->wall_to_monotonic = wtm; set_normalized_timespec64(&tmp, -wtm.tv_sec, -wtm.tv_nsec); /* Paired with READ_ONCE() in ktime_mono_to_any() */ WRITE_ONCE(tk->offs_real, timespec64_to_ktime(tmp)); WRITE_ONCE(tk->offs_tai, ktime_add(tk->offs_real, ktime_set(tk->tai_offset, 0))); } static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta) { /* Paired with READ_ONCE() in ktime_mono_to_any() */ WRITE_ONCE(tk->offs_boot, ktime_add(tk->offs_boot, delta)); /* * Timespec representation for VDSO update to avoid 64bit division * on every update. 
*/ tk->monotonic_to_boot = ktime_to_timespec64(tk->offs_boot); } /* * tk_clock_read - atomic clocksource read() helper * * This helper is necessary to use in the read paths because, while the * seqcount ensures we don't return a bad value while structures are updated, * it doesn't protect from potential crashes. There is the possibility that * the tkr's clocksource may change between the read reference, and the * clock reference passed to the read function. This can cause crashes if * the wrong clocksource is passed to the wrong read function. * This isn't necessary to use when holding the tk_core.lock or doing * a read of the fast-timekeeper tkrs (which is protected by its own locking * and update logic). */ static inline u64 tk_clock_read(const struct tk_read_base *tkr) { struct clocksource *clock = READ_ONCE(tkr->clock); return clock->read(clock); } /** * tk_setup_internals - Set up internals to use clocksource clock. * * @tk: The target timekeeper to setup. * @clock: Pointer to clocksource. * * Calculates a fixed cycle/nsec interval for a given clocksource/adjustment * pair and interval request. * * Unless you're the timekeeping code, you should not be using this! */ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock) { u64 interval; u64 tmp, ntpinterval; struct clocksource *old_clock; ++tk->cs_was_changed_seq; old_clock = tk->tkr_mono.clock; tk->tkr_mono.clock = clock; tk->tkr_mono.mask = clock->mask; tk->tkr_mono.cycle_last = tk_clock_read(&tk->tkr_mono); tk->tkr_raw.clock = clock; tk->tkr_raw.mask = clock->mask; tk->tkr_raw.cycle_last = tk->tkr_mono.cycle_last; /* Do the ns -> cycle conversion first, using original mult */ tmp = NTP_INTERVAL_LENGTH; tmp <<= clock->shift; ntpinterval = tmp; tmp += clock->mult/2; do_div(tmp, clock->mult); if (tmp == 0) tmp = 1; interval = (u64) tmp; tk->cycle_interval = interval; /* Go back from cycles -> shifted ns */ tk->xtime_interval = interval * clock->mult; tk->xtime_remainder = ntpinterval - tk->xtime_interval; tk->raw_interval = interval * clock->mult; /* if changing clocks, convert xtime_nsec shift units */ if (old_clock) { int shift_change = clock->shift - old_clock->shift; if (shift_change < 0) { tk->tkr_mono.xtime_nsec >>= -shift_change; tk->tkr_raw.xtime_nsec >>= -shift_change; } else { tk->tkr_mono.xtime_nsec <<= shift_change; tk->tkr_raw.xtime_nsec <<= shift_change; } } tk->tkr_mono.shift = clock->shift; tk->tkr_raw.shift = clock->shift; tk->ntp_error = 0; tk->ntp_error_shift = NTP_SCALE_SHIFT - clock->shift; tk->ntp_tick = ntpinterval << tk->ntp_error_shift; /* * The timekeeper keeps its own mult values for the currently * active clocksource. These value will be adjusted via NTP * to counteract clock drifting. */ tk->tkr_mono.mult = clock->mult; tk->tkr_raw.mult = clock->mult; tk->ntp_err_mult = 0; tk->skip_second_overflow = 0; } /* Timekeeper helper functions. */ static noinline u64 delta_to_ns_safe(const struct tk_read_base *tkr, u64 delta) { return mul_u64_u32_add_u64_shr(delta, tkr->mult, tkr->xtime_nsec, tkr->shift); } static inline u64 timekeeping_cycles_to_ns(const struct tk_read_base *tkr, u64 cycles) { /* Calculate the delta since the last update_wall_time() */ u64 mask = tkr->mask, delta = (cycles - tkr->cycle_last) & mask; /* * This detects both negative motion and the case where the delta * overflows the multiplication with tkr->mult. 
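 * In the common case (delta <= max_cycles) the readout stays a single
 * multiplication plus shift; the checks below only cost an unlikely branch.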
*/ if (unlikely(delta > tkr->clock->max_cycles)) { /* * Handle clocksource inconsistency between CPUs to prevent * time from going backwards by checking for the MSB of the * mask being set in the delta. */ if (delta & ~(mask >> 1)) return tkr->xtime_nsec >> tkr->shift; return delta_to_ns_safe(tkr, delta); } return ((delta * tkr->mult) + tkr->xtime_nsec) >> tkr->shift; } static __always_inline u64 timekeeping_get_ns(const struct tk_read_base *tkr) { return timekeeping_cycles_to_ns(tkr, tk_clock_read(tkr)); } /** * update_fast_timekeeper - Update the fast and NMI safe monotonic timekeeper. * @tkr: Timekeeping readout base from which we take the update * @tkf: Pointer to NMI safe timekeeper * * We want to use this from any context including NMI and tracing / * instrumenting the timekeeping code itself. * * Employ the latch technique; see @write_seqcount_latch. * * So if a NMI hits the update of base[0] then it will use base[1] * which is still consistent. In the worst case this can result is a * slightly wrong timestamp (a few nanoseconds). See * @ktime_get_mono_fast_ns. */ static void update_fast_timekeeper(const struct tk_read_base *tkr, struct tk_fast *tkf) { struct tk_read_base *base = tkf->base; /* Force readers off to base[1] */ write_seqcount_latch_begin(&tkf->seq); /* Update base[0] */ memcpy(base, tkr, sizeof(*base)); /* Force readers back to base[0] */ write_seqcount_latch(&tkf->seq); /* Update base[1] */ memcpy(base + 1, base, sizeof(*base)); write_seqcount_latch_end(&tkf->seq); } static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf) { struct tk_read_base *tkr; unsigned int seq; u64 now; do { seq = read_seqcount_latch(&tkf->seq); tkr = tkf->base + (seq & 0x01); now = ktime_to_ns(tkr->base); now += timekeeping_get_ns(tkr); } while (read_seqcount_latch_retry(&tkf->seq, seq)); return now; } /** * ktime_get_mono_fast_ns - Fast NMI safe access to clock monotonic * * This timestamp is not guaranteed to be monotonic across an update. * The timestamp is calculated by: * * now = base_mono + clock_delta * slope * * So if the update lowers the slope, readers who are forced to the * not yet updated second array are still using the old steeper slope. * * tmono * ^ * | o n * | o n * | u * | o * |o * |12345678---> reader order * * o = old slope * u = update * n = new slope * * So reader 6 will observe time going backwards versus reader 5. * * While other CPUs are likely to be able to observe that, the only way * for a CPU local observation is when an NMI hits in the middle of * the update. Timestamps taken from that NMI context might be ahead * of the following timestamps. Callers need to be aware of that and * deal with it. */ u64 notrace ktime_get_mono_fast_ns(void) { return __ktime_get_fast_ns(&tk_fast_mono); } EXPORT_SYMBOL_GPL(ktime_get_mono_fast_ns); /** * ktime_get_raw_fast_ns - Fast NMI safe access to clock monotonic raw * * Contrary to ktime_get_mono_fast_ns() this is always correct because the * conversion factor is not affected by NTP/PTP correction. */ u64 notrace ktime_get_raw_fast_ns(void) { return __ktime_get_fast_ns(&tk_fast_raw); } EXPORT_SYMBOL_GPL(ktime_get_raw_fast_ns); /** * ktime_get_boot_fast_ns - NMI safe and fast access to boot clock. * * To keep it NMI safe since we're accessing from tracing, we're not using a * separate timekeeper with updates to monotonic clock and boot offset * protected with seqcounts. 
This has the following minor side effects: * * (1) It's possible that a timestamp is taken after the boot offset is updated * but before the timekeeper is updated. If this happens, the new boot offset * is added to the old timekeeping, making the clock appear to update slightly * earlier: * CPU 0 CPU 1 * timekeeping_inject_sleeptime64() * __timekeeping_inject_sleeptime(tk, delta); * timestamp(); * timekeeping_update_from_shadow(tkd, TK_CLEAR_NTP...); * * (2) On 32-bit systems, the 64-bit boot offset (tk->offs_boot) may be * partially updated. Since the tk->offs_boot update is a rare event, this * should be a rare occurrence which postprocessing should be able to handle. * * The caveats vs. timestamp ordering as documented for ktime_get_mono_fast_ns() * apply as well. */ u64 notrace ktime_get_boot_fast_ns(void) { struct timekeeper *tk = &tk_core.timekeeper; return (ktime_get_mono_fast_ns() + ktime_to_ns(data_race(tk->offs_boot))); } EXPORT_SYMBOL_GPL(ktime_get_boot_fast_ns); /** * ktime_get_tai_fast_ns - NMI safe and fast access to tai clock. * * The same limitations as described for ktime_get_boot_fast_ns() apply. The * mono time and the TAI offset are not read atomically which may yield wrong * readouts. However, an update of the TAI offset is a rare event, e.g. caused * by settime or adjtimex with an offset. The user of this function has to deal * with the possibility of wrong timestamps in post processing. */ u64 notrace ktime_get_tai_fast_ns(void) { struct timekeeper *tk = &tk_core.timekeeper; return (ktime_get_mono_fast_ns() + ktime_to_ns(data_race(tk->offs_tai))); } EXPORT_SYMBOL_GPL(ktime_get_tai_fast_ns); /** * ktime_get_real_fast_ns - NMI safe and fast access to clock realtime. * * See ktime_get_mono_fast_ns() for documentation of the time stamp ordering. */ u64 ktime_get_real_fast_ns(void) { struct tk_fast *tkf = &tk_fast_mono; struct tk_read_base *tkr; u64 baser, delta; unsigned int seq; do { seq = raw_read_seqcount_latch(&tkf->seq); tkr = tkf->base + (seq & 0x01); baser = ktime_to_ns(tkr->base_real); delta = timekeeping_get_ns(tkr); } while (raw_read_seqcount_latch_retry(&tkf->seq, seq)); return baser + delta; } EXPORT_SYMBOL_GPL(ktime_get_real_fast_ns); /** * halt_fast_timekeeper - Prevent fast timekeeper from accessing clocksource. * @tk: Timekeeper to snapshot. * * It generally is unsafe to access the clocksource after timekeeping has been * suspended, so take a snapshot of the readout base of @tk and use it as the * fast timekeeper's readout base while suspended. It will return the same * number of cycles every time until timekeeping is resumed at which time the * proper readout base for the fast timekeeper will be restored automatically.
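 * Note that base_real is snapshotted as well, so ktime_get_real_fast_ns()
 * keeps resolving against the frozen readout base while timekeeping is
 * suspended.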
*/ static void halt_fast_timekeeper(const struct timekeeper *tk) { static struct tk_read_base tkr_dummy; const struct tk_read_base *tkr = &tk->tkr_mono; memcpy(&tkr_dummy, tkr, sizeof(tkr_dummy)); cycles_at_suspend = tk_clock_read(tkr); tkr_dummy.clock = &dummy_clock; tkr_dummy.base_real = tkr->base + tk->offs_real; update_fast_timekeeper(&tkr_dummy, &tk_fast_mono); tkr = &tk->tkr_raw; memcpy(&tkr_dummy, tkr, sizeof(tkr_dummy)); tkr_dummy.clock = &dummy_clock; update_fast_timekeeper(&tkr_dummy, &tk_fast_raw); } static RAW_NOTIFIER_HEAD(pvclock_gtod_chain); static void update_pvclock_gtod(struct timekeeper *tk, bool was_set) { raw_notifier_call_chain(&pvclock_gtod_chain, was_set, tk); } /** * pvclock_gtod_register_notifier - register a pvclock timedata update listener * @nb: Pointer to the notifier block to register */ int pvclock_gtod_register_notifier(struct notifier_block *nb) { struct timekeeper *tk = &tk_core.timekeeper; int ret; guard(raw_spinlock_irqsave)(&tk_core.lock); ret = raw_notifier_chain_register(&pvclock_gtod_chain, nb); update_pvclock_gtod(tk, true); return ret; } EXPORT_SYMBOL_GPL(pvclock_gtod_register_notifier); /** * pvclock_gtod_unregister_notifier - unregister a pvclock * timedata update listener * @nb: Pointer to the notifier block to unregister */ int pvclock_gtod_unregister_notifier(struct notifier_block *nb) { guard(raw_spinlock_irqsave)(&tk_core.lock); return raw_notifier_chain_unregister(&pvclock_gtod_chain, nb); } EXPORT_SYMBOL_GPL(pvclock_gtod_unregister_notifier); /* * tk_update_leap_state - helper to update the next_leap_ktime */ static inline void tk_update_leap_state(struct timekeeper *tk) { tk->next_leap_ktime = ntp_get_next_leap(); if (tk->next_leap_ktime != KTIME_MAX) /* Convert to monotonic time */ tk->next_leap_ktime = ktime_sub(tk->next_leap_ktime, tk->offs_real); } /* * Leap state update for both shadow and the real timekeeper * Separate to spare a full memcpy() of the timekeeper. */ static void tk_update_leap_state_all(struct tk_data *tkd) { write_seqcount_begin(&tkd->seq); tk_update_leap_state(&tkd->shadow_timekeeper); tkd->timekeeper.next_leap_ktime = tkd->shadow_timekeeper.next_leap_ktime; write_seqcount_end(&tkd->seq); } /* * Update the ktime_t based scalar nsec members of the timekeeper */ static inline void tk_update_ktime_data(struct timekeeper *tk) { u64 seconds; u32 nsec; /* * The xtime based monotonic readout is: * nsec = (xtime_sec + wtm_sec) * 1e9 + wtm_nsec + now(); * The ktime based monotonic readout is: * nsec = base_mono + now(); * ==> base_mono = (xtime_sec + wtm_sec) * 1e9 + wtm_nsec */ seconds = (u64)(tk->xtime_sec + tk->wall_to_monotonic.tv_sec); nsec = (u32) tk->wall_to_monotonic.tv_nsec; tk->tkr_mono.base = ns_to_ktime(seconds * NSEC_PER_SEC + nsec); /* * The sum of the nanoseconds portions of xtime and * wall_to_monotonic can be greater/equal one second. Take * this into account before updating tk->ktime_sec. */ nsec += (u32)(tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift); if (nsec >= NSEC_PER_SEC) seconds++; tk->ktime_sec = seconds; /* Update the monotonic raw base */ tk->tkr_raw.base = ns_to_ktime(tk->raw_sec * NSEC_PER_SEC); } /* * Restore the shadow timekeeper from the real timekeeper. 
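 * This is used to throw away half-done changes to the shadow timekeeper when
 * an update is aborted, e.g. on the error paths of do_settimeofday64() and
 * timekeeping_inject_offset().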
*/ static void timekeeping_restore_shadow(struct tk_data *tkd) { lockdep_assert_held(&tkd->lock); memcpy(&tkd->shadow_timekeeper, &tkd->timekeeper, sizeof(tkd->timekeeper)); } static void timekeeping_update_from_shadow(struct tk_data *tkd, unsigned int action) { struct timekeeper *tk = &tk_core.shadow_timekeeper; lockdep_assert_held(&tkd->lock); /* * Block out readers before running the updates below because that * updates VDSO and other time related infrastructure. Not blocking * the readers might let a reader see time going backwards when * reading from the VDSO after the VDSO update and then reading in * the kernel from the timekeeper before that got updated. */ write_seqcount_begin(&tkd->seq); if (action & TK_CLEAR_NTP) { tk->ntp_error = 0; ntp_clear(); } tk_update_leap_state(tk); tk_update_ktime_data(tk); update_vsyscall(tk); update_pvclock_gtod(tk, action & TK_CLOCK_WAS_SET); tk->tkr_mono.base_real = tk->tkr_mono.base + tk->offs_real; update_fast_timekeeper(&tk->tkr_mono, &tk_fast_mono); update_fast_timekeeper(&tk->tkr_raw, &tk_fast_raw); if (action & TK_CLOCK_WAS_SET) tk->clock_was_set_seq++; /* * Update the real timekeeper. * * We could avoid this memcpy() by switching pointers, but that has * the downside that the reader side no longer benefits from * the cacheline optimized data layout of the timekeeper and requires * another indirection. */ memcpy(&tkd->timekeeper, tk, sizeof(*tk)); write_seqcount_end(&tkd->seq); } /** * timekeeping_forward_now - update clock to the current time * @tk: Pointer to the timekeeper to update * * Forward the current clock to update its state since the last call to * update_wall_time(). This is useful before significant clock changes, * as it avoids having to deal with this time offset explicitly. */ static void timekeeping_forward_now(struct timekeeper *tk) { u64 cycle_now, delta; cycle_now = tk_clock_read(&tk->tkr_mono); delta = clocksource_delta(cycle_now, tk->tkr_mono.cycle_last, tk->tkr_mono.mask, tk->tkr_mono.clock->max_raw_delta); tk->tkr_mono.cycle_last = cycle_now; tk->tkr_raw.cycle_last = cycle_now; while (delta > 0) { u64 max = tk->tkr_mono.clock->max_cycles; u64 incr = delta < max ? delta : max; tk->tkr_mono.xtime_nsec += incr * tk->tkr_mono.mult; tk->tkr_raw.xtime_nsec += incr * tk->tkr_raw.mult; tk_normalize_xtime(tk); delta -= incr; } } /** * ktime_get_real_ts64 - Returns the time of day in a timespec64. * @ts: pointer to the timespec to be set * * Returns the time of day in a timespec64 (WARN if suspended).
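 * A typical caller just provides the storage, e.g.:
 *
 *	struct timespec64 ts;
 *
 *	ktime_get_real_ts64(&ts);
 *
 * The nanosecond value sampled under the seqcount may exceed NSEC_PER_SEC;
 * timespec64_add_ns() normalizes the result.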
*/ void ktime_get_real_ts64(struct timespec64 *ts) { struct timekeeper *tk = &tk_core.timekeeper; unsigned int seq; u64 nsecs; WARN_ON(timekeeping_suspended); do { seq = read_seqcount_begin(&tk_core.seq); ts->tv_sec = tk->xtime_sec; nsecs = timekeeping_get_ns(&tk->tkr_mono); } while (read_seqcount_retry(&tk_core.seq, seq)); ts->tv_nsec = 0; timespec64_add_ns(ts, nsecs); } EXPORT_SYMBOL(ktime_get_real_ts64); ktime_t ktime_get(void) { struct timekeeper *tk = &tk_core.timekeeper; unsigned int seq; ktime_t base; u64 nsecs; WARN_ON(timekeeping_suspended); do { seq = read_seqcount_begin(&tk_core.seq); base = tk->tkr_mono.base; nsecs = timekeeping_get_ns(&tk->tkr_mono); } while (read_seqcount_retry(&tk_core.seq, seq)); return ktime_add_ns(base, nsecs); } EXPORT_SYMBOL_GPL(ktime_get); u32 ktime_get_resolution_ns(void) { struct timekeeper *tk = &tk_core.timekeeper; unsigned int seq; u32 nsecs; WARN_ON(timekeeping_suspended); do { seq = read_seqcount_begin(&tk_core.seq); nsecs = tk->tkr_mono.mult >> tk->tkr_mono.shift; } while (read_seqcount_retry(&tk_core.seq, seq)); return nsecs; } EXPORT_SYMBOL_GPL(ktime_get_resolution_ns); static ktime_t *offsets[TK_OFFS_MAX] = { [TK_OFFS_REAL] = &tk_core.timekeeper.offs_real, [TK_OFFS_BOOT] = &tk_core.timekeeper.offs_boot, [TK_OFFS_TAI] = &tk_core.timekeeper.offs_tai, }; ktime_t ktime_get_with_offset(enum tk_offsets offs) { struct timekeeper *tk = &tk_core.timekeeper; unsigned int seq; ktime_t base, *offset = offsets[offs]; u64 nsecs; WARN_ON(timekeeping_suspended); do { seq = read_seqcount_begin(&tk_core.seq); base = ktime_add(tk->tkr_mono.base, *offset); nsecs = timekeeping_get_ns(&tk->tkr_mono); } while (read_seqcount_retry(&tk_core.seq, seq)); return ktime_add_ns(base, nsecs); } EXPORT_SYMBOL_GPL(ktime_get_with_offset); ktime_t ktime_get_coarse_with_offset(enum tk_offsets offs) { struct timekeeper *tk = &tk_core.timekeeper; unsigned int seq; ktime_t base, *offset = offsets[offs]; u64 nsecs; WARN_ON(timekeeping_suspended); do { seq = read_seqcount_begin(&tk_core.seq); base = ktime_add(tk->tkr_mono.base, *offset); nsecs = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift; } while (read_seqcount_retry(&tk_core.seq, seq)); return ktime_add_ns(base, nsecs); } EXPORT_SYMBOL_GPL(ktime_get_coarse_with_offset); /** * ktime_mono_to_any() - convert monotonic time to any other time * @tmono: time to convert. * @offs: which offset to use */ ktime_t ktime_mono_to_any(ktime_t tmono, enum tk_offsets offs) { ktime_t *offset = offsets[offs]; unsigned int seq; ktime_t tconv; if (IS_ENABLED(CONFIG_64BIT)) { /* * Paired with WRITE_ONCE()s in tk_set_wall_to_mono() and * tk_update_sleep_time(). 
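 * On 64-bit kernels the ktime_t offset is a single machine word, so the
 * READ_ONCE()/WRITE_ONCE() pairing is sufficient to prevent load tearing;
 * the seqcount retry loop below is only required on 32-bit kernels.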
*/ return ktime_add(tmono, READ_ONCE(*offset)); } do { seq = read_seqcount_begin(&tk_core.seq); tconv = ktime_add(tmono, *offset); } while (read_seqcount_retry(&tk_core.seq, seq)); return tconv; } EXPORT_SYMBOL_GPL(ktime_mono_to_any); /** * ktime_get_raw - Returns the raw monotonic time in ktime_t format */ ktime_t ktime_get_raw(void) { struct timekeeper *tk = &tk_core.timekeeper; unsigned int seq; ktime_t base; u64 nsecs; do { seq = read_seqcount_begin(&tk_core.seq); base = tk->tkr_raw.base; nsecs = timekeeping_get_ns(&tk->tkr_raw); } while (read_seqcount_retry(&tk_core.seq, seq)); return ktime_add_ns(base, nsecs); } EXPORT_SYMBOL_GPL(ktime_get_raw); /** * ktime_get_ts64 - get the monotonic clock in timespec64 format * @ts: pointer to timespec variable * * The function calculates the monotonic clock from the realtime * clock and the wall_to_monotonic offset and stores the result * in normalized timespec64 format in the variable pointed to by @ts. */ void ktime_get_ts64(struct timespec64 *ts) { struct timekeeper *tk = &tk_core.timekeeper; struct timespec64 tomono; unsigned int seq; u64 nsec; WARN_ON(timekeeping_suspended); do { seq = read_seqcount_begin(&tk_core.seq); ts->tv_sec = tk->xtime_sec; nsec = timekeeping_get_ns(&tk->tkr_mono); tomono = tk->wall_to_monotonic; } while (read_seqcount_retry(&tk_core.seq, seq)); ts->tv_sec += tomono.tv_sec; ts->tv_nsec = 0; timespec64_add_ns(ts, nsec + tomono.tv_nsec); } EXPORT_SYMBOL_GPL(ktime_get_ts64); /** * ktime_get_seconds - Get the seconds portion of CLOCK_MONOTONIC * * Returns the seconds portion of CLOCK_MONOTONIC with a single non * serialized read. tk->ktime_sec is of type 'unsigned long' so this * works on both 32 and 64 bit systems. On 32 bit systems the readout * covers ~136 years of uptime which should be enough to prevent * premature wrap arounds. */ time64_t ktime_get_seconds(void) { struct timekeeper *tk = &tk_core.timekeeper; WARN_ON(timekeeping_suspended); return tk->ktime_sec; } EXPORT_SYMBOL_GPL(ktime_get_seconds); /** * ktime_get_real_seconds - Get the seconds portion of CLOCK_REALTIME * * Returns the wall clock seconds since 1970. * * For 64bit systems the fast access to tk->xtime_sec is preserved. On * 32bit systems the access must be protected with the sequence * counter to provide "atomic" access to the 64bit tk->xtime_sec * value. */ time64_t ktime_get_real_seconds(void) { struct timekeeper *tk = &tk_core.timekeeper; time64_t seconds; unsigned int seq; if (IS_ENABLED(CONFIG_64BIT)) return tk->xtime_sec; do { seq = read_seqcount_begin(&tk_core.seq); seconds = tk->xtime_sec; } while (read_seqcount_retry(&tk_core.seq, seq)); return seconds; } EXPORT_SYMBOL_GPL(ktime_get_real_seconds); /** * __ktime_get_real_seconds - The same as ktime_get_real_seconds * but without the sequence counter protect. This internal function * is called just when timekeeping lock is already held. 
*/ noinstr time64_t __ktime_get_real_seconds(void) { struct timekeeper *tk = &tk_core.timekeeper; return tk->xtime_sec; } /** * ktime_get_snapshot - snapshots the realtime/monotonic raw clocks with counter * @systime_snapshot: pointer to struct receiving the system time snapshot */ void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot) { struct timekeeper *tk = &tk_core.timekeeper; unsigned int seq; ktime_t base_raw; ktime_t base_real; ktime_t base_boot; u64 nsec_raw; u64 nsec_real; u64 now; WARN_ON_ONCE(timekeeping_suspended); do { seq = read_seqcount_begin(&tk_core.seq); now = tk_clock_read(&tk->tkr_mono); systime_snapshot->cs_id = tk->tkr_mono.clock->id; systime_snapshot->cs_was_changed_seq = tk->cs_was_changed_seq; systime_snapshot->clock_was_set_seq = tk->clock_was_set_seq; base_real = ktime_add(tk->tkr_mono.base, tk_core.timekeeper.offs_real); base_boot = ktime_add(tk->tkr_mono.base, tk_core.timekeeper.offs_boot); base_raw = tk->tkr_raw.base; nsec_real = timekeeping_cycles_to_ns(&tk->tkr_mono, now); nsec_raw = timekeeping_cycles_to_ns(&tk->tkr_raw, now); } while (read_seqcount_retry(&tk_core.seq, seq)); systime_snapshot->cycles = now; systime_snapshot->real = ktime_add_ns(base_real, nsec_real); systime_snapshot->boot = ktime_add_ns(base_boot, nsec_real); systime_snapshot->raw = ktime_add_ns(base_raw, nsec_raw); } EXPORT_SYMBOL_GPL(ktime_get_snapshot); /* Scale base by mult/div checking for overflow */ static int scale64_check_overflow(u64 mult, u64 div, u64 *base) { u64 tmp, rem; tmp = div64_u64_rem(*base, div, &rem); if (((int)sizeof(u64)*8 - fls64(mult) < fls64(tmp)) || ((int)sizeof(u64)*8 - fls64(mult) < fls64(rem))) return -EOVERFLOW; tmp *= mult; rem = div64_u64(rem * mult, div); *base = tmp + rem; return 0; } /** * adjust_historical_crosststamp - adjust crosstimestamp previous to current interval * @history: Snapshot representing start of history * @partial_history_cycles: Cycle offset into history (fractional part) * @total_history_cycles: Total history length in cycles * @discontinuity: True indicates clock was set on history period * @ts: Cross timestamp that should be adjusted using * partial/total ratio * * Helper function used by get_device_system_crosststamp() to correct the * crosstimestamp corresponding to the start of the current interval to the * system counter value (timestamp point) provided by the driver. The * total_history_* quantities are the total history starting at the provided * reference point and ending at the start of the current interval. The cycle * count between the driver timestamp point and the start of the current * interval is partial_history_cycles. */ static int adjust_historical_crosststamp(struct system_time_snapshot *history, u64 partial_history_cycles, u64 total_history_cycles, bool discontinuity, struct system_device_crosststamp *ts) { struct timekeeper *tk = &tk_core.timekeeper; u64 corr_raw, corr_real; bool interp_forward; int ret; if (total_history_cycles == 0 || partial_history_cycles == 0) return 0; /* Interpolate shortest distance from beginning or end of history */ interp_forward = partial_history_cycles > total_history_cycles / 2; partial_history_cycles = interp_forward ? 
total_history_cycles - partial_history_cycles : partial_history_cycles; /* * Scale the monotonic raw time delta by: * partial_history_cycles / total_history_cycles */ corr_raw = (u64)ktime_to_ns( ktime_sub(ts->sys_monoraw, history->raw)); ret = scale64_check_overflow(partial_history_cycles, total_history_cycles, &corr_raw); if (ret) return ret; /* * If there is a discontinuity in the history, scale monotonic raw * correction by: * mult(real)/mult(raw) yielding the realtime correction * Otherwise, calculate the realtime correction similar to monotonic * raw calculation */ if (discontinuity) { corr_real = mul_u64_u32_div (corr_raw, tk->tkr_mono.mult, tk->tkr_raw.mult); } else { corr_real = (u64)ktime_to_ns( ktime_sub(ts->sys_realtime, history->real)); ret = scale64_check_overflow(partial_history_cycles, total_history_cycles, &corr_real); if (ret) return ret; } /* Fixup monotonic raw and real time time values */ if (interp_forward) { ts->sys_monoraw = ktime_add_ns(history->raw, corr_raw); ts->sys_realtime = ktime_add_ns(history->real, corr_real); } else { ts->sys_monoraw = ktime_sub_ns(ts->sys_monoraw, corr_raw); ts->sys_realtime = ktime_sub_ns(ts->sys_realtime, corr_real); } return 0; } /* * timestamp_in_interval - true if ts is chronologically in [start, end] * * True if ts occurs chronologically at or after start, and before or at end. */ static bool timestamp_in_interval(u64 start, u64 end, u64 ts) { if (ts >= start && ts <= end) return true; if (start > end && (ts >= start || ts <= end)) return true; return false; } static bool convert_clock(u64 *val, u32 numerator, u32 denominator) { u64 rem, res; if (!numerator || !denominator) return false; res = div64_u64_rem(*val, denominator, &rem) * numerator; *val = res + div_u64(rem * numerator, denominator); return true; } static bool convert_base_to_cs(struct system_counterval_t *scv) { struct clocksource *cs = tk_core.timekeeper.tkr_mono.clock; struct clocksource_base *base; u32 num, den; /* The timestamp was taken from the time keeper clock source */ if (cs->id == scv->cs_id) return true; /* * Check whether cs_id matches the base clock. Prevent the compiler from * re-evaluating @base as the clocksource might change concurrently. */ base = READ_ONCE(cs->base); if (!base || base->id != scv->cs_id) return false; num = scv->use_nsecs ? cs->freq_khz : base->numerator; den = scv->use_nsecs ? USEC_PER_SEC : base->denominator; if (!convert_clock(&scv->cycles, num, den)) return false; scv->cycles += base->offset; return true; } static bool convert_cs_to_base(u64 *cycles, enum clocksource_ids base_id) { struct clocksource *cs = tk_core.timekeeper.tkr_mono.clock; struct clocksource_base *base; /* * Check whether base_id matches the base clock. Prevent the compiler from * re-evaluating @base as the clocksource might change concurrently. 
*/ base = READ_ONCE(cs->base); if (!base || base->id != base_id) return false; *cycles -= base->offset; if (!convert_clock(cycles, base->denominator, base->numerator)) return false; return true; } static bool convert_ns_to_cs(u64 *delta) { struct tk_read_base *tkr = &tk_core.timekeeper.tkr_mono; if (BITS_TO_BYTES(fls64(*delta) + tkr->shift) >= sizeof(*delta)) return false; *delta = div_u64((*delta << tkr->shift) - tkr->xtime_nsec, tkr->mult); return true; } /** * ktime_real_to_base_clock() - Convert CLOCK_REALTIME timestamp to a base clock timestamp * @treal: CLOCK_REALTIME timestamp to convert * @base_id: base clocksource id * @cycles: pointer to store the converted base clock timestamp * * Converts a supplied, future realtime clock value to the corresponding base clock value. * * Return: true if the conversion is successful, false otherwise. */ bool ktime_real_to_base_clock(ktime_t treal, enum clocksource_ids base_id, u64 *cycles) { struct timekeeper *tk = &tk_core.timekeeper; unsigned int seq; u64 delta; do { seq = read_seqcount_begin(&tk_core.seq); if ((u64)treal < tk->tkr_mono.base_real) return false; delta = (u64)treal - tk->tkr_mono.base_real; if (!convert_ns_to_cs(&delta)) return false; *cycles = tk->tkr_mono.cycle_last + delta; if (!convert_cs_to_base(cycles, base_id)) return false; } while (read_seqcount_retry(&tk_core.seq, seq)); return true; } EXPORT_SYMBOL_GPL(ktime_real_to_base_clock); /** * get_device_system_crosststamp - Synchronously capture system/device timestamp * @get_time_fn: Callback to get simultaneous device time and * system counter from the device driver * @ctx: Context passed to get_time_fn() * @history_begin: Historical reference point used to interpolate system * time when counter provided by the driver is before the current interval * @xtstamp: Receives simultaneously captured system and device time * * Reads a timestamp from a device and correlates it to system time */ int get_device_system_crosststamp(int (*get_time_fn) (ktime_t *device_time, struct system_counterval_t *sys_counterval, void *ctx), void *ctx, struct system_time_snapshot *history_begin, struct system_device_crosststamp *xtstamp) { struct system_counterval_t system_counterval; struct timekeeper *tk = &tk_core.timekeeper; u64 cycles, now, interval_start; unsigned int clock_was_set_seq = 0; ktime_t base_real, base_raw; u64 nsec_real, nsec_raw; u8 cs_was_changed_seq; unsigned int seq; bool do_interp; int ret; do { seq = read_seqcount_begin(&tk_core.seq); /* * Try to synchronously capture device time and a system * counter value calling back into the device driver */ ret = get_time_fn(&xtstamp->device, &system_counterval, ctx); if (ret) return ret; /* * Verify that the clocksource ID associated with the captured * system counter value is the same as for the currently * installed timekeeper clocksource */ if (system_counterval.cs_id == CSID_GENERIC || !convert_base_to_cs(&system_counterval)) return -ENODEV; cycles = system_counterval.cycles; /* * Check whether the system counter value provided by the * device driver is on the current timekeeping interval. 
*/ now = tk_clock_read(&tk->tkr_mono); interval_start = tk->tkr_mono.cycle_last; if (!timestamp_in_interval(interval_start, now, cycles)) { clock_was_set_seq = tk->clock_was_set_seq; cs_was_changed_seq = tk->cs_was_changed_seq; cycles = interval_start; do_interp = true; } else { do_interp = false; } base_real = ktime_add(tk->tkr_mono.base, tk_core.timekeeper.offs_real); base_raw = tk->tkr_raw.base; nsec_real = timekeeping_cycles_to_ns(&tk->tkr_mono, cycles); nsec_raw = timekeeping_cycles_to_ns(&tk->tkr_raw, cycles); } while (read_seqcount_retry(&tk_core.seq, seq)); xtstamp->sys_realtime = ktime_add_ns(base_real, nsec_real); xtstamp->sys_monoraw = ktime_add_ns(base_raw, nsec_raw); /* * Interpolate if necessary, adjusting back from the start of the * current interval */ if (do_interp) { u64 partial_history_cycles, total_history_cycles; bool discontinuity; /* * Check that the counter value is not before the provided * history reference and that the history doesn't cross a * clocksource change */ if (!history_begin || !timestamp_in_interval(history_begin->cycles, cycles, system_counterval.cycles) || history_begin->cs_was_changed_seq != cs_was_changed_seq) return -EINVAL; partial_history_cycles = cycles - system_counterval.cycles; total_history_cycles = cycles - history_begin->cycles; discontinuity = history_begin->clock_was_set_seq != clock_was_set_seq; ret = adjust_historical_crosststamp(history_begin, partial_history_cycles, total_history_cycles, discontinuity, xtstamp); if (ret) return ret; } return 0; } EXPORT_SYMBOL_GPL(get_device_system_crosststamp); /** * timekeeping_clocksource_has_base - Check whether the current clocksource * is based on given a base clock * @id: base clocksource ID * * Note: The return value is a snapshot which can become invalid right * after the function returns. * * Return: true if the timekeeper clocksource has a base clock with @id, * false otherwise */ bool timekeeping_clocksource_has_base(enum clocksource_ids id) { /* * This is a snapshot, so no point in using the sequence * count. Just prevent the compiler from re-evaluating @base as the * clocksource might change concurrently. */ struct clocksource_base *base = READ_ONCE(tk_core.timekeeper.tkr_mono.clock->base); return base ? base->id == id : false; } EXPORT_SYMBOL_GPL(timekeeping_clocksource_has_base); /** * do_settimeofday64 - Sets the time of day. * @ts: pointer to the timespec64 variable containing the new time * * Sets the time of day to the new time and update NTP and notify hrtimers */ int do_settimeofday64(const struct timespec64 *ts) { struct timespec64 ts_delta, xt; if (!timespec64_valid_settod(ts)) return -EINVAL; scoped_guard (raw_spinlock_irqsave, &tk_core.lock) { struct timekeeper *tks = &tk_core.shadow_timekeeper; timekeeping_forward_now(tks); xt = tk_xtime(tks); ts_delta = timespec64_sub(*ts, xt); if (timespec64_compare(&tks->wall_to_monotonic, &ts_delta) > 0) { timekeeping_restore_shadow(&tk_core); return -EINVAL; } tk_set_wall_to_mono(tks, timespec64_sub(tks->wall_to_monotonic, ts_delta)); tk_set_xtime(tks, ts); timekeeping_update_from_shadow(&tk_core, TK_UPDATE_ALL); } /* Signal hrtimers about time change */ clock_was_set(CLOCK_SET_WALL); audit_tk_injoffset(ts_delta); add_device_randomness(ts, sizeof(*ts)); return 0; } EXPORT_SYMBOL(do_settimeofday64); /** * timekeeping_inject_offset - Adds or subtracts from the current time. * @ts: Pointer to the timespec variable containing the offset * * Adds or subtracts an offset value from the current time. 
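 * This is used, for example, by timekeeping_warp_clock() below to shift the
 * clock by the timezone offset when the persistent clock keeps local time,
 * and for adjtimex() ADJ_SETOFFSET requests.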
*/ static int timekeeping_inject_offset(const struct timespec64 *ts) { if (ts->tv_nsec < 0 || ts->tv_nsec >= NSEC_PER_SEC) return -EINVAL; scoped_guard (raw_spinlock_irqsave, &tk_core.lock) { struct timekeeper *tks = &tk_core.shadow_timekeeper; struct timespec64 tmp; timekeeping_forward_now(tks); /* Make sure the proposed value is valid */ tmp = timespec64_add(tk_xtime(tks), *ts); if (timespec64_compare(&tks->wall_to_monotonic, ts) > 0 || !timespec64_valid_settod(&tmp)) { timekeeping_restore_shadow(&tk_core); return -EINVAL; } tk_xtime_add(tks, ts); tk_set_wall_to_mono(tks, timespec64_sub(tks->wall_to_monotonic, *ts)); timekeeping_update_from_shadow(&tk_core, TK_UPDATE_ALL); } /* Signal hrtimers about time change */ clock_was_set(CLOCK_SET_WALL); return 0; } /* * Indicates if there is an offset between the system clock and the hardware * clock/persistent clock/rtc. */ int persistent_clock_is_local; /* * Adjust the time obtained from the CMOS to be UTC time instead of * local time. * * This is ugly, but preferable to the alternatives. Otherwise we * would either need to write a program to do it in /etc/rc (and risk * confusion if the program gets run more than once; it would also be * hard to make the program warp the clock precisely n hours) or * compile in the timezone information into the kernel. Bad, bad.... * * - TYT, 1992-01-01 * * The best thing to do is to keep the CMOS clock in universal time (UTC) * as real UNIX machines always do it. This avoids all headaches about * daylight saving times and warping kernel clocks. */ void timekeeping_warp_clock(void) { if (sys_tz.tz_minuteswest != 0) { struct timespec64 adjust; persistent_clock_is_local = 1; adjust.tv_sec = sys_tz.tz_minuteswest * 60; adjust.tv_nsec = 0; timekeeping_inject_offset(&adjust); } } /* * __timekeeping_set_tai_offset - Sets the TAI offset from UTC and monotonic */ static void __timekeeping_set_tai_offset(struct timekeeper *tk, s32 tai_offset) { tk->tai_offset = tai_offset; tk->offs_tai = ktime_add(tk->offs_real, ktime_set(tai_offset, 0)); } /* * change_clocksource - Swaps clocksources if a new one is available * * Accumulates current time interval and initializes new clocksource */ static int change_clocksource(void *data) { struct clocksource *new = data, *old = NULL; /* * If the clocksource is in a module, get a module reference. * Succeeds for built-in code (owner == NULL) as well. Abort if the * reference can't be acquired. */ if (!try_module_get(new->owner)) return 0; /* Abort if the device can't be enabled */ if (new->enable && new->enable(new) != 0) { module_put(new->owner); return 0; } scoped_guard (raw_spinlock_irqsave, &tk_core.lock) { struct timekeeper *tks = &tk_core.shadow_timekeeper; timekeeping_forward_now(tks); old = tks->tkr_mono.clock; tk_setup_internals(tks, new); timekeeping_update_from_shadow(&tk_core, TK_UPDATE_ALL); } if (old) { if (old->disable) old->disable(old); module_put(old->owner); } return 0; } /** * timekeeping_notify - Install a new clock source * @clock: pointer to the clock source * * This function is called from clocksource.c after a new, better clock * source has been registered. The caller holds the clocksource_mutex. */ int timekeeping_notify(struct clocksource *clock) { struct timekeeper *tk = &tk_core.timekeeper; if (tk->tkr_mono.clock == clock) return 0; stop_machine(change_clocksource, clock, NULL); tick_clock_notify(); return tk->tkr_mono.clock == clock ? 
0 : -1; } /** * ktime_get_raw_ts64 - Returns the raw monotonic time in a timespec * @ts: pointer to the timespec64 to be set * * Returns the raw monotonic time (completely un-modified by ntp) */ void ktime_get_raw_ts64(struct timespec64 *ts) { struct timekeeper *tk = &tk_core.timekeeper; unsigned int seq; u64 nsecs; do { seq = read_seqcount_begin(&tk_core.seq); ts->tv_sec = tk->raw_sec; nsecs = timekeeping_get_ns(&tk->tkr_raw); } while (read_seqcount_retry(&tk_core.seq, seq)); ts->tv_nsec = 0; timespec64_add_ns(ts, nsecs); } EXPORT_SYMBOL(ktime_get_raw_ts64); /** * timekeeping_valid_for_hres - Check if timekeeping is suitable for hres */ int timekeeping_valid_for_hres(void) { struct timekeeper *tk = &tk_core.timekeeper; unsigned int seq; int ret; do { seq = read_seqcount_begin(&tk_core.seq); ret = tk->tkr_mono.clock->flags & CLOCK_SOURCE_VALID_FOR_HRES; } while (read_seqcount_retry(&tk_core.seq, seq)); return ret; } /** * timekeeping_max_deferment - Returns max time the clocksource can be deferred */ u64 timekeeping_max_deferment(void) { struct timekeeper *tk = &tk_core.timekeeper; unsigned int seq; u64 ret; do { seq = read_seqcount_begin(&tk_core.seq); ret = tk->tkr_mono.clock->max_idle_ns; } while (read_seqcount_retry(&tk_core.seq, seq)); return ret; } /** * read_persistent_clock64 - Return time from the persistent clock. * @ts: Pointer to the storage for the readout value * * Weak dummy function for arches that do not yet support it. * Reads the time from the battery backed persistent clock. * Returns a timespec with tv_sec=0 and tv_nsec=0 if unsupported. * * XXX - Do be sure to remove it once all arches implement it. */ void __weak read_persistent_clock64(struct timespec64 *ts) { ts->tv_sec = 0; ts->tv_nsec = 0; } /** * read_persistent_wall_and_boot_offset - Read persistent clock, and also offset * from the boot. * @wall_time: current time as returned by persistent clock * @boot_offset: offset that is defined as wall_time - boot_time * * Weak dummy function for arches that do not yet support it. * * The default function calculates offset based on the current value of * local_clock(). This way architectures that support sched_clock() but don't * support dedicated boot time clock will provide the best estimate of the * boot time. */ void __weak __init read_persistent_wall_and_boot_offset(struct timespec64 *wall_time, struct timespec64 *boot_offset) { read_persistent_clock64(wall_time); *boot_offset = ns_to_timespec64(local_clock()); } static __init void tkd_basic_setup(struct tk_data *tkd) { raw_spin_lock_init(&tkd->lock); seqcount_raw_spinlock_init(&tkd->seq, &tkd->lock); } /* * Flag reflecting whether timekeeping_resume() has injected sleeptime. * * The flag starts of false and is only set when a suspend reaches * timekeeping_suspend(), timekeeping_resume() sets it to false when the * timekeeper clocksource is not stopping across suspend and has been * used to update sleep time. If the timekeeper clocksource has stopped * then the flag stays true and is used by the RTC resume code to decide * whether sleeptime must be injected and if so the flag gets false then. * * If a suspend fails before reaching timekeeping_resume() then the flag * stays false and prevents erroneous sleeptime injection. 
*/ static bool suspend_timing_needed; /* Flag for if there is a persistent clock on this platform */ static bool persistent_clock_exists; /* * timekeeping_init - Initializes the clocksource and common timekeeping values */ void __init timekeeping_init(void) { struct timespec64 wall_time, boot_offset, wall_to_mono; struct timekeeper *tks = &tk_core.shadow_timekeeper; struct clocksource *clock; tkd_basic_setup(&tk_core); read_persistent_wall_and_boot_offset(&wall_time, &boot_offset); if (timespec64_valid_settod(&wall_time) && timespec64_to_ns(&wall_time) > 0) { persistent_clock_exists = true; } else if (timespec64_to_ns(&wall_time) != 0) { pr_warn("Persistent clock returned invalid value"); wall_time = (struct timespec64){0}; } if (timespec64_compare(&wall_time, &boot_offset) < 0) boot_offset = (struct timespec64){0}; /* * We want set wall_to_mono, so the following is true: * wall time + wall_to_mono = boot time */ wall_to_mono = timespec64_sub(boot_offset, wall_time); guard(raw_spinlock_irqsave)(&tk_core.lock); ntp_init(); clock = clocksource_default_clock(); if (clock->enable) clock->enable(clock); tk_setup_internals(tks, clock); tk_set_xtime(tks, &wall_time); tks->raw_sec = 0; tk_set_wall_to_mono(tks, wall_to_mono); timekeeping_update_from_shadow(&tk_core, TK_CLOCK_WAS_SET); } /* time in seconds when suspend began for persistent clock */ static struct timespec64 timekeeping_suspend_time; /** * __timekeeping_inject_sleeptime - Internal function to add sleep interval * @tk: Pointer to the timekeeper to be updated * @delta: Pointer to the delta value in timespec64 format * * Takes a timespec offset measuring a suspend interval and properly * adds the sleep offset to the timekeeping variables. */ static void __timekeeping_inject_sleeptime(struct timekeeper *tk, const struct timespec64 *delta) { if (!timespec64_valid_strict(delta)) { printk_deferred(KERN_WARNING "__timekeeping_inject_sleeptime: Invalid " "sleep delta value!\n"); return; } tk_xtime_add(tk, delta); tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, *delta)); tk_update_sleep_time(tk, timespec64_to_ktime(*delta)); tk_debug_account_sleep_time(delta); } #if defined(CONFIG_PM_SLEEP) && defined(CONFIG_RTC_HCTOSYS_DEVICE) /* * We have three kinds of time sources to use for sleep time * injection, the preference order is: * 1) non-stop clocksource * 2) persistent clock (ie: RTC accessible when irqs are off) * 3) RTC * * 1) and 2) are used by timekeeping, 3) by RTC subsystem. * If system has neither 1) nor 2), 3) will be used finally. * * * If timekeeping has injected sleeptime via either 1) or 2), * 3) becomes needless, so in this case we don't need to call * rtc_resume(), and this is what timekeeping_rtc_skipresume() * means. */ bool timekeeping_rtc_skipresume(void) { return !suspend_timing_needed; } /* * 1) can be determined whether to use or not only when doing * timekeeping_resume() which is invoked after rtc_suspend(), * so we can't skip rtc_suspend() surely if system has 1). * * But if system has 2), 2) will definitely be used, so in this * case we don't need to call rtc_suspend(), and this is what * timekeeping_rtc_skipsuspend() means. */ bool timekeeping_rtc_skipsuspend(void) { return persistent_clock_exists; } /** * timekeeping_inject_sleeptime64 - Adds suspend interval to timeekeeping values * @delta: pointer to a timespec64 delta value * * This hook is for architectures that cannot support read_persistent_clock64 * because their RTC/persistent clock is only accessible when irqs are enabled. 
* and also don't have an effective nonstop clocksource. * * This function should only be called by rtc_resume(), and allows * a suspend offset to be injected into the timekeeping values. */ void timekeeping_inject_sleeptime64(const struct timespec64 *delta) { scoped_guard(raw_spinlock_irqsave, &tk_core.lock) { struct timekeeper *tks = &tk_core.shadow_timekeeper; suspend_timing_needed = false; timekeeping_forward_now(tks); __timekeeping_inject_sleeptime(tks, delta); timekeeping_update_from_shadow(&tk_core, TK_UPDATE_ALL); } /* Signal hrtimers about time change */ clock_was_set(CLOCK_SET_WALL | CLOCK_SET_BOOT); } #endif /** * timekeeping_resume - Resumes the generic timekeeping subsystem. */ void timekeeping_resume(void) { struct timekeeper *tks = &tk_core.shadow_timekeeper; struct clocksource *clock = tks->tkr_mono.clock; struct timespec64 ts_new, ts_delta; bool inject_sleeptime = false; u64 cycle_now, nsec; unsigned long flags; read_persistent_clock64(&ts_new); clockevents_resume(); clocksource_resume(); raw_spin_lock_irqsave(&tk_core.lock, flags); /* * After system resumes, we need to calculate the suspended time and * compensate it for the OS time. There are 3 sources that could be * used: Nonstop clocksource during suspend, persistent clock and rtc * device. * * One specific platform may have 1 or 2 or all of them, and the * preference will be: * suspend-nonstop clocksource -> persistent clock -> rtc * The less preferred source will only be tried if there is no better * usable source. The rtc part is handled separately in rtc core code. */ cycle_now = tk_clock_read(&tks->tkr_mono); nsec = clocksource_stop_suspend_timing(clock, cycle_now); if (nsec > 0) { ts_delta = ns_to_timespec64(nsec); inject_sleeptime = true; } else if (timespec64_compare(&ts_new, &timekeeping_suspend_time) > 0) { ts_delta = timespec64_sub(ts_new, timekeeping_suspend_time); inject_sleeptime = true; } if (inject_sleeptime) { suspend_timing_needed = false; __timekeeping_inject_sleeptime(tks, &ts_delta); } /* Re-base the last cycle value */ tks->tkr_mono.cycle_last = cycle_now; tks->tkr_raw.cycle_last = cycle_now; tks->ntp_error = 0; timekeeping_suspended = 0; timekeeping_update_from_shadow(&tk_core, TK_CLOCK_WAS_SET); raw_spin_unlock_irqrestore(&tk_core.lock, flags); touch_softlockup_watchdog(); /* Resume the clockevent device(s) and hrtimers */ tick_resume(); /* Notify timerfd as resume is equivalent to clock_was_set() */ timerfd_resume(); } int timekeeping_suspend(void) { struct timekeeper *tks = &tk_core.shadow_timekeeper; struct timespec64 delta, delta_delta; static struct timespec64 old_delta; struct clocksource *curr_clock; unsigned long flags; u64 cycle_now; read_persistent_clock64(&timekeeping_suspend_time); /* * On some systems the persistent_clock can not be detected at * timekeeping_init by its return value, so if we see a valid * value returned, update the persistent_clock_exists flag. */ if (timekeeping_suspend_time.tv_sec || timekeeping_suspend_time.tv_nsec) persistent_clock_exists = true; suspend_timing_needed = true; raw_spin_lock_irqsave(&tk_core.lock, flags); timekeeping_forward_now(tks); timekeeping_suspended = 1; /* * Since we've called forward_now, cycle_last stores the value * just read from the current clocksource. Save this to potentially * use in suspend timing. 
*/ curr_clock = tks->tkr_mono.clock; cycle_now = tks->tkr_mono.cycle_last; clocksource_start_suspend_timing(curr_clock, cycle_now); if (persistent_clock_exists) { /* * To avoid drift caused by repeated suspend/resumes, * which each can add ~1 second drift error, * try to compensate so the difference in system time * and persistent_clock time stays close to constant. */ delta = timespec64_sub(tk_xtime(tks), timekeeping_suspend_time); delta_delta = timespec64_sub(delta, old_delta); if (abs(delta_delta.tv_sec) >= 2) { /* * if delta_delta is too large, assume time correction * has occurred and set old_delta to the current delta. */ old_delta = delta; } else { /* Otherwise try to adjust old_system to compensate */ timekeeping_suspend_time = timespec64_add(timekeeping_suspend_time, delta_delta); } } timekeeping_update_from_shadow(&tk_core, 0); halt_fast_timekeeper(tks); raw_spin_unlock_irqrestore(&tk_core.lock, flags); tick_suspend(); clocksource_suspend(); clockevents_suspend(); return 0; } /* sysfs resume/suspend bits for timekeeping */ static struct syscore_ops timekeeping_syscore_ops = { .resume = timekeeping_resume, .suspend = timekeeping_suspend, }; static int __init timekeeping_init_ops(void) { register_syscore_ops(&timekeeping_syscore_ops); return 0; } device_initcall(timekeeping_init_ops); /* * Apply a multiplier adjustment to the timekeeper */ static __always_inline void timekeeping_apply_adjustment(struct timekeeper *tk, s64 offset, s32 mult_adj) { s64 interval = tk->cycle_interval; if (mult_adj == 0) { return; } else if (mult_adj == -1) { interval = -interval; offset = -offset; } else if (mult_adj != 1) { interval *= mult_adj; offset *= mult_adj; } /* * So the following can be confusing. * * To keep things simple, lets assume mult_adj == 1 for now. * * When mult_adj != 1, remember that the interval and offset values * have been appropriately scaled so the math is the same. * * The basic idea here is that we're increasing the multiplier * by one, this causes the xtime_interval to be incremented by * one cycle_interval. This is because: * xtime_interval = cycle_interval * mult * So if mult is being incremented by one: * xtime_interval = cycle_interval * (mult + 1) * Its the same as: * xtime_interval = (cycle_interval * mult) + cycle_interval * Which can be shortened to: * xtime_interval += cycle_interval * * So offset stores the non-accumulated cycles. Thus the current * time (in shifted nanoseconds) is: * now = (offset * adj) + xtime_nsec * Now, even though we're adjusting the clock frequency, we have * to keep time consistent. In other words, we can't jump back * in time, and we also want to avoid jumping forward in time. * * So given the same offset value, we need the time to be the same * both before and after the freq adjustment. 
* now = (offset * adj_1) + xtime_nsec_1 * now = (offset * adj_2) + xtime_nsec_2 * So: * (offset * adj_1) + xtime_nsec_1 = * (offset * adj_2) + xtime_nsec_2 * And we know: * adj_2 = adj_1 + 1 * So: * (offset * adj_1) + xtime_nsec_1 = * (offset * (adj_1+1)) + xtime_nsec_2 * (offset * adj_1) + xtime_nsec_1 = * (offset * adj_1) + offset + xtime_nsec_2 * Canceling the sides: * xtime_nsec_1 = offset + xtime_nsec_2 * Which gives us: * xtime_nsec_2 = xtime_nsec_1 - offset * Which simplifies to: * xtime_nsec -= offset */ if ((mult_adj > 0) && (tk->tkr_mono.mult + mult_adj < mult_adj)) { /* NTP adjustment caused clocksource mult overflow */ WARN_ON_ONCE(1); return; } tk->tkr_mono.mult += mult_adj; tk->xtime_interval += interval; tk->tkr_mono.xtime_nsec -= offset; } /* * Adjust the timekeeper's multiplier to the correct frequency * and also to reduce the accumulated error value. */ static void timekeeping_adjust(struct timekeeper *tk, s64 offset) { u64 ntp_tl = ntp_tick_length(); u32 mult; /* * Determine the multiplier from the current NTP tick length. * Avoid expensive division when the tick length doesn't change. */ if (likely(tk->ntp_tick == ntp_tl)) { mult = tk->tkr_mono.mult - tk->ntp_err_mult; } else { tk->ntp_tick = ntp_tl; mult = div64_u64((tk->ntp_tick >> tk->ntp_error_shift) - tk->xtime_remainder, tk->cycle_interval); } /* * If the clock is behind the NTP time, increase the multiplier by 1 * to catch up with it. If it's ahead and there was a remainder in the * tick division, the clock will slow down. Otherwise it will stay * ahead until the tick length changes to a non-divisible value. */ tk->ntp_err_mult = tk->ntp_error > 0 ? 1 : 0; mult += tk->ntp_err_mult; timekeeping_apply_adjustment(tk, offset, mult - tk->tkr_mono.mult); if (unlikely(tk->tkr_mono.clock->maxadj && (abs(tk->tkr_mono.mult - tk->tkr_mono.clock->mult) > tk->tkr_mono.clock->maxadj))) { printk_once(KERN_WARNING "Adjusting %s more than 11%% (%ld vs %ld)\n", tk->tkr_mono.clock->name, (long)tk->tkr_mono.mult, (long)tk->tkr_mono.clock->mult + tk->tkr_mono.clock->maxadj); } /* * It may be possible that when we entered this function, xtime_nsec * was very small. Further, if we're slightly speeding the clocksource * in the code above, its possible the required corrective factor to * xtime_nsec could cause it to underflow. * * Now, since we have already accumulated the second and the NTP * subsystem has been notified via second_overflow(), we need to skip * the next update. */ if (unlikely((s64)tk->tkr_mono.xtime_nsec < 0)) { tk->tkr_mono.xtime_nsec += (u64)NSEC_PER_SEC << tk->tkr_mono.shift; tk->xtime_sec--; tk->skip_second_overflow = 1; } } /* * accumulate_nsecs_to_secs - Accumulates nsecs into secs * * Helper function that accumulates the nsecs greater than a second * from the xtime_nsec field to the xtime_secs field. * It also calls into the NTP code to handle leapsecond processing. */ static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk) { u64 nsecps = (u64)NSEC_PER_SEC << tk->tkr_mono.shift; unsigned int clock_set = 0; while (tk->tkr_mono.xtime_nsec >= nsecps) { int leap; tk->tkr_mono.xtime_nsec -= nsecps; tk->xtime_sec++; /* * Skip NTP update if this second was accumulated before, * i.e. 
xtime_nsec underflowed in timekeeping_adjust() */ if (unlikely(tk->skip_second_overflow)) { tk->skip_second_overflow = 0; continue; } /* Figure out if its a leap sec and apply if needed */ leap = second_overflow(tk->xtime_sec); if (unlikely(leap)) { struct timespec64 ts; tk->xtime_sec += leap; ts.tv_sec = leap; ts.tv_nsec = 0; tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, ts)); __timekeeping_set_tai_offset(tk, tk->tai_offset - leap); clock_set = TK_CLOCK_WAS_SET; } } return clock_set; } /* * logarithmic_accumulation - shifted accumulation of cycles * * This functions accumulates a shifted interval of cycles into * a shifted interval nanoseconds. Allows for O(log) accumulation * loop. * * Returns the unconsumed cycles. */ static u64 logarithmic_accumulation(struct timekeeper *tk, u64 offset, u32 shift, unsigned int *clock_set) { u64 interval = tk->cycle_interval << shift; u64 snsec_per_sec; /* If the offset is smaller than a shifted interval, do nothing */ if (offset < interval) return offset; /* Accumulate one shifted interval */ offset -= interval; tk->tkr_mono.cycle_last += interval; tk->tkr_raw.cycle_last += interval; tk->tkr_mono.xtime_nsec += tk->xtime_interval << shift; *clock_set |= accumulate_nsecs_to_secs(tk); /* Accumulate raw time */ tk->tkr_raw.xtime_nsec += tk->raw_interval << shift; snsec_per_sec = (u64)NSEC_PER_SEC << tk->tkr_raw.shift; while (tk->tkr_raw.xtime_nsec >= snsec_per_sec) { tk->tkr_raw.xtime_nsec -= snsec_per_sec; tk->raw_sec++; } /* Accumulate error between NTP and clock interval */ tk->ntp_error += tk->ntp_tick << shift; tk->ntp_error -= (tk->xtime_interval + tk->xtime_remainder) << (tk->ntp_error_shift + shift); return offset; } /* * timekeeping_advance - Updates the timekeeper to the current time and * current NTP tick length */ static bool timekeeping_advance(enum timekeeping_adv_mode mode) { struct timekeeper *tk = &tk_core.shadow_timekeeper; struct timekeeper *real_tk = &tk_core.timekeeper; unsigned int clock_set = 0; int shift = 0, maxshift; u64 offset; guard(raw_spinlock_irqsave)(&tk_core.lock); /* Make sure we're fully resumed: */ if (unlikely(timekeeping_suspended)) return false; offset = clocksource_delta(tk_clock_read(&tk->tkr_mono), tk->tkr_mono.cycle_last, tk->tkr_mono.mask, tk->tkr_mono.clock->max_raw_delta); /* Check if there's really nothing to do */ if (offset < real_tk->cycle_interval && mode == TK_ADV_TICK) return false; /* * With NO_HZ we may have to accumulate many cycle_intervals * (think "ticks") worth of time at once. To do this efficiently, * we calculate the largest doubling multiple of cycle_intervals * that is smaller than the offset. We then accumulate that * chunk in one go, and then try to consume the next smaller * doubled multiple. 
*/ shift = ilog2(offset) - ilog2(tk->cycle_interval); shift = max(0, shift); /* Bound shift to one less than what overflows tick_length */ maxshift = (64 - (ilog2(ntp_tick_length())+1)) - 1; shift = min(shift, maxshift); while (offset >= tk->cycle_interval) { offset = logarithmic_accumulation(tk, offset, shift, &clock_set); if (offset < tk->cycle_interval<<shift) shift--; } /* Adjust the multiplier to correct NTP error */ timekeeping_adjust(tk, offset); /* * Finally, make sure that after the rounding * xtime_nsec isn't larger than NSEC_PER_SEC */ clock_set |= accumulate_nsecs_to_secs(tk); timekeeping_update_from_shadow(&tk_core, clock_set); return !!clock_set; } /** * update_wall_time - Uses the current clocksource to increment the wall time * */ void update_wall_time(void) { if (timekeeping_advance(TK_ADV_TICK)) clock_was_set_delayed(); } /** * getboottime64 - Return the real time of system boot. * @ts: pointer to the timespec64 to be set * * Returns the wall-time of boot in a timespec64. * * This is based on the wall_to_monotonic offset and the total suspend * time. Calls to settimeofday will affect the value returned (which * basically means that however wrong your real time clock is at boot time, * you get the right time here). */ void getboottime64(struct timespec64 *ts) { struct timekeeper *tk = &tk_core.timekeeper; ktime_t t = ktime_sub(tk->offs_real, tk->offs_boot); *ts = ktime_to_timespec64(t); } EXPORT_SYMBOL_GPL(getboottime64); void ktime_get_coarse_real_ts64(struct timespec64 *ts) { struct timekeeper *tk = &tk_core.timekeeper; unsigned int seq; do { seq = read_seqcount_begin(&tk_core.seq); *ts = tk_xtime(tk); } while (read_seqcount_retry(&tk_core.seq, seq)); } EXPORT_SYMBOL(ktime_get_coarse_real_ts64); /** * ktime_get_coarse_real_ts64_mg - return latter of coarse grained time or floor * @ts: timespec64 to be filled * * Fetch the global mg_floor value, convert it to realtime and compare it * to the current coarse-grained time. Fill @ts with whichever is * latest. Note that this is a filesystem-specific interface and should be * avoided outside of that context. */ void ktime_get_coarse_real_ts64_mg(struct timespec64 *ts) { struct timekeeper *tk = &tk_core.timekeeper; u64 floor = atomic64_read(&mg_floor); ktime_t f_real, offset, coarse; unsigned int seq; do { seq = read_seqcount_begin(&tk_core.seq); *ts = tk_xtime(tk); offset = tk_core.timekeeper.offs_real; } while (read_seqcount_retry(&tk_core.seq, seq)); coarse = timespec64_to_ktime(*ts); f_real = ktime_add(floor, offset); if (ktime_after(f_real, coarse)) *ts = ktime_to_timespec64(f_real); } /** * ktime_get_real_ts64_mg - attempt to update floor value and return result * @ts: pointer to the timespec to be set * * Get a monotonic fine-grained time value and attempt to swap it into * mg_floor. If that succeeds then accept the new floor value. If it fails * then another task raced in during the interim time and updated the * floor. Since any update to the floor must be later than the previous * floor, either outcome is acceptable. * * Typically this will be called after calling ktime_get_coarse_real_ts64_mg(), * and determining that the resulting coarse-grained timestamp did not effect * a change in ctime. Any more recent floor value would effect a change to * ctime, so there is no need to retry the atomic64_try_cmpxchg() on failure. * * @ts will be filled with the latest floor value, regardless of the outcome of * the cmpxchg. Note that this is a filesystem specific interface and should be * avoided outside of that context. 
*/ void ktime_get_real_ts64_mg(struct timespec64 *ts) { struct timekeeper *tk = &tk_core.timekeeper; ktime_t old = atomic64_read(&mg_floor); ktime_t offset, mono; unsigned int seq; u64 nsecs; do { seq = read_seqcount_begin(&tk_core.seq); ts->tv_sec = tk->xtime_sec; mono = tk->tkr_mono.base; nsecs = timekeeping_get_ns(&tk->tkr_mono); offset = tk_core.timekeeper.offs_real; } while (read_seqcount_retry(&tk_core.seq, seq)); mono = ktime_add_ns(mono, nsecs); /* * Attempt to update the floor with the new time value. As any * update must be later then the existing floor, and would effect * a change to ctime from the perspective of the current task, * accept the resulting floor value regardless of the outcome of * the swap. */ if (atomic64_try_cmpxchg(&mg_floor, &old, mono)) { ts->tv_nsec = 0; timespec64_add_ns(ts, nsecs); timekeeping_inc_mg_floor_swaps(); } else { /* * Another task changed mg_floor since "old" was fetched. * "old" has been updated with the latest value of "mg_floor". * That value is newer than the previous floor value, which * is enough to effect a change to ctime. Accept it. */ *ts = ktime_to_timespec64(ktime_add(old, offset)); } } void ktime_get_coarse_ts64(struct timespec64 *ts) { struct timekeeper *tk = &tk_core.timekeeper; struct timespec64 now, mono; unsigned int seq; do { seq = read_seqcount_begin(&tk_core.seq); now = tk_xtime(tk); mono = tk->wall_to_monotonic; } while (read_seqcount_retry(&tk_core.seq, seq)); set_normalized_timespec64(ts, now.tv_sec + mono.tv_sec, now.tv_nsec + mono.tv_nsec); } EXPORT_SYMBOL(ktime_get_coarse_ts64); /* * Must hold jiffies_lock */ void do_timer(unsigned long ticks) { jiffies_64 += ticks; calc_global_load(); } /** * ktime_get_update_offsets_now - hrtimer helper * @cwsseq: pointer to check and store the clock was set sequence number * @offs_real: pointer to storage for monotonic -> realtime offset * @offs_boot: pointer to storage for monotonic -> boottime offset * @offs_tai: pointer to storage for monotonic -> clock tai offset * * Returns current monotonic time and updates the offsets if the * sequence number in @cwsseq and timekeeper.clock_was_set_seq are * different. * * Called from hrtimer_interrupt() or retrigger_next_event() */ ktime_t ktime_get_update_offsets_now(unsigned int *cwsseq, ktime_t *offs_real, ktime_t *offs_boot, ktime_t *offs_tai) { struct timekeeper *tk = &tk_core.timekeeper; unsigned int seq; ktime_t base; u64 nsecs; do { seq = read_seqcount_begin(&tk_core.seq); base = tk->tkr_mono.base; nsecs = timekeeping_get_ns(&tk->tkr_mono); base = ktime_add_ns(base, nsecs); if (*cwsseq != tk->clock_was_set_seq) { *cwsseq = tk->clock_was_set_seq; *offs_real = tk->offs_real; *offs_boot = tk->offs_boot; *offs_tai = tk->offs_tai; } /* Handle leapsecond insertion adjustments */ if (unlikely(base >= tk->next_leap_ktime)) *offs_real = ktime_sub(tk->offs_real, ktime_set(1, 0)); } while (read_seqcount_retry(&tk_core.seq, seq)); return base; } /* * timekeeping_validate_timex - Ensures the timex is ok for use in do_adjtimex */ static int timekeeping_validate_timex(const struct __kernel_timex *txc) { if (txc->modes & ADJ_ADJTIME) { /* singleshot must not be used with any other mode bits */ if (!(txc->modes & ADJ_OFFSET_SINGLESHOT)) return -EINVAL; if (!(txc->modes & ADJ_OFFSET_READONLY) && !capable(CAP_SYS_TIME)) return -EPERM; } else { /* In order to modify anything, you gotta be super-user! */ if (txc->modes && !capable(CAP_SYS_TIME)) return -EPERM; /* * if the quartz is off by more than 10% then * something is VERY wrong! 
*/ if (txc->modes & ADJ_TICK && (txc->tick < 900000/USER_HZ || txc->tick > 1100000/USER_HZ)) return -EINVAL; } if (txc->modes & ADJ_SETOFFSET) { /* In order to inject time, you gotta be super-user! */ if (!capable(CAP_SYS_TIME)) return -EPERM; /* * Validate if a timespec/timeval used to inject a time * offset is valid. Offsets can be positive or negative, so * we don't check tv_sec. The value of the timeval/timespec * is the sum of its fields,but *NOTE*: * The field tv_usec/tv_nsec must always be non-negative and * we can't have more nanoseconds/microseconds than a second. */ if (txc->time.tv_usec < 0) return -EINVAL; if (txc->modes & ADJ_NANO) { if (txc->time.tv_usec >= NSEC_PER_SEC) return -EINVAL; } else { if (txc->time.tv_usec >= USEC_PER_SEC) return -EINVAL; } } /* * Check for potential multiplication overflows that can * only happen on 64-bit systems: */ if ((txc->modes & ADJ_FREQUENCY) && (BITS_PER_LONG == 64)) { if (LLONG_MIN / PPM_SCALE > txc->freq) return -EINVAL; if (LLONG_MAX / PPM_SCALE < txc->freq) return -EINVAL; } return 0; } /** * random_get_entropy_fallback - Returns the raw clock source value, * used by random.c for platforms with no valid random_get_entropy(). */ unsigned long random_get_entropy_fallback(void) { struct tk_read_base *tkr = &tk_core.timekeeper.tkr_mono; struct clocksource *clock = READ_ONCE(tkr->clock); if (unlikely(timekeeping_suspended || !clock)) return 0; return clock->read(clock); } EXPORT_SYMBOL_GPL(random_get_entropy_fallback); /** * do_adjtimex() - Accessor function to NTP __do_adjtimex function * @txc: Pointer to kernel_timex structure containing NTP parameters */ int do_adjtimex(struct __kernel_timex *txc) { struct audit_ntp_data ad; bool offset_set = false; bool clock_set = false; struct timespec64 ts; int ret; /* Validate the data before disabling interrupts */ ret = timekeeping_validate_timex(txc); if (ret) return ret; add_device_randomness(txc, sizeof(*txc)); if (txc->modes & ADJ_SETOFFSET) { struct timespec64 delta; delta.tv_sec = txc->time.tv_sec; delta.tv_nsec = txc->time.tv_usec; if (!(txc->modes & ADJ_NANO)) delta.tv_nsec *= 1000; ret = timekeeping_inject_offset(&delta); if (ret) return ret; offset_set = delta.tv_sec != 0; audit_tk_injoffset(delta); } audit_ntp_init(&ad); ktime_get_real_ts64(&ts); add_device_randomness(&ts, sizeof(ts)); scoped_guard (raw_spinlock_irqsave, &tk_core.lock) { struct timekeeper *tks = &tk_core.shadow_timekeeper; s32 orig_tai, tai; orig_tai = tai = tks->tai_offset; ret = __do_adjtimex(txc, &ts, &tai, &ad); if (tai != orig_tai) { __timekeeping_set_tai_offset(tks, tai); timekeeping_update_from_shadow(&tk_core, TK_CLOCK_WAS_SET); clock_set = true; } else { tk_update_leap_state_all(&tk_core); } } audit_ntp_log(&ad); /* Update the multiplier immediately if frequency was set directly */ if (txc->modes & (ADJ_FREQUENCY | ADJ_TICK)) clock_set |= timekeeping_advance(TK_ADV_FREQ); if (clock_set) clock_was_set(CLOCK_SET_WALL); ntp_notify_cmos_timer(offset_set); return ret; } #ifdef CONFIG_NTP_PPS /** * hardpps() - Accessor function to NTP __hardpps function * @phase_ts: Pointer to timespec64 structure representing phase timestamp * @raw_ts: Pointer to timespec64 structure representing raw timestamp */ void hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_ts) { guard(raw_spinlock_irqsave)(&tk_core.lock); __hardpps(phase_ts, raw_ts); } EXPORT_SYMBOL(hardpps); #endif /* CONFIG_NTP_PPS */ |
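Before the second file begins, a few standalone sketches may help make the timekeeping logic above concrete. First, a minimal userspace C sketch (not kernel code) of the invariant that timekeeping_apply_adjustment() relies on: bumping the multiplier by one while subtracting the unaccumulated offset from xtime_nsec leaves the instantaneous time unchanged. All values below are made up for illustration.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t offset = 123456;	/* unaccumulated cycles (shifted ns), made up */
	uint64_t mult = 1000;		/* current multiplier, made up */
	uint64_t xtime_nsec = 987654;	/* accumulated shifted nanoseconds, made up */

	uint64_t now_before = offset * mult + xtime_nsec;

	/* The adjustment derived in the comment above: mult += 1, xtime_nsec -= offset */
	mult += 1;
	xtime_nsec -= offset;

	uint64_t now_after = offset * mult + xtime_nsec;

	assert(now_before == now_after);
	printf("time unchanged across the adjustment: %llu == %llu\n",
	       (unsigned long long)now_before, (unsigned long long)now_after);
	return 0;
}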
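A similarly minimal sketch of the carry loop in accumulate_nsecs_to_secs(): shifted nanoseconds beyond one full second are folded into the seconds counter. The shift value and starting numbers here are arbitrary, and the NTP/leap-second handling is only indicated by a comment.

#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_SEC	1000000000ULL

int main(void)
{
	unsigned int shift = 8;				/* made-up clocksource shift */
	uint64_t nsecps = NSEC_PER_SEC << shift;	/* one second, shifted */
	uint64_t xtime_sec = 1000;
	uint64_t xtime_nsec = 2 * nsecps + 12345;	/* a bit over two pending seconds */

	while (xtime_nsec >= nsecps) {
		xtime_nsec -= nsecps;
		xtime_sec++;
		/* the kernel calls second_overflow() here for NTP/leap-second handling */
	}
	printf("sec=%llu, remaining shifted nsec=%llu\n",
	       (unsigned long long)xtime_sec, (unsigned long long)xtime_nsec);
	return 0;
}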
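Next, a sketch of the power-of-two accumulation strategy used by timekeeping_advance() and logarithmic_accumulation(): a large cycle offset is consumed in the biggest multiple of cycle_interval that fits, then progressively smaller ones, so the loop runs O(log n) times instead of once per interval. The numbers and the ilog2 helper are local to this sketch.

#include <stdint.h>
#include <stdio.h>

static int ilog2_u64(uint64_t x)
{
	int l = -1;

	while (x) {
		x >>= 1;
		l++;
	}
	return l;
}

int main(void)
{
	uint64_t cycle_interval = 1000;	/* cycles per tick, made up */
	uint64_t offset = 123456;	/* cycles elapsed since the last update, made up */
	uint64_t accumulated = 0;
	int shift = ilog2_u64(offset) - ilog2_u64(cycle_interval);

	if (shift < 0)
		shift = 0;

	while (offset >= cycle_interval) {
		uint64_t chunk = cycle_interval << shift;

		if (offset >= chunk) {
			/* consume one power-of-two multiple of the interval */
			offset -= chunk;
			accumulated += chunk;
		}
		if (offset < chunk)
			shift--;	/* try the next smaller doubling */
	}
	printf("accumulated %llu cycles, unconsumed remainder %llu\n",
	       (unsigned long long)accumulated, (unsigned long long)offset);
	return 0;
}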
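A sketch of the compare-and-exchange floor update pattern described for ktime_get_real_ts64_mg(), using C11 atomics in place of the kernel's atomic64_try_cmpxchg(); mg_floor and the timestamp values here are stand-ins, and the realtime-offset conversion is omitted.

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

static _Atomic int64_t mg_floor;	/* shared floor, monotonic nanoseconds */

static int64_t update_floor(int64_t mono_now)
{
	int64_t old = atomic_load(&mg_floor);

	if (atomic_compare_exchange_strong(&mg_floor, &old, mono_now)) {
		/* we installed the new floor: report our own timestamp */
		return mono_now;
	}
	/*
	 * Another thread updated the floor after "old" was read; "old" now
	 * holds that newer value, which is still an acceptable floor.
	 */
	return old;
}

int main(void)
{
	atomic_store(&mg_floor, 1000);
	printf("result of update to 2000: %lld\n", (long long)update_floor(2000));
	printf("floor is now: %lld\n", (long long)atomic_load(&mg_floor));
	return 0;
}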
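Finally, a sketch of the ADJ_TICK sanity check in timekeeping_validate_timex(): the requested tick must stay within roughly +/-10% of the nominal USER_HZ tick or the request is rejected. USER_HZ is assumed to be 100 here purely for the demonstration.

#include <errno.h>
#include <stdio.h>

#define USER_HZ	100	/* assumed value, for illustration only */

static int validate_tick(long tick)
{
	/* reject requests more than ~10% away from the nominal tick */
	if (tick < 900000 / USER_HZ || tick > 1100000 / USER_HZ)
		return -EINVAL;
	return 0;
}

int main(void)
{
	printf("tick=10000 -> %d (nominal, accepted)\n", validate_tick(10000));
	printf("tick=9000  -> %d (lower bound, accepted)\n", validate_tick(9000));
	printf("tick=8999  -> %d (rejected)\n", validate_tick(8999));
	return 0;
}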
// SPDX-License-Identifier: GPL-2.0-only /* * Implementation of the security services. * * Authors : Stephen Smalley, <stephen.smalley.work@gmail.com> * James Morris <jmorris@redhat.com> * * Updated: Trusted Computer Solutions, Inc. <dgoeddel@trustedcs.com> * * Support for enhanced MLS infrastructure. * Support for context based audit filters. * * Updated: Frank Mayer <mayerf@tresys.com> and Karl MacMillan <kmacmillan@tresys.com> * * Added conditional policy language extensions * * Updated: Hewlett-Packard <paul@paul-moore.com> * * Added support for NetLabel * Added support for the policy capability bitmap * * Updated: Chad Sellers <csellers@tresys.com> * * Added validation of kernel classes and permissions * * Updated: KaiGai Kohei <kaigai@ak.jp.nec.com> * * Added support for bounds domain and audit messages on masked permissions * * Updated: Guido Trentalancia <guido@trentalancia.com> * * Added support for runtime switching of the policy type * * Copyright (C) 2008, 2009 NEC Corporation * Copyright (C) 2006, 2007 Hewlett-Packard Development Company, L.P. * Copyright (C) 2004-2006 Trusted Computer Solutions, Inc.
* Copyright (C) 2003 - 2004, 2006 Tresys Technology, LLC * Copyright (C) 2003 Red Hat, Inc., James Morris <jmorris@redhat.com> */ #include <linux/kernel.h> #include <linux/slab.h> #include <linux/string.h> #include <linux/spinlock.h> #include <linux/rcupdate.h> #include <linux/errno.h> #include <linux/in.h> #include <linux/sched.h> #include <linux/audit.h> #include <linux/vmalloc.h> #include <linux/lsm_hooks.h> #include <net/netlabel.h> #include "flask.h" #include "avc.h" #include "avc_ss.h" #include "security.h" #include "context.h" #include "policydb.h" #include "sidtab.h" #include "services.h" #include "conditional.h" #include "mls.h" #include "objsec.h" #include "netlabel.h" #include "xfrm.h" #include "ebitmap.h" #include "audit.h" #include "policycap_names.h" #include "ima.h" struct selinux_policy_convert_data { struct convert_context_args args; struct sidtab_convert_params sidtab_params; }; /* Forward declaration. */ static int context_struct_to_string(struct policydb *policydb, struct context *context, char **scontext, u32 *scontext_len); static int sidtab_entry_to_string(struct policydb *policydb, struct sidtab *sidtab, struct sidtab_entry *entry, char **scontext, u32 *scontext_len); static void context_struct_compute_av(struct policydb *policydb, struct context *scontext, struct context *tcontext, u16 tclass, struct av_decision *avd, struct extended_perms *xperms); static int selinux_set_mapping(struct policydb *pol, const struct security_class_mapping *map, struct selinux_map *out_map) { u16 i, j; bool print_unknown_handle = false; /* Find number of classes in the input mapping */ if (!map) return -EINVAL; i = 0; while (map[i].name) i++; /* Allocate space for the class records, plus one for class zero */ out_map->mapping = kcalloc(++i, sizeof(*out_map->mapping), GFP_ATOMIC); if (!out_map->mapping) return -ENOMEM; /* Store the raw class and permission values */ j = 0; while (map[j].name) { const struct security_class_mapping *p_in = map + (j++); struct selinux_mapping *p_out = out_map->mapping + j; u16 k; /* An empty class string skips ahead */ if (!strcmp(p_in->name, "")) { p_out->num_perms = 0; continue; } p_out->value = string_to_security_class(pol, p_in->name); if (!p_out->value) { pr_info("SELinux: Class %s not defined in policy.\n", p_in->name); if (pol->reject_unknown) goto err; p_out->num_perms = 0; print_unknown_handle = true; continue; } k = 0; while (p_in->perms[k]) { /* An empty permission string skips ahead */ if (!*p_in->perms[k]) { k++; continue; } p_out->perms[k] = string_to_av_perm(pol, p_out->value, p_in->perms[k]); if (!p_out->perms[k]) { pr_info("SELinux: Permission %s in class %s not defined in policy.\n", p_in->perms[k], p_in->name); if (pol->reject_unknown) goto err; print_unknown_handle = true; } k++; } p_out->num_perms = k; } if (print_unknown_handle) pr_info("SELinux: the above unknown classes and permissions will be %s\n", pol->allow_unknown ? 
"allowed" : "denied"); out_map->size = i; return 0; err: kfree(out_map->mapping); out_map->mapping = NULL; return -EINVAL; } /* * Get real, policy values from mapped values */ static u16 unmap_class(struct selinux_map *map, u16 tclass) { if (tclass < map->size) return map->mapping[tclass].value; return tclass; } /* * Get kernel value for class from its policy value */ static u16 map_class(struct selinux_map *map, u16 pol_value) { u16 i; for (i = 1; i < map->size; i++) { if (map->mapping[i].value == pol_value) return i; } return SECCLASS_NULL; } static void map_decision(struct selinux_map *map, u16 tclass, struct av_decision *avd, int allow_unknown) { if (tclass < map->size) { struct selinux_mapping *mapping = &map->mapping[tclass]; unsigned int i, n = mapping->num_perms; u32 result; for (i = 0, result = 0; i < n; i++) { if (avd->allowed & mapping->perms[i]) result |= (u32)1<<i; if (allow_unknown && !mapping->perms[i]) result |= (u32)1<<i; } avd->allowed = result; for (i = 0, result = 0; i < n; i++) if (avd->auditallow & mapping->perms[i]) result |= (u32)1<<i; avd->auditallow = result; for (i = 0, result = 0; i < n; i++) { if (avd->auditdeny & mapping->perms[i]) result |= (u32)1<<i; if (!allow_unknown && !mapping->perms[i]) result |= (u32)1<<i; } /* * In case the kernel has a bug and requests a permission * between num_perms and the maximum permission number, we * should audit that denial */ for (; i < (sizeof(u32)*8); i++) result |= (u32)1<<i; avd->auditdeny = result; } } int security_mls_enabled(void) { int mls_enabled; struct selinux_policy *policy; if (!selinux_initialized()) return 0; rcu_read_lock(); policy = rcu_dereference(selinux_state.policy); mls_enabled = policy->policydb.mls_enabled; rcu_read_unlock(); return mls_enabled; } /* * Return the boolean value of a constraint expression * when it is applied to the specified source and target * security contexts. * * xcontext is a special beast... It is used by the validatetrans rules * only. For these rules, scontext is the context before the transition, * tcontext is the context after the transition, and xcontext is the context * of the process performing the transition. All other callers of * constraint_expr_eval should pass in NULL for xcontext. 
*/ static int constraint_expr_eval(struct policydb *policydb, struct context *scontext, struct context *tcontext, struct context *xcontext, struct constraint_expr *cexpr) { u32 val1, val2; struct context *c; struct role_datum *r1, *r2; struct mls_level *l1, *l2; struct constraint_expr *e; int s[CEXPR_MAXDEPTH]; int sp = -1; for (e = cexpr; e; e = e->next) { switch (e->expr_type) { case CEXPR_NOT: BUG_ON(sp < 0); s[sp] = !s[sp]; break; case CEXPR_AND: BUG_ON(sp < 1); sp--; s[sp] &= s[sp + 1]; break; case CEXPR_OR: BUG_ON(sp < 1); sp--; s[sp] |= s[sp + 1]; break; case CEXPR_ATTR: if (sp == (CEXPR_MAXDEPTH - 1)) return 0; switch (e->attr) { case CEXPR_USER: val1 = scontext->user; val2 = tcontext->user; break; case CEXPR_TYPE: val1 = scontext->type; val2 = tcontext->type; break; case CEXPR_ROLE: val1 = scontext->role; val2 = tcontext->role; r1 = policydb->role_val_to_struct[val1 - 1]; r2 = policydb->role_val_to_struct[val2 - 1]; switch (e->op) { case CEXPR_DOM: s[++sp] = ebitmap_get_bit(&r1->dominates, val2 - 1); continue; case CEXPR_DOMBY: s[++sp] = ebitmap_get_bit(&r2->dominates, val1 - 1); continue; case CEXPR_INCOMP: s[++sp] = (!ebitmap_get_bit(&r1->dominates, val2 - 1) && !ebitmap_get_bit(&r2->dominates, val1 - 1)); continue; default: break; } break; case CEXPR_L1L2: l1 = &(scontext->range.level[0]); l2 = &(tcontext->range.level[0]); goto mls_ops; case CEXPR_L1H2: l1 = &(scontext->range.level[0]); l2 = &(tcontext->range.level[1]); goto mls_ops; case CEXPR_H1L2: l1 = &(scontext->range.level[1]); l2 = &(tcontext->range.level[0]); goto mls_ops; case CEXPR_H1H2: l1 = &(scontext->range.level[1]); l2 = &(tcontext->range.level[1]); goto mls_ops; case CEXPR_L1H1: l1 = &(scontext->range.level[0]); l2 = &(scontext->range.level[1]); goto mls_ops; case CEXPR_L2H2: l1 = &(tcontext->range.level[0]); l2 = &(tcontext->range.level[1]); goto mls_ops; mls_ops: switch (e->op) { case CEXPR_EQ: s[++sp] = mls_level_eq(l1, l2); continue; case CEXPR_NEQ: s[++sp] = !mls_level_eq(l1, l2); continue; case CEXPR_DOM: s[++sp] = mls_level_dom(l1, l2); continue; case CEXPR_DOMBY: s[++sp] = mls_level_dom(l2, l1); continue; case CEXPR_INCOMP: s[++sp] = mls_level_incomp(l2, l1); continue; default: BUG(); return 0; } break; default: BUG(); return 0; } switch (e->op) { case CEXPR_EQ: s[++sp] = (val1 == val2); break; case CEXPR_NEQ: s[++sp] = (val1 != val2); break; default: BUG(); return 0; } break; case CEXPR_NAMES: if (sp == (CEXPR_MAXDEPTH-1)) return 0; c = scontext; if (e->attr & CEXPR_TARGET) c = tcontext; else if (e->attr & CEXPR_XTARGET) { c = xcontext; if (!c) { BUG(); return 0; } } if (e->attr & CEXPR_USER) val1 = c->user; else if (e->attr & CEXPR_ROLE) val1 = c->role; else if (e->attr & CEXPR_TYPE) val1 = c->type; else { BUG(); return 0; } switch (e->op) { case CEXPR_EQ: s[++sp] = ebitmap_get_bit(&e->names, val1 - 1); break; case CEXPR_NEQ: s[++sp] = !ebitmap_get_bit(&e->names, val1 - 1); break; default: BUG(); return 0; } break; default: BUG(); return 0; } } BUG_ON(sp != 0); return s[0]; } /* * security_dump_masked_av - dumps masked permissions during * security_compute_av due to RBAC, MLS/Constraint and Type bounds. 
*/ static int dump_masked_av_helper(void *k, void *d, void *args) { struct perm_datum *pdatum = d; char **permission_names = args; BUG_ON(pdatum->value < 1 || pdatum->value > 32); permission_names[pdatum->value - 1] = (char *)k; return 0; } static void security_dump_masked_av(struct policydb *policydb, struct context *scontext, struct context *tcontext, u16 tclass, u32 permissions, const char *reason) { struct common_datum *common_dat; struct class_datum *tclass_dat; struct audit_buffer *ab; char *tclass_name; char *scontext_name = NULL; char *tcontext_name = NULL; char *permission_names[32]; int index; u32 length; bool need_comma = false; if (!permissions) return; tclass_name = sym_name(policydb, SYM_CLASSES, tclass - 1); tclass_dat = policydb->class_val_to_struct[tclass - 1]; common_dat = tclass_dat->comdatum; /* init permission_names */ if (common_dat && hashtab_map(&common_dat->permissions.table, dump_masked_av_helper, permission_names) < 0) goto out; if (hashtab_map(&tclass_dat->permissions.table, dump_masked_av_helper, permission_names) < 0) goto out; /* get scontext/tcontext in text form */ if (context_struct_to_string(policydb, scontext, &scontext_name, &length) < 0) goto out; if (context_struct_to_string(policydb, tcontext, &tcontext_name, &length) < 0) goto out; /* audit a message */ ab = audit_log_start(audit_context(), GFP_ATOMIC, AUDIT_SELINUX_ERR); if (!ab) goto out; audit_log_format(ab, "op=security_compute_av reason=%s " "scontext=%s tcontext=%s tclass=%s perms=", reason, scontext_name, tcontext_name, tclass_name); for (index = 0; index < 32; index++) { u32 mask = (1 << index); if ((mask & permissions) == 0) continue; audit_log_format(ab, "%s%s", need_comma ? "," : "", permission_names[index] ? permission_names[index] : "????"); need_comma = true; } audit_log_end(ab); out: /* release scontext/tcontext */ kfree(tcontext_name); kfree(scontext_name); } /* * security_boundary_permission - drops violated permissions * on boundary constraint. */ static void type_attribute_bounds_av(struct policydb *policydb, struct context *scontext, struct context *tcontext, u16 tclass, struct av_decision *avd) { struct context lo_scontext; struct context lo_tcontext, *tcontextp = tcontext; struct av_decision lo_avd; struct type_datum *source; struct type_datum *target; u32 masked = 0; source = policydb->type_val_to_struct[scontext->type - 1]; BUG_ON(!source); if (!source->bounds) return; target = policydb->type_val_to_struct[tcontext->type - 1]; BUG_ON(!target); memset(&lo_avd, 0, sizeof(lo_avd)); memcpy(&lo_scontext, scontext, sizeof(lo_scontext)); lo_scontext.type = source->bounds; if (target->bounds) { memcpy(&lo_tcontext, tcontext, sizeof(lo_tcontext)); lo_tcontext.type = target->bounds; tcontextp = &lo_tcontext; } context_struct_compute_av(policydb, &lo_scontext, tcontextp, tclass, &lo_avd, NULL); masked = ~lo_avd.allowed & avd->allowed; if (likely(!masked)) return; /* no masked permission */ /* mask violated permissions */ avd->allowed &= ~masked; /* audit masked permissions */ security_dump_masked_av(policydb, scontext, tcontext, tclass, masked, "bounds"); } /* * Flag which drivers have permissions and which base permissions are covered. 
*/ void services_compute_xperms_drivers( struct extended_perms *xperms, struct avtab_node *node) { unsigned int i; switch (node->datum.u.xperms->specified) { case AVTAB_XPERMS_IOCTLDRIVER: xperms->base_perms |= AVC_EXT_IOCTL; /* if one or more driver has all permissions allowed */ for (i = 0; i < ARRAY_SIZE(xperms->drivers.p); i++) xperms->drivers.p[i] |= node->datum.u.xperms->perms.p[i]; break; case AVTAB_XPERMS_IOCTLFUNCTION: xperms->base_perms |= AVC_EXT_IOCTL; /* if allowing permissions within a driver */ security_xperm_set(xperms->drivers.p, node->datum.u.xperms->driver); break; case AVTAB_XPERMS_NLMSG: xperms->base_perms |= AVC_EXT_NLMSG; /* if allowing permissions within a driver */ security_xperm_set(xperms->drivers.p, node->datum.u.xperms->driver); break; } xperms->len = 1; } /* * Compute access vectors and extended permissions based on a context * structure pair for the permissions in a particular class. */ static void context_struct_compute_av(struct policydb *policydb, struct context *scontext, struct context *tcontext, u16 tclass, struct av_decision *avd, struct extended_perms *xperms) { struct constraint_node *constraint; struct role_allow *ra; struct avtab_key avkey; struct avtab_node *node; struct class_datum *tclass_datum; struct ebitmap *sattr, *tattr; struct ebitmap_node *snode, *tnode; unsigned int i, j; avd->allowed = 0; avd->auditallow = 0; avd->auditdeny = 0xffffffff; if (xperms) { memset(xperms, 0, sizeof(*xperms)); } if (unlikely(!tclass || tclass > policydb->p_classes.nprim)) { pr_warn_ratelimited("SELinux: Invalid class %u\n", tclass); return; } tclass_datum = policydb->class_val_to_struct[tclass - 1]; /* * If a specific type enforcement rule was defined for * this permission check, then use it. */ avkey.target_class = tclass; avkey.specified = AVTAB_AV | AVTAB_XPERMS; sattr = &policydb->type_attr_map_array[scontext->type - 1]; tattr = &policydb->type_attr_map_array[tcontext->type - 1]; ebitmap_for_each_positive_bit(sattr, snode, i) { ebitmap_for_each_positive_bit(tattr, tnode, j) { avkey.source_type = i + 1; avkey.target_type = j + 1; for (node = avtab_search_node(&policydb->te_avtab, &avkey); node; node = avtab_search_node_next(node, avkey.specified)) { if (node->key.specified == AVTAB_ALLOWED) avd->allowed |= node->datum.u.data; else if (node->key.specified == AVTAB_AUDITALLOW) avd->auditallow |= node->datum.u.data; else if (node->key.specified == AVTAB_AUDITDENY) avd->auditdeny &= node->datum.u.data; else if (xperms && (node->key.specified & AVTAB_XPERMS)) services_compute_xperms_drivers(xperms, node); } /* Check conditional av table for additional permissions */ cond_compute_av(&policydb->te_cond_avtab, &avkey, avd, xperms); } } /* * Remove any permissions prohibited by a constraint (this includes * the MLS policy). */ constraint = tclass_datum->constraints; while (constraint) { if ((constraint->permissions & (avd->allowed)) && !constraint_expr_eval(policydb, scontext, tcontext, NULL, constraint->expr)) { avd->allowed &= ~(constraint->permissions); } constraint = constraint->next; } /* * If checking process transition permission and the * role is changing, then check the (current_role, new_role) * pair. 
*/ if (tclass == policydb->process_class && (avd->allowed & policydb->process_trans_perms) && scontext->role != tcontext->role) { for (ra = policydb->role_allow; ra; ra = ra->next) { if (scontext->role == ra->role && tcontext->role == ra->new_role) break; } if (!ra) avd->allowed &= ~policydb->process_trans_perms; } /* * If the given source and target types have boundary * constraint, lazy checks have to mask any violated * permission and notice it to userspace via audit. */ type_attribute_bounds_av(policydb, scontext, tcontext, tclass, avd); } static int security_validtrans_handle_fail(struct selinux_policy *policy, struct sidtab_entry *oentry, struct sidtab_entry *nentry, struct sidtab_entry *tentry, u16 tclass) { struct policydb *p = &policy->policydb; struct sidtab *sidtab = policy->sidtab; char *o = NULL, *n = NULL, *t = NULL; u32 olen, nlen, tlen; if (sidtab_entry_to_string(p, sidtab, oentry, &o, &olen)) goto out; if (sidtab_entry_to_string(p, sidtab, nentry, &n, &nlen)) goto out; if (sidtab_entry_to_string(p, sidtab, tentry, &t, &tlen)) goto out; audit_log(audit_context(), GFP_ATOMIC, AUDIT_SELINUX_ERR, "op=security_validate_transition seresult=denied" " oldcontext=%s newcontext=%s taskcontext=%s tclass=%s", o, n, t, sym_name(p, SYM_CLASSES, tclass-1)); out: kfree(o); kfree(n); kfree(t); if (!enforcing_enabled()) return 0; return -EPERM; } static int security_compute_validatetrans(u32 oldsid, u32 newsid, u32 tasksid, u16 orig_tclass, bool user) { struct selinux_policy *policy; struct policydb *policydb; struct sidtab *sidtab; struct sidtab_entry *oentry; struct sidtab_entry *nentry; struct sidtab_entry *tentry; struct class_datum *tclass_datum; struct constraint_node *constraint; u16 tclass; int rc = 0; if (!selinux_initialized()) return 0; rcu_read_lock(); policy = rcu_dereference(selinux_state.policy); policydb = &policy->policydb; sidtab = policy->sidtab; if (!user) tclass = unmap_class(&policy->map, orig_tclass); else tclass = orig_tclass; if (!tclass || tclass > policydb->p_classes.nprim) { rc = -EINVAL; goto out; } tclass_datum = policydb->class_val_to_struct[tclass - 1]; oentry = sidtab_search_entry(sidtab, oldsid); if (!oentry) { pr_err("SELinux: %s: unrecognized SID %d\n", __func__, oldsid); rc = -EINVAL; goto out; } nentry = sidtab_search_entry(sidtab, newsid); if (!nentry) { pr_err("SELinux: %s: unrecognized SID %d\n", __func__, newsid); rc = -EINVAL; goto out; } tentry = sidtab_search_entry(sidtab, tasksid); if (!tentry) { pr_err("SELinux: %s: unrecognized SID %d\n", __func__, tasksid); rc = -EINVAL; goto out; } constraint = tclass_datum->validatetrans; while (constraint) { if (!constraint_expr_eval(policydb, &oentry->context, &nentry->context, &tentry->context, constraint->expr)) { if (user) rc = -EPERM; else rc = security_validtrans_handle_fail(policy, oentry, nentry, tentry, tclass); goto out; } constraint = constraint->next; } out: rcu_read_unlock(); return rc; } int security_validate_transition_user(u32 oldsid, u32 newsid, u32 tasksid, u16 tclass) { return security_compute_validatetrans(oldsid, newsid, tasksid, tclass, true); } int security_validate_transition(u32 oldsid, u32 newsid, u32 tasksid, u16 orig_tclass) { return security_compute_validatetrans(oldsid, newsid, tasksid, orig_tclass, false); } /* * security_bounded_transition - check whether the given * transition is directed to bounded, or not. * It returns 0, if @newsid is bounded by @oldsid. * Otherwise, it returns error code. 
* * @oldsid : current security identifier * @newsid : destinated security identifier */ int security_bounded_transition(u32 old_sid, u32 new_sid) { struct selinux_policy *policy; struct policydb *policydb; struct sidtab *sidtab; struct sidtab_entry *old_entry, *new_entry; struct type_datum *type; u32 index; int rc; if (!selinux_initialized()) return 0; rcu_read_lock(); policy = rcu_dereference(selinux_state.policy); policydb = &policy->policydb; sidtab = policy->sidtab; rc = -EINVAL; old_entry = sidtab_search_entry(sidtab, old_sid); if (!old_entry) { pr_err("SELinux: %s: unrecognized SID %u\n", __func__, old_sid); goto out; } rc = -EINVAL; new_entry = sidtab_search_entry(sidtab, new_sid); if (!new_entry) { pr_err("SELinux: %s: unrecognized SID %u\n", __func__, new_sid); goto out; } rc = 0; /* type/domain unchanged */ if (old_entry->context.type == new_entry->context.type) goto out; index = new_entry->context.type; while (true) { type = policydb->type_val_to_struct[index - 1]; BUG_ON(!type); /* not bounded anymore */ rc = -EPERM; if (!type->bounds) break; /* @newsid is bounded by @oldsid */ rc = 0; if (type->bounds == old_entry->context.type) break; index = type->bounds; } if (rc) { char *old_name = NULL; char *new_name = NULL; u32 length; if (!sidtab_entry_to_string(policydb, sidtab, old_entry, &old_name, &length) && !sidtab_entry_to_string(policydb, sidtab, new_entry, &new_name, &length)) { audit_log(audit_context(), GFP_ATOMIC, AUDIT_SELINUX_ERR, "op=security_bounded_transition " "seresult=denied " "oldcontext=%s newcontext=%s", old_name, new_name); } kfree(new_name); kfree(old_name); } out: rcu_read_unlock(); return rc; } static void avd_init(struct selinux_policy *policy, struct av_decision *avd) { avd->allowed = 0; avd->auditallow = 0; avd->auditdeny = 0xffffffff; if (policy) avd->seqno = policy->latest_granting; else avd->seqno = 0; avd->flags = 0; } static void update_xperms_extended_data(u8 specified, const struct extended_perms_data *from, struct extended_perms_data *xp_data) { unsigned int i; switch (specified) { case AVTAB_XPERMS_IOCTLDRIVER: memset(xp_data->p, 0xff, sizeof(xp_data->p)); break; case AVTAB_XPERMS_IOCTLFUNCTION: case AVTAB_XPERMS_NLMSG: for (i = 0; i < ARRAY_SIZE(xp_data->p); i++) xp_data->p[i] |= from->p[i]; break; } } void services_compute_xperms_decision(struct extended_perms_decision *xpermd, struct avtab_node *node) { u16 specified; switch (node->datum.u.xperms->specified) { case AVTAB_XPERMS_IOCTLFUNCTION: if (xpermd->base_perm != AVC_EXT_IOCTL || xpermd->driver != node->datum.u.xperms->driver) return; break; case AVTAB_XPERMS_IOCTLDRIVER: if (xpermd->base_perm != AVC_EXT_IOCTL || !security_xperm_test(node->datum.u.xperms->perms.p, xpermd->driver)) return; break; case AVTAB_XPERMS_NLMSG: if (xpermd->base_perm != AVC_EXT_NLMSG || xpermd->driver != node->datum.u.xperms->driver) return; break; default: pr_warn_once( "SELinux: unknown extended permission (%u) will be ignored\n", node->datum.u.xperms->specified); return; } specified = node->key.specified & ~(AVTAB_ENABLED | AVTAB_ENABLED_OLD); if (specified == AVTAB_XPERMS_ALLOWED) { xpermd->used |= XPERMS_ALLOWED; update_xperms_extended_data(node->datum.u.xperms->specified, &node->datum.u.xperms->perms, xpermd->allowed); } else if (specified == AVTAB_XPERMS_AUDITALLOW) { xpermd->used |= XPERMS_AUDITALLOW; update_xperms_extended_data(node->datum.u.xperms->specified, &node->datum.u.xperms->perms, xpermd->auditallow); } else if (specified == AVTAB_XPERMS_DONTAUDIT) { xpermd->used |= XPERMS_DONTAUDIT; 
update_xperms_extended_data(node->datum.u.xperms->specified, &node->datum.u.xperms->perms, xpermd->dontaudit); } else { pr_warn_once("SELinux: unknown specified key (%u)\n", node->key.specified); } } void security_compute_xperms_decision(u32 ssid, u32 tsid, u16 orig_tclass, u8 driver, u8 base_perm, struct extended_perms_decision *xpermd) { struct selinux_policy *policy; struct policydb *policydb; struct sidtab *sidtab; u16 tclass; struct context *scontext, *tcontext; struct avtab_key avkey; struct avtab_node *node; struct ebitmap *sattr, *tattr; struct ebitmap_node *snode, *tnode; unsigned int i, j; xpermd->base_perm = base_perm; xpermd->driver = driver; xpermd->used = 0; memset(xpermd->allowed->p, 0, sizeof(xpermd->allowed->p)); memset(xpermd->auditallow->p, 0, sizeof(xpermd->auditallow->p)); memset(xpermd->dontaudit->p, 0, sizeof(xpermd->dontaudit->p)); rcu_read_lock(); if (!selinux_initialized()) goto allow; policy = rcu_dereference(selinux_state.policy); policydb = &policy->policydb; sidtab = policy->sidtab; scontext = sidtab_search(sidtab, ssid); if (!scontext) { pr_err("SELinux: %s: unrecognized SID %d\n", __func__, ssid); goto out; } tcontext = sidtab_search(sidtab, tsid); if (!tcontext) { pr_err("SELinux: %s: unrecognized SID %d\n", __func__, tsid); goto out; } tclass = unmap_class(&policy->map, orig_tclass); if (unlikely(orig_tclass && !tclass)) { if (policydb->allow_unknown) goto allow; goto out; } if (unlikely(!tclass || tclass > policydb->p_classes.nprim)) { pr_warn_ratelimited("SELinux: Invalid class %hu\n", tclass); goto out; } avkey.target_class = tclass; avkey.specified = AVTAB_XPERMS; sattr = &policydb->type_attr_map_array[scontext->type - 1]; tattr = &policydb->type_attr_map_array[tcontext->type - 1]; ebitmap_for_each_positive_bit(sattr, snode, i) { ebitmap_for_each_positive_bit(tattr, tnode, j) { avkey.source_type = i + 1; avkey.target_type = j + 1; for (node = avtab_search_node(&policydb->te_avtab, &avkey); node; node = avtab_search_node_next(node, avkey.specified)) services_compute_xperms_decision(xpermd, node); cond_compute_xperms(&policydb->te_cond_avtab, &avkey, xpermd); } } out: rcu_read_unlock(); return; allow: memset(xpermd->allowed->p, 0xff, sizeof(xpermd->allowed->p)); goto out; } /** * security_compute_av - Compute access vector decisions. * @ssid: source security identifier * @tsid: target security identifier * @orig_tclass: target security class * @avd: access vector decisions * @xperms: extended permissions * * Compute a set of access vector decisions based on the * SID pair (@ssid, @tsid) for the permissions in @tclass. */ void security_compute_av(u32 ssid, u32 tsid, u16 orig_tclass, struct av_decision *avd, struct extended_perms *xperms) { struct selinux_policy *policy; struct policydb *policydb; struct sidtab *sidtab; u16 tclass; struct context *scontext = NULL, *tcontext = NULL; rcu_read_lock(); policy = rcu_dereference(selinux_state.policy); avd_init(policy, avd); xperms->len = 0; if (!selinux_initialized()) goto allow; policydb = &policy->policydb; sidtab = policy->sidtab; scontext = sidtab_search(sidtab, ssid); if (!scontext) { pr_err("SELinux: %s: unrecognized SID %d\n", __func__, ssid); goto out; } /* permissive domain? 
*/ if (ebitmap_get_bit(&policydb->permissive_map, scontext->type)) avd->flags |= AVD_FLAGS_PERMISSIVE; tcontext = sidtab_search(sidtab, tsid); if (!tcontext) { pr_err("SELinux: %s: unrecognized SID %d\n", __func__, tsid); goto out; } tclass = unmap_class(&policy->map, orig_tclass); if (unlikely(orig_tclass && !tclass)) { if (policydb->allow_unknown) goto allow; goto out; } context_struct_compute_av(policydb, scontext, tcontext, tclass, avd, xperms); map_decision(&policy->map, orig_tclass, avd, policydb->allow_unknown); out: rcu_read_unlock(); return; allow: avd->allowed = 0xffffffff; goto out; } void security_compute_av_user(u32 ssid, u32 tsid, u16 tclass, struct av_decision *avd) { struct selinux_policy *policy; struct policydb *policydb; struct sidtab *sidtab; struct context *scontext = NULL, *tcontext = NULL; rcu_read_lock(); policy = rcu_dereference(selinux_state.policy); avd_init(policy, avd); if (!selinux_initialized()) goto allow; policydb = &policy->policydb; sidtab = policy->sidtab; scontext = sidtab_search(sidtab, ssid); if (!scontext) { pr_err("SELinux: %s: unrecognized SID %d\n", __func__, ssid); goto out; } /* permissive domain? */ if (ebitmap_get_bit(&policydb->permissive_map, scontext->type)) avd->flags |= AVD_FLAGS_PERMISSIVE; tcontext = sidtab_search(sidtab, tsid); if (!tcontext) { pr_err("SELinux: %s: unrecognized SID %d\n", __func__, tsid); goto out; } if (unlikely(!tclass)) { if (policydb->allow_unknown) goto allow; goto out; } context_struct_compute_av(policydb, scontext, tcontext, tclass, avd, NULL); out: rcu_read_unlock(); return; allow: avd->allowed = 0xffffffff; goto out; } /* * Write the security context string representation of * the context structure `context' into a dynamically * allocated string of the correct size. Set `*scontext' * to point to this string and set `*scontext_len' to * the length of the string. */ static int context_struct_to_string(struct policydb *p, struct context *context, char **scontext, u32 *scontext_len) { char *scontextp; if (scontext) *scontext = NULL; *scontext_len = 0; if (context->len) { *scontext_len = context->len; if (scontext) { *scontext = kstrdup(context->str, GFP_ATOMIC); if (!(*scontext)) return -ENOMEM; } return 0; } /* Compute the size of the context. */ *scontext_len += strlen(sym_name(p, SYM_USERS, context->user - 1)) + 1; *scontext_len += strlen(sym_name(p, SYM_ROLES, context->role - 1)) + 1; *scontext_len += strlen(sym_name(p, SYM_TYPES, context->type - 1)) + 1; *scontext_len += mls_compute_context_len(p, context); if (!scontext) return 0; /* Allocate space for the context; caller must free this space. */ scontextp = kmalloc(*scontext_len, GFP_ATOMIC); if (!scontextp) return -ENOMEM; *scontext = scontextp; /* * Copy the user name, role name and type name into the context. 
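 * The result is the usual "user:role:type" prefix, e.g.
 * "system_u:object_r:etc_t", with any MLS fields appended afterwards by
 * mls_sid_to_context() below (e.g. ":s0").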
*/ scontextp += sprintf(scontextp, "%s:%s:%s", sym_name(p, SYM_USERS, context->user - 1), sym_name(p, SYM_ROLES, context->role - 1), sym_name(p, SYM_TYPES, context->type - 1)); mls_sid_to_context(p, context, &scontextp); *scontextp = 0; return 0; } static int sidtab_entry_to_string(struct policydb *p, struct sidtab *sidtab, struct sidtab_entry *entry, char **scontext, u32 *scontext_len) { int rc = sidtab_sid2str_get(sidtab, entry, scontext, scontext_len); if (rc != -ENOENT) return rc; rc = context_struct_to_string(p, &entry->context, scontext, scontext_len); if (!rc && scontext) sidtab_sid2str_put(sidtab, entry, *scontext, *scontext_len); return rc; } #include "initial_sid_to_string.h" int security_sidtab_hash_stats(char *page) { struct selinux_policy *policy; int rc; if (!selinux_initialized()) { pr_err("SELinux: %s: called before initial load_policy\n", __func__); return -EINVAL; } rcu_read_lock(); policy = rcu_dereference(selinux_state.policy); rc = sidtab_hash_stats(policy->sidtab, page); rcu_read_unlock(); return rc; } const char *security_get_initial_sid_context(u32 sid) { if (unlikely(sid > SECINITSID_NUM)) return NULL; return initial_sid_to_string[sid]; } static int security_sid_to_context_core(u32 sid, char **scontext, u32 *scontext_len, int force, int only_invalid) { struct selinux_policy *policy; struct policydb *policydb; struct sidtab *sidtab; struct sidtab_entry *entry; int rc = 0; if (scontext) *scontext = NULL; *scontext_len = 0; if (!selinux_initialized()) { if (sid <= SECINITSID_NUM) { char *scontextp; const char *s; /* * Before the policy is loaded, translate * SECINITSID_INIT to "kernel", because systemd and * libselinux < 2.6 take a getcon_raw() result that is * both non-null and not "kernel" to mean that a policy * is already loaded. */ if (sid == SECINITSID_INIT) sid = SECINITSID_KERNEL; s = initial_sid_to_string[sid]; if (!s) return -EINVAL; *scontext_len = strlen(s) + 1; if (!scontext) return 0; scontextp = kmemdup(s, *scontext_len, GFP_ATOMIC); if (!scontextp) return -ENOMEM; *scontext = scontextp; return 0; } pr_err("SELinux: %s: called before initial " "load_policy on unknown SID %d\n", __func__, sid); return -EINVAL; } rcu_read_lock(); policy = rcu_dereference(selinux_state.policy); policydb = &policy->policydb; sidtab = policy->sidtab; if (force) entry = sidtab_search_entry_force(sidtab, sid); else entry = sidtab_search_entry(sidtab, sid); if (!entry) { pr_err("SELinux: %s: unrecognized SID %d\n", __func__, sid); rc = -EINVAL; goto out_unlock; } if (only_invalid && !entry->context.len) goto out_unlock; rc = sidtab_entry_to_string(policydb, sidtab, entry, scontext, scontext_len); out_unlock: rcu_read_unlock(); return rc; } /** * security_sid_to_context - Obtain a context for a given SID. * @sid: security identifier, SID * @scontext: security context * @scontext_len: length in bytes * * Write the string representation of the context associated with @sid * into a dynamically allocated string of the correct size. Set @scontext * to point to this string and set @scontext_len to the length of the string. */ int security_sid_to_context(u32 sid, char **scontext, u32 *scontext_len) { return security_sid_to_context_core(sid, scontext, scontext_len, 0, 0); } int security_sid_to_context_force(u32 sid, char **scontext, u32 *scontext_len) { return security_sid_to_context_core(sid, scontext, scontext_len, 1, 0); } /** * security_sid_to_context_inval - Obtain a context for a given SID if it * is invalid. 
* @sid: security identifier, SID * @scontext: security context * @scontext_len: length in bytes * * Write the string representation of the context associated with @sid * into a dynamically allocated string of the correct size, but only if the * context is invalid in the current policy. Set @scontext to point to * this string (or NULL if the context is valid) and set @scontext_len to * the length of the string (or 0 if the context is valid). */ int security_sid_to_context_inval(u32 sid, char **scontext, u32 *scontext_len) { return security_sid_to_context_core(sid, scontext, scontext_len, 1, 1); } /* * Caveat: Mutates scontext. */ static int string_to_context_struct(struct policydb *pol, struct sidtab *sidtabp, char *scontext, struct context *ctx, u32 def_sid) { struct role_datum *role; struct type_datum *typdatum; struct user_datum *usrdatum; char *scontextp, *p, oldc; int rc = 0; context_init(ctx); /* Parse the security context. */ rc = -EINVAL; scontextp = scontext; /* Extract the user. */ p = scontextp; while (*p && *p != ':') p++; if (*p == 0) goto out; *p++ = 0; usrdatum = symtab_search(&pol->p_users, scontextp); if (!usrdatum) goto out; ctx->user = usrdatum->value; /* Extract role. */ scontextp = p; while (*p && *p != ':') p++; if (*p == 0) goto out; *p++ = 0; role = symtab_search(&pol->p_roles, scontextp); if (!role) goto out; ctx->role = role->value; /* Extract type. */ scontextp = p; while (*p && *p != ':') p++; oldc = *p; *p++ = 0; typdatum = symtab_search(&pol->p_types, scontextp); if (!typdatum || typdatum->attribute) goto out; ctx->type = typdatum->value; rc = mls_context_to_sid(pol, oldc, p, ctx, sidtabp, def_sid); if (rc) goto out; /* Check the validity of the new context. */ rc = -EINVAL; if (!policydb_context_isvalid(pol, ctx)) goto out; rc = 0; out: if (rc) context_destroy(ctx); return rc; } static int security_context_to_sid_core(const char *scontext, u32 scontext_len, u32 *sid, u32 def_sid, gfp_t gfp_flags, int force) { struct selinux_policy *policy; struct policydb *policydb; struct sidtab *sidtab; char *scontext2, *str = NULL; struct context context; int rc = 0; /* An empty security context is never valid. */ if (!scontext_len) return -EINVAL; /* Copy the string to allow changes and ensure a NUL terminator */ scontext2 = kmemdup_nul(scontext, scontext_len, gfp_flags); if (!scontext2) return -ENOMEM; if (!selinux_initialized()) { u32 i; for (i = 1; i < SECINITSID_NUM; i++) { const char *s = initial_sid_to_string[i]; if (s && !strcmp(s, scontext2)) { *sid = i; goto out; } } *sid = SECINITSID_KERNEL; goto out; } *sid = SECSID_NULL; if (force) { /* Save another copy for storing in uninterpreted form */ rc = -ENOMEM; str = kstrdup(scontext2, gfp_flags); if (!str) goto out; } retry: rcu_read_lock(); policy = rcu_dereference(selinux_state.policy); policydb = &policy->policydb; sidtab = policy->sidtab; rc = string_to_context_struct(policydb, sidtab, scontext2, &context, def_sid); if (rc == -EINVAL && force) { context.str = str; context.len = strlen(str) + 1; str = NULL; } else if (rc) goto out_unlock; rc = sidtab_context_to_sid(sidtab, &context, sid); if (rc == -ESTALE) { rcu_read_unlock(); if (context.str) { str = context.str; context.str = NULL; } context_destroy(&context); goto retry; } context_destroy(&context); out_unlock: rcu_read_unlock(); out: kfree(scontext2); kfree(str); return rc; } /** * security_context_to_sid - Obtain a SID for a given security context. 
* @scontext: security context * @scontext_len: length in bytes * @sid: security identifier, SID * @gfp: context for the allocation * * Obtains a SID associated with the security context that * has the string representation specified by @scontext. * Returns -%EINVAL if the context is invalid, -%ENOMEM if insufficient * memory is available, or 0 on success. */ int security_context_to_sid(const char *scontext, u32 scontext_len, u32 *sid, gfp_t gfp) { return security_context_to_sid_core(scontext, scontext_len, sid, SECSID_NULL, gfp, 0); } int security_context_str_to_sid(const char *scontext, u32 *sid, gfp_t gfp) { return security_context_to_sid(scontext, strlen(scontext), sid, gfp); } /** * security_context_to_sid_default - Obtain a SID for a given security context, * falling back to specified default if needed. * * @scontext: security context * @scontext_len: length in bytes * @sid: security identifier, SID * @def_sid: default SID to assign on error * @gfp_flags: the allocator get-free-page (GFP) flags * * Obtains a SID associated with the security context that * has the string representation specified by @scontext. * The default SID is passed to the MLS layer to be used to allow * kernel labeling of the MLS field if the MLS field is not present * (for upgrading to MLS without full relabel). * Implicitly forces adding of the context even if it cannot be mapped yet. * Returns -%EINVAL if the context is invalid, -%ENOMEM if insufficient * memory is available, or 0 on success. */ int security_context_to_sid_default(const char *scontext, u32 scontext_len, u32 *sid, u32 def_sid, gfp_t gfp_flags) { return security_context_to_sid_core(scontext, scontext_len, sid, def_sid, gfp_flags, 1); } int security_context_to_sid_force(const char *scontext, u32 scontext_len, u32 *sid) { return security_context_to_sid_core(scontext, scontext_len, sid, SECSID_NULL, GFP_KERNEL, 1); } static int compute_sid_handle_invalid_context( struct selinux_policy *policy, struct sidtab_entry *sentry, struct sidtab_entry *tentry, u16 tclass, struct context *newcontext) { struct policydb *policydb = &policy->policydb; struct sidtab *sidtab = policy->sidtab; char *s = NULL, *t = NULL, *n = NULL; u32 slen, tlen, nlen; struct audit_buffer *ab; if (sidtab_entry_to_string(policydb, sidtab, sentry, &s, &slen)) goto out; if (sidtab_entry_to_string(policydb, sidtab, tentry, &t, &tlen)) goto out; if (context_struct_to_string(policydb, newcontext, &n, &nlen)) goto out; ab = audit_log_start(audit_context(), GFP_ATOMIC, AUDIT_SELINUX_ERR); if (!ab) goto out; audit_log_format(ab, "op=security_compute_sid invalid_context="); /* no need to record the NUL with untrusted strings */ audit_log_n_untrustedstring(ab, n, nlen - 1); audit_log_format(ab, " scontext=%s tcontext=%s tclass=%s", s, t, sym_name(policydb, SYM_CLASSES, tclass-1)); audit_log_end(ab); out: kfree(s); kfree(t); kfree(n); if (!enforcing_enabled()) return 0; return -EACCES; } static void filename_compute_type(struct policydb *policydb, struct context *newcontext, u32 stype, u32 ttype, u16 tclass, const char *objname) { struct filename_trans_key ft; struct filename_trans_datum *datum; /* * Most filename trans rules are going to live in specific directories * like /dev or /var/run. This bitmap will quickly skip rule searches * if the ttype does not contain any rules. 
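 * A matching rule in policy source looks roughly like
 *   type_transition init_t tmp_t : sock_file init_tmp_t "mysock";
 * (illustrative type names only): the resulting object type depends on
 * the last path component as well as the source type, target type and
 * class.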
*/ if (!ebitmap_get_bit(&policydb->filename_trans_ttypes, ttype)) return; ft.ttype = ttype; ft.tclass = tclass; ft.name = objname; datum = policydb_filenametr_search(policydb, &ft); while (datum) { if (ebitmap_get_bit(&datum->stypes, stype - 1)) { newcontext->type = datum->otype; return; } datum = datum->next; } } static int security_compute_sid(u32 ssid, u32 tsid, u16 orig_tclass, u16 specified, const char *objname, u32 *out_sid, bool kern) { struct selinux_policy *policy; struct policydb *policydb; struct sidtab *sidtab; struct class_datum *cladatum; struct context *scontext, *tcontext, newcontext; struct sidtab_entry *sentry, *tentry; struct avtab_key avkey; struct avtab_node *avnode, *node; u16 tclass; int rc = 0; bool sock; if (!selinux_initialized()) { switch (orig_tclass) { case SECCLASS_PROCESS: /* kernel value */ *out_sid = ssid; break; default: *out_sid = tsid; break; } goto out; } retry: cladatum = NULL; context_init(&newcontext); rcu_read_lock(); policy = rcu_dereference(selinux_state.policy); if (kern) { tclass = unmap_class(&policy->map, orig_tclass); sock = security_is_socket_class(orig_tclass); } else { tclass = orig_tclass; sock = security_is_socket_class(map_class(&policy->map, tclass)); } policydb = &policy->policydb; sidtab = policy->sidtab; sentry = sidtab_search_entry(sidtab, ssid); if (!sentry) { pr_err("SELinux: %s: unrecognized SID %d\n", __func__, ssid); rc = -EINVAL; goto out_unlock; } tentry = sidtab_search_entry(sidtab, tsid); if (!tentry) { pr_err("SELinux: %s: unrecognized SID %d\n", __func__, tsid); rc = -EINVAL; goto out_unlock; } scontext = &sentry->context; tcontext = &tentry->context; if (tclass && tclass <= policydb->p_classes.nprim) cladatum = policydb->class_val_to_struct[tclass - 1]; /* Set the user identity. */ switch (specified) { case AVTAB_TRANSITION: case AVTAB_CHANGE: if (cladatum && cladatum->default_user == DEFAULT_TARGET) { newcontext.user = tcontext->user; } else { /* notice this gets both DEFAULT_SOURCE and unset */ /* Use the process user identity. */ newcontext.user = scontext->user; } break; case AVTAB_MEMBER: /* Use the related object owner. */ newcontext.user = tcontext->user; break; } /* Set the role to default values. */ if (cladatum && cladatum->default_role == DEFAULT_SOURCE) { newcontext.role = scontext->role; } else if (cladatum && cladatum->default_role == DEFAULT_TARGET) { newcontext.role = tcontext->role; } else { if ((tclass == policydb->process_class) || sock) newcontext.role = scontext->role; else newcontext.role = OBJECT_R_VAL; } /* Set the type. * Look for a type transition/member/change rule. */ avkey.source_type = scontext->type; avkey.target_type = tcontext->type; avkey.target_class = tclass; avkey.specified = specified; avnode = avtab_search_node(&policydb->te_avtab, &avkey); /* If no permanent rule, also check for enabled conditional rules */ if (!avnode) { node = avtab_search_node(&policydb->te_cond_avtab, &avkey); for (; node; node = avtab_search_node_next(node, specified)) { if (node->key.specified & AVTAB_ENABLED) { avnode = node; break; } } } /* If a permanent rule is found, use the type from * the type transition/member/change rule. Otherwise, * set the type to its default values. 
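 * (Precedence: an unconditional avtab rule wins over an enabled
 * conditional one; failing both, the class's default_type of source or
 * target is used, and otherwise process/socket classes take the source
 * type while all other classes take the target type.)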
*/ if (avnode) { newcontext.type = avnode->datum.u.data; } else if (cladatum && cladatum->default_type == DEFAULT_SOURCE) { newcontext.type = scontext->type; } else if (cladatum && cladatum->default_type == DEFAULT_TARGET) { newcontext.type = tcontext->type; } else { if ((tclass == policydb->process_class) || sock) { /* Use the type of process. */ newcontext.type = scontext->type; } else { /* Use the type of the related object. */ newcontext.type = tcontext->type; } } /* if we have a objname this is a file trans check so check those rules */ if (objname) filename_compute_type(policydb, &newcontext, scontext->type, tcontext->type, tclass, objname); /* Check for class-specific changes. */ if (specified & AVTAB_TRANSITION) { /* Look for a role transition rule. */ struct role_trans_datum *rtd; struct role_trans_key rtk = { .role = scontext->role, .type = tcontext->type, .tclass = tclass, }; rtd = policydb_roletr_search(policydb, &rtk); if (rtd) newcontext.role = rtd->new_role; } /* Set the MLS attributes. This is done last because it may allocate memory. */ rc = mls_compute_sid(policydb, scontext, tcontext, tclass, specified, &newcontext, sock); if (rc) goto out_unlock; /* Check the validity of the context. */ if (!policydb_context_isvalid(policydb, &newcontext)) { rc = compute_sid_handle_invalid_context(policy, sentry, tentry, tclass, &newcontext); if (rc) goto out_unlock; } /* Obtain the sid for the context. */ rc = sidtab_context_to_sid(sidtab, &newcontext, out_sid); if (rc == -ESTALE) { rcu_read_unlock(); context_destroy(&newcontext); goto retry; } out_unlock: rcu_read_unlock(); context_destroy(&newcontext); out: return rc; } /** * security_transition_sid - Compute the SID for a new subject/object. * @ssid: source security identifier * @tsid: target security identifier * @tclass: target security class * @qstr: object name * @out_sid: security identifier for new subject/object * * Compute a SID to use for labeling a new subject or object in the * class @tclass based on a SID pair (@ssid, @tsid). * Return -%EINVAL if any of the parameters are invalid, -%ENOMEM * if insufficient memory is available, or %0 if the new SID was * computed successfully. */ int security_transition_sid(u32 ssid, u32 tsid, u16 tclass, const struct qstr *qstr, u32 *out_sid) { return security_compute_sid(ssid, tsid, tclass, AVTAB_TRANSITION, qstr ? qstr->name : NULL, out_sid, true); } int security_transition_sid_user(u32 ssid, u32 tsid, u16 tclass, const char *objname, u32 *out_sid) { return security_compute_sid(ssid, tsid, tclass, AVTAB_TRANSITION, objname, out_sid, false); } /** * security_member_sid - Compute the SID for member selection. * @ssid: source security identifier * @tsid: target security identifier * @tclass: target security class * @out_sid: security identifier for selected member * * Compute a SID to use when selecting a member of a polyinstantiated * object of class @tclass based on a SID pair (@ssid, @tsid). * Return -%EINVAL if any of the parameters are invalid, -%ENOMEM * if insufficient memory is available, or %0 if the SID was * computed successfully. */ int security_member_sid(u32 ssid, u32 tsid, u16 tclass, u32 *out_sid) { return security_compute_sid(ssid, tsid, tclass, AVTAB_MEMBER, NULL, out_sid, false); } /** * security_change_sid - Compute the SID for object relabeling. 
* @ssid: source security identifier * @tsid: target security identifier * @tclass: target security class * @out_sid: security identifier for selected member * * Compute a SID to use for relabeling an object of class @tclass * based on a SID pair (@ssid, @tsid). * Return -%EINVAL if any of the parameters are invalid, -%ENOMEM * if insufficient memory is available, or %0 if the SID was * computed successfully. */ int security_change_sid(u32 ssid, u32 tsid, u16 tclass, u32 *out_sid) { return security_compute_sid(ssid, tsid, tclass, AVTAB_CHANGE, NULL, out_sid, false); } static inline int convert_context_handle_invalid_context( struct policydb *policydb, struct context *context) { char *s; u32 len; if (enforcing_enabled()) return -EINVAL; if (!context_struct_to_string(policydb, context, &s, &len)) { pr_warn("SELinux: Context %s would be invalid if enforcing\n", s); kfree(s); } return 0; } /** * services_convert_context - Convert a security context across policies. * @args: populated convert_context_args struct * @oldc: original context * @newc: converted context * @gfp_flags: allocation flags * * Convert the values in the security context structure @oldc from the values * specified in the policy @args->oldp to the values specified in the policy * @args->newp, storing the new context in @newc, and verifying that the * context is valid under the new policy. */ int services_convert_context(struct convert_context_args *args, struct context *oldc, struct context *newc, gfp_t gfp_flags) { struct ocontext *oc; struct role_datum *role; struct type_datum *typdatum; struct user_datum *usrdatum; char *s; u32 len; int rc; if (oldc->str) { s = kstrdup(oldc->str, gfp_flags); if (!s) return -ENOMEM; rc = string_to_context_struct(args->newp, NULL, s, newc, SECSID_NULL); if (rc == -EINVAL) { /* * Retain string representation for later mapping. * * IMPORTANT: We need to copy the contents of oldc->str * back into s again because string_to_context_struct() * may have garbled it. */ memcpy(s, oldc->str, oldc->len); context_init(newc); newc->str = s; newc->len = oldc->len; return 0; } kfree(s); if (rc) { /* Other error condition, e.g. ENOMEM. */ pr_err("SELinux: Unable to map context %s, rc = %d.\n", oldc->str, -rc); return rc; } pr_info("SELinux: Context %s became valid (mapped).\n", oldc->str); return 0; } context_init(newc); /* Convert the user. */ usrdatum = symtab_search(&args->newp->p_users, sym_name(args->oldp, SYM_USERS, oldc->user - 1)); if (!usrdatum) goto bad; newc->user = usrdatum->value; /* Convert the role. */ role = symtab_search(&args->newp->p_roles, sym_name(args->oldp, SYM_ROLES, oldc->role - 1)); if (!role) goto bad; newc->role = role->value; /* Convert the type. */ typdatum = symtab_search(&args->newp->p_types, sym_name(args->oldp, SYM_TYPES, oldc->type - 1)); if (!typdatum) goto bad; newc->type = typdatum->value; /* Convert the MLS fields if dealing with MLS policies */ if (args->oldp->mls_enabled && args->newp->mls_enabled) { rc = mls_convert_context(args->oldp, args->newp, oldc, newc); if (rc) goto bad; } else if (!args->oldp->mls_enabled && args->newp->mls_enabled) { /* * Switching between non-MLS and MLS policy: * ensure that the MLS fields of the context for all * existing entries in the sidtab are filled in with a * suitable default value, likely taken from one of the * initial SIDs. 
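 * The code below takes that default from the initial-SID entry for
 * SECINITSID_UNLABELED in the new policy.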
*/ oc = args->newp->ocontexts[OCON_ISID]; while (oc && oc->sid[0] != SECINITSID_UNLABELED) oc = oc->next; if (!oc) { pr_err("SELinux: unable to look up" " the initial SIDs list\n"); goto bad; } rc = mls_range_set(newc, &oc->context[0].range); if (rc) goto bad; } /* Check the validity of the new context. */ if (!policydb_context_isvalid(args->newp, newc)) { rc = convert_context_handle_invalid_context(args->oldp, oldc); if (rc) goto bad; } return 0; bad: /* Map old representation to string and save it. */ rc = context_struct_to_string(args->oldp, oldc, &s, &len); if (rc) return rc; context_destroy(newc); newc->str = s; newc->len = len; pr_info("SELinux: Context %s became invalid (unmapped).\n", newc->str); return 0; } static void security_load_policycaps(struct selinux_policy *policy) { struct policydb *p; unsigned int i; struct ebitmap_node *node; p = &policy->policydb; for (i = 0; i < ARRAY_SIZE(selinux_state.policycap); i++) WRITE_ONCE(selinux_state.policycap[i], ebitmap_get_bit(&p->policycaps, i)); for (i = 0; i < ARRAY_SIZE(selinux_policycap_names); i++) pr_info("SELinux: policy capability %s=%d\n", selinux_policycap_names[i], ebitmap_get_bit(&p->policycaps, i)); ebitmap_for_each_positive_bit(&p->policycaps, node, i) { if (i >= ARRAY_SIZE(selinux_policycap_names)) pr_info("SELinux: unknown policy capability %u\n", i); } } static int security_preserve_bools(struct selinux_policy *oldpolicy, struct selinux_policy *newpolicy); static void selinux_policy_free(struct selinux_policy *policy) { if (!policy) return; sidtab_destroy(policy->sidtab); kfree(policy->map.mapping); policydb_destroy(&policy->policydb); kfree(policy->sidtab); kfree(policy); } static void selinux_policy_cond_free(struct selinux_policy *policy) { cond_policydb_destroy_dup(&policy->policydb); kfree(policy); } void selinux_policy_cancel(struct selinux_load_state *load_state) { struct selinux_state *state = &selinux_state; struct selinux_policy *oldpolicy; oldpolicy = rcu_dereference_protected(state->policy, lockdep_is_held(&state->policy_mutex)); sidtab_cancel_convert(oldpolicy->sidtab); selinux_policy_free(load_state->policy); kfree(load_state->convert_data); } static void selinux_notify_policy_change(u32 seqno) { /* Flush external caches and notify userspace of policy load */ avc_ss_reset(seqno); selnl_notify_policyload(seqno); selinux_status_update_policyload(seqno); selinux_netlbl_cache_invalidate(); selinux_xfrm_notify_policyload(); selinux_ima_measure_state_locked(); } void selinux_policy_commit(struct selinux_load_state *load_state) { struct selinux_state *state = &selinux_state; struct selinux_policy *oldpolicy, *newpolicy = load_state->policy; unsigned long flags; u32 seqno; oldpolicy = rcu_dereference_protected(state->policy, lockdep_is_held(&state->policy_mutex)); /* If switching between different policy types, log MLS status */ if (oldpolicy) { if (oldpolicy->policydb.mls_enabled && !newpolicy->policydb.mls_enabled) pr_info("SELinux: Disabling MLS support...\n"); else if (!oldpolicy->policydb.mls_enabled && newpolicy->policydb.mls_enabled) pr_info("SELinux: Enabling MLS support...\n"); } /* Set latest granting seqno for new policy. */ if (oldpolicy) newpolicy->latest_granting = oldpolicy->latest_granting + 1; else newpolicy->latest_granting = 1; seqno = newpolicy->latest_granting; /* Install the new policy. 
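 * When an existing policy is being replaced, the outgoing sidtab is
 * frozen around the pointer switch so that no further entries are added
 * to it once readers can observe the new policy.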
*/ if (oldpolicy) { sidtab_freeze_begin(oldpolicy->sidtab, &flags); rcu_assign_pointer(state->policy, newpolicy); sidtab_freeze_end(oldpolicy->sidtab, &flags); } else { rcu_assign_pointer(state->policy, newpolicy); } /* Load the policycaps from the new policy */ security_load_policycaps(newpolicy); if (!selinux_initialized()) { /* * After first policy load, the security server is * marked as initialized and ready to handle requests and * any objects created prior to policy load are then labeled. */ selinux_mark_initialized(); selinux_complete_init(); } /* Free the old policy */ synchronize_rcu(); selinux_policy_free(oldpolicy); kfree(load_state->convert_data); /* Notify others of the policy change */ selinux_notify_policy_change(seqno); } /** * security_load_policy - Load a security policy configuration. * @data: binary policy data * @len: length of data in bytes * @load_state: policy load state * * Load a new set of security policy configuration data, * validate it and convert the SID table as necessary. * This function will flush the access vector cache after * loading the new policy. */ int security_load_policy(void *data, size_t len, struct selinux_load_state *load_state) { struct selinux_state *state = &selinux_state; struct selinux_policy *newpolicy, *oldpolicy; struct selinux_policy_convert_data *convert_data; int rc = 0; struct policy_file file = { data, len }, *fp = &file; newpolicy = kzalloc(sizeof(*newpolicy), GFP_KERNEL); if (!newpolicy) return -ENOMEM; newpolicy->sidtab = kzalloc(sizeof(*newpolicy->sidtab), GFP_KERNEL); if (!newpolicy->sidtab) { rc = -ENOMEM; goto err_policy; } rc = policydb_read(&newpolicy->policydb, fp); if (rc) goto err_sidtab; newpolicy->policydb.len = len; rc = selinux_set_mapping(&newpolicy->policydb, secclass_map, &newpolicy->map); if (rc) goto err_policydb; rc = policydb_load_isids(&newpolicy->policydb, newpolicy->sidtab); if (rc) { pr_err("SELinux: unable to load the initial SIDs\n"); goto err_mapping; } if (!selinux_initialized()) { /* First policy load, so no need to preserve state from old policy */ load_state->policy = newpolicy; load_state->convert_data = NULL; return 0; } oldpolicy = rcu_dereference_protected(state->policy, lockdep_is_held(&state->policy_mutex)); /* Preserve active boolean values from the old policy */ rc = security_preserve_bools(oldpolicy, newpolicy); if (rc) { pr_err("SELinux: unable to preserve booleans\n"); goto err_free_isids; } /* * Convert the internal representations of contexts * in the new SID table. 
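 * convert_data bundles the old and new policydb pointers for
 * sidtab_convert(); it is freed later by selinux_policy_commit() or
 * selinux_policy_cancel().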
*/ convert_data = kmalloc(sizeof(*convert_data), GFP_KERNEL); if (!convert_data) { rc = -ENOMEM; goto err_free_isids; } convert_data->args.oldp = &oldpolicy->policydb; convert_data->args.newp = &newpolicy->policydb; convert_data->sidtab_params.args = &convert_data->args; convert_data->sidtab_params.target = newpolicy->sidtab; rc = sidtab_convert(oldpolicy->sidtab, &convert_data->sidtab_params); if (rc) { pr_err("SELinux: unable to convert the internal" " representation of contexts in the new SID" " table\n"); goto err_free_convert_data; } load_state->policy = newpolicy; load_state->convert_data = convert_data; return 0; err_free_convert_data: kfree(convert_data); err_free_isids: sidtab_destroy(newpolicy->sidtab); err_mapping: kfree(newpolicy->map.mapping); err_policydb: policydb_destroy(&newpolicy->policydb); err_sidtab: kfree(newpolicy->sidtab); err_policy: kfree(newpolicy); return rc; } /** * ocontext_to_sid - Helper to safely get sid for an ocontext * @sidtab: SID table * @c: ocontext structure * @index: index of the context entry (0 or 1) * @out_sid: pointer to the resulting SID value * * For all ocontexts except OCON_ISID the SID fields are populated * on-demand when needed. Since updating the SID value is an SMP-sensitive * operation, this helper must be used to do that safely. * * WARNING: This function may return -ESTALE, indicating that the caller * must retry the operation after re-acquiring the policy pointer! */ static int ocontext_to_sid(struct sidtab *sidtab, struct ocontext *c, size_t index, u32 *out_sid) { int rc; u32 sid; /* Ensure the associated sidtab entry is visible to this thread. */ sid = smp_load_acquire(&c->sid[index]); if (!sid) { rc = sidtab_context_to_sid(sidtab, &c->context[index], &sid); if (rc) return rc; /* * Ensure the new sidtab entry is visible to other threads * when they see the SID. */ smp_store_release(&c->sid[index], sid); } *out_sid = sid; return 0; } /** * security_port_sid - Obtain the SID for a port. * @protocol: protocol number * @port: port number * @out_sid: security identifier */ int security_port_sid(u8 protocol, u16 port, u32 *out_sid) { struct selinux_policy *policy; struct policydb *policydb; struct sidtab *sidtab; struct ocontext *c; int rc; if (!selinux_initialized()) { *out_sid = SECINITSID_PORT; return 0; } retry: rc = 0; rcu_read_lock(); policy = rcu_dereference(selinux_state.policy); policydb = &policy->policydb; sidtab = policy->sidtab; c = policydb->ocontexts[OCON_PORT]; while (c) { if (c->u.port.protocol == protocol && c->u.port.low_port <= port && c->u.port.high_port >= port) break; c = c->next; } if (c) { rc = ocontext_to_sid(sidtab, c, 0, out_sid); if (rc == -ESTALE) { rcu_read_unlock(); goto retry; } if (rc) goto out; } else { *out_sid = SECINITSID_PORT; } out: rcu_read_unlock(); return rc; } /** * security_ib_pkey_sid - Obtain the SID for a pkey. 
* @subnet_prefix: Subnet Prefix * @pkey_num: pkey number * @out_sid: security identifier */ int security_ib_pkey_sid(u64 subnet_prefix, u16 pkey_num, u32 *out_sid) { struct selinux_policy *policy; struct policydb *policydb; struct sidtab *sidtab; struct ocontext *c; int rc; if (!selinux_initialized()) { *out_sid = SECINITSID_UNLABELED; return 0; } retry: rc = 0; rcu_read_lock(); policy = rcu_dereference(selinux_state.policy); policydb = &policy->policydb; sidtab = policy->sidtab; c = policydb->ocontexts[OCON_IBPKEY]; while (c) { if (c->u.ibpkey.low_pkey <= pkey_num && c->u.ibpkey.high_pkey >= pkey_num && c->u.ibpkey.subnet_prefix == subnet_prefix) break; c = c->next; } if (c) { rc = ocontext_to_sid(sidtab, c, 0, out_sid); if (rc == -ESTALE) { rcu_read_unlock(); goto retry; } if (rc) goto out; } else *out_sid = SECINITSID_UNLABELED; out: rcu_read_unlock(); return rc; } /** * security_ib_endport_sid - Obtain the SID for a subnet management interface. * @dev_name: device name * @port_num: port number * @out_sid: security identifier */ int security_ib_endport_sid(const char *dev_name, u8 port_num, u32 *out_sid) { struct selinux_policy *policy; struct policydb *policydb; struct sidtab *sidtab; struct ocontext *c; int rc; if (!selinux_initialized()) { *out_sid = SECINITSID_UNLABELED; return 0; } retry: rc = 0; rcu_read_lock(); policy = rcu_dereference(selinux_state.policy); policydb = &policy->policydb; sidtab = policy->sidtab; c = policydb->ocontexts[OCON_IBENDPORT]; while (c) { if (c->u.ibendport.port == port_num && !strncmp(c->u.ibendport.dev_name, dev_name, IB_DEVICE_NAME_MAX)) break; c = c->next; } if (c) { rc = ocontext_to_sid(sidtab, c, 0, out_sid); if (rc == -ESTALE) { rcu_read_unlock(); goto retry; } if (rc) goto out; } else *out_sid = SECINITSID_UNLABELED; out: rcu_read_unlock(); return rc; } /** * security_netif_sid - Obtain the SID for a network interface. * @name: interface name * @if_sid: interface SID */ int security_netif_sid(char *name, u32 *if_sid) { struct selinux_policy *policy; struct policydb *policydb; struct sidtab *sidtab; int rc; struct ocontext *c; if (!selinux_initialized()) { *if_sid = SECINITSID_NETIF; return 0; } retry: rc = 0; rcu_read_lock(); policy = rcu_dereference(selinux_state.policy); policydb = &policy->policydb; sidtab = policy->sidtab; c = policydb->ocontexts[OCON_NETIF]; while (c) { if (strcmp(name, c->u.name) == 0) break; c = c->next; } if (c) { rc = ocontext_to_sid(sidtab, c, 0, if_sid); if (rc == -ESTALE) { rcu_read_unlock(); goto retry; } if (rc) goto out; } else *if_sid = SECINITSID_NETIF; out: rcu_read_unlock(); return rc; } static bool match_ipv6_addrmask(const u32 input[4], const u32 addr[4], const u32 mask[4]) { int i; for (i = 0; i < 4; i++) if (addr[i] != (input[i] & mask[i])) return false; return true; } /** * security_node_sid - Obtain the SID for a node (host). 
* @domain: communication domain aka address family * @addrp: address * @addrlen: address length in bytes * @out_sid: security identifier */ int security_node_sid(u16 domain, void *addrp, u32 addrlen, u32 *out_sid) { struct selinux_policy *policy; struct policydb *policydb; struct sidtab *sidtab; int rc; struct ocontext *c; if (!selinux_initialized()) { *out_sid = SECINITSID_NODE; return 0; } retry: rcu_read_lock(); policy = rcu_dereference(selinux_state.policy); policydb = &policy->policydb; sidtab = policy->sidtab; switch (domain) { case AF_INET: { u32 addr; rc = -EINVAL; if (addrlen != sizeof(u32)) goto out; addr = *((u32 *)addrp); c = policydb->ocontexts[OCON_NODE]; while (c) { if (c->u.node.addr == (addr & c->u.node.mask)) break; c = c->next; } break; } case AF_INET6: rc = -EINVAL; if (addrlen != sizeof(u64) * 2) goto out; c = policydb->ocontexts[OCON_NODE6]; while (c) { if (match_ipv6_addrmask(addrp, c->u.node6.addr, c->u.node6.mask)) break; c = c->next; } break; default: rc = 0; *out_sid = SECINITSID_NODE; goto out; } if (c) { rc = ocontext_to_sid(sidtab, c, 0, out_sid); if (rc == -ESTALE) { rcu_read_unlock(); goto retry; } if (rc) goto out; } else { *out_sid = SECINITSID_NODE; } rc = 0; out: rcu_read_unlock(); return rc; } #define SIDS_NEL 25 /** * security_get_user_sids - Obtain reachable SIDs for a user. * @fromsid: starting SID * @username: username * @sids: array of reachable SIDs for user * @nel: number of elements in @sids * * Generate the set of SIDs for legal security contexts * for a given user that can be reached by @fromsid. * Set *@sids to point to a dynamically allocated * array containing the set of SIDs. Set *@nel to the * number of elements in the array. */ int security_get_user_sids(u32 fromsid, const char *username, u32 **sids, u32 *nel) { struct selinux_policy *policy; struct policydb *policydb; struct sidtab *sidtab; struct context *fromcon, usercon; u32 *mysids = NULL, *mysids2, sid; u32 i, j, mynel, maxnel = SIDS_NEL; struct user_datum *user; struct role_datum *role; struct ebitmap_node *rnode, *tnode; int rc; *sids = NULL; *nel = 0; if (!selinux_initialized()) return 0; mysids = kcalloc(maxnel, sizeof(*mysids), GFP_KERNEL); if (!mysids) return -ENOMEM; retry: mynel = 0; rcu_read_lock(); policy = rcu_dereference(selinux_state.policy); policydb = &policy->policydb; sidtab = policy->sidtab; context_init(&usercon); rc = -EINVAL; fromcon = sidtab_search(sidtab, fromsid); if (!fromcon) goto out_unlock; rc = -EINVAL; user = symtab_search(&policydb->p_users, username); if (!user) goto out_unlock; usercon.user = user->value; ebitmap_for_each_positive_bit(&user->roles, rnode, i) { role = policydb->role_val_to_struct[i]; usercon.role = i + 1; ebitmap_for_each_positive_bit(&role->types, tnode, j) { usercon.type = j + 1; if (mls_setup_user_range(policydb, fromcon, user, &usercon)) continue; rc = sidtab_context_to_sid(sidtab, &usercon, &sid); if (rc == -ESTALE) { rcu_read_unlock(); goto retry; } if (rc) goto out_unlock; if (mynel < maxnel) { mysids[mynel++] = sid; } else { rc = -ENOMEM; maxnel += SIDS_NEL; mysids2 = kcalloc(maxnel, sizeof(*mysids2), GFP_ATOMIC); if (!mysids2) goto out_unlock; memcpy(mysids2, mysids, mynel * sizeof(*mysids2)); kfree(mysids); mysids = mysids2; mysids[mynel++] = sid; } } } rc = 0; out_unlock: rcu_read_unlock(); if (rc || !mynel) { kfree(mysids); return rc; } rc = -ENOMEM; mysids2 = kcalloc(mynel, sizeof(*mysids2), GFP_KERNEL); if (!mysids2) { kfree(mysids); return rc; } for (i = 0, j = 0; i < mynel; i++) { struct av_decision dummy_avd; rc = 
avc_has_perm_noaudit(fromsid, mysids[i], SECCLASS_PROCESS, /* kernel value */ PROCESS__TRANSITION, AVC_STRICT, &dummy_avd); if (!rc) mysids2[j++] = mysids[i]; cond_resched(); } kfree(mysids); *sids = mysids2; *nel = j; return 0; } /** * __security_genfs_sid - Helper to obtain a SID for a file in a filesystem * @policy: policy * @fstype: filesystem type * @path: path from root of mount * @orig_sclass: file security class * @sid: SID for path * * Obtain a SID to use for a file in a filesystem that * cannot support xattr or use a fixed labeling behavior like * transition SIDs or task SIDs. * * WARNING: This function may return -ESTALE, indicating that the caller * must retry the operation after re-acquiring the policy pointer! */ static inline int __security_genfs_sid(struct selinux_policy *policy, const char *fstype, const char *path, u16 orig_sclass, u32 *sid) { struct policydb *policydb = &policy->policydb; struct sidtab *sidtab = policy->sidtab; u16 sclass; struct genfs *genfs; struct ocontext *c; int cmp = 0; while (path[0] == '/' && path[1] == '/') path++; sclass = unmap_class(&policy->map, orig_sclass); *sid = SECINITSID_UNLABELED; for (genfs = policydb->genfs; genfs; genfs = genfs->next) { cmp = strcmp(fstype, genfs->fstype); if (cmp <= 0) break; } if (!genfs || cmp) return -ENOENT; for (c = genfs->head; c; c = c->next) { size_t len = strlen(c->u.name); if ((!c->v.sclass || sclass == c->v.sclass) && (strncmp(c->u.name, path, len) == 0)) break; } if (!c) return -ENOENT; return ocontext_to_sid(sidtab, c, 0, sid); } /** * security_genfs_sid - Obtain a SID for a file in a filesystem * @fstype: filesystem type * @path: path from root of mount * @orig_sclass: file security class * @sid: SID for path * * Acquire policy_rwlock before calling __security_genfs_sid() and release * it afterward. */ int security_genfs_sid(const char *fstype, const char *path, u16 orig_sclass, u32 *sid) { struct selinux_policy *policy; int retval; if (!selinux_initialized()) { *sid = SECINITSID_UNLABELED; return 0; } do { rcu_read_lock(); policy = rcu_dereference(selinux_state.policy); retval = __security_genfs_sid(policy, fstype, path, orig_sclass, sid); rcu_read_unlock(); } while (retval == -ESTALE); return retval; } int selinux_policy_genfs_sid(struct selinux_policy *policy, const char *fstype, const char *path, u16 orig_sclass, u32 *sid) { /* no lock required, policy is not yet accessible by other threads */ return __security_genfs_sid(policy, fstype, path, orig_sclass, sid); } /** * security_fs_use - Determine how to handle labeling for a filesystem. 
* @sb: superblock in question */ int security_fs_use(struct super_block *sb) { struct selinux_policy *policy; struct policydb *policydb; struct sidtab *sidtab; int rc; struct ocontext *c; struct superblock_security_struct *sbsec = selinux_superblock(sb); const char *fstype = sb->s_type->name; if (!selinux_initialized()) { sbsec->behavior = SECURITY_FS_USE_NONE; sbsec->sid = SECINITSID_UNLABELED; return 0; } retry: rcu_read_lock(); policy = rcu_dereference(selinux_state.policy); policydb = &policy->policydb; sidtab = policy->sidtab; c = policydb->ocontexts[OCON_FSUSE]; while (c) { if (strcmp(fstype, c->u.name) == 0) break; c = c->next; } if (c) { sbsec->behavior = c->v.behavior; rc = ocontext_to_sid(sidtab, c, 0, &sbsec->sid); if (rc == -ESTALE) { rcu_read_unlock(); goto retry; } if (rc) goto out; } else { rc = __security_genfs_sid(policy, fstype, "/", SECCLASS_DIR, &sbsec->sid); if (rc == -ESTALE) { rcu_read_unlock(); goto retry; } if (rc) { sbsec->behavior = SECURITY_FS_USE_NONE; rc = 0; } else { sbsec->behavior = SECURITY_FS_USE_GENFS; } } out: rcu_read_unlock(); return rc; } int security_get_bools(struct selinux_policy *policy, u32 *len, char ***names, int **values) { struct policydb *policydb; u32 i; int rc; policydb = &policy->policydb; *names = NULL; *values = NULL; rc = 0; *len = policydb->p_bools.nprim; if (!*len) goto out; rc = -ENOMEM; *names = kcalloc(*len, sizeof(char *), GFP_ATOMIC); if (!*names) goto err; rc = -ENOMEM; *values = kcalloc(*len, sizeof(int), GFP_ATOMIC); if (!*values) goto err; for (i = 0; i < *len; i++) { (*values)[i] = policydb->bool_val_to_struct[i]->state; rc = -ENOMEM; (*names)[i] = kstrdup(sym_name(policydb, SYM_BOOLS, i), GFP_ATOMIC); if (!(*names)[i]) goto err; } rc = 0; out: return rc; err: if (*names) { for (i = 0; i < *len; i++) kfree((*names)[i]); kfree(*names); } kfree(*values); *len = 0; *names = NULL; *values = NULL; goto out; } int security_set_bools(u32 len, const int *values) { struct selinux_state *state = &selinux_state; struct selinux_policy *newpolicy, *oldpolicy; int rc; u32 i, seqno = 0; if (!selinux_initialized()) return -EINVAL; oldpolicy = rcu_dereference_protected(state->policy, lockdep_is_held(&state->policy_mutex)); /* Consistency check on number of booleans, should never fail */ if (WARN_ON(len != oldpolicy->policydb.p_bools.nprim)) return -EINVAL; newpolicy = kmemdup(oldpolicy, sizeof(*newpolicy), GFP_KERNEL); if (!newpolicy) return -ENOMEM; /* * Deep copy only the parts of the policydb that might be * modified as a result of changing booleans. 
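 * This is effectively a copy-on-write update: only the conditional
 * parts are duplicated, the copy is modified and re-evaluated, the new
 * policy pointer is published with rcu_assign_pointer(), and the old
 * conditional copy is freed once synchronize_rcu() returns.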
*/ rc = cond_policydb_dup(&newpolicy->policydb, &oldpolicy->policydb); if (rc) { kfree(newpolicy); return -ENOMEM; } /* Update the boolean states in the copy */ for (i = 0; i < len; i++) { int new_state = !!values[i]; int old_state = newpolicy->policydb.bool_val_to_struct[i]->state; if (new_state != old_state) { audit_log(audit_context(), GFP_ATOMIC, AUDIT_MAC_CONFIG_CHANGE, "bool=%s val=%d old_val=%d auid=%u ses=%u", sym_name(&newpolicy->policydb, SYM_BOOLS, i), new_state, old_state, from_kuid(&init_user_ns, audit_get_loginuid(current)), audit_get_sessionid(current)); newpolicy->policydb.bool_val_to_struct[i]->state = new_state; } } /* Re-evaluate the conditional rules in the copy */ evaluate_cond_nodes(&newpolicy->policydb); /* Set latest granting seqno for new policy */ newpolicy->latest_granting = oldpolicy->latest_granting + 1; seqno = newpolicy->latest_granting; /* Install the new policy */ rcu_assign_pointer(state->policy, newpolicy); /* * Free the conditional portions of the old policydb * that were copied for the new policy, and the oldpolicy * structure itself but not what it references. */ synchronize_rcu(); selinux_policy_cond_free(oldpolicy); /* Notify others of the policy change */ selinux_notify_policy_change(seqno); return 0; } int security_get_bool_value(u32 index) { struct selinux_policy *policy; struct policydb *policydb; int rc; u32 len; if (!selinux_initialized()) return 0; rcu_read_lock(); policy = rcu_dereference(selinux_state.policy); policydb = &policy->policydb; rc = -EFAULT; len = policydb->p_bools.nprim; if (index >= len) goto out; rc = policydb->bool_val_to_struct[index]->state; out: rcu_read_unlock(); return rc; } static int security_preserve_bools(struct selinux_policy *oldpolicy, struct selinux_policy *newpolicy) { int rc, *bvalues = NULL; char **bnames = NULL; struct cond_bool_datum *booldatum; u32 i, nbools = 0; rc = security_get_bools(oldpolicy, &nbools, &bnames, &bvalues); if (rc) goto out; for (i = 0; i < nbools; i++) { booldatum = symtab_search(&newpolicy->policydb.p_bools, bnames[i]); if (booldatum) booldatum->state = bvalues[i]; } evaluate_cond_nodes(&newpolicy->policydb); out: if (bnames) { for (i = 0; i < nbools; i++) kfree(bnames[i]); } kfree(bnames); kfree(bvalues); return rc; } /* * security_sid_mls_copy() - computes a new sid based on the given * sid and the mls portion of mls_sid. */ int security_sid_mls_copy(u32 sid, u32 mls_sid, u32 *new_sid) { struct selinux_policy *policy; struct policydb *policydb; struct sidtab *sidtab; struct context *context1; struct context *context2; struct context newcon; char *s; u32 len; int rc; if (!selinux_initialized()) { *new_sid = sid; return 0; } retry: rc = 0; context_init(&newcon); rcu_read_lock(); policy = rcu_dereference(selinux_state.policy); policydb = &policy->policydb; sidtab = policy->sidtab; if (!policydb->mls_enabled) { *new_sid = sid; goto out_unlock; } rc = -EINVAL; context1 = sidtab_search(sidtab, sid); if (!context1) { pr_err("SELinux: %s: unrecognized SID %d\n", __func__, sid); goto out_unlock; } rc = -EINVAL; context2 = sidtab_search(sidtab, mls_sid); if (!context2) { pr_err("SELinux: %s: unrecognized SID %d\n", __func__, mls_sid); goto out_unlock; } newcon.user = context1->user; newcon.role = context1->role; newcon.type = context1->type; rc = mls_context_cpy(&newcon, context2); if (rc) goto out_unlock; /* Check the validity of the new context. 
*/ if (!policydb_context_isvalid(policydb, &newcon)) { rc = convert_context_handle_invalid_context(policydb, &newcon); if (rc) { if (!context_struct_to_string(policydb, &newcon, &s, &len)) { struct audit_buffer *ab; ab = audit_log_start(audit_context(), GFP_ATOMIC, AUDIT_SELINUX_ERR); audit_log_format(ab, "op=security_sid_mls_copy invalid_context="); /* don't record NUL with untrusted strings */ audit_log_n_untrustedstring(ab, s, len - 1); audit_log_end(ab); kfree(s); } goto out_unlock; } } rc = sidtab_context_to_sid(sidtab, &newcon, new_sid); if (rc == -ESTALE) { rcu_read_unlock(); context_destroy(&newcon); goto retry; } out_unlock: rcu_read_unlock(); context_destroy(&newcon); return rc; } /** * security_net_peersid_resolve - Compare and resolve two network peer SIDs * @nlbl_sid: NetLabel SID * @nlbl_type: NetLabel labeling protocol type * @xfrm_sid: XFRM SID * @peer_sid: network peer sid * * Description: * Compare the @nlbl_sid and @xfrm_sid values and if the two SIDs can be * resolved into a single SID it is returned via @peer_sid and the function * returns zero. Otherwise @peer_sid is set to SECSID_NULL and the function * returns a negative value. A table summarizing the behavior is below: * * | function return | @sid * ------------------------------+-----------------+----------------- * no peer labels | 0 | SECSID_NULL * single peer label | 0 | <peer_label> * multiple, consistent labels | 0 | <peer_label> * multiple, inconsistent labels | -<errno> | SECSID_NULL * */ int security_net_peersid_resolve(u32 nlbl_sid, u32 nlbl_type, u32 xfrm_sid, u32 *peer_sid) { struct selinux_policy *policy; struct policydb *policydb; struct sidtab *sidtab; int rc; struct context *nlbl_ctx; struct context *xfrm_ctx; *peer_sid = SECSID_NULL; /* handle the common (which also happens to be the set of easy) cases * right away, these two if statements catch everything involving a * single or absent peer SID/label */ if (xfrm_sid == SECSID_NULL) { *peer_sid = nlbl_sid; return 0; } /* NOTE: an nlbl_type == NETLBL_NLTYPE_UNLABELED is a "fallback" label * and is treated as if nlbl_sid == SECSID_NULL when a XFRM SID/label * is present */ if (nlbl_sid == SECSID_NULL || nlbl_type == NETLBL_NLTYPE_UNLABELED) { *peer_sid = xfrm_sid; return 0; } if (!selinux_initialized()) return 0; rcu_read_lock(); policy = rcu_dereference(selinux_state.policy); policydb = &policy->policydb; sidtab = policy->sidtab; /* * We don't need to check initialized here since the only way both * nlbl_sid and xfrm_sid are not equal to SECSID_NULL would be if the * security server was initialized and state->initialized was true. */ if (!policydb->mls_enabled) { rc = 0; goto out; } rc = -EINVAL; nlbl_ctx = sidtab_search(sidtab, nlbl_sid); if (!nlbl_ctx) { pr_err("SELinux: %s: unrecognized SID %d\n", __func__, nlbl_sid); goto out; } rc = -EINVAL; xfrm_ctx = sidtab_search(sidtab, xfrm_sid); if (!xfrm_ctx) { pr_err("SELinux: %s: unrecognized SID %d\n", __func__, xfrm_sid); goto out; } rc = (mls_context_equal(nlbl_ctx, xfrm_ctx) ? 
0 : -EACCES); if (rc) goto out; /* at present NetLabel SIDs/labels really only carry MLS * information so if the MLS portion of the NetLabel SID * matches the MLS portion of the labeled XFRM SID/label * then pass along the XFRM SID as it is the most * expressive */ *peer_sid = xfrm_sid; out: rcu_read_unlock(); return rc; } static int get_classes_callback(void *k, void *d, void *args) { struct class_datum *datum = d; char *name = k, **classes = args; u32 value = datum->value - 1; classes[value] = kstrdup(name, GFP_ATOMIC); if (!classes[value]) return -ENOMEM; return 0; } int security_get_classes(struct selinux_policy *policy, char ***classes, u32 *nclasses) { struct policydb *policydb; int rc; policydb = &policy->policydb; rc = -ENOMEM; *nclasses = policydb->p_classes.nprim; *classes = kcalloc(*nclasses, sizeof(**classes), GFP_ATOMIC); if (!*classes) goto out; rc = hashtab_map(&policydb->p_classes.table, get_classes_callback, *classes); if (rc) { u32 i; for (i = 0; i < *nclasses; i++) kfree((*classes)[i]); kfree(*classes); } out: return rc; } static int get_permissions_callback(void *k, void *d, void *args) { struct perm_datum *datum = d; char *name = k, **perms = args; u32 value = datum->value - 1; perms[value] = kstrdup(name, GFP_ATOMIC); if (!perms[value]) return -ENOMEM; return 0; } int security_get_permissions(struct selinux_policy *policy, const char *class, char ***perms, u32 *nperms) { struct policydb *policydb; u32 i; int rc; struct class_datum *match; policydb = &policy->policydb; rc = -EINVAL; match = symtab_search(&policydb->p_classes, class); if (!match) { pr_err("SELinux: %s: unrecognized class %s\n", __func__, class); goto out; } rc = -ENOMEM; *nperms = match->permissions.nprim; *perms = kcalloc(*nperms, sizeof(**perms), GFP_ATOMIC); if (!*perms) goto out; if (match->comdatum) { rc = hashtab_map(&match->comdatum->permissions.table, get_permissions_callback, *perms); if (rc) goto err; } rc = hashtab_map(&match->permissions.table, get_permissions_callback, *perms); if (rc) goto err; out: return rc; err: for (i = 0; i < *nperms; i++) kfree((*perms)[i]); kfree(*perms); return rc; } int security_get_reject_unknown(void) { struct selinux_policy *policy; int value; if (!selinux_initialized()) return 0; rcu_read_lock(); policy = rcu_dereference(selinux_state.policy); value = policy->policydb.reject_unknown; rcu_read_unlock(); return value; } int security_get_allow_unknown(void) { struct selinux_policy *policy; int value; if (!selinux_initialized()) return 0; rcu_read_lock(); policy = rcu_dereference(selinux_state.policy); value = policy->policydb.allow_unknown; rcu_read_unlock(); return value; } /** * security_policycap_supported - Check for a specific policy capability * @req_cap: capability * * Description: * This function queries the currently loaded policy to see if it supports the * capability specified by @req_cap. Returns true (1) if the capability is * supported, false (0) if it isn't supported. 
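 * @req_cap is normally one of the POLICYDB_CAP_* capability indices
 * corresponding to the names in selinux_policycap_names[].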
* */ int security_policycap_supported(unsigned int req_cap) { struct selinux_policy *policy; int rc; if (!selinux_initialized()) return 0; rcu_read_lock(); policy = rcu_dereference(selinux_state.policy); rc = ebitmap_get_bit(&policy->policydb.policycaps, req_cap); rcu_read_unlock(); return rc; } struct selinux_audit_rule { u32 au_seqno; struct context au_ctxt; }; void selinux_audit_rule_free(void *vrule) { struct selinux_audit_rule *rule = vrule; if (rule) { context_destroy(&rule->au_ctxt); kfree(rule); } } int selinux_audit_rule_init(u32 field, u32 op, char *rulestr, void **vrule, gfp_t gfp) { struct selinux_state *state = &selinux_state; struct selinux_policy *policy; struct policydb *policydb; struct selinux_audit_rule *tmprule; struct role_datum *roledatum; struct type_datum *typedatum; struct user_datum *userdatum; struct selinux_audit_rule **rule = (struct selinux_audit_rule **)vrule; int rc = 0; *rule = NULL; if (!selinux_initialized()) return -EOPNOTSUPP; switch (field) { case AUDIT_SUBJ_USER: case AUDIT_SUBJ_ROLE: case AUDIT_SUBJ_TYPE: case AUDIT_OBJ_USER: case AUDIT_OBJ_ROLE: case AUDIT_OBJ_TYPE: /* only 'equals' and 'not equals' fit user, role, and type */ if (op != Audit_equal && op != Audit_not_equal) return -EINVAL; break; case AUDIT_SUBJ_SEN: case AUDIT_SUBJ_CLR: case AUDIT_OBJ_LEV_LOW: case AUDIT_OBJ_LEV_HIGH: /* we do not allow a range, indicated by the presence of '-' */ if (strchr(rulestr, '-')) return -EINVAL; break; default: /* only the above fields are valid */ return -EINVAL; } tmprule = kzalloc(sizeof(struct selinux_audit_rule), gfp); if (!tmprule) return -ENOMEM; context_init(&tmprule->au_ctxt); rcu_read_lock(); policy = rcu_dereference(state->policy); policydb = &policy->policydb; tmprule->au_seqno = policy->latest_granting; switch (field) { case AUDIT_SUBJ_USER: case AUDIT_OBJ_USER: userdatum = symtab_search(&policydb->p_users, rulestr); if (!userdatum) { rc = -EINVAL; goto err; } tmprule->au_ctxt.user = userdatum->value; break; case AUDIT_SUBJ_ROLE: case AUDIT_OBJ_ROLE: roledatum = symtab_search(&policydb->p_roles, rulestr); if (!roledatum) { rc = -EINVAL; goto err; } tmprule->au_ctxt.role = roledatum->value; break; case AUDIT_SUBJ_TYPE: case AUDIT_OBJ_TYPE: typedatum = symtab_search(&policydb->p_types, rulestr); if (!typedatum) { rc = -EINVAL; goto err; } tmprule->au_ctxt.type = typedatum->value; break; case AUDIT_SUBJ_SEN: case AUDIT_SUBJ_CLR: case AUDIT_OBJ_LEV_LOW: case AUDIT_OBJ_LEV_HIGH: rc = mls_from_string(policydb, rulestr, &tmprule->au_ctxt, GFP_ATOMIC); if (rc) goto err; break; } rcu_read_unlock(); *rule = tmprule; return 0; err: rcu_read_unlock(); selinux_audit_rule_free(tmprule); *rule = NULL; return rc; } /* Check to see if the rule contains any selinux fields */ int selinux_audit_rule_known(struct audit_krule *rule) { u32 i; for (i = 0; i < rule->field_count; i++) { struct audit_field *f = &rule->fields[i]; switch (f->type) { case AUDIT_SUBJ_USER: case AUDIT_SUBJ_ROLE: case AUDIT_SUBJ_TYPE: case AUDIT_SUBJ_SEN: case AUDIT_SUBJ_CLR: case AUDIT_OBJ_USER: case AUDIT_OBJ_ROLE: case AUDIT_OBJ_TYPE: case AUDIT_OBJ_LEV_LOW: case AUDIT_OBJ_LEV_HIGH: return 1; } } return 0; } int selinux_audit_rule_match(struct lsm_prop *prop, u32 field, u32 op, void *vrule) { struct selinux_state *state = &selinux_state; struct selinux_policy *policy; struct context *ctxt; struct mls_level *level; struct selinux_audit_rule *rule = vrule; int match = 0; if (unlikely(!rule)) { WARN_ONCE(1, "selinux_audit_rule_match: missing rule\n"); return -ENOENT; } if 
(!selinux_initialized()) return 0; rcu_read_lock(); policy = rcu_dereference(state->policy); if (rule->au_seqno < policy->latest_granting) { match = -ESTALE; goto out; } ctxt = sidtab_search(policy->sidtab, prop->selinux.secid); if (unlikely(!ctxt)) { WARN_ONCE(1, "selinux_audit_rule_match: unrecognized SID %d\n", prop->selinux.secid); match = -ENOENT; goto out; } /* a field/op pair that is not caught here will simply fall through without a match */ switch (field) { case AUDIT_SUBJ_USER: case AUDIT_OBJ_USER: switch (op) { case Audit_equal: match = (ctxt->user == rule->au_ctxt.user); break; case Audit_not_equal: match = (ctxt->user != rule->au_ctxt.user); break; } break; case AUDIT_SUBJ_ROLE: case AUDIT_OBJ_ROLE: switch (op) { case Audit_equal: match = (ctxt->role == rule->au_ctxt.role); break; case Audit_not_equal: match = (ctxt->role != rule->au_ctxt.role); break; } break; case AUDIT_SUBJ_TYPE: case AUDIT_OBJ_TYPE: switch (op) { case Audit_equal: match = (ctxt->type == rule->au_ctxt.type); break; case Audit_not_equal: match = (ctxt->type != rule->au_ctxt.type); break; } break; case AUDIT_SUBJ_SEN: case AUDIT_SUBJ_CLR: case AUDIT_OBJ_LEV_LOW: case AUDIT_OBJ_LEV_HIGH: level = ((field == AUDIT_SUBJ_SEN || field == AUDIT_OBJ_LEV_LOW) ? &ctxt->range.level[0] : &ctxt->range.level[1]); switch (op) { case Audit_equal: match = mls_level_eq(&rule->au_ctxt.range.level[0], level); break; case Audit_not_equal: match = !mls_level_eq(&rule->au_ctxt.range.level[0], level); break; case Audit_lt: match = (mls_level_dom(&rule->au_ctxt.range.level[0], level) && !mls_level_eq(&rule->au_ctxt.range.level[0], level)); break; case Audit_le: match = mls_level_dom(&rule->au_ctxt.range.level[0], level); break; case Audit_gt: match = (mls_level_dom(level, &rule->au_ctxt.range.level[0]) && !mls_level_eq(level, &rule->au_ctxt.range.level[0])); break; case Audit_ge: match = mls_level_dom(level, &rule->au_ctxt.range.level[0]); break; } } out: rcu_read_unlock(); return match; } static int aurule_avc_callback(u32 event) { if (event == AVC_CALLBACK_RESET) return audit_update_lsm_rules(); return 0; } static int __init aurule_init(void) { int err; err = avc_add_callback(aurule_avc_callback, AVC_CALLBACK_RESET); if (err) panic("avc_add_callback() failed, error %d\n", err); return err; } __initcall(aurule_init); #ifdef CONFIG_NETLABEL /** * security_netlbl_cache_add - Add an entry to the NetLabel cache * @secattr: the NetLabel packet security attributes * @sid: the SELinux SID * * Description: * Attempt to cache the context in @ctx, which was derived from the packet in * @skb, in the NetLabel subsystem cache. This function assumes @secattr has * already been initialized. * */ static void security_netlbl_cache_add(struct netlbl_lsm_secattr *secattr, u32 sid) { u32 *sid_cache; sid_cache = kmalloc(sizeof(*sid_cache), GFP_ATOMIC); if (sid_cache == NULL) return; secattr->cache = netlbl_secattr_cache_alloc(GFP_ATOMIC); if (secattr->cache == NULL) { kfree(sid_cache); return; } *sid_cache = sid; secattr->cache->free = kfree; secattr->cache->data = sid_cache; secattr->flags |= NETLBL_SECATTR_CACHE; } /** * security_netlbl_secattr_to_sid - Convert a NetLabel secattr to a SELinux SID * @secattr: the NetLabel packet security attributes * @sid: the SELinux SID * * Description: * Convert the given NetLabel security attributes in @secattr into a * SELinux SID. If the @secattr field does not contain a full SELinux * SID/context then use SECINITSID_NETMSG as the foundation. 
If possible the * 'cache' field of @secattr is set and the CACHE flag is set; this is to * allow the @secattr to be used by NetLabel to cache the secattr to SID * conversion for future lookups. Returns zero on success, negative values on * failure. * */ int security_netlbl_secattr_to_sid(struct netlbl_lsm_secattr *secattr, u32 *sid) { struct selinux_policy *policy; struct policydb *policydb; struct sidtab *sidtab; int rc; struct context *ctx; struct context ctx_new; if (!selinux_initialized()) { *sid = SECSID_NULL; return 0; } retry: rc = 0; rcu_read_lock(); policy = rcu_dereference(selinux_state.policy); policydb = &policy->policydb; sidtab = policy->sidtab; if (secattr->flags & NETLBL_SECATTR_CACHE) *sid = *(u32 *)secattr->cache->data; else if (secattr->flags & NETLBL_SECATTR_SECID) *sid = secattr->attr.secid; else if (secattr->flags & NETLBL_SECATTR_MLS_LVL) { rc = -EIDRM; ctx = sidtab_search(sidtab, SECINITSID_NETMSG); if (ctx == NULL) goto out; context_init(&ctx_new); ctx_new.user = ctx->user; ctx_new.role = ctx->role; ctx_new.type = ctx->type; mls_import_netlbl_lvl(policydb, &ctx_new, secattr); if (secattr->flags & NETLBL_SECATTR_MLS_CAT) { rc = mls_import_netlbl_cat(policydb, &ctx_new, secattr); if (rc) goto out; } rc = -EIDRM; if (!mls_context_isvalid(policydb, &ctx_new)) { ebitmap_destroy(&ctx_new.range.level[0].cat); goto out; } rc = sidtab_context_to_sid(sidtab, &ctx_new, sid); ebitmap_destroy(&ctx_new.range.level[0].cat); if (rc == -ESTALE) { rcu_read_unlock(); goto retry; } if (rc) goto out; security_netlbl_cache_add(secattr, *sid); } else *sid = SECSID_NULL; out: rcu_read_unlock(); return rc; } /** * security_netlbl_sid_to_secattr - Convert a SELinux SID to a NetLabel secattr * @sid: the SELinux SID * @secattr: the NetLabel packet security attributes * * Description: * Convert the given SELinux SID in @sid into a NetLabel security attribute. * Returns zero on success, negative values on failure. * */ int security_netlbl_sid_to_secattr(u32 sid, struct netlbl_lsm_secattr *secattr) { struct selinux_policy *policy; struct policydb *policydb; int rc; struct context *ctx; if (!selinux_initialized()) return 0; rcu_read_lock(); policy = rcu_dereference(selinux_state.policy); policydb = &policy->policydb; rc = -ENOENT; ctx = sidtab_search(policy->sidtab, sid); if (ctx == NULL) goto out; rc = -ENOMEM; secattr->domain = kstrdup(sym_name(policydb, SYM_TYPES, ctx->type - 1), GFP_ATOMIC); if (secattr->domain == NULL) goto out; secattr->attr.secid = sid; secattr->flags |= NETLBL_SECATTR_DOMAIN_CPY | NETLBL_SECATTR_SECID; mls_export_netlbl_lvl(policydb, ctx, secattr); rc = mls_export_netlbl_cat(policydb, ctx, secattr); out: rcu_read_unlock(); return rc; } #endif /* CONFIG_NETLABEL */ /** * __security_read_policy - read the policy. * @policy: SELinux policy * @data: binary policy data * @len: length of data in bytes * */ static int __security_read_policy(struct selinux_policy *policy, void *data, size_t *len) { int rc; struct policy_file fp; fp.data = data; fp.len = *len; rc = policydb_write(&policy->policydb, &fp); if (rc) return rc; *len = (unsigned long)fp.data - (unsigned long)data; return 0; } /** * security_read_policy - read the policy. 
* @data: binary policy data * @len: length of data in bytes * */ int security_read_policy(void **data, size_t *len) { struct selinux_state *state = &selinux_state; struct selinux_policy *policy; policy = rcu_dereference_protected( state->policy, lockdep_is_held(&state->policy_mutex)); if (!policy) return -EINVAL; *len = policy->policydb.len; *data = vmalloc_user(*len); if (!*data) return -ENOMEM; return __security_read_policy(policy, *data, len); } /** * security_read_state_kernel - read the policy. * @data: binary policy data * @len: length of data in bytes * * Allocates kernel memory for reading SELinux policy. * This function is for internal use only and should not * be used for returning data to user space. * * This function must be called with policy_mutex held. */ int security_read_state_kernel(void **data, size_t *len) { int err; struct selinux_state *state = &selinux_state; struct selinux_policy *policy; policy = rcu_dereference_protected( state->policy, lockdep_is_held(&state->policy_mutex)); if (!policy) return -EINVAL; *len = policy->policydb.len; *data = vmalloc(*len); if (!*data) return -ENOMEM; err = __security_read_policy(policy, *data, len); if (err) { vfree(*data); *data = NULL; *len = 0; } return err; } |
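/*
 * Illustrative sketch (not part of the kernel sources): the audit-rule
 * helpers above are normally driven by the audit subsystem through its LSM
 * hooks, but a minimal direct use of the API they implement looks roughly
 * like the function below. The wrapper name and the "sshd_t" type string
 * are hypothetical; AUDIT_SUBJ_TYPE and Audit_equal come from the audit
 * headers. Return conventions follow the implementations above: init
 * returns 0 or -errno (e.g. -EOPNOTSUPP before a policy is loaded), match
 * returns >0 on match, 0 on no match, or -errno (e.g. -ESTALE when the rule
 * predates the currently loaded policy, after which the audit subsystem
 * rebuilds its rules via audit_update_lsm_rules()).
 */
#if 0
static int example_match_subj_type(struct lsm_prop *prop)
{
	char type[] = "sshd_t";		/* hypothetical target type */
	void *rule = NULL;
	int rc;

	/* Roughly equivalent to an audit filter "-F subj_type=sshd_t". */
	rc = selinux_audit_rule_init(AUDIT_SUBJ_TYPE, Audit_equal, type,
				     &rule, GFP_KERNEL);
	if (rc)
		return rc;

	rc = selinux_audit_rule_match(prop, AUDIT_SUBJ_TYPE, Audit_equal, rule);

	selinux_audit_rule_free(rule);
	return rc;
}
#endif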
// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2023 ARM Ltd. */ #include <linux/mm.h> #include <linux/efi.h> #include <linux/export.h> #include <asm/tlbflush.h> static inline bool mm_is_user(struct mm_struct *mm) { /* * Don't attempt to apply the contig bit to kernel mappings, because * dynamically adding/removing the contig bit can cause page faults. * These racing faults are ok for user space, since they get serialized * on the PTL. But kernel mappings can't tolerate faults. */ if (unlikely(mm_is_efi(mm))) return false; return mm != &init_mm; } static inline pte_t *contpte_align_down(pte_t *ptep) { return PTR_ALIGN_DOWN(ptep, sizeof(*ptep) * CONT_PTES); } static void contpte_try_unfold_partial(struct mm_struct *mm, unsigned long addr, pte_t *ptep, unsigned int nr) { /* * Unfold any partially covered contpte block at the beginning and end * of the range.
*/ if (ptep != contpte_align_down(ptep) || nr < CONT_PTES) contpte_try_unfold(mm, addr, ptep, __ptep_get(ptep)); if (ptep + nr != contpte_align_down(ptep + nr)) { unsigned long last_addr = addr + PAGE_SIZE * (nr - 1); pte_t *last_ptep = ptep + nr - 1; contpte_try_unfold(mm, last_addr, last_ptep, __ptep_get(last_ptep)); } } static void contpte_convert(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) { struct vm_area_struct vma = TLB_FLUSH_VMA(mm, 0); unsigned long start_addr; pte_t *start_ptep; int i; start_ptep = ptep = contpte_align_down(ptep); start_addr = addr = ALIGN_DOWN(addr, CONT_PTE_SIZE); pte = pfn_pte(ALIGN_DOWN(pte_pfn(pte), CONT_PTES), pte_pgprot(pte)); for (i = 0; i < CONT_PTES; i++, ptep++, addr += PAGE_SIZE) { pte_t ptent = __ptep_get_and_clear(mm, addr, ptep); if (pte_dirty(ptent)) pte = pte_mkdirty(pte); if (pte_young(ptent)) pte = pte_mkyoung(pte); } __flush_tlb_range(&vma, start_addr, addr, PAGE_SIZE, true, 3); __set_ptes(mm, start_addr, start_ptep, pte, CONT_PTES); } void __contpte_try_fold(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) { /* * We have already checked that the virtual and physical addresses are * correctly aligned for a contpte mapping in contpte_try_fold(), so the * remaining checks are to ensure that the contpte range is fully * covered by a single folio, and ensure that all the ptes are valid * with contiguous PFNs and matching prots. We ignore the state of the * access and dirty bits for the purpose of deciding if it is a contiguous * range; the folding process will generate a single contpte entry which * has a single access and dirty bit. Those 2 bits are the logical OR of * their respective bits in the constituent pte entries. In order to * ensure the contpte range is covered by a single folio, we must * recover the folio from the pfn, but special mappings don't have a * folio backing them. Fortunately contpte_try_fold() already checked * that the pte is not special - we never try to fold special mappings. * Note we can't use vm_normal_page() for this since we don't have the * vma. */ unsigned long folio_start, folio_end; unsigned long cont_start, cont_end; pte_t expected_pte, subpte; struct folio *folio; struct page *page; unsigned long pfn; pte_t *orig_ptep; pgprot_t prot; int i; if (!mm_is_user(mm)) return; page = pte_page(pte); folio = page_folio(page); folio_start = addr - (page - &folio->page) * PAGE_SIZE; folio_end = folio_start + folio_nr_pages(folio) * PAGE_SIZE; cont_start = ALIGN_DOWN(addr, CONT_PTE_SIZE); cont_end = cont_start + CONT_PTE_SIZE; if (folio_start > cont_start || folio_end < cont_end) return; pfn = ALIGN_DOWN(pte_pfn(pte), CONT_PTES); prot = pte_pgprot(pte_mkold(pte_mkclean(pte))); expected_pte = pfn_pte(pfn, prot); orig_ptep = ptep; ptep = contpte_align_down(ptep); for (i = 0; i < CONT_PTES; i++) { subpte = pte_mkold(pte_mkclean(__ptep_get(ptep))); if (!pte_same(subpte, expected_pte)) return; expected_pte = pte_advance_pfn(expected_pte, 1); ptep++; } pte = pte_mkcont(pte); contpte_convert(mm, addr, orig_ptep, pte); } EXPORT_SYMBOL_GPL(__contpte_try_fold); void __contpte_try_unfold(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) { /* * We have already checked that the ptes are contiguous in * contpte_try_unfold(), so just check that the mm is user space.
*/ if (!mm_is_user(mm)) return; pte = pte_mknoncont(pte); contpte_convert(mm, addr, ptep, pte); } EXPORT_SYMBOL_GPL(__contpte_try_unfold); pte_t contpte_ptep_get(pte_t *ptep, pte_t orig_pte) { /* * Gather access/dirty bits, which may be populated in any of the ptes * of the contig range. We are guaranteed to be holding the PTL, so any * contiguous range cannot be unfolded or otherwise modified under our * feet. */ pte_t pte; int i; ptep = contpte_align_down(ptep); for (i = 0; i < CONT_PTES; i++, ptep++) { pte = __ptep_get(ptep); if (pte_dirty(pte)) orig_pte = pte_mkdirty(orig_pte); if (pte_young(pte)) orig_pte = pte_mkyoung(orig_pte); } return orig_pte; } EXPORT_SYMBOL_GPL(contpte_ptep_get); pte_t contpte_ptep_get_lockless(pte_t *orig_ptep) { /* * The ptep_get_lockless() API requires us to read and return *orig_ptep * so that it is self-consistent, without the PTL held, so we may be * racing with other threads modifying the pte. Usually a READ_ONCE() * would suffice, but for the contpte case, we also need to gather the * access and dirty bits from across all ptes in the contiguous block, * and we can't read all of those neighbouring ptes atomically, so any * contiguous range may be unfolded/modified/refolded under our feet. * Therefore we ensure we read a _consistent_ contpte range by checking * that all ptes in the range are valid and have CONT_PTE set, that all * pfns are contiguous and that all pgprots are the same (ignoring * access/dirty). If we find a pte that is not consistent, then we must * be racing with an update so start again. If the target pte does not * have CONT_PTE set then that is considered consistent on its own * because it is not part of a contpte range. */ pgprot_t orig_prot; unsigned long pfn; pte_t orig_pte; pgprot_t prot; pte_t *ptep; pte_t pte; int i; retry: orig_pte = __ptep_get(orig_ptep); if (!pte_valid_cont(orig_pte)) return orig_pte; orig_prot = pte_pgprot(pte_mkold(pte_mkclean(orig_pte))); ptep = contpte_align_down(orig_ptep); pfn = pte_pfn(orig_pte) - (orig_ptep - ptep); for (i = 0; i < CONT_PTES; i++, ptep++, pfn++) { pte = __ptep_get(ptep); prot = pte_pgprot(pte_mkold(pte_mkclean(pte))); if (!pte_valid_cont(pte) || pte_pfn(pte) != pfn || pgprot_val(prot) != pgprot_val(orig_prot)) goto retry; if (pte_dirty(pte)) orig_pte = pte_mkdirty(orig_pte); if (pte_young(pte)) orig_pte = pte_mkyoung(orig_pte); } return orig_pte; } EXPORT_SYMBOL_GPL(contpte_ptep_get_lockless); void contpte_set_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte, unsigned int nr) { unsigned long next; unsigned long end; unsigned long pfn; pgprot_t prot; /* * The set_ptes() spec guarantees that when nr > 1, the initial state of * all ptes is not-present. Therefore we never need to unfold or * otherwise invalidate a range before we set the new ptes. * contpte_set_ptes() should never be called for nr < 2. 
*/ VM_WARN_ON(nr == 1); if (!mm_is_user(mm)) return __set_ptes(mm, addr, ptep, pte, nr); end = addr + (nr << PAGE_SHIFT); pfn = pte_pfn(pte); prot = pte_pgprot(pte); do { next = pte_cont_addr_end(addr, end); nr = (next - addr) >> PAGE_SHIFT; pte = pfn_pte(pfn, prot); if (((addr | next | (pfn << PAGE_SHIFT)) & ~CONT_PTE_MASK) == 0) pte = pte_mkcont(pte); else pte = pte_mknoncont(pte); __set_ptes(mm, addr, ptep, pte, nr); addr = next; ptep += nr; pfn += nr; } while (addr != end); } EXPORT_SYMBOL_GPL(contpte_set_ptes); void contpte_clear_full_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep, unsigned int nr, int full) { contpte_try_unfold_partial(mm, addr, ptep, nr); __clear_full_ptes(mm, addr, ptep, nr, full); } EXPORT_SYMBOL_GPL(contpte_clear_full_ptes); pte_t contpte_get_and_clear_full_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep, unsigned int nr, int full) { contpte_try_unfold_partial(mm, addr, ptep, nr); return __get_and_clear_full_ptes(mm, addr, ptep, nr, full); } EXPORT_SYMBOL_GPL(contpte_get_and_clear_full_ptes); int contpte_ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { /* * ptep_clear_flush_young() technically requires us to clear the access * flag for a _single_ pte. However, the core-mm code actually tracks * access/dirty per folio, not per page. And since we only create a * contig range when the range is covered by a single folio, we can get * away with clearing young for the whole contig range here, so we avoid * having to unfold. */ int young = 0; int i; ptep = contpte_align_down(ptep); addr = ALIGN_DOWN(addr, CONT_PTE_SIZE); for (i = 0; i < CONT_PTES; i++, ptep++, addr += PAGE_SIZE) young |= __ptep_test_and_clear_young(vma, addr, ptep); return young; } EXPORT_SYMBOL_GPL(contpte_ptep_test_and_clear_young); int contpte_ptep_clear_flush_young(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { int young; young = contpte_ptep_test_and_clear_young(vma, addr, ptep); if (young) { /* * See comment in __ptep_clear_flush_young(); same rationale for * eliding the trailing DSB applies here. */ addr = ALIGN_DOWN(addr, CONT_PTE_SIZE); __flush_tlb_range_nosync(vma, addr, addr + CONT_PTE_SIZE, PAGE_SIZE, true, 3); } return young; } EXPORT_SYMBOL_GPL(contpte_ptep_clear_flush_young); void contpte_wrprotect_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep, unsigned int nr) { /* * If wrprotecting an entire contig range, we can avoid unfolding. Just * set wrprotect and wait for the later mmu_gather flush to invalidate * the tlb. Until the flush, the page may or may not be wrprotected. * After the flush, it is guaranteed wrprotected. If it's a partial * range though, we must unfold, because we can't have a case where * CONT_PTE is set but wrprotect applies to a subset of the PTEs; this * would cause it to continue to be unpredictable after the flush. */ contpte_try_unfold_partial(mm, addr, ptep, nr); __wrprotect_ptes(mm, addr, ptep, nr); } EXPORT_SYMBOL_GPL(contpte_wrprotect_ptes); void contpte_clear_young_dirty_ptes(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, unsigned int nr, cydp_t flags) { /* * We can safely clear access/dirty without needing to unfold from * the architectures perspective, even when contpte is set. If the * range starts or ends midway through a contpte block, we can just * expand to include the full contpte block. While this is not * exactly what the core-mm asked for, it tracks access/dirty per * folio, not per page. 
And since we only create a contpte block * when it is covered by a single folio, we can get away with * clearing access/dirty for the whole block. */ unsigned long start = addr; unsigned long end = start + nr * PAGE_SIZE; if (pte_cont(__ptep_get(ptep + nr - 1))) end = ALIGN(end, CONT_PTE_SIZE); if (pte_cont(__ptep_get(ptep))) { start = ALIGN_DOWN(start, CONT_PTE_SIZE); ptep = contpte_align_down(ptep); } __clear_young_dirty_ptes(vma, start, ptep, (end - start) / PAGE_SIZE, flags); } EXPORT_SYMBOL_GPL(contpte_clear_young_dirty_ptes); int contpte_ptep_set_access_flags(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t entry, int dirty) { unsigned long start_addr; pte_t orig_pte; int i; /* * Gather the access/dirty bits for the contiguous range. If nothing has * changed, it is a no-op. */ orig_pte = pte_mknoncont(ptep_get(ptep)); if (pte_val(orig_pte) == pte_val(entry)) return 0; /* * We can fix up access/dirty bits without having to unfold the contig * range. But if the write bit is changing, we must unfold. */ if (pte_write(orig_pte) == pte_write(entry)) { /* * For HW access management, we technically only need to update * the flag on a single pte in the range. But for SW access * management, we need to update all the ptes to prevent extra * faults. Avoid per-page tlb flush in __ptep_set_access_flags() * and instead flush the whole range at the end. */ ptep = contpte_align_down(ptep); start_addr = addr = ALIGN_DOWN(addr, CONT_PTE_SIZE); /* * We are not advancing entry because __ptep_set_access_flags() * only consumes access flags from entry. And since we have checked * for the whole contpte block and returned early, pte_same() * within __ptep_set_access_flags() is likely false. */ for (i = 0; i < CONT_PTES; i++, ptep++, addr += PAGE_SIZE) __ptep_set_access_flags(vma, addr, ptep, entry, 0); if (dirty) __flush_tlb_range(vma, start_addr, addr, PAGE_SIZE, true, 3); } else { __contpte_try_unfold(vma->vm_mm, addr, ptep, orig_pte); __ptep_set_access_flags(vma, addr, ptep, entry, dirty); } return 1; } EXPORT_SYMBOL_GPL(contpte_ptep_set_access_flags);
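/*
 * Illustrative sketch (not part of the kernel sources): a stand-alone,
 * user-space demonstration of the alignment rule used by contpte_set_ptes()
 * and __contpte_try_fold() above. A block of PTEs is only eligible for the
 * contiguous bit when the virtual range and the physical address are both
 * naturally aligned to, and fully cover, one contpte block. The EX_* names
 * are stand-ins and the values assume the common 4K-granule configuration
 * (16 PTEs per block, i.e. 64K); the real kernel constants are PAGE_SHIFT,
 * CONT_PTES, CONT_PTE_SIZE and CONT_PTE_MASK.
 */
#if 0
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define EX_PAGE_SHIFT	12				/* 4K pages */
#define EX_CONT_PTES	16				/* PTEs per contpte block */
#define EX_CONT_SIZE	((uint64_t)EX_CONT_PTES << EX_PAGE_SHIFT)
#define EX_CONT_MASK	(~(EX_CONT_SIZE - 1))

/* Mirrors the check in contpte_set_ptes(): addr/next bound the sub-range
 * being mapped and pfn is its first physical frame number. */
static bool ex_can_use_cont(uint64_t addr, uint64_t next, uint64_t pfn)
{
	return ((addr | next | (pfn << EX_PAGE_SHIFT)) & ~EX_CONT_MASK) == 0;
}

int main(void)
{
	/* 64K-aligned VA span backed by a 64K-aligned PA: eligible (prints 1). */
	printf("%d\n", ex_can_use_cont(0x10000, 0x20000, 0x10));
	/* Same VA span, but the PA is only 4K-aligned: not eligible (prints 0). */
	printf("%d\n", ex_can_use_cont(0x10000, 0x20000, 0x11));
	return 0;
}
#endif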
/* * Performance events: * * Copyright (C) 2008-2009, Thomas Gleixner <tglx@linutronix.de> * Copyright (C) 2008-2011, Red Hat, Inc., Ingo Molnar * Copyright (C) 2008-2011, Red Hat, Inc., Peter Zijlstra * * Data type definitions, declarations, prototypes.
* * Started by: Thomas Gleixner and Ingo Molnar * * For licencing details see kernel-base/COPYING */ #ifndef _LINUX_PERF_EVENT_H #define _LINUX_PERF_EVENT_H #include <uapi/linux/perf_event.h> #include <uapi/linux/bpf_perf_event.h> /* * Kernel-internal data types and definitions: */ #ifdef CONFIG_PERF_EVENTS # include <asm/perf_event.h> # include <asm/local64.h> #endif #define PERF_GUEST_ACTIVE 0x01 #define PERF_GUEST_USER 0x02 struct perf_guest_info_callbacks { unsigned int (*state)(void); unsigned long (*get_ip)(void); unsigned int (*handle_intel_pt_intr)(void); }; #ifdef CONFIG_HAVE_HW_BREAKPOINT #include <linux/rhashtable-types.h> #include <asm/hw_breakpoint.h> #endif #include <linux/list.h> #include <linux/mutex.h> #include <linux/rculist.h> #include <linux/rcupdate.h> #include <linux/spinlock.h> #include <linux/hrtimer.h> #include <linux/fs.h> #include <linux/pid_namespace.h> #include <linux/workqueue.h> #include <linux/ftrace.h> #include <linux/cpu.h> #include <linux/irq_work.h> #include <linux/static_key.h> #include <linux/jump_label_ratelimit.h> #include <linux/atomic.h> #include <linux/sysfs.h> #include <linux/perf_regs.h> #include <linux/cgroup.h> #include <linux/refcount.h> #include <linux/security.h> #include <linux/static_call.h> #include <linux/lockdep.h> #include <asm/local.h> struct perf_callchain_entry { __u64 nr; __u64 ip[]; /* /proc/sys/kernel/perf_event_max_stack */ }; struct perf_callchain_entry_ctx { struct perf_callchain_entry *entry; u32 max_stack; u32 nr; short contexts; bool contexts_maxed; }; typedef unsigned long (*perf_copy_f)(void *dst, const void *src, unsigned long off, unsigned long len); struct perf_raw_frag { union { struct perf_raw_frag *next; unsigned long pad; }; perf_copy_f copy; void *data; u32 size; } __packed; struct perf_raw_record { struct perf_raw_frag frag; u32 size; }; static __always_inline bool perf_raw_frag_last(const struct perf_raw_frag *frag) { return frag->pad < sizeof(u64); } /* * branch stack layout: * nr: number of taken branches stored in entries[] * hw_idx: The low level index of raw branch records * for the most recent branch. * -1ULL means invalid/unknown. * * Note that nr can vary from sample to sample * branches (to, from) are stored from most recent * to least recent, i.e., entries[0] contains the most * recent branch. * The entries[] is an abstraction of raw branch records, * which may not be stored in age order in HW, e.g. Intel LBR. * The hw_idx is to expose the low level index of raw * branch record for the most recent branch aka entries[0]. * The hw_idx index is between -1 (unknown) and max depth, * which can be retrieved in /sys/devices/cpu/caps/branches. * For the architectures whose raw branch records are * already stored in age order, the hw_idx should be 0. */ struct perf_branch_stack { __u64 nr; __u64 hw_idx; struct perf_branch_entry entries[]; }; struct task_struct; /* * extra PMU register associated with an event */ struct hw_perf_event_extra { u64 config; /* register value */ unsigned int reg; /* register address or index */ int alloc; /* extra register already allocated */ int idx; /* index in shared_regs->regs[] */ }; /** * hw_perf_event::flag values * * PERF_EVENT_FLAG_ARCH bits are reserved for architecture-specific * usage. 
*/ #define PERF_EVENT_FLAG_ARCH 0x000fffff #define PERF_EVENT_FLAG_USER_READ_CNT 0x80000000 static_assert((PERF_EVENT_FLAG_USER_READ_CNT & PERF_EVENT_FLAG_ARCH) == 0); /** * struct hw_perf_event - performance event hardware details: */ struct hw_perf_event { #ifdef CONFIG_PERF_EVENTS union { struct { /* hardware */ u64 config; u64 last_tag; unsigned long config_base; unsigned long event_base; int event_base_rdpmc; int idx; int last_cpu; int flags; struct hw_perf_event_extra extra_reg; struct hw_perf_event_extra branch_reg; }; struct { /* aux / Intel-PT */ u64 aux_config; /* * For AUX area events, aux_paused cannot be a state * flag because it can be updated asynchronously to * state. */ unsigned int aux_paused; }; struct { /* software */ struct hrtimer hrtimer; }; struct { /* tracepoint */ /* for tp_event->class */ struct list_head tp_list; }; struct { /* amd_power */ u64 pwr_acc; u64 ptsc; }; #ifdef CONFIG_HAVE_HW_BREAKPOINT struct { /* breakpoint */ /* * Crufty hack to avoid the chicken and egg * problem hw_breakpoint has with context * creation and event initalization. */ struct arch_hw_breakpoint info; struct rhlist_head bp_list; }; #endif struct { /* amd_iommu */ u8 iommu_bank; u8 iommu_cntr; u16 padding; u64 conf; u64 conf1; }; }; /* * If the event is a per task event, this will point to the task in * question. See the comment in perf_event_alloc(). */ struct task_struct *target; /* * PMU would store hardware filter configuration * here. */ void *addr_filters; /* Last sync'ed generation of filters */ unsigned long addr_filters_gen; /* * hw_perf_event::state flags; used to track the PERF_EF_* state. */ #define PERF_HES_STOPPED 0x01 /* the counter is stopped */ #define PERF_HES_UPTODATE 0x02 /* event->count up-to-date */ #define PERF_HES_ARCH 0x04 int state; /* * The last observed hardware counter value, updated with a * local64_cmpxchg() such that pmu::read() can be called nested. */ local64_t prev_count; /* * The period to start the next sample with. */ u64 sample_period; union { struct { /* Sampling */ /* * The period we started this sample with. */ u64 last_period; /* * However much is left of the current period; * note that this is a full 64bit value and * allows for generation of periods longer * than hardware might allow. */ local64_t period_left; }; struct { /* Topdown events counting for context switch */ u64 saved_metric; u64 saved_slots; }; }; /* * State for throttling the event, see __perf_event_overflow() and * perf_adjust_freq_unthr_context(). */ u64 interrupts_seq; u64 interrupts; /* * State for freq target events, see __perf_event_overflow() and * perf_adjust_freq_unthr_context(). 
*/ u64 freq_time_stamp; u64 freq_count_stamp; #endif }; struct perf_event; struct perf_event_pmu_context; /* * Common implementation detail of pmu::{start,commit,cancel}_txn */ #define PERF_PMU_TXN_ADD 0x1 /* txn to add/schedule event on PMU */ #define PERF_PMU_TXN_READ 0x2 /* txn to read event group from PMU */ /** * pmu::capabilities flags */ #define PERF_PMU_CAP_NO_INTERRUPT 0x0001 #define PERF_PMU_CAP_NO_NMI 0x0002 #define PERF_PMU_CAP_AUX_NO_SG 0x0004 #define PERF_PMU_CAP_EXTENDED_REGS 0x0008 #define PERF_PMU_CAP_EXCLUSIVE 0x0010 #define PERF_PMU_CAP_ITRACE 0x0020 #define PERF_PMU_CAP_NO_EXCLUDE 0x0040 #define PERF_PMU_CAP_AUX_OUTPUT 0x0080 #define PERF_PMU_CAP_EXTENDED_HW_TYPE 0x0100 #define PERF_PMU_CAP_AUX_PAUSE 0x0200 /** * pmu::scope */ enum perf_pmu_scope { PERF_PMU_SCOPE_NONE = 0, PERF_PMU_SCOPE_CORE, PERF_PMU_SCOPE_DIE, PERF_PMU_SCOPE_CLUSTER, PERF_PMU_SCOPE_PKG, PERF_PMU_SCOPE_SYS_WIDE, PERF_PMU_MAX_SCOPE, }; struct perf_output_handle; #define PMU_NULL_DEV ((void *)(~0UL)) /** * struct pmu - generic performance monitoring unit */ struct pmu { struct list_head entry; struct module *module; struct device *dev; struct device *parent; const struct attribute_group **attr_groups; const struct attribute_group **attr_update; const char *name; int type; /* * various common per-pmu feature flags */ int capabilities; /* * PMU scope */ unsigned int scope; int __percpu *pmu_disable_count; struct perf_cpu_pmu_context __percpu *cpu_pmu_context; atomic_t exclusive_cnt; /* < 0: cpu; > 0: tsk */ int task_ctx_nr; int hrtimer_interval_ms; /* number of address filters this PMU can do */ unsigned int nr_addr_filters; /* * Fully disable/enable this PMU, can be used to protect from the PMI * as well as for lazy/batch writing of the MSRs. */ void (*pmu_enable) (struct pmu *pmu); /* optional */ void (*pmu_disable) (struct pmu *pmu); /* optional */ /* * Try and initialize the event for this PMU. * * Returns: * -ENOENT -- @event is not for this PMU * * -ENODEV -- @event is for this PMU but PMU not present * -EBUSY -- @event is for this PMU but PMU temporarily unavailable * -EINVAL -- @event is for this PMU but @event is not valid * -EOPNOTSUPP -- @event is for this PMU, @event is valid, but not supported * -EACCES -- @event is for this PMU, @event is valid, but no privileges * * 0 -- @event is for this PMU and valid * * Other error return values are allowed. */ int (*event_init) (struct perf_event *event); /* * Notification that the event was mapped or unmapped. Called * in the context of the mapping task. */ void (*event_mapped) (struct perf_event *event, struct mm_struct *mm); /* optional */ void (*event_unmapped) (struct perf_event *event, struct mm_struct *mm); /* optional */ /* * Flags for ->add()/->del()/ ->start()/->stop(). There are * matching hw_perf_event::state flags. */ #define PERF_EF_START 0x01 /* start the counter when adding */ #define PERF_EF_RELOAD 0x02 /* reload the counter when starting */ #define PERF_EF_UPDATE 0x04 /* update the counter when stopping */ #define PERF_EF_PAUSE 0x08 /* AUX area event, pause tracing */ #define PERF_EF_RESUME 0x10 /* AUX area event, resume tracing */ /* * Adds/Removes a counter to/from the PMU, can be done inside a * transaction, see the ->*_txn() methods. * * The add/del callbacks will reserve all hardware resources required * to service the event, this includes any counter constraint * scheduling etc. * * Called with IRQs disabled and the PMU disabled on the CPU the event * is on. 
* * ->add() called without PERF_EF_START should result in the same state * as ->add() followed by ->stop(). * * ->del() must always PERF_EF_UPDATE stop an event. If it calls * ->stop() that must deal with already being stopped without * PERF_EF_UPDATE. */ int (*add) (struct perf_event *event, int flags); void (*del) (struct perf_event *event, int flags); /* * Starts/Stops a counter present on the PMU. * * The PMI handler should stop the counter when perf_event_overflow() * returns !0. ->start() will be used to continue. * * Also used to change the sample period. * * Called with IRQs disabled and the PMU disabled on the CPU the event * is on -- will be called from NMI context with the PMU generates * NMIs. * * ->stop() with PERF_EF_UPDATE will read the counter and update * period/count values like ->read() would. * * ->start() with PERF_EF_RELOAD will reprogram the counter * value, must be preceded by a ->stop() with PERF_EF_UPDATE. * * ->stop() with PERF_EF_PAUSE will stop as simply as possible. Will not * overlap another ->stop() with PERF_EF_PAUSE nor ->start() with * PERF_EF_RESUME. * * ->start() with PERF_EF_RESUME will start as simply as possible but * only if the counter is not otherwise stopped. Will not overlap * another ->start() with PERF_EF_RESUME nor ->stop() with * PERF_EF_PAUSE. * * Notably, PERF_EF_PAUSE/PERF_EF_RESUME *can* be concurrent with other * ->stop()/->start() invocations, just not itself. */ void (*start) (struct perf_event *event, int flags); void (*stop) (struct perf_event *event, int flags); /* * Updates the counter value of the event. * * For sampling capable PMUs this will also update the software period * hw_perf_event::period_left field. */ void (*read) (struct perf_event *event); /* * Group events scheduling is treated as a transaction, add * group events as a whole and perform one schedulability test. * If the test fails, roll back the whole group * * Start the transaction, after this ->add() doesn't need to * do schedulability tests. * * Optional. */ void (*start_txn) (struct pmu *pmu, unsigned int txn_flags); /* * If ->start_txn() disabled the ->add() schedulability test * then ->commit_txn() is required to perform one. On success * the transaction is closed. On error the transaction is kept * open until ->cancel_txn() is called. * * Optional. */ int (*commit_txn) (struct pmu *pmu); /* * Will cancel the transaction, assumes ->del() is called * for each successful ->add() during the transaction. * * Optional. */ void (*cancel_txn) (struct pmu *pmu); /* * Will return the value for perf_event_mmap_page::index for this event, * if no implementation is provided it will default to 0 (see * perf_event_idx_default). */ int (*event_idx) (struct perf_event *event); /*optional */ /* * context-switches callback */ void (*sched_task) (struct perf_event_pmu_context *pmu_ctx, bool sched_in); /* * Kmem cache of PMU specific data */ struct kmem_cache *task_ctx_cache; /* * PMU specific parts of task perf event context (i.e. ctx->task_ctx_data) * can be synchronized using this function. See Intel LBR callstack support * implementation and Perf core context switch handling callbacks for usage * examples. 
*/ void (*swap_task_ctx) (struct perf_event_pmu_context *prev_epc, struct perf_event_pmu_context *next_epc); /* optional */ /* * Set up pmu-private data structures for an AUX area */ void *(*setup_aux) (struct perf_event *event, void **pages, int nr_pages, bool overwrite); /* optional */ /* * Free pmu-private AUX data structures */ void (*free_aux) (void *aux); /* optional */ /* * Take a snapshot of the AUX buffer without touching the event * state, so that preempting ->start()/->stop() callbacks does * not interfere with their logic. Called in PMI context. * * Returns the size of AUX data copied to the output handle. * * Optional. */ long (*snapshot_aux) (struct perf_event *event, struct perf_output_handle *handle, unsigned long size); /* * Validate address range filters: make sure the HW supports the * requested configuration and number of filters; return 0 if the * supplied filters are valid, -errno otherwise. * * Runs in the context of the ioctl()ing process and is not serialized * with the rest of the PMU callbacks. */ int (*addr_filters_validate) (struct list_head *filters); /* optional */ /* * Synchronize address range filter configuration: * translate hw-agnostic filters into hardware configuration in * event::hw::addr_filters. * * Runs as a part of filter sync sequence that is done in ->start() * callback by calling perf_event_addr_filters_sync(). * * May (and should) traverse event::addr_filters::list, for which its * caller provides necessary serialization. */ void (*addr_filters_sync) (struct perf_event *event); /* optional */ /* * Check if event can be used for aux_output purposes for * events of this PMU. * * Runs from perf_event_open(). Should return 0 for "no match" * or non-zero for "match". */ int (*aux_output_match) (struct perf_event *event); /* optional */ /* * Skip programming this PMU on the given CPU. Typically needed for * big.LITTLE things. */ bool (*filter) (struct pmu *pmu, int cpu); /* optional */ /* * Check period value for PERF_EVENT_IOC_PERIOD ioctl. */ int (*check_period) (struct perf_event *event, u64 value); /* optional */ }; enum perf_addr_filter_action_t { PERF_ADDR_FILTER_ACTION_STOP = 0, PERF_ADDR_FILTER_ACTION_START, PERF_ADDR_FILTER_ACTION_FILTER, }; /** * struct perf_addr_filter - address range filter definition * @entry: event's filter list linkage * @path: object file's path for file-based filters * @offset: filter range offset * @size: filter range size (size==0 means single address trigger) * @action: filter/start/stop * * This is a hardware-agnostic filter configuration as specified by the user. */ struct perf_addr_filter { struct list_head entry; struct path path; unsigned long offset; unsigned long size; enum perf_addr_filter_action_t action; }; /** * struct perf_addr_filters_head - container for address range filters * @list: list of filters for this event * @lock: spinlock that serializes accesses to the @list and event's * (and its children's) filter generations. * @nr_file_filters: number of file-based filters * * A child event will use parent's @list (and therefore @lock), so they are * bundled together; see perf_event_addr_filters(). 
*/ struct perf_addr_filters_head { struct list_head list; raw_spinlock_t lock; unsigned int nr_file_filters; }; struct perf_addr_filter_range { unsigned long start; unsigned long size; }; /** * enum perf_event_state - the states of an event: */ enum perf_event_state { PERF_EVENT_STATE_DEAD = -4, PERF_EVENT_STATE_EXIT = -3, PERF_EVENT_STATE_ERROR = -2, PERF_EVENT_STATE_OFF = -1, PERF_EVENT_STATE_INACTIVE = 0, PERF_EVENT_STATE_ACTIVE = 1, }; struct file; struct perf_sample_data; typedef void (*perf_overflow_handler_t)(struct perf_event *, struct perf_sample_data *, struct pt_regs *regs); /* * Event capabilities. For event_caps and group_caps. * * PERF_EV_CAP_SOFTWARE: Is a software event. * PERF_EV_CAP_READ_ACTIVE_PKG: A CPU event (or cgroup event) that can be read * from any CPU in the package where it is active. * PERF_EV_CAP_SIBLING: An event with this flag must be a group sibling and * cannot be a group leader. If an event with this flag is detached from the * group it is scheduled out and moved into an unrecoverable ERROR state. * PERF_EV_CAP_READ_SCOPE: A CPU event that can be read from any CPU of the * PMU scope where it is active. */ #define PERF_EV_CAP_SOFTWARE BIT(0) #define PERF_EV_CAP_READ_ACTIVE_PKG BIT(1) #define PERF_EV_CAP_SIBLING BIT(2) #define PERF_EV_CAP_READ_SCOPE BIT(3) #define SWEVENT_HLIST_BITS 8 #define SWEVENT_HLIST_SIZE (1 << SWEVENT_HLIST_BITS) struct swevent_hlist { struct hlist_head heads[SWEVENT_HLIST_SIZE]; struct rcu_head rcu_head; }; #define PERF_ATTACH_CONTEXT 0x01 #define PERF_ATTACH_GROUP 0x02 #define PERF_ATTACH_TASK 0x04 #define PERF_ATTACH_TASK_DATA 0x08 #define PERF_ATTACH_ITRACE 0x10 #define PERF_ATTACH_SCHED_CB 0x20 #define PERF_ATTACH_CHILD 0x40 struct bpf_prog; struct perf_cgroup; struct perf_buffer; struct pmu_event_list { raw_spinlock_t lock; struct list_head list; }; /* * event->sibling_list is modified while holding both ctx->lock and ctx->mutex; * as such, iteration must hold either lock. However, since ctx->lock is an IRQ * safe lock, and is only held by the CPU doing the modification, having IRQs * disabled is sufficient since it will hold-off the IPIs. */ #ifdef CONFIG_PROVE_LOCKING #define lockdep_assert_event_ctx(event) \ WARN_ON_ONCE(__lockdep_enabled && \ (this_cpu_read(hardirqs_enabled) && \ lockdep_is_held(&(event)->ctx->mutex) != LOCK_STATE_HELD)) #else #define lockdep_assert_event_ctx(event) #endif #define for_each_sibling_event(sibling, event) \ lockdep_assert_event_ctx(event); \ if ((event)->group_leader == (event)) \ list_for_each_entry((sibling), &(event)->sibling_list, sibling_list) /** * struct perf_event - performance event kernel representation: */ struct perf_event { #ifdef CONFIG_PERF_EVENTS /* * entry onto perf_event_context::event_list; * modifications require ctx->lock * RCU safe iterations. */ struct list_head event_entry; /* * Locked for modification by both ctx->mutex and ctx->lock; holding * either suffices for read. */ struct list_head sibling_list; struct list_head active_list; /* * Node on the pinned or flexible tree located at the event context; */ struct rb_node group_node; u64 group_index; /* * We need storage to track the entries in perf_pmu_migrate_context; we * cannot use the event_entry because of RCU and we want to keep the * group intact which avoids us using the other two entries. */ struct list_head migrate_entry; struct hlist_node hlist_entry; struct list_head active_entry; int nr_siblings; /* Not serialized. Only written during event initialization.
*/ int event_caps; /* The cumulative AND of all event_caps for events in this group. */ int group_caps; unsigned int group_generation; struct perf_event *group_leader; /* * event->pmu will always point to pmu in which this event belongs. * Whereas event->pmu_ctx->pmu may point to other pmu when group of * different pmu events is created. */ struct pmu *pmu; void *pmu_private; enum perf_event_state state; unsigned int attach_state; local64_t count; atomic64_t child_count; /* * These are the total time in nanoseconds that the event * has been enabled (i.e. eligible to run, and the task has * been scheduled in, if this is a per-task event) * and running (scheduled onto the CPU), respectively. */ u64 total_time_enabled; u64 total_time_running; u64 tstamp; struct perf_event_attr attr; u16 header_size; u16 id_header_size; u16 read_size; struct hw_perf_event hw; struct perf_event_context *ctx; /* * event->pmu_ctx points to perf_event_pmu_context in which the event * is added. This pmu_ctx can be of other pmu for sw event when that * sw event is part of a group which also contains non-sw events. */ struct perf_event_pmu_context *pmu_ctx; atomic_long_t refcount; /* * These accumulate total time (in nanoseconds) that children * events have been enabled and running, respectively. */ atomic64_t child_total_time_enabled; atomic64_t child_total_time_running; /* * Protect attach/detach and child_list: */ struct mutex child_mutex; struct list_head child_list; struct perf_event *parent; int oncpu; int cpu; struct list_head owner_entry; struct task_struct *owner; /* mmap bits */ struct mutex mmap_mutex; atomic_t mmap_count; struct perf_buffer *rb; struct list_head rb_entry; unsigned long rcu_batches; int rcu_pending; /* poll related */ wait_queue_head_t waitq; struct fasync_struct *fasync; /* delayed work for NMIs and such */ unsigned int pending_wakeup; unsigned int pending_kill; unsigned int pending_disable; unsigned long pending_addr; /* SIGTRAP */ struct irq_work pending_irq; struct irq_work pending_disable_irq; struct callback_head pending_task; unsigned int pending_work; struct rcuwait pending_work_wait; atomic_t event_limit; /* address range filters */ struct perf_addr_filters_head addr_filters; /* vma address array for file-based filders */ struct perf_addr_filter_range *addr_filter_ranges; unsigned long addr_filters_gen; /* for aux_output events */ struct perf_event *aux_event; void (*destroy)(struct perf_event *); struct rcu_head rcu_head; struct pid_namespace *ns; u64 id; atomic64_t lost_samples; u64 (*clock)(void); perf_overflow_handler_t overflow_handler; void *overflow_handler_context; struct bpf_prog *prog; u64 bpf_cookie; #ifdef CONFIG_EVENT_TRACING struct trace_event_call *tp_event; struct event_filter *filter; #ifdef CONFIG_FUNCTION_TRACER struct ftrace_ops ftrace_ops; #endif #endif #ifdef CONFIG_CGROUP_PERF struct perf_cgroup *cgrp; /* cgroup event is attach to */ #endif #ifdef CONFIG_SECURITY void *security; #endif struct list_head sb_list; /* * Certain events gets forwarded to another pmu internally by over- * writing kernel copy of event->attr.type without user being aware * of it. event->orig_type contains original 'type' requested by * user. */ __u32 orig_type; #endif /* CONFIG_PERF_EVENTS */ }; /* * ,-----------------------[1:n]------------------------. * V V * perf_event_context <-[1:n]-> perf_event_pmu_context <-[1:n]- perf_event * | | * `--[n:1]-> pmu <-[1:n]--' * * * struct perf_event_pmu_context lifetime is refcount based and RCU freed * (similar to perf_event_context). 
Locking is as if it were a member of * perf_event_context; specifically: * * modification, both: ctx->mutex && ctx->lock * reading, either: ctx->mutex || ctx->lock * * There is one exception to this; namely put_pmu_ctx() isn't always called * with ctx->mutex held; this means that as long as we can guarantee the epc * has events the above rules hold. * * Specificially, sys_perf_event_open()'s group_leader case depends on * ctx->mutex pinning the configuration. Since we hold a reference on * group_leader (through the filedesc) it can't go away, therefore it's * associated pmu_ctx must exist and cannot change due to ctx->mutex. * * perf_event holds a refcount on perf_event_context * perf_event holds a refcount on perf_event_pmu_context */ struct perf_event_pmu_context { struct pmu *pmu; struct perf_event_context *ctx; struct list_head pmu_ctx_entry; struct list_head pinned_active; struct list_head flexible_active; /* Used to avoid freeing per-cpu perf_event_pmu_context */ unsigned int embedded : 1; unsigned int nr_events; unsigned int nr_cgroups; unsigned int nr_freq; atomic_t refcount; /* event <-> epc */ struct rcu_head rcu_head; void *task_ctx_data; /* pmu specific data */ /* * Set when one or more (plausibly active) event can't be scheduled * due to pmu overcommit or pmu constraints, except tolerant to * events not necessary to be active due to scheduling constraints, * such as cgroups. */ int rotate_necessary; }; static inline bool perf_pmu_ctx_is_active(struct perf_event_pmu_context *epc) { return !list_empty(&epc->flexible_active) || !list_empty(&epc->pinned_active); } struct perf_event_groups { struct rb_root tree; u64 index; }; /** * struct perf_event_context - event context structure * * Used as a container for task events and CPU events as well: */ struct perf_event_context { /* * Protect the states of the events in the list, * nr_active, and the list: */ raw_spinlock_t lock; /* * Protect the list of events. Locking either mutex or lock * is sufficient to ensure the list doesn't change; to change * the list you need to lock both the mutex and the spinlock. */ struct mutex mutex; struct list_head pmu_ctx_list; struct perf_event_groups pinned_groups; struct perf_event_groups flexible_groups; struct list_head event_list; int nr_events; int nr_user; int is_active; int nr_task_data; int nr_stat; int nr_freq; int rotate_disable; refcount_t refcount; /* event <-> ctx */ struct task_struct *task; /* * Context clock, runs when context enabled. */ u64 time; u64 timestamp; u64 timeoffset; /* * These fields let us detect when two contexts have both * been cloned (inherited) from a common ancestor. */ struct perf_event_context *parent_ctx; u64 parent_gen; u64 generation; int pin_count; #ifdef CONFIG_CGROUP_PERF int nr_cgroups; /* cgroup evts */ #endif struct rcu_head rcu_head; /* * The count of events for which using the switch-out fast path * should be avoided. * * Sum (event->pending_work + events with * (attr->inherit && (attr->sample_type & PERF_SAMPLE_READ))) * * The SIGTRAP is targeted at ctx->task, as such it won't do changing * that until the signal is delivered. 
*/ local_t nr_no_switch_fast; }; struct perf_cpu_pmu_context { struct perf_event_pmu_context epc; struct perf_event_pmu_context *task_epc; struct list_head sched_cb_entry; int sched_cb_usage; int active_oncpu; int exclusive; raw_spinlock_t hrtimer_lock; struct hrtimer hrtimer; ktime_t hrtimer_interval; unsigned int hrtimer_active; }; /** * struct perf_event_cpu_context - per cpu event context structure */ struct perf_cpu_context { struct perf_event_context ctx; struct perf_event_context *task_ctx; int online; #ifdef CONFIG_CGROUP_PERF struct perf_cgroup *cgrp; #endif /* * Per-CPU storage for iterators used in visit_groups_merge. The default * storage is of size 2 to hold the CPU and any CPU event iterators. */ int heap_size; struct perf_event **heap; struct perf_event *heap_default[2]; }; struct perf_output_handle { struct perf_event *event; struct perf_buffer *rb; unsigned long wakeup; unsigned long size; u64 aux_flags; union { void *addr; unsigned long head; }; int page; }; struct bpf_perf_event_data_kern { bpf_user_pt_regs_t *regs; struct perf_sample_data *data; struct perf_event *event; }; #ifdef CONFIG_CGROUP_PERF /* * perf_cgroup_info keeps track of time_enabled for a cgroup. * This is a per-cpu dynamically allocated data structure. */ struct perf_cgroup_info { u64 time; u64 timestamp; u64 timeoffset; int active; }; struct perf_cgroup { struct cgroup_subsys_state css; struct perf_cgroup_info __percpu *info; }; /* * Must ensure cgroup is pinned (css_get) before calling * this function. In other words, we cannot call this function * if there is no cgroup event for the current CPU context. */ static inline struct perf_cgroup * perf_cgroup_from_task(struct task_struct *task, struct perf_event_context *ctx) { return container_of(task_css_check(task, perf_event_cgrp_id, ctx ? 
lockdep_is_held(&ctx->lock) : true), struct perf_cgroup, css); } #endif /* CONFIG_CGROUP_PERF */ #ifdef CONFIG_PERF_EVENTS extern struct perf_event_context *perf_cpu_task_ctx(void); extern void *perf_aux_output_begin(struct perf_output_handle *handle, struct perf_event *event); extern void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size); extern int perf_aux_output_skip(struct perf_output_handle *handle, unsigned long size); extern void *perf_get_aux(struct perf_output_handle *handle); extern void perf_aux_output_flag(struct perf_output_handle *handle, u64 flags); extern void perf_event_itrace_started(struct perf_event *event); extern int perf_pmu_register(struct pmu *pmu, const char *name, int type); extern void perf_pmu_unregister(struct pmu *pmu); extern void __perf_event_task_sched_in(struct task_struct *prev, struct task_struct *task); extern void __perf_event_task_sched_out(struct task_struct *prev, struct task_struct *next); extern int perf_event_init_task(struct task_struct *child, u64 clone_flags); extern void perf_event_exit_task(struct task_struct *child); extern void perf_event_free_task(struct task_struct *task); extern void perf_event_delayed_put(struct task_struct *task); extern struct file *perf_event_get(unsigned int fd); extern const struct perf_event *perf_get_event(struct file *file); extern const struct perf_event_attr *perf_event_attrs(struct perf_event *event); extern void perf_event_print_debug(void); extern void perf_pmu_disable(struct pmu *pmu); extern void perf_pmu_enable(struct pmu *pmu); extern void perf_sched_cb_dec(struct pmu *pmu); extern void perf_sched_cb_inc(struct pmu *pmu); extern int perf_event_task_disable(void); extern int perf_event_task_enable(void); extern void perf_pmu_resched(struct pmu *pmu); extern int perf_event_refresh(struct perf_event *event, int refresh); extern void perf_event_update_userpage(struct perf_event *event); extern int perf_event_release_kernel(struct perf_event *event); extern struct perf_event * perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, struct task_struct *task, perf_overflow_handler_t callback, void *context); extern void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu); int perf_event_read_local(struct perf_event *event, u64 *value, u64 *enabled, u64 *running); extern u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running); extern struct perf_callchain_entry *perf_callchain(struct perf_event *event, struct pt_regs *regs); static inline bool branch_sample_no_flags(const struct perf_event *event) { return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_NO_FLAGS; } static inline bool branch_sample_no_cycles(const struct perf_event *event) { return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_NO_CYCLES; } static inline bool branch_sample_type(const struct perf_event *event) { return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_TYPE_SAVE; } static inline bool branch_sample_hw_index(const struct perf_event *event) { return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_HW_INDEX; } static inline bool branch_sample_priv(const struct perf_event *event) { return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_PRIV_SAVE; } static inline bool branch_sample_counters(const struct perf_event *event) { return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_COUNTERS; } static inline bool branch_sample_call_stack(const struct perf_event *event) { return event->attr.branch_sample_type & 
PERF_SAMPLE_BRANCH_CALL_STACK; } struct perf_sample_data { /* * Fields set by perf_sample_data_init() unconditionally, * group so as to minimize the cachelines touched. */ u64 sample_flags; u64 period; u64 dyn_size; /* * Fields commonly set by __perf_event_header__init_id(), * group so as to minimize the cachelines touched. */ u64 type; struct { u32 pid; u32 tid; } tid_entry; u64 time; u64 id; struct { u32 cpu; u32 reserved; } cpu_entry; /* * The other fields, optionally {set,used} by * perf_{prepare,output}_sample(). */ u64 ip; struct perf_callchain_entry *callchain; struct perf_raw_record *raw; struct perf_branch_stack *br_stack; u64 *br_stack_cntr; union perf_sample_weight weight; union perf_mem_data_src data_src; u64 txn; struct perf_regs regs_user; struct perf_regs regs_intr; u64 stack_user_size; u64 stream_id; u64 cgroup; u64 addr; u64 phys_addr; u64 data_page_size; u64 code_page_size; u64 aux_size; } ____cacheline_aligned; /* default value for data source */ #define PERF_MEM_NA (PERF_MEM_S(OP, NA) |\ PERF_MEM_S(LVL, NA) |\ PERF_MEM_S(SNOOP, NA) |\ PERF_MEM_S(LOCK, NA) |\ PERF_MEM_S(TLB, NA) |\ PERF_MEM_S(LVLNUM, NA)) static inline void perf_sample_data_init(struct perf_sample_data *data, u64 addr, u64 period) { /* remaining struct members initialized in perf_prepare_sample() */ data->sample_flags = PERF_SAMPLE_PERIOD; data->period = period; data->dyn_size = 0; if (addr) { data->addr = addr; data->sample_flags |= PERF_SAMPLE_ADDR; } } static inline void perf_sample_save_callchain(struct perf_sample_data *data, struct perf_event *event, struct pt_regs *regs) { int size = 1; if (!(event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)) return; if (WARN_ON_ONCE(data->sample_flags & PERF_SAMPLE_CALLCHAIN)) return; data->callchain = perf_callchain(event, regs); size += data->callchain->nr; data->dyn_size += size * sizeof(u64); data->sample_flags |= PERF_SAMPLE_CALLCHAIN; } static inline void perf_sample_save_raw_data(struct perf_sample_data *data, struct perf_event *event, struct perf_raw_record *raw) { struct perf_raw_frag *frag = &raw->frag; u32 sum = 0; int size; if (!(event->attr.sample_type & PERF_SAMPLE_RAW)) return; if (WARN_ON_ONCE(data->sample_flags & PERF_SAMPLE_RAW)) return; do { sum += frag->size; if (perf_raw_frag_last(frag)) break; frag = frag->next; } while (1); size = round_up(sum + sizeof(u32), sizeof(u64)); raw->size = size - sizeof(u32); frag->pad = raw->size - sum; data->raw = raw; data->dyn_size += size; data->sample_flags |= PERF_SAMPLE_RAW; } static inline bool has_branch_stack(struct perf_event *event) { return event->attr.sample_type & PERF_SAMPLE_BRANCH_STACK; } static inline void perf_sample_save_brstack(struct perf_sample_data *data, struct perf_event *event, struct perf_branch_stack *brs, u64 *brs_cntr) { int size = sizeof(u64); /* nr */ if (!has_branch_stack(event)) return; if (WARN_ON_ONCE(data->sample_flags & PERF_SAMPLE_BRANCH_STACK)) return; if (branch_sample_hw_index(event)) size += sizeof(u64); size += brs->nr * sizeof(struct perf_branch_entry); /* * The extension space for counters is appended after the * struct perf_branch_stack. It is used to store the occurrences * of events of each branch. 
*/ if (brs_cntr) size += brs->nr * sizeof(u64); data->br_stack = brs; data->br_stack_cntr = brs_cntr; data->dyn_size += size; data->sample_flags |= PERF_SAMPLE_BRANCH_STACK; } static inline u32 perf_sample_data_size(struct perf_sample_data *data, struct perf_event *event) { u32 size = sizeof(struct perf_event_header); size += event->header_size + event->id_header_size; size += data->dyn_size; return size; } /* * Clear all bitfields in the perf_branch_entry. * The to and from fields are not cleared because they are * systematically modified by caller. */ static inline void perf_clear_branch_entry_bitfields(struct perf_branch_entry *br) { br->mispred = 0; br->predicted = 0; br->in_tx = 0; br->abort = 0; br->cycles = 0; br->type = 0; br->spec = PERF_BR_SPEC_NA; br->reserved = 0; } extern void perf_output_sample(struct perf_output_handle *handle, struct perf_event_header *header, struct perf_sample_data *data, struct perf_event *event); extern void perf_prepare_sample(struct perf_sample_data *data, struct perf_event *event, struct pt_regs *regs); extern void perf_prepare_header(struct perf_event_header *header, struct perf_sample_data *data, struct perf_event *event, struct pt_regs *regs); extern int perf_event_overflow(struct perf_event *event, struct perf_sample_data *data, struct pt_regs *regs); extern void perf_event_output_forward(struct perf_event *event, struct perf_sample_data *data, struct pt_regs *regs); extern void perf_event_output_backward(struct perf_event *event, struct perf_sample_data *data, struct pt_regs *regs); extern int perf_event_output(struct perf_event *event, struct perf_sample_data *data, struct pt_regs *regs); static inline bool is_default_overflow_handler(struct perf_event *event) { perf_overflow_handler_t overflow_handler = event->overflow_handler; if (likely(overflow_handler == perf_event_output_forward)) return true; if (unlikely(overflow_handler == perf_event_output_backward)) return true; return false; } extern void perf_event_header__init_id(struct perf_event_header *header, struct perf_sample_data *data, struct perf_event *event); extern void perf_event__output_id_sample(struct perf_event *event, struct perf_output_handle *handle, struct perf_sample_data *sample); extern void perf_log_lost_samples(struct perf_event *event, u64 lost); static inline bool event_has_any_exclude_flag(struct perf_event *event) { struct perf_event_attr *attr = &event->attr; return attr->exclude_idle || attr->exclude_user || attr->exclude_kernel || attr->exclude_hv || attr->exclude_guest || attr->exclude_host; } static inline bool is_sampling_event(struct perf_event *event) { return event->attr.sample_period != 0; } /* * Return 1 for a software event, 0 for a hardware event */ static inline int is_software_event(struct perf_event *event) { return event->event_caps & PERF_EV_CAP_SOFTWARE; } /* * Return 1 for event in sw context, 0 for event in hw context */ static inline int in_software_context(struct perf_event *event) { return event->pmu_ctx->pmu->task_ctx_nr == perf_sw_context; } static inline int is_exclusive_pmu(struct pmu *pmu) { return pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE; } extern struct static_key perf_swevent_enabled[PERF_COUNT_SW_MAX]; extern void ___perf_sw_event(u32, u64, struct pt_regs *, u64); extern void __perf_sw_event(u32, u64, struct pt_regs *, u64); #ifndef perf_arch_fetch_caller_regs static inline void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip) { } #endif /* * When generating a perf sample in-line, instead of from an interrupt / * 
exception, we lack a pt_regs. This is typically used from software events * like: SW_CONTEXT_SWITCHES, SW_MIGRATIONS and the tie-in with tracepoints. * * We typically don't need a full set, but (for x86) do require: * - ip for PERF_SAMPLE_IP * - cs for user_mode() tests * - sp for PERF_SAMPLE_CALLCHAIN * - eflags for MISC bits and CALLCHAIN (see: perf_hw_regs()) * * NOTE: assumes @regs is otherwise already 0 filled; this is important for * things like PERF_SAMPLE_REGS_INTR. */ static inline void perf_fetch_caller_regs(struct pt_regs *regs) { perf_arch_fetch_caller_regs(regs, CALLER_ADDR0); } static __always_inline void perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr) { if (static_key_false(&perf_swevent_enabled[event_id])) __perf_sw_event(event_id, nr, regs, addr); } DECLARE_PER_CPU(struct pt_regs, __perf_regs[4]); /* * 'Special' version for the scheduler, it hard assumes no recursion, * which is guaranteed by us not actually scheduling inside other swevents * because those disable preemption. */ static __always_inline void __perf_sw_event_sched(u32 event_id, u64 nr, u64 addr) { struct pt_regs *regs = this_cpu_ptr(&__perf_regs[0]); perf_fetch_caller_regs(regs); ___perf_sw_event(event_id, nr, regs, addr); } extern struct static_key_false perf_sched_events; static __always_inline bool __perf_sw_enabled(int swevt) { return static_key_false(&perf_swevent_enabled[swevt]); } static inline void perf_event_task_migrate(struct task_struct *task) { if (__perf_sw_enabled(PERF_COUNT_SW_CPU_MIGRATIONS)) task->sched_migrated = 1; } static inline void perf_event_task_sched_in(struct task_struct *prev, struct task_struct *task) { if (static_branch_unlikely(&perf_sched_events)) __perf_event_task_sched_in(prev, task); if (__perf_sw_enabled(PERF_COUNT_SW_CPU_MIGRATIONS) && task->sched_migrated) { __perf_sw_event_sched(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 0); task->sched_migrated = 0; } } static inline void perf_event_task_sched_out(struct task_struct *prev, struct task_struct *next) { if (__perf_sw_enabled(PERF_COUNT_SW_CONTEXT_SWITCHES)) __perf_sw_event_sched(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 0); #ifdef CONFIG_CGROUP_PERF if (__perf_sw_enabled(PERF_COUNT_SW_CGROUP_SWITCHES) && perf_cgroup_from_task(prev, NULL) != perf_cgroup_from_task(next, NULL)) __perf_sw_event_sched(PERF_COUNT_SW_CGROUP_SWITCHES, 1, 0); #endif if (static_branch_unlikely(&perf_sched_events)) __perf_event_task_sched_out(prev, next); } extern void perf_event_mmap(struct vm_area_struct *vma); extern void perf_event_ksymbol(u16 ksym_type, u64 addr, u32 len, bool unregister, const char *sym); extern void perf_event_bpf_event(struct bpf_prog *prog, enum perf_bpf_event_type type, u16 flags); #ifdef CONFIG_GUEST_PERF_EVENTS extern struct perf_guest_info_callbacks __rcu *perf_guest_cbs; DECLARE_STATIC_CALL(__perf_guest_state, *perf_guest_cbs->state); DECLARE_STATIC_CALL(__perf_guest_get_ip, *perf_guest_cbs->get_ip); DECLARE_STATIC_CALL(__perf_guest_handle_intel_pt_intr, *perf_guest_cbs->handle_intel_pt_intr); static inline unsigned int perf_guest_state(void) { return static_call(__perf_guest_state)(); } static inline unsigned long perf_guest_get_ip(void) { return static_call(__perf_guest_get_ip)(); } static inline unsigned int perf_guest_handle_intel_pt_intr(void) { return static_call(__perf_guest_handle_intel_pt_intr)(); } extern void perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs); extern void perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs); #else static inline unsigned int 
perf_guest_state(void) { return 0; } static inline unsigned long perf_guest_get_ip(void) { return 0; } static inline unsigned int perf_guest_handle_intel_pt_intr(void) { return 0; } #endif /* CONFIG_GUEST_PERF_EVENTS */ extern void perf_event_exec(void); extern void perf_event_comm(struct task_struct *tsk, bool exec); extern void perf_event_namespaces(struct task_struct *tsk); extern void perf_event_fork(struct task_struct *tsk); extern void perf_event_text_poke(const void *addr, const void *old_bytes, size_t old_len, const void *new_bytes, size_t new_len); /* Callchains */ DECLARE_PER_CPU(struct perf_callchain_entry, perf_callchain_entry); extern void perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs); extern void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs); extern struct perf_callchain_entry * get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user, u32 max_stack, bool crosstask, bool add_mark); extern int get_callchain_buffers(int max_stack); extern void put_callchain_buffers(void); extern struct perf_callchain_entry *get_callchain_entry(int *rctx); extern void put_callchain_entry(int rctx); extern int sysctl_perf_event_max_stack; extern int sysctl_perf_event_max_contexts_per_stack; static inline int perf_callchain_store_context(struct perf_callchain_entry_ctx *ctx, u64 ip) { if (ctx->contexts < sysctl_perf_event_max_contexts_per_stack) { struct perf_callchain_entry *entry = ctx->entry; entry->ip[entry->nr++] = ip; ++ctx->contexts; return 0; } else { ctx->contexts_maxed = true; return -1; /* no more room, stop walking the stack */ } } static inline int perf_callchain_store(struct perf_callchain_entry_ctx *ctx, u64 ip) { if (ctx->nr < ctx->max_stack && !ctx->contexts_maxed) { struct perf_callchain_entry *entry = ctx->entry; entry->ip[entry->nr++] = ip; ++ctx->nr; return 0; } else { return -1; /* no more room, stop walking the stack */ } } extern int sysctl_perf_event_paranoid; extern int sysctl_perf_event_mlock; extern int sysctl_perf_event_sample_rate; extern int sysctl_perf_cpu_time_max_percent; extern void perf_sample_event_took(u64 sample_len_ns); int perf_event_max_sample_rate_handler(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); int perf_cpu_time_max_percent_handler(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); int perf_event_max_stack_handler(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); /* Access to perf_event_open(2) syscall. */ #define PERF_SECURITY_OPEN 0 /* Finer grained perf_event_open(2) access control. 
*/ #define PERF_SECURITY_CPU 1 #define PERF_SECURITY_KERNEL 2 #define PERF_SECURITY_TRACEPOINT 3 static inline int perf_is_paranoid(void) { return sysctl_perf_event_paranoid > -1; } int perf_allow_kernel(struct perf_event_attr *attr); static inline int perf_allow_cpu(struct perf_event_attr *attr) { if (sysctl_perf_event_paranoid > 0 && !perfmon_capable()) return -EACCES; return security_perf_event_open(attr, PERF_SECURITY_CPU); } static inline int perf_allow_tracepoint(struct perf_event_attr *attr) { if (sysctl_perf_event_paranoid > -1 && !perfmon_capable()) return -EPERM; return security_perf_event_open(attr, PERF_SECURITY_TRACEPOINT); } extern int perf_exclude_event(struct perf_event *event, struct pt_regs *regs); extern void perf_event_init(void); extern void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size, struct pt_regs *regs, struct hlist_head *head, int rctx, struct task_struct *task); extern void perf_bp_event(struct perf_event *event, void *data); extern unsigned long perf_misc_flags(struct perf_event *event, struct pt_regs *regs); extern unsigned long perf_instruction_pointer(struct perf_event *event, struct pt_regs *regs); #ifndef perf_arch_misc_flags # define perf_arch_misc_flags(regs) \ (user_mode(regs) ? PERF_RECORD_MISC_USER : PERF_RECORD_MISC_KERNEL) # define perf_arch_instruction_pointer(regs) instruction_pointer(regs) #endif #ifndef perf_arch_bpf_user_pt_regs # define perf_arch_bpf_user_pt_regs(regs) regs #endif #ifndef perf_arch_guest_misc_flags static inline unsigned long perf_arch_guest_misc_flags(struct pt_regs *regs) { unsigned long guest_state = perf_guest_state(); if (!(guest_state & PERF_GUEST_ACTIVE)) return 0; if (guest_state & PERF_GUEST_USER) return PERF_RECORD_MISC_GUEST_USER; else return PERF_RECORD_MISC_GUEST_KERNEL; } # define perf_arch_guest_misc_flags(regs) perf_arch_guest_misc_flags(regs) #endif static inline bool needs_branch_stack(struct perf_event *event) { return event->attr.branch_sample_type != 0; } static inline bool has_aux(struct perf_event *event) { return event->pmu->setup_aux; } static inline bool has_aux_action(struct perf_event *event) { return event->attr.aux_sample_size || event->attr.aux_pause || event->attr.aux_resume; } static inline bool is_write_backward(struct perf_event *event) { return !!event->attr.write_backward; } static inline bool has_addr_filter(struct perf_event *event) { return event->pmu->nr_addr_filters; } /* * An inherited event uses parent's filters */ static inline struct perf_addr_filters_head * perf_event_addr_filters(struct perf_event *event) { struct perf_addr_filters_head *ifh = &event->addr_filters; if (event->parent) ifh = &event->parent->addr_filters; return ifh; } static inline struct fasync_struct **perf_event_fasync(struct perf_event *event) { /* Only the parent has fasync state */ if (event->parent) event = event->parent; return &event->fasync; } extern void perf_event_addr_filters_sync(struct perf_event *event); extern void perf_report_aux_output_id(struct perf_event *event, u64 hw_id); extern int perf_output_begin(struct perf_output_handle *handle, struct perf_sample_data *data, struct perf_event *event, unsigned int size); extern int perf_output_begin_forward(struct perf_output_handle *handle, struct perf_sample_data *data, struct perf_event *event, unsigned int size); extern int perf_output_begin_backward(struct perf_output_handle *handle, struct perf_sample_data *data, struct perf_event *event, unsigned int size); extern void perf_output_end(struct perf_output_handle *handle); 
extern unsigned int perf_output_copy(struct perf_output_handle *handle, const void *buf, unsigned int len); extern unsigned int perf_output_skip(struct perf_output_handle *handle, unsigned int len); extern long perf_output_copy_aux(struct perf_output_handle *aux_handle, struct perf_output_handle *handle, unsigned long from, unsigned long to); extern int perf_swevent_get_recursion_context(void); extern void perf_swevent_put_recursion_context(int rctx); extern u64 perf_swevent_set_period(struct perf_event *event); extern void perf_event_enable(struct perf_event *event); extern void perf_event_disable(struct perf_event *event); extern void perf_event_disable_local(struct perf_event *event); extern void perf_event_disable_inatomic(struct perf_event *event); extern void perf_event_task_tick(void); extern int perf_event_account_interrupt(struct perf_event *event); extern int perf_event_period(struct perf_event *event, u64 value); extern u64 perf_event_pause(struct perf_event *event, bool reset); #else /* !CONFIG_PERF_EVENTS: */ static inline void * perf_aux_output_begin(struct perf_output_handle *handle, struct perf_event *event) { return NULL; } static inline void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size) { } static inline int perf_aux_output_skip(struct perf_output_handle *handle, unsigned long size) { return -EINVAL; } static inline void * perf_get_aux(struct perf_output_handle *handle) { return NULL; } static inline void perf_event_task_migrate(struct task_struct *task) { } static inline void perf_event_task_sched_in(struct task_struct *prev, struct task_struct *task) { } static inline void perf_event_task_sched_out(struct task_struct *prev, struct task_struct *next) { } static inline int perf_event_init_task(struct task_struct *child, u64 clone_flags) { return 0; } static inline void perf_event_exit_task(struct task_struct *child) { } static inline void perf_event_free_task(struct task_struct *task) { } static inline void perf_event_delayed_put(struct task_struct *task) { } static inline struct file *perf_event_get(unsigned int fd) { return ERR_PTR(-EINVAL); } static inline const struct perf_event *perf_get_event(struct file *file) { return ERR_PTR(-EINVAL); } static inline const struct perf_event_attr *perf_event_attrs(struct perf_event *event) { return ERR_PTR(-EINVAL); } static inline int perf_event_read_local(struct perf_event *event, u64 *value, u64 *enabled, u64 *running) { return -EINVAL; } static inline void perf_event_print_debug(void) { } static inline int perf_event_task_disable(void) { return -EINVAL; } static inline int perf_event_task_enable(void) { return -EINVAL; } static inline int perf_event_refresh(struct perf_event *event, int refresh) { return -EINVAL; } static inline void perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr) { } static inline void perf_bp_event(struct perf_event *event, void *data) { } static inline void perf_event_mmap(struct vm_area_struct *vma) { } typedef int (perf_ksymbol_get_name_f)(char *name, int name_len, void *data); static inline void perf_event_ksymbol(u16 ksym_type, u64 addr, u32 len, bool unregister, const char *sym) { } static inline void perf_event_bpf_event(struct bpf_prog *prog, enum perf_bpf_event_type type, u16 flags) { } static inline void perf_event_exec(void) { } static inline void perf_event_comm(struct task_struct *tsk, bool exec) { } static inline void perf_event_namespaces(struct task_struct *tsk) { } static inline void perf_event_fork(struct task_struct *tsk) { } static inline 
void perf_event_text_poke(const void *addr, const void *old_bytes, size_t old_len, const void *new_bytes, size_t new_len) { } static inline void perf_event_init(void) { } static inline int perf_swevent_get_recursion_context(void) { return -1; } static inline void perf_swevent_put_recursion_context(int rctx) { } static inline u64 perf_swevent_set_period(struct perf_event *event) { return 0; } static inline void perf_event_enable(struct perf_event *event) { } static inline void perf_event_disable(struct perf_event *event) { } static inline int __perf_event_disable(void *info) { return -1; } static inline void perf_event_task_tick(void) { } static inline int perf_event_release_kernel(struct perf_event *event) { return 0; } static inline int perf_event_period(struct perf_event *event, u64 value) { return -EINVAL; } static inline u64 perf_event_pause(struct perf_event *event, bool reset) { return 0; } static inline int perf_exclude_event(struct perf_event *event, struct pt_regs *regs) { return 0; } #endif #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_INTEL) extern void perf_restore_debug_store(void); #else static inline void perf_restore_debug_store(void) { } #endif #define perf_output_put(handle, x) perf_output_copy((handle), &(x), sizeof(x)) struct perf_pmu_events_attr { struct device_attribute attr; u64 id; const char *event_str; }; struct perf_pmu_events_ht_attr { struct device_attribute attr; u64 id; const char *event_str_ht; const char *event_str_noht; }; struct perf_pmu_events_hybrid_attr { struct device_attribute attr; u64 id; const char *event_str; u64 pmu_type; }; struct perf_pmu_format_hybrid_attr { struct device_attribute attr; u64 pmu_type; }; ssize_t perf_event_sysfs_show(struct device *dev, struct device_attribute *attr, char *page); #define PMU_EVENT_ATTR(_name, _var, _id, _show) \ static struct perf_pmu_events_attr _var = { \ .attr = __ATTR(_name, 0444, _show, NULL), \ .id = _id, \ }; #define PMU_EVENT_ATTR_STRING(_name, _var, _str) \ static struct perf_pmu_events_attr _var = { \ .attr = __ATTR(_name, 0444, perf_event_sysfs_show, NULL), \ .id = 0, \ .event_str = _str, \ }; #define PMU_EVENT_ATTR_ID(_name, _show, _id) \ (&((struct perf_pmu_events_attr[]) { \ { .attr = __ATTR(_name, 0444, _show, NULL), \ .id = _id, } \ })[0].attr.attr) #define PMU_FORMAT_ATTR_SHOW(_name, _format) \ static ssize_t \ _name##_show(struct device *dev, \ struct device_attribute *attr, \ char *page) \ { \ BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE); \ return sprintf(page, _format "\n"); \ } \ #define PMU_FORMAT_ATTR(_name, _format) \ PMU_FORMAT_ATTR_SHOW(_name, _format) \ \ static struct device_attribute format_attr_##_name = __ATTR_RO(_name) /* Performance counter hotplug functions */ #ifdef CONFIG_PERF_EVENTS int perf_event_init_cpu(unsigned int cpu); int perf_event_exit_cpu(unsigned int cpu); #else #define perf_event_init_cpu NULL #define perf_event_exit_cpu NULL #endif extern void arch_perf_update_userpage(struct perf_event *event, struct perf_event_mmap_page *userpg, u64 now); /* * Snapshot branch stack on software events. * * Branch stack can be very useful in understanding software events. For * example, when a long function, e.g. sys_perf_event_open, returns an * errno, it is not obvious why the function failed. Branch stack could * provide very helpful information in this type of scenarios. * * On software event, it is necessary to stop the hardware branch recorder * fast. Otherwise, the hardware register/buffer will be flushed with * entries of the triggering event. 
Therefore, static call is used to
 * stop the hardware recorder.
 */

/*
 * @cnt is the number of entries allocated for @entries.
 * Returns the number of entries copied into @entries.
 */
typedef int (perf_snapshot_branch_stack_t)(struct perf_branch_entry *entries,
					   unsigned int cnt);
DECLARE_STATIC_CALL(perf_snapshot_branch_stack, perf_snapshot_branch_stack_t);

#ifndef PERF_NEEDS_LOPWR_CB
static inline void perf_lopwr_cb(bool mode) { }
#endif

#endif /* _LINUX_PERF_EVENT_H */
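/*
 * Illustrative sketch (not part of perf_event.h above): one way kernel code
 * might use the in-kernel counter API declared in this header to count CPU
 * cycles on one CPU. The example_*() functions and the cycle_counter
 * variable are hypothetical names; only the perf_event_* calls and the
 * perf_event_attr fields come from the header.
 */
#include <linux/perf_event.h>
#include <linux/err.h>

static struct perf_event *cycle_counter;	/* hypothetical example state */

static int example_start_cycle_counter(int cpu)
{
	struct perf_event_attr attr = {
		.type	= PERF_TYPE_HARDWARE,
		.config	= PERF_COUNT_HW_CPU_CYCLES,
		.size	= sizeof(attr),
		.pinned	= 1,
	};

	/* No overflow handler and no target task: a plain per-CPU counting event. */
	cycle_counter = perf_event_create_kernel_counter(&attr, cpu, NULL,
							 NULL, NULL);
	if (IS_ERR(cycle_counter))
		return PTR_ERR(cycle_counter);

	return 0;
}

static u64 example_read_cycle_counter(void)
{
	u64 enabled, running;

	/* Also reports how long the event was enabled and actually running. */
	return perf_event_read_value(cycle_counter, &enabled, &running);
}

static void example_stop_cycle_counter(void)
{
	perf_event_release_kernel(cycle_counter);
}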
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _MM_PERCPU_INTERNAL_H
#define _MM_PERCPU_INTERNAL_H

#include <linux/types.h>
#include <linux/percpu.h>
#include <linux/memcontrol.h>

/*
 * pcpu_block_md is the metadata block struct.
 * Each chunk's bitmap is split into a number of full blocks.
 * All units are in terms of bits.
 *
 * The scan hint is the largest known contiguous area before the contig hint.
 * It is not necessarily the actual largest contig hint though. There is an
 * invariant that the scan_hint_start > contig_hint_start iff
 * scan_hint == contig_hint. This is necessary because when scanning forward,
 * we don't know if a new contig hint would be better than the current one.
 */
struct pcpu_block_md {
	int	scan_hint;		/* scan hint for block */
	int	scan_hint_start;	/* block relative starting position of the scan hint */
	int	contig_hint;		/* contig hint for block */
	int	contig_hint_start;	/* block relative starting position of the contig hint */
	int	left_free;		/* size of free space along the left side of the block */
	int	right_free;		/* size of free space along the right side of the block */
	int	first_free;		/* block position of first free */
	int	nr_bits;		/* total bits responsible for */
};

struct pcpuobj_ext {
#ifdef CONFIG_MEMCG
	struct obj_cgroup	*cgroup;
#endif
#ifdef CONFIG_MEM_ALLOC_PROFILING
	union codetag_ref	tag;
#endif
};

#if defined(CONFIG_MEMCG) || defined(CONFIG_MEM_ALLOC_PROFILING)
#define NEED_PCPUOBJ_EXT
#endif

struct pcpu_chunk {
#ifdef CONFIG_PERCPU_STATS
	int			nr_alloc;	/* # of allocations */
	size_t			max_alloc_size;	/* largest allocation size */
#endif

	struct list_head	list;		/* linked to pcpu_slot lists */
	int			free_bytes;	/* free bytes in the chunk */
	struct pcpu_block_md	chunk_md;
	unsigned long		*bound_map;	/* boundary map */

	/*
	 * base_addr is the base address of this chunk.
	 * To reduce false sharing, the current layout is optimized so that
	 * base_addr sits in a different cacheline from free_bytes and
	 * chunk_md.
*/ void *base_addr ____cacheline_aligned_in_smp; unsigned long *alloc_map; /* allocation map */ struct pcpu_block_md *md_blocks; /* metadata blocks */ void *data; /* chunk data */ bool immutable; /* no [de]population allowed */ bool isolated; /* isolated from active chunk slots */ int start_offset; /* the overlap with the previous region to have a page aligned base_addr */ int end_offset; /* additional area required to have the region end page aligned */ #ifdef NEED_PCPUOBJ_EXT struct pcpuobj_ext *obj_exts; /* vector of object cgroups */ #endif int nr_pages; /* # of pages served by this chunk */ int nr_populated; /* # of populated pages */ int nr_empty_pop_pages; /* # of empty populated pages */ unsigned long populated[]; /* populated bitmap */ }; static inline bool need_pcpuobj_ext(void) { if (IS_ENABLED(CONFIG_MEM_ALLOC_PROFILING)) return true; if (!mem_cgroup_kmem_disabled()) return true; return false; } extern spinlock_t pcpu_lock; extern struct list_head *pcpu_chunk_lists; extern int pcpu_nr_slots; extern int pcpu_sidelined_slot; extern int pcpu_to_depopulate_slot; extern int pcpu_nr_empty_pop_pages; extern struct pcpu_chunk *pcpu_first_chunk; extern struct pcpu_chunk *pcpu_reserved_chunk; /** * pcpu_chunk_nr_blocks - converts nr_pages to # of md_blocks * @chunk: chunk of interest * * This conversion is from the number of physical pages that the chunk * serves to the number of bitmap blocks used. */ static inline int pcpu_chunk_nr_blocks(struct pcpu_chunk *chunk) { return chunk->nr_pages * PAGE_SIZE / PCPU_BITMAP_BLOCK_SIZE; } /** * pcpu_nr_pages_to_map_bits - converts the pages to size of bitmap * @pages: number of physical pages * * This conversion is from physical pages to the number of bits * required in the bitmap. */ static inline int pcpu_nr_pages_to_map_bits(int pages) { return pages * PAGE_SIZE / PCPU_MIN_ALLOC_SIZE; } /** * pcpu_chunk_map_bits - helper to convert nr_pages to size of bitmap * @chunk: chunk of interest * * This conversion is from the number of physical pages that the chunk * serves to the number of bits in the bitmap. */ static inline int pcpu_chunk_map_bits(struct pcpu_chunk *chunk) { return pcpu_nr_pages_to_map_bits(chunk->nr_pages); } /** * pcpu_obj_full_size - helper to calculate size of each accounted object * @size: size of area to allocate in bytes * * For each accounted object there is an extra space which is used to store * obj_cgroup membership if kmemcg is not disabled. Charge it too. */ static inline size_t pcpu_obj_full_size(size_t size) { size_t extra_size = 0; #ifdef CONFIG_MEMCG if (!mem_cgroup_kmem_disabled()) extra_size += size / PCPU_MIN_ALLOC_SIZE * sizeof(struct obj_cgroup *); #endif return size * num_possible_cpus() + extra_size; } #ifdef CONFIG_PERCPU_STATS #include <linux/spinlock.h> struct percpu_stats { u64 nr_alloc; /* lifetime # of allocations */ u64 nr_dealloc; /* lifetime # of deallocations */ u64 nr_cur_alloc; /* current # of allocations */ u64 nr_max_alloc; /* max # of live allocations */ u32 nr_chunks; /* current # of live chunks */ u32 nr_max_chunks; /* max # of live chunks */ size_t min_alloc_size; /* min allocation size */ size_t max_alloc_size; /* max allocation size */ }; extern struct percpu_stats pcpu_stats; extern struct pcpu_alloc_info pcpu_stats_ai; /* * For debug purposes. We don't care about the flexible array. 
*/ static inline void pcpu_stats_save_ai(const struct pcpu_alloc_info *ai) { memcpy(&pcpu_stats_ai, ai, sizeof(struct pcpu_alloc_info)); /* initialize min_alloc_size to unit_size */ pcpu_stats.min_alloc_size = pcpu_stats_ai.unit_size; } /* * pcpu_stats_area_alloc - increment area allocation stats * @chunk: the location of the area being allocated * @size: size of area to allocate in bytes * * CONTEXT: * pcpu_lock. */ static inline void pcpu_stats_area_alloc(struct pcpu_chunk *chunk, size_t size) { lockdep_assert_held(&pcpu_lock); pcpu_stats.nr_alloc++; pcpu_stats.nr_cur_alloc++; pcpu_stats.nr_max_alloc = max(pcpu_stats.nr_max_alloc, pcpu_stats.nr_cur_alloc); pcpu_stats.min_alloc_size = min(pcpu_stats.min_alloc_size, size); pcpu_stats.max_alloc_size = max(pcpu_stats.max_alloc_size, size); chunk->nr_alloc++; chunk->max_alloc_size = max(chunk->max_alloc_size, size); } /* * pcpu_stats_area_dealloc - decrement allocation stats * @chunk: the location of the area being deallocated * * CONTEXT: * pcpu_lock. */ static inline void pcpu_stats_area_dealloc(struct pcpu_chunk *chunk) { lockdep_assert_held(&pcpu_lock); pcpu_stats.nr_dealloc++; pcpu_stats.nr_cur_alloc--; chunk->nr_alloc--; } /* * pcpu_stats_chunk_alloc - increment chunk stats */ static inline void pcpu_stats_chunk_alloc(void) { unsigned long flags; spin_lock_irqsave(&pcpu_lock, flags); pcpu_stats.nr_chunks++; pcpu_stats.nr_max_chunks = max(pcpu_stats.nr_max_chunks, pcpu_stats.nr_chunks); spin_unlock_irqrestore(&pcpu_lock, flags); } /* * pcpu_stats_chunk_dealloc - decrement chunk stats */ static inline void pcpu_stats_chunk_dealloc(void) { unsigned long flags; spin_lock_irqsave(&pcpu_lock, flags); pcpu_stats.nr_chunks--; spin_unlock_irqrestore(&pcpu_lock, flags); } #else static inline void pcpu_stats_save_ai(const struct pcpu_alloc_info *ai) { } static inline void pcpu_stats_area_alloc(struct pcpu_chunk *chunk, size_t size) { } static inline void pcpu_stats_area_dealloc(struct pcpu_chunk *chunk) { } static inline void pcpu_stats_chunk_alloc(void) { } static inline void pcpu_stats_chunk_dealloc(void) { } #endif /* !CONFIG_PERCPU_STATS */ #endif |
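/*
 * Illustrative sketch (not part of percpu-internal.h above): a hypothetical
 * debugging helper, assumed to live in mm/ next to percpu.c so the internal
 * header is visible, showing how the conversion helpers relate a chunk's
 * page count to its bitmap sizes.
 */
#include <linux/printk.h>
#include "percpu-internal.h"

static void example_dump_chunk_geometry(struct pcpu_chunk *chunk)
{
	/*
	 * pcpu_chunk_map_bits() converts the chunk's page count into the
	 * number of allocation-map bits (PCPU_MIN_ALLOC_SIZE granularity);
	 * pcpu_chunk_nr_blocks() converts it into the number of metadata
	 * blocks, as documented by the kernel-doc above.
	 */
	pr_debug("chunk %p: %d pages, %d map bits, %d md blocks, %d bytes free\n",
		 chunk, chunk->nr_pages,
		 pcpu_chunk_map_bits(chunk),
		 pcpu_chunk_nr_blocks(chunk),
		 chunk->free_bytes);
}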
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Skb ref helpers.
 */

#ifndef _LINUX_SKBUFF_REF_H
#define _LINUX_SKBUFF_REF_H

#include <linux/skbuff.h>

/**
 * __skb_frag_ref - take an additional reference on a paged fragment.
 * @frag: the paged fragment
 *
 * Takes an additional reference on the paged fragment @frag.
 */
static inline void __skb_frag_ref(skb_frag_t *frag)
{
	get_page(skb_frag_page(frag));
}

/**
 * skb_frag_ref - take an additional reference on a paged fragment of an skb.
 * @skb: the buffer
 * @f: the fragment offset.
 *
 * Takes an additional reference on the @f'th paged fragment of @skb.
 */
static inline void skb_frag_ref(struct sk_buff *skb, int f)
{
	__skb_frag_ref(&skb_shinfo(skb)->frags[f]);
}

bool napi_pp_put_page(netmem_ref netmem);

static inline void skb_page_unref(netmem_ref netmem, bool recycle)
{
#ifdef CONFIG_PAGE_POOL
	if (recycle && napi_pp_put_page(netmem))
		return;
#endif
	put_page(netmem_to_page(netmem));
}

/**
 * __skb_frag_unref - release a reference on a paged fragment.
 * @frag: the paged fragment
 * @recycle: recycle the page if allocated via page_pool
 *
 * Releases a reference on the paged fragment @frag
 * or recycles the page via the page_pool API.
 */
static inline void __skb_frag_unref(skb_frag_t *frag, bool recycle)
{
	skb_page_unref(skb_frag_netmem(frag), recycle);
}

/**
 * skb_frag_unref - release a reference on a paged fragment of an skb.
 * @skb: the buffer
 * @f: the fragment offset
 *
 * Releases a reference on the @f'th paged fragment of @skb.
 */
static inline void skb_frag_unref(struct sk_buff *skb, int f)
{
	struct skb_shared_info *shinfo = skb_shinfo(skb);

	if (!skb_zcopy_managed(skb))
		__skb_frag_unref(&shinfo->frags[f], skb->pp_recycle);
}

#endif /* _LINUX_SKBUFF_REF_H */
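/*
 * Illustrative sketch (not part of skbuff_ref.h above): a hypothetical user
 * that keeps an skb's paged fragments alive past the skb itself by taking an
 * extra reference per fragment, and later dropping those references. The
 * example_*() names are made up; the helpers are the ones declared above.
 */
#include <linux/skbuff.h>
#include <linux/skbuff_ref.h>

static void example_hold_all_frags(struct sk_buff *skb)
{
	int f;

	/* One extra page reference per paged fragment. */
	for (f = 0; f < skb_shinfo(skb)->nr_frags; f++)
		skb_frag_ref(skb, f);
}

static void example_release_all_frags(struct sk_buff *skb)
{
	int f;

	/*
	 * Plain put_page() via recycle=false, matching the plain get_page()
	 * taken by skb_frag_ref() above.
	 */
	for (f = 0; f < skb_shinfo(skb)->nr_frags; f++)
		__skb_frag_unref(&skb_shinfo(skb)->frags[f], false);
}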
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 2015 Linaro Ltd.
 * Author: Shannon Zhao <shannon.zhao@linaro.org>
 */

#ifndef __ASM_ARM_KVM_PMU_H
#define __ASM_ARM_KVM_PMU_H

#include <linux/perf_event.h>
#include <linux/perf/arm_pmuv3.h>

#define KVM_ARMV8_PMU_MAX_COUNTERS	32

#if IS_ENABLED(CONFIG_HW_PERF_EVENTS) && IS_ENABLED(CONFIG_KVM)

struct kvm_pmc {
	u8 idx;	/* index into the pmu->pmc array */
	struct perf_event *perf_event;
};

struct kvm_pmu_events {
	u64 events_host;
	u64 events_guest;
};

struct kvm_pmu {
	struct irq_work overflow_work;
	struct kvm_pmu_events events;
	struct kvm_pmc pmc[KVM_ARMV8_PMU_MAX_COUNTERS];
	int irq_num;
	bool created;
	bool irq_level;
};

struct arm_pmu_entry {
	struct list_head entry;
	struct arm_pmu *arm_pmu;
};

DECLARE_STATIC_KEY_FALSE(kvm_arm_pmu_available);

static __always_inline bool kvm_arm_support_pmu_v3(void)
{
	return static_branch_likely(&kvm_arm_pmu_available);
}

#define kvm_arm_pmu_irq_initialized(v)	((v)->arch.pmu.irq_num >= VGIC_NR_SGIS)

u64 kvm_pmu_get_counter_value(struct kvm_vcpu *vcpu, u64 select_idx);
void kvm_pmu_set_counter_value(struct kvm_vcpu *vcpu, u64 select_idx, u64 val);
u64 kvm_pmu_implemented_counter_mask(struct kvm_vcpu *vcpu);
u64 kvm_pmu_accessible_counter_mask(struct kvm_vcpu *vcpu);
u64 kvm_pmu_get_pmceid(struct kvm_vcpu *vcpu, bool pmceid1);
void kvm_pmu_vcpu_init(struct kvm_vcpu *vcpu);
void kvm_pmu_vcpu_reset(struct kvm_vcpu *vcpu);
void kvm_pmu_vcpu_destroy(struct kvm_vcpu *vcpu);
void kvm_pmu_reprogram_counter_mask(struct kvm_vcpu *vcpu, u64 val);
void kvm_pmu_flush_hwstate(struct kvm_vcpu *vcpu);
void kvm_pmu_sync_hwstate(struct kvm_vcpu *vcpu);
bool kvm_pmu_should_notify_user(struct kvm_vcpu *vcpu);
void kvm_pmu_update_run(struct kvm_vcpu *vcpu);
void kvm_pmu_software_increment(struct kvm_vcpu *vcpu, u64 val);
void kvm_pmu_handle_pmcr(struct kvm_vcpu *vcpu, u64 val);
void kvm_pmu_set_counter_event_type(struct kvm_vcpu *vcpu, u64 data,
				    u64 select_idx);
void kvm_vcpu_reload_pmu(struct kvm_vcpu *vcpu);
int kvm_arm_pmu_v3_set_attr(struct kvm_vcpu *vcpu,
			    struct kvm_device_attr *attr);
int kvm_arm_pmu_v3_get_attr(struct kvm_vcpu *vcpu,
			    struct kvm_device_attr *attr);
int kvm_arm_pmu_v3_has_attr(struct kvm_vcpu *vcpu,
			    struct kvm_device_attr *attr);
int kvm_arm_pmu_v3_enable(struct kvm_vcpu *vcpu);

struct kvm_pmu_events *kvm_get_pmu_events(void);
void kvm_vcpu_pmu_restore_guest(struct kvm_vcpu *vcpu);
void kvm_vcpu_pmu_restore_host(struct kvm_vcpu *vcpu);
void kvm_vcpu_pmu_resync_el0(void);

#define kvm_vcpu_has_pmu(vcpu)					\
	(vcpu_has_feature(vcpu, KVM_ARM_VCPU_PMU_V3))

/*
 * Updates the vcpu's view of the pmu events for this cpu.
 * Must be called before every vcpu run after disabling interrupts, to ensure
 * that an interrupt cannot fire and update the structure.
*/ #define kvm_pmu_update_vcpu_events(vcpu) \ do { \ if (!has_vhe() && kvm_arm_support_pmu_v3()) \ vcpu->arch.pmu.events = *kvm_get_pmu_events(); \ } while (0) u8 kvm_arm_pmu_get_pmuver_limit(void); u64 kvm_pmu_evtyper_mask(struct kvm *kvm); int kvm_arm_set_default_pmu(struct kvm *kvm); u8 kvm_arm_pmu_get_max_counters(struct kvm *kvm); u64 kvm_vcpu_read_pmcr(struct kvm_vcpu *vcpu); bool kvm_pmu_counter_is_hyp(struct kvm_vcpu *vcpu, unsigned int idx); void kvm_pmu_nested_transition(struct kvm_vcpu *vcpu); #else struct kvm_pmu { }; static inline bool kvm_arm_support_pmu_v3(void) { return false; } #define kvm_arm_pmu_irq_initialized(v) (false) static inline u64 kvm_pmu_get_counter_value(struct kvm_vcpu *vcpu, u64 select_idx) { return 0; } static inline void kvm_pmu_set_counter_value(struct kvm_vcpu *vcpu, u64 select_idx, u64 val) {} static inline u64 kvm_pmu_implemented_counter_mask(struct kvm_vcpu *vcpu) { return 0; } static inline u64 kvm_pmu_accessible_counter_mask(struct kvm_vcpu *vcpu) { return 0; } static inline void kvm_pmu_vcpu_init(struct kvm_vcpu *vcpu) {} static inline void kvm_pmu_vcpu_reset(struct kvm_vcpu *vcpu) {} static inline void kvm_pmu_vcpu_destroy(struct kvm_vcpu *vcpu) {} static inline void kvm_pmu_reprogram_counter_mask(struct kvm_vcpu *vcpu, u64 val) {} static inline void kvm_pmu_flush_hwstate(struct kvm_vcpu *vcpu) {} static inline void kvm_pmu_sync_hwstate(struct kvm_vcpu *vcpu) {} static inline bool kvm_pmu_should_notify_user(struct kvm_vcpu *vcpu) { return false; } static inline void kvm_pmu_update_run(struct kvm_vcpu *vcpu) {} static inline void kvm_pmu_software_increment(struct kvm_vcpu *vcpu, u64 val) {} static inline void kvm_pmu_handle_pmcr(struct kvm_vcpu *vcpu, u64 val) {} static inline void kvm_pmu_set_counter_event_type(struct kvm_vcpu *vcpu, u64 data, u64 select_idx) {} static inline int kvm_arm_pmu_v3_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) { return -ENXIO; } static inline int kvm_arm_pmu_v3_get_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) { return -ENXIO; } static inline int kvm_arm_pmu_v3_has_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) { return -ENXIO; } static inline int kvm_arm_pmu_v3_enable(struct kvm_vcpu *vcpu) { return 0; } static inline u64 kvm_pmu_get_pmceid(struct kvm_vcpu *vcpu, bool pmceid1) { return 0; } #define kvm_vcpu_has_pmu(vcpu) ({ false; }) static inline void kvm_pmu_update_vcpu_events(struct kvm_vcpu *vcpu) {} static inline void kvm_vcpu_pmu_restore_guest(struct kvm_vcpu *vcpu) {} static inline void kvm_vcpu_pmu_restore_host(struct kvm_vcpu *vcpu) {} static inline void kvm_vcpu_reload_pmu(struct kvm_vcpu *vcpu) {} static inline u8 kvm_arm_pmu_get_pmuver_limit(void) { return 0; } static inline u64 kvm_pmu_evtyper_mask(struct kvm *kvm) { return 0; } static inline void kvm_vcpu_pmu_resync_el0(void) {} static inline int kvm_arm_set_default_pmu(struct kvm *kvm) { return -ENODEV; } static inline u8 kvm_arm_pmu_get_max_counters(struct kvm *kvm) { return 0; } static inline u64 kvm_vcpu_read_pmcr(struct kvm_vcpu *vcpu) { return 0; } static inline bool kvm_pmu_counter_is_hyp(struct kvm_vcpu *vcpu, unsigned int idx) { return false; } static inline void kvm_pmu_nested_transition(struct kvm_vcpu *vcpu) {} #endif #endif |
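/*
 * Illustrative sketch (not part of the header above): roughly how a vCPU run
 * loop could drive the PMU hooks declared here. Because the #else branch
 * provides empty stubs with the same signatures, a caller needs no #ifdefs of
 * its own. The function name is hypothetical, <asm/kvm_host.h> is assumed to
 * pull in these declarations on arm64, and interrupts are assumed to already
 * be disabled at this point, as the comment above
 * kvm_pmu_update_vcpu_events() requires.
 */
#include <asm/kvm_host.h>

static void example_vcpu_pmu_enter_exit(struct kvm_vcpu *vcpu)
{
	/* Refresh this CPU's host/guest event split for the vcpu. */
	kvm_pmu_update_vcpu_events(vcpu);

	/* Let the PMU emulation update its hardware-facing state. */
	kvm_pmu_flush_hwstate(vcpu);

	/* ... enter the guest here ... */

	/* Fold the hardware state back after the guest exits. */
	kvm_pmu_sync_hwstate(vcpu);

	/* Tell userspace if it has PMU-related work to do. */
	if (kvm_pmu_should_notify_user(vcpu))
		kvm_pmu_update_run(vcpu);
}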
/*
 * Copyright IBM Corporation, 2012
 * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of version 2.1 of the GNU Lesser General Public License
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it would be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 *
 */

#ifndef _LINUX_HUGETLB_CGROUP_H
#define _LINUX_HUGETLB_CGROUP_H

#include <linux/mmdebug.h>

struct hugetlb_cgroup;
struct resv_map;
struct file_region;

#ifdef CONFIG_CGROUP_HUGETLB

enum hugetlb_memory_event {
	HUGETLB_MAX,
	HUGETLB_NR_MEMORY_EVENTS,
};

struct hugetlb_cgroup_per_node {
	/* hugetlb usage in pages over all hstates. */
	unsigned long usage[HUGE_MAX_HSTATE];
};

struct hugetlb_cgroup {
	struct cgroup_subsys_state css;

	/*
	 * the counter to account for hugepages from hugetlb.
	 */
	struct page_counter hugepage[HUGE_MAX_HSTATE];

	/*
	 * the counter to account for hugepage reservations from hugetlb.
*/ struct page_counter rsvd_hugepage[HUGE_MAX_HSTATE]; atomic_long_t events[HUGE_MAX_HSTATE][HUGETLB_NR_MEMORY_EVENTS]; atomic_long_t events_local[HUGE_MAX_HSTATE][HUGETLB_NR_MEMORY_EVENTS]; /* Handle for "hugetlb.events" */ struct cgroup_file events_file[HUGE_MAX_HSTATE]; /* Handle for "hugetlb.events.local" */ struct cgroup_file events_local_file[HUGE_MAX_HSTATE]; struct hugetlb_cgroup_per_node *nodeinfo[]; }; static inline struct hugetlb_cgroup * __hugetlb_cgroup_from_folio(struct folio *folio, bool rsvd) { VM_BUG_ON_FOLIO(!folio_test_hugetlb(folio), folio); if (rsvd) return folio->_hugetlb_cgroup_rsvd; else return folio->_hugetlb_cgroup; } static inline struct hugetlb_cgroup *hugetlb_cgroup_from_folio(struct folio *folio) { return __hugetlb_cgroup_from_folio(folio, false); } static inline struct hugetlb_cgroup * hugetlb_cgroup_from_folio_rsvd(struct folio *folio) { return __hugetlb_cgroup_from_folio(folio, true); } static inline void __set_hugetlb_cgroup(struct folio *folio, struct hugetlb_cgroup *h_cg, bool rsvd) { VM_BUG_ON_FOLIO(!folio_test_hugetlb(folio), folio); if (rsvd) folio->_hugetlb_cgroup_rsvd = h_cg; else folio->_hugetlb_cgroup = h_cg; } static inline void set_hugetlb_cgroup(struct folio *folio, struct hugetlb_cgroup *h_cg) { __set_hugetlb_cgroup(folio, h_cg, false); } static inline void set_hugetlb_cgroup_rsvd(struct folio *folio, struct hugetlb_cgroup *h_cg) { __set_hugetlb_cgroup(folio, h_cg, true); } static inline bool hugetlb_cgroup_disabled(void) { return !cgroup_subsys_enabled(hugetlb_cgrp_subsys); } static inline void hugetlb_cgroup_put_rsvd_cgroup(struct hugetlb_cgroup *h_cg) { css_put(&h_cg->css); } static inline void resv_map_dup_hugetlb_cgroup_uncharge_info( struct resv_map *resv_map) { if (resv_map->css) css_get(resv_map->css); } static inline void resv_map_put_hugetlb_cgroup_uncharge_info( struct resv_map *resv_map) { if (resv_map->css) css_put(resv_map->css); } extern int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages, struct hugetlb_cgroup **ptr); extern int hugetlb_cgroup_charge_cgroup_rsvd(int idx, unsigned long nr_pages, struct hugetlb_cgroup **ptr); extern void hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages, struct hugetlb_cgroup *h_cg, struct folio *folio); extern void hugetlb_cgroup_commit_charge_rsvd(int idx, unsigned long nr_pages, struct hugetlb_cgroup *h_cg, struct folio *folio); extern void hugetlb_cgroup_uncharge_folio(int idx, unsigned long nr_pages, struct folio *folio); extern void hugetlb_cgroup_uncharge_folio_rsvd(int idx, unsigned long nr_pages, struct folio *folio); extern void hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages, struct hugetlb_cgroup *h_cg); extern void hugetlb_cgroup_uncharge_cgroup_rsvd(int idx, unsigned long nr_pages, struct hugetlb_cgroup *h_cg); extern void hugetlb_cgroup_uncharge_counter(struct resv_map *resv, unsigned long start, unsigned long end); extern void hugetlb_cgroup_uncharge_file_region(struct resv_map *resv, struct file_region *rg, unsigned long nr_pages, bool region_del); extern void hugetlb_cgroup_file_init(void) __init; extern void hugetlb_cgroup_migrate(struct folio *old_folio, struct folio *new_folio); #else static inline void hugetlb_cgroup_uncharge_file_region(struct resv_map *resv, struct file_region *rg, unsigned long nr_pages, bool region_del) { } static inline struct hugetlb_cgroup *hugetlb_cgroup_from_folio(struct folio *folio) { return NULL; } static inline struct hugetlb_cgroup * hugetlb_cgroup_from_folio_rsvd(struct folio *folio) { return NULL; } 
static inline void set_hugetlb_cgroup(struct folio *folio, struct hugetlb_cgroup *h_cg) { } static inline void set_hugetlb_cgroup_rsvd(struct folio *folio, struct hugetlb_cgroup *h_cg) { } static inline bool hugetlb_cgroup_disabled(void) { return true; } static inline void hugetlb_cgroup_put_rsvd_cgroup(struct hugetlb_cgroup *h_cg) { } static inline void resv_map_dup_hugetlb_cgroup_uncharge_info( struct resv_map *resv_map) { } static inline void resv_map_put_hugetlb_cgroup_uncharge_info( struct resv_map *resv_map) { } static inline int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages, struct hugetlb_cgroup **ptr) { return 0; } static inline int hugetlb_cgroup_charge_cgroup_rsvd(int idx, unsigned long nr_pages, struct hugetlb_cgroup **ptr) { return 0; } static inline void hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages, struct hugetlb_cgroup *h_cg, struct folio *folio) { } static inline void hugetlb_cgroup_commit_charge_rsvd(int idx, unsigned long nr_pages, struct hugetlb_cgroup *h_cg, struct folio *folio) { } static inline void hugetlb_cgroup_uncharge_folio(int idx, unsigned long nr_pages, struct folio *folio) { } static inline void hugetlb_cgroup_uncharge_folio_rsvd(int idx, unsigned long nr_pages, struct folio *folio) { } static inline void hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages, struct hugetlb_cgroup *h_cg) { } static inline void hugetlb_cgroup_uncharge_cgroup_rsvd(int idx, unsigned long nr_pages, struct hugetlb_cgroup *h_cg) { } static inline void hugetlb_cgroup_uncharge_counter(struct resv_map *resv, unsigned long start, unsigned long end) { } static inline void hugetlb_cgroup_file_init(void) { } static inline void hugetlb_cgroup_migrate(struct folio *old_folio, struct folio *new_folio) { } #endif /* CONFIG_MEM_RES_CTLR_HUGETLB */ #endif |
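/*
 * Illustrative sketch (not part of hugetlb_cgroup.h above): the
 * charge/commit/uncharge pairing a hugetlb page allocator is expected to
 * follow. @idx is the hstate index and @nr_pages the number of base pages.
 * example_alloc_charged_folio() and example_alloc_hugetlb_folio() are
 * hypothetical names; the hugetlb_cgroup_* calls come from the header.
 */
#include <linux/mm.h>
#include <linux/hugetlb_cgroup.h>

static struct folio *example_alloc_hugetlb_folio(int idx);	/* hypothetical allocator */

static struct folio *example_alloc_charged_folio(int idx, unsigned long nr_pages)
{
	struct hugetlb_cgroup *h_cg;
	struct folio *folio;

	/* Charge the current task's hugetlb controller up front. */
	if (hugetlb_cgroup_charge_cgroup(idx, nr_pages, &h_cg))
		return NULL;

	folio = example_alloc_hugetlb_folio(idx);
	if (!folio) {
		/* Nothing was committed to a folio, so return the charge. */
		hugetlb_cgroup_uncharge_cgroup(idx, nr_pages, h_cg);
		return NULL;
	}

	/* Bind the charge to the folio; hugetlb_cgroup_uncharge_folio() undoes it. */
	hugetlb_cgroup_commit_charge(idx, nr_pages, h_cg, folio);

	return folio;
}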
2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 | // SPDX-License-Identifier: GPL-2.0+ /* * XArray implementation * Copyright (c) 2017-2018 Microsoft Corporation * Copyright (c) 2018-2020 Oracle * Author: Matthew Wilcox <willy@infradead.org> */ #include <linux/bitmap.h> #include <linux/export.h> #include <linux/list.h> #include <linux/slab.h> #include <linux/xarray.h> #include "radix-tree.h" /* * Coding conventions in this file: * * @xa is used to refer to the entire xarray. * @xas is the 'xarray operation state'. It may be either a pointer to * an xa_state, or an xa_state stored on the stack. This is an unfortunate * ambiguity. * @index is the index of the entry being operated on * @mark is an xa_mark_t; a small number indicating one of the mark bits. * @node refers to an xa_node; usually the primary one being operated on by * this function. * @offset is the index into the slots array inside an xa_node. * @parent refers to the @xa_node closer to the head than @node. * @entry refers to something stored in a slot in the xarray */ static inline unsigned int xa_lock_type(const struct xarray *xa) { return (__force unsigned int)xa->xa_flags & 3; } static inline void xas_lock_type(struct xa_state *xas, unsigned int lock_type) { if (lock_type == XA_LOCK_IRQ) xas_lock_irq(xas); else if (lock_type == XA_LOCK_BH) xas_lock_bh(xas); else xas_lock(xas); } static inline void xas_unlock_type(struct xa_state *xas, unsigned int lock_type) { if (lock_type == XA_LOCK_IRQ) xas_unlock_irq(xas); else if (lock_type == XA_LOCK_BH) xas_unlock_bh(xas); else xas_unlock(xas); } static inline bool xa_track_free(const struct xarray *xa) { return xa->xa_flags & XA_FLAGS_TRACK_FREE; } static inline bool xa_zero_busy(const struct xarray *xa) { return xa->xa_flags & XA_FLAGS_ZERO_BUSY; } static inline void xa_mark_set(struct xarray *xa, xa_mark_t mark) { if (!(xa->xa_flags & XA_FLAGS_MARK(mark))) xa->xa_flags |= XA_FLAGS_MARK(mark); } static inline void xa_mark_clear(struct xarray *xa, xa_mark_t mark) { if (xa->xa_flags & XA_FLAGS_MARK(mark)) xa->xa_flags &= ~(XA_FLAGS_MARK(mark)); } static inline unsigned long *node_marks(struct xa_node *node, xa_mark_t mark) { return node->marks[(__force unsigned)mark]; } static inline bool node_get_mark(struct xa_node *node, unsigned int offset, xa_mark_t mark) { return test_bit(offset, node_marks(node, mark)); } /* returns true if the bit was set */ static inline bool node_set_mark(struct xa_node *node, unsigned int offset, xa_mark_t mark) { return __test_and_set_bit(offset, node_marks(node, mark)); } /* returns true if the bit was set */ static inline bool node_clear_mark(struct xa_node *node, unsigned int offset, xa_mark_t mark) { return __test_and_clear_bit(offset, node_marks(node, mark)); } static inline bool node_any_mark(struct xa_node *node, xa_mark_t mark) { return !bitmap_empty(node_marks(node, mark), XA_CHUNK_SIZE); } static inline void node_mark_all(struct xa_node *node, xa_mark_t mark) { bitmap_fill(node_marks(node, mark), XA_CHUNK_SIZE); } #define mark_inc(mark) do { \ mark = (__force xa_mark_t)((__force unsigned)(mark) + 1); \ } while (0) /* * xas_squash_marks() - Merge all marks to the first entry * @xas: Array operation state. 
* * Set a mark on the first entry if any entry has it set. Clear marks on * all sibling entries. */ static void xas_squash_marks(const struct xa_state *xas) { xa_mark_t mark = 0; unsigned int limit = xas->xa_offset + xas->xa_sibs + 1; for (;;) { unsigned long *marks = node_marks(xas->xa_node, mark); if (find_next_bit(marks, limit, xas->xa_offset + 1) != limit) { __set_bit(xas->xa_offset, marks); bitmap_clear(marks, xas->xa_offset + 1, xas->xa_sibs); } if (mark == XA_MARK_MAX) break; mark_inc(mark); } } /* extracts the offset within this node from the index */ static unsigned int get_offset(unsigned long index, struct xa_node *node) { return (index >> node->shift) & XA_CHUNK_MASK; } static void xas_set_offset(struct xa_state *xas) { xas->xa_offset = get_offset(xas->xa_index, xas->xa_node); } /* move the index either forwards (find) or backwards (sibling slot) */ static void xas_move_index(struct xa_state *xas, unsigned long offset) { unsigned int shift = xas->xa_node->shift; xas->xa_index &= ~XA_CHUNK_MASK << shift; xas->xa_index += offset << shift; } static void xas_next_offset(struct xa_state *xas) { xas->xa_offset++; xas_move_index(xas, xas->xa_offset); } static void *set_bounds(struct xa_state *xas) { xas->xa_node = XAS_BOUNDS; return NULL; } /* * Starts a walk. If the @xas is already valid, we assume that it's on * the right path and just return where we've got to. If we're in an * error state, return NULL. If the index is outside the current scope * of the xarray, return NULL without changing @xas->xa_node. Otherwise * set @xas->xa_node to NULL and return the current head of the array. */ static void *xas_start(struct xa_state *xas) { void *entry; if (xas_valid(xas)) return xas_reload(xas); if (xas_error(xas)) return NULL; entry = xa_head(xas->xa); if (!xa_is_node(entry)) { if (xas->xa_index) return set_bounds(xas); } else { if ((xas->xa_index >> xa_to_node(entry)->shift) > XA_CHUNK_MASK) return set_bounds(xas); } xas->xa_node = NULL; return entry; } static __always_inline void *xas_descend(struct xa_state *xas, struct xa_node *node) { unsigned int offset = get_offset(xas->xa_index, node); void *entry = xa_entry(xas->xa, node, offset); xas->xa_node = node; while (xa_is_sibling(entry)) { offset = xa_to_sibling(entry); entry = xa_entry(xas->xa, node, offset); if (node->shift && xa_is_node(entry)) entry = XA_RETRY_ENTRY; } xas->xa_offset = offset; return entry; } /** * xas_load() - Load an entry from the XArray (advanced). * @xas: XArray operation state. * * Usually walks the @xas to the appropriate state to load the entry * stored at xa_index. However, it will do nothing and return %NULL if * @xas is in an error state. xas_load() will never expand the tree. * * If the xa_state is set up to operate on a multi-index entry, xas_load() * may return %NULL or an internal entry, even if there are entries * present within the range specified by @xas. * * Context: Any context. The caller should hold the xa_lock or the RCU lock. * Return: Usually an entry in the XArray, but see description for exceptions. 
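 *
 * A minimal read-side sketch (editor's illustration, mirroring what xa_load()
 * below does; @xa and @index are the caller's own, not parameters of this
 * function):
 *
 *	XA_STATE(xas, &xa, index);
 *	void *entry;
 *
 *	rcu_read_lock();
 *	do {
 *		entry = xas_load(&xas);
 *	} while (xas_retry(&xas, entry));
 *	rcu_read_unlock();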
*/ void *xas_load(struct xa_state *xas) { void *entry = xas_start(xas); while (xa_is_node(entry)) { struct xa_node *node = xa_to_node(entry); if (xas->xa_shift > node->shift) break; entry = xas_descend(xas, node); if (node->shift == 0) break; } return entry; } EXPORT_SYMBOL_GPL(xas_load); #define XA_RCU_FREE ((struct xarray *)1) static void xa_node_free(struct xa_node *node) { XA_NODE_BUG_ON(node, !list_empty(&node->private_list)); node->array = XA_RCU_FREE; call_rcu(&node->rcu_head, radix_tree_node_rcu_free); } /* * xas_destroy() - Free any resources allocated during the XArray operation. * @xas: XArray operation state. * * Most users will not need to call this function; it is called for you * by xas_nomem(). */ void xas_destroy(struct xa_state *xas) { struct xa_node *next, *node = xas->xa_alloc; while (node) { XA_NODE_BUG_ON(node, !list_empty(&node->private_list)); next = rcu_dereference_raw(node->parent); radix_tree_node_rcu_free(&node->rcu_head); xas->xa_alloc = node = next; } } /** * xas_nomem() - Allocate memory if needed. * @xas: XArray operation state. * @gfp: Memory allocation flags. * * If we need to add new nodes to the XArray, we try to allocate memory * with GFP_NOWAIT while holding the lock, which will usually succeed. * If it fails, @xas is flagged as needing memory to continue. The caller * should drop the lock and call xas_nomem(). If xas_nomem() succeeds, * the caller should retry the operation. * * Forward progress is guaranteed as one node is allocated here and * stored in the xa_state where it will be found by xas_alloc(). More * nodes will likely be found in the slab allocator, but we do not tie * them up here. * * Return: true if memory was needed, and was successfully allocated. */ bool xas_nomem(struct xa_state *xas, gfp_t gfp) { if (xas->xa_node != XA_ERROR(-ENOMEM)) { xas_destroy(xas); return false; } if (xas->xa->xa_flags & XA_FLAGS_ACCOUNT) gfp |= __GFP_ACCOUNT; xas->xa_alloc = kmem_cache_alloc_lru(radix_tree_node_cachep, xas->xa_lru, gfp); if (!xas->xa_alloc) return false; xas->xa_alloc->parent = NULL; XA_NODE_BUG_ON(xas->xa_alloc, !list_empty(&xas->xa_alloc->private_list)); xas->xa_node = XAS_RESTART; return true; } EXPORT_SYMBOL_GPL(xas_nomem); /* * __xas_nomem() - Drop locks and allocate memory if needed. * @xas: XArray operation state. * @gfp: Memory allocation flags. * * Internal variant of xas_nomem(). * * Return: true if memory was needed, and was successfully allocated. 
*/ static bool __xas_nomem(struct xa_state *xas, gfp_t gfp) __must_hold(xas->xa->xa_lock) { unsigned int lock_type = xa_lock_type(xas->xa); if (xas->xa_node != XA_ERROR(-ENOMEM)) { xas_destroy(xas); return false; } if (xas->xa->xa_flags & XA_FLAGS_ACCOUNT) gfp |= __GFP_ACCOUNT; if (gfpflags_allow_blocking(gfp)) { xas_unlock_type(xas, lock_type); xas->xa_alloc = kmem_cache_alloc_lru(radix_tree_node_cachep, xas->xa_lru, gfp); xas_lock_type(xas, lock_type); } else { xas->xa_alloc = kmem_cache_alloc_lru(radix_tree_node_cachep, xas->xa_lru, gfp); } if (!xas->xa_alloc) return false; xas->xa_alloc->parent = NULL; XA_NODE_BUG_ON(xas->xa_alloc, !list_empty(&xas->xa_alloc->private_list)); xas->xa_node = XAS_RESTART; return true; } static void xas_update(struct xa_state *xas, struct xa_node *node) { if (xas->xa_update) xas->xa_update(node); else XA_NODE_BUG_ON(node, !list_empty(&node->private_list)); } static void *xas_alloc(struct xa_state *xas, unsigned int shift) { struct xa_node *parent = xas->xa_node; struct xa_node *node = xas->xa_alloc; if (xas_invalid(xas)) return NULL; if (node) { xas->xa_alloc = NULL; } else { gfp_t gfp = GFP_NOWAIT | __GFP_NOWARN; if (xas->xa->xa_flags & XA_FLAGS_ACCOUNT) gfp |= __GFP_ACCOUNT; node = kmem_cache_alloc_lru(radix_tree_node_cachep, xas->xa_lru, gfp); if (!node) { xas_set_err(xas, -ENOMEM); return NULL; } } if (parent) { node->offset = xas->xa_offset; parent->count++; XA_NODE_BUG_ON(node, parent->count > XA_CHUNK_SIZE); xas_update(xas, parent); } XA_NODE_BUG_ON(node, shift > BITS_PER_LONG); XA_NODE_BUG_ON(node, !list_empty(&node->private_list)); node->shift = shift; node->count = 0; node->nr_values = 0; RCU_INIT_POINTER(node->parent, xas->xa_node); node->array = xas->xa; return node; } #ifdef CONFIG_XARRAY_MULTI /* Returns the number of indices covered by a given xa_state */ static unsigned long xas_size(const struct xa_state *xas) { return (xas->xa_sibs + 1UL) << xas->xa_shift; } #endif /* * Use this to calculate the maximum index that will need to be created * in order to add the entry described by @xas. Because we cannot store a * multi-index entry at index 0, the calculation is a little more complex * than you might expect. */ static unsigned long xas_max(struct xa_state *xas) { unsigned long max = xas->xa_index; #ifdef CONFIG_XARRAY_MULTI if (xas->xa_shift || xas->xa_sibs) { unsigned long mask = xas_size(xas) - 1; max |= mask; if (mask == max) max++; } #endif return max; } /* The maximum index that can be contained in the array without expanding it */ static unsigned long max_index(void *entry) { if (!xa_is_node(entry)) return 0; return (XA_CHUNK_SIZE << xa_to_node(entry)->shift) - 1; } static inline void *xa_zero_to_null(void *entry) { return xa_is_zero(entry) ? 
NULL : entry; } static void xas_shrink(struct xa_state *xas) { struct xarray *xa = xas->xa; struct xa_node *node = xas->xa_node; for (;;) { void *entry; XA_NODE_BUG_ON(node, node->count > XA_CHUNK_SIZE); if (node->count != 1) break; entry = xa_entry_locked(xa, node, 0); if (!entry) break; if (!xa_is_node(entry) && node->shift) break; if (xa_zero_busy(xa)) entry = xa_zero_to_null(entry); xas->xa_node = XAS_BOUNDS; RCU_INIT_POINTER(xa->xa_head, entry); if (xa_track_free(xa) && !node_get_mark(node, 0, XA_FREE_MARK)) xa_mark_clear(xa, XA_FREE_MARK); node->count = 0; node->nr_values = 0; if (!xa_is_node(entry)) RCU_INIT_POINTER(node->slots[0], XA_RETRY_ENTRY); xas_update(xas, node); xa_node_free(node); if (!xa_is_node(entry)) break; node = xa_to_node(entry); node->parent = NULL; } } /* * xas_delete_node() - Attempt to delete an xa_node * @xas: Array operation state. * * Attempts to delete the @xas->xa_node. This will fail if xa->node has * a non-zero reference count. */ static void xas_delete_node(struct xa_state *xas) { struct xa_node *node = xas->xa_node; for (;;) { struct xa_node *parent; XA_NODE_BUG_ON(node, node->count > XA_CHUNK_SIZE); if (node->count) break; parent = xa_parent_locked(xas->xa, node); xas->xa_node = parent; xas->xa_offset = node->offset; xa_node_free(node); if (!parent) { xas->xa->xa_head = NULL; xas->xa_node = XAS_BOUNDS; return; } parent->slots[xas->xa_offset] = NULL; parent->count--; XA_NODE_BUG_ON(parent, parent->count > XA_CHUNK_SIZE); node = parent; xas_update(xas, node); } if (!node->parent) xas_shrink(xas); } /** * xas_free_nodes() - Free this node and all nodes that it references * @xas: Array operation state. * @top: Node to free * * This node has been removed from the tree. We must now free it and all * of its subnodes. There may be RCU walkers with references into the tree, * so we must replace all entries with retry markers. 
*/ static void xas_free_nodes(struct xa_state *xas, struct xa_node *top) { unsigned int offset = 0; struct xa_node *node = top; for (;;) { void *entry = xa_entry_locked(xas->xa, node, offset); if (node->shift && xa_is_node(entry)) { node = xa_to_node(entry); offset = 0; continue; } if (entry) RCU_INIT_POINTER(node->slots[offset], XA_RETRY_ENTRY); offset++; while (offset == XA_CHUNK_SIZE) { struct xa_node *parent; parent = xa_parent_locked(xas->xa, node); offset = node->offset + 1; node->count = 0; node->nr_values = 0; xas_update(xas, node); xa_node_free(node); if (node == top) return; node = parent; } } } /* * xas_expand adds nodes to the head of the tree until it has reached * sufficient height to be able to contain @xas->xa_index */ static int xas_expand(struct xa_state *xas, void *head) { struct xarray *xa = xas->xa; struct xa_node *node = NULL; unsigned int shift = 0; unsigned long max = xas_max(xas); if (!head) { if (max == 0) return 0; while ((max >> shift) >= XA_CHUNK_SIZE) shift += XA_CHUNK_SHIFT; return shift + XA_CHUNK_SHIFT; } else if (xa_is_node(head)) { node = xa_to_node(head); shift = node->shift + XA_CHUNK_SHIFT; } xas->xa_node = NULL; while (max > max_index(head)) { xa_mark_t mark = 0; XA_NODE_BUG_ON(node, shift > BITS_PER_LONG); node = xas_alloc(xas, shift); if (!node) return -ENOMEM; node->count = 1; if (xa_is_value(head)) node->nr_values = 1; RCU_INIT_POINTER(node->slots[0], head); /* Propagate the aggregated mark info to the new child */ for (;;) { if (xa_track_free(xa) && mark == XA_FREE_MARK) { node_mark_all(node, XA_FREE_MARK); if (!xa_marked(xa, XA_FREE_MARK)) { node_clear_mark(node, 0, XA_FREE_MARK); xa_mark_set(xa, XA_FREE_MARK); } } else if (xa_marked(xa, mark)) { node_set_mark(node, 0, mark); } if (mark == XA_MARK_MAX) break; mark_inc(mark); } /* * Now that the new node is fully initialised, we can add * it to the tree */ if (xa_is_node(head)) { xa_to_node(head)->offset = 0; rcu_assign_pointer(xa_to_node(head)->parent, node); } head = xa_mk_node(node); rcu_assign_pointer(xa->xa_head, head); xas_update(xas, node); shift += XA_CHUNK_SHIFT; } xas->xa_node = node; return shift; } /* * xas_create() - Create a slot to store an entry in. * @xas: XArray operation state. * @allow_root: %true if we can store the entry in the root directly * * Most users will not need to call this function directly, as it is called * by xas_store(). It is useful for doing conditional store operations * (see the xa_cmpxchg() implementation for an example). * * Return: If the slot already existed, returns the contents of this slot. * If the slot was newly created, returns %NULL. If it failed to create the * slot, returns %NULL and indicates the error in @xas. 
*/ static void *xas_create(struct xa_state *xas, bool allow_root) { struct xarray *xa = xas->xa; void *entry; void __rcu **slot; struct xa_node *node = xas->xa_node; int shift; unsigned int order = xas->xa_shift; if (xas_top(node)) { entry = xa_head_locked(xa); xas->xa_node = NULL; if (!entry && xa_zero_busy(xa)) entry = XA_ZERO_ENTRY; shift = xas_expand(xas, entry); if (shift < 0) return NULL; if (!shift && !allow_root) shift = XA_CHUNK_SHIFT; entry = xa_head_locked(xa); slot = &xa->xa_head; } else if (xas_error(xas)) { return NULL; } else if (node) { unsigned int offset = xas->xa_offset; shift = node->shift; entry = xa_entry_locked(xa, node, offset); slot = &node->slots[offset]; } else { shift = 0; entry = xa_head_locked(xa); slot = &xa->xa_head; } while (shift > order) { shift -= XA_CHUNK_SHIFT; if (!entry) { node = xas_alloc(xas, shift); if (!node) break; if (xa_track_free(xa)) node_mark_all(node, XA_FREE_MARK); rcu_assign_pointer(*slot, xa_mk_node(node)); } else if (xa_is_node(entry)) { node = xa_to_node(entry); } else { break; } entry = xas_descend(xas, node); slot = &node->slots[xas->xa_offset]; } return entry; } /** * xas_create_range() - Ensure that stores to this range will succeed * @xas: XArray operation state. * * Creates all of the slots in the range covered by @xas. Sets @xas to * create single-index entries and positions it at the beginning of the * range. This is for the benefit of users which have not yet been * converted to use multi-index entries. */ void xas_create_range(struct xa_state *xas) { unsigned long index = xas->xa_index; unsigned char shift = xas->xa_shift; unsigned char sibs = xas->xa_sibs; xas->xa_index |= ((sibs + 1UL) << shift) - 1; if (xas_is_node(xas) && xas->xa_node->shift == xas->xa_shift) xas->xa_offset |= sibs; xas->xa_shift = 0; xas->xa_sibs = 0; for (;;) { xas_create(xas, true); if (xas_error(xas)) goto restore; if (xas->xa_index <= (index | XA_CHUNK_MASK)) goto success; xas->xa_index -= XA_CHUNK_SIZE; for (;;) { struct xa_node *node = xas->xa_node; if (node->shift >= shift) break; xas->xa_node = xa_parent_locked(xas->xa, node); xas->xa_offset = node->offset - 1; if (node->offset != 0) break; } } restore: xas->xa_shift = shift; xas->xa_sibs = sibs; xas->xa_index = index; return; success: xas->xa_index = index; if (xas->xa_node) xas_set_offset(xas); } EXPORT_SYMBOL_GPL(xas_create_range); static void update_node(struct xa_state *xas, struct xa_node *node, int count, int values) { if (!node || (!count && !values)) return; node->count += count; node->nr_values += values; XA_NODE_BUG_ON(node, node->count > XA_CHUNK_SIZE); XA_NODE_BUG_ON(node, node->nr_values > XA_CHUNK_SIZE); xas_update(xas, node); if (count < 0) xas_delete_node(xas); } /** * xas_store() - Store this entry in the XArray. * @xas: XArray operation state. * @entry: New entry. * * If @xas is operating on a multi-index entry, the entry returned by this * function is essentially meaningless (it may be an internal entry or it * may be %NULL, even if there are non-NULL entries at some of the indices * covered by the range). This is not a problem for any current users, * and can be changed if needed. * * Return: The old entry at this index. 
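 *
 * A typical store loop looks like this (editor's sketch; @xa, @index and
 * @item are the caller's own). Note that xas_nomem() is called with the
 * lock dropped, as described in its documentation above:
 *
 *	XA_STATE(xas, &xa, index);
 *
 *	do {
 *		xas_lock(&xas);
 *		xas_store(&xas, item);
 *		xas_unlock(&xas);
 *	} while (xas_nomem(&xas, GFP_KERNEL));
 *	return xas_error(&xas);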
*/ void *xas_store(struct xa_state *xas, void *entry) { struct xa_node *node; void __rcu **slot = &xas->xa->xa_head; unsigned int offset, max; int count = 0; int values = 0; void *first, *next; bool value = xa_is_value(entry); if (entry) { bool allow_root = !xa_is_node(entry) && !xa_is_zero(entry); first = xas_create(xas, allow_root); } else { first = xas_load(xas); } if (xas_invalid(xas)) return first; node = xas->xa_node; if (node && (xas->xa_shift < node->shift)) xas->xa_sibs = 0; if ((first == entry) && !xas->xa_sibs) return first; next = first; offset = xas->xa_offset; max = xas->xa_offset + xas->xa_sibs; if (node) { slot = &node->slots[offset]; if (xas->xa_sibs) xas_squash_marks(xas); } if (!entry) xas_init_marks(xas); for (;;) { /* * Must clear the marks before setting the entry to NULL, * otherwise xas_for_each_marked may find a NULL entry and * stop early. rcu_assign_pointer contains a release barrier * so the mark clearing will appear to happen before the * entry is set to NULL. */ rcu_assign_pointer(*slot, entry); if (xa_is_node(next) && (!node || node->shift)) xas_free_nodes(xas, xa_to_node(next)); if (!node) break; count += !next - !entry; values += !xa_is_value(first) - !value; if (entry) { if (offset == max) break; if (!xa_is_sibling(entry)) entry = xa_mk_sibling(xas->xa_offset); } else { if (offset == XA_CHUNK_MASK) break; } next = xa_entry_locked(xas->xa, node, ++offset); if (!xa_is_sibling(next)) { if (!entry && (offset > max)) break; first = next; } slot++; } update_node(xas, node, count, values); return first; } EXPORT_SYMBOL_GPL(xas_store); /** * xas_get_mark() - Returns the state of this mark. * @xas: XArray operation state. * @mark: Mark number. * * Return: true if the mark is set, false if the mark is clear or @xas * is in an error state. */ bool xas_get_mark(const struct xa_state *xas, xa_mark_t mark) { if (xas_invalid(xas)) return false; if (!xas->xa_node) return xa_marked(xas->xa, mark); return node_get_mark(xas->xa_node, xas->xa_offset, mark); } EXPORT_SYMBOL_GPL(xas_get_mark); /** * xas_set_mark() - Sets the mark on this entry and its parents. * @xas: XArray operation state. * @mark: Mark number. * * Sets the specified mark on this entry, and walks up the tree setting it * on all the ancestor entries. Does nothing if @xas has not been walked to * an entry, or is in an error state. */ void xas_set_mark(const struct xa_state *xas, xa_mark_t mark) { struct xa_node *node = xas->xa_node; unsigned int offset = xas->xa_offset; if (xas_invalid(xas)) return; while (node) { if (node_set_mark(node, offset, mark)) return; offset = node->offset; node = xa_parent_locked(xas->xa, node); } if (!xa_marked(xas->xa, mark)) xa_mark_set(xas->xa, mark); } EXPORT_SYMBOL_GPL(xas_set_mark); /** * xas_clear_mark() - Clears the mark on this entry and its parents. * @xas: XArray operation state. * @mark: Mark number. * * Clears the specified mark on this entry, and walks back to the head * attempting to clear it on all the ancestor entries. Does nothing if * @xas has not been walked to an entry, or is in an error state. 
*/ void xas_clear_mark(const struct xa_state *xas, xa_mark_t mark) { struct xa_node *node = xas->xa_node; unsigned int offset = xas->xa_offset; if (xas_invalid(xas)) return; while (node) { if (!node_clear_mark(node, offset, mark)) return; if (node_any_mark(node, mark)) return; offset = node->offset; node = xa_parent_locked(xas->xa, node); } if (xa_marked(xas->xa, mark)) xa_mark_clear(xas->xa, mark); } EXPORT_SYMBOL_GPL(xas_clear_mark); /** * xas_init_marks() - Initialise all marks for the entry * @xas: Array operations state. * * Initialise all marks for the entry specified by @xas. If we're tracking * free entries with a mark, we need to set it on all entries. All other * marks are cleared. * * This implementation is not as efficient as it could be; we may walk * up the tree multiple times. */ void xas_init_marks(const struct xa_state *xas) { xa_mark_t mark = 0; for (;;) { if (xa_track_free(xas->xa) && mark == XA_FREE_MARK) xas_set_mark(xas, mark); else xas_clear_mark(xas, mark); if (mark == XA_MARK_MAX) break; mark_inc(mark); } } EXPORT_SYMBOL_GPL(xas_init_marks); #ifdef CONFIG_XARRAY_MULTI static unsigned int node_get_marks(struct xa_node *node, unsigned int offset) { unsigned int marks = 0; xa_mark_t mark = XA_MARK_0; for (;;) { if (node_get_mark(node, offset, mark)) marks |= 1 << (__force unsigned int)mark; if (mark == XA_MARK_MAX) break; mark_inc(mark); } return marks; } static inline void node_mark_slots(struct xa_node *node, unsigned int sibs, xa_mark_t mark) { int i; if (sibs == 0) node_mark_all(node, mark); else { for (i = 0; i < XA_CHUNK_SIZE; i += sibs + 1) node_set_mark(node, i, mark); } } static void node_set_marks(struct xa_node *node, unsigned int offset, struct xa_node *child, unsigned int sibs, unsigned int marks) { xa_mark_t mark = XA_MARK_0; for (;;) { if (marks & (1 << (__force unsigned int)mark)) { node_set_mark(node, offset, mark); if (child) node_mark_slots(child, sibs, mark); } if (mark == XA_MARK_MAX) break; mark_inc(mark); } } /** * xas_split_alloc() - Allocate memory for splitting an entry. * @xas: XArray operation state. * @entry: New entry which will be stored in the array. * @order: Current entry order. * @gfp: Memory allocation flags. * * This function should be called before calling xas_split(). * If necessary, it will allocate new nodes (and fill them with @entry) * to prepare for the upcoming split of an entry of @order size into * entries of the order stored in the @xas. * * Context: May sleep if @gfp flags permit. */ void xas_split_alloc(struct xa_state *xas, void *entry, unsigned int order, gfp_t gfp) { unsigned int sibs = (1 << (order % XA_CHUNK_SHIFT)) - 1; unsigned int mask = xas->xa_sibs; /* XXX: no support for splitting really large entries yet */ if (WARN_ON(xas->xa_shift + 2 * XA_CHUNK_SHIFT <= order)) goto nomem; if (xas->xa_shift + XA_CHUNK_SHIFT > order) return; do { unsigned int i; void *sibling = NULL; struct xa_node *node; node = kmem_cache_alloc_lru(radix_tree_node_cachep, xas->xa_lru, gfp); if (!node) goto nomem; node->array = xas->xa; for (i = 0; i < XA_CHUNK_SIZE; i++) { if ((i & mask) == 0) { RCU_INIT_POINTER(node->slots[i], entry); sibling = xa_mk_sibling(i); } else { RCU_INIT_POINTER(node->slots[i], sibling); } } RCU_INIT_POINTER(node->parent, xas->xa_alloc); xas->xa_alloc = node; } while (sibs-- > 0); return; nomem: xas_destroy(xas); xas_set_err(xas, -ENOMEM); } EXPORT_SYMBOL_GPL(xas_split_alloc); /** * xas_split() - Split a multi-index entry into smaller entries. * @xas: XArray operation state. 
* @entry: New entry to store in the array. * @order: Current entry order. * * The size of the new entries is set in @xas. The value in @entry is * copied to all the replacement entries. * * Context: Any context. The caller should hold the xa_lock. */ void xas_split(struct xa_state *xas, void *entry, unsigned int order) { unsigned int sibs = (1 << (order % XA_CHUNK_SHIFT)) - 1; unsigned int offset, marks; struct xa_node *node; void *curr = xas_load(xas); int values = 0; node = xas->xa_node; if (xas_top(node)) return; marks = node_get_marks(node, xas->xa_offset); offset = xas->xa_offset + sibs; do { if (xas->xa_shift < node->shift) { struct xa_node *child = xas->xa_alloc; xas->xa_alloc = rcu_dereference_raw(child->parent); child->shift = node->shift - XA_CHUNK_SHIFT; child->offset = offset; child->count = XA_CHUNK_SIZE; child->nr_values = xa_is_value(entry) ? XA_CHUNK_SIZE : 0; RCU_INIT_POINTER(child->parent, node); node_set_marks(node, offset, child, xas->xa_sibs, marks); rcu_assign_pointer(node->slots[offset], xa_mk_node(child)); if (xa_is_value(curr)) values--; xas_update(xas, child); } else { unsigned int canon = offset - xas->xa_sibs; node_set_marks(node, canon, NULL, 0, marks); rcu_assign_pointer(node->slots[canon], entry); while (offset > canon) rcu_assign_pointer(node->slots[offset--], xa_mk_sibling(canon)); values += (xa_is_value(entry) - xa_is_value(curr)) * (xas->xa_sibs + 1); } } while (offset-- > xas->xa_offset); node->nr_values += values; xas_update(xas, node); } EXPORT_SYMBOL_GPL(xas_split); #endif /** * xas_pause() - Pause a walk to drop a lock. * @xas: XArray operation state. * * Some users need to pause a walk and drop the lock they're holding in * order to yield to a higher priority thread or carry out an operation * on an entry. Those users should call this function before they drop * the lock. It resets the @xas to be suitable for the next iteration * of the loop after the user has reacquired the lock. If most entries * found during a walk require you to call xas_pause(), the xa_for_each() * iterator may be more appropriate. * * Note that xas_pause() only works for forward iteration. If a user needs * to pause a reverse iteration, we will need a xas_pause_rev(). */ void xas_pause(struct xa_state *xas) { struct xa_node *node = xas->xa_node; if (xas_invalid(xas)) return; xas->xa_node = XAS_RESTART; if (node) { unsigned long offset = xas->xa_offset; while (++offset < XA_CHUNK_SIZE) { if (!xa_is_sibling(xa_entry(xas->xa, node, offset))) break; } xas->xa_index &= ~0UL << node->shift; xas->xa_index += (offset - xas->xa_offset) << node->shift; if (xas->xa_index == 0) xas->xa_node = XAS_BOUNDS; } else { xas->xa_index++; } } EXPORT_SYMBOL_GPL(xas_pause); /* * __xas_prev() - Find the previous entry in the XArray. * @xas: XArray operation state. * * Helper function for xas_prev() which handles all the complex cases * out of line. 
*/ void *__xas_prev(struct xa_state *xas) { void *entry; if (!xas_frozen(xas->xa_node)) xas->xa_index--; if (!xas->xa_node) return set_bounds(xas); if (xas_not_node(xas->xa_node)) return xas_load(xas); if (xas->xa_offset != get_offset(xas->xa_index, xas->xa_node)) xas->xa_offset--; while (xas->xa_offset == 255) { xas->xa_offset = xas->xa_node->offset - 1; xas->xa_node = xa_parent(xas->xa, xas->xa_node); if (!xas->xa_node) return set_bounds(xas); } for (;;) { entry = xa_entry(xas->xa, xas->xa_node, xas->xa_offset); if (!xa_is_node(entry)) return entry; xas->xa_node = xa_to_node(entry); xas_set_offset(xas); } } EXPORT_SYMBOL_GPL(__xas_prev); /* * __xas_next() - Find the next entry in the XArray. * @xas: XArray operation state. * * Helper function for xas_next() which handles all the complex cases * out of line. */ void *__xas_next(struct xa_state *xas) { void *entry; if (!xas_frozen(xas->xa_node)) xas->xa_index++; if (!xas->xa_node) return set_bounds(xas); if (xas_not_node(xas->xa_node)) return xas_load(xas); if (xas->xa_offset != get_offset(xas->xa_index, xas->xa_node)) xas->xa_offset++; while (xas->xa_offset == XA_CHUNK_SIZE) { xas->xa_offset = xas->xa_node->offset + 1; xas->xa_node = xa_parent(xas->xa, xas->xa_node); if (!xas->xa_node) return set_bounds(xas); } for (;;) { entry = xa_entry(xas->xa, xas->xa_node, xas->xa_offset); if (!xa_is_node(entry)) return entry; xas->xa_node = xa_to_node(entry); xas_set_offset(xas); } } EXPORT_SYMBOL_GPL(__xas_next); /** * xas_find() - Find the next present entry in the XArray. * @xas: XArray operation state. * @max: Highest index to return. * * If the @xas has not yet been walked to an entry, return the entry * which has an index >= xas.xa_index. If it has been walked, the entry * currently being pointed at has been processed, and so we move to the * next entry. * * If no entry is found and the array is smaller than @max, the iterator * is set to the smallest index not yet in the array. This allows @xas * to be immediately passed to xas_store(). * * Return: The entry, if found, otherwise %NULL. */ void *xas_find(struct xa_state *xas, unsigned long max) { void *entry; if (xas_error(xas) || xas->xa_node == XAS_BOUNDS) return NULL; if (xas->xa_index > max) return set_bounds(xas); if (!xas->xa_node) { xas->xa_index = 1; return set_bounds(xas); } else if (xas->xa_node == XAS_RESTART) { entry = xas_load(xas); if (entry || xas_not_node(xas->xa_node)) return entry; } else if (!xas->xa_node->shift && xas->xa_offset != (xas->xa_index & XA_CHUNK_MASK)) { xas->xa_offset = ((xas->xa_index - 1) & XA_CHUNK_MASK) + 1; } xas_next_offset(xas); while (xas->xa_node && (xas->xa_index <= max)) { if (unlikely(xas->xa_offset == XA_CHUNK_SIZE)) { xas->xa_offset = xas->xa_node->offset + 1; xas->xa_node = xa_parent(xas->xa, xas->xa_node); continue; } entry = xa_entry(xas->xa, xas->xa_node, xas->xa_offset); if (xa_is_node(entry)) { xas->xa_node = xa_to_node(entry); xas->xa_offset = 0; continue; } if (entry && !xa_is_sibling(entry)) return entry; xas_next_offset(xas); } if (!xas->xa_node) xas->xa_node = XAS_BOUNDS; return NULL; } EXPORT_SYMBOL_GPL(xas_find); /** * xas_find_marked() - Find the next marked entry in the XArray. * @xas: XArray operation state. * @max: Highest index to return. * @mark: Mark number to search for. * * If the @xas has not yet been walked to an entry, return the marked entry * which has an index >= xas.xa_index. 
If it has been walked, the entry * currently being pointed at has been processed, and so we return the * first marked entry with an index > xas.xa_index. * * If no marked entry is found and the array is smaller than @max, @xas is * set to the bounds state and xas->xa_index is set to the smallest index * not yet in the array. This allows @xas to be immediately passed to * xas_store(). * * If no entry is found before @max is reached, @xas is set to the restart * state. * * Return: The entry, if found, otherwise %NULL. */ void *xas_find_marked(struct xa_state *xas, unsigned long max, xa_mark_t mark) { bool advance = true; unsigned int offset; void *entry; if (xas_error(xas)) return NULL; if (xas->xa_index > max) goto max; if (!xas->xa_node) { xas->xa_index = 1; goto out; } else if (xas_top(xas->xa_node)) { advance = false; entry = xa_head(xas->xa); xas->xa_node = NULL; if (xas->xa_index > max_index(entry)) goto out; if (!xa_is_node(entry)) { if (xa_marked(xas->xa, mark)) return entry; xas->xa_index = 1; goto out; } xas->xa_node = xa_to_node(entry); xas->xa_offset = xas->xa_index >> xas->xa_node->shift; } while (xas->xa_index <= max) { if (unlikely(xas->xa_offset == XA_CHUNK_SIZE)) { xas->xa_offset = xas->xa_node->offset + 1; xas->xa_node = xa_parent(xas->xa, xas->xa_node); if (!xas->xa_node) break; advance = false; continue; } if (!advance) { entry = xa_entry(xas->xa, xas->xa_node, xas->xa_offset); if (xa_is_sibling(entry)) { xas->xa_offset = xa_to_sibling(entry); xas_move_index(xas, xas->xa_offset); } } offset = xas_find_chunk(xas, advance, mark); if (offset > xas->xa_offset) { advance = false; xas_move_index(xas, offset); /* Mind the wrap */ if ((xas->xa_index - 1) >= max) goto max; xas->xa_offset = offset; if (offset == XA_CHUNK_SIZE) continue; } entry = xa_entry(xas->xa, xas->xa_node, xas->xa_offset); if (!entry && !(xa_track_free(xas->xa) && mark == XA_FREE_MARK)) continue; if (xa_is_sibling(entry)) continue; if (!xa_is_node(entry)) return entry; xas->xa_node = xa_to_node(entry); xas_set_offset(xas); } out: if (xas->xa_index > max) goto max; return set_bounds(xas); max: xas->xa_node = XAS_RESTART; return NULL; } EXPORT_SYMBOL_GPL(xas_find_marked); /** * xas_find_conflict() - Find the next present entry in a range. * @xas: XArray operation state. * * The @xas describes both a range and a position within that range. * * Context: Any context. Expects xa_lock to be held. * Return: The next entry in the range covered by @xas or %NULL. */ void *xas_find_conflict(struct xa_state *xas) { void *curr; if (xas_error(xas)) return NULL; if (!xas->xa_node) return NULL; if (xas_top(xas->xa_node)) { curr = xas_start(xas); if (!curr) return NULL; while (xa_is_node(curr)) { struct xa_node *node = xa_to_node(curr); curr = xas_descend(xas, node); } if (curr) return curr; } if (xas->xa_node->shift > xas->xa_shift) return NULL; for (;;) { if (xas->xa_node->shift == xas->xa_shift) { if ((xas->xa_offset & xas->xa_sibs) == xas->xa_sibs) break; } else if (xas->xa_offset == XA_CHUNK_MASK) { xas->xa_offset = xas->xa_node->offset; xas->xa_node = xa_parent_locked(xas->xa, xas->xa_node); if (!xas->xa_node) break; continue; } curr = xa_entry_locked(xas->xa, xas->xa_node, ++xas->xa_offset); if (xa_is_sibling(curr)) continue; while (xa_is_node(curr)) { xas->xa_node = xa_to_node(curr); xas->xa_offset = 0; curr = xa_entry_locked(xas->xa, xas->xa_node, 0); } if (curr) return curr; } xas->xa_offset -= xas->xa_sibs; return NULL; } EXPORT_SYMBOL_GPL(xas_find_conflict); /** * xa_load() - Load an entry from an XArray. 
* @xa: XArray. * @index: index into array. * * Context: Any context. Takes and releases the RCU lock. * Return: The entry at @index in @xa. */ void *xa_load(struct xarray *xa, unsigned long index) { XA_STATE(xas, xa, index); void *entry; rcu_read_lock(); do { entry = xa_zero_to_null(xas_load(&xas)); } while (xas_retry(&xas, entry)); rcu_read_unlock(); return entry; } EXPORT_SYMBOL(xa_load); static void *xas_result(struct xa_state *xas, void *curr) { if (xas_error(xas)) curr = xas->xa_node; return curr; } /** * __xa_erase() - Erase this entry from the XArray while locked. * @xa: XArray. * @index: Index into array. * * After this function returns, loading from @index will return %NULL. * If the index is part of a multi-index entry, all indices will be erased * and none of the entries will be part of a multi-index entry. * * Context: Any context. Expects xa_lock to be held on entry. * Return: The entry which used to be at this index. */ void *__xa_erase(struct xarray *xa, unsigned long index) { XA_STATE(xas, xa, index); return xas_result(&xas, xa_zero_to_null(xas_store(&xas, NULL))); } EXPORT_SYMBOL(__xa_erase); /** * xa_erase() - Erase this entry from the XArray. * @xa: XArray. * @index: Index of entry. * * After this function returns, loading from @index will return %NULL. * If the index is part of a multi-index entry, all indices will be erased * and none of the entries will be part of a multi-index entry. * * Context: Any context. Takes and releases the xa_lock. * Return: The entry which used to be at this index. */ void *xa_erase(struct xarray *xa, unsigned long index) { void *entry; xa_lock(xa); entry = __xa_erase(xa, index); xa_unlock(xa); return entry; } EXPORT_SYMBOL(xa_erase); /** * __xa_store() - Store this entry in the XArray. * @xa: XArray. * @index: Index into array. * @entry: New entry. * @gfp: Memory allocation flags. * * You must already be holding the xa_lock when calling this function. * It will drop the lock if needed to allocate memory, and then reacquire * it afterwards. * * Context: Any context. Expects xa_lock to be held on entry. May * release and reacquire xa_lock if @gfp flags permit. * Return: The old entry at this index or xa_err() if an error happened. */ void *__xa_store(struct xarray *xa, unsigned long index, void *entry, gfp_t gfp) { XA_STATE(xas, xa, index); void *curr; if (WARN_ON_ONCE(xa_is_advanced(entry))) return XA_ERROR(-EINVAL); if (xa_track_free(xa) && !entry) entry = XA_ZERO_ENTRY; do { curr = xas_store(&xas, entry); if (xa_track_free(xa)) xas_clear_mark(&xas, XA_FREE_MARK); } while (__xas_nomem(&xas, gfp)); return xas_result(&xas, xa_zero_to_null(curr)); } EXPORT_SYMBOL(__xa_store); /** * xa_store() - Store this entry in the XArray. * @xa: XArray. * @index: Index into array. * @entry: New entry. * @gfp: Memory allocation flags. * * After this function returns, loads from this index will return @entry. * Storing into an existing multi-index entry updates the entry of every index. * The marks associated with @index are unaffected unless @entry is %NULL. * * Context: Any context. Takes and releases the xa_lock. * May sleep if the @gfp flags permit. * Return: The old entry at this index on success, xa_err(-EINVAL) if @entry * cannot be stored in an XArray, or xa_err(-ENOMEM) if memory allocation * failed. 
*/ void *xa_store(struct xarray *xa, unsigned long index, void *entry, gfp_t gfp) { void *curr; xa_lock(xa); curr = __xa_store(xa, index, entry, gfp); xa_unlock(xa); return curr; } EXPORT_SYMBOL(xa_store); static inline void *__xa_cmpxchg_raw(struct xarray *xa, unsigned long index, void *old, void *entry, gfp_t gfp); /** * __xa_cmpxchg() - Store this entry in the XArray. * @xa: XArray. * @index: Index into array. * @old: Old value to test against. * @entry: New entry. * @gfp: Memory allocation flags. * * You must already be holding the xa_lock when calling this function. * It will drop the lock if needed to allocate memory, and then reacquire * it afterwards. * * Context: Any context. Expects xa_lock to be held on entry. May * release and reacquire xa_lock if @gfp flags permit. * Return: The old entry at this index or xa_err() if an error happened. */ void *__xa_cmpxchg(struct xarray *xa, unsigned long index, void *old, void *entry, gfp_t gfp) { return xa_zero_to_null(__xa_cmpxchg_raw(xa, index, old, entry, gfp)); } EXPORT_SYMBOL(__xa_cmpxchg); static inline void *__xa_cmpxchg_raw(struct xarray *xa, unsigned long index, void *old, void *entry, gfp_t gfp) { XA_STATE(xas, xa, index); void *curr; if (WARN_ON_ONCE(xa_is_advanced(entry))) return XA_ERROR(-EINVAL); do { curr = xas_load(&xas); if (curr == old) { xas_store(&xas, entry); if (xa_track_free(xa) && entry && !curr) xas_clear_mark(&xas, XA_FREE_MARK); } } while (__xas_nomem(&xas, gfp)); return xas_result(&xas, curr); } /** * __xa_insert() - Store this entry in the XArray if no entry is present. * @xa: XArray. * @index: Index into array. * @entry: New entry. * @gfp: Memory allocation flags. * * Inserting a NULL entry will store a reserved entry (like xa_reserve()) * if no entry is present. Inserting will fail if a reserved entry is * present, even though loading from this index will return NULL. * * Context: Any context. Expects xa_lock to be held on entry. May * release and reacquire xa_lock if @gfp flags permit. * Return: 0 if the store succeeded. -EBUSY if another entry was present. * -ENOMEM if memory could not be allocated. */ int __xa_insert(struct xarray *xa, unsigned long index, void *entry, gfp_t gfp) { void *curr; int errno; if (!entry) entry = XA_ZERO_ENTRY; curr = __xa_cmpxchg_raw(xa, index, NULL, entry, gfp); errno = xa_err(curr); if (errno) return errno; return (curr != NULL) ? -EBUSY : 0; } EXPORT_SYMBOL(__xa_insert); #ifdef CONFIG_XARRAY_MULTI static void xas_set_range(struct xa_state *xas, unsigned long first, unsigned long last) { unsigned int shift = 0; unsigned long sibs = last - first; unsigned int offset = XA_CHUNK_MASK; xas_set(xas, first); while ((first & XA_CHUNK_MASK) == 0) { if (sibs < XA_CHUNK_MASK) break; if ((sibs == XA_CHUNK_MASK) && (offset < XA_CHUNK_MASK)) break; shift += XA_CHUNK_SHIFT; if (offset == XA_CHUNK_MASK) offset = sibs & XA_CHUNK_MASK; sibs >>= XA_CHUNK_SHIFT; first >>= XA_CHUNK_SHIFT; } offset = first & XA_CHUNK_MASK; if (offset + sibs > XA_CHUNK_MASK) sibs = XA_CHUNK_MASK - offset; if ((((first + sibs + 1) << shift) - 1) > last) sibs -= 1; xas->xa_shift = shift; xas->xa_sibs = sibs; } /** * xa_store_range() - Store this entry at a range of indices in the XArray. * @xa: XArray. * @first: First index to affect. * @last: Last index to affect. * @entry: New entry. * @gfp: Memory allocation flags. * * After this function returns, loads from any index between @first and @last, * inclusive will return @entry. * Storing into an existing multi-index entry updates the entry of every index. 
* The marks associated with @index are unaffected unless @entry is %NULL. * * Context: Process context. Takes and releases the xa_lock. May sleep * if the @gfp flags permit. * Return: %NULL on success, xa_err(-EINVAL) if @entry cannot be stored in * an XArray, or xa_err(-ENOMEM) if memory allocation failed. */ void *xa_store_range(struct xarray *xa, unsigned long first, unsigned long last, void *entry, gfp_t gfp) { XA_STATE(xas, xa, 0); if (WARN_ON_ONCE(xa_is_internal(entry))) return XA_ERROR(-EINVAL); if (last < first) return XA_ERROR(-EINVAL); do { xas_lock(&xas); if (entry) { unsigned int order = BITS_PER_LONG; if (last + 1) order = __ffs(last + 1); xas_set_order(&xas, last, order); xas_create(&xas, true); if (xas_error(&xas)) goto unlock; } do { xas_set_range(&xas, first, last); xas_store(&xas, entry); if (xas_error(&xas)) goto unlock; first += xas_size(&xas); } while (first <= last); unlock: xas_unlock(&xas); } while (xas_nomem(&xas, gfp)); return xas_result(&xas, NULL); } EXPORT_SYMBOL(xa_store_range); /** * xas_get_order() - Get the order of an entry. * @xas: XArray operation state. * * Called after xas_load, the xas should not be in an error state. * * Return: A number between 0 and 63 indicating the order of the entry. */ int xas_get_order(struct xa_state *xas) { int order = 0; if (!xas->xa_node) return 0; for (;;) { unsigned int slot = xas->xa_offset + (1 << order); if (slot >= XA_CHUNK_SIZE) break; if (!xa_is_sibling(xa_entry(xas->xa, xas->xa_node, slot))) break; order++; } order += xas->xa_node->shift; return order; } EXPORT_SYMBOL_GPL(xas_get_order); /** * xa_get_order() - Get the order of an entry. * @xa: XArray. * @index: Index of the entry. * * Return: A number between 0 and 63 indicating the order of the entry. */ int xa_get_order(struct xarray *xa, unsigned long index) { XA_STATE(xas, xa, index); int order = 0; void *entry; rcu_read_lock(); entry = xas_load(&xas); if (entry) order = xas_get_order(&xas); rcu_read_unlock(); return order; } EXPORT_SYMBOL(xa_get_order); #endif /* CONFIG_XARRAY_MULTI */ /** * __xa_alloc() - Find somewhere to store this entry in the XArray. * @xa: XArray. * @id: Pointer to ID. * @limit: Range for allocated ID. * @entry: New entry. * @gfp: Memory allocation flags. * * Finds an empty entry in @xa between @limit.min and @limit.max, * stores the index into the @id pointer, then stores the entry at * that index. A concurrent lookup will not see an uninitialised @id. * * Must only be operated on an xarray initialized with flag XA_FLAGS_ALLOC set * in xa_init_flags(). * * Context: Any context. Expects xa_lock to be held on entry. May * release and reacquire xa_lock if @gfp flags permit. * Return: 0 on success, -ENOMEM if memory could not be allocated or * -EBUSY if there are no free entries in @limit. */ int __xa_alloc(struct xarray *xa, u32 *id, void *entry, struct xa_limit limit, gfp_t gfp) { XA_STATE(xas, xa, 0); if (WARN_ON_ONCE(xa_is_advanced(entry))) return -EINVAL; if (WARN_ON_ONCE(!xa_track_free(xa))) return -EINVAL; if (!entry) entry = XA_ZERO_ENTRY; do { xas.xa_index = limit.min; xas_find_marked(&xas, limit.max, XA_FREE_MARK); if (xas.xa_node == XAS_RESTART) xas_set_err(&xas, -EBUSY); else *id = xas.xa_index; xas_store(&xas, entry); xas_clear_mark(&xas, XA_FREE_MARK); } while (__xas_nomem(&xas, gfp)); return xas_error(&xas); } EXPORT_SYMBOL(__xa_alloc); /** * __xa_alloc_cyclic() - Find somewhere to store this entry in the XArray. * @xa: XArray. * @id: Pointer to ID. * @entry: New entry. * @limit: Range of allocated ID. 
* @next: Pointer to next ID to allocate. * @gfp: Memory allocation flags. * * Finds an empty entry in @xa between @limit.min and @limit.max, * stores the index into the @id pointer, then stores the entry at * that index. A concurrent lookup will not see an uninitialised @id. * The search for an empty entry will start at @next and will wrap * around if necessary. * * Must only be operated on an xarray initialized with flag XA_FLAGS_ALLOC set * in xa_init_flags(). * * Context: Any context. Expects xa_lock to be held on entry. May * release and reacquire xa_lock if @gfp flags permit. * Return: 0 if the allocation succeeded without wrapping. 1 if the * allocation succeeded after wrapping, -ENOMEM if memory could not be * allocated or -EBUSY if there are no free entries in @limit. */ int __xa_alloc_cyclic(struct xarray *xa, u32 *id, void *entry, struct xa_limit limit, u32 *next, gfp_t gfp) { u32 min = limit.min; int ret; limit.min = max(min, *next); ret = __xa_alloc(xa, id, entry, limit, gfp); if ((xa->xa_flags & XA_FLAGS_ALLOC_WRAPPED) && ret == 0) { xa->xa_flags &= ~XA_FLAGS_ALLOC_WRAPPED; ret = 1; } if (ret < 0 && limit.min > min) { limit.min = min; ret = __xa_alloc(xa, id, entry, limit, gfp); if (ret == 0) ret = 1; } if (ret >= 0) { *next = *id + 1; if (*next == 0) xa->xa_flags |= XA_FLAGS_ALLOC_WRAPPED; } return ret; } EXPORT_SYMBOL(__xa_alloc_cyclic); /** * __xa_set_mark() - Set this mark on this entry while locked. * @xa: XArray. * @index: Index of entry. * @mark: Mark number. * * Attempting to set a mark on a %NULL entry does not succeed. * * Context: Any context. Expects xa_lock to be held on entry. */ void __xa_set_mark(struct xarray *xa, unsigned long index, xa_mark_t mark) { XA_STATE(xas, xa, index); void *entry = xas_load(&xas); if (entry) xas_set_mark(&xas, mark); } EXPORT_SYMBOL(__xa_set_mark); /** * __xa_clear_mark() - Clear this mark on this entry while locked. * @xa: XArray. * @index: Index of entry. * @mark: Mark number. * * Context: Any context. Expects xa_lock to be held on entry. */ void __xa_clear_mark(struct xarray *xa, unsigned long index, xa_mark_t mark) { XA_STATE(xas, xa, index); void *entry = xas_load(&xas); if (entry) xas_clear_mark(&xas, mark); } EXPORT_SYMBOL(__xa_clear_mark); /** * xa_get_mark() - Inquire whether this mark is set on this entry. * @xa: XArray. * @index: Index of entry. * @mark: Mark number. * * This function uses the RCU read lock, so the result may be out of date * by the time it returns. If you need the result to be stable, use a lock. * * Context: Any context. Takes and releases the RCU lock. * Return: True if the entry at @index has this mark set, false if it doesn't. */ bool xa_get_mark(struct xarray *xa, unsigned long index, xa_mark_t mark) { XA_STATE(xas, xa, index); void *entry; rcu_read_lock(); entry = xas_start(&xas); while (xas_get_mark(&xas, mark)) { if (!xa_is_node(entry)) goto found; entry = xas_descend(&xas, xa_to_node(entry)); } rcu_read_unlock(); return false; found: rcu_read_unlock(); return true; } EXPORT_SYMBOL(xa_get_mark); /** * xa_set_mark() - Set this mark on this entry. * @xa: XArray. * @index: Index of entry. * @mark: Mark number. * * Attempting to set a mark on a %NULL entry does not succeed. * * Context: Process context. Takes and releases the xa_lock. */ void xa_set_mark(struct xarray *xa, unsigned long index, xa_mark_t mark) { xa_lock(xa); __xa_set_mark(xa, index, mark); xa_unlock(xa); } EXPORT_SYMBOL(xa_set_mark); /** * xa_clear_mark() - Clear this mark on this entry. * @xa: XArray. * @index: Index of entry. 
* @mark: Mark number. * * Clearing a mark always succeeds. * * Context: Process context. Takes and releases the xa_lock. */ void xa_clear_mark(struct xarray *xa, unsigned long index, xa_mark_t mark) { xa_lock(xa); __xa_clear_mark(xa, index, mark); xa_unlock(xa); } EXPORT_SYMBOL(xa_clear_mark); /** * xa_find() - Search the XArray for an entry. * @xa: XArray. * @indexp: Pointer to an index. * @max: Maximum index to search to. * @filter: Selection criterion. * * Finds the entry in @xa which matches the @filter, and has the lowest * index that is at least @indexp and no more than @max. * If an entry is found, @indexp is updated to be the index of the entry. * This function is protected by the RCU read lock, so it may not find * entries which are being simultaneously added. It will not return an * %XA_RETRY_ENTRY; if you need to see retry entries, use xas_find(). * * Context: Any context. Takes and releases the RCU lock. * Return: The entry, if found, otherwise %NULL. */ void *xa_find(struct xarray *xa, unsigned long *indexp, unsigned long max, xa_mark_t filter) { XA_STATE(xas, xa, *indexp); void *entry; rcu_read_lock(); do { if ((__force unsigned int)filter < XA_MAX_MARKS) entry = xas_find_marked(&xas, max, filter); else entry = xas_find(&xas, max); } while (xas_retry(&xas, entry)); rcu_read_unlock(); if (entry) *indexp = xas.xa_index; return entry; } EXPORT_SYMBOL(xa_find); static bool xas_sibling(struct xa_state *xas) { struct xa_node *node = xas->xa_node; unsigned long mask; if (!IS_ENABLED(CONFIG_XARRAY_MULTI) || !node) return false; mask = (XA_CHUNK_SIZE << node->shift) - 1; return (xas->xa_index & mask) > ((unsigned long)xas->xa_offset << node->shift); } /** * xa_find_after() - Search the XArray for a present entry. * @xa: XArray. * @indexp: Pointer to an index. * @max: Maximum index to search to. * @filter: Selection criterion. * * Finds the entry in @xa which matches the @filter and has the lowest * index that is above @indexp and no more than @max. * If an entry is found, @indexp is updated to be the index of the entry. * This function is protected by the RCU read lock, so it may miss entries * which are being simultaneously added. It will not return an * %XA_RETRY_ENTRY; if you need to see retry entries, use xas_find(). * * Context: Any context. Takes and releases the RCU lock. * Return: The pointer, if found, otherwise %NULL. 
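 *
 * Together with xa_find(), this is roughly what the xa_for_each() iterator
 * expands to (editor's sketch; @xa is the caller's own and do_something()
 * is a hypothetical per-entry callback):
 *
 *	unsigned long index = 0;
 *	void *entry;
 *
 *	entry = xa_find(&xa, &index, ULONG_MAX, XA_PRESENT);
 *	while (entry) {
 *		do_something(entry, index);
 *		entry = xa_find_after(&xa, &index, ULONG_MAX, XA_PRESENT);
 *	}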
*/ void *xa_find_after(struct xarray *xa, unsigned long *indexp, unsigned long max, xa_mark_t filter) { XA_STATE(xas, xa, *indexp + 1); void *entry; if (xas.xa_index == 0) return NULL; rcu_read_lock(); for (;;) { if ((__force unsigned int)filter < XA_MAX_MARKS) entry = xas_find_marked(&xas, max, filter); else entry = xas_find(&xas, max); if (xas_invalid(&xas)) break; if (xas_sibling(&xas)) continue; if (!xas_retry(&xas, entry)) break; } rcu_read_unlock(); if (entry) *indexp = xas.xa_index; return entry; } EXPORT_SYMBOL(xa_find_after); static unsigned int xas_extract_present(struct xa_state *xas, void **dst, unsigned long max, unsigned int n) { void *entry; unsigned int i = 0; rcu_read_lock(); xas_for_each(xas, entry, max) { if (xas_retry(xas, entry)) continue; dst[i++] = entry; if (i == n) break; } rcu_read_unlock(); return i; } static unsigned int xas_extract_marked(struct xa_state *xas, void **dst, unsigned long max, unsigned int n, xa_mark_t mark) { void *entry; unsigned int i = 0; rcu_read_lock(); xas_for_each_marked(xas, entry, max, mark) { if (xas_retry(xas, entry)) continue; dst[i++] = entry; if (i == n) break; } rcu_read_unlock(); return i; } /** * xa_extract() - Copy selected entries from the XArray into a normal array. * @xa: The source XArray to copy from. * @dst: The buffer to copy entries into. * @start: The first index in the XArray eligible to be selected. * @max: The last index in the XArray eligible to be selected. * @n: The maximum number of entries to copy. * @filter: Selection criterion. * * Copies up to @n entries that match @filter from the XArray. The * copied entries will have indices between @start and @max, inclusive. * * The @filter may be an XArray mark value, in which case entries which are * marked with that mark will be copied. It may also be %XA_PRESENT, in * which case all entries which are not %NULL will be copied. * * The entries returned may not represent a snapshot of the XArray at a * moment in time. For example, if another thread stores to index 5, then * index 10, calling xa_extract() may return the old contents of index 5 * and the new contents of index 10. Indices not modified while this * function is running will not be skipped. * * If you need stronger guarantees, holding the xa_lock across calls to this * function will prevent concurrent modification. * * Context: Any context. Takes and releases the RCU lock. * Return: The number of entries copied. */ unsigned int xa_extract(struct xarray *xa, void **dst, unsigned long start, unsigned long max, unsigned int n, xa_mark_t filter) { XA_STATE(xas, xa, start); if (!n) return 0; if ((__force unsigned int)filter < XA_MAX_MARKS) return xas_extract_marked(&xas, dst, max, n, filter); return xas_extract_present(&xas, dst, max, n); } EXPORT_SYMBOL(xa_extract); /** * xa_delete_node() - Private interface for workingset code. * @node: Node to be removed from the tree. * @update: Function to call to update ancestor nodes. * * Context: xa_lock must be held on entry and will not be released. */ void xa_delete_node(struct xa_node *node, xa_update_node_t update) { struct xa_state xas = { .xa = node->array, .xa_index = (unsigned long)node->offset << (node->shift + XA_CHUNK_SHIFT), .xa_shift = node->shift + XA_CHUNK_SHIFT, .xa_offset = node->offset, .xa_node = xa_parent_locked(node->array, node), .xa_update = update, }; xas_store(&xas, NULL); } EXPORT_SYMBOL_GPL(xa_delete_node); /* For the benefit of the test suite */ /** * xa_destroy() - Free all internal data structures. * @xa: XArray. 
* * After calling this function, the XArray is empty and has freed all memory * allocated for its internal data structures. You are responsible for * freeing the objects referenced by the XArray. * * Context: Any context. Takes and releases the xa_lock, interrupt-safe. */ void xa_destroy(struct xarray *xa) { XA_STATE(xas, xa, 0); unsigned long flags; void *entry; xas.xa_node = NULL; xas_lock_irqsave(&xas, flags); entry = xa_head_locked(xa); RCU_INIT_POINTER(xa->xa_head, NULL); xas_init_marks(&xas); if (xa_zero_busy(xa)) xa_mark_clear(xa, XA_FREE_MARK); /* lockdep checks we're still holding the lock in xas_free_nodes() */ if (xa_is_node(entry)) xas_free_nodes(&xas, xa_to_node(entry)); xas_unlock_irqrestore(&xas, flags); } EXPORT_SYMBOL(xa_destroy); #ifdef XA_DEBUG void xa_dump_node(const struct xa_node *node) { unsigned i, j; if (!node) return; if ((unsigned long)node & 3) { pr_cont("node %px\n", node); return; } pr_cont("node %px %s %d parent %px shift %d count %d values %d " "array %px list %px %px marks", node, node->parent ? "offset" : "max", node->offset, node->parent, node->shift, node->count, node->nr_values, node->array, node->private_list.prev, node->private_list.next); for (i = 0; i < XA_MAX_MARKS; i++) for (j = 0; j < XA_MARK_LONGS; j++) pr_cont(" %lx", node->marks[i][j]); pr_cont("\n"); } void xa_dump_index(unsigned long index, unsigned int shift) { if (!shift) pr_info("%lu: ", index); else if (shift >= BITS_PER_LONG) pr_info("0-%lu: ", ~0UL); else pr_info("%lu-%lu: ", index, index | ((1UL << shift) - 1)); } void xa_dump_entry(const void *entry, unsigned long index, unsigned long shift) { if (!entry) return; xa_dump_index(index, shift); if (xa_is_node(entry)) { if (shift == 0) { pr_cont("%px\n", entry); } else { unsigned long i; struct xa_node *node = xa_to_node(entry); xa_dump_node(node); for (i = 0; i < XA_CHUNK_SIZE; i++) xa_dump_entry(node->slots[i], index + (i << node->shift), node->shift); } } else if (xa_is_value(entry)) pr_cont("value %ld (0x%lx) [%px]\n", xa_to_value(entry), xa_to_value(entry), entry); else if (!xa_is_internal(entry)) pr_cont("%px\n", entry); else if (xa_is_retry(entry)) pr_cont("retry (%ld)\n", xa_to_internal(entry)); else if (xa_is_sibling(entry)) pr_cont("sibling (slot %ld)\n", xa_to_sibling(entry)); else if (xa_is_zero(entry)) pr_cont("zero (%ld)\n", xa_to_internal(entry)); else pr_cont("UNKNOWN ENTRY (%px)\n", entry); } void xa_dump(const struct xarray *xa) { void *entry = xa->xa_head; unsigned int shift = 0; pr_info("xarray: %px head %px flags %x marks %d %d %d\n", xa, entry, xa->xa_flags, xa_marked(xa, XA_MARK_0), xa_marked(xa, XA_MARK_1), xa_marked(xa, XA_MARK_2)); if (xa_is_node(entry)) shift = xa_to_node(entry)->shift + XA_CHUNK_SHIFT; xa_dump_entry(entry, 0, shift); } #endif |
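/*
 * Editor's illustration, not part of xarray.c above: a minimal, self-contained
 * sketch of the "normal" API that the file implements. The xarray
 * example_objs, struct example_obj and the example_*() helper names are
 * hypothetical; only the xa_*() calls are real interfaces.
 */
#include <linux/xarray.h>

struct example_obj {
	unsigned long data;
};

/* XA_FLAGS_ALLOC: track free entries so xa_alloc() can pick an index. */
static DEFINE_XARRAY_ALLOC(example_objs);

/* Store @obj at a free index; 0 on success, -ENOMEM or -EBUSY on failure. */
static int example_insert(struct example_obj *obj, u32 *id)
{
	int err;

	/* xa_alloc() takes the xa_lock itself and may allocate tree nodes. */
	err = xa_alloc(&example_objs, id, obj, xa_limit_32b, GFP_KERNEL);
	if (err)
		return err;

	/* Marks are cheap per-entry tags kept in the per-node bitmaps. */
	xa_set_mark(&example_objs, *id, XA_MARK_0);
	return 0;
}

/* Look up an entry; xa_load() only needs the RCU read lock, taken internally. */
static struct example_obj *example_lookup(unsigned long index)
{
	return xa_load(&example_objs, index);
}

/* Count the entries currently carrying XA_MARK_0. */
static unsigned int example_count_marked(void)
{
	struct example_obj *obj;
	unsigned long index;
	unsigned int n = 0;

	xa_for_each_marked(&example_objs, index, obj, XA_MARK_0)
		n++;
	return n;
}

/* Remove an entry; xa_erase() returns whatever used to be stored there. */
static struct example_obj *example_remove(unsigned long index)
{
	return xa_erase(&example_objs, index);
}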
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef IOPRIO_H
#define IOPRIO_H

#include <linux/sched.h>
#include <linux/sched/rt.h>
#include <linux/iocontext.h>

#include <uapi/linux/ioprio.h>

/*
 * Default IO priority.
 */
#define IOPRIO_DEFAULT	IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, 0)

/*
 * Check that a priority value has a valid class.
 */
static inline bool ioprio_valid(unsigned short ioprio)
{
	unsigned short class = IOPRIO_PRIO_CLASS(ioprio);

	return class > IOPRIO_CLASS_NONE && class <= IOPRIO_CLASS_IDLE;
}

/*
 * if process has set io priority explicitly, use that. if not, convert
 * the cpu scheduler nice value to an io priority
 */
static inline int task_nice_ioprio(struct task_struct *task)
{
	return (task_nice(task) + 20) / 5;
}

/*
 * This is for the case where the task hasn't asked for a specific IO class.
 * Check for idle and rt task process, and return appropriate IO class.
 */
static inline int task_nice_ioclass(struct task_struct *task)
{
	if (task->policy == SCHED_IDLE)
		return IOPRIO_CLASS_IDLE;
	else if (rt_or_dl_task_policy(task))
		return IOPRIO_CLASS_RT;
	else
		return IOPRIO_CLASS_BE;
}

#ifdef CONFIG_BLOCK
/*
 * If the task has set an I/O priority, use that. Otherwise, return
 * the default I/O priority.
 *
 * Expected to be called for current task or with task_lock() held to keep
 * io_context stable.
 */
static inline int __get_task_ioprio(struct task_struct *p)
{
	struct io_context *ioc = p->io_context;
	int prio;

	if (!ioc)
		return IOPRIO_DEFAULT;

	if (p != current)
		lockdep_assert_held(&p->alloc_lock);

	prio = ioc->ioprio;
	if (IOPRIO_PRIO_CLASS(prio) == IOPRIO_CLASS_NONE)
		prio = IOPRIO_PRIO_VALUE(task_nice_ioclass(p),
					 task_nice_ioprio(p));
	return prio;
}
#else
static inline int __get_task_ioprio(struct task_struct *p)
{
	return IOPRIO_DEFAULT;
}
#endif /* CONFIG_BLOCK */

static inline int get_current_ioprio(void)
{
	return __get_task_ioprio(current);
}

extern int set_task_ioprio(struct task_struct *task, int ioprio);

#ifdef CONFIG_BLOCK
extern int ioprio_check_cap(int ioprio);
#else
static inline int ioprio_check_cap(int ioprio)
{
	return -ENOTBLK;
}
#endif /* CONFIG_BLOCK */

#endif
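To make the nice-to-I/O-priority mapping above concrete, here is a hedged userspace sketch: it prints the (nice + 20) / 5 conversion used by task_nice_ioprio() and then sets a best-effort priority on the calling process via the ioprio_set() syscall. It assumes the uapi <linux/ioprio.h> header and SYS_ioprio_set are available on the build host.

#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/ioprio.h>   /* IOPRIO_PRIO_VALUE, IOPRIO_CLASS_BE, IOPRIO_WHO_PROCESS */

int main(void)
{
        /* nice -20..19 maps onto best-effort levels 0..7 via (nice + 20) / 5 */
        for (int nice = -20; nice <= 19; nice += 13)
                printf("nice %3d -> be level %d\n", nice, (nice + 20) / 5);

        /* Request best-effort class, level 4 (what nice 0 would get by default). */
        int prio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 4);

        if (syscall(SYS_ioprio_set, IOPRIO_WHO_PROCESS, 0, prio))
                perror("ioprio_set");
        return 0;
}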
178 97 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 | /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM hugetlbfs #if !defined(_TRACE_HUGETLBFS_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_HUGETLBFS_H #include <linux/tracepoint.h> TRACE_EVENT(hugetlbfs_alloc_inode, TP_PROTO(struct inode *inode, struct inode *dir, int mode), TP_ARGS(inode, dir, mode), TP_STRUCT__entry( __field(dev_t, dev) __field(ino_t, ino) __field(ino_t, dir) __field(__u16, mode) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->dir = dir ? dir->i_ino : 0; __entry->mode = mode; ), TP_printk("dev %d,%d ino %lu dir %lu mode 0%o", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, (unsigned long) __entry->dir, __entry->mode) ); DECLARE_EVENT_CLASS(hugetlbfs__inode, TP_PROTO(struct inode *inode), TP_ARGS(inode), TP_STRUCT__entry( __field(dev_t, dev) __field(ino_t, ino) __field(__u16, mode) __field(loff_t, size) __field(unsigned int, nlink) __field(unsigned int, seals) __field(blkcnt_t, blocks) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->mode = inode->i_mode; __entry->size = inode->i_size; __entry->nlink = inode->i_nlink; __entry->seals = HUGETLBFS_I(inode)->seals; __entry->blocks = inode->i_blocks; ), TP_printk("dev %d,%d ino %lu mode 0%o size %lld nlink %u seals %u blocks %llu", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->mode, __entry->size, __entry->nlink, __entry->seals, (unsigned long long)__entry->blocks) ); DEFINE_EVENT(hugetlbfs__inode, hugetlbfs_evict_inode, TP_PROTO(struct inode *inode), TP_ARGS(inode) ); DEFINE_EVENT(hugetlbfs__inode, hugetlbfs_free_inode, TP_PROTO(struct inode *inode), TP_ARGS(inode) ); TRACE_EVENT(hugetlbfs_setattr, TP_PROTO(struct inode *inode, struct dentry *dentry, struct iattr *attr), TP_ARGS(inode, dentry, attr), TP_STRUCT__entry( __field(dev_t, dev) __field(ino_t, ino) __field(unsigned int, d_len) __string(d_name, dentry->d_name.name) __field(unsigned int, ia_valid) __field(unsigned int, ia_mode) __field(loff_t, old_size) __field(loff_t, ia_size) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->d_len = dentry->d_name.len; __assign_str(d_name); __entry->ia_valid = attr->ia_valid; __entry->ia_mode = attr->ia_mode; __entry->old_size = inode->i_size; __entry->ia_size = attr->ia_size; ), TP_printk("dev %d,%d ino %lu name %.*s valid %#x mode 0%o old_size %lld size %lld", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long)__entry->ino, __entry->d_len, __get_str(d_name), __entry->ia_valid, __entry->ia_mode, __entry->old_size, __entry->ia_size) ); TRACE_EVENT(hugetlbfs_fallocate, TP_PROTO(struct inode *inode, int mode, loff_t offset, loff_t len, int ret), TP_ARGS(inode, mode, offset, len, ret), TP_STRUCT__entry( __field(dev_t, dev) __field(ino_t, ino) __field(int, mode) __field(loff_t, offset) __field(loff_t, len) __field(loff_t, size) __field(int, ret) ), TP_fast_assign( __entry->dev = 
inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->mode = mode; __entry->offset = offset; __entry->len = len; __entry->size = inode->i_size; __entry->ret = ret; ), TP_printk("dev %d,%d ino %lu mode 0%o offset %lld len %lld size %lld ret %d", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long)__entry->ino, __entry->mode, (unsigned long long)__entry->offset, (unsigned long long)__entry->len, (unsigned long long)__entry->size, __entry->ret) ); #endif /* _TRACE_HUGETLBFS_H */ /* This part must be outside protection */ #include <trace/define_trace.h> |
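The TRACE_EVENT()/DEFINE_EVENT() definitions above generate trace_hugetlbfs_*() call sites in the kernel and a matching event group under tracefs. The following is a minimal userspace sketch for turning the whole group on; it assumes tracefs is mounted at /sys/kernel/tracing (the legacy /sys/kernel/debug/tracing mount point also works) and that the hugetlbfs events are compiled into the running kernel.

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        /* Writing "1" to the group-level enable file turns on all hugetlbfs events. */
        const char *path = "/sys/kernel/tracing/events/hugetlbfs/enable";
        int fd = open(path, O_WRONLY);

        if (fd < 0) {
                perror(path);
                return 1;
        }
        if (write(fd, "1", 1) != 1)
                perror("write");
        close(fd);
        return 0;
}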
913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 | // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/fcntl.c * * Copyright (C) 1991, 1992 Linus Torvalds */ #include <linux/syscalls.h> #include <linux/init.h> #include <linux/mm.h> #include <linux/sched/task.h> #include <linux/fs.h> #include <linux/filelock.h> #include <linux/file.h> #include <linux/capability.h> #include <linux/dnotify.h> #include <linux/slab.h> #include <linux/module.h> #include <linux/pipe_fs_i.h> #include <linux/security.h> #include <linux/ptrace.h> #include <linux/signal.h> #include <linux/rcupdate.h> #include <linux/pid_namespace.h> #include <linux/user_namespace.h> #include <linux/memfd.h> #include <linux/compat.h> #include <linux/mount.h> #include <linux/rw_hint.h> #include <linux/poll.h> #include <asm/siginfo.h> #include <linux/uaccess.h> #include "internal.h" #define SETFL_MASK (O_APPEND | O_NONBLOCK | O_NDELAY | O_DIRECT | O_NOATIME) static int setfl(int fd, struct file * filp, unsigned int arg) { struct inode * inode = file_inode(filp); int error = 0; /* * O_APPEND cannot be cleared if the file is marked as append-only * and the file is open for write. */ if (((arg ^ filp->f_flags) & O_APPEND) && IS_APPEND(inode)) return -EPERM; /* O_NOATIME can only be set by the owner or superuser */ if ((arg & O_NOATIME) && !(filp->f_flags & O_NOATIME)) if (!inode_owner_or_capable(file_mnt_idmap(filp), inode)) return -EPERM; /* required for strict SunOS emulation */ if (O_NONBLOCK != O_NDELAY) if (arg & O_NDELAY) arg |= O_NONBLOCK; /* Pipe packetized mode is controlled by O_DIRECT flag */ if (!S_ISFIFO(inode->i_mode) && (arg & O_DIRECT) && !(filp->f_mode & FMODE_CAN_ODIRECT)) return -EINVAL; if (filp->f_op->check_flags) error = filp->f_op->check_flags(arg); if (error) return error; /* * ->fasync() is responsible for setting the FASYNC bit. */ if (((arg ^ filp->f_flags) & FASYNC) && filp->f_op->fasync) { error = filp->f_op->fasync(fd, filp, (arg & FASYNC) != 0); if (error < 0) goto out; if (error > 0) error = 0; } spin_lock(&filp->f_lock); filp->f_flags = (arg & SETFL_MASK) | (filp->f_flags & ~SETFL_MASK); filp->f_iocb_flags = iocb_flags(filp); spin_unlock(&filp->f_lock); out: return error; } /* * Allocate an file->f_owner struct if it doesn't exist, handling racing * allocations correctly. 
*/ int file_f_owner_allocate(struct file *file) { struct fown_struct *f_owner; f_owner = file_f_owner(file); if (f_owner) return 0; f_owner = kzalloc(sizeof(struct fown_struct), GFP_KERNEL); if (!f_owner) return -ENOMEM; rwlock_init(&f_owner->lock); f_owner->file = file; /* If someone else raced us, drop our allocation. */ if (unlikely(cmpxchg(&file->f_owner, NULL, f_owner))) kfree(f_owner); return 0; } EXPORT_SYMBOL(file_f_owner_allocate); void file_f_owner_release(struct file *file) { struct fown_struct *f_owner; f_owner = file_f_owner(file); if (f_owner) { put_pid(f_owner->pid); kfree(f_owner); } } void __f_setown(struct file *filp, struct pid *pid, enum pid_type type, int force) { struct fown_struct *f_owner; f_owner = file_f_owner(filp); if (WARN_ON_ONCE(!f_owner)) return; write_lock_irq(&f_owner->lock); if (force || !f_owner->pid) { put_pid(f_owner->pid); f_owner->pid = get_pid(pid); f_owner->pid_type = type; if (pid) { const struct cred *cred = current_cred(); security_file_set_fowner(filp); f_owner->uid = cred->uid; f_owner->euid = cred->euid; } } write_unlock_irq(&f_owner->lock); } EXPORT_SYMBOL(__f_setown); int f_setown(struct file *filp, int who, int force) { enum pid_type type; struct pid *pid = NULL; int ret = 0; might_sleep(); type = PIDTYPE_TGID; if (who < 0) { /* avoid overflow below */ if (who == INT_MIN) return -EINVAL; type = PIDTYPE_PGID; who = -who; } ret = file_f_owner_allocate(filp); if (ret) return ret; rcu_read_lock(); if (who) { pid = find_vpid(who); if (!pid) ret = -ESRCH; } if (!ret) __f_setown(filp, pid, type, force); rcu_read_unlock(); return ret; } EXPORT_SYMBOL(f_setown); void f_delown(struct file *filp) { __f_setown(filp, NULL, PIDTYPE_TGID, 1); } pid_t f_getown(struct file *filp) { pid_t pid = 0; struct fown_struct *f_owner; f_owner = file_f_owner(filp); if (!f_owner) return pid; read_lock_irq(&f_owner->lock); rcu_read_lock(); if (pid_task(f_owner->pid, f_owner->pid_type)) { pid = pid_vnr(f_owner->pid); if (f_owner->pid_type == PIDTYPE_PGID) pid = -pid; } rcu_read_unlock(); read_unlock_irq(&f_owner->lock); return pid; } static int f_setown_ex(struct file *filp, unsigned long arg) { struct f_owner_ex __user *owner_p = (void __user *)arg; struct f_owner_ex owner; struct pid *pid; int type; int ret; ret = copy_from_user(&owner, owner_p, sizeof(owner)); if (ret) return -EFAULT; switch (owner.type) { case F_OWNER_TID: type = PIDTYPE_PID; break; case F_OWNER_PID: type = PIDTYPE_TGID; break; case F_OWNER_PGRP: type = PIDTYPE_PGID; break; default: return -EINVAL; } ret = file_f_owner_allocate(filp); if (ret) return ret; rcu_read_lock(); pid = find_vpid(owner.pid); if (owner.pid && !pid) ret = -ESRCH; else __f_setown(filp, pid, type, 1); rcu_read_unlock(); return ret; } static int f_getown_ex(struct file *filp, unsigned long arg) { struct f_owner_ex __user *owner_p = (void __user *)arg; struct f_owner_ex owner = {}; int ret = 0; struct fown_struct *f_owner; enum pid_type pid_type = PIDTYPE_PID; f_owner = file_f_owner(filp); if (f_owner) { read_lock_irq(&f_owner->lock); rcu_read_lock(); if (pid_task(f_owner->pid, f_owner->pid_type)) owner.pid = pid_vnr(f_owner->pid); rcu_read_unlock(); pid_type = f_owner->pid_type; } switch (pid_type) { case PIDTYPE_PID: owner.type = F_OWNER_TID; break; case PIDTYPE_TGID: owner.type = F_OWNER_PID; break; case PIDTYPE_PGID: owner.type = F_OWNER_PGRP; break; default: WARN_ON(1); ret = -EINVAL; break; } if (f_owner) read_unlock_irq(&f_owner->lock); if (!ret) { ret = copy_to_user(owner_p, &owner, sizeof(owner)); if (ret) ret = -EFAULT; 
} return ret; } #ifdef CONFIG_CHECKPOINT_RESTORE static int f_getowner_uids(struct file *filp, unsigned long arg) { struct user_namespace *user_ns = current_user_ns(); struct fown_struct *f_owner; uid_t __user *dst = (void __user *)arg; uid_t src[2] = {0, 0}; int err; f_owner = file_f_owner(filp); if (f_owner) { read_lock_irq(&f_owner->lock); src[0] = from_kuid(user_ns, f_owner->uid); src[1] = from_kuid(user_ns, f_owner->euid); read_unlock_irq(&f_owner->lock); } err = put_user(src[0], &dst[0]); err |= put_user(src[1], &dst[1]); return err; } #else static int f_getowner_uids(struct file *filp, unsigned long arg) { return -EINVAL; } #endif static bool rw_hint_valid(u64 hint) { BUILD_BUG_ON(WRITE_LIFE_NOT_SET != RWH_WRITE_LIFE_NOT_SET); BUILD_BUG_ON(WRITE_LIFE_NONE != RWH_WRITE_LIFE_NONE); BUILD_BUG_ON(WRITE_LIFE_SHORT != RWH_WRITE_LIFE_SHORT); BUILD_BUG_ON(WRITE_LIFE_MEDIUM != RWH_WRITE_LIFE_MEDIUM); BUILD_BUG_ON(WRITE_LIFE_LONG != RWH_WRITE_LIFE_LONG); BUILD_BUG_ON(WRITE_LIFE_EXTREME != RWH_WRITE_LIFE_EXTREME); switch (hint) { case RWH_WRITE_LIFE_NOT_SET: case RWH_WRITE_LIFE_NONE: case RWH_WRITE_LIFE_SHORT: case RWH_WRITE_LIFE_MEDIUM: case RWH_WRITE_LIFE_LONG: case RWH_WRITE_LIFE_EXTREME: return true; default: return false; } } static long fcntl_get_rw_hint(struct file *file, unsigned int cmd, unsigned long arg) { struct inode *inode = file_inode(file); u64 __user *argp = (u64 __user *)arg; u64 hint = READ_ONCE(inode->i_write_hint); if (copy_to_user(argp, &hint, sizeof(*argp))) return -EFAULT; return 0; } static long fcntl_set_rw_hint(struct file *file, unsigned int cmd, unsigned long arg) { struct inode *inode = file_inode(file); u64 __user *argp = (u64 __user *)arg; u64 hint; if (!inode_owner_or_capable(file_mnt_idmap(file), inode)) return -EPERM; if (copy_from_user(&hint, argp, sizeof(hint))) return -EFAULT; if (!rw_hint_valid(hint)) return -EINVAL; WRITE_ONCE(inode->i_write_hint, hint); /* * file->f_mapping->host may differ from inode. As an example, * blkdev_open() modifies file->f_mapping. */ if (file->f_mapping->host != inode) WRITE_ONCE(file->f_mapping->host->i_write_hint, hint); return 0; } /* Is the file descriptor a dup of the file? */ static long f_dupfd_query(int fd, struct file *filp) { CLASS(fd_raw, f)(fd); if (fd_empty(f)) return -EBADF; /* * We can do the 'fdput()' immediately, as the only thing that * matters is the pointer value which isn't changed by the fdput. * * Technically we didn't need a ref at all, and 'fdget()' was * overkill, but given our lockless file pointer lookup, the * alternatives are complicated. */ return fd_file(f) == filp; } /* Let the caller figure out whether a given file was just created. 
*/ static long f_created_query(const struct file *filp) { return !!(filp->f_mode & FMODE_CREATED); } static int f_owner_sig(struct file *filp, int signum, bool setsig) { int ret = 0; struct fown_struct *f_owner; might_sleep(); if (setsig) { if (!valid_signal(signum)) return -EINVAL; ret = file_f_owner_allocate(filp); if (ret) return ret; } f_owner = file_f_owner(filp); if (setsig) f_owner->signum = signum; else if (f_owner) ret = f_owner->signum; return ret; } static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, struct file *filp) { void __user *argp = (void __user *)arg; int argi = (int)arg; struct flock flock; long err = -EINVAL; switch (cmd) { case F_CREATED_QUERY: err = f_created_query(filp); break; case F_DUPFD: err = f_dupfd(argi, filp, 0); break; case F_DUPFD_CLOEXEC: err = f_dupfd(argi, filp, O_CLOEXEC); break; case F_DUPFD_QUERY: err = f_dupfd_query(argi, filp); break; case F_GETFD: err = get_close_on_exec(fd) ? FD_CLOEXEC : 0; break; case F_SETFD: err = 0; set_close_on_exec(fd, argi & FD_CLOEXEC); break; case F_GETFL: err = filp->f_flags; break; case F_SETFL: err = setfl(fd, filp, argi); break; #if BITS_PER_LONG != 32 /* 32-bit arches must use fcntl64() */ case F_OFD_GETLK: #endif case F_GETLK: if (copy_from_user(&flock, argp, sizeof(flock))) return -EFAULT; err = fcntl_getlk(filp, cmd, &flock); if (!err && copy_to_user(argp, &flock, sizeof(flock))) return -EFAULT; break; #if BITS_PER_LONG != 32 /* 32-bit arches must use fcntl64() */ case F_OFD_SETLK: case F_OFD_SETLKW: fallthrough; #endif case F_SETLK: case F_SETLKW: if (copy_from_user(&flock, argp, sizeof(flock))) return -EFAULT; err = fcntl_setlk(fd, filp, cmd, &flock); break; case F_GETOWN: /* * XXX If f_owner is a process group, the * negative return value will get converted * into an error. Oops. If we keep the * current syscall conventions, the only way * to fix this will be in libc. 
*/ err = f_getown(filp); force_successful_syscall_return(); break; case F_SETOWN: err = f_setown(filp, argi, 1); break; case F_GETOWN_EX: err = f_getown_ex(filp, arg); break; case F_SETOWN_EX: err = f_setown_ex(filp, arg); break; case F_GETOWNER_UIDS: err = f_getowner_uids(filp, arg); break; case F_GETSIG: err = f_owner_sig(filp, 0, false); break; case F_SETSIG: err = f_owner_sig(filp, argi, true); break; case F_GETLEASE: err = fcntl_getlease(filp); break; case F_SETLEASE: err = fcntl_setlease(fd, filp, argi); break; case F_NOTIFY: err = fcntl_dirnotify(fd, filp, argi); break; case F_SETPIPE_SZ: case F_GETPIPE_SZ: err = pipe_fcntl(filp, cmd, argi); break; case F_ADD_SEALS: case F_GET_SEALS: err = memfd_fcntl(filp, cmd, argi); break; case F_GET_RW_HINT: err = fcntl_get_rw_hint(filp, cmd, arg); break; case F_SET_RW_HINT: err = fcntl_set_rw_hint(filp, cmd, arg); break; default: break; } return err; } static int check_fcntl_cmd(unsigned cmd) { switch (cmd) { case F_CREATED_QUERY: case F_DUPFD: case F_DUPFD_CLOEXEC: case F_DUPFD_QUERY: case F_GETFD: case F_SETFD: case F_GETFL: return 1; } return 0; } SYSCALL_DEFINE3(fcntl, unsigned int, fd, unsigned int, cmd, unsigned long, arg) { CLASS(fd_raw, f)(fd); long err; if (fd_empty(f)) return -EBADF; if (unlikely(fd_file(f)->f_mode & FMODE_PATH)) { if (!check_fcntl_cmd(cmd)) return -EBADF; } err = security_file_fcntl(fd_file(f), cmd, arg); if (!err) err = do_fcntl(fd, cmd, arg, fd_file(f)); return err; } #if BITS_PER_LONG == 32 SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd, unsigned long, arg) { void __user *argp = (void __user *)arg; CLASS(fd_raw, f)(fd); struct flock64 flock; long err; if (fd_empty(f)) return -EBADF; if (unlikely(fd_file(f)->f_mode & FMODE_PATH)) { if (!check_fcntl_cmd(cmd)) return -EBADF; } err = security_file_fcntl(fd_file(f), cmd, arg); if (err) return err; switch (cmd) { case F_GETLK64: case F_OFD_GETLK: err = -EFAULT; if (copy_from_user(&flock, argp, sizeof(flock))) break; err = fcntl_getlk64(fd_file(f), cmd, &flock); if (!err && copy_to_user(argp, &flock, sizeof(flock))) err = -EFAULT; break; case F_SETLK64: case F_SETLKW64: case F_OFD_SETLK: case F_OFD_SETLKW: err = -EFAULT; if (copy_from_user(&flock, argp, sizeof(flock))) break; err = fcntl_setlk64(fd, fd_file(f), cmd, &flock); break; default: err = do_fcntl(fd, cmd, arg, fd_file(f)); break; } return err; } #endif #ifdef CONFIG_COMPAT /* careful - don't use anywhere else */ #define copy_flock_fields(dst, src) \ (dst)->l_type = (src)->l_type; \ (dst)->l_whence = (src)->l_whence; \ (dst)->l_start = (src)->l_start; \ (dst)->l_len = (src)->l_len; \ (dst)->l_pid = (src)->l_pid; static int get_compat_flock(struct flock *kfl, const struct compat_flock __user *ufl) { struct compat_flock fl; if (copy_from_user(&fl, ufl, sizeof(struct compat_flock))) return -EFAULT; copy_flock_fields(kfl, &fl); return 0; } static int get_compat_flock64(struct flock *kfl, const struct compat_flock64 __user *ufl) { struct compat_flock64 fl; if (copy_from_user(&fl, ufl, sizeof(struct compat_flock64))) return -EFAULT; copy_flock_fields(kfl, &fl); return 0; } static int put_compat_flock(const struct flock *kfl, struct compat_flock __user *ufl) { struct compat_flock fl; memset(&fl, 0, sizeof(struct compat_flock)); copy_flock_fields(&fl, kfl); if (copy_to_user(ufl, &fl, sizeof(struct compat_flock))) return -EFAULT; return 0; } static int put_compat_flock64(const struct flock *kfl, struct compat_flock64 __user *ufl) { struct compat_flock64 fl; BUILD_BUG_ON(sizeof(kfl->l_start) > 
sizeof(ufl->l_start)); BUILD_BUG_ON(sizeof(kfl->l_len) > sizeof(ufl->l_len)); memset(&fl, 0, sizeof(struct compat_flock64)); copy_flock_fields(&fl, kfl); if (copy_to_user(ufl, &fl, sizeof(struct compat_flock64))) return -EFAULT; return 0; } #undef copy_flock_fields static unsigned int convert_fcntl_cmd(unsigned int cmd) { switch (cmd) { case F_GETLK64: return F_GETLK; case F_SETLK64: return F_SETLK; case F_SETLKW64: return F_SETLKW; } return cmd; } /* * GETLK was successful and we need to return the data, but it needs to fit in * the compat structure. * l_start shouldn't be too big, unless the original start + end is greater than * COMPAT_OFF_T_MAX, in which case the app was asking for trouble, so we return * -EOVERFLOW in that case. l_len could be too big, in which case we just * truncate it, and only allow the app to see that part of the conflicting lock * that might make sense to it anyway */ static int fixup_compat_flock(struct flock *flock) { if (flock->l_start > COMPAT_OFF_T_MAX) return -EOVERFLOW; if (flock->l_len > COMPAT_OFF_T_MAX) flock->l_len = COMPAT_OFF_T_MAX; return 0; } static long do_compat_fcntl64(unsigned int fd, unsigned int cmd, compat_ulong_t arg) { CLASS(fd_raw, f)(fd); struct flock flock; long err; if (fd_empty(f)) return -EBADF; if (unlikely(fd_file(f)->f_mode & FMODE_PATH)) { if (!check_fcntl_cmd(cmd)) return -EBADF; } err = security_file_fcntl(fd_file(f), cmd, arg); if (err) return err; switch (cmd) { case F_GETLK: err = get_compat_flock(&flock, compat_ptr(arg)); if (err) break; err = fcntl_getlk(fd_file(f), convert_fcntl_cmd(cmd), &flock); if (err) break; err = fixup_compat_flock(&flock); if (!err) err = put_compat_flock(&flock, compat_ptr(arg)); break; case F_GETLK64: case F_OFD_GETLK: err = get_compat_flock64(&flock, compat_ptr(arg)); if (err) break; err = fcntl_getlk(fd_file(f), convert_fcntl_cmd(cmd), &flock); if (!err) err = put_compat_flock64(&flock, compat_ptr(arg)); break; case F_SETLK: case F_SETLKW: err = get_compat_flock(&flock, compat_ptr(arg)); if (err) break; err = fcntl_setlk(fd, fd_file(f), convert_fcntl_cmd(cmd), &flock); break; case F_SETLK64: case F_SETLKW64: case F_OFD_SETLK: case F_OFD_SETLKW: err = get_compat_flock64(&flock, compat_ptr(arg)); if (err) break; err = fcntl_setlk(fd, fd_file(f), convert_fcntl_cmd(cmd), &flock); break; default: err = do_fcntl(fd, cmd, arg, fd_file(f)); break; } return err; } COMPAT_SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd, compat_ulong_t, arg) { return do_compat_fcntl64(fd, cmd, arg); } COMPAT_SYSCALL_DEFINE3(fcntl, unsigned int, fd, unsigned int, cmd, compat_ulong_t, arg) { switch (cmd) { case F_GETLK64: case F_SETLK64: case F_SETLKW64: case F_OFD_GETLK: case F_OFD_SETLK: case F_OFD_SETLKW: return -EINVAL; } return do_compat_fcntl64(fd, cmd, arg); } #endif /* Table to convert sigio signal codes into poll band bitmaps */ static const __poll_t band_table[NSIGPOLL] = { EPOLLIN | EPOLLRDNORM, /* POLL_IN */ EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND, /* POLL_OUT */ EPOLLIN | EPOLLRDNORM | EPOLLMSG, /* POLL_MSG */ EPOLLERR, /* POLL_ERR */ EPOLLPRI | EPOLLRDBAND, /* POLL_PRI */ EPOLLHUP | EPOLLERR /* POLL_HUP */ }; static inline int sigio_perm(struct task_struct *p, struct fown_struct *fown, int sig) { const struct cred *cred; int ret; rcu_read_lock(); cred = __task_cred(p); ret = ((uid_eq(fown->euid, GLOBAL_ROOT_UID) || uid_eq(fown->euid, cred->suid) || uid_eq(fown->euid, cred->uid) || uid_eq(fown->uid, cred->suid) || uid_eq(fown->uid, cred->uid)) && !security_file_send_sigiotask(p, fown, sig)); 
rcu_read_unlock(); return ret; } static void send_sigio_to_task(struct task_struct *p, struct fown_struct *fown, int fd, int reason, enum pid_type type) { /* * F_SETSIG can change ->signum lockless in parallel, make * sure we read it once and use the same value throughout. */ int signum = READ_ONCE(fown->signum); if (!sigio_perm(p, fown, signum)) return; switch (signum) { default: { kernel_siginfo_t si; /* Queue a rt signal with the appropriate fd as its value. We use SI_SIGIO as the source, not SI_KERNEL, since kernel signals always get delivered even if we can't queue. Failure to queue in this case _should_ be reported; we fall back to SIGIO in that case. --sct */ clear_siginfo(&si); si.si_signo = signum; si.si_errno = 0; si.si_code = reason; /* * Posix definies POLL_IN and friends to be signal * specific si_codes for SIG_POLL. Linux extended * these si_codes to other signals in a way that is * ambiguous if other signals also have signal * specific si_codes. In that case use SI_SIGIO instead * to remove the ambiguity. */ if ((signum != SIGPOLL) && sig_specific_sicodes(signum)) si.si_code = SI_SIGIO; /* Make sure we are called with one of the POLL_* reasons, otherwise we could leak kernel stack into userspace. */ BUG_ON((reason < POLL_IN) || ((reason - POLL_IN) >= NSIGPOLL)); if (reason - POLL_IN >= NSIGPOLL) si.si_band = ~0L; else si.si_band = mangle_poll(band_table[reason - POLL_IN]); si.si_fd = fd; if (!do_send_sig_info(signum, &si, p, type)) break; } fallthrough; /* fall back on the old plain SIGIO signal */ case 0: do_send_sig_info(SIGIO, SEND_SIG_PRIV, p, type); } } void send_sigio(struct fown_struct *fown, int fd, int band) { struct task_struct *p; enum pid_type type; unsigned long flags; struct pid *pid; read_lock_irqsave(&fown->lock, flags); type = fown->pid_type; pid = fown->pid; if (!pid) goto out_unlock_fown; if (type <= PIDTYPE_TGID) { rcu_read_lock(); p = pid_task(pid, PIDTYPE_PID); if (p) send_sigio_to_task(p, fown, fd, band, type); rcu_read_unlock(); } else { read_lock(&tasklist_lock); do_each_pid_task(pid, type, p) { send_sigio_to_task(p, fown, fd, band, type); } while_each_pid_task(pid, type, p); read_unlock(&tasklist_lock); } out_unlock_fown: read_unlock_irqrestore(&fown->lock, flags); } static void send_sigurg_to_task(struct task_struct *p, struct fown_struct *fown, enum pid_type type) { if (sigio_perm(p, fown, SIGURG)) do_send_sig_info(SIGURG, SEND_SIG_PRIV, p, type); } int send_sigurg(struct file *file) { struct fown_struct *fown; struct task_struct *p; enum pid_type type; struct pid *pid; unsigned long flags; int ret = 0; fown = file_f_owner(file); if (!fown) return 0; read_lock_irqsave(&fown->lock, flags); type = fown->pid_type; pid = fown->pid; if (!pid) goto out_unlock_fown; ret = 1; if (type <= PIDTYPE_TGID) { rcu_read_lock(); p = pid_task(pid, PIDTYPE_PID); if (p) send_sigurg_to_task(p, fown, type); rcu_read_unlock(); } else { read_lock(&tasklist_lock); do_each_pid_task(pid, type, p) { send_sigurg_to_task(p, fown, type); } while_each_pid_task(pid, type, p); read_unlock(&tasklist_lock); } out_unlock_fown: read_unlock_irqrestore(&fown->lock, flags); return ret; } static DEFINE_SPINLOCK(fasync_lock); static struct kmem_cache *fasync_cache __ro_after_init; /* * Remove a fasync entry. If successfully removed, return * positive and clear the FASYNC flag. If no entry exists, * do nothing and return 0. * * NOTE! It is very important that the FASYNC flag always * match the state "is the filp on a fasync list". 
* */ int fasync_remove_entry(struct file *filp, struct fasync_struct **fapp) { struct fasync_struct *fa, **fp; int result = 0; spin_lock(&filp->f_lock); spin_lock(&fasync_lock); for (fp = fapp; (fa = *fp) != NULL; fp = &fa->fa_next) { if (fa->fa_file != filp) continue; write_lock_irq(&fa->fa_lock); fa->fa_file = NULL; write_unlock_irq(&fa->fa_lock); *fp = fa->fa_next; kfree_rcu(fa, fa_rcu); filp->f_flags &= ~FASYNC; result = 1; break; } spin_unlock(&fasync_lock); spin_unlock(&filp->f_lock); return result; } struct fasync_struct *fasync_alloc(void) { return kmem_cache_alloc(fasync_cache, GFP_KERNEL); } /* * NOTE! This can be used only for unused fasync entries: * entries that actually got inserted on the fasync list * need to be released by rcu - see fasync_remove_entry. */ void fasync_free(struct fasync_struct *new) { kmem_cache_free(fasync_cache, new); } /* * Insert a new entry into the fasync list. Return the pointer to the * old one if we didn't use the new one. * * NOTE! It is very important that the FASYNC flag always * match the state "is the filp on a fasync list". */ struct fasync_struct *fasync_insert_entry(int fd, struct file *filp, struct fasync_struct **fapp, struct fasync_struct *new) { struct fasync_struct *fa, **fp; spin_lock(&filp->f_lock); spin_lock(&fasync_lock); for (fp = fapp; (fa = *fp) != NULL; fp = &fa->fa_next) { if (fa->fa_file != filp) continue; write_lock_irq(&fa->fa_lock); fa->fa_fd = fd; write_unlock_irq(&fa->fa_lock); goto out; } rwlock_init(&new->fa_lock); new->magic = FASYNC_MAGIC; new->fa_file = filp; new->fa_fd = fd; new->fa_next = *fapp; rcu_assign_pointer(*fapp, new); filp->f_flags |= FASYNC; out: spin_unlock(&fasync_lock); spin_unlock(&filp->f_lock); return fa; } /* * Add a fasync entry. Return negative on error, positive if * added, and zero if did nothing but change an existing one. */ static int fasync_add_entry(int fd, struct file *filp, struct fasync_struct **fapp) { struct fasync_struct *new; new = fasync_alloc(); if (!new) return -ENOMEM; /* * fasync_insert_entry() returns the old (update) entry if * it existed. * * So free the (unused) new entry and return 0 to let the * caller know that we didn't add any new fasync entries. */ if (fasync_insert_entry(fd, filp, fapp, new)) { fasync_free(new); return 0; } return 1; } /* * fasync_helper() is used by almost all character device drivers * to set up the fasync queue, and for regular files by the file * lease code. It returns negative on error, 0 if it did no changes * and positive if it added/deleted the entry. */ int fasync_helper(int fd, struct file * filp, int on, struct fasync_struct **fapp) { if (!on) return fasync_remove_entry(filp, fapp); return fasync_add_entry(fd, filp, fapp); } EXPORT_SYMBOL(fasync_helper); /* * rcu_read_lock() is held */ static void kill_fasync_rcu(struct fasync_struct *fa, int sig, int band) { while (fa) { struct fown_struct *fown; unsigned long flags; if (fa->magic != FASYNC_MAGIC) { printk(KERN_ERR "kill_fasync: bad magic number in " "fasync_struct!\n"); return; } read_lock_irqsave(&fa->fa_lock, flags); if (fa->fa_file) { fown = file_f_owner(fa->fa_file); if (!fown) goto next; /* Don't send SIGURG to processes which have not set a queued signum: SIGURG has its own default signalling mechanism. 
*/ if (!(sig == SIGURG && fown->signum == 0)) send_sigio(fown, fa->fa_fd, band); } next: read_unlock_irqrestore(&fa->fa_lock, flags); fa = rcu_dereference(fa->fa_next); } } void kill_fasync(struct fasync_struct **fp, int sig, int band) { /* First a quick test without locking: usually * the list is empty. */ if (*fp) { rcu_read_lock(); kill_fasync_rcu(rcu_dereference(*fp), sig, band); rcu_read_unlock(); } } EXPORT_SYMBOL(kill_fasync); static int __init fcntl_init(void) { /* * Please add new bits here to ensure allocation uniqueness. * Exceptions: O_NONBLOCK is a two bit define on parisc; O_NDELAY * is defined as O_NONBLOCK on some platforms and not on others. */ BUILD_BUG_ON(20 - 1 /* for O_RDONLY being 0 */ != HWEIGHT32( (VALID_OPEN_FLAGS & ~(O_NONBLOCK | O_NDELAY)) | __FMODE_EXEC)); fasync_cache = kmem_cache_create("fasync_cache", sizeof(struct fasync_struct), 0, SLAB_PANIC | SLAB_ACCOUNT, NULL); return 0; } module_init(fcntl_init) |
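The F_SETOWN/O_ASYNC path above (f_setown(), setfl() and the fasync helpers) is normally driven from userspace as in the following minimal sketch; it is illustrative only, and error handling is abbreviated.

#define _GNU_SOURCE
#include <fcntl.h>
#include <signal.h>
#include <stdio.h>
#include <unistd.h>

static volatile sig_atomic_t got_sigio;

static void on_sigio(int sig)
{
        got_sigio = 1;  /* async-signal-safe: only set a flag */
}

int main(void)
{
        int fd = STDIN_FILENO;
        int flags;

        signal(SIGIO, on_sigio);

        /* F_SETOWN -> f_setown(): deliver SIGIO for this fd to our process. */
        if (fcntl(fd, F_SETOWN, getpid()) == -1)
                perror("F_SETOWN");

        /* O_ASYNC (FASYNC) is enabled via setfl() -> ->fasync() in the kernel. */
        flags = fcntl(fd, F_GETFL);
        if (flags == -1 || fcntl(fd, F_SETFL, flags | O_ASYNC) == -1)
                perror("F_SETFL O_ASYNC");

        pause();        /* wait for a readiness notification */
        printf("got SIGIO: %d\n", (int)got_sigio);
        return 0;
}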
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Landlock LSM - Ruleset management
 *
 * Copyright © 2016-2020 Mickaël Salaün <mic@digikod.net>
 * Copyright © 2018-2020 ANSSI
 */

#include <linux/bits.h>
#include <linux/bug.h>
#include <linux/cleanup.h>
#include <linux/compiler_types.h>
#include <linux/err.h>
#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/lockdep.h>
#include <linux/mutex.h>
#include <linux/overflow.h>
#include <linux/rbtree.h>
#include <linux/refcount.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/workqueue.h>

#include "access.h"
#include "limits.h"
#include "object.h"
#include "ruleset.h"

static struct landlock_ruleset
*create_ruleset(const u32 num_layers) { struct landlock_ruleset *new_ruleset; new_ruleset = kzalloc(struct_size(new_ruleset, access_masks, num_layers), GFP_KERNEL_ACCOUNT); if (!new_ruleset) return ERR_PTR(-ENOMEM); refcount_set(&new_ruleset->usage, 1); mutex_init(&new_ruleset->lock); new_ruleset->root_inode = RB_ROOT; #if IS_ENABLED(CONFIG_INET) new_ruleset->root_net_port = RB_ROOT; #endif /* IS_ENABLED(CONFIG_INET) */ new_ruleset->num_layers = num_layers; /* * hierarchy = NULL * num_rules = 0 * access_masks[] = 0 */ return new_ruleset; } struct landlock_ruleset * landlock_create_ruleset(const access_mask_t fs_access_mask, const access_mask_t net_access_mask, const access_mask_t scope_mask) { struct landlock_ruleset *new_ruleset; /* Informs about useless ruleset. */ if (!fs_access_mask && !net_access_mask && !scope_mask) return ERR_PTR(-ENOMSG); new_ruleset = create_ruleset(1); if (IS_ERR(new_ruleset)) return new_ruleset; if (fs_access_mask) landlock_add_fs_access_mask(new_ruleset, fs_access_mask, 0); if (net_access_mask) landlock_add_net_access_mask(new_ruleset, net_access_mask, 0); if (scope_mask) landlock_add_scope_mask(new_ruleset, scope_mask, 0); return new_ruleset; } static void build_check_rule(void) { const struct landlock_rule rule = { .num_layers = ~0, }; BUILD_BUG_ON(rule.num_layers < LANDLOCK_MAX_NUM_LAYERS); } static bool is_object_pointer(const enum landlock_key_type key_type) { switch (key_type) { case LANDLOCK_KEY_INODE: return true; #if IS_ENABLED(CONFIG_INET) case LANDLOCK_KEY_NET_PORT: return false; #endif /* IS_ENABLED(CONFIG_INET) */ default: WARN_ON_ONCE(1); return false; } } static struct landlock_rule * create_rule(const struct landlock_id id, const struct landlock_layer (*const layers)[], const u32 num_layers, const struct landlock_layer *const new_layer) { struct landlock_rule *new_rule; u32 new_num_layers; build_check_rule(); if (new_layer) { /* Should already be checked by landlock_merge_ruleset(). */ if (WARN_ON_ONCE(num_layers >= LANDLOCK_MAX_NUM_LAYERS)) return ERR_PTR(-E2BIG); new_num_layers = num_layers + 1; } else { new_num_layers = num_layers; } new_rule = kzalloc(struct_size(new_rule, layers, new_num_layers), GFP_KERNEL_ACCOUNT); if (!new_rule) return ERR_PTR(-ENOMEM); RB_CLEAR_NODE(&new_rule->node); if (is_object_pointer(id.type)) { /* This should be catched by insert_rule(). */ WARN_ON_ONCE(!id.key.object); landlock_get_object(id.key.object); } new_rule->key = id.key; new_rule->num_layers = new_num_layers; /* Copies the original layer stack. */ memcpy(new_rule->layers, layers, flex_array_size(new_rule, layers, num_layers)); if (new_layer) /* Adds a copy of @new_layer on the layer stack. 
*/ new_rule->layers[new_rule->num_layers - 1] = *new_layer; return new_rule; } static struct rb_root *get_root(struct landlock_ruleset *const ruleset, const enum landlock_key_type key_type) { switch (key_type) { case LANDLOCK_KEY_INODE: return &ruleset->root_inode; #if IS_ENABLED(CONFIG_INET) case LANDLOCK_KEY_NET_PORT: return &ruleset->root_net_port; #endif /* IS_ENABLED(CONFIG_INET) */ default: WARN_ON_ONCE(1); return ERR_PTR(-EINVAL); } } static void free_rule(struct landlock_rule *const rule, const enum landlock_key_type key_type) { might_sleep(); if (!rule) return; if (is_object_pointer(key_type)) landlock_put_object(rule->key.object); kfree(rule); } static void build_check_ruleset(void) { const struct landlock_ruleset ruleset = { .num_rules = ~0, .num_layers = ~0, }; BUILD_BUG_ON(ruleset.num_rules < LANDLOCK_MAX_NUM_RULES); BUILD_BUG_ON(ruleset.num_layers < LANDLOCK_MAX_NUM_LAYERS); } /** * insert_rule - Create and insert a rule in a ruleset * * @ruleset: The ruleset to be updated. * @id: The ID to build the new rule with. The underlying kernel object, if * any, must be held by the caller. * @layers: One or multiple layers to be copied into the new rule. * @num_layers: The number of @layers entries. * * When user space requests to add a new rule to a ruleset, @layers only * contains one entry and this entry is not assigned to any level. In this * case, the new rule will extend @ruleset, similarly to a boolean OR between * access rights. * * When merging a ruleset in a domain, or copying a domain, @layers will be * added to @ruleset as new constraints, similarly to a boolean AND between * access rights. */ static int insert_rule(struct landlock_ruleset *const ruleset, const struct landlock_id id, const struct landlock_layer (*const layers)[], const size_t num_layers) { struct rb_node **walker_node; struct rb_node *parent_node = NULL; struct landlock_rule *new_rule; struct rb_root *root; might_sleep(); lockdep_assert_held(&ruleset->lock); if (WARN_ON_ONCE(!layers)) return -ENOENT; if (is_object_pointer(id.type) && WARN_ON_ONCE(!id.key.object)) return -ENOENT; root = get_root(ruleset, id.type); if (IS_ERR(root)) return PTR_ERR(root); walker_node = &root->rb_node; while (*walker_node) { struct landlock_rule *const this = rb_entry(*walker_node, struct landlock_rule, node); if (this->key.data != id.key.data) { parent_node = *walker_node; if (this->key.data < id.key.data) walker_node = &((*walker_node)->rb_right); else walker_node = &((*walker_node)->rb_left); continue; } /* Only a single-level layer should match an existing rule. */ if (WARN_ON_ONCE(num_layers != 1)) return -EINVAL; /* If there is a matching rule, updates it. */ if ((*layers)[0].level == 0) { /* * Extends access rights when the request comes from * landlock_add_rule(2), i.e. @ruleset is not a domain. */ if (WARN_ON_ONCE(this->num_layers != 1)) return -EINVAL; if (WARN_ON_ONCE(this->layers[0].level != 0)) return -EINVAL; this->layers[0].access |= (*layers)[0].access; return 0; } if (WARN_ON_ONCE(this->layers[0].level == 0)) return -EINVAL; /* * Intersects access rights when it is a merge between a * ruleset and a domain. */ new_rule = create_rule(id, &this->layers, this->num_layers, &(*layers)[0]); if (IS_ERR(new_rule)) return PTR_ERR(new_rule); rb_replace_node(&this->node, &new_rule->node, root); free_rule(this, id.type); return 0; } /* There is no match for @id. 
*/ build_check_ruleset(); if (ruleset->num_rules >= LANDLOCK_MAX_NUM_RULES) return -E2BIG; new_rule = create_rule(id, layers, num_layers, NULL); if (IS_ERR(new_rule)) return PTR_ERR(new_rule); rb_link_node(&new_rule->node, parent_node, walker_node); rb_insert_color(&new_rule->node, root); ruleset->num_rules++; return 0; } static void build_check_layer(void) { const struct landlock_layer layer = { .level = ~0, .access = ~0, }; BUILD_BUG_ON(layer.level < LANDLOCK_MAX_NUM_LAYERS); BUILD_BUG_ON(layer.access < LANDLOCK_MASK_ACCESS_FS); } /* @ruleset must be locked by the caller. */ int landlock_insert_rule(struct landlock_ruleset *const ruleset, const struct landlock_id id, const access_mask_t access) { struct landlock_layer layers[] = { { .access = access, /* When @level is zero, insert_rule() extends @ruleset. */ .level = 0, } }; build_check_layer(); return insert_rule(ruleset, id, &layers, ARRAY_SIZE(layers)); } static void get_hierarchy(struct landlock_hierarchy *const hierarchy) { if (hierarchy) refcount_inc(&hierarchy->usage); } static void put_hierarchy(struct landlock_hierarchy *hierarchy) { while (hierarchy && refcount_dec_and_test(&hierarchy->usage)) { const struct landlock_hierarchy *const freeme = hierarchy; hierarchy = hierarchy->parent; kfree(freeme); } } static int merge_tree(struct landlock_ruleset *const dst, struct landlock_ruleset *const src, const enum landlock_key_type key_type) { struct landlock_rule *walker_rule, *next_rule; struct rb_root *src_root; int err = 0; might_sleep(); lockdep_assert_held(&dst->lock); lockdep_assert_held(&src->lock); src_root = get_root(src, key_type); if (IS_ERR(src_root)) return PTR_ERR(src_root); /* Merges the @src tree. */ rbtree_postorder_for_each_entry_safe(walker_rule, next_rule, src_root, node) { struct landlock_layer layers[] = { { .level = dst->num_layers, } }; const struct landlock_id id = { .key = walker_rule->key, .type = key_type, }; if (WARN_ON_ONCE(walker_rule->num_layers != 1)) return -EINVAL; if (WARN_ON_ONCE(walker_rule->layers[0].level != 0)) return -EINVAL; layers[0].access = walker_rule->layers[0].access; err = insert_rule(dst, id, &layers, ARRAY_SIZE(layers)); if (err) return err; } return err; } static int merge_ruleset(struct landlock_ruleset *const dst, struct landlock_ruleset *const src) { int err = 0; might_sleep(); /* Should already be checked by landlock_merge_ruleset() */ if (WARN_ON_ONCE(!src)) return 0; /* Only merge into a domain. */ if (WARN_ON_ONCE(!dst || !dst->hierarchy)) return -EINVAL; /* Locks @dst first because we are its only owner. */ mutex_lock(&dst->lock); mutex_lock_nested(&src->lock, SINGLE_DEPTH_NESTING); /* Stacks the new layer. */ if (WARN_ON_ONCE(src->num_layers != 1 || dst->num_layers < 1)) { err = -EINVAL; goto out_unlock; } dst->access_masks[dst->num_layers - 1] = landlock_upgrade_handled_access_masks(src->access_masks[0]); /* Merges the @src inode tree. */ err = merge_tree(dst, src, LANDLOCK_KEY_INODE); if (err) goto out_unlock; #if IS_ENABLED(CONFIG_INET) /* Merges the @src network port tree. 
*/ err = merge_tree(dst, src, LANDLOCK_KEY_NET_PORT); if (err) goto out_unlock; #endif /* IS_ENABLED(CONFIG_INET) */ out_unlock: mutex_unlock(&src->lock); mutex_unlock(&dst->lock); return err; } static int inherit_tree(struct landlock_ruleset *const parent, struct landlock_ruleset *const child, const enum landlock_key_type key_type) { struct landlock_rule *walker_rule, *next_rule; struct rb_root *parent_root; int err = 0; might_sleep(); lockdep_assert_held(&parent->lock); lockdep_assert_held(&child->lock); parent_root = get_root(parent, key_type); if (IS_ERR(parent_root)) return PTR_ERR(parent_root); /* Copies the @parent inode or network tree. */ rbtree_postorder_for_each_entry_safe(walker_rule, next_rule, parent_root, node) { const struct landlock_id id = { .key = walker_rule->key, .type = key_type, }; err = insert_rule(child, id, &walker_rule->layers, walker_rule->num_layers); if (err) return err; } return err; } static int inherit_ruleset(struct landlock_ruleset *const parent, struct landlock_ruleset *const child) { int err = 0; might_sleep(); if (!parent) return 0; /* Locks @child first because we are its only owner. */ mutex_lock(&child->lock); mutex_lock_nested(&parent->lock, SINGLE_DEPTH_NESTING); /* Copies the @parent inode tree. */ err = inherit_tree(parent, child, LANDLOCK_KEY_INODE); if (err) goto out_unlock; #if IS_ENABLED(CONFIG_INET) /* Copies the @parent network port tree. */ err = inherit_tree(parent, child, LANDLOCK_KEY_NET_PORT); if (err) goto out_unlock; #endif /* IS_ENABLED(CONFIG_INET) */ if (WARN_ON_ONCE(child->num_layers <= parent->num_layers)) { err = -EINVAL; goto out_unlock; } /* Copies the parent layer stack and leaves a space for the new layer. */ memcpy(child->access_masks, parent->access_masks, flex_array_size(parent, access_masks, parent->num_layers)); if (WARN_ON_ONCE(!parent->hierarchy)) { err = -EINVAL; goto out_unlock; } get_hierarchy(parent->hierarchy); child->hierarchy->parent = parent->hierarchy; out_unlock: mutex_unlock(&parent->lock); mutex_unlock(&child->lock); return err; } static void free_ruleset(struct landlock_ruleset *const ruleset) { struct landlock_rule *freeme, *next; might_sleep(); rbtree_postorder_for_each_entry_safe(freeme, next, &ruleset->root_inode, node) free_rule(freeme, LANDLOCK_KEY_INODE); #if IS_ENABLED(CONFIG_INET) rbtree_postorder_for_each_entry_safe(freeme, next, &ruleset->root_net_port, node) free_rule(freeme, LANDLOCK_KEY_NET_PORT); #endif /* IS_ENABLED(CONFIG_INET) */ put_hierarchy(ruleset->hierarchy); kfree(ruleset); } void landlock_put_ruleset(struct landlock_ruleset *const ruleset) { might_sleep(); if (ruleset && refcount_dec_and_test(&ruleset->usage)) free_ruleset(ruleset); } static void free_ruleset_work(struct work_struct *const work) { struct landlock_ruleset *ruleset; ruleset = container_of(work, struct landlock_ruleset, work_free); free_ruleset(ruleset); } void landlock_put_ruleset_deferred(struct landlock_ruleset *const ruleset) { if (ruleset && refcount_dec_and_test(&ruleset->usage)) { INIT_WORK(&ruleset->work_free, free_ruleset_work); schedule_work(&ruleset->work_free); } } /** * landlock_merge_ruleset - Merge a ruleset with a domain * * @parent: Parent domain. * @ruleset: New ruleset to be merged. * * Returns the intersection of @parent and @ruleset, or returns @parent if * @ruleset is empty, or returns a duplicate of @ruleset if @parent is empty. 
*/ struct landlock_ruleset * landlock_merge_ruleset(struct landlock_ruleset *const parent, struct landlock_ruleset *const ruleset) { struct landlock_ruleset *new_dom __free(landlock_put_ruleset) = NULL; u32 num_layers; int err; might_sleep(); if (WARN_ON_ONCE(!ruleset || parent == ruleset)) return ERR_PTR(-EINVAL); if (parent) { if (parent->num_layers >= LANDLOCK_MAX_NUM_LAYERS) return ERR_PTR(-E2BIG); num_layers = parent->num_layers + 1; } else { num_layers = 1; } /* Creates a new domain... */ new_dom = create_ruleset(num_layers); if (IS_ERR(new_dom)) return new_dom; new_dom->hierarchy = kzalloc(sizeof(*new_dom->hierarchy), GFP_KERNEL_ACCOUNT); if (!new_dom->hierarchy) return ERR_PTR(-ENOMEM); refcount_set(&new_dom->hierarchy->usage, 1); /* ...as a child of @parent... */ err = inherit_ruleset(parent, new_dom); if (err) return ERR_PTR(err); /* ...and including @ruleset. */ err = merge_ruleset(new_dom, ruleset); if (err) return ERR_PTR(err); return no_free_ptr(new_dom); } /* * The returned access has the same lifetime as @ruleset. */ const struct landlock_rule * landlock_find_rule(const struct landlock_ruleset *const ruleset, const struct landlock_id id) { const struct rb_root *root; const struct rb_node *node; root = get_root((struct landlock_ruleset *)ruleset, id.type); if (IS_ERR(root)) return NULL; node = root->rb_node; while (node) { struct landlock_rule *this = rb_entry(node, struct landlock_rule, node); if (this->key.data == id.key.data) return this; if (this->key.data < id.key.data) node = node->rb_right; else node = node->rb_left; } return NULL; } /* * @layer_masks is read and may be updated according to the access request and * the matching rule. * @masks_array_size must be equal to ARRAY_SIZE(*layer_masks). * * Returns true if the request is allowed (i.e. relevant layer masks for the * request are empty). */ bool landlock_unmask_layers(const struct landlock_rule *const rule, const access_mask_t access_request, layer_mask_t (*const layer_masks)[], const size_t masks_array_size) { size_t layer_level; if (!access_request || !layer_masks) return true; if (!rule) return false; /* * An access is granted if, for each policy layer, at least one rule * encountered on the pathwalk grants the requested access, * regardless of its position in the layer stack. We must then check * the remaining layers for each inode, from the first added layer to * the last one. When there is multiple requested accesses, for each * policy layer, the full set of requested accesses may not be granted * by only one rule, but by the union (binary OR) of multiple rules. * E.g. /a/b <execute> + /a <read> => /a/b <execute + read> */ for (layer_level = 0; layer_level < rule->num_layers; layer_level++) { const struct landlock_layer *const layer = &rule->layers[layer_level]; const layer_mask_t layer_bit = BIT_ULL(layer->level - 1); const unsigned long access_req = access_request; unsigned long access_bit; bool is_empty; /* * Records in @layer_masks which layer grants access to each * requested access. 
*/ is_empty = true; for_each_set_bit(access_bit, &access_req, masks_array_size) { if (layer->access & BIT_ULL(access_bit)) (*layer_masks)[access_bit] &= ~layer_bit; is_empty = is_empty && !(*layer_masks)[access_bit]; } if (is_empty) return true; } return false; } typedef access_mask_t get_access_mask_t(const struct landlock_ruleset *const ruleset, const u16 layer_level); /** * landlock_init_layer_masks - Initialize layer masks from an access request * * Populates @layer_masks such that for each access right in @access_request, * the bits for all the layers are set where this access right is handled. * * @domain: The domain that defines the current restrictions. * @access_request: The requested access rights to check. * @layer_masks: It must contain %LANDLOCK_NUM_ACCESS_FS or * %LANDLOCK_NUM_ACCESS_NET elements according to @key_type. * @key_type: The key type to switch between access masks of different types. * * Returns: An access mask where each access right bit is set which is handled * in any of the active layers in @domain. */ access_mask_t landlock_init_layer_masks(const struct landlock_ruleset *const domain, const access_mask_t access_request, layer_mask_t (*const layer_masks)[], const enum landlock_key_type key_type) { access_mask_t handled_accesses = 0; size_t layer_level, num_access; get_access_mask_t *get_access_mask; switch (key_type) { case LANDLOCK_KEY_INODE: get_access_mask = landlock_get_fs_access_mask; num_access = LANDLOCK_NUM_ACCESS_FS; break; #if IS_ENABLED(CONFIG_INET) case LANDLOCK_KEY_NET_PORT: get_access_mask = landlock_get_net_access_mask; num_access = LANDLOCK_NUM_ACCESS_NET; break; #endif /* IS_ENABLED(CONFIG_INET) */ default: WARN_ON_ONCE(1); return 0; } memset(layer_masks, 0, array_size(sizeof((*layer_masks)[0]), num_access)); /* An empty access request can happen because of O_WRONLY | O_RDWR. */ if (!access_request) return 0; /* Saves all handled accesses per layer. */ for (layer_level = 0; layer_level < domain->num_layers; layer_level++) { const unsigned long access_req = access_request; const access_mask_t access_mask = get_access_mask(domain, layer_level); unsigned long access_bit; for_each_set_bit(access_bit, &access_req, num_access) { if (BIT_ULL(access_bit) & access_mask) { (*layer_masks)[access_bit] |= BIT_ULL(layer_level); handled_accesses |= BIT_ULL(access_bit); } } } return handled_accesses; } |
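The ruleset lifecycle above (create_ruleset(), landlock_insert_rule(), landlock_merge_ruleset()) is exercised from userspace through the Landlock syscalls. The following is a minimal, hedged sketch assuming a Landlock-enabled kernel and the uapi <linux/landlock.h> header; the choice of /usr and of the handled access bits is arbitrary, and error handling is abbreviated.

#include <fcntl.h>
#include <linux/landlock.h>
#include <stdio.h>
#include <sys/prctl.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
        struct landlock_ruleset_attr attr = {
                .handled_access_fs = LANDLOCK_ACCESS_FS_READ_FILE |
                                     LANDLOCK_ACCESS_FS_WRITE_FILE,
        };
        int ruleset_fd = syscall(SYS_landlock_create_ruleset, &attr,
                                 sizeof(attr), 0);

        if (ruleset_fd < 0) {
                perror("landlock_create_ruleset");
                return 1;
        }

        struct landlock_path_beneath_attr path = {
                .allowed_access = LANDLOCK_ACCESS_FS_READ_FILE,
                .parent_fd = open("/usr", O_PATH | O_CLOEXEC),
        };

        /* Each call ends up in landlock_insert_rule() for the ruleset above. */
        if (syscall(SYS_landlock_add_rule, ruleset_fd,
                    LANDLOCK_RULE_PATH_BENEATH, &path, 0))
                perror("landlock_add_rule");

        /* Enforcement merges the ruleset into the calling task's domain. */
        if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) ||
            syscall(SYS_landlock_restrict_self, ruleset_fd, 0))
                perror("landlock_restrict_self");
        return 0;
}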
/* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright (C) 2012,2013 - ARM Ltd * Author: Marc Zyngier <marc.zyngier@arm.com> * * Derived from arch/arm/kvm/coproc.h * Copyright (C) 2012 - Virtual Open Systems and Columbia University * Authors: Christoffer Dall <c.dall@virtualopensystems.com> */ #ifndef __ARM64_KVM_SYS_REGS_LOCAL_H__ #define __ARM64_KVM_SYS_REGS_LOCAL_H__ #include <linux/bsearch.h> #define reg_to_encoding(x) \ sys_reg((u32)(x)->Op0, (u32)(x)->Op1, \ (u32)(x)->CRn, (u32)(x)->CRm, (u32)(x)->Op2) struct sys_reg_params { u8 Op0; u8 Op1; u8 CRn; u8 CRm; u8 Op2; u64 regval; bool is_write; }; #define encoding_to_params(reg) \ ((struct sys_reg_params){ .Op0 = sys_reg_Op0(reg), \ .Op1 = sys_reg_Op1(reg), \ .CRn = sys_reg_CRn(reg), \ .CRm = sys_reg_CRm(reg), \ .Op2 = sys_reg_Op2(reg) }) #define esr_sys64_to_params(esr) \ ((struct sys_reg_params){ .Op0 = ((esr) >> 20) & 3, \ .Op1 = ((esr) >> 14) & 0x7, \ .CRn = ((esr) >> 10) & 0xf, \ .CRm = ((esr) >> 1) & 0xf, \ .Op2 = ((esr) >> 17) & 0x7, \ .is_write = !((esr) & 1) }) #define esr_cp1x_32_to_params(esr) \ ((struct sys_reg_params){ .Op1 = ((esr) >> 14) & 0x7, \ .CRn = ((esr) >> 10) & 0xf, \ .CRm = ((esr) >> 1) & 0xf, \ .Op2 = ((esr) >> 17) & 0x7, \ .is_write = !((esr) & 1) }) struct sys_reg_desc { /* Sysreg string for debug */ const char *name; enum { AA32_DIRECT, AA32_LO, AA32_HI, } aarch32_map; /* MRS/MSR instruction which accesses it. */ u8 Op0; u8 Op1; u8 CRn; u8 CRm; u8 Op2; /* Trapped access from guest, if non-NULL. */ bool (*access)(struct kvm_vcpu *, struct sys_reg_params *, const struct sys_reg_desc *); /* * Initialization for vcpu. Return initialized value, or KVM * sanitized value for ID registers. */ u64 (*reset)(struct kvm_vcpu *, const struct sys_reg_desc *); /* Index into sys_reg[], or 0 if we don't need to save it. */ int reg; /* Value (usually reset value), or write mask for idregs */ u64 val; /* Custom get/set_user functions, fallback to generic if NULL */ int (*get_user)(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, u64 *val); int (*set_user)(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, u64 val); /* Return mask of REG_* runtime visibility overrides */ unsigned int (*visibility)(const struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd); }; #define REG_HIDDEN (1 << 0) /* hidden from userspace and guest */ #define REG_RAZ (1 << 1) /* RAZ from userspace and guest */ #define REG_USER_WI (1 << 2) /* WI from userspace only */ static __printf(2, 3) inline void print_sys_reg_msg(const struct sys_reg_params *p, char *fmt, ...)
{ va_list va; va_start(va, fmt); /* Look, we even formatted it for you to paste into the table! */ kvm_pr_unimpl("%pV { Op0(%2u), Op1(%2u), CRn(%2u), CRm(%2u), Op2(%2u), func_%s },\n", &(struct va_format){ fmt, &va }, p->Op0, p->Op1, p->CRn, p->CRm, p->Op2, p->is_write ? "write" : "read"); va_end(va); } static inline void print_sys_reg_instr(const struct sys_reg_params *p) { /* GCC warns on an empty format string */ print_sys_reg_msg(p, "%s", ""); } static inline bool ignore_write(struct kvm_vcpu *vcpu, const struct sys_reg_params *p) { return true; } static inline bool read_zero(struct kvm_vcpu *vcpu, struct sys_reg_params *p) { p->regval = 0; return true; } /* Reset functions */ static inline u64 reset_unknown(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { BUG_ON(!r->reg); BUG_ON(r->reg >= NR_SYS_REGS); __vcpu_sys_reg(vcpu, r->reg) = 0x1de7ec7edbadc0deULL; return __vcpu_sys_reg(vcpu, r->reg); } static inline u64 reset_val(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { BUG_ON(!r->reg); BUG_ON(r->reg >= NR_SYS_REGS); __vcpu_sys_reg(vcpu, r->reg) = r->val; return __vcpu_sys_reg(vcpu, r->reg); } static inline unsigned int sysreg_visibility(const struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { if (likely(!r->visibility)) return 0; return r->visibility(vcpu, r); } static inline bool sysreg_hidden(const struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { return sysreg_visibility(vcpu, r) & REG_HIDDEN; } static inline bool sysreg_visible_as_raz(const struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { return sysreg_visibility(vcpu, r) & REG_RAZ; } static inline bool sysreg_user_write_ignore(const struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { return sysreg_visibility(vcpu, r) & REG_USER_WI; } static inline int cmp_sys_reg(const struct sys_reg_desc *i1, const struct sys_reg_desc *i2) { BUG_ON(i1 == i2); if (!i1) return 1; else if (!i2) return -1; if (i1->Op0 != i2->Op0) return i1->Op0 - i2->Op0; if (i1->Op1 != i2->Op1) return i1->Op1 - i2->Op1; if (i1->CRn != i2->CRn) return i1->CRn - i2->CRn; if (i1->CRm != i2->CRm) return i1->CRm - i2->CRm; return i1->Op2 - i2->Op2; } static inline int match_sys_reg(const void *key, const void *elt) { const unsigned long pval = (unsigned long)key; const struct sys_reg_desc *r = elt; return pval - reg_to_encoding(r); } static inline const struct sys_reg_desc * find_reg(const struct sys_reg_params *params, const struct sys_reg_desc table[], unsigned int num) { unsigned long pval = reg_to_encoding(params); return __inline_bsearch((void *)pval, table, num, sizeof(table[0]), match_sys_reg); } const struct sys_reg_desc *get_reg_by_id(u64 id, const struct sys_reg_desc table[], unsigned int num); int kvm_arm_sys_reg_get_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *); int kvm_arm_sys_reg_set_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *); int kvm_sys_reg_get_user(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg, const struct sys_reg_desc table[], unsigned int num); int kvm_sys_reg_set_user(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg, const struct sys_reg_desc table[], unsigned int num); bool triage_sysreg_trap(struct kvm_vcpu *vcpu, int *sr_index); int kvm_finalize_sys_regs(struct kvm_vcpu *vcpu); #define AA32(_x) .aarch32_map = AA32_##_x #define Op0(_x) .Op0 = _x #define Op1(_x) .Op1 = _x #define CRn(_x) .CRn = _x #define CRm(_x) .CRm = _x #define Op2(_x) .Op2 = _x #define SYS_DESC(reg) \ .name = #reg, \ Op0(sys_reg_Op0(reg)), Op1(sys_reg_Op1(reg)), \ CRn(sys_reg_CRn(reg)), CRm(sys_reg_CRm(reg)), \ 
Op2(sys_reg_Op2(reg)) #define CP15_SYS_DESC(reg) \ .name = #reg, \ .aarch32_map = AA32_DIRECT, \ Op0(0), Op1(sys_reg_Op1(reg)), \ CRn(sys_reg_CRn(reg)), CRm(sys_reg_CRm(reg)), \ Op2(sys_reg_Op2(reg)) #endif /* __ARM64_KVM_SYS_REGS_LOCAL_H__ */ |
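The header above resolves a trapped MSR/MRS access by packing (Op0, Op1, CRn, CRm, Op2) into a single encoding with reg_to_encoding() and binary-searching a sys_reg_desc table that is kept sorted by that key (find_reg() and match_sys_reg()). The userspace sketch below shows the same table-lookup idea; the toy_* names and the packing layout are invented stand-ins rather than the kernel's sys_reg() layout, and the sample register encodings are included only to illustrate a sorted table.

/*
 * Standalone illustration of the sorted-table lookup used by
 * reg_to_encoding()/match_sys_reg()/find_reg() above. The key packing
 * here is a toy layout, not the kernel's sys_reg() encoding.
 */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct toy_reg_desc {
	const char *name;
	uint8_t Op0, Op1, CRn, CRm, Op2;
};

/* Pack the five fields into one comparable key (toy layout). */
static unsigned long toy_encoding(uint8_t Op0, uint8_t Op1, uint8_t CRn,
				  uint8_t CRm, uint8_t Op2)
{
	return ((unsigned long)Op0 << 16) | (Op1 << 12) | (CRn << 8) |
	       (CRm << 4) | Op2;
}

/* Table must stay sorted by toy_encoding(), like the kernel's sysreg tables. */
static const struct toy_reg_desc toy_table[] = {
	{ "OSDTRRX_EL1", 2, 0, 0, 0, 2 },
	{ "MIDR_EL1",    3, 0, 0, 0, 0 },
	{ "SCTLR_EL1",   3, 0, 1, 0, 0 },
	{ "CNTKCTL_EL1", 3, 0, 14, 1, 0 },
};

/* Comparator in the spirit of match_sys_reg(): key vs. element encoding. */
static int toy_match(const void *key, const void *elt)
{
	unsigned long pval = (uintptr_t)key;
	const struct toy_reg_desc *r = elt;
	unsigned long rval = toy_encoding(r->Op0, r->Op1, r->CRn, r->CRm, r->Op2);

	return (pval > rval) - (pval < rval);
}

int main(void)
{
	/* Look up a "trapped" access to (3, 0, 1, 0, 0), i.e. SCTLR_EL1. */
	unsigned long key = toy_encoding(3, 0, 1, 0, 0);
	const struct toy_reg_desc *r =
		bsearch((void *)(uintptr_t)key, toy_table,
			sizeof(toy_table) / sizeof(toy_table[0]),
			sizeof(toy_table[0]), toy_match);

	printf("%s\n", r ? r->name : "<not found>");
	return 0;
}

The design point carried over from the header is that a descriptor is never searched field by field: the five fields collapse into one ordered key, so both exact lookup and table ordering (cmp_sys_reg()) reduce to integer comparison.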
// SPDX-License-Identifier: GPL-2.0-or-later /* auditfilter.c -- filtering of audit events * * Copyright 2003-2004 Red Hat, Inc. * Copyright 2005 Hewlett-Packard Development Company, L.P. * Copyright 2005 IBM Corporation */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kernel.h> #include <linux/audit.h> #include <linux/kthread.h> #include <linux/mutex.h> #include <linux/fs.h> #include <linux/namei.h> #include <linux/netlink.h> #include <linux/sched.h> #include <linux/slab.h> #include <linux/security.h> #include <net/net_namespace.h> #include <net/sock.h> #include "audit.h" /* * Locking model: * * audit_filter_mutex: * Synchronizes writes and blocking reads of audit's filterlist * data. Rcu is used to traverse the filterlist and access * contents of structs audit_entry, audit_watch and opaque * LSM rules during filtering.
If modified, these structures * must be copied and replace their counterparts in the filterlist. * An audit_parent struct is not accessed during filtering, so may * be written directly provided audit_filter_mutex is held. */ /* Audit filter lists, defined in <linux/audit.h> */ struct list_head audit_filter_list[AUDIT_NR_FILTERS] = { LIST_HEAD_INIT(audit_filter_list[0]), LIST_HEAD_INIT(audit_filter_list[1]), LIST_HEAD_INIT(audit_filter_list[2]), LIST_HEAD_INIT(audit_filter_list[3]), LIST_HEAD_INIT(audit_filter_list[4]), LIST_HEAD_INIT(audit_filter_list[5]), LIST_HEAD_INIT(audit_filter_list[6]), LIST_HEAD_INIT(audit_filter_list[7]), #if AUDIT_NR_FILTERS != 8 #error Fix audit_filter_list initialiser #endif }; static struct list_head audit_rules_list[AUDIT_NR_FILTERS] = { LIST_HEAD_INIT(audit_rules_list[0]), LIST_HEAD_INIT(audit_rules_list[1]), LIST_HEAD_INIT(audit_rules_list[2]), LIST_HEAD_INIT(audit_rules_list[3]), LIST_HEAD_INIT(audit_rules_list[4]), LIST_HEAD_INIT(audit_rules_list[5]), LIST_HEAD_INIT(audit_rules_list[6]), LIST_HEAD_INIT(audit_rules_list[7]), }; DEFINE_MUTEX(audit_filter_mutex); static void audit_free_lsm_field(struct audit_field *f) { switch (f->type) { case AUDIT_SUBJ_USER: case AUDIT_SUBJ_ROLE: case AUDIT_SUBJ_TYPE: case AUDIT_SUBJ_SEN: case AUDIT_SUBJ_CLR: case AUDIT_OBJ_USER: case AUDIT_OBJ_ROLE: case AUDIT_OBJ_TYPE: case AUDIT_OBJ_LEV_LOW: case AUDIT_OBJ_LEV_HIGH: kfree(f->lsm_str); security_audit_rule_free(f->lsm_rule); } } static inline void audit_free_rule(struct audit_entry *e) { int i; struct audit_krule *erule = &e->rule; /* some rules don't have associated watches */ if (erule->watch) audit_put_watch(erule->watch); if (erule->fields) for (i = 0; i < erule->field_count; i++) audit_free_lsm_field(&erule->fields[i]); kfree(erule->fields); kfree(erule->filterkey); kfree(e); } void audit_free_rule_rcu(struct rcu_head *head) { struct audit_entry *e = container_of(head, struct audit_entry, rcu); audit_free_rule(e); } /* Initialize an audit filterlist entry. */ static inline struct audit_entry *audit_init_entry(u32 field_count) { struct audit_entry *entry; struct audit_field *fields; entry = kzalloc(sizeof(*entry), GFP_KERNEL); if (unlikely(!entry)) return NULL; fields = kcalloc(field_count, sizeof(*fields), GFP_KERNEL); if (unlikely(!fields)) { kfree(entry); return NULL; } entry->rule.fields = fields; return entry; } /* Unpack a filter field's string representation from user-space * buffer. */ char *audit_unpack_string(void **bufp, size_t *remain, size_t len) { char *str; if (!*bufp || (len == 0) || (len > *remain)) return ERR_PTR(-EINVAL); /* Of the currently implemented string fields, PATH_MAX * defines the longest valid length. */ if (len > PATH_MAX) return ERR_PTR(-ENAMETOOLONG); str = kmalloc(len + 1, GFP_KERNEL); if (unlikely(!str)) return ERR_PTR(-ENOMEM); memcpy(str, *bufp, len); str[len] = 0; *bufp += len; *remain -= len; return str; } /* Translate an inode field to kernel representation. 
*/ static inline int audit_to_inode(struct audit_krule *krule, struct audit_field *f) { if ((krule->listnr != AUDIT_FILTER_EXIT && krule->listnr != AUDIT_FILTER_URING_EXIT) || krule->inode_f || krule->watch || krule->tree || (f->op != Audit_equal && f->op != Audit_not_equal)) return -EINVAL; krule->inode_f = f; return 0; } static __u32 *classes[AUDIT_SYSCALL_CLASSES]; int __init audit_register_class(int class, unsigned *list) { __u32 *p = kcalloc(AUDIT_BITMASK_SIZE, sizeof(__u32), GFP_KERNEL); if (!p) return -ENOMEM; while (*list != ~0U) { unsigned n = *list++; if (n >= AUDIT_BITMASK_SIZE * 32 - AUDIT_SYSCALL_CLASSES) { kfree(p); return -EINVAL; } p[AUDIT_WORD(n)] |= AUDIT_BIT(n); } if (class >= AUDIT_SYSCALL_CLASSES || classes[class]) { kfree(p); return -EINVAL; } classes[class] = p; return 0; } int audit_match_class(int class, unsigned syscall) { if (unlikely(syscall >= AUDIT_BITMASK_SIZE * 32)) return 0; if (unlikely(class >= AUDIT_SYSCALL_CLASSES || !classes[class])) return 0; return classes[class][AUDIT_WORD(syscall)] & AUDIT_BIT(syscall); } #ifdef CONFIG_AUDITSYSCALL static inline int audit_match_class_bits(int class, u32 *mask) { int i; if (classes[class]) { for (i = 0; i < AUDIT_BITMASK_SIZE; i++) if (mask[i] & classes[class][i]) return 0; } return 1; } static int audit_match_signal(struct audit_entry *entry) { struct audit_field *arch = entry->rule.arch_f; if (!arch) { /* When arch is unspecified, we must check both masks on biarch * as syscall number alone is ambiguous. */ return (audit_match_class_bits(AUDIT_CLASS_SIGNAL, entry->rule.mask) && audit_match_class_bits(AUDIT_CLASS_SIGNAL_32, entry->rule.mask)); } switch (audit_classify_arch(arch->val)) { case 0: /* native */ return (audit_match_class_bits(AUDIT_CLASS_SIGNAL, entry->rule.mask)); case 1: /* 32bit on biarch */ return (audit_match_class_bits(AUDIT_CLASS_SIGNAL_32, entry->rule.mask)); default: return 1; } } #endif /* Common user-space to kernel rule translation. 
*/ static inline struct audit_entry *audit_to_entry_common(struct audit_rule_data *rule) { unsigned listnr; struct audit_entry *entry; int i, err; err = -EINVAL; listnr = rule->flags & ~AUDIT_FILTER_PREPEND; switch (listnr) { default: goto exit_err; #ifdef CONFIG_AUDITSYSCALL case AUDIT_FILTER_ENTRY: pr_err("AUDIT_FILTER_ENTRY is deprecated\n"); goto exit_err; case AUDIT_FILTER_EXIT: case AUDIT_FILTER_URING_EXIT: case AUDIT_FILTER_TASK: #endif case AUDIT_FILTER_USER: case AUDIT_FILTER_EXCLUDE: case AUDIT_FILTER_FS: ; } if (unlikely(rule->action == AUDIT_POSSIBLE)) { pr_err("AUDIT_POSSIBLE is deprecated\n"); goto exit_err; } if (rule->action != AUDIT_NEVER && rule->action != AUDIT_ALWAYS) goto exit_err; if (rule->field_count > AUDIT_MAX_FIELDS) goto exit_err; err = -ENOMEM; entry = audit_init_entry(rule->field_count); if (!entry) goto exit_err; entry->rule.flags = rule->flags & AUDIT_FILTER_PREPEND; entry->rule.listnr = listnr; entry->rule.action = rule->action; entry->rule.field_count = rule->field_count; for (i = 0; i < AUDIT_BITMASK_SIZE; i++) entry->rule.mask[i] = rule->mask[i]; for (i = 0; i < AUDIT_SYSCALL_CLASSES; i++) { int bit = AUDIT_BITMASK_SIZE * 32 - i - 1; __u32 *p = &entry->rule.mask[AUDIT_WORD(bit)]; __u32 *class; if (!(*p & AUDIT_BIT(bit))) continue; *p &= ~AUDIT_BIT(bit); class = classes[i]; if (class) { int j; for (j = 0; j < AUDIT_BITMASK_SIZE; j++) entry->rule.mask[j] |= class[j]; } } return entry; exit_err: return ERR_PTR(err); } static u32 audit_ops[] = { [Audit_equal] = AUDIT_EQUAL, [Audit_not_equal] = AUDIT_NOT_EQUAL, [Audit_bitmask] = AUDIT_BIT_MASK, [Audit_bittest] = AUDIT_BIT_TEST, [Audit_lt] = AUDIT_LESS_THAN, [Audit_gt] = AUDIT_GREATER_THAN, [Audit_le] = AUDIT_LESS_THAN_OR_EQUAL, [Audit_ge] = AUDIT_GREATER_THAN_OR_EQUAL, }; static u32 audit_to_op(u32 op) { u32 n; for (n = Audit_equal; n < Audit_bad && audit_ops[n] != op; n++) ; return n; } /* check if an audit field is valid */ static int audit_field_valid(struct audit_entry *entry, struct audit_field *f) { switch (f->type) { case AUDIT_MSGTYPE: if (entry->rule.listnr != AUDIT_FILTER_EXCLUDE && entry->rule.listnr != AUDIT_FILTER_USER) return -EINVAL; break; case AUDIT_FSTYPE: if (entry->rule.listnr != AUDIT_FILTER_FS) return -EINVAL; break; case AUDIT_PERM: if (entry->rule.listnr == AUDIT_FILTER_URING_EXIT) return -EINVAL; break; } switch (entry->rule.listnr) { case AUDIT_FILTER_FS: switch (f->type) { case AUDIT_FSTYPE: case AUDIT_FILTERKEY: break; default: return -EINVAL; } } /* Check for valid field type and op */ switch (f->type) { case AUDIT_ARG0: case AUDIT_ARG1: case AUDIT_ARG2: case AUDIT_ARG3: case AUDIT_PERS: /* <uapi/linux/personality.h> */ case AUDIT_DEVMINOR: /* all ops are valid */ break; case AUDIT_UID: case AUDIT_EUID: case AUDIT_SUID: case AUDIT_FSUID: case AUDIT_LOGINUID: case AUDIT_OBJ_UID: case AUDIT_GID: case AUDIT_EGID: case AUDIT_SGID: case AUDIT_FSGID: case AUDIT_OBJ_GID: case AUDIT_PID: case AUDIT_MSGTYPE: case AUDIT_PPID: case AUDIT_DEVMAJOR: case AUDIT_EXIT: case AUDIT_SUCCESS: case AUDIT_INODE: case AUDIT_SESSIONID: case AUDIT_SUBJ_SEN: case AUDIT_SUBJ_CLR: case AUDIT_OBJ_LEV_LOW: case AUDIT_OBJ_LEV_HIGH: case AUDIT_SADDR_FAM: /* bit ops are only useful on syscall args */ if (f->op == Audit_bitmask || f->op == Audit_bittest) return -EINVAL; break; case AUDIT_SUBJ_USER: case AUDIT_SUBJ_ROLE: case AUDIT_SUBJ_TYPE: case AUDIT_OBJ_USER: case AUDIT_OBJ_ROLE: case AUDIT_OBJ_TYPE: case AUDIT_WATCH: case AUDIT_DIR: case AUDIT_FILTERKEY: case AUDIT_LOGINUID_SET: case AUDIT_ARCH: case 
AUDIT_FSTYPE: case AUDIT_PERM: case AUDIT_FILETYPE: case AUDIT_FIELD_COMPARE: case AUDIT_EXE: /* only equal and not equal valid ops */ if (f->op != Audit_not_equal && f->op != Audit_equal) return -EINVAL; break; default: /* field not recognized */ return -EINVAL; } /* Check for select valid field values */ switch (f->type) { case AUDIT_LOGINUID_SET: if ((f->val != 0) && (f->val != 1)) return -EINVAL; break; case AUDIT_PERM: if (f->val & ~15) return -EINVAL; break; case AUDIT_FILETYPE: if (f->val & ~S_IFMT) return -EINVAL; break; case AUDIT_FIELD_COMPARE: if (f->val > AUDIT_MAX_FIELD_COMPARE) return -EINVAL; break; case AUDIT_SADDR_FAM: if (f->val >= AF_MAX) return -EINVAL; break; default: break; } return 0; } /* Translate struct audit_rule_data to kernel's rule representation. */ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, size_t datasz) { int err = 0; struct audit_entry *entry; void *bufp; size_t remain = datasz - sizeof(struct audit_rule_data); int i; char *str; struct audit_fsnotify_mark *audit_mark; entry = audit_to_entry_common(data); if (IS_ERR(entry)) goto exit_nofree; bufp = data->buf; for (i = 0; i < data->field_count; i++) { struct audit_field *f = &entry->rule.fields[i]; u32 f_val; err = -EINVAL; f->op = audit_to_op(data->fieldflags[i]); if (f->op == Audit_bad) goto exit_free; f->type = data->fields[i]; f_val = data->values[i]; /* Support legacy tests for a valid loginuid */ if ((f->type == AUDIT_LOGINUID) && (f_val == AUDIT_UID_UNSET)) { f->type = AUDIT_LOGINUID_SET; f_val = 0; entry->rule.pflags |= AUDIT_LOGINUID_LEGACY; } err = audit_field_valid(entry, f); if (err) goto exit_free; err = -EINVAL; switch (f->type) { case AUDIT_LOGINUID: case AUDIT_UID: case AUDIT_EUID: case AUDIT_SUID: case AUDIT_FSUID: case AUDIT_OBJ_UID: f->uid = make_kuid(current_user_ns(), f_val); if (!uid_valid(f->uid)) goto exit_free; break; case AUDIT_GID: case AUDIT_EGID: case AUDIT_SGID: case AUDIT_FSGID: case AUDIT_OBJ_GID: f->gid = make_kgid(current_user_ns(), f_val); if (!gid_valid(f->gid)) goto exit_free; break; case AUDIT_ARCH: f->val = f_val; entry->rule.arch_f = f; break; case AUDIT_SUBJ_USER: case AUDIT_SUBJ_ROLE: case AUDIT_SUBJ_TYPE: case AUDIT_SUBJ_SEN: case AUDIT_SUBJ_CLR: case AUDIT_OBJ_USER: case AUDIT_OBJ_ROLE: case AUDIT_OBJ_TYPE: case AUDIT_OBJ_LEV_LOW: case AUDIT_OBJ_LEV_HIGH: str = audit_unpack_string(&bufp, &remain, f_val); if (IS_ERR(str)) { err = PTR_ERR(str); goto exit_free; } entry->rule.buflen += f_val; f->lsm_str = str; err = security_audit_rule_init(f->type, f->op, str, (void **)&f->lsm_rule, GFP_KERNEL); /* Keep currently invalid fields around in case they * become valid after a policy reload. 
*/ if (err == -EINVAL) { pr_warn("audit rule for LSM \'%s\' is invalid\n", str); err = 0; } else if (err) goto exit_free; break; case AUDIT_WATCH: str = audit_unpack_string(&bufp, &remain, f_val); if (IS_ERR(str)) { err = PTR_ERR(str); goto exit_free; } err = audit_to_watch(&entry->rule, str, f_val, f->op); if (err) { kfree(str); goto exit_free; } entry->rule.buflen += f_val; break; case AUDIT_DIR: str = audit_unpack_string(&bufp, &remain, f_val); if (IS_ERR(str)) { err = PTR_ERR(str); goto exit_free; } err = audit_make_tree(&entry->rule, str, f->op); kfree(str); if (err) goto exit_free; entry->rule.buflen += f_val; break; case AUDIT_INODE: f->val = f_val; err = audit_to_inode(&entry->rule, f); if (err) goto exit_free; break; case AUDIT_FILTERKEY: if (entry->rule.filterkey || f_val > AUDIT_MAX_KEY_LEN) goto exit_free; str = audit_unpack_string(&bufp, &remain, f_val); if (IS_ERR(str)) { err = PTR_ERR(str); goto exit_free; } entry->rule.buflen += f_val; entry->rule.filterkey = str; break; case AUDIT_EXE: if (entry->rule.exe || f_val > PATH_MAX) goto exit_free; str = audit_unpack_string(&bufp, &remain, f_val); if (IS_ERR(str)) { err = PTR_ERR(str); goto exit_free; } audit_mark = audit_alloc_mark(&entry->rule, str, f_val); if (IS_ERR(audit_mark)) { kfree(str); err = PTR_ERR(audit_mark); goto exit_free; } entry->rule.buflen += f_val; entry->rule.exe = audit_mark; break; default: f->val = f_val; break; } } if (entry->rule.inode_f && entry->rule.inode_f->op == Audit_not_equal) entry->rule.inode_f = NULL; exit_nofree: return entry; exit_free: if (entry->rule.tree) audit_put_tree(entry->rule.tree); /* that's the temporary one */ if (entry->rule.exe) audit_remove_mark(entry->rule.exe); /* that's the template one */ audit_free_rule(entry); return ERR_PTR(err); } /* Pack a filter field's string representation into data block. */ static inline size_t audit_pack_string(void **bufp, const char *str) { size_t len = strlen(str); memcpy(*bufp, str, len); *bufp += len; return len; } /* Translate kernel rule representation to struct audit_rule_data. 
*/ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule) { struct audit_rule_data *data; void *bufp; int i; data = kmalloc(struct_size(data, buf, krule->buflen), GFP_KERNEL); if (unlikely(!data)) return NULL; memset(data, 0, sizeof(*data)); data->flags = krule->flags | krule->listnr; data->action = krule->action; data->field_count = krule->field_count; bufp = data->buf; for (i = 0; i < data->field_count; i++) { struct audit_field *f = &krule->fields[i]; data->fields[i] = f->type; data->fieldflags[i] = audit_ops[f->op]; switch (f->type) { case AUDIT_SUBJ_USER: case AUDIT_SUBJ_ROLE: case AUDIT_SUBJ_TYPE: case AUDIT_SUBJ_SEN: case AUDIT_SUBJ_CLR: case AUDIT_OBJ_USER: case AUDIT_OBJ_ROLE: case AUDIT_OBJ_TYPE: case AUDIT_OBJ_LEV_LOW: case AUDIT_OBJ_LEV_HIGH: data->buflen += data->values[i] = audit_pack_string(&bufp, f->lsm_str); break; case AUDIT_WATCH: data->buflen += data->values[i] = audit_pack_string(&bufp, audit_watch_path(krule->watch)); break; case AUDIT_DIR: data->buflen += data->values[i] = audit_pack_string(&bufp, audit_tree_path(krule->tree)); break; case AUDIT_FILTERKEY: data->buflen += data->values[i] = audit_pack_string(&bufp, krule->filterkey); break; case AUDIT_EXE: data->buflen += data->values[i] = audit_pack_string(&bufp, audit_mark_path(krule->exe)); break; case AUDIT_LOGINUID_SET: if (krule->pflags & AUDIT_LOGINUID_LEGACY && !f->val) { data->fields[i] = AUDIT_LOGINUID; data->values[i] = AUDIT_UID_UNSET; break; } fallthrough; /* if set */ default: data->values[i] = f->val; } } for (i = 0; i < AUDIT_BITMASK_SIZE; i++) data->mask[i] = krule->mask[i]; return data; } /* Compare two rules in kernel format. Considered success if rules * don't match. */ static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b) { int i; if (a->flags != b->flags || a->pflags != b->pflags || a->listnr != b->listnr || a->action != b->action || a->field_count != b->field_count) return 1; for (i = 0; i < a->field_count; i++) { if (a->fields[i].type != b->fields[i].type || a->fields[i].op != b->fields[i].op) return 1; switch (a->fields[i].type) { case AUDIT_SUBJ_USER: case AUDIT_SUBJ_ROLE: case AUDIT_SUBJ_TYPE: case AUDIT_SUBJ_SEN: case AUDIT_SUBJ_CLR: case AUDIT_OBJ_USER: case AUDIT_OBJ_ROLE: case AUDIT_OBJ_TYPE: case AUDIT_OBJ_LEV_LOW: case AUDIT_OBJ_LEV_HIGH: if (strcmp(a->fields[i].lsm_str, b->fields[i].lsm_str)) return 1; break; case AUDIT_WATCH: if (strcmp(audit_watch_path(a->watch), audit_watch_path(b->watch))) return 1; break; case AUDIT_DIR: if (strcmp(audit_tree_path(a->tree), audit_tree_path(b->tree))) return 1; break; case AUDIT_FILTERKEY: /* both filterkeys exist based on above type compare */ if (strcmp(a->filterkey, b->filterkey)) return 1; break; case AUDIT_EXE: /* both paths exist based on above type compare */ if (strcmp(audit_mark_path(a->exe), audit_mark_path(b->exe))) return 1; break; case AUDIT_UID: case AUDIT_EUID: case AUDIT_SUID: case AUDIT_FSUID: case AUDIT_LOGINUID: case AUDIT_OBJ_UID: if (!uid_eq(a->fields[i].uid, b->fields[i].uid)) return 1; break; case AUDIT_GID: case AUDIT_EGID: case AUDIT_SGID: case AUDIT_FSGID: case AUDIT_OBJ_GID: if (!gid_eq(a->fields[i].gid, b->fields[i].gid)) return 1; break; default: if (a->fields[i].val != b->fields[i].val) return 1; } } for (i = 0; i < AUDIT_BITMASK_SIZE; i++) if (a->mask[i] != b->mask[i]) return 1; return 0; } /* Duplicate LSM field information. The lsm_rule is opaque, so must be * re-initialized. 
*/ static inline int audit_dupe_lsm_field(struct audit_field *df, struct audit_field *sf) { int ret; char *lsm_str; /* our own copy of lsm_str */ lsm_str = kstrdup(sf->lsm_str, GFP_KERNEL); if (unlikely(!lsm_str)) return -ENOMEM; df->lsm_str = lsm_str; /* our own (refreshed) copy of lsm_rule */ ret = security_audit_rule_init(df->type, df->op, df->lsm_str, (void **)&df->lsm_rule, GFP_KERNEL); /* Keep currently invalid fields around in case they * become valid after a policy reload. */ if (ret == -EINVAL) { pr_warn("audit rule for LSM \'%s\' is invalid\n", df->lsm_str); ret = 0; } return ret; } /* Duplicate an audit rule. This will be a deep copy with the exception * of the watch - that pointer is carried over. The LSM specific fields * will be updated in the copy. The point is to be able to replace the old * rule with the new rule in the filterlist, then free the old rule. * The rlist element is undefined; list manipulations are handled apart from * the initial copy. */ struct audit_entry *audit_dupe_rule(struct audit_krule *old) { u32 fcount = old->field_count; struct audit_entry *entry; struct audit_krule *new; char *fk; int i, err = 0; entry = audit_init_entry(fcount); if (unlikely(!entry)) return ERR_PTR(-ENOMEM); new = &entry->rule; new->flags = old->flags; new->pflags = old->pflags; new->listnr = old->listnr; new->action = old->action; for (i = 0; i < AUDIT_BITMASK_SIZE; i++) new->mask[i] = old->mask[i]; new->prio = old->prio; new->buflen = old->buflen; new->inode_f = old->inode_f; new->field_count = old->field_count; /* * note that we are OK with not refcounting here; audit_match_tree() * never dereferences tree and we can't get false positives there * since we'd have to have rule gone from the list *and* removed * before the chunks found by lookup had been allocated, i.e. before * the beginning of list scan. */ new->tree = old->tree; memcpy(new->fields, old->fields, sizeof(struct audit_field) * fcount); /* deep copy this information, updating the lsm_rule fields, because * the originals will all be freed when the old rule is freed. */ for (i = 0; i < fcount; i++) { switch (new->fields[i].type) { case AUDIT_SUBJ_USER: case AUDIT_SUBJ_ROLE: case AUDIT_SUBJ_TYPE: case AUDIT_SUBJ_SEN: case AUDIT_SUBJ_CLR: case AUDIT_OBJ_USER: case AUDIT_OBJ_ROLE: case AUDIT_OBJ_TYPE: case AUDIT_OBJ_LEV_LOW: case AUDIT_OBJ_LEV_HIGH: err = audit_dupe_lsm_field(&new->fields[i], &old->fields[i]); break; case AUDIT_FILTERKEY: fk = kstrdup(old->filterkey, GFP_KERNEL); if (unlikely(!fk)) err = -ENOMEM; else new->filterkey = fk; break; case AUDIT_EXE: err = audit_dupe_exe(new, old); break; } if (err) { if (new->exe) audit_remove_mark(new->exe); audit_free_rule(entry); return ERR_PTR(err); } } if (old->watch) { audit_get_watch(old->watch); new->watch = old->watch; } return entry; } /* Find an existing audit rule. * Caller must hold audit_filter_mutex to prevent stale rule data. 
*/ static struct audit_entry *audit_find_rule(struct audit_entry *entry, struct list_head **p) { struct audit_entry *e, *found = NULL; struct list_head *list; int h; if (entry->rule.inode_f) { h = audit_hash_ino(entry->rule.inode_f->val); *p = list = &audit_inode_hash[h]; } else if (entry->rule.watch) { /* we don't know the inode number, so must walk entire hash */ for (h = 0; h < AUDIT_INODE_BUCKETS; h++) { list = &audit_inode_hash[h]; list_for_each_entry(e, list, list) if (!audit_compare_rule(&entry->rule, &e->rule)) { found = e; goto out; } } goto out; } else { *p = list = &audit_filter_list[entry->rule.listnr]; } list_for_each_entry(e, list, list) if (!audit_compare_rule(&entry->rule, &e->rule)) { found = e; goto out; } out: return found; } static u64 prio_low = ~0ULL/2; static u64 prio_high = ~0ULL/2 - 1; /* Add rule to given filterlist if not a duplicate. */ static inline int audit_add_rule(struct audit_entry *entry) { struct audit_entry *e; struct audit_watch *watch = entry->rule.watch; struct audit_tree *tree = entry->rule.tree; struct list_head *list; int err = 0; #ifdef CONFIG_AUDITSYSCALL int dont_count = 0; /* If any of these, don't count towards total */ switch (entry->rule.listnr) { case AUDIT_FILTER_USER: case AUDIT_FILTER_EXCLUDE: case AUDIT_FILTER_FS: dont_count = 1; } #endif mutex_lock(&audit_filter_mutex); e = audit_find_rule(entry, &list); if (e) { mutex_unlock(&audit_filter_mutex); err = -EEXIST; /* normally audit_add_tree_rule() will free it on failure */ if (tree) audit_put_tree(tree); return err; } if (watch) { /* audit_filter_mutex is dropped and re-taken during this call */ err = audit_add_watch(&entry->rule, &list); if (err) { mutex_unlock(&audit_filter_mutex); /* * normally audit_add_tree_rule() will free it * on failure */ if (tree) audit_put_tree(tree); return err; } } if (tree) { err = audit_add_tree_rule(&entry->rule); if (err) { mutex_unlock(&audit_filter_mutex); return err; } } entry->rule.prio = ~0ULL; if (entry->rule.listnr == AUDIT_FILTER_EXIT || entry->rule.listnr == AUDIT_FILTER_URING_EXIT) { if (entry->rule.flags & AUDIT_FILTER_PREPEND) entry->rule.prio = ++prio_high; else entry->rule.prio = --prio_low; } if (entry->rule.flags & AUDIT_FILTER_PREPEND) { list_add(&entry->rule.list, &audit_rules_list[entry->rule.listnr]); list_add_rcu(&entry->list, list); entry->rule.flags &= ~AUDIT_FILTER_PREPEND; } else { list_add_tail(&entry->rule.list, &audit_rules_list[entry->rule.listnr]); list_add_tail_rcu(&entry->list, list); } #ifdef CONFIG_AUDITSYSCALL if (!dont_count) audit_n_rules++; if (!audit_match_signal(entry)) audit_signals++; #endif mutex_unlock(&audit_filter_mutex); return err; } /* Remove an existing rule from filterlist. 
*/ int audit_del_rule(struct audit_entry *entry) { struct audit_entry *e; struct audit_tree *tree = entry->rule.tree; struct list_head *list; int ret = 0; #ifdef CONFIG_AUDITSYSCALL int dont_count = 0; /* If any of these, don't count towards total */ switch (entry->rule.listnr) { case AUDIT_FILTER_USER: case AUDIT_FILTER_EXCLUDE: case AUDIT_FILTER_FS: dont_count = 1; } #endif mutex_lock(&audit_filter_mutex); e = audit_find_rule(entry, &list); if (!e) { ret = -ENOENT; goto out; } if (e->rule.watch) audit_remove_watch_rule(&e->rule); if (e->rule.tree) audit_remove_tree_rule(&e->rule); if (e->rule.exe) audit_remove_mark_rule(&e->rule); #ifdef CONFIG_AUDITSYSCALL if (!dont_count) audit_n_rules--; if (!audit_match_signal(entry)) audit_signals--; #endif list_del_rcu(&e->list); list_del(&e->rule.list); call_rcu(&e->rcu, audit_free_rule_rcu); out: mutex_unlock(&audit_filter_mutex); if (tree) audit_put_tree(tree); /* that's the temporary one */ return ret; } /* List rules using struct audit_rule_data. */ static void audit_list_rules(int seq, struct sk_buff_head *q) { struct sk_buff *skb; struct audit_krule *r; int i; /* This is a blocking read, so use audit_filter_mutex instead of rcu * iterator to sync with list writers. */ for (i = 0; i < AUDIT_NR_FILTERS; i++) { list_for_each_entry(r, &audit_rules_list[i], list) { struct audit_rule_data *data; data = audit_krule_to_data(r); if (unlikely(!data)) break; skb = audit_make_reply(seq, AUDIT_LIST_RULES, 0, 1, data, struct_size(data, buf, data->buflen)); if (skb) skb_queue_tail(q, skb); kfree(data); } } skb = audit_make_reply(seq, AUDIT_LIST_RULES, 1, 1, NULL, 0); if (skb) skb_queue_tail(q, skb); } /* Log rule additions and removals */ static void audit_log_rule_change(char *action, struct audit_krule *rule, int res) { struct audit_buffer *ab; if (!audit_enabled) return; ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_CONFIG_CHANGE); if (!ab) return; audit_log_session_info(ab); audit_log_task_context(ab); audit_log_format(ab, " op=%s", action); audit_log_key(ab, rule->filterkey); audit_log_format(ab, " list=%d res=%d", rule->listnr, res); audit_log_end(ab); } /** * audit_rule_change - apply all rules to the specified message type * @type: audit message type * @seq: netlink audit message sequence (serial) number * @data: payload data * @datasz: size of payload data */ int audit_rule_change(int type, int seq, void *data, size_t datasz) { int err = 0; struct audit_entry *entry; switch (type) { case AUDIT_ADD_RULE: entry = audit_data_to_entry(data, datasz); if (IS_ERR(entry)) return PTR_ERR(entry); err = audit_add_rule(entry); audit_log_rule_change("add_rule", &entry->rule, !err); break; case AUDIT_DEL_RULE: entry = audit_data_to_entry(data, datasz); if (IS_ERR(entry)) return PTR_ERR(entry); err = audit_del_rule(entry); audit_log_rule_change("remove_rule", &entry->rule, !err); break; default: WARN_ON(1); return -EINVAL; } if (err || type == AUDIT_DEL_RULE) { if (entry->rule.exe) audit_remove_mark(entry->rule.exe); audit_free_rule(entry); } return err; } /** * audit_list_rules_send - list the audit rules * @request_skb: skb of request we are replying to (used to target the reply) * @seq: netlink audit message sequence (serial) number */ int audit_list_rules_send(struct sk_buff *request_skb, int seq) { struct task_struct *tsk; struct audit_netlink_list *dest; /* We can't just spew out the rules here because we might fill * the available socket buffer space and deadlock waiting for * auditctl to read from it... 
which isn't ever going to * happen if we're actually running in the context of auditctl * trying to _send_ the stuff */ dest = kmalloc(sizeof(*dest), GFP_KERNEL); if (!dest) return -ENOMEM; dest->net = get_net(sock_net(NETLINK_CB(request_skb).sk)); dest->portid = NETLINK_CB(request_skb).portid; skb_queue_head_init(&dest->q); mutex_lock(&audit_filter_mutex); audit_list_rules(seq, &dest->q); mutex_unlock(&audit_filter_mutex); tsk = kthread_run(audit_send_list_thread, dest, "audit_send_list"); if (IS_ERR(tsk)) { skb_queue_purge(&dest->q); put_net(dest->net); kfree(dest); return PTR_ERR(tsk); } return 0; } int audit_comparator(u32 left, u32 op, u32 right) { switch (op) { case Audit_equal: return (left == right); case Audit_not_equal: return (left != right); case Audit_lt: return (left < right); case Audit_le: return (left <= right); case Audit_gt: return (left > right); case Audit_ge: return (left >= right); case Audit_bitmask: return (left & right); case Audit_bittest: return ((left & right) == right); default: return 0; } } int audit_uid_comparator(kuid_t left, u32 op, kuid_t right) { switch (op) { case Audit_equal: return uid_eq(left, right); case Audit_not_equal: return !uid_eq(left, right); case Audit_lt: return uid_lt(left, right); case Audit_le: return uid_lte(left, right); case Audit_gt: return uid_gt(left, right); case Audit_ge: return uid_gte(left, right); case Audit_bitmask: case Audit_bittest: default: return 0; } } int audit_gid_comparator(kgid_t left, u32 op, kgid_t right) { switch (op) { case Audit_equal: return gid_eq(left, right); case Audit_not_equal: return !gid_eq(left, right); case Audit_lt: return gid_lt(left, right); case Audit_le: return gid_lte(left, right); case Audit_gt: return gid_gt(left, right); case Audit_ge: return gid_gte(left, right); case Audit_bitmask: case Audit_bittest: default: return 0; } } /** * parent_len - find the length of the parent portion of a pathname * @path: pathname of which to determine length */ int parent_len(const char *path) { int plen; const char *p; plen = strlen(path); if (plen == 0) return plen; /* disregard trailing slashes */ p = path + plen - 1; while ((*p == '/') && (p > path)) p--; /* walk backward until we find the next slash or hit beginning */ while ((*p != '/') && (p > path)) p--; /* did we find a slash? Then increment to include it in path */ if (*p == '/') p++; return p - path; } /** * audit_compare_dname_path - compare given dentry name with last component in * given path. Return of 0 indicates a match. * @dname: dentry name that we're comparing * @path: full pathname that we're comparing * @parentlen: length of the parent if known. Passing in AUDIT_NAME_FULL * here indicates that we must compute this value. 
*/ int audit_compare_dname_path(const struct qstr *dname, const char *path, int parentlen) { int dlen, pathlen; const char *p; dlen = dname->len; pathlen = strlen(path); if (pathlen < dlen) return 1; if (parentlen == AUDIT_NAME_FULL) parentlen = parent_len(path); p = path + parentlen; /* handle trailing slashes */ pathlen -= parentlen; while (p[pathlen - 1] == '/') pathlen--; if (pathlen != dlen) return 1; return memcmp(p, dname->name, dlen); } int audit_filter(int msgtype, unsigned int listtype) { struct audit_entry *e; int ret = 1; /* Audit by default */ rcu_read_lock(); list_for_each_entry_rcu(e, &audit_filter_list[listtype], list) { int i, result = 0; for (i = 0; i < e->rule.field_count; i++) { struct audit_field *f = &e->rule.fields[i]; struct lsm_prop prop = { }; pid_t pid; switch (f->type) { case AUDIT_PID: pid = task_tgid_nr(current); result = audit_comparator(pid, f->op, f->val); break; case AUDIT_UID: result = audit_uid_comparator(current_uid(), f->op, f->uid); break; case AUDIT_GID: result = audit_gid_comparator(current_gid(), f->op, f->gid); break; case AUDIT_LOGINUID: result = audit_uid_comparator(audit_get_loginuid(current), f->op, f->uid); break; case AUDIT_LOGINUID_SET: result = audit_comparator(audit_loginuid_set(current), f->op, f->val); break; case AUDIT_MSGTYPE: result = audit_comparator(msgtype, f->op, f->val); break; case AUDIT_SUBJ_USER: case AUDIT_SUBJ_ROLE: case AUDIT_SUBJ_TYPE: case AUDIT_SUBJ_SEN: case AUDIT_SUBJ_CLR: if (f->lsm_rule) { security_current_getlsmprop_subj(&prop); result = security_audit_rule_match( &prop, f->type, f->op, f->lsm_rule); } break; case AUDIT_EXE: result = audit_exe_compare(current, e->rule.exe); if (f->op == Audit_not_equal) result = !result; break; default: goto unlock_and_return; } if (result < 0) /* error */ goto unlock_and_return; if (!result) break; } if (result > 0) { if (e->rule.action == AUDIT_NEVER || listtype == AUDIT_FILTER_EXCLUDE) ret = 0; break; } } unlock_and_return: rcu_read_unlock(); return ret; } static int update_lsm_rule(struct audit_krule *r) { struct audit_entry *entry = container_of(r, struct audit_entry, rule); struct audit_entry *nentry; int err = 0; if (!security_audit_rule_known(r)) return 0; nentry = audit_dupe_rule(r); if (entry->rule.exe) audit_remove_mark(entry->rule.exe); if (IS_ERR(nentry)) { /* save the first error encountered for the * return value */ err = PTR_ERR(nentry); audit_panic("error updating LSM filters"); if (r->watch) list_del(&r->rlist); list_del_rcu(&entry->list); list_del(&r->list); } else { if (r->watch || r->tree) list_replace_init(&r->rlist, &nentry->rule.rlist); list_replace_rcu(&entry->list, &nentry->list); list_replace(&r->list, &nentry->rule.list); } call_rcu(&entry->rcu, audit_free_rule_rcu); return err; } /* This function will re-initialize the lsm_rule field of all applicable rules. * It will traverse the filter lists serarching for rules that contain LSM * specific filter fields. When such a rule is found, it is copied, the * LSM field is re-initialized, and the old rule is replaced with the * updated rule. */ int audit_update_lsm_rules(void) { struct audit_krule *r, *n; int i, err = 0; /* audit_filter_mutex synchronizes the writers */ mutex_lock(&audit_filter_mutex); for (i = 0; i < AUDIT_NR_FILTERS; i++) { list_for_each_entry_safe(r, n, &audit_rules_list[i], list) { int res = update_lsm_rule(r); if (!err) err = res; } } mutex_unlock(&audit_filter_mutex); return err; } |
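Two of the helpers above are plain string walks: parent_len() returns how much of a pathname is the parent directory (ignoring trailing slashes), and audit_compare_dname_path() checks a dentry name against the final path component. The userspace sketch below mirrors that logic with invented toy_* names so it can be compiled and exercised outside the kernel; it is a simplified restatement of the functions shown above, not a drop-in replacement.

/*
 * Userspace illustration of parent_len() and audit_compare_dname_path()
 * from the listing above. toy_* names are invented for this example.
 */
#include <stdio.h>
#include <string.h>

static int toy_parent_len(const char *path)
{
	int plen = strlen(path);
	const char *p;

	if (plen == 0)
		return 0;
	/* disregard trailing slashes */
	p = path + plen - 1;
	while (*p == '/' && p > path)
		p--;
	/* walk backward to the slash that ends the parent portion */
	while (*p != '/' && p > path)
		p--;
	if (*p == '/')
		p++;
	return p - path;
}

/* Returns 0 when dname matches the last component of path. */
static int toy_compare_dname_path(const char *dname, const char *path)
{
	int dlen = strlen(dname);
	int pathlen = strlen(path);
	int parentlen = toy_parent_len(path);
	const char *p = path + parentlen;

	pathlen -= parentlen;
	while (pathlen > 0 && p[pathlen - 1] == '/')
		pathlen--;
	if (pathlen != dlen)
		return 1;
	return memcmp(p, dname, dlen);
}

int main(void)
{
	printf("parent_len(\"/etc/passwd\") = %d\n",
	       toy_parent_len("/etc/passwd"));           /* 5, i.e. "/etc/" */
	printf("parent_len(\"/etc/cron.d///\") = %d\n",
	       toy_parent_len("/etc/cron.d///"));         /* 5, i.e. "/etc/" */
	printf("match(\"passwd\", \"/etc/passwd\") = %d\n",
	       toy_compare_dname_path("passwd", "/etc/passwd"));  /* 0 */
	printf("match(\"shadow\", \"/etc/passwd\") = %d\n",
	       toy_compare_dname_path("shadow", "/etc/passwd"));  /* nonzero */
	return 0;
}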
2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 
3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 
4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2012,2013 - ARM Ltd
 * Author: Marc Zyngier <marc.zyngier@arm.com>
 *
 * Derived from arch/arm/kvm/coproc.c:
 * Copyright (C) 2012 - Virtual Open Systems and Columbia University
 * Authors: Rusty Russell <rusty@rustcorp.com.au>
 *          Christoffer Dall <c.dall@virtualopensystems.com>
 */

#include <linux/bitfield.h>
#include <linux/bsearch.h>
#include <linux/cacheinfo.h>
#include <linux/debugfs.h>
#include <linux/kvm_host.h>
#include <linux/mm.h>
#include <linux/printk.h>
#include <linux/uaccess.h>

#include <asm/arm_pmuv3.h>
#include <asm/cacheflush.h>
#include <asm/cputype.h>
#include <asm/debug-monitors.h>
#include <asm/esr.h>
#include <asm/kvm_arm.h>
#include <asm/kvm_emulate.h>
#include <asm/kvm_hyp.h>
#include <asm/kvm_mmu.h>
#include <asm/kvm_nested.h>
#include <asm/perf_event.h>
#include <asm/sysreg.h>

#include <trace/events/kvm.h>

#include "sys_regs.h"
#include "vgic/vgic.h"

#include "trace.h"

/*
 * For AArch32, we only take care of what is being trapped. Anything
 * that has to do with init and userspace access has to go via the
 * 64bit interface.
 */

static u64 sys_reg_to_index(const struct sys_reg_desc *reg);
static int set_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
                      u64 val);

static bool undef_access(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
                         const struct sys_reg_desc *r)
{
        kvm_inject_undefined(vcpu);
        return false;
}

static bool bad_trap(struct kvm_vcpu *vcpu,
                     struct sys_reg_params *params,
                     const struct sys_reg_desc *r,
                     const char *msg)
{
        WARN_ONCE(1, "Unexpected %s\n", msg);
        print_sys_reg_instr(params);
        return undef_access(vcpu, params, r);
}

static bool read_from_write_only(struct kvm_vcpu *vcpu,
                                 struct sys_reg_params *params,
                                 const struct sys_reg_desc *r)
{
        return bad_trap(vcpu, params, r,
                        "sys_reg read to write-only register");
}

static bool write_to_read_only(struct kvm_vcpu *vcpu,
                               struct sys_reg_params *params,
                               const struct sys_reg_desc *r)
{
        return bad_trap(vcpu, params, r,
                        "sys_reg write to read-only register");
}

#define PURE_EL2_SYSREG(el2)                                            \
        case el2: {                                                     \
                *el1r = el2;                                            \
                return true;                                            \
        }

#define MAPPED_EL2_SYSREG(el2, el1, fn)                                 \
        case el2: {                                                     \
                *xlate = fn;                                            \
                *el1r = el1;                                            \
                return true;                                            \
        }

static bool get_el2_to_el1_mapping(unsigned int reg,
                                   unsigned int *el1r, u64 (**xlate)(u64))
{
        switch (reg) {
                PURE_EL2_SYSREG( VPIDR_EL2 );
                PURE_EL2_SYSREG( VMPIDR_EL2 );
                PURE_EL2_SYSREG( ACTLR_EL2 );
                PURE_EL2_SYSREG( HCR_EL2 );
                PURE_EL2_SYSREG( MDCR_EL2 );
                PURE_EL2_SYSREG( HSTR_EL2 );
                PURE_EL2_SYSREG( HACR_EL2 );
                PURE_EL2_SYSREG( VTTBR_EL2 );
                PURE_EL2_SYSREG( VTCR_EL2 );
                PURE_EL2_SYSREG( RVBAR_EL2 );
                PURE_EL2_SYSREG( TPIDR_EL2 );
                PURE_EL2_SYSREG( HPFAR_EL2 );
                PURE_EL2_SYSREG( HCRX_EL2 );
                PURE_EL2_SYSREG( HFGRTR_EL2 );
                PURE_EL2_SYSREG( HFGWTR_EL2 );
                PURE_EL2_SYSREG( HFGITR_EL2 );
                PURE_EL2_SYSREG( HDFGRTR_EL2 );
                PURE_EL2_SYSREG( HDFGWTR_EL2 );
                PURE_EL2_SYSREG( HAFGRTR_EL2 );
                PURE_EL2_SYSREG( CNTVOFF_EL2 );
                PURE_EL2_SYSREG( CNTHCTL_EL2 );
                MAPPED_EL2_SYSREG(SCTLR_EL2, SCTLR_EL1,
                                  translate_sctlr_el2_to_sctlr_el1 );
                MAPPED_EL2_SYSREG(CPTR_EL2, CPACR_EL1,
                                  translate_cptr_el2_to_cpacr_el1 );
                MAPPED_EL2_SYSREG(TTBR0_EL2, TTBR0_EL1,
                                  translate_ttbr0_el2_to_ttbr0_el1 );
                MAPPED_EL2_SYSREG(TTBR1_EL2, TTBR1_EL1, NULL );
                MAPPED_EL2_SYSREG(TCR_EL2,
TCR_EL1, translate_tcr_el2_to_tcr_el1 ); MAPPED_EL2_SYSREG(VBAR_EL2, VBAR_EL1, NULL ); MAPPED_EL2_SYSREG(AFSR0_EL2, AFSR0_EL1, NULL ); MAPPED_EL2_SYSREG(AFSR1_EL2, AFSR1_EL1, NULL ); MAPPED_EL2_SYSREG(ESR_EL2, ESR_EL1, NULL ); MAPPED_EL2_SYSREG(FAR_EL2, FAR_EL1, NULL ); MAPPED_EL2_SYSREG(MAIR_EL2, MAIR_EL1, NULL ); MAPPED_EL2_SYSREG(TCR2_EL2, TCR2_EL1, NULL ); MAPPED_EL2_SYSREG(PIR_EL2, PIR_EL1, NULL ); MAPPED_EL2_SYSREG(PIRE0_EL2, PIRE0_EL1, NULL ); MAPPED_EL2_SYSREG(POR_EL2, POR_EL1, NULL ); MAPPED_EL2_SYSREG(AMAIR_EL2, AMAIR_EL1, NULL ); MAPPED_EL2_SYSREG(ELR_EL2, ELR_EL1, NULL ); MAPPED_EL2_SYSREG(SPSR_EL2, SPSR_EL1, NULL ); MAPPED_EL2_SYSREG(ZCR_EL2, ZCR_EL1, NULL ); MAPPED_EL2_SYSREG(CONTEXTIDR_EL2, CONTEXTIDR_EL1, NULL ); default: return false; } } u64 vcpu_read_sys_reg(const struct kvm_vcpu *vcpu, int reg) { u64 val = 0x8badf00d8badf00d; u64 (*xlate)(u64) = NULL; unsigned int el1r; if (!vcpu_get_flag(vcpu, SYSREGS_ON_CPU)) goto memory_read; if (unlikely(get_el2_to_el1_mapping(reg, &el1r, &xlate))) { if (!is_hyp_ctxt(vcpu)) goto memory_read; /* * CNTHCTL_EL2 requires some special treatment to * account for the bits that can be set via CNTKCTL_EL1. */ switch (reg) { case CNTHCTL_EL2: if (vcpu_el2_e2h_is_set(vcpu)) { val = read_sysreg_el1(SYS_CNTKCTL); val &= CNTKCTL_VALID_BITS; val |= __vcpu_sys_reg(vcpu, reg) & ~CNTKCTL_VALID_BITS; return val; } break; } /* * If this register does not have an EL1 counterpart, * then read the stored EL2 version. */ if (reg == el1r) goto memory_read; /* * If we have a non-VHE guest and that the sysreg * requires translation to be used at EL1, use the * in-memory copy instead. */ if (!vcpu_el2_e2h_is_set(vcpu) && xlate) goto memory_read; /* Get the current version of the EL1 counterpart. */ WARN_ON(!__vcpu_read_sys_reg_from_cpu(el1r, &val)); if (reg >= __SANITISED_REG_START__) val = kvm_vcpu_apply_reg_masks(vcpu, reg, val); return val; } /* EL1 register can't be on the CPU if the guest is in vEL2. */ if (unlikely(is_hyp_ctxt(vcpu))) goto memory_read; if (__vcpu_read_sys_reg_from_cpu(reg, &val)) return val; memory_read: return __vcpu_sys_reg(vcpu, reg); } void vcpu_write_sys_reg(struct kvm_vcpu *vcpu, u64 val, int reg) { u64 (*xlate)(u64) = NULL; unsigned int el1r; if (!vcpu_get_flag(vcpu, SYSREGS_ON_CPU)) goto memory_write; if (unlikely(get_el2_to_el1_mapping(reg, &el1r, &xlate))) { if (!is_hyp_ctxt(vcpu)) goto memory_write; /* * Always store a copy of the write to memory to avoid having * to reverse-translate virtual EL2 system registers for a * non-VHE guest hypervisor. */ __vcpu_sys_reg(vcpu, reg) = val; switch (reg) { case CNTHCTL_EL2: /* * If E2H=0, CNHTCTL_EL2 is a pure shadow register. * Otherwise, some of the bits are backed by * CNTKCTL_EL1, while the rest is kept in memory. * Yes, this is fun stuff. */ if (vcpu_el2_e2h_is_set(vcpu)) write_sysreg_el1(val, SYS_CNTKCTL); return; } /* No EL1 counterpart? We're done here.? */ if (reg == el1r) return; if (!vcpu_el2_e2h_is_set(vcpu) && xlate) val = xlate(val); /* Redirect this to the EL1 version of the register. */ WARN_ON(!__vcpu_write_sys_reg_to_cpu(val, el1r)); return; } /* EL1 register can't be on the CPU if the guest is in vEL2. */ if (unlikely(is_hyp_ctxt(vcpu))) goto memory_write; if (__vcpu_write_sys_reg_to_cpu(val, reg)) return; memory_write: __vcpu_sys_reg(vcpu, reg) = val; } /* CSSELR values; used to index KVM_REG_ARM_DEMUX_ID_CCSIDR */ #define CSSELR_MAX 14 /* * Returns the minimum line size for the selected cache, expressed as * Log2(bytes). 
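 *
 * Worked example (hypothetical value): a CTR_EL0 DminLine field of 4 means
 * the smallest data cache line is 2^4 = 16 words = 64 bytes, i.e.
 * Log2(bytes) = 4 + 2 = 6, which is what this helper returns.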
*/ static u8 get_min_cache_line_size(bool icache) { u64 ctr = read_sanitised_ftr_reg(SYS_CTR_EL0); u8 field; if (icache) field = SYS_FIELD_GET(CTR_EL0, IminLine, ctr); else field = SYS_FIELD_GET(CTR_EL0, DminLine, ctr); /* * Cache line size is represented as Log2(words) in CTR_EL0. * Log2(bytes) can be derived with the following: * * Log2(words) + 2 = Log2(bytes / 4) + 2 * = Log2(bytes) - 2 + 2 * = Log2(bytes) */ return field + 2; } /* Which cache CCSIDR represents depends on CSSELR value. */ static u32 get_ccsidr(struct kvm_vcpu *vcpu, u32 csselr) { u8 line_size; if (vcpu->arch.ccsidr) return vcpu->arch.ccsidr[csselr]; line_size = get_min_cache_line_size(csselr & CSSELR_EL1_InD); /* * Fabricate a CCSIDR value as the overriding value does not exist. * The real CCSIDR value will not be used as it can vary by the * physical CPU which the vcpu currently resides in. * * The line size is determined with get_min_cache_line_size(), which * should be valid for all CPUs even if they have different cache * configuration. * * The associativity bits are cleared, meaning the geometry of all data * and unified caches (which are guaranteed to be PIPT and thus * non-aliasing) are 1 set and 1 way. * Guests should not be doing cache operations by set/way at all, and * for this reason, we trap them and attempt to infer the intent, so * that we can flush the entire guest's address space at the appropriate * time. The exposed geometry minimizes the number of the traps. * [If guests should attempt to infer aliasing properties from the * geometry (which is not permitted by the architecture), they would * only do so for virtually indexed caches.] * * We don't check if the cache level exists as it is allowed to return * an UNKNOWN value if not. */ return SYS_FIELD_PREP(CCSIDR_EL1, LineSize, line_size - 4); } static int set_ccsidr(struct kvm_vcpu *vcpu, u32 csselr, u32 val) { u8 line_size = FIELD_GET(CCSIDR_EL1_LineSize, val) + 4; u32 *ccsidr = vcpu->arch.ccsidr; u32 i; if ((val & CCSIDR_EL1_RES0) || line_size < get_min_cache_line_size(csselr & CSSELR_EL1_InD)) return -EINVAL; if (!ccsidr) { if (val == get_ccsidr(vcpu, csselr)) return 0; ccsidr = kmalloc_array(CSSELR_MAX, sizeof(u32), GFP_KERNEL_ACCOUNT); if (!ccsidr) return -ENOMEM; for (i = 0; i < CSSELR_MAX; i++) ccsidr[i] = get_ccsidr(vcpu, i); vcpu->arch.ccsidr = ccsidr; } ccsidr[csselr] = val; return 0; } static bool access_rw(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { if (p->is_write) vcpu_write_sys_reg(vcpu, p->regval, r->reg); else p->regval = vcpu_read_sys_reg(vcpu, r->reg); return true; } /* * See note at ARMv7 ARM B1.14.4 (TL;DR: S/W ops are not easily virtualized). */ static bool access_dcsw(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { if (!p->is_write) return read_from_write_only(vcpu, p, r); /* * Only track S/W ops if we don't have FWB. It still indicates * that the guest is a bit broken (S/W operations should only * be done by firmware, knowing that there is only a single * CPU left in the system, and certainly not from non-secure * software). 
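 *
 * (With FEAT_S2FWB, stage-2 can force write-back cacheable memory
 * attributes, so KVM does not need to flush caches when the guest changes
 * its cacheability settings and can ignore these S/W ops entirely - hence
 * the capability check below.)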
*/ if (!cpus_have_final_cap(ARM64_HAS_STAGE2_FWB)) kvm_set_way_flush(vcpu); return true; } static bool access_dcgsw(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { if (!kvm_has_mte(vcpu->kvm)) return undef_access(vcpu, p, r); /* Treat MTE S/W ops as we treat the classic ones: with contempt */ return access_dcsw(vcpu, p, r); } static void get_access_mask(const struct sys_reg_desc *r, u64 *mask, u64 *shift) { switch (r->aarch32_map) { case AA32_LO: *mask = GENMASK_ULL(31, 0); *shift = 0; break; case AA32_HI: *mask = GENMASK_ULL(63, 32); *shift = 32; break; default: *mask = GENMASK_ULL(63, 0); *shift = 0; break; } } /* * Generic accessor for VM registers. Only called as long as HCR_TVM * is set. If the guest enables the MMU, we stop trapping the VM * sys_regs and leave it in complete control of the caches. */ static bool access_vm_reg(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { bool was_enabled = vcpu_has_cache_enabled(vcpu); u64 val, mask, shift; BUG_ON(!p->is_write); get_access_mask(r, &mask, &shift); if (~mask) { val = vcpu_read_sys_reg(vcpu, r->reg); val &= ~mask; } else { val = 0; } val |= (p->regval & (mask >> shift)) << shift; vcpu_write_sys_reg(vcpu, val, r->reg); kvm_toggle_cache(vcpu, was_enabled); return true; } static bool access_actlr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { u64 mask, shift; if (p->is_write) return ignore_write(vcpu, p); get_access_mask(r, &mask, &shift); p->regval = (vcpu_read_sys_reg(vcpu, r->reg) & mask) >> shift; return true; } /* * Trap handler for the GICv3 SGI generation system register. * Forward the request to the VGIC emulation. * The cp15_64 code makes sure this automatically works * for both AArch64 and AArch32 accesses. */ static bool access_gic_sgi(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { bool g1; if (!kvm_has_gicv3(vcpu->kvm)) return undef_access(vcpu, p, r); if (!p->is_write) return read_from_write_only(vcpu, p, r); /* * In a system where GICD_CTLR.DS=1, a ICC_SGI0R_EL1 access generates * Group0 SGIs only, while ICC_SGI1R_EL1 can generate either group, * depending on the SGI configuration. ICC_ASGI1R_EL1 is effectively * equivalent to ICC_SGI0R_EL1, as there is no "alternative" secure * group. */ if (p->Op0 == 0) { /* AArch32 */ switch (p->Op1) { default: /* Keep GCC quiet */ case 0: /* ICC_SGI1R */ g1 = true; break; case 1: /* ICC_ASGI1R */ case 2: /* ICC_SGI0R */ g1 = false; break; } } else { /* AArch64 */ switch (p->Op2) { default: /* Keep GCC quiet */ case 5: /* ICC_SGI1R_EL1 */ g1 = true; break; case 6: /* ICC_ASGI1R_EL1 */ case 7: /* ICC_SGI0R_EL1 */ g1 = false; break; } } vgic_v3_dispatch_sgi(vcpu, p->regval, g1); return true; } static bool access_gic_sre(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { if (!kvm_has_gicv3(vcpu->kvm)) return undef_access(vcpu, p, r); if (p->is_write) return ignore_write(vcpu, p); p->regval = vcpu->arch.vgic_cpu.vgic_v3.vgic_sre; return true; } static bool trap_raz_wi(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { if (p->is_write) return ignore_write(vcpu, p); else return read_zero(vcpu, p); } /* * ARMv8.1 mandates at least a trivial LORegion implementation, where all the * RW registers are RES0 (which we can implement as RAZ/WI). On an ARMv8.0 * system, these registers should UNDEF. LORID_EL1 being a RO register, we * treat it separately. 
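 * (Concretely: without FEAT_LOR every access UNDEFs; with it, LORID_EL1
 * reads as zero and writes to it are reported as writes to a read-only
 * register rather than being silently ignored.)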
*/ static bool trap_loregion(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { u32 sr = reg_to_encoding(r); if (!kvm_has_feat(vcpu->kvm, ID_AA64MMFR1_EL1, LO, IMP)) return undef_access(vcpu, p, r); if (p->is_write && sr == SYS_LORID_EL1) return write_to_read_only(vcpu, p, r); return trap_raz_wi(vcpu, p, r); } static bool trap_oslar_el1(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { if (!p->is_write) return read_from_write_only(vcpu, p, r); kvm_debug_handle_oslar(vcpu, p->regval); return true; } static bool trap_oslsr_el1(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { if (p->is_write) return write_to_read_only(vcpu, p, r); p->regval = __vcpu_sys_reg(vcpu, r->reg); return true; } static int set_oslsr_el1(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, u64 val) { /* * The only modifiable bit is the OSLK bit. Refuse the write if * userspace attempts to change any other bit in the register. */ if ((val ^ rd->val) & ~OSLSR_EL1_OSLK) return -EINVAL; __vcpu_sys_reg(vcpu, rd->reg) = val; return 0; } static bool trap_dbgauthstatus_el1(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { if (p->is_write) { return ignore_write(vcpu, p); } else { p->regval = read_sysreg(dbgauthstatus_el1); return true; } } static bool trap_debug_regs(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { access_rw(vcpu, p, r); kvm_debug_set_guest_ownership(vcpu); return true; } /* * reg_to_dbg/dbg_to_reg * * A 32 bit write to a debug register leave top bits alone * A 32 bit read from a debug register only returns the bottom bits */ static void reg_to_dbg(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *rd, u64 *dbg_reg) { u64 mask, shift, val; get_access_mask(rd, &mask, &shift); val = *dbg_reg; val &= ~mask; val |= (p->regval & (mask >> shift)) << shift; *dbg_reg = val; } static void dbg_to_reg(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *rd, u64 *dbg_reg) { u64 mask, shift; get_access_mask(rd, &mask, &shift); p->regval = (*dbg_reg & mask) >> shift; } static u64 *demux_wb_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd) { struct kvm_guest_debug_arch *dbg = &vcpu->arch.vcpu_debug_state; switch (rd->Op2) { case 0b100: return &dbg->dbg_bvr[rd->CRm]; case 0b101: return &dbg->dbg_bcr[rd->CRm]; case 0b110: return &dbg->dbg_wvr[rd->CRm]; case 0b111: return &dbg->dbg_wcr[rd->CRm]; default: KVM_BUG_ON(1, vcpu->kvm); return NULL; } } static bool trap_dbg_wb_reg(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *rd) { u64 *reg = demux_wb_reg(vcpu, rd); if (!reg) return false; if (p->is_write) reg_to_dbg(vcpu, p, rd, reg); else dbg_to_reg(vcpu, p, rd, reg); kvm_debug_set_guest_ownership(vcpu); return true; } static int set_dbg_wb_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, u64 val) { u64 *reg = demux_wb_reg(vcpu, rd); if (!reg) return -EINVAL; *reg = val; return 0; } static int get_dbg_wb_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, u64 *val) { u64 *reg = demux_wb_reg(vcpu, rd); if (!reg) return -EINVAL; *val = *reg; return 0; } static u64 reset_dbg_wb_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd) { u64 *reg = demux_wb_reg(vcpu, rd); /* * Bail early if we couldn't find storage for the register, the * KVM_BUG_ON() in demux_wb_reg() will prevent this VM from ever * being run. 
*/ if (!reg) return 0; *reg = rd->val; return rd->val; } static u64 reset_amair_el1(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { u64 amair = read_sysreg(amair_el1); vcpu_write_sys_reg(vcpu, amair, AMAIR_EL1); return amair; } static u64 reset_actlr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { u64 actlr = read_sysreg(actlr_el1); vcpu_write_sys_reg(vcpu, actlr, ACTLR_EL1); return actlr; } static u64 reset_mpidr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { u64 mpidr; /* * Map the vcpu_id into the first three affinity level fields of * the MPIDR. We limit the number of VCPUs in level 0 due to a * limitation to 16 CPUs in that level in the ICC_SGIxR registers * of the GICv3 to be able to address each CPU directly when * sending IPIs. */ mpidr = (vcpu->vcpu_id & 0x0f) << MPIDR_LEVEL_SHIFT(0); mpidr |= ((vcpu->vcpu_id >> 4) & 0xff) << MPIDR_LEVEL_SHIFT(1); mpidr |= ((vcpu->vcpu_id >> 12) & 0xff) << MPIDR_LEVEL_SHIFT(2); mpidr |= (1ULL << 31); vcpu_write_sys_reg(vcpu, mpidr, MPIDR_EL1); return mpidr; } static unsigned int pmu_visibility(const struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { if (kvm_vcpu_has_pmu(vcpu)) return 0; return REG_HIDDEN; } static u64 reset_pmu_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { u64 mask = BIT(ARMV8_PMU_CYCLE_IDX); u8 n = vcpu->kvm->arch.pmcr_n; if (n) mask |= GENMASK(n - 1, 0); reset_unknown(vcpu, r); __vcpu_sys_reg(vcpu, r->reg) &= mask; return __vcpu_sys_reg(vcpu, r->reg); } static u64 reset_pmevcntr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { reset_unknown(vcpu, r); __vcpu_sys_reg(vcpu, r->reg) &= GENMASK(31, 0); return __vcpu_sys_reg(vcpu, r->reg); } static u64 reset_pmevtyper(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { /* This thing will UNDEF, who cares about the reset value? */ if (!kvm_vcpu_has_pmu(vcpu)) return 0; reset_unknown(vcpu, r); __vcpu_sys_reg(vcpu, r->reg) &= kvm_pmu_evtyper_mask(vcpu->kvm); return __vcpu_sys_reg(vcpu, r->reg); } static u64 reset_pmselr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { reset_unknown(vcpu, r); __vcpu_sys_reg(vcpu, r->reg) &= PMSELR_EL0_SEL_MASK; return __vcpu_sys_reg(vcpu, r->reg); } static u64 reset_pmcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { u64 pmcr = 0; if (!kvm_supports_32bit_el0()) pmcr |= ARMV8_PMU_PMCR_LC; /* * The value of PMCR.N field is included when the * vCPU register is read via kvm_vcpu_read_pmcr(). 
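 * (PMCR_EL0.N is the number of implemented event counters; the per-VM
 * kvm->arch.pmcr_n value is folded in at read time instead of being
 * stored in the shadow register here.)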
*/ __vcpu_sys_reg(vcpu, r->reg) = pmcr; return __vcpu_sys_reg(vcpu, r->reg); } static bool check_pmu_access_disabled(struct kvm_vcpu *vcpu, u64 flags) { u64 reg = __vcpu_sys_reg(vcpu, PMUSERENR_EL0); bool enabled = (reg & flags) || vcpu_mode_priv(vcpu); if (!enabled) kvm_inject_undefined(vcpu); return !enabled; } static bool pmu_access_el0_disabled(struct kvm_vcpu *vcpu) { return check_pmu_access_disabled(vcpu, ARMV8_PMU_USERENR_EN); } static bool pmu_write_swinc_el0_disabled(struct kvm_vcpu *vcpu) { return check_pmu_access_disabled(vcpu, ARMV8_PMU_USERENR_SW | ARMV8_PMU_USERENR_EN); } static bool pmu_access_cycle_counter_el0_disabled(struct kvm_vcpu *vcpu) { return check_pmu_access_disabled(vcpu, ARMV8_PMU_USERENR_CR | ARMV8_PMU_USERENR_EN); } static bool pmu_access_event_counter_el0_disabled(struct kvm_vcpu *vcpu) { return check_pmu_access_disabled(vcpu, ARMV8_PMU_USERENR_ER | ARMV8_PMU_USERENR_EN); } static bool access_pmcr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { u64 val; if (pmu_access_el0_disabled(vcpu)) return false; if (p->is_write) { /* * Only update writeable bits of PMCR (continuing into * kvm_pmu_handle_pmcr() as well) */ val = kvm_vcpu_read_pmcr(vcpu); val &= ~ARMV8_PMU_PMCR_MASK; val |= p->regval & ARMV8_PMU_PMCR_MASK; if (!kvm_supports_32bit_el0()) val |= ARMV8_PMU_PMCR_LC; kvm_pmu_handle_pmcr(vcpu, val); } else { /* PMCR.P & PMCR.C are RAZ */ val = kvm_vcpu_read_pmcr(vcpu) & ~(ARMV8_PMU_PMCR_P | ARMV8_PMU_PMCR_C); p->regval = val; } return true; } static bool access_pmselr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { if (pmu_access_event_counter_el0_disabled(vcpu)) return false; if (p->is_write) __vcpu_sys_reg(vcpu, PMSELR_EL0) = p->regval; else /* return PMSELR.SEL field */ p->regval = __vcpu_sys_reg(vcpu, PMSELR_EL0) & PMSELR_EL0_SEL_MASK; return true; } static bool access_pmceid(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { u64 pmceid, mask, shift; BUG_ON(p->is_write); if (pmu_access_el0_disabled(vcpu)) return false; get_access_mask(r, &mask, &shift); pmceid = kvm_pmu_get_pmceid(vcpu, (p->Op2 & 1)); pmceid &= mask; pmceid >>= shift; p->regval = pmceid; return true; } static bool pmu_counter_idx_valid(struct kvm_vcpu *vcpu, u64 idx) { u64 pmcr, val; pmcr = kvm_vcpu_read_pmcr(vcpu); val = FIELD_GET(ARMV8_PMU_PMCR_N, pmcr); if (idx >= val && idx != ARMV8_PMU_CYCLE_IDX) { kvm_inject_undefined(vcpu); return false; } return true; } static int get_pmu_evcntr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, u64 *val) { u64 idx; if (r->CRn == 9 && r->CRm == 13 && r->Op2 == 0) /* PMCCNTR_EL0 */ idx = ARMV8_PMU_CYCLE_IDX; else /* PMEVCNTRn_EL0 */ idx = ((r->CRm & 3) << 3) | (r->Op2 & 7); *val = kvm_pmu_get_counter_value(vcpu, idx); return 0; } static bool access_pmu_evcntr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { u64 idx = ~0UL; if (r->CRn == 9 && r->CRm == 13) { if (r->Op2 == 2) { /* PMXEVCNTR_EL0 */ if (pmu_access_event_counter_el0_disabled(vcpu)) return false; idx = SYS_FIELD_GET(PMSELR_EL0, SEL, __vcpu_sys_reg(vcpu, PMSELR_EL0)); } else if (r->Op2 == 0) { /* PMCCNTR_EL0 */ if (pmu_access_cycle_counter_el0_disabled(vcpu)) return false; idx = ARMV8_PMU_CYCLE_IDX; } } else if (r->CRn == 0 && r->CRm == 9) { /* PMCCNTR */ if (pmu_access_event_counter_el0_disabled(vcpu)) return false; idx = ARMV8_PMU_CYCLE_IDX; } else if (r->CRn == 14 && (r->CRm & 12) == 8) { /* PMEVCNTRn_EL0 */ if (pmu_access_event_counter_el0_disabled(vcpu)) 
return false; idx = ((r->CRm & 3) << 3) | (r->Op2 & 7); } /* Catch any decoding mistake */ WARN_ON(idx == ~0UL); if (!pmu_counter_idx_valid(vcpu, idx)) return false; if (p->is_write) { if (pmu_access_el0_disabled(vcpu)) return false; kvm_pmu_set_counter_value(vcpu, idx, p->regval); } else { p->regval = kvm_pmu_get_counter_value(vcpu, idx); } return true; } static bool access_pmu_evtyper(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { u64 idx, reg; if (pmu_access_el0_disabled(vcpu)) return false; if (r->CRn == 9 && r->CRm == 13 && r->Op2 == 1) { /* PMXEVTYPER_EL0 */ idx = SYS_FIELD_GET(PMSELR_EL0, SEL, __vcpu_sys_reg(vcpu, PMSELR_EL0)); reg = PMEVTYPER0_EL0 + idx; } else if (r->CRn == 14 && (r->CRm & 12) == 12) { idx = ((r->CRm & 3) << 3) | (r->Op2 & 7); if (idx == ARMV8_PMU_CYCLE_IDX) reg = PMCCFILTR_EL0; else /* PMEVTYPERn_EL0 */ reg = PMEVTYPER0_EL0 + idx; } else { BUG(); } if (!pmu_counter_idx_valid(vcpu, idx)) return false; if (p->is_write) { kvm_pmu_set_counter_event_type(vcpu, p->regval, idx); kvm_vcpu_pmu_restore_guest(vcpu); } else { p->regval = __vcpu_sys_reg(vcpu, reg); } return true; } static int set_pmreg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, u64 val) { bool set; val &= kvm_pmu_accessible_counter_mask(vcpu); switch (r->reg) { case PMOVSSET_EL0: /* CRm[1] being set indicates a SET register, and CLR otherwise */ set = r->CRm & 2; break; default: /* Op2[0] being set indicates a SET register, and CLR otherwise */ set = r->Op2 & 1; break; } if (set) __vcpu_sys_reg(vcpu, r->reg) |= val; else __vcpu_sys_reg(vcpu, r->reg) &= ~val; return 0; } static int get_pmreg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, u64 *val) { u64 mask = kvm_pmu_accessible_counter_mask(vcpu); *val = __vcpu_sys_reg(vcpu, r->reg) & mask; return 0; } static bool access_pmcnten(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { u64 val, mask; if (pmu_access_el0_disabled(vcpu)) return false; mask = kvm_pmu_accessible_counter_mask(vcpu); if (p->is_write) { val = p->regval & mask; if (r->Op2 & 0x1) /* accessing PMCNTENSET_EL0 */ __vcpu_sys_reg(vcpu, PMCNTENSET_EL0) |= val; else /* accessing PMCNTENCLR_EL0 */ __vcpu_sys_reg(vcpu, PMCNTENSET_EL0) &= ~val; kvm_pmu_reprogram_counter_mask(vcpu, val); } else { p->regval = __vcpu_sys_reg(vcpu, PMCNTENSET_EL0); } return true; } static bool access_pminten(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { u64 mask = kvm_pmu_accessible_counter_mask(vcpu); if (check_pmu_access_disabled(vcpu, 0)) return false; if (p->is_write) { u64 val = p->regval & mask; if (r->Op2 & 0x1) /* accessing PMINTENSET_EL1 */ __vcpu_sys_reg(vcpu, PMINTENSET_EL1) |= val; else /* accessing PMINTENCLR_EL1 */ __vcpu_sys_reg(vcpu, PMINTENSET_EL1) &= ~val; } else { p->regval = __vcpu_sys_reg(vcpu, PMINTENSET_EL1); } return true; } static bool access_pmovs(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { u64 mask = kvm_pmu_accessible_counter_mask(vcpu); if (pmu_access_el0_disabled(vcpu)) return false; if (p->is_write) { if (r->CRm & 0x2) /* accessing PMOVSSET_EL0 */ __vcpu_sys_reg(vcpu, PMOVSSET_EL0) |= (p->regval & mask); else /* accessing PMOVSCLR_EL0 */ __vcpu_sys_reg(vcpu, PMOVSSET_EL0) &= ~(p->regval & mask); } else { p->regval = __vcpu_sys_reg(vcpu, PMOVSSET_EL0); } return true; } static bool access_pmswinc(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { u64 mask; if (!p->is_write) return read_from_write_only(vcpu, p, r); 
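        /*
         * PMSWINC_EL0 is write-only: each bit set in the written value
         * requests a software increment of the corresponding event counter,
         * so the value is masked to the counters this vCPU can access.
         */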
if (pmu_write_swinc_el0_disabled(vcpu)) return false; mask = kvm_pmu_accessible_counter_mask(vcpu); kvm_pmu_software_increment(vcpu, p->regval & mask); return true; } static bool access_pmuserenr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { if (p->is_write) { if (!vcpu_mode_priv(vcpu)) return undef_access(vcpu, p, r); __vcpu_sys_reg(vcpu, PMUSERENR_EL0) = p->regval & ARMV8_PMU_USERENR_MASK; } else { p->regval = __vcpu_sys_reg(vcpu, PMUSERENR_EL0) & ARMV8_PMU_USERENR_MASK; } return true; } static int get_pmcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, u64 *val) { *val = kvm_vcpu_read_pmcr(vcpu); return 0; } static int set_pmcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, u64 val) { u8 new_n = FIELD_GET(ARMV8_PMU_PMCR_N, val); struct kvm *kvm = vcpu->kvm; mutex_lock(&kvm->arch.config_lock); /* * The vCPU can't have more counters than the PMU hardware * implements. Ignore this error to maintain compatibility * with the existing KVM behavior. */ if (!kvm_vm_has_ran_once(kvm) && new_n <= kvm_arm_pmu_get_max_counters(kvm)) kvm->arch.pmcr_n = new_n; mutex_unlock(&kvm->arch.config_lock); /* * Ignore writes to RES0 bits, read only bits that are cleared on * vCPU reset, and writable bits that KVM doesn't support yet. * (i.e. only PMCR.N and bits [7:0] are mutable from userspace) * The LP bit is RES0 when FEAT_PMUv3p5 is not supported on the vCPU. * But, we leave the bit as it is here, as the vCPU's PMUver might * be changed later (NOTE: the bit will be cleared on first vCPU run * if necessary). */ val &= ARMV8_PMU_PMCR_MASK; /* The LC bit is RES1 when AArch32 is not supported */ if (!kvm_supports_32bit_el0()) val |= ARMV8_PMU_PMCR_LC; __vcpu_sys_reg(vcpu, r->reg) = val; return 0; } /* Silly macro to expand the DBG{BCR,BVR,WVR,WCR}n_EL1 registers in one go */ #define DBG_BCR_BVR_WCR_WVR_EL1(n) \ { SYS_DESC(SYS_DBGBVRn_EL1(n)), \ trap_dbg_wb_reg, reset_dbg_wb_reg, 0, 0, \ get_dbg_wb_reg, set_dbg_wb_reg }, \ { SYS_DESC(SYS_DBGBCRn_EL1(n)), \ trap_dbg_wb_reg, reset_dbg_wb_reg, 0, 0, \ get_dbg_wb_reg, set_dbg_wb_reg }, \ { SYS_DESC(SYS_DBGWVRn_EL1(n)), \ trap_dbg_wb_reg, reset_dbg_wb_reg, 0, 0, \ get_dbg_wb_reg, set_dbg_wb_reg }, \ { SYS_DESC(SYS_DBGWCRn_EL1(n)), \ trap_dbg_wb_reg, reset_dbg_wb_reg, 0, 0, \ get_dbg_wb_reg, set_dbg_wb_reg } #define PMU_SYS_REG(name) \ SYS_DESC(SYS_##name), .reset = reset_pmu_reg, \ .visibility = pmu_visibility /* Macro to expand the PMEVCNTRn_EL0 register */ #define PMU_PMEVCNTR_EL0(n) \ { PMU_SYS_REG(PMEVCNTRn_EL0(n)), \ .reset = reset_pmevcntr, .get_user = get_pmu_evcntr, \ .access = access_pmu_evcntr, .reg = (PMEVCNTR0_EL0 + n), } /* Macro to expand the PMEVTYPERn_EL0 register */ #define PMU_PMEVTYPER_EL0(n) \ { PMU_SYS_REG(PMEVTYPERn_EL0(n)), \ .reset = reset_pmevtyper, \ .access = access_pmu_evtyper, .reg = (PMEVTYPER0_EL0 + n), } /* Macro to expand the AMU counter and type registers*/ #define AMU_AMEVCNTR0_EL0(n) { SYS_DESC(SYS_AMEVCNTR0_EL0(n)), undef_access } #define AMU_AMEVTYPER0_EL0(n) { SYS_DESC(SYS_AMEVTYPER0_EL0(n)), undef_access } #define AMU_AMEVCNTR1_EL0(n) { SYS_DESC(SYS_AMEVCNTR1_EL0(n)), undef_access } #define AMU_AMEVTYPER1_EL0(n) { SYS_DESC(SYS_AMEVTYPER1_EL0(n)), undef_access } static unsigned int ptrauth_visibility(const struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd) { return vcpu_has_ptrauth(vcpu) ? 0 : REG_HIDDEN; } /* * If we land here on a PtrAuth access, that is because we didn't * fixup the access on exit by allowing the PtrAuth sysregs. 
The only * way this happens is when the guest does not have PtrAuth support * enabled. */ #define __PTRAUTH_KEY(k) \ { SYS_DESC(SYS_## k), undef_access, reset_unknown, k, \ .visibility = ptrauth_visibility} #define PTRAUTH_KEY(k) \ __PTRAUTH_KEY(k ## KEYLO_EL1), \ __PTRAUTH_KEY(k ## KEYHI_EL1) static bool access_arch_timer(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { enum kvm_arch_timers tmr; enum kvm_arch_timer_regs treg; u64 reg = reg_to_encoding(r); switch (reg) { case SYS_CNTP_TVAL_EL0: if (is_hyp_ctxt(vcpu) && vcpu_el2_e2h_is_set(vcpu)) tmr = TIMER_HPTIMER; else tmr = TIMER_PTIMER; treg = TIMER_REG_TVAL; break; case SYS_CNTV_TVAL_EL0: if (is_hyp_ctxt(vcpu) && vcpu_el2_e2h_is_set(vcpu)) tmr = TIMER_HVTIMER; else tmr = TIMER_VTIMER; treg = TIMER_REG_TVAL; break; case SYS_AARCH32_CNTP_TVAL: case SYS_CNTP_TVAL_EL02: tmr = TIMER_PTIMER; treg = TIMER_REG_TVAL; break; case SYS_CNTV_TVAL_EL02: tmr = TIMER_VTIMER; treg = TIMER_REG_TVAL; break; case SYS_CNTHP_TVAL_EL2: tmr = TIMER_HPTIMER; treg = TIMER_REG_TVAL; break; case SYS_CNTHV_TVAL_EL2: tmr = TIMER_HVTIMER; treg = TIMER_REG_TVAL; break; case SYS_CNTP_CTL_EL0: if (is_hyp_ctxt(vcpu) && vcpu_el2_e2h_is_set(vcpu)) tmr = TIMER_HPTIMER; else tmr = TIMER_PTIMER; treg = TIMER_REG_CTL; break; case SYS_CNTV_CTL_EL0: if (is_hyp_ctxt(vcpu) && vcpu_el2_e2h_is_set(vcpu)) tmr = TIMER_HVTIMER; else tmr = TIMER_VTIMER; treg = TIMER_REG_CTL; break; case SYS_AARCH32_CNTP_CTL: case SYS_CNTP_CTL_EL02: tmr = TIMER_PTIMER; treg = TIMER_REG_CTL; break; case SYS_CNTV_CTL_EL02: tmr = TIMER_VTIMER; treg = TIMER_REG_CTL; break; case SYS_CNTHP_CTL_EL2: tmr = TIMER_HPTIMER; treg = TIMER_REG_CTL; break; case SYS_CNTHV_CTL_EL2: tmr = TIMER_HVTIMER; treg = TIMER_REG_CTL; break; case SYS_CNTP_CVAL_EL0: if (is_hyp_ctxt(vcpu) && vcpu_el2_e2h_is_set(vcpu)) tmr = TIMER_HPTIMER; else tmr = TIMER_PTIMER; treg = TIMER_REG_CVAL; break; case SYS_CNTV_CVAL_EL0: if (is_hyp_ctxt(vcpu) && vcpu_el2_e2h_is_set(vcpu)) tmr = TIMER_HVTIMER; else tmr = TIMER_VTIMER; treg = TIMER_REG_CVAL; break; case SYS_AARCH32_CNTP_CVAL: case SYS_CNTP_CVAL_EL02: tmr = TIMER_PTIMER; treg = TIMER_REG_CVAL; break; case SYS_CNTV_CVAL_EL02: tmr = TIMER_VTIMER; treg = TIMER_REG_CVAL; break; case SYS_CNTHP_CVAL_EL2: tmr = TIMER_HPTIMER; treg = TIMER_REG_CVAL; break; case SYS_CNTHV_CVAL_EL2: tmr = TIMER_HVTIMER; treg = TIMER_REG_CVAL; break; case SYS_CNTPCT_EL0: case SYS_CNTPCTSS_EL0: if (is_hyp_ctxt(vcpu)) tmr = TIMER_HPTIMER; else tmr = TIMER_PTIMER; treg = TIMER_REG_CNT; break; case SYS_AARCH32_CNTPCT: case SYS_AARCH32_CNTPCTSS: tmr = TIMER_PTIMER; treg = TIMER_REG_CNT; break; case SYS_CNTVCT_EL0: case SYS_CNTVCTSS_EL0: if (is_hyp_ctxt(vcpu)) tmr = TIMER_HVTIMER; else tmr = TIMER_VTIMER; treg = TIMER_REG_CNT; break; case SYS_AARCH32_CNTVCT: case SYS_AARCH32_CNTVCTSS: tmr = TIMER_VTIMER; treg = TIMER_REG_CNT; break; default: print_sys_reg_msg(p, "%s", "Unhandled trapped timer register"); return undef_access(vcpu, p, r); } if (p->is_write) kvm_arm_timer_write_sysreg(vcpu, tmr, treg, p->regval); else p->regval = kvm_arm_timer_read_sysreg(vcpu, tmr, treg); return true; } static bool access_hv_timer(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { if (!vcpu_el2_e2h_is_set(vcpu)) return undef_access(vcpu, p, r); return access_arch_timer(vcpu, p, r); } static s64 kvm_arm64_ftr_safe_value(u32 id, const struct arm64_ftr_bits *ftrp, s64 new, s64 cur) { struct arm64_ftr_bits kvm_ftr = *ftrp; /* Some features have different safe value type in KVM than 
host features */ switch (id) { case SYS_ID_AA64DFR0_EL1: switch (kvm_ftr.shift) { case ID_AA64DFR0_EL1_PMUVer_SHIFT: kvm_ftr.type = FTR_LOWER_SAFE; break; case ID_AA64DFR0_EL1_DebugVer_SHIFT: kvm_ftr.type = FTR_LOWER_SAFE; break; } break; case SYS_ID_DFR0_EL1: if (kvm_ftr.shift == ID_DFR0_EL1_PerfMon_SHIFT) kvm_ftr.type = FTR_LOWER_SAFE; break; } return arm64_ftr_safe_value(&kvm_ftr, new, cur); } /* * arm64_check_features() - Check if a feature register value constitutes * a subset of features indicated by the idreg's KVM sanitised limit. * * This function will check if each feature field of @val is the "safe" value * against idreg's KVM sanitised limit return from reset() callback. * If a field value in @val is the same as the one in limit, it is always * considered the safe value regardless For register fields that are not in * writable, only the value in limit is considered the safe value. * * Return: 0 if all the fields are safe. Otherwise, return negative errno. */ static int arm64_check_features(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, u64 val) { const struct arm64_ftr_reg *ftr_reg; const struct arm64_ftr_bits *ftrp = NULL; u32 id = reg_to_encoding(rd); u64 writable_mask = rd->val; u64 limit = rd->reset(vcpu, rd); u64 mask = 0; /* * Hidden and unallocated ID registers may not have a corresponding * struct arm64_ftr_reg. Of course, if the register is RAZ we know the * only safe value is 0. */ if (sysreg_visible_as_raz(vcpu, rd)) return val ? -E2BIG : 0; ftr_reg = get_arm64_ftr_reg(id); if (!ftr_reg) return -EINVAL; ftrp = ftr_reg->ftr_bits; for (; ftrp && ftrp->width; ftrp++) { s64 f_val, f_lim, safe_val; u64 ftr_mask; ftr_mask = arm64_ftr_mask(ftrp); if ((ftr_mask & writable_mask) != ftr_mask) continue; f_val = arm64_ftr_value(ftrp, val); f_lim = arm64_ftr_value(ftrp, limit); mask |= ftr_mask; if (f_val == f_lim) safe_val = f_val; else safe_val = kvm_arm64_ftr_safe_value(id, ftrp, f_val, f_lim); if (safe_val != f_val) return -E2BIG; } /* For fields that are not writable, values in limit are the safe values. */ if ((val & ~mask) != (limit & ~mask)) return -E2BIG; return 0; } static u8 pmuver_to_perfmon(u8 pmuver) { switch (pmuver) { case ID_AA64DFR0_EL1_PMUVer_IMP: return ID_DFR0_EL1_PerfMon_PMUv3; case ID_AA64DFR0_EL1_PMUVer_IMP_DEF: return ID_DFR0_EL1_PerfMon_IMPDEF; default: /* Anything ARMv8.1+ and NI have the same value. For now. 
*/ return pmuver; } } static u64 sanitise_id_aa64pfr0_el1(const struct kvm_vcpu *vcpu, u64 val); static u64 sanitise_id_aa64dfr0_el1(const struct kvm_vcpu *vcpu, u64 val); /* Read a sanitised cpufeature ID register by sys_reg_desc */ static u64 __kvm_read_sanitised_id_reg(const struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { u32 id = reg_to_encoding(r); u64 val; if (sysreg_visible_as_raz(vcpu, r)) return 0; val = read_sanitised_ftr_reg(id); switch (id) { case SYS_ID_AA64DFR0_EL1: val = sanitise_id_aa64dfr0_el1(vcpu, val); break; case SYS_ID_AA64PFR0_EL1: val = sanitise_id_aa64pfr0_el1(vcpu, val); break; case SYS_ID_AA64PFR1_EL1: if (!kvm_has_mte(vcpu->kvm)) val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_MTE); val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_SME); val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_RNDR_trap); val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_NMI); val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_MTE_frac); val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_GCS); val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_THE); val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_MTEX); val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_DF2); val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_PFAR); val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_MPAM_frac); break; case SYS_ID_AA64PFR2_EL1: /* We only expose FPMR */ val &= ID_AA64PFR2_EL1_FPMR; break; case SYS_ID_AA64ISAR1_EL1: if (!vcpu_has_ptrauth(vcpu)) val &= ~(ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_APA) | ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_API) | ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_GPA) | ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_GPI)); break; case SYS_ID_AA64ISAR2_EL1: if (!vcpu_has_ptrauth(vcpu)) val &= ~(ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_APA3) | ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_GPA3)); if (!cpus_have_final_cap(ARM64_HAS_WFXT) || has_broken_cntvoff()) val &= ~ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_WFxT); break; case SYS_ID_AA64ISAR3_EL1: val &= ID_AA64ISAR3_EL1_FPRCVT | ID_AA64ISAR3_EL1_FAMINMAX; break; case SYS_ID_AA64MMFR2_EL1: val &= ~ID_AA64MMFR2_EL1_CCIDX_MASK; break; case SYS_ID_AA64MMFR3_EL1: val &= ID_AA64MMFR3_EL1_TCRX | ID_AA64MMFR3_EL1_S1POE | ID_AA64MMFR3_EL1_S1PIE; break; case SYS_ID_MMFR4_EL1: val &= ~ARM64_FEATURE_MASK(ID_MMFR4_EL1_CCIDX); break; } return val; } static u64 kvm_read_sanitised_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { return __kvm_read_sanitised_id_reg(vcpu, r); } static u64 read_id_reg(const struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { return kvm_read_vm_id_reg(vcpu->kvm, reg_to_encoding(r)); } static bool is_feature_id_reg(u32 encoding) { return (sys_reg_Op0(encoding) == 3 && (sys_reg_Op1(encoding) < 2 || sys_reg_Op1(encoding) == 3) && sys_reg_CRn(encoding) == 0 && sys_reg_CRm(encoding) <= 7); } /* * Return true if the register's (Op0, Op1, CRn, CRm, Op2) is * (3, 0, 0, crm, op2), where 1<=crm<8, 0<=op2<8, which is the range of ID * registers KVM maintains on a per-VM basis. 
*/ static inline bool is_vm_ftr_id_reg(u32 id) { if (id == SYS_CTR_EL0) return true; return (sys_reg_Op0(id) == 3 && sys_reg_Op1(id) == 0 && sys_reg_CRn(id) == 0 && sys_reg_CRm(id) >= 1 && sys_reg_CRm(id) < 8); } static inline bool is_vcpu_ftr_id_reg(u32 id) { return is_feature_id_reg(id) && !is_vm_ftr_id_reg(id); } static inline bool is_aa32_id_reg(u32 id) { return (sys_reg_Op0(id) == 3 && sys_reg_Op1(id) == 0 && sys_reg_CRn(id) == 0 && sys_reg_CRm(id) >= 1 && sys_reg_CRm(id) <= 3); } static unsigned int id_visibility(const struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { u32 id = reg_to_encoding(r); switch (id) { case SYS_ID_AA64ZFR0_EL1: if (!vcpu_has_sve(vcpu)) return REG_RAZ; break; } return 0; } static unsigned int aa32_id_visibility(const struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { /* * AArch32 ID registers are UNKNOWN if AArch32 isn't implemented at any * EL. Promote to RAZ/WI in order to guarantee consistency between * systems. */ if (!kvm_supports_32bit_el0()) return REG_RAZ | REG_USER_WI; return id_visibility(vcpu, r); } static unsigned int raz_visibility(const struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { return REG_RAZ; } /* cpufeature ID register access trap handlers */ static bool access_id_reg(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { if (p->is_write) return write_to_read_only(vcpu, p, r); p->regval = read_id_reg(vcpu, r); return true; } /* Visibility overrides for SVE-specific control registers */ static unsigned int sve_visibility(const struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd) { if (vcpu_has_sve(vcpu)) return 0; return REG_HIDDEN; } static unsigned int sme_visibility(const struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd) { if (kvm_has_feat(vcpu->kvm, ID_AA64PFR1_EL1, SME, IMP)) return 0; return REG_HIDDEN; } static unsigned int fp8_visibility(const struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd) { if (kvm_has_fpmr(vcpu->kvm)) return 0; return REG_HIDDEN; } static u64 sanitise_id_aa64pfr0_el1(const struct kvm_vcpu *vcpu, u64 val) { if (!vcpu_has_sve(vcpu)) val &= ~ID_AA64PFR0_EL1_SVE_MASK; /* * The default is to expose CSV2 == 1 if the HW isn't affected. * Although this is a per-CPU feature, we make it global because * asymmetric systems are just a nuisance. * * Userspace can override this as long as it doesn't promise * the impossible. */ if (arm64_get_spectre_v2_state() == SPECTRE_UNAFFECTED) { val &= ~ID_AA64PFR0_EL1_CSV2_MASK; val |= SYS_FIELD_PREP_ENUM(ID_AA64PFR0_EL1, CSV2, IMP); } if (arm64_get_meltdown_state() == SPECTRE_UNAFFECTED) { val &= ~ID_AA64PFR0_EL1_CSV3_MASK; val |= SYS_FIELD_PREP_ENUM(ID_AA64PFR0_EL1, CSV3, IMP); } if (kvm_vgic_global_state.type == VGIC_V3) { val &= ~ID_AA64PFR0_EL1_GIC_MASK; val |= SYS_FIELD_PREP_ENUM(ID_AA64PFR0_EL1, GIC, IMP); } val &= ~ID_AA64PFR0_EL1_AMU_MASK; /* * MPAM is disabled by default as KVM also needs a set of PARTID to * program the MPAMVPMx_EL2 PARTID remapping registers with. But some * older kernels let the guest see the ID bit. 
*/ val &= ~ID_AA64PFR0_EL1_MPAM_MASK; return val; } #define ID_REG_LIMIT_FIELD_ENUM(val, reg, field, limit) \ ({ \ u64 __f_val = FIELD_GET(reg##_##field##_MASK, val); \ (val) &= ~reg##_##field##_MASK; \ (val) |= FIELD_PREP(reg##_##field##_MASK, \ min(__f_val, \ (u64)SYS_FIELD_VALUE(reg, field, limit))); \ (val); \ }) static u64 sanitise_id_aa64dfr0_el1(const struct kvm_vcpu *vcpu, u64 val) { val = ID_REG_LIMIT_FIELD_ENUM(val, ID_AA64DFR0_EL1, DebugVer, V8P8); /* * Only initialize the PMU version if the vCPU was configured with one. */ val &= ~ID_AA64DFR0_EL1_PMUVer_MASK; if (kvm_vcpu_has_pmu(vcpu)) val |= SYS_FIELD_PREP(ID_AA64DFR0_EL1, PMUVer, kvm_arm_pmu_get_pmuver_limit()); /* Hide SPE from guests */ val &= ~ID_AA64DFR0_EL1_PMSVer_MASK; /* Hide BRBE from guests */ val &= ~ID_AA64DFR0_EL1_BRBE_MASK; return val; } static int set_id_aa64dfr0_el1(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, u64 val) { u8 debugver = SYS_FIELD_GET(ID_AA64DFR0_EL1, DebugVer, val); u8 pmuver = SYS_FIELD_GET(ID_AA64DFR0_EL1, PMUVer, val); /* * Prior to commit 3d0dba5764b9 ("KVM: arm64: PMU: Move the * ID_AA64DFR0_EL1.PMUver limit to VM creation"), KVM erroneously * exposed an IMP_DEF PMU to userspace and the guest on systems w/ * non-architectural PMUs. Of course, PMUv3 is the only game in town for * PMU virtualization, so the IMP_DEF value was rather user-hostile. * * At minimum, we're on the hook to allow values that were given to * userspace by KVM. Cover our tracks here and replace the IMP_DEF value * with a more sensible NI. The value of an ID register changing under * the nose of the guest is unfortunate, but is certainly no more * surprising than an ill-guided PMU driver poking at impdef system * registers that end in an UNDEF... */ if (pmuver == ID_AA64DFR0_EL1_PMUVer_IMP_DEF) val &= ~ID_AA64DFR0_EL1_PMUVer_MASK; /* * ID_AA64DFR0_EL1.DebugVer is one of those awkward fields with a * nonzero minimum safe value. */ if (debugver < ID_AA64DFR0_EL1_DebugVer_IMP) return -EINVAL; return set_id_reg(vcpu, rd, val); } static u64 read_sanitised_id_dfr0_el1(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd) { u8 perfmon = pmuver_to_perfmon(kvm_arm_pmu_get_pmuver_limit()); u64 val = read_sanitised_ftr_reg(SYS_ID_DFR0_EL1); val &= ~ID_DFR0_EL1_PerfMon_MASK; if (kvm_vcpu_has_pmu(vcpu)) val |= SYS_FIELD_PREP(ID_DFR0_EL1, PerfMon, perfmon); val = ID_REG_LIMIT_FIELD_ENUM(val, ID_DFR0_EL1, CopDbg, Debugv8p8); return val; } static int set_id_dfr0_el1(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, u64 val) { u8 perfmon = SYS_FIELD_GET(ID_DFR0_EL1, PerfMon, val); u8 copdbg = SYS_FIELD_GET(ID_DFR0_EL1, CopDbg, val); if (perfmon == ID_DFR0_EL1_PerfMon_IMPDEF) { val &= ~ID_DFR0_EL1_PerfMon_MASK; perfmon = 0; } /* * Allow DFR0_EL1.PerfMon to be set from userspace as long as * it doesn't promise more than what the HW gives us on the * AArch64 side (as everything is emulated with that), and * that this is a PMUv3. */ if (perfmon != 0 && perfmon < ID_DFR0_EL1_PerfMon_PMUv3) return -EINVAL; if (copdbg < ID_DFR0_EL1_CopDbg_Armv8) return -EINVAL; return set_id_reg(vcpu, rd, val); } static int set_id_aa64pfr0_el1(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, u64 user_val) { u64 hw_val = read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1); u64 mpam_mask = ID_AA64PFR0_EL1_MPAM_MASK; /* * Commit 011e5f5bf529f ("arm64/cpufeature: Add remaining feature bits * in ID_AA64PFR0 register") exposed the MPAM field of AA64PFR0_EL1 to * guests, but didn't add trap handling. 
 * KVM doesn't support MPAM and always returns an UNDEF for these
 * registers. The guest must see 0 for this field.
 *
 * But KVM must also accept values from userspace that were provided
 * by KVM. On CPUs that support MPAM, permit userspace to write
 * the sanitised value to ID_AA64PFR0_EL1.MPAM, but ignore this field.
 */
	if ((hw_val & mpam_mask) == (user_val & mpam_mask))
		user_val &= ~ID_AA64PFR0_EL1_MPAM_MASK;

	return set_id_reg(vcpu, rd, user_val);
}

static int set_id_aa64pfr1_el1(struct kvm_vcpu *vcpu,
			       const struct sys_reg_desc *rd, u64 user_val)
{
	u64 hw_val = read_sanitised_ftr_reg(SYS_ID_AA64PFR1_EL1);
	u64 mpam_mask = ID_AA64PFR1_EL1_MPAM_frac_MASK;

	/* See set_id_aa64pfr0_el1 for comment about MPAM */
	if ((hw_val & mpam_mask) == (user_val & mpam_mask))
		user_val &= ~ID_AA64PFR1_EL1_MPAM_frac_MASK;

	return set_id_reg(vcpu, rd, user_val);
}

static int set_ctr_el0(struct kvm_vcpu *vcpu,
		       const struct sys_reg_desc *rd, u64 user_val)
{
	u8 user_L1Ip = SYS_FIELD_GET(CTR_EL0, L1Ip, user_val);

	/*
	 * Both AIVIVT (0b01) and VPIPT (0b00) are documented as reserved.
	 * Hence only allow L1Ip to be set to VIPT (0b10) or PIPT (0b11),
	 * based on what the hardware reports.
	 *
	 * Using a VIPT software model on PIPT will lead to over-invalidation,
	 * but is still correct. Hence, we can allow downgrading PIPT to VIPT,
	 * but not the other way around. This is handled via arm64_ftr_safe_value()
	 * as the CTR_EL0 ftr_bits describe L1Ip as FTR_EXACT with VIPT as the
	 * safe value.
	 */
	switch (user_L1Ip) {
	case CTR_EL0_L1Ip_RESERVED_VPIPT:
	case CTR_EL0_L1Ip_RESERVED_AIVIVT:
		return -EINVAL;
	case CTR_EL0_L1Ip_VIPT:
	case CTR_EL0_L1Ip_PIPT:
		return set_id_reg(vcpu, rd, user_val);
	default:
		return -ENOENT;
	}
}

/*
 * cpufeature ID register user accessors
 *
 * For now, these registers are immutable for userspace, so no values
 * are stored, and for set_id_reg() we don't allow the effective value
 * to be changed.
 */
static int get_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
		      u64 *val)
{
	/*
	 * Avoid locking if the VM has already started, as the ID registers are
	 * guaranteed to be invariant at that point.
	 */
	if (kvm_vm_has_ran_once(vcpu->kvm)) {
		*val = read_id_reg(vcpu, rd);
		return 0;
	}

	mutex_lock(&vcpu->kvm->arch.config_lock);
	*val = read_id_reg(vcpu, rd);
	mutex_unlock(&vcpu->kvm->arch.config_lock);

	return 0;
}

static int set_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
		      u64 val)
{
	u32 id = reg_to_encoding(rd);
	int ret;

	mutex_lock(&vcpu->kvm->arch.config_lock);

	/*
	 * Once the VM has started the ID registers are immutable. Reject any
	 * write that does not match the final register value.
	 */
	if (kvm_vm_has_ran_once(vcpu->kvm)) {
		if (val != read_id_reg(vcpu, rd))
			ret = -EBUSY;
		else
			ret = 0;

		mutex_unlock(&vcpu->kvm->arch.config_lock);
		return ret;
	}

	ret = arm64_check_features(vcpu, rd, val);
	if (!ret)
		kvm_set_vm_id_reg(vcpu->kvm, id, val);

	mutex_unlock(&vcpu->kvm->arch.config_lock);

	/*
	 * arm64_check_features() returns -E2BIG to indicate the register's
	 * feature set is a superset of the maximally-allowed register value.
	 * While it would be nice to precisely describe this to userspace, the
	 * existing UAPI for KVM_SET_ONE_REG has it that invalid register
	 * writes return -EINVAL.
*/ if (ret == -E2BIG) ret = -EINVAL; return ret; } void kvm_set_vm_id_reg(struct kvm *kvm, u32 reg, u64 val) { u64 *p = __vm_id_reg(&kvm->arch, reg); lockdep_assert_held(&kvm->arch.config_lock); if (KVM_BUG_ON(kvm_vm_has_ran_once(kvm) || !p, kvm)) return; *p = val; } static int get_raz_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, u64 *val) { *val = 0; return 0; } static int set_wi_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, u64 val) { return 0; } static bool access_ctr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { if (p->is_write) return write_to_read_only(vcpu, p, r); p->regval = kvm_read_vm_id_reg(vcpu->kvm, SYS_CTR_EL0); return true; } static bool access_clidr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { if (p->is_write) return write_to_read_only(vcpu, p, r); p->regval = __vcpu_sys_reg(vcpu, r->reg); return true; } /* * Fabricate a CLIDR_EL1 value instead of using the real value, which can vary * by the physical CPU which the vcpu currently resides in. */ static u64 reset_clidr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { u64 ctr_el0 = read_sanitised_ftr_reg(SYS_CTR_EL0); u64 clidr; u8 loc; if ((ctr_el0 & CTR_EL0_IDC)) { /* * Data cache clean to the PoU is not required so LoUU and LoUIS * will not be set and a unified cache, which will be marked as * LoC, will be added. * * If not DIC, let the unified cache L2 so that an instruction * cache can be added as L1 later. */ loc = (ctr_el0 & CTR_EL0_DIC) ? 1 : 2; clidr = CACHE_TYPE_UNIFIED << CLIDR_CTYPE_SHIFT(loc); } else { /* * Data cache clean to the PoU is required so let L1 have a data * cache and mark it as LoUU and LoUIS. As L1 has a data cache, * it can be marked as LoC too. */ loc = 1; clidr = 1 << CLIDR_LOUU_SHIFT; clidr |= 1 << CLIDR_LOUIS_SHIFT; clidr |= CACHE_TYPE_DATA << CLIDR_CTYPE_SHIFT(1); } /* * Instruction cache invalidation to the PoU is required so let L1 have * an instruction cache. If L1 already has a data cache, it will be * CACHE_TYPE_SEPARATE. */ if (!(ctr_el0 & CTR_EL0_DIC)) clidr |= CACHE_TYPE_INST << CLIDR_CTYPE_SHIFT(1); clidr |= loc << CLIDR_LOC_SHIFT; /* * Add tag cache unified to data cache. Allocation tags and data are * unified in a cache line so that it looks valid even if there is only * one cache line. 
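 *
 * For example, on a host with neither CTR_EL0.IDC nor CTR_EL0.DIC, the
 * guest ends up with a single level of separate I and D caches, with LoUU,
 * LoUIS and LoC all set to 1, plus (with MTE) a unified tag+data cache
 * type reported at that same level.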
*/ if (kvm_has_mte(vcpu->kvm)) clidr |= 2ULL << CLIDR_TTYPE_SHIFT(loc); __vcpu_sys_reg(vcpu, r->reg) = clidr; return __vcpu_sys_reg(vcpu, r->reg); } static int set_clidr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, u64 val) { u64 ctr_el0 = read_sanitised_ftr_reg(SYS_CTR_EL0); u64 idc = !CLIDR_LOC(val) || (!CLIDR_LOUIS(val) && !CLIDR_LOUU(val)); if ((val & CLIDR_EL1_RES0) || (!(ctr_el0 & CTR_EL0_IDC) && idc)) return -EINVAL; __vcpu_sys_reg(vcpu, rd->reg) = val; return 0; } static bool access_csselr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { int reg = r->reg; if (p->is_write) vcpu_write_sys_reg(vcpu, p->regval, reg); else p->regval = vcpu_read_sys_reg(vcpu, reg); return true; } static bool access_ccsidr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { u32 csselr; if (p->is_write) return write_to_read_only(vcpu, p, r); csselr = vcpu_read_sys_reg(vcpu, CSSELR_EL1); csselr &= CSSELR_EL1_Level | CSSELR_EL1_InD; if (csselr < CSSELR_MAX) p->regval = get_ccsidr(vcpu, csselr); return true; } static unsigned int mte_visibility(const struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd) { if (kvm_has_mte(vcpu->kvm)) return 0; return REG_HIDDEN; } #define MTE_REG(name) { \ SYS_DESC(SYS_##name), \ .access = undef_access, \ .reset = reset_unknown, \ .reg = name, \ .visibility = mte_visibility, \ } static unsigned int el2_visibility(const struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd) { if (vcpu_has_nv(vcpu)) return 0; return REG_HIDDEN; } static bool bad_vncr_trap(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { /* * We really shouldn't be here, and this is likely the result * of a misconfigured trap, as this register should target the * VNCR page, and nothing else. */ return bad_trap(vcpu, p, r, "trap of VNCR-backed register"); } static bool bad_redir_trap(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { /* * We really shouldn't be here, and this is likely the result * of a misconfigured trap, as this register should target the * corresponding EL1, and nothing else. */ return bad_trap(vcpu, p, r, "trap of EL2 register redirected to EL1"); } #define EL2_REG(name, acc, rst, v) { \ SYS_DESC(SYS_##name), \ .access = acc, \ .reset = rst, \ .reg = name, \ .visibility = el2_visibility, \ .val = v, \ } #define EL2_REG_FILTERED(name, acc, rst, v, filter) { \ SYS_DESC(SYS_##name), \ .access = acc, \ .reset = rst, \ .reg = name, \ .visibility = filter, \ .val = v, \ } #define EL2_REG_VNCR(name, rst, v) EL2_REG(name, bad_vncr_trap, rst, v) #define EL2_REG_REDIR(name, rst, v) EL2_REG(name, bad_redir_trap, rst, v) /* * Since reset() callback and field val are not used for idregs, they will be * used for specific purposes for idregs. * The reset() would return KVM sanitised register value. The value would be the * same as the host kernel sanitised value if there is no KVM sanitisation. * The val would be used as a mask indicating writable fields for the idreg. * Only bits with 1 are writable from userspace. This mask might not be * necessary in the future whenever all ID registers are enabled as writable * from userspace. 
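 *
 * For example, ID_WRITABLE(ID_AA64PFR2_EL1, ID_AA64PFR2_EL1_FPMR) below
 * makes only the FPMR field writable from userspace; a write that alters
 * any other field of that register is rejected by arm64_check_features().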
*/ #define ID_DESC(name) \ SYS_DESC(SYS_##name), \ .access = access_id_reg, \ .get_user = get_id_reg \ /* sys_reg_desc initialiser for known cpufeature ID registers */ #define ID_SANITISED(name) { \ ID_DESC(name), \ .set_user = set_id_reg, \ .visibility = id_visibility, \ .reset = kvm_read_sanitised_id_reg, \ .val = 0, \ } /* sys_reg_desc initialiser for known cpufeature ID registers */ #define AA32_ID_SANITISED(name) { \ ID_DESC(name), \ .set_user = set_id_reg, \ .visibility = aa32_id_visibility, \ .reset = kvm_read_sanitised_id_reg, \ .val = 0, \ } /* sys_reg_desc initialiser for writable ID registers */ #define ID_WRITABLE(name, mask) { \ ID_DESC(name), \ .set_user = set_id_reg, \ .visibility = id_visibility, \ .reset = kvm_read_sanitised_id_reg, \ .val = mask, \ } /* sys_reg_desc initialiser for cpufeature ID registers that need filtering */ #define ID_FILTERED(sysreg, name, mask) { \ ID_DESC(sysreg), \ .set_user = set_##name, \ .visibility = id_visibility, \ .reset = kvm_read_sanitised_id_reg, \ .val = (mask), \ } /* * sys_reg_desc initialiser for architecturally unallocated cpufeature ID * register with encoding Op0=3, Op1=0, CRn=0, CRm=crm, Op2=op2 * (1 <= crm < 8, 0 <= Op2 < 8). */ #define ID_UNALLOCATED(crm, op2) { \ Op0(3), Op1(0), CRn(0), CRm(crm), Op2(op2), \ .access = access_id_reg, \ .get_user = get_id_reg, \ .set_user = set_id_reg, \ .visibility = raz_visibility, \ .reset = kvm_read_sanitised_id_reg, \ .val = 0, \ } /* * sys_reg_desc initialiser for known ID registers that we hide from guests. * For now, these are exposed just like unallocated ID regs: they appear * RAZ for the guest. */ #define ID_HIDDEN(name) { \ ID_DESC(name), \ .set_user = set_id_reg, \ .visibility = raz_visibility, \ .reset = kvm_read_sanitised_id_reg, \ .val = 0, \ } static bool access_sp_el1(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { if (p->is_write) __vcpu_sys_reg(vcpu, SP_EL1) = p->regval; else p->regval = __vcpu_sys_reg(vcpu, SP_EL1); return true; } static bool access_elr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { if (p->is_write) vcpu_write_sys_reg(vcpu, p->regval, ELR_EL1); else p->regval = vcpu_read_sys_reg(vcpu, ELR_EL1); return true; } static bool access_spsr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { if (p->is_write) __vcpu_sys_reg(vcpu, SPSR_EL1) = p->regval; else p->regval = __vcpu_sys_reg(vcpu, SPSR_EL1); return true; } static bool access_cntkctl_el12(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { if (p->is_write) __vcpu_sys_reg(vcpu, CNTKCTL_EL1) = p->regval; else p->regval = __vcpu_sys_reg(vcpu, CNTKCTL_EL1); return true; } static u64 reset_hcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { u64 val = r->val; if (!cpus_have_final_cap(ARM64_HAS_HCR_NV1)) val |= HCR_E2H; return __vcpu_sys_reg(vcpu, r->reg) = val; } static unsigned int __el2_visibility(const struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, unsigned int (*fn)(const struct kvm_vcpu *, const struct sys_reg_desc *)) { return el2_visibility(vcpu, rd) ?: fn(vcpu, rd); } static unsigned int sve_el2_visibility(const struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd) { return __el2_visibility(vcpu, rd, sve_visibility); } static bool access_zcr_el2(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { unsigned int vq; if (guest_hyp_sve_traps_enabled(vcpu)) { kvm_inject_nested_sve_trap(vcpu); return true; } if (!p->is_write) { p->regval = 
vcpu_read_sys_reg(vcpu, ZCR_EL2); return true; } vq = SYS_FIELD_GET(ZCR_ELx, LEN, p->regval) + 1; vq = min(vq, vcpu_sve_max_vq(vcpu)); vcpu_write_sys_reg(vcpu, vq - 1, ZCR_EL2); return true; } static unsigned int s1poe_visibility(const struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd) { if (kvm_has_s1poe(vcpu->kvm)) return 0; return REG_HIDDEN; } static unsigned int s1poe_el2_visibility(const struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd) { return __el2_visibility(vcpu, rd, s1poe_visibility); } static unsigned int tcr2_visibility(const struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd) { if (kvm_has_tcr2(vcpu->kvm)) return 0; return REG_HIDDEN; } static unsigned int tcr2_el2_visibility(const struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd) { return __el2_visibility(vcpu, rd, tcr2_visibility); } static unsigned int s1pie_visibility(const struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd) { if (kvm_has_s1pie(vcpu->kvm)) return 0; return REG_HIDDEN; } static unsigned int s1pie_el2_visibility(const struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd) { return __el2_visibility(vcpu, rd, s1pie_visibility); } static bool access_mdcr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { u64 old = __vcpu_sys_reg(vcpu, MDCR_EL2); if (!access_rw(vcpu, p, r)) return false; /* * Request a reload of the PMU to enable/disable the counters affected * by HPME. */ if ((old ^ __vcpu_sys_reg(vcpu, MDCR_EL2)) & MDCR_EL2_HPME) kvm_make_request(KVM_REQ_RELOAD_PMU, vcpu); return true; } /* * Architected system registers. * Important: Must be sorted ascending by Op0, Op1, CRn, CRm, Op2 * * Debug handling: We do trap most, if not all debug related system * registers. The implementation is good enough to ensure that a guest * can use these with minimal performance degradation. The drawback is * that we don't implement any of the external debug architecture. * This should be revisited if we ever encounter a more demanding * guest... */ static const struct sys_reg_desc sys_reg_descs[] = { DBG_BCR_BVR_WCR_WVR_EL1(0), DBG_BCR_BVR_WCR_WVR_EL1(1), { SYS_DESC(SYS_MDCCINT_EL1), trap_debug_regs, reset_val, MDCCINT_EL1, 0 }, { SYS_DESC(SYS_MDSCR_EL1), trap_debug_regs, reset_val, MDSCR_EL1, 0 }, DBG_BCR_BVR_WCR_WVR_EL1(2), DBG_BCR_BVR_WCR_WVR_EL1(3), DBG_BCR_BVR_WCR_WVR_EL1(4), DBG_BCR_BVR_WCR_WVR_EL1(5), DBG_BCR_BVR_WCR_WVR_EL1(6), DBG_BCR_BVR_WCR_WVR_EL1(7), DBG_BCR_BVR_WCR_WVR_EL1(8), DBG_BCR_BVR_WCR_WVR_EL1(9), DBG_BCR_BVR_WCR_WVR_EL1(10), DBG_BCR_BVR_WCR_WVR_EL1(11), DBG_BCR_BVR_WCR_WVR_EL1(12), DBG_BCR_BVR_WCR_WVR_EL1(13), DBG_BCR_BVR_WCR_WVR_EL1(14), DBG_BCR_BVR_WCR_WVR_EL1(15), { SYS_DESC(SYS_MDRAR_EL1), trap_raz_wi }, { SYS_DESC(SYS_OSLAR_EL1), trap_oslar_el1 }, { SYS_DESC(SYS_OSLSR_EL1), trap_oslsr_el1, reset_val, OSLSR_EL1, OSLSR_EL1_OSLM_IMPLEMENTED, .set_user = set_oslsr_el1, }, { SYS_DESC(SYS_OSDLR_EL1), trap_raz_wi }, { SYS_DESC(SYS_DBGPRCR_EL1), trap_raz_wi }, { SYS_DESC(SYS_DBGCLAIMSET_EL1), trap_raz_wi }, { SYS_DESC(SYS_DBGCLAIMCLR_EL1), trap_raz_wi }, { SYS_DESC(SYS_DBGAUTHSTATUS_EL1), trap_dbgauthstatus_el1 }, { SYS_DESC(SYS_MDCCSR_EL0), trap_raz_wi }, { SYS_DESC(SYS_DBGDTR_EL0), trap_raz_wi }, // DBGDTR[TR]X_EL0 share the same encoding { SYS_DESC(SYS_DBGDTRTX_EL0), trap_raz_wi }, { SYS_DESC(SYS_DBGVCR32_EL2), undef_access, reset_val, DBGVCR32_EL2, 0 }, { SYS_DESC(SYS_MPIDR_EL1), NULL, reset_mpidr, MPIDR_EL1 }, /* * ID regs: all ID_SANITISED() entries here must have corresponding * entries in arm64_ftr_regs[]. 
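 *
 * The feature ID block spans Op0=3, Op1=0, CRn=0, CRm=1..7 (see
 * is_vm_ftr_id_reg()); encodings in that range with no architected
 * register are still listed via ID_UNALLOCATED() so that the guest reads
 * them as zero rather than taking an UNDEF.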
*/ /* AArch64 mappings of the AArch32 ID registers */ /* CRm=1 */ AA32_ID_SANITISED(ID_PFR0_EL1), AA32_ID_SANITISED(ID_PFR1_EL1), { SYS_DESC(SYS_ID_DFR0_EL1), .access = access_id_reg, .get_user = get_id_reg, .set_user = set_id_dfr0_el1, .visibility = aa32_id_visibility, .reset = read_sanitised_id_dfr0_el1, .val = ID_DFR0_EL1_PerfMon_MASK | ID_DFR0_EL1_CopDbg_MASK, }, ID_HIDDEN(ID_AFR0_EL1), AA32_ID_SANITISED(ID_MMFR0_EL1), AA32_ID_SANITISED(ID_MMFR1_EL1), AA32_ID_SANITISED(ID_MMFR2_EL1), AA32_ID_SANITISED(ID_MMFR3_EL1), /* CRm=2 */ AA32_ID_SANITISED(ID_ISAR0_EL1), AA32_ID_SANITISED(ID_ISAR1_EL1), AA32_ID_SANITISED(ID_ISAR2_EL1), AA32_ID_SANITISED(ID_ISAR3_EL1), AA32_ID_SANITISED(ID_ISAR4_EL1), AA32_ID_SANITISED(ID_ISAR5_EL1), AA32_ID_SANITISED(ID_MMFR4_EL1), AA32_ID_SANITISED(ID_ISAR6_EL1), /* CRm=3 */ AA32_ID_SANITISED(MVFR0_EL1), AA32_ID_SANITISED(MVFR1_EL1), AA32_ID_SANITISED(MVFR2_EL1), ID_UNALLOCATED(3,3), AA32_ID_SANITISED(ID_PFR2_EL1), ID_HIDDEN(ID_DFR1_EL1), AA32_ID_SANITISED(ID_MMFR5_EL1), ID_UNALLOCATED(3,7), /* AArch64 ID registers */ /* CRm=4 */ ID_FILTERED(ID_AA64PFR0_EL1, id_aa64pfr0_el1, ~(ID_AA64PFR0_EL1_AMU | ID_AA64PFR0_EL1_MPAM | ID_AA64PFR0_EL1_SVE | ID_AA64PFR0_EL1_RAS | ID_AA64PFR0_EL1_AdvSIMD | ID_AA64PFR0_EL1_FP)), ID_FILTERED(ID_AA64PFR1_EL1, id_aa64pfr1_el1, ~(ID_AA64PFR1_EL1_PFAR | ID_AA64PFR1_EL1_DF2 | ID_AA64PFR1_EL1_MTEX | ID_AA64PFR1_EL1_THE | ID_AA64PFR1_EL1_GCS | ID_AA64PFR1_EL1_MTE_frac | ID_AA64PFR1_EL1_NMI | ID_AA64PFR1_EL1_RNDR_trap | ID_AA64PFR1_EL1_SME | ID_AA64PFR1_EL1_RES0 | ID_AA64PFR1_EL1_MPAM_frac | ID_AA64PFR1_EL1_RAS_frac | ID_AA64PFR1_EL1_MTE)), ID_WRITABLE(ID_AA64PFR2_EL1, ID_AA64PFR2_EL1_FPMR), ID_UNALLOCATED(4,3), ID_WRITABLE(ID_AA64ZFR0_EL1, ~ID_AA64ZFR0_EL1_RES0), ID_HIDDEN(ID_AA64SMFR0_EL1), ID_UNALLOCATED(4,6), ID_WRITABLE(ID_AA64FPFR0_EL1, ~ID_AA64FPFR0_EL1_RES0), /* CRm=5 */ /* * Prior to FEAT_Debugv8.9, the architecture defines context-aware * breakpoints (CTX_CMPs) as the highest numbered breakpoints (BRPs). * KVM does not trap + emulate the breakpoint registers, and as such * cannot support a layout that misaligns with the underlying hardware. * While it may be possible to describe a subset that aligns with * hardware, just prevent changes to BRPs and CTX_CMPs altogether for * simplicity. * * See DDI0487K.a, section D2.8.3 Breakpoint types and linking * of breakpoints for more details. 
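 *
 * As a consequence, the mask below only advertises DoubleLock, WRPs,
 * PMUVer and DebugVer as writable; BRPs and CTX_CMPs always follow the
 * sanitised host value.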
*/ ID_FILTERED(ID_AA64DFR0_EL1, id_aa64dfr0_el1, ID_AA64DFR0_EL1_DoubleLock_MASK | ID_AA64DFR0_EL1_WRPs_MASK | ID_AA64DFR0_EL1_PMUVer_MASK | ID_AA64DFR0_EL1_DebugVer_MASK), ID_SANITISED(ID_AA64DFR1_EL1), ID_UNALLOCATED(5,2), ID_UNALLOCATED(5,3), ID_HIDDEN(ID_AA64AFR0_EL1), ID_HIDDEN(ID_AA64AFR1_EL1), ID_UNALLOCATED(5,6), ID_UNALLOCATED(5,7), /* CRm=6 */ ID_WRITABLE(ID_AA64ISAR0_EL1, ~ID_AA64ISAR0_EL1_RES0), ID_WRITABLE(ID_AA64ISAR1_EL1, ~(ID_AA64ISAR1_EL1_GPI | ID_AA64ISAR1_EL1_GPA | ID_AA64ISAR1_EL1_API | ID_AA64ISAR1_EL1_APA)), ID_WRITABLE(ID_AA64ISAR2_EL1, ~(ID_AA64ISAR2_EL1_RES0 | ID_AA64ISAR2_EL1_APA3 | ID_AA64ISAR2_EL1_GPA3)), ID_WRITABLE(ID_AA64ISAR3_EL1, (ID_AA64ISAR3_EL1_FPRCVT | ID_AA64ISAR3_EL1_FAMINMAX)), ID_UNALLOCATED(6,4), ID_UNALLOCATED(6,5), ID_UNALLOCATED(6,6), ID_UNALLOCATED(6,7), /* CRm=7 */ ID_WRITABLE(ID_AA64MMFR0_EL1, ~(ID_AA64MMFR0_EL1_RES0 | ID_AA64MMFR0_EL1_TGRAN4_2 | ID_AA64MMFR0_EL1_TGRAN64_2 | ID_AA64MMFR0_EL1_TGRAN16_2 | ID_AA64MMFR0_EL1_ASIDBITS)), ID_WRITABLE(ID_AA64MMFR1_EL1, ~(ID_AA64MMFR1_EL1_RES0 | ID_AA64MMFR1_EL1_HCX | ID_AA64MMFR1_EL1_TWED | ID_AA64MMFR1_EL1_XNX | ID_AA64MMFR1_EL1_VH | ID_AA64MMFR1_EL1_VMIDBits)), ID_WRITABLE(ID_AA64MMFR2_EL1, ~(ID_AA64MMFR2_EL1_RES0 | ID_AA64MMFR2_EL1_EVT | ID_AA64MMFR2_EL1_FWB | ID_AA64MMFR2_EL1_IDS | ID_AA64MMFR2_EL1_NV | ID_AA64MMFR2_EL1_CCIDX)), ID_WRITABLE(ID_AA64MMFR3_EL1, (ID_AA64MMFR3_EL1_TCRX | ID_AA64MMFR3_EL1_S1PIE | ID_AA64MMFR3_EL1_S1POE)), ID_SANITISED(ID_AA64MMFR4_EL1), ID_UNALLOCATED(7,5), ID_UNALLOCATED(7,6), ID_UNALLOCATED(7,7), { SYS_DESC(SYS_SCTLR_EL1), access_vm_reg, reset_val, SCTLR_EL1, 0x00C50078 }, { SYS_DESC(SYS_ACTLR_EL1), access_actlr, reset_actlr, ACTLR_EL1 }, { SYS_DESC(SYS_CPACR_EL1), NULL, reset_val, CPACR_EL1, 0 }, MTE_REG(RGSR_EL1), MTE_REG(GCR_EL1), { SYS_DESC(SYS_ZCR_EL1), NULL, reset_val, ZCR_EL1, 0, .visibility = sve_visibility }, { SYS_DESC(SYS_TRFCR_EL1), undef_access }, { SYS_DESC(SYS_SMPRI_EL1), undef_access }, { SYS_DESC(SYS_SMCR_EL1), undef_access }, { SYS_DESC(SYS_TTBR0_EL1), access_vm_reg, reset_unknown, TTBR0_EL1 }, { SYS_DESC(SYS_TTBR1_EL1), access_vm_reg, reset_unknown, TTBR1_EL1 }, { SYS_DESC(SYS_TCR_EL1), access_vm_reg, reset_val, TCR_EL1, 0 }, { SYS_DESC(SYS_TCR2_EL1), access_vm_reg, reset_val, TCR2_EL1, 0, .visibility = tcr2_visibility }, PTRAUTH_KEY(APIA), PTRAUTH_KEY(APIB), PTRAUTH_KEY(APDA), PTRAUTH_KEY(APDB), PTRAUTH_KEY(APGA), { SYS_DESC(SYS_SPSR_EL1), access_spsr}, { SYS_DESC(SYS_ELR_EL1), access_elr}, { SYS_DESC(SYS_ICC_PMR_EL1), undef_access }, { SYS_DESC(SYS_AFSR0_EL1), access_vm_reg, reset_unknown, AFSR0_EL1 }, { SYS_DESC(SYS_AFSR1_EL1), access_vm_reg, reset_unknown, AFSR1_EL1 }, { SYS_DESC(SYS_ESR_EL1), access_vm_reg, reset_unknown, ESR_EL1 }, { SYS_DESC(SYS_ERRIDR_EL1), trap_raz_wi }, { SYS_DESC(SYS_ERRSELR_EL1), trap_raz_wi }, { SYS_DESC(SYS_ERXFR_EL1), trap_raz_wi }, { SYS_DESC(SYS_ERXCTLR_EL1), trap_raz_wi }, { SYS_DESC(SYS_ERXSTATUS_EL1), trap_raz_wi }, { SYS_DESC(SYS_ERXADDR_EL1), trap_raz_wi }, { SYS_DESC(SYS_ERXMISC0_EL1), trap_raz_wi }, { SYS_DESC(SYS_ERXMISC1_EL1), trap_raz_wi }, MTE_REG(TFSR_EL1), MTE_REG(TFSRE0_EL1), { SYS_DESC(SYS_FAR_EL1), access_vm_reg, reset_unknown, FAR_EL1 }, { SYS_DESC(SYS_PAR_EL1), NULL, reset_unknown, PAR_EL1 }, { SYS_DESC(SYS_PMSCR_EL1), undef_access }, { SYS_DESC(SYS_PMSNEVFR_EL1), undef_access }, { SYS_DESC(SYS_PMSICR_EL1), undef_access }, { SYS_DESC(SYS_PMSIRR_EL1), undef_access }, { SYS_DESC(SYS_PMSFCR_EL1), undef_access }, { SYS_DESC(SYS_PMSEVFR_EL1), undef_access }, { SYS_DESC(SYS_PMSLATFR_EL1), undef_access 
}, { SYS_DESC(SYS_PMSIDR_EL1), undef_access }, { SYS_DESC(SYS_PMBLIMITR_EL1), undef_access }, { SYS_DESC(SYS_PMBPTR_EL1), undef_access }, { SYS_DESC(SYS_PMBSR_EL1), undef_access }, /* PMBIDR_EL1 is not trapped */ { PMU_SYS_REG(PMINTENSET_EL1), .access = access_pminten, .reg = PMINTENSET_EL1, .get_user = get_pmreg, .set_user = set_pmreg }, { PMU_SYS_REG(PMINTENCLR_EL1), .access = access_pminten, .reg = PMINTENSET_EL1, .get_user = get_pmreg, .set_user = set_pmreg }, { SYS_DESC(SYS_PMMIR_EL1), trap_raz_wi }, { SYS_DESC(SYS_MAIR_EL1), access_vm_reg, reset_unknown, MAIR_EL1 }, { SYS_DESC(SYS_PIRE0_EL1), NULL, reset_unknown, PIRE0_EL1, .visibility = s1pie_visibility }, { SYS_DESC(SYS_PIR_EL1), NULL, reset_unknown, PIR_EL1, .visibility = s1pie_visibility }, { SYS_DESC(SYS_POR_EL1), NULL, reset_unknown, POR_EL1, .visibility = s1poe_visibility }, { SYS_DESC(SYS_AMAIR_EL1), access_vm_reg, reset_amair_el1, AMAIR_EL1 }, { SYS_DESC(SYS_LORSA_EL1), trap_loregion }, { SYS_DESC(SYS_LOREA_EL1), trap_loregion }, { SYS_DESC(SYS_LORN_EL1), trap_loregion }, { SYS_DESC(SYS_LORC_EL1), trap_loregion }, { SYS_DESC(SYS_MPAMIDR_EL1), undef_access }, { SYS_DESC(SYS_LORID_EL1), trap_loregion }, { SYS_DESC(SYS_MPAM1_EL1), undef_access }, { SYS_DESC(SYS_MPAM0_EL1), undef_access }, { SYS_DESC(SYS_VBAR_EL1), access_rw, reset_val, VBAR_EL1, 0 }, { SYS_DESC(SYS_DISR_EL1), NULL, reset_val, DISR_EL1, 0 }, { SYS_DESC(SYS_ICC_IAR0_EL1), undef_access }, { SYS_DESC(SYS_ICC_EOIR0_EL1), undef_access }, { SYS_DESC(SYS_ICC_HPPIR0_EL1), undef_access }, { SYS_DESC(SYS_ICC_BPR0_EL1), undef_access }, { SYS_DESC(SYS_ICC_AP0R0_EL1), undef_access }, { SYS_DESC(SYS_ICC_AP0R1_EL1), undef_access }, { SYS_DESC(SYS_ICC_AP0R2_EL1), undef_access }, { SYS_DESC(SYS_ICC_AP0R3_EL1), undef_access }, { SYS_DESC(SYS_ICC_AP1R0_EL1), undef_access }, { SYS_DESC(SYS_ICC_AP1R1_EL1), undef_access }, { SYS_DESC(SYS_ICC_AP1R2_EL1), undef_access }, { SYS_DESC(SYS_ICC_AP1R3_EL1), undef_access }, { SYS_DESC(SYS_ICC_DIR_EL1), undef_access }, { SYS_DESC(SYS_ICC_RPR_EL1), undef_access }, { SYS_DESC(SYS_ICC_SGI1R_EL1), access_gic_sgi }, { SYS_DESC(SYS_ICC_ASGI1R_EL1), access_gic_sgi }, { SYS_DESC(SYS_ICC_SGI0R_EL1), access_gic_sgi }, { SYS_DESC(SYS_ICC_IAR1_EL1), undef_access }, { SYS_DESC(SYS_ICC_EOIR1_EL1), undef_access }, { SYS_DESC(SYS_ICC_HPPIR1_EL1), undef_access }, { SYS_DESC(SYS_ICC_BPR1_EL1), undef_access }, { SYS_DESC(SYS_ICC_CTLR_EL1), undef_access }, { SYS_DESC(SYS_ICC_SRE_EL1), access_gic_sre }, { SYS_DESC(SYS_ICC_IGRPEN0_EL1), undef_access }, { SYS_DESC(SYS_ICC_IGRPEN1_EL1), undef_access }, { SYS_DESC(SYS_CONTEXTIDR_EL1), access_vm_reg, reset_val, CONTEXTIDR_EL1, 0 }, { SYS_DESC(SYS_TPIDR_EL1), NULL, reset_unknown, TPIDR_EL1 }, { SYS_DESC(SYS_ACCDATA_EL1), undef_access }, { SYS_DESC(SYS_SCXTNUM_EL1), undef_access }, { SYS_DESC(SYS_CNTKCTL_EL1), NULL, reset_val, CNTKCTL_EL1, 0}, { SYS_DESC(SYS_CCSIDR_EL1), access_ccsidr }, { SYS_DESC(SYS_CLIDR_EL1), access_clidr, reset_clidr, CLIDR_EL1, .set_user = set_clidr, .val = ~CLIDR_EL1_RES0 }, { SYS_DESC(SYS_CCSIDR2_EL1), undef_access }, { SYS_DESC(SYS_SMIDR_EL1), undef_access }, { SYS_DESC(SYS_CSSELR_EL1), access_csselr, reset_unknown, CSSELR_EL1 }, ID_FILTERED(CTR_EL0, ctr_el0, CTR_EL0_DIC_MASK | CTR_EL0_IDC_MASK | CTR_EL0_DminLine_MASK | CTR_EL0_L1Ip_MASK | CTR_EL0_IminLine_MASK), { SYS_DESC(SYS_SVCR), undef_access, reset_val, SVCR, 0, .visibility = sme_visibility }, { SYS_DESC(SYS_FPMR), undef_access, reset_val, FPMR, 0, .visibility = fp8_visibility }, { PMU_SYS_REG(PMCR_EL0), .access = access_pmcr, .reset = 
reset_pmcr, .reg = PMCR_EL0, .get_user = get_pmcr, .set_user = set_pmcr }, { PMU_SYS_REG(PMCNTENSET_EL0), .access = access_pmcnten, .reg = PMCNTENSET_EL0, .get_user = get_pmreg, .set_user = set_pmreg }, { PMU_SYS_REG(PMCNTENCLR_EL0), .access = access_pmcnten, .reg = PMCNTENSET_EL0, .get_user = get_pmreg, .set_user = set_pmreg }, { PMU_SYS_REG(PMOVSCLR_EL0), .access = access_pmovs, .reg = PMOVSSET_EL0, .get_user = get_pmreg, .set_user = set_pmreg }, /* * PM_SWINC_EL0 is exposed to userspace as RAZ/WI, as it was * previously (and pointlessly) advertised in the past... */ { PMU_SYS_REG(PMSWINC_EL0), .get_user = get_raz_reg, .set_user = set_wi_reg, .access = access_pmswinc, .reset = NULL }, { PMU_SYS_REG(PMSELR_EL0), .access = access_pmselr, .reset = reset_pmselr, .reg = PMSELR_EL0 }, { PMU_SYS_REG(PMCEID0_EL0), .access = access_pmceid, .reset = NULL }, { PMU_SYS_REG(PMCEID1_EL0), .access = access_pmceid, .reset = NULL }, { PMU_SYS_REG(PMCCNTR_EL0), .access = access_pmu_evcntr, .reset = reset_unknown, .reg = PMCCNTR_EL0, .get_user = get_pmu_evcntr}, { PMU_SYS_REG(PMXEVTYPER_EL0), .access = access_pmu_evtyper, .reset = NULL }, { PMU_SYS_REG(PMXEVCNTR_EL0), .access = access_pmu_evcntr, .reset = NULL }, /* * PMUSERENR_EL0 resets as unknown in 64bit mode while it resets as zero * in 32bit mode. Here we choose to reset it as zero for consistency. */ { PMU_SYS_REG(PMUSERENR_EL0), .access = access_pmuserenr, .reset = reset_val, .reg = PMUSERENR_EL0, .val = 0 }, { PMU_SYS_REG(PMOVSSET_EL0), .access = access_pmovs, .reg = PMOVSSET_EL0, .get_user = get_pmreg, .set_user = set_pmreg }, { SYS_DESC(SYS_POR_EL0), NULL, reset_unknown, POR_EL0, .visibility = s1poe_visibility }, { SYS_DESC(SYS_TPIDR_EL0), NULL, reset_unknown, TPIDR_EL0 }, { SYS_DESC(SYS_TPIDRRO_EL0), NULL, reset_unknown, TPIDRRO_EL0 }, { SYS_DESC(SYS_TPIDR2_EL0), undef_access }, { SYS_DESC(SYS_SCXTNUM_EL0), undef_access }, { SYS_DESC(SYS_AMCR_EL0), undef_access }, { SYS_DESC(SYS_AMCFGR_EL0), undef_access }, { SYS_DESC(SYS_AMCGCR_EL0), undef_access }, { SYS_DESC(SYS_AMUSERENR_EL0), undef_access }, { SYS_DESC(SYS_AMCNTENCLR0_EL0), undef_access }, { SYS_DESC(SYS_AMCNTENSET0_EL0), undef_access }, { SYS_DESC(SYS_AMCNTENCLR1_EL0), undef_access }, { SYS_DESC(SYS_AMCNTENSET1_EL0), undef_access }, AMU_AMEVCNTR0_EL0(0), AMU_AMEVCNTR0_EL0(1), AMU_AMEVCNTR0_EL0(2), AMU_AMEVCNTR0_EL0(3), AMU_AMEVCNTR0_EL0(4), AMU_AMEVCNTR0_EL0(5), AMU_AMEVCNTR0_EL0(6), AMU_AMEVCNTR0_EL0(7), AMU_AMEVCNTR0_EL0(8), AMU_AMEVCNTR0_EL0(9), AMU_AMEVCNTR0_EL0(10), AMU_AMEVCNTR0_EL0(11), AMU_AMEVCNTR0_EL0(12), AMU_AMEVCNTR0_EL0(13), AMU_AMEVCNTR0_EL0(14), AMU_AMEVCNTR0_EL0(15), AMU_AMEVTYPER0_EL0(0), AMU_AMEVTYPER0_EL0(1), AMU_AMEVTYPER0_EL0(2), AMU_AMEVTYPER0_EL0(3), AMU_AMEVTYPER0_EL0(4), AMU_AMEVTYPER0_EL0(5), AMU_AMEVTYPER0_EL0(6), AMU_AMEVTYPER0_EL0(7), AMU_AMEVTYPER0_EL0(8), AMU_AMEVTYPER0_EL0(9), AMU_AMEVTYPER0_EL0(10), AMU_AMEVTYPER0_EL0(11), AMU_AMEVTYPER0_EL0(12), AMU_AMEVTYPER0_EL0(13), AMU_AMEVTYPER0_EL0(14), AMU_AMEVTYPER0_EL0(15), AMU_AMEVCNTR1_EL0(0), AMU_AMEVCNTR1_EL0(1), AMU_AMEVCNTR1_EL0(2), AMU_AMEVCNTR1_EL0(3), AMU_AMEVCNTR1_EL0(4), AMU_AMEVCNTR1_EL0(5), AMU_AMEVCNTR1_EL0(6), AMU_AMEVCNTR1_EL0(7), AMU_AMEVCNTR1_EL0(8), AMU_AMEVCNTR1_EL0(9), AMU_AMEVCNTR1_EL0(10), AMU_AMEVCNTR1_EL0(11), AMU_AMEVCNTR1_EL0(12), AMU_AMEVCNTR1_EL0(13), AMU_AMEVCNTR1_EL0(14), AMU_AMEVCNTR1_EL0(15), AMU_AMEVTYPER1_EL0(0), AMU_AMEVTYPER1_EL0(1), AMU_AMEVTYPER1_EL0(2), AMU_AMEVTYPER1_EL0(3), AMU_AMEVTYPER1_EL0(4), AMU_AMEVTYPER1_EL0(5), AMU_AMEVTYPER1_EL0(6), AMU_AMEVTYPER1_EL0(7), 
AMU_AMEVTYPER1_EL0(8), AMU_AMEVTYPER1_EL0(9), AMU_AMEVTYPER1_EL0(10), AMU_AMEVTYPER1_EL0(11), AMU_AMEVTYPER1_EL0(12), AMU_AMEVTYPER1_EL0(13), AMU_AMEVTYPER1_EL0(14), AMU_AMEVTYPER1_EL0(15), { SYS_DESC(SYS_CNTPCT_EL0), access_arch_timer }, { SYS_DESC(SYS_CNTVCT_EL0), access_arch_timer }, { SYS_DESC(SYS_CNTPCTSS_EL0), access_arch_timer }, { SYS_DESC(SYS_CNTVCTSS_EL0), access_arch_timer }, { SYS_DESC(SYS_CNTP_TVAL_EL0), access_arch_timer }, { SYS_DESC(SYS_CNTP_CTL_EL0), access_arch_timer }, { SYS_DESC(SYS_CNTP_CVAL_EL0), access_arch_timer }, { SYS_DESC(SYS_CNTV_TVAL_EL0), access_arch_timer }, { SYS_DESC(SYS_CNTV_CTL_EL0), access_arch_timer }, { SYS_DESC(SYS_CNTV_CVAL_EL0), access_arch_timer }, /* PMEVCNTRn_EL0 */ PMU_PMEVCNTR_EL0(0), PMU_PMEVCNTR_EL0(1), PMU_PMEVCNTR_EL0(2), PMU_PMEVCNTR_EL0(3), PMU_PMEVCNTR_EL0(4), PMU_PMEVCNTR_EL0(5), PMU_PMEVCNTR_EL0(6), PMU_PMEVCNTR_EL0(7), PMU_PMEVCNTR_EL0(8), PMU_PMEVCNTR_EL0(9), PMU_PMEVCNTR_EL0(10), PMU_PMEVCNTR_EL0(11), PMU_PMEVCNTR_EL0(12), PMU_PMEVCNTR_EL0(13), PMU_PMEVCNTR_EL0(14), PMU_PMEVCNTR_EL0(15), PMU_PMEVCNTR_EL0(16), PMU_PMEVCNTR_EL0(17), PMU_PMEVCNTR_EL0(18), PMU_PMEVCNTR_EL0(19), PMU_PMEVCNTR_EL0(20), PMU_PMEVCNTR_EL0(21), PMU_PMEVCNTR_EL0(22), PMU_PMEVCNTR_EL0(23), PMU_PMEVCNTR_EL0(24), PMU_PMEVCNTR_EL0(25), PMU_PMEVCNTR_EL0(26), PMU_PMEVCNTR_EL0(27), PMU_PMEVCNTR_EL0(28), PMU_PMEVCNTR_EL0(29), PMU_PMEVCNTR_EL0(30), /* PMEVTYPERn_EL0 */ PMU_PMEVTYPER_EL0(0), PMU_PMEVTYPER_EL0(1), PMU_PMEVTYPER_EL0(2), PMU_PMEVTYPER_EL0(3), PMU_PMEVTYPER_EL0(4), PMU_PMEVTYPER_EL0(5), PMU_PMEVTYPER_EL0(6), PMU_PMEVTYPER_EL0(7), PMU_PMEVTYPER_EL0(8), PMU_PMEVTYPER_EL0(9), PMU_PMEVTYPER_EL0(10), PMU_PMEVTYPER_EL0(11), PMU_PMEVTYPER_EL0(12), PMU_PMEVTYPER_EL0(13), PMU_PMEVTYPER_EL0(14), PMU_PMEVTYPER_EL0(15), PMU_PMEVTYPER_EL0(16), PMU_PMEVTYPER_EL0(17), PMU_PMEVTYPER_EL0(18), PMU_PMEVTYPER_EL0(19), PMU_PMEVTYPER_EL0(20), PMU_PMEVTYPER_EL0(21), PMU_PMEVTYPER_EL0(22), PMU_PMEVTYPER_EL0(23), PMU_PMEVTYPER_EL0(24), PMU_PMEVTYPER_EL0(25), PMU_PMEVTYPER_EL0(26), PMU_PMEVTYPER_EL0(27), PMU_PMEVTYPER_EL0(28), PMU_PMEVTYPER_EL0(29), PMU_PMEVTYPER_EL0(30), /* * PMCCFILTR_EL0 resets as unknown in 64bit mode while it resets as zero * in 32bit mode. Here we choose to reset it as zero for consistency. 
*/ { PMU_SYS_REG(PMCCFILTR_EL0), .access = access_pmu_evtyper, .reset = reset_val, .reg = PMCCFILTR_EL0, .val = 0 }, EL2_REG_VNCR(VPIDR_EL2, reset_unknown, 0), EL2_REG_VNCR(VMPIDR_EL2, reset_unknown, 0), EL2_REG(SCTLR_EL2, access_rw, reset_val, SCTLR_EL2_RES1), EL2_REG(ACTLR_EL2, access_rw, reset_val, 0), EL2_REG_VNCR(HCR_EL2, reset_hcr, 0), EL2_REG(MDCR_EL2, access_mdcr, reset_val, 0), EL2_REG(CPTR_EL2, access_rw, reset_val, CPTR_NVHE_EL2_RES1), EL2_REG_VNCR(HSTR_EL2, reset_val, 0), EL2_REG_VNCR(HFGRTR_EL2, reset_val, 0), EL2_REG_VNCR(HFGWTR_EL2, reset_val, 0), EL2_REG_VNCR(HFGITR_EL2, reset_val, 0), EL2_REG_VNCR(HACR_EL2, reset_val, 0), EL2_REG_FILTERED(ZCR_EL2, access_zcr_el2, reset_val, 0, sve_el2_visibility), EL2_REG_VNCR(HCRX_EL2, reset_val, 0), EL2_REG(TTBR0_EL2, access_rw, reset_val, 0), EL2_REG(TTBR1_EL2, access_rw, reset_val, 0), EL2_REG(TCR_EL2, access_rw, reset_val, TCR_EL2_RES1), EL2_REG_FILTERED(TCR2_EL2, access_rw, reset_val, TCR2_EL2_RES1, tcr2_el2_visibility), EL2_REG_VNCR(VTTBR_EL2, reset_val, 0), EL2_REG_VNCR(VTCR_EL2, reset_val, 0), { SYS_DESC(SYS_DACR32_EL2), undef_access, reset_unknown, DACR32_EL2 }, EL2_REG_VNCR(HDFGRTR_EL2, reset_val, 0), EL2_REG_VNCR(HDFGWTR_EL2, reset_val, 0), EL2_REG_VNCR(HAFGRTR_EL2, reset_val, 0), EL2_REG_REDIR(SPSR_EL2, reset_val, 0), EL2_REG_REDIR(ELR_EL2, reset_val, 0), { SYS_DESC(SYS_SP_EL1), access_sp_el1}, /* AArch32 SPSR_* are RES0 if trapped from a NV guest */ { SYS_DESC(SYS_SPSR_irq), .access = trap_raz_wi }, { SYS_DESC(SYS_SPSR_abt), .access = trap_raz_wi }, { SYS_DESC(SYS_SPSR_und), .access = trap_raz_wi }, { SYS_DESC(SYS_SPSR_fiq), .access = trap_raz_wi }, { SYS_DESC(SYS_IFSR32_EL2), undef_access, reset_unknown, IFSR32_EL2 }, EL2_REG(AFSR0_EL2, access_rw, reset_val, 0), EL2_REG(AFSR1_EL2, access_rw, reset_val, 0), EL2_REG_REDIR(ESR_EL2, reset_val, 0), { SYS_DESC(SYS_FPEXC32_EL2), undef_access, reset_val, FPEXC32_EL2, 0x700 }, EL2_REG_REDIR(FAR_EL2, reset_val, 0), EL2_REG(HPFAR_EL2, access_rw, reset_val, 0), EL2_REG(MAIR_EL2, access_rw, reset_val, 0), EL2_REG_FILTERED(PIRE0_EL2, access_rw, reset_val, 0, s1pie_el2_visibility), EL2_REG_FILTERED(PIR_EL2, access_rw, reset_val, 0, s1pie_el2_visibility), EL2_REG_FILTERED(POR_EL2, access_rw, reset_val, 0, s1poe_el2_visibility), EL2_REG(AMAIR_EL2, access_rw, reset_val, 0), { SYS_DESC(SYS_MPAMHCR_EL2), undef_access }, { SYS_DESC(SYS_MPAMVPMV_EL2), undef_access }, { SYS_DESC(SYS_MPAM2_EL2), undef_access }, { SYS_DESC(SYS_MPAMVPM0_EL2), undef_access }, { SYS_DESC(SYS_MPAMVPM1_EL2), undef_access }, { SYS_DESC(SYS_MPAMVPM2_EL2), undef_access }, { SYS_DESC(SYS_MPAMVPM3_EL2), undef_access }, { SYS_DESC(SYS_MPAMVPM4_EL2), undef_access }, { SYS_DESC(SYS_MPAMVPM5_EL2), undef_access }, { SYS_DESC(SYS_MPAMVPM6_EL2), undef_access }, { SYS_DESC(SYS_MPAMVPM7_EL2), undef_access }, EL2_REG(VBAR_EL2, access_rw, reset_val, 0), EL2_REG(RVBAR_EL2, access_rw, reset_val, 0), { SYS_DESC(SYS_RMR_EL2), undef_access }, EL2_REG_VNCR(ICH_HCR_EL2, reset_val, 0), EL2_REG(CONTEXTIDR_EL2, access_rw, reset_val, 0), EL2_REG(TPIDR_EL2, access_rw, reset_val, 0), EL2_REG_VNCR(CNTVOFF_EL2, reset_val, 0), EL2_REG(CNTHCTL_EL2, access_rw, reset_val, 0), { SYS_DESC(SYS_CNTHP_TVAL_EL2), access_arch_timer }, EL2_REG(CNTHP_CTL_EL2, access_arch_timer, reset_val, 0), EL2_REG(CNTHP_CVAL_EL2, access_arch_timer, reset_val, 0), { SYS_DESC(SYS_CNTHV_TVAL_EL2), access_hv_timer }, EL2_REG(CNTHV_CTL_EL2, access_hv_timer, reset_val, 0), EL2_REG(CNTHV_CVAL_EL2, access_hv_timer, reset_val, 0), { SYS_DESC(SYS_CNTKCTL_EL12), access_cntkctl_el12 }, { 
SYS_DESC(SYS_CNTP_TVAL_EL02), access_arch_timer }, { SYS_DESC(SYS_CNTP_CTL_EL02), access_arch_timer }, { SYS_DESC(SYS_CNTP_CVAL_EL02), access_arch_timer }, { SYS_DESC(SYS_CNTV_TVAL_EL02), access_arch_timer }, { SYS_DESC(SYS_CNTV_CTL_EL02), access_arch_timer }, { SYS_DESC(SYS_CNTV_CVAL_EL02), access_arch_timer }, EL2_REG(SP_EL2, NULL, reset_unknown, 0), }; static bool handle_at_s1e01(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { u32 op = sys_insn(p->Op0, p->Op1, p->CRn, p->CRm, p->Op2); __kvm_at_s1e01(vcpu, op, p->regval); return true; } static bool handle_at_s1e2(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { u32 op = sys_insn(p->Op0, p->Op1, p->CRn, p->CRm, p->Op2); /* There is no FGT associated with AT S1E2A :-( */ if (op == OP_AT_S1E2A && !kvm_has_feat(vcpu->kvm, ID_AA64ISAR2_EL1, ATS1A, IMP)) { kvm_inject_undefined(vcpu); return false; } __kvm_at_s1e2(vcpu, op, p->regval); return true; } static bool handle_at_s12(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { u32 op = sys_insn(p->Op0, p->Op1, p->CRn, p->CRm, p->Op2); __kvm_at_s12(vcpu, op, p->regval); return true; } static bool kvm_supported_tlbi_s12_op(struct kvm_vcpu *vpcu, u32 instr) { struct kvm *kvm = vpcu->kvm; u8 CRm = sys_reg_CRm(instr); if (sys_reg_CRn(instr) == TLBI_CRn_nXS && !kvm_has_feat(kvm, ID_AA64ISAR1_EL1, XS, IMP)) return false; if (CRm == TLBI_CRm_nROS && !kvm_has_feat(kvm, ID_AA64ISAR0_EL1, TLB, OS)) return false; return true; } static bool handle_alle1is(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { u32 sys_encoding = sys_insn(p->Op0, p->Op1, p->CRn, p->CRm, p->Op2); if (!kvm_supported_tlbi_s12_op(vcpu, sys_encoding)) return undef_access(vcpu, p, r); write_lock(&vcpu->kvm->mmu_lock); /* * Drop all shadow S2s, resulting in S1/S2 TLBIs for each of the * corresponding VMIDs. */ kvm_nested_s2_unmap(vcpu->kvm, true); write_unlock(&vcpu->kvm->mmu_lock); return true; } static bool kvm_supported_tlbi_ipas2_op(struct kvm_vcpu *vpcu, u32 instr) { struct kvm *kvm = vpcu->kvm; u8 CRm = sys_reg_CRm(instr); u8 Op2 = sys_reg_Op2(instr); if (sys_reg_CRn(instr) == TLBI_CRn_nXS && !kvm_has_feat(kvm, ID_AA64ISAR1_EL1, XS, IMP)) return false; if (CRm == TLBI_CRm_IPAIS && (Op2 == 2 || Op2 == 6) && !kvm_has_feat(kvm, ID_AA64ISAR0_EL1, TLB, RANGE)) return false; if (CRm == TLBI_CRm_IPAONS && (Op2 == 0 || Op2 == 4) && !kvm_has_feat(kvm, ID_AA64ISAR0_EL1, TLB, OS)) return false; if (CRm == TLBI_CRm_IPAONS && (Op2 == 3 || Op2 == 7) && !kvm_has_feat(kvm, ID_AA64ISAR0_EL1, TLB, RANGE)) return false; return true; } /* Only defined here as this is an internal "abstraction" */ union tlbi_info { struct { u64 start; u64 size; } range; struct { u64 addr; } ipa; struct { u64 addr; u32 encoding; } va; }; static void s2_mmu_unmap_range(struct kvm_s2_mmu *mmu, const union tlbi_info *info) { /* * The unmap operation is allowed to drop the MMU lock and block, which * means that @mmu could be used for a different context than the one * currently being invalidated. * * This behavior is still safe, as: * * 1) The vCPU(s) that recycled the MMU are responsible for invalidating * the entire MMU before reusing it, which still honors the intent * of a TLBI. * * 2) Until the guest TLBI instruction is 'retired' (i.e. increment PC * and ERET to the guest), other vCPUs are allowed to use stale * translations. 
* * 3) Accidentally unmapping an unrelated MMU context is nonfatal, and * at worst may cause more aborts for shadow stage-2 fills. * * Dropping the MMU lock also implies that shadow stage-2 fills could * happen behind the back of the TLBI. This is still safe, though, as * the L1 needs to put its stage-2 in a consistent state before doing * the TLBI. */ kvm_stage2_unmap_range(mmu, info->range.start, info->range.size, true); } static bool handle_vmalls12e1is(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { u32 sys_encoding = sys_insn(p->Op0, p->Op1, p->CRn, p->CRm, p->Op2); u64 limit, vttbr; if (!kvm_supported_tlbi_s12_op(vcpu, sys_encoding)) return undef_access(vcpu, p, r); vttbr = vcpu_read_sys_reg(vcpu, VTTBR_EL2); limit = BIT_ULL(kvm_get_pa_bits(vcpu->kvm)); kvm_s2_mmu_iterate_by_vmid(vcpu->kvm, get_vmid(vttbr), &(union tlbi_info) { .range = { .start = 0, .size = limit, }, }, s2_mmu_unmap_range); return true; } static bool handle_ripas2e1is(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { u32 sys_encoding = sys_insn(p->Op0, p->Op1, p->CRn, p->CRm, p->Op2); u64 vttbr = vcpu_read_sys_reg(vcpu, VTTBR_EL2); u64 base, range, tg, num, scale; int shift; if (!kvm_supported_tlbi_ipas2_op(vcpu, sys_encoding)) return undef_access(vcpu, p, r); /* * Because the shadow S2 structure doesn't necessarily reflect that * of the guest's S2 (different base granule size, for example), we * decide to ignore TTL and only use the described range. */ tg = FIELD_GET(GENMASK(47, 46), p->regval); scale = FIELD_GET(GENMASK(45, 44), p->regval); num = FIELD_GET(GENMASK(43, 39), p->regval); base = p->regval & GENMASK(36, 0); switch(tg) { case 1: shift = 12; break; case 2: shift = 14; break; case 3: default: /* IMPDEF: handle tg==0 as 64k */ shift = 16; break; } base <<= shift; range = __TLBI_RANGE_PAGES(num, scale) << shift; kvm_s2_mmu_iterate_by_vmid(vcpu->kvm, get_vmid(vttbr), &(union tlbi_info) { .range = { .start = base, .size = range, }, }, s2_mmu_unmap_range); return true; } static void s2_mmu_unmap_ipa(struct kvm_s2_mmu *mmu, const union tlbi_info *info) { unsigned long max_size; u64 base_addr; /* * We drop a number of things from the supplied value: * * - NS bit: we're non-secure only. * * - IPA[51:48]: We don't support 52bit IPA just yet... * * And of course, adjust the IPA to be on an actual address. */ base_addr = (info->ipa.addr & GENMASK_ULL(35, 0)) << 12; max_size = compute_tlb_inval_range(mmu, info->ipa.addr); base_addr &= ~(max_size - 1); /* * See comment in s2_mmu_unmap_range() for why this is allowed to * reschedule. 
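 *
 * As an illustration, a TLBI IPAS2E1IS with Xt[35:0] == 0x123456 names
 * IPA 0x123456000; that address is then aligned down to the size of the
 * shadow mapping covering it before the range is unmapped.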
*/ kvm_stage2_unmap_range(mmu, base_addr, max_size, true); } static bool handle_ipas2e1is(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { u32 sys_encoding = sys_insn(p->Op0, p->Op1, p->CRn, p->CRm, p->Op2); u64 vttbr = vcpu_read_sys_reg(vcpu, VTTBR_EL2); if (!kvm_supported_tlbi_ipas2_op(vcpu, sys_encoding)) return undef_access(vcpu, p, r); kvm_s2_mmu_iterate_by_vmid(vcpu->kvm, get_vmid(vttbr), &(union tlbi_info) { .ipa = { .addr = p->regval, }, }, s2_mmu_unmap_ipa); return true; } static void s2_mmu_tlbi_s1e1(struct kvm_s2_mmu *mmu, const union tlbi_info *info) { WARN_ON(__kvm_tlbi_s1e2(mmu, info->va.addr, info->va.encoding)); } static bool handle_tlbi_el1(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { u32 sys_encoding = sys_insn(p->Op0, p->Op1, p->CRn, p->CRm, p->Op2); u64 vttbr = vcpu_read_sys_reg(vcpu, VTTBR_EL2); /* * If we're here, this is because we've trapped on a EL1 TLBI * instruction that affects the EL1 translation regime while * we're running in a context that doesn't allow us to let the * HW do its thing (aka vEL2): * * - HCR_EL2.E2H == 0 : a non-VHE guest * - HCR_EL2.{E2H,TGE} == { 1, 0 } : a VHE guest in guest mode * * We don't expect these helpers to ever be called when running * in a vEL1 context. */ WARN_ON(!vcpu_is_el2(vcpu)); if (!kvm_supported_tlbi_s1e1_op(vcpu, sys_encoding)) return undef_access(vcpu, p, r); kvm_s2_mmu_iterate_by_vmid(vcpu->kvm, get_vmid(vttbr), &(union tlbi_info) { .va = { .addr = p->regval, .encoding = sys_encoding, }, }, s2_mmu_tlbi_s1e1); return true; } #define SYS_INSN(insn, access_fn) \ { \ SYS_DESC(OP_##insn), \ .access = (access_fn), \ } static struct sys_reg_desc sys_insn_descs[] = { { SYS_DESC(SYS_DC_ISW), access_dcsw }, { SYS_DESC(SYS_DC_IGSW), access_dcgsw }, { SYS_DESC(SYS_DC_IGDSW), access_dcgsw }, SYS_INSN(AT_S1E1R, handle_at_s1e01), SYS_INSN(AT_S1E1W, handle_at_s1e01), SYS_INSN(AT_S1E0R, handle_at_s1e01), SYS_INSN(AT_S1E0W, handle_at_s1e01), SYS_INSN(AT_S1E1RP, handle_at_s1e01), SYS_INSN(AT_S1E1WP, handle_at_s1e01), { SYS_DESC(SYS_DC_CSW), access_dcsw }, { SYS_DESC(SYS_DC_CGSW), access_dcgsw }, { SYS_DESC(SYS_DC_CGDSW), access_dcgsw }, { SYS_DESC(SYS_DC_CISW), access_dcsw }, { SYS_DESC(SYS_DC_CIGSW), access_dcgsw }, { SYS_DESC(SYS_DC_CIGDSW), access_dcgsw }, SYS_INSN(TLBI_VMALLE1OS, handle_tlbi_el1), SYS_INSN(TLBI_VAE1OS, handle_tlbi_el1), SYS_INSN(TLBI_ASIDE1OS, handle_tlbi_el1), SYS_INSN(TLBI_VAAE1OS, handle_tlbi_el1), SYS_INSN(TLBI_VALE1OS, handle_tlbi_el1), SYS_INSN(TLBI_VAALE1OS, handle_tlbi_el1), SYS_INSN(TLBI_RVAE1IS, handle_tlbi_el1), SYS_INSN(TLBI_RVAAE1IS, handle_tlbi_el1), SYS_INSN(TLBI_RVALE1IS, handle_tlbi_el1), SYS_INSN(TLBI_RVAALE1IS, handle_tlbi_el1), SYS_INSN(TLBI_VMALLE1IS, handle_tlbi_el1), SYS_INSN(TLBI_VAE1IS, handle_tlbi_el1), SYS_INSN(TLBI_ASIDE1IS, handle_tlbi_el1), SYS_INSN(TLBI_VAAE1IS, handle_tlbi_el1), SYS_INSN(TLBI_VALE1IS, handle_tlbi_el1), SYS_INSN(TLBI_VAALE1IS, handle_tlbi_el1), SYS_INSN(TLBI_RVAE1OS, handle_tlbi_el1), SYS_INSN(TLBI_RVAAE1OS, handle_tlbi_el1), SYS_INSN(TLBI_RVALE1OS, handle_tlbi_el1), SYS_INSN(TLBI_RVAALE1OS, handle_tlbi_el1), SYS_INSN(TLBI_RVAE1, handle_tlbi_el1), SYS_INSN(TLBI_RVAAE1, handle_tlbi_el1), SYS_INSN(TLBI_RVALE1, handle_tlbi_el1), SYS_INSN(TLBI_RVAALE1, handle_tlbi_el1), SYS_INSN(TLBI_VMALLE1, handle_tlbi_el1), SYS_INSN(TLBI_VAE1, handle_tlbi_el1), SYS_INSN(TLBI_ASIDE1, handle_tlbi_el1), SYS_INSN(TLBI_VAAE1, handle_tlbi_el1), SYS_INSN(TLBI_VALE1, handle_tlbi_el1), SYS_INSN(TLBI_VAALE1, handle_tlbi_el1), 
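	/*
	 * FEAT_XS: the nXS flavours below live under a separate CRn but are
	 * routed to the same handlers as their base forms; the
	 * kvm_supported_tlbi_*_op() checks reject them with UNDEF when
	 * FEAT_XS isn't exposed to the guest.
	 */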
SYS_INSN(TLBI_VMALLE1OSNXS, handle_tlbi_el1), SYS_INSN(TLBI_VAE1OSNXS, handle_tlbi_el1), SYS_INSN(TLBI_ASIDE1OSNXS, handle_tlbi_el1), SYS_INSN(TLBI_VAAE1OSNXS, handle_tlbi_el1), SYS_INSN(TLBI_VALE1OSNXS, handle_tlbi_el1), SYS_INSN(TLBI_VAALE1OSNXS, handle_tlbi_el1), SYS_INSN(TLBI_RVAE1ISNXS, handle_tlbi_el1), SYS_INSN(TLBI_RVAAE1ISNXS, handle_tlbi_el1), SYS_INSN(TLBI_RVALE1ISNXS, handle_tlbi_el1), SYS_INSN(TLBI_RVAALE1ISNXS, handle_tlbi_el1), SYS_INSN(TLBI_VMALLE1ISNXS, handle_tlbi_el1), SYS_INSN(TLBI_VAE1ISNXS, handle_tlbi_el1), SYS_INSN(TLBI_ASIDE1ISNXS, handle_tlbi_el1), SYS_INSN(TLBI_VAAE1ISNXS, handle_tlbi_el1), SYS_INSN(TLBI_VALE1ISNXS, handle_tlbi_el1), SYS_INSN(TLBI_VAALE1ISNXS, handle_tlbi_el1), SYS_INSN(TLBI_RVAE1OSNXS, handle_tlbi_el1), SYS_INSN(TLBI_RVAAE1OSNXS, handle_tlbi_el1), SYS_INSN(TLBI_RVALE1OSNXS, handle_tlbi_el1), SYS_INSN(TLBI_RVAALE1OSNXS, handle_tlbi_el1), SYS_INSN(TLBI_RVAE1NXS, handle_tlbi_el1), SYS_INSN(TLBI_RVAAE1NXS, handle_tlbi_el1), SYS_INSN(TLBI_RVALE1NXS, handle_tlbi_el1), SYS_INSN(TLBI_RVAALE1NXS, handle_tlbi_el1), SYS_INSN(TLBI_VMALLE1NXS, handle_tlbi_el1), SYS_INSN(TLBI_VAE1NXS, handle_tlbi_el1), SYS_INSN(TLBI_ASIDE1NXS, handle_tlbi_el1), SYS_INSN(TLBI_VAAE1NXS, handle_tlbi_el1), SYS_INSN(TLBI_VALE1NXS, handle_tlbi_el1), SYS_INSN(TLBI_VAALE1NXS, handle_tlbi_el1), SYS_INSN(AT_S1E2R, handle_at_s1e2), SYS_INSN(AT_S1E2W, handle_at_s1e2), SYS_INSN(AT_S12E1R, handle_at_s12), SYS_INSN(AT_S12E1W, handle_at_s12), SYS_INSN(AT_S12E0R, handle_at_s12), SYS_INSN(AT_S12E0W, handle_at_s12), SYS_INSN(AT_S1E2A, handle_at_s1e2), SYS_INSN(TLBI_IPAS2E1IS, handle_ipas2e1is), SYS_INSN(TLBI_RIPAS2E1IS, handle_ripas2e1is), SYS_INSN(TLBI_IPAS2LE1IS, handle_ipas2e1is), SYS_INSN(TLBI_RIPAS2LE1IS, handle_ripas2e1is), SYS_INSN(TLBI_ALLE2OS, undef_access), SYS_INSN(TLBI_VAE2OS, undef_access), SYS_INSN(TLBI_ALLE1OS, handle_alle1is), SYS_INSN(TLBI_VALE2OS, undef_access), SYS_INSN(TLBI_VMALLS12E1OS, handle_vmalls12e1is), SYS_INSN(TLBI_RVAE2IS, undef_access), SYS_INSN(TLBI_RVALE2IS, undef_access), SYS_INSN(TLBI_ALLE1IS, handle_alle1is), SYS_INSN(TLBI_VMALLS12E1IS, handle_vmalls12e1is), SYS_INSN(TLBI_IPAS2E1OS, handle_ipas2e1is), SYS_INSN(TLBI_IPAS2E1, handle_ipas2e1is), SYS_INSN(TLBI_RIPAS2E1, handle_ripas2e1is), SYS_INSN(TLBI_RIPAS2E1OS, handle_ripas2e1is), SYS_INSN(TLBI_IPAS2LE1OS, handle_ipas2e1is), SYS_INSN(TLBI_IPAS2LE1, handle_ipas2e1is), SYS_INSN(TLBI_RIPAS2LE1, handle_ripas2e1is), SYS_INSN(TLBI_RIPAS2LE1OS, handle_ripas2e1is), SYS_INSN(TLBI_RVAE2OS, undef_access), SYS_INSN(TLBI_RVALE2OS, undef_access), SYS_INSN(TLBI_RVAE2, undef_access), SYS_INSN(TLBI_RVALE2, undef_access), SYS_INSN(TLBI_ALLE1, handle_alle1is), SYS_INSN(TLBI_VMALLS12E1, handle_vmalls12e1is), SYS_INSN(TLBI_IPAS2E1ISNXS, handle_ipas2e1is), SYS_INSN(TLBI_RIPAS2E1ISNXS, handle_ripas2e1is), SYS_INSN(TLBI_IPAS2LE1ISNXS, handle_ipas2e1is), SYS_INSN(TLBI_RIPAS2LE1ISNXS, handle_ripas2e1is), SYS_INSN(TLBI_ALLE2OSNXS, undef_access), SYS_INSN(TLBI_VAE2OSNXS, undef_access), SYS_INSN(TLBI_ALLE1OSNXS, handle_alle1is), SYS_INSN(TLBI_VALE2OSNXS, undef_access), SYS_INSN(TLBI_VMALLS12E1OSNXS, handle_vmalls12e1is), SYS_INSN(TLBI_RVAE2ISNXS, undef_access), SYS_INSN(TLBI_RVALE2ISNXS, undef_access), SYS_INSN(TLBI_ALLE2ISNXS, undef_access), SYS_INSN(TLBI_VAE2ISNXS, undef_access), SYS_INSN(TLBI_ALLE1ISNXS, handle_alle1is), SYS_INSN(TLBI_VALE2ISNXS, undef_access), SYS_INSN(TLBI_VMALLS12E1ISNXS, handle_vmalls12e1is), SYS_INSN(TLBI_IPAS2E1OSNXS, handle_ipas2e1is), SYS_INSN(TLBI_IPAS2E1NXS, handle_ipas2e1is), SYS_INSN(TLBI_RIPAS2E1NXS, 
handle_ripas2e1is), SYS_INSN(TLBI_RIPAS2E1OSNXS, handle_ripas2e1is), SYS_INSN(TLBI_IPAS2LE1OSNXS, handle_ipas2e1is), SYS_INSN(TLBI_IPAS2LE1NXS, handle_ipas2e1is), SYS_INSN(TLBI_RIPAS2LE1NXS, handle_ripas2e1is), SYS_INSN(TLBI_RIPAS2LE1OSNXS, handle_ripas2e1is), SYS_INSN(TLBI_RVAE2OSNXS, undef_access), SYS_INSN(TLBI_RVALE2OSNXS, undef_access), SYS_INSN(TLBI_RVAE2NXS, undef_access), SYS_INSN(TLBI_RVALE2NXS, undef_access), SYS_INSN(TLBI_ALLE2NXS, undef_access), SYS_INSN(TLBI_VAE2NXS, undef_access), SYS_INSN(TLBI_ALLE1NXS, handle_alle1is), SYS_INSN(TLBI_VALE2NXS, undef_access), SYS_INSN(TLBI_VMALLS12E1NXS, handle_vmalls12e1is), }; static bool trap_dbgdidr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { if (p->is_write) { return ignore_write(vcpu, p); } else { u64 dfr = kvm_read_vm_id_reg(vcpu->kvm, SYS_ID_AA64DFR0_EL1); u32 el3 = kvm_has_feat(vcpu->kvm, ID_AA64PFR0_EL1, EL3, IMP); p->regval = ((SYS_FIELD_GET(ID_AA64DFR0_EL1, WRPs, dfr) << 28) | (SYS_FIELD_GET(ID_AA64DFR0_EL1, BRPs, dfr) << 24) | (SYS_FIELD_GET(ID_AA64DFR0_EL1, CTX_CMPs, dfr) << 20) | (SYS_FIELD_GET(ID_AA64DFR0_EL1, DebugVer, dfr) << 16) | (1 << 15) | (el3 << 14) | (el3 << 12)); return true; } } /* * AArch32 debug register mappings * * AArch32 DBGBVRn is mapped to DBGBVRn_EL1[31:0] * AArch32 DBGBXVRn is mapped to DBGBVRn_EL1[63:32] * * None of the other registers share their location, so treat them as * if they were 64bit. */ #define DBG_BCR_BVR_WCR_WVR(n) \ /* DBGBVRn */ \ { AA32(LO), Op1( 0), CRn( 0), CRm((n)), Op2( 4), \ trap_dbg_wb_reg, NULL, n }, \ /* DBGBCRn */ \ { Op1( 0), CRn( 0), CRm((n)), Op2( 5), trap_dbg_wb_reg, NULL, n }, \ /* DBGWVRn */ \ { Op1( 0), CRn( 0), CRm((n)), Op2( 6), trap_dbg_wb_reg, NULL, n }, \ /* DBGWCRn */ \ { Op1( 0), CRn( 0), CRm((n)), Op2( 7), trap_dbg_wb_reg, NULL, n } #define DBGBXVR(n) \ { AA32(HI), Op1( 0), CRn( 1), CRm((n)), Op2( 1), \ trap_dbg_wb_reg, NULL, n } /* * Trapped cp14 registers. We generally ignore most of the external * debug, on the principle that they don't really make sense to a * guest. Revisit this one day, would this principle change. 
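 *
 * Note that DBGDIDR has no backing state at all: trap_dbgdidr() above
 * synthesises its value from the guest's view of ID_AA64DFR0_EL1.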
*/ static const struct sys_reg_desc cp14_regs[] = { /* DBGDIDR */ { Op1( 0), CRn( 0), CRm( 0), Op2( 0), trap_dbgdidr }, /* DBGDTRRXext */ { Op1( 0), CRn( 0), CRm( 0), Op2( 2), trap_raz_wi }, DBG_BCR_BVR_WCR_WVR(0), /* DBGDSCRint */ { Op1( 0), CRn( 0), CRm( 1), Op2( 0), trap_raz_wi }, DBG_BCR_BVR_WCR_WVR(1), /* DBGDCCINT */ { Op1( 0), CRn( 0), CRm( 2), Op2( 0), trap_debug_regs, NULL, MDCCINT_EL1 }, /* DBGDSCRext */ { Op1( 0), CRn( 0), CRm( 2), Op2( 2), trap_debug_regs, NULL, MDSCR_EL1 }, DBG_BCR_BVR_WCR_WVR(2), /* DBGDTR[RT]Xint */ { Op1( 0), CRn( 0), CRm( 3), Op2( 0), trap_raz_wi }, /* DBGDTR[RT]Xext */ { Op1( 0), CRn( 0), CRm( 3), Op2( 2), trap_raz_wi }, DBG_BCR_BVR_WCR_WVR(3), DBG_BCR_BVR_WCR_WVR(4), DBG_BCR_BVR_WCR_WVR(5), /* DBGWFAR */ { Op1( 0), CRn( 0), CRm( 6), Op2( 0), trap_raz_wi }, /* DBGOSECCR */ { Op1( 0), CRn( 0), CRm( 6), Op2( 2), trap_raz_wi }, DBG_BCR_BVR_WCR_WVR(6), /* DBGVCR */ { Op1( 0), CRn( 0), CRm( 7), Op2( 0), trap_debug_regs, NULL, DBGVCR32_EL2 }, DBG_BCR_BVR_WCR_WVR(7), DBG_BCR_BVR_WCR_WVR(8), DBG_BCR_BVR_WCR_WVR(9), DBG_BCR_BVR_WCR_WVR(10), DBG_BCR_BVR_WCR_WVR(11), DBG_BCR_BVR_WCR_WVR(12), DBG_BCR_BVR_WCR_WVR(13), DBG_BCR_BVR_WCR_WVR(14), DBG_BCR_BVR_WCR_WVR(15), /* DBGDRAR (32bit) */ { Op1( 0), CRn( 1), CRm( 0), Op2( 0), trap_raz_wi }, DBGBXVR(0), /* DBGOSLAR */ { Op1( 0), CRn( 1), CRm( 0), Op2( 4), trap_oslar_el1 }, DBGBXVR(1), /* DBGOSLSR */ { Op1( 0), CRn( 1), CRm( 1), Op2( 4), trap_oslsr_el1, NULL, OSLSR_EL1 }, DBGBXVR(2), DBGBXVR(3), /* DBGOSDLR */ { Op1( 0), CRn( 1), CRm( 3), Op2( 4), trap_raz_wi }, DBGBXVR(4), /* DBGPRCR */ { Op1( 0), CRn( 1), CRm( 4), Op2( 4), trap_raz_wi }, DBGBXVR(5), DBGBXVR(6), DBGBXVR(7), DBGBXVR(8), DBGBXVR(9), DBGBXVR(10), DBGBXVR(11), DBGBXVR(12), DBGBXVR(13), DBGBXVR(14), DBGBXVR(15), /* DBGDSAR (32bit) */ { Op1( 0), CRn( 2), CRm( 0), Op2( 0), trap_raz_wi }, /* DBGDEVID2 */ { Op1( 0), CRn( 7), CRm( 0), Op2( 7), trap_raz_wi }, /* DBGDEVID1 */ { Op1( 0), CRn( 7), CRm( 1), Op2( 7), trap_raz_wi }, /* DBGDEVID */ { Op1( 0), CRn( 7), CRm( 2), Op2( 7), trap_raz_wi }, /* DBGCLAIMSET */ { Op1( 0), CRn( 7), CRm( 8), Op2( 6), trap_raz_wi }, /* DBGCLAIMCLR */ { Op1( 0), CRn( 7), CRm( 9), Op2( 6), trap_raz_wi }, /* DBGAUTHSTATUS */ { Op1( 0), CRn( 7), CRm(14), Op2( 6), trap_dbgauthstatus_el1 }, }; /* Trapped cp14 64bit registers */ static const struct sys_reg_desc cp14_64_regs[] = { /* DBGDRAR (64bit) */ { Op1( 0), CRm( 1), .access = trap_raz_wi }, /* DBGDSAR (64bit) */ { Op1( 0), CRm( 2), .access = trap_raz_wi }, }; #define CP15_PMU_SYS_REG(_map, _Op1, _CRn, _CRm, _Op2) \ AA32(_map), \ Op1(_Op1), CRn(_CRn), CRm(_CRm), Op2(_Op2), \ .visibility = pmu_visibility /* Macro to expand the PMEVCNTRn register */ #define PMU_PMEVCNTR(n) \ { CP15_PMU_SYS_REG(DIRECT, 0, 0b1110, \ (0b1000 | (((n) >> 3) & 0x3)), ((n) & 0x7)), \ .access = access_pmu_evcntr } /* Macro to expand the PMEVTYPERn register */ #define PMU_PMEVTYPER(n) \ { CP15_PMU_SYS_REG(DIRECT, 0, 0b1110, \ (0b1100 | (((n) >> 3) & 0x3)), ((n) & 0x7)), \ .access = access_pmu_evtyper } /* * Trapped cp15 registers. TTBR0/TTBR1 get a double encoding, * depending on the way they are accessed (as a 32bit or a 64bit * register). 
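 *
 * The 32bit (MCR/MRC) forms live in this table, while the 64bit
 * (MCRR/MRRC) forms live in cp15_64_regs[] below; both are backed by the
 * same TTBRn_EL1 vcpu state.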
*/ static const struct sys_reg_desc cp15_regs[] = { { Op1( 0), CRn( 0), CRm( 0), Op2( 1), access_ctr }, { Op1( 0), CRn( 1), CRm( 0), Op2( 0), access_vm_reg, NULL, SCTLR_EL1 }, /* ACTLR */ { AA32(LO), Op1( 0), CRn( 1), CRm( 0), Op2( 1), access_actlr, NULL, ACTLR_EL1 }, /* ACTLR2 */ { AA32(HI), Op1( 0), CRn( 1), CRm( 0), Op2( 3), access_actlr, NULL, ACTLR_EL1 }, { Op1( 0), CRn( 2), CRm( 0), Op2( 0), access_vm_reg, NULL, TTBR0_EL1 }, { Op1( 0), CRn( 2), CRm( 0), Op2( 1), access_vm_reg, NULL, TTBR1_EL1 }, /* TTBCR */ { AA32(LO), Op1( 0), CRn( 2), CRm( 0), Op2( 2), access_vm_reg, NULL, TCR_EL1 }, /* TTBCR2 */ { AA32(HI), Op1( 0), CRn( 2), CRm( 0), Op2( 3), access_vm_reg, NULL, TCR_EL1 }, { Op1( 0), CRn( 3), CRm( 0), Op2( 0), access_vm_reg, NULL, DACR32_EL2 }, { CP15_SYS_DESC(SYS_ICC_PMR_EL1), undef_access }, /* DFSR */ { Op1( 0), CRn( 5), CRm( 0), Op2( 0), access_vm_reg, NULL, ESR_EL1 }, { Op1( 0), CRn( 5), CRm( 0), Op2( 1), access_vm_reg, NULL, IFSR32_EL2 }, /* ADFSR */ { Op1( 0), CRn( 5), CRm( 1), Op2( 0), access_vm_reg, NULL, AFSR0_EL1 }, /* AIFSR */ { Op1( 0), CRn( 5), CRm( 1), Op2( 1), access_vm_reg, NULL, AFSR1_EL1 }, /* DFAR */ { AA32(LO), Op1( 0), CRn( 6), CRm( 0), Op2( 0), access_vm_reg, NULL, FAR_EL1 }, /* IFAR */ { AA32(HI), Op1( 0), CRn( 6), CRm( 0), Op2( 2), access_vm_reg, NULL, FAR_EL1 }, /* * DC{C,I,CI}SW operations: */ { Op1( 0), CRn( 7), CRm( 6), Op2( 2), access_dcsw }, { Op1( 0), CRn( 7), CRm(10), Op2( 2), access_dcsw }, { Op1( 0), CRn( 7), CRm(14), Op2( 2), access_dcsw }, /* PMU */ { CP15_PMU_SYS_REG(DIRECT, 0, 9, 12, 0), .access = access_pmcr }, { CP15_PMU_SYS_REG(DIRECT, 0, 9, 12, 1), .access = access_pmcnten }, { CP15_PMU_SYS_REG(DIRECT, 0, 9, 12, 2), .access = access_pmcnten }, { CP15_PMU_SYS_REG(DIRECT, 0, 9, 12, 3), .access = access_pmovs }, { CP15_PMU_SYS_REG(DIRECT, 0, 9, 12, 4), .access = access_pmswinc }, { CP15_PMU_SYS_REG(DIRECT, 0, 9, 12, 5), .access = access_pmselr }, { CP15_PMU_SYS_REG(LO, 0, 9, 12, 6), .access = access_pmceid }, { CP15_PMU_SYS_REG(LO, 0, 9, 12, 7), .access = access_pmceid }, { CP15_PMU_SYS_REG(DIRECT, 0, 9, 13, 0), .access = access_pmu_evcntr }, { CP15_PMU_SYS_REG(DIRECT, 0, 9, 13, 1), .access = access_pmu_evtyper }, { CP15_PMU_SYS_REG(DIRECT, 0, 9, 13, 2), .access = access_pmu_evcntr }, { CP15_PMU_SYS_REG(DIRECT, 0, 9, 14, 0), .access = access_pmuserenr }, { CP15_PMU_SYS_REG(DIRECT, 0, 9, 14, 1), .access = access_pminten }, { CP15_PMU_SYS_REG(DIRECT, 0, 9, 14, 2), .access = access_pminten }, { CP15_PMU_SYS_REG(DIRECT, 0, 9, 14, 3), .access = access_pmovs }, { CP15_PMU_SYS_REG(HI, 0, 9, 14, 4), .access = access_pmceid }, { CP15_PMU_SYS_REG(HI, 0, 9, 14, 5), .access = access_pmceid }, /* PMMIR */ { CP15_PMU_SYS_REG(DIRECT, 0, 9, 14, 6), .access = trap_raz_wi }, /* PRRR/MAIR0 */ { AA32(LO), Op1( 0), CRn(10), CRm( 2), Op2( 0), access_vm_reg, NULL, MAIR_EL1 }, /* NMRR/MAIR1 */ { AA32(HI), Op1( 0), CRn(10), CRm( 2), Op2( 1), access_vm_reg, NULL, MAIR_EL1 }, /* AMAIR0 */ { AA32(LO), Op1( 0), CRn(10), CRm( 3), Op2( 0), access_vm_reg, NULL, AMAIR_EL1 }, /* AMAIR1 */ { AA32(HI), Op1( 0), CRn(10), CRm( 3), Op2( 1), access_vm_reg, NULL, AMAIR_EL1 }, { CP15_SYS_DESC(SYS_ICC_IAR0_EL1), undef_access }, { CP15_SYS_DESC(SYS_ICC_EOIR0_EL1), undef_access }, { CP15_SYS_DESC(SYS_ICC_HPPIR0_EL1), undef_access }, { CP15_SYS_DESC(SYS_ICC_BPR0_EL1), undef_access }, { CP15_SYS_DESC(SYS_ICC_AP0R0_EL1), undef_access }, { CP15_SYS_DESC(SYS_ICC_AP0R1_EL1), undef_access }, { CP15_SYS_DESC(SYS_ICC_AP0R2_EL1), undef_access }, { CP15_SYS_DESC(SYS_ICC_AP0R3_EL1), undef_access 
}, { CP15_SYS_DESC(SYS_ICC_AP1R0_EL1), undef_access }, { CP15_SYS_DESC(SYS_ICC_AP1R1_EL1), undef_access }, { CP15_SYS_DESC(SYS_ICC_AP1R2_EL1), undef_access }, { CP15_SYS_DESC(SYS_ICC_AP1R3_EL1), undef_access }, { CP15_SYS_DESC(SYS_ICC_DIR_EL1), undef_access }, { CP15_SYS_DESC(SYS_ICC_RPR_EL1), undef_access }, { CP15_SYS_DESC(SYS_ICC_IAR1_EL1), undef_access }, { CP15_SYS_DESC(SYS_ICC_EOIR1_EL1), undef_access }, { CP15_SYS_DESC(SYS_ICC_HPPIR1_EL1), undef_access }, { CP15_SYS_DESC(SYS_ICC_BPR1_EL1), undef_access }, { CP15_SYS_DESC(SYS_ICC_CTLR_EL1), undef_access }, { CP15_SYS_DESC(SYS_ICC_SRE_EL1), access_gic_sre }, { CP15_SYS_DESC(SYS_ICC_IGRPEN0_EL1), undef_access }, { CP15_SYS_DESC(SYS_ICC_IGRPEN1_EL1), undef_access }, { Op1( 0), CRn(13), CRm( 0), Op2( 1), access_vm_reg, NULL, CONTEXTIDR_EL1 }, /* Arch Tmers */ { SYS_DESC(SYS_AARCH32_CNTP_TVAL), access_arch_timer }, { SYS_DESC(SYS_AARCH32_CNTP_CTL), access_arch_timer }, /* PMEVCNTRn */ PMU_PMEVCNTR(0), PMU_PMEVCNTR(1), PMU_PMEVCNTR(2), PMU_PMEVCNTR(3), PMU_PMEVCNTR(4), PMU_PMEVCNTR(5), PMU_PMEVCNTR(6), PMU_PMEVCNTR(7), PMU_PMEVCNTR(8), PMU_PMEVCNTR(9), PMU_PMEVCNTR(10), PMU_PMEVCNTR(11), PMU_PMEVCNTR(12), PMU_PMEVCNTR(13), PMU_PMEVCNTR(14), PMU_PMEVCNTR(15), PMU_PMEVCNTR(16), PMU_PMEVCNTR(17), PMU_PMEVCNTR(18), PMU_PMEVCNTR(19), PMU_PMEVCNTR(20), PMU_PMEVCNTR(21), PMU_PMEVCNTR(22), PMU_PMEVCNTR(23), PMU_PMEVCNTR(24), PMU_PMEVCNTR(25), PMU_PMEVCNTR(26), PMU_PMEVCNTR(27), PMU_PMEVCNTR(28), PMU_PMEVCNTR(29), PMU_PMEVCNTR(30), /* PMEVTYPERn */ PMU_PMEVTYPER(0), PMU_PMEVTYPER(1), PMU_PMEVTYPER(2), PMU_PMEVTYPER(3), PMU_PMEVTYPER(4), PMU_PMEVTYPER(5), PMU_PMEVTYPER(6), PMU_PMEVTYPER(7), PMU_PMEVTYPER(8), PMU_PMEVTYPER(9), PMU_PMEVTYPER(10), PMU_PMEVTYPER(11), PMU_PMEVTYPER(12), PMU_PMEVTYPER(13), PMU_PMEVTYPER(14), PMU_PMEVTYPER(15), PMU_PMEVTYPER(16), PMU_PMEVTYPER(17), PMU_PMEVTYPER(18), PMU_PMEVTYPER(19), PMU_PMEVTYPER(20), PMU_PMEVTYPER(21), PMU_PMEVTYPER(22), PMU_PMEVTYPER(23), PMU_PMEVTYPER(24), PMU_PMEVTYPER(25), PMU_PMEVTYPER(26), PMU_PMEVTYPER(27), PMU_PMEVTYPER(28), PMU_PMEVTYPER(29), PMU_PMEVTYPER(30), /* PMCCFILTR */ { CP15_PMU_SYS_REG(DIRECT, 0, 14, 15, 7), .access = access_pmu_evtyper }, { Op1(1), CRn( 0), CRm( 0), Op2(0), access_ccsidr }, { Op1(1), CRn( 0), CRm( 0), Op2(1), access_clidr }, /* CCSIDR2 */ { Op1(1), CRn( 0), CRm( 0), Op2(2), undef_access }, { Op1(2), CRn( 0), CRm( 0), Op2(0), access_csselr, NULL, CSSELR_EL1 }, }; static const struct sys_reg_desc cp15_64_regs[] = { { Op1( 0), CRn( 0), CRm( 2), Op2( 0), access_vm_reg, NULL, TTBR0_EL1 }, { CP15_PMU_SYS_REG(DIRECT, 0, 0, 9, 0), .access = access_pmu_evcntr }, { Op1( 0), CRn( 0), CRm(12), Op2( 0), access_gic_sgi }, /* ICC_SGI1R */ { SYS_DESC(SYS_AARCH32_CNTPCT), access_arch_timer }, { Op1( 1), CRn( 0), CRm( 2), Op2( 0), access_vm_reg, NULL, TTBR1_EL1 }, { Op1( 1), CRn( 0), CRm(12), Op2( 0), access_gic_sgi }, /* ICC_ASGI1R */ { SYS_DESC(SYS_AARCH32_CNTVCT), access_arch_timer }, { Op1( 2), CRn( 0), CRm(12), Op2( 0), access_gic_sgi }, /* ICC_SGI0R */ { SYS_DESC(SYS_AARCH32_CNTP_CVAL), access_arch_timer }, { SYS_DESC(SYS_AARCH32_CNTPCTSS), access_arch_timer }, { SYS_DESC(SYS_AARCH32_CNTVCTSS), access_arch_timer }, }; static bool check_sysreg_table(const struct sys_reg_desc *table, unsigned int n, bool is_32) { unsigned int i; for (i = 0; i < n; i++) { if (!is_32 && table[i].reg && !table[i].reset) { kvm_err("sys_reg table %pS entry %d (%s) lacks reset\n", &table[i], i, table[i].name); return false; } if (i && cmp_sys_reg(&table[i-1], &table[i]) >= 0) { kvm_err("sys_reg table 
%pS entry %d (%s -> %s) out of order\n", &table[i], i, table[i - 1].name, table[i].name); return false; } } return true; } int kvm_handle_cp14_load_store(struct kvm_vcpu *vcpu) { kvm_inject_undefined(vcpu); return 1; } static void perform_access(struct kvm_vcpu *vcpu, struct sys_reg_params *params, const struct sys_reg_desc *r) { trace_kvm_sys_access(*vcpu_pc(vcpu), params, r); /* Check for regs disabled by runtime config */ if (sysreg_hidden(vcpu, r)) { kvm_inject_undefined(vcpu); return; } /* * Not having an accessor means that we have configured a trap * that we don't know how to handle. This certainly qualifies * as a gross bug that should be fixed right away. */ BUG_ON(!r->access); /* Skip instruction if instructed so */ if (likely(r->access(vcpu, params, r))) kvm_incr_pc(vcpu); } /* * emulate_cp -- tries to match a sys_reg access in a handling table, and * call the corresponding trap handler. * * @params: pointer to the descriptor of the access * @table: array of trap descriptors * @num: size of the trap descriptor array * * Return true if the access has been handled, false if not. */ static bool emulate_cp(struct kvm_vcpu *vcpu, struct sys_reg_params *params, const struct sys_reg_desc *table, size_t num) { const struct sys_reg_desc *r; if (!table) return false; /* Not handled */ r = find_reg(params, table, num); if (r) { perform_access(vcpu, params, r); return true; } /* Not handled */ return false; } static void unhandled_cp_access(struct kvm_vcpu *vcpu, struct sys_reg_params *params) { u8 esr_ec = kvm_vcpu_trap_get_class(vcpu); int cp = -1; switch (esr_ec) { case ESR_ELx_EC_CP15_32: case ESR_ELx_EC_CP15_64: cp = 15; break; case ESR_ELx_EC_CP14_MR: case ESR_ELx_EC_CP14_64: cp = 14; break; default: WARN_ON(1); } print_sys_reg_msg(params, "Unsupported guest CP%d access at: %08lx [%08lx]\n", cp, *vcpu_pc(vcpu), *vcpu_cpsr(vcpu)); kvm_inject_undefined(vcpu); } /** * kvm_handle_cp_64 -- handles a mrrc/mcrr trap on a guest CP14/CP15 access * @vcpu: The VCPU pointer * @global: &struct sys_reg_desc * @nr_global: size of the @global array */ static int kvm_handle_cp_64(struct kvm_vcpu *vcpu, const struct sys_reg_desc *global, size_t nr_global) { struct sys_reg_params params; u64 esr = kvm_vcpu_get_esr(vcpu); int Rt = kvm_vcpu_sys_get_rt(vcpu); int Rt2 = (esr >> 10) & 0x1f; params.CRm = (esr >> 1) & 0xf; params.is_write = ((esr & 1) == 0); params.Op0 = 0; params.Op1 = (esr >> 16) & 0xf; params.Op2 = 0; params.CRn = 0; /* * Make a 64-bit value out of Rt and Rt2. As we use the same trap * backends between AArch32 and AArch64, we get away with it. */ if (params.is_write) { params.regval = vcpu_get_reg(vcpu, Rt) & 0xffffffff; params.regval |= vcpu_get_reg(vcpu, Rt2) << 32; } /* * If the table contains a handler, handle the * potential register operation in the case of a read and return * with success. */ if (emulate_cp(vcpu, ¶ms, global, nr_global)) { /* Split up the value between registers for the read side */ if (!params.is_write) { vcpu_set_reg(vcpu, Rt, lower_32_bits(params.regval)); vcpu_set_reg(vcpu, Rt2, upper_32_bits(params.regval)); } return 1; } unhandled_cp_access(vcpu, ¶ms); return 1; } static bool emulate_sys_reg(struct kvm_vcpu *vcpu, struct sys_reg_params *params); /* * The CP10 ID registers are architecturally mapped to AArch64 feature * registers. Abuse that fact so we can rely on the AArch64 handler for accesses * from AArch32. 
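* For example, a guest VMRS read of MVFR1 (reg_id 0b0110 in the ESR) is rewritten below as an access to MVFR1_EL1 (Op0=3, Op1=0, CRn=0, CRm=3, Op2=1) and then handled by emulate_sys_reg().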
*/ static bool kvm_esr_cp10_id_to_sys64(u64 esr, struct sys_reg_params *params) { u8 reg_id = (esr >> 10) & 0xf; bool valid; params->is_write = ((esr & 1) == 0); params->Op0 = 3; params->Op1 = 0; params->CRn = 0; params->CRm = 3; /* CP10 ID registers are read-only */ valid = !params->is_write; switch (reg_id) { /* MVFR0 */ case 0b0111: params->Op2 = 0; break; /* MVFR1 */ case 0b0110: params->Op2 = 1; break; /* MVFR2 */ case 0b0101: params->Op2 = 2; break; default: valid = false; } if (valid) return true; kvm_pr_unimpl("Unhandled cp10 register %s: %u\n", params->is_write ? "write" : "read", reg_id); return false; } /** * kvm_handle_cp10_id() - Handles a VMRS trap on guest access to a 'Media and * VFP Register' from AArch32. * @vcpu: The vCPU pointer * * MVFR{0-2} are architecturally mapped to the AArch64 MVFR{0-2}_EL1 registers. * Work out the correct AArch64 system register encoding and reroute to the * AArch64 system register emulation. */ int kvm_handle_cp10_id(struct kvm_vcpu *vcpu) { int Rt = kvm_vcpu_sys_get_rt(vcpu); u64 esr = kvm_vcpu_get_esr(vcpu); struct sys_reg_params params; /* UNDEF on any unhandled register access */ if (!kvm_esr_cp10_id_to_sys64(esr, ¶ms)) { kvm_inject_undefined(vcpu); return 1; } if (emulate_sys_reg(vcpu, ¶ms)) vcpu_set_reg(vcpu, Rt, params.regval); return 1; } /** * kvm_emulate_cp15_id_reg() - Handles an MRC trap on a guest CP15 access where * CRn=0, which corresponds to the AArch32 feature * registers. * @vcpu: the vCPU pointer * @params: the system register access parameters. * * Our cp15 system register tables do not enumerate the AArch32 feature * registers. Conveniently, our AArch64 table does, and the AArch32 system * register encoding can be trivially remapped into the AArch64 for the feature * registers: Append op0=3, leaving op1, CRn, CRm, and op2 the same. * * According to DDI0487G.b G7.3.1, paragraph "Behavior of VMSAv8-32 32-bit * System registers with (coproc=0b1111, CRn==c0)", read accesses from this * range are either UNKNOWN or RES0. Rerouting remains architectural as we * treat undefined registers in this range as RAZ. */ static int kvm_emulate_cp15_id_reg(struct kvm_vcpu *vcpu, struct sys_reg_params *params) { int Rt = kvm_vcpu_sys_get_rt(vcpu); /* Treat impossible writes to RO registers as UNDEFINED */ if (params->is_write) { unhandled_cp_access(vcpu, params); return 1; } params->Op0 = 3; /* * All registers where CRm > 3 are known to be UNKNOWN/RAZ from AArch32. * Avoid conflicting with future expansion of AArch64 feature registers * and simply treat them as RAZ here. 
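* For instance, an MRC to coproc=0b1111, CRn=c0 with CRm greater than c3 simply returns 0 to the guest instead of being forwarded to the AArch64 table.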
*/ if (params->CRm > 3) params->regval = 0; else if (!emulate_sys_reg(vcpu, params)) return 1; vcpu_set_reg(vcpu, Rt, params->regval); return 1; } /** * kvm_handle_cp_32 -- handles a mrc/mcr trap on a guest CP14/CP15 access * @vcpu: The VCPU pointer * @params: &struct sys_reg_params * @global: &struct sys_reg_desc * @nr_global: size of the @global array */ static int kvm_handle_cp_32(struct kvm_vcpu *vcpu, struct sys_reg_params *params, const struct sys_reg_desc *global, size_t nr_global) { int Rt = kvm_vcpu_sys_get_rt(vcpu); params->regval = vcpu_get_reg(vcpu, Rt); if (emulate_cp(vcpu, params, global, nr_global)) { if (!params->is_write) vcpu_set_reg(vcpu, Rt, params->regval); return 1; } unhandled_cp_access(vcpu, params); return 1; } int kvm_handle_cp15_64(struct kvm_vcpu *vcpu) { return kvm_handle_cp_64(vcpu, cp15_64_regs, ARRAY_SIZE(cp15_64_regs)); } int kvm_handle_cp15_32(struct kvm_vcpu *vcpu) { struct sys_reg_params params; params = esr_cp1x_32_to_params(kvm_vcpu_get_esr(vcpu)); /* * Certain AArch32 ID registers are handled by rerouting to the AArch64 * system register table. Registers in the ID range where CRm=0 are * excluded from this scheme as they do not trivially map into AArch64 * system register encodings. */ if (params.Op1 == 0 && params.CRn == 0 && params.CRm) return kvm_emulate_cp15_id_reg(vcpu, ¶ms); return kvm_handle_cp_32(vcpu, ¶ms, cp15_regs, ARRAY_SIZE(cp15_regs)); } int kvm_handle_cp14_64(struct kvm_vcpu *vcpu) { return kvm_handle_cp_64(vcpu, cp14_64_regs, ARRAY_SIZE(cp14_64_regs)); } int kvm_handle_cp14_32(struct kvm_vcpu *vcpu) { struct sys_reg_params params; params = esr_cp1x_32_to_params(kvm_vcpu_get_esr(vcpu)); return kvm_handle_cp_32(vcpu, ¶ms, cp14_regs, ARRAY_SIZE(cp14_regs)); } /** * emulate_sys_reg - Emulate a guest access to an AArch64 system register * @vcpu: The VCPU pointer * @params: Decoded system register parameters * * Return: true if the system register access was successful, false otherwise. 
*/ static bool emulate_sys_reg(struct kvm_vcpu *vcpu, struct sys_reg_params *params) { const struct sys_reg_desc *r; r = find_reg(params, sys_reg_descs, ARRAY_SIZE(sys_reg_descs)); if (likely(r)) { perform_access(vcpu, params, r); return true; } print_sys_reg_msg(params, "Unsupported guest sys_reg access at: %lx [%08lx]\n", *vcpu_pc(vcpu), *vcpu_cpsr(vcpu)); kvm_inject_undefined(vcpu); return false; } static const struct sys_reg_desc *idregs_debug_find(struct kvm *kvm, u8 pos) { unsigned long i, idreg_idx = 0; for (i = 0; i < ARRAY_SIZE(sys_reg_descs); i++) { const struct sys_reg_desc *r = &sys_reg_descs[i]; if (!is_vm_ftr_id_reg(reg_to_encoding(r))) continue; if (idreg_idx == pos) return r; idreg_idx++; } return NULL; } static void *idregs_debug_start(struct seq_file *s, loff_t *pos) { struct kvm *kvm = s->private; u8 *iter; mutex_lock(&kvm->arch.config_lock); iter = &kvm->arch.idreg_debugfs_iter; if (test_bit(KVM_ARCH_FLAG_ID_REGS_INITIALIZED, &kvm->arch.flags) && *iter == (u8)~0) { *iter = *pos; if (!idregs_debug_find(kvm, *iter)) iter = NULL; } else { iter = ERR_PTR(-EBUSY); } mutex_unlock(&kvm->arch.config_lock); return iter; } static void *idregs_debug_next(struct seq_file *s, void *v, loff_t *pos) { struct kvm *kvm = s->private; (*pos)++; if (idregs_debug_find(kvm, kvm->arch.idreg_debugfs_iter + 1)) { kvm->arch.idreg_debugfs_iter++; return &kvm->arch.idreg_debugfs_iter; } return NULL; } static void idregs_debug_stop(struct seq_file *s, void *v) { struct kvm *kvm = s->private; if (IS_ERR(v)) return; mutex_lock(&kvm->arch.config_lock); kvm->arch.idreg_debugfs_iter = ~0; mutex_unlock(&kvm->arch.config_lock); } static int idregs_debug_show(struct seq_file *s, void *v) { const struct sys_reg_desc *desc; struct kvm *kvm = s->private; desc = idregs_debug_find(kvm, kvm->arch.idreg_debugfs_iter); if (!desc->name) return 0; seq_printf(s, "%20s:\t%016llx\n", desc->name, kvm_read_vm_id_reg(kvm, reg_to_encoding(desc))); return 0; } static const struct seq_operations idregs_debug_sops = { .start = idregs_debug_start, .next = idregs_debug_next, .stop = idregs_debug_stop, .show = idregs_debug_show, }; DEFINE_SEQ_ATTRIBUTE(idregs_debug); void kvm_sys_regs_create_debugfs(struct kvm *kvm) { kvm->arch.idreg_debugfs_iter = ~0; debugfs_create_file("idregs", 0444, kvm->debugfs_dentry, kvm, &idregs_debug_fops); } static void reset_vm_ftr_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *reg) { u32 id = reg_to_encoding(reg); struct kvm *kvm = vcpu->kvm; if (test_bit(KVM_ARCH_FLAG_ID_REGS_INITIALIZED, &kvm->arch.flags)) return; kvm_set_vm_id_reg(kvm, id, reg->reset(vcpu, reg)); } static void reset_vcpu_ftr_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *reg) { if (kvm_vcpu_initialized(vcpu)) return; reg->reset(vcpu, reg); } /** * kvm_reset_sys_regs - sets system registers to reset value * @vcpu: The VCPU pointer * * This function finds the right table above and sets the registers on the * virtual CPU struct to their architecturally defined reset values. 
*/ void kvm_reset_sys_regs(struct kvm_vcpu *vcpu) { struct kvm *kvm = vcpu->kvm; unsigned long i; for (i = 0; i < ARRAY_SIZE(sys_reg_descs); i++) { const struct sys_reg_desc *r = &sys_reg_descs[i]; if (!r->reset) continue; if (is_vm_ftr_id_reg(reg_to_encoding(r))) reset_vm_ftr_id_reg(vcpu, r); else if (is_vcpu_ftr_id_reg(reg_to_encoding(r))) reset_vcpu_ftr_id_reg(vcpu, r); else r->reset(vcpu, r); if (r->reg >= __SANITISED_REG_START__ && r->reg < NR_SYS_REGS) (void)__vcpu_sys_reg(vcpu, r->reg); } set_bit(KVM_ARCH_FLAG_ID_REGS_INITIALIZED, &kvm->arch.flags); } /** * kvm_handle_sys_reg -- handles a system instruction or mrs/msr instruction * trap on a guest execution * @vcpu: The VCPU pointer */ int kvm_handle_sys_reg(struct kvm_vcpu *vcpu) { const struct sys_reg_desc *desc = NULL; struct sys_reg_params params; unsigned long esr = kvm_vcpu_get_esr(vcpu); int Rt = kvm_vcpu_sys_get_rt(vcpu); int sr_idx; trace_kvm_handle_sys_reg(esr); if (triage_sysreg_trap(vcpu, &sr_idx)) return 1; params = esr_sys64_to_params(esr); params.regval = vcpu_get_reg(vcpu, Rt); /* System registers have Op0=={2,3}, as per DDI487 J.a C5.1.2 */ if (params.Op0 == 2 || params.Op0 == 3) desc = &sys_reg_descs[sr_idx]; else desc = &sys_insn_descs[sr_idx]; perform_access(vcpu, ¶ms, desc); /* Read from system register? */ if (!params.is_write && (params.Op0 == 2 || params.Op0 == 3)) vcpu_set_reg(vcpu, Rt, params.regval); return 1; } /****************************************************************************** * Userspace API *****************************************************************************/ static bool index_to_params(u64 id, struct sys_reg_params *params) { switch (id & KVM_REG_SIZE_MASK) { case KVM_REG_SIZE_U64: /* Any unused index bits means it's not valid. */ if (id & ~(KVM_REG_ARCH_MASK | KVM_REG_SIZE_MASK | KVM_REG_ARM_COPROC_MASK | KVM_REG_ARM64_SYSREG_OP0_MASK | KVM_REG_ARM64_SYSREG_OP1_MASK | KVM_REG_ARM64_SYSREG_CRN_MASK | KVM_REG_ARM64_SYSREG_CRM_MASK | KVM_REG_ARM64_SYSREG_OP2_MASK)) return false; params->Op0 = ((id & KVM_REG_ARM64_SYSREG_OP0_MASK) >> KVM_REG_ARM64_SYSREG_OP0_SHIFT); params->Op1 = ((id & KVM_REG_ARM64_SYSREG_OP1_MASK) >> KVM_REG_ARM64_SYSREG_OP1_SHIFT); params->CRn = ((id & KVM_REG_ARM64_SYSREG_CRN_MASK) >> KVM_REG_ARM64_SYSREG_CRN_SHIFT); params->CRm = ((id & KVM_REG_ARM64_SYSREG_CRM_MASK) >> KVM_REG_ARM64_SYSREG_CRM_SHIFT); params->Op2 = ((id & KVM_REG_ARM64_SYSREG_OP2_MASK) >> KVM_REG_ARM64_SYSREG_OP2_SHIFT); return true; default: return false; } } const struct sys_reg_desc *get_reg_by_id(u64 id, const struct sys_reg_desc table[], unsigned int num) { struct sys_reg_params params; if (!index_to_params(id, ¶ms)) return NULL; return find_reg(¶ms, table, num); } /* Decode an index value, and find the sys_reg_desc entry. */ static const struct sys_reg_desc * id_to_sys_reg_desc(struct kvm_vcpu *vcpu, u64 id, const struct sys_reg_desc table[], unsigned int num) { const struct sys_reg_desc *r; /* We only do sys_reg for now. */ if ((id & KVM_REG_ARM_COPROC_MASK) != KVM_REG_ARM64_SYSREG) return NULL; r = get_reg_by_id(id, table, num); /* Not saved in the sys_reg array and not otherwise accessible? */ if (r && (!(r->reg || r->get_user) || sysreg_hidden(vcpu, r))) r = NULL; return r; } /* * These are the invariant sys_reg registers: we let the guest see the * host versions of these, so they're part of the guest state. * * A future CPU may provide a mechanism to present different values to * the guest, or a future kvm may trap them. 
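* Today these are MIDR_EL1, REVIDR_EL1 and AIDR_EL1 (see invariant_sys_regs[] below); their values are snapshotted from the host when kvm_sys_reg_table_init() runs the reset hooks, and set_invariant_sys_reg() rejects any attempt by userspace to change them.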
*/ #define FUNCTION_INVARIANT(reg) \ static u64 reset_##reg(struct kvm_vcpu *v, \ const struct sys_reg_desc *r) \ { \ ((struct sys_reg_desc *)r)->val = read_sysreg(reg); \ return ((struct sys_reg_desc *)r)->val; \ } FUNCTION_INVARIANT(midr_el1) FUNCTION_INVARIANT(revidr_el1) FUNCTION_INVARIANT(aidr_el1) /* ->val is filled in by kvm_sys_reg_table_init() */ static struct sys_reg_desc invariant_sys_regs[] __ro_after_init = { { SYS_DESC(SYS_MIDR_EL1), NULL, reset_midr_el1 }, { SYS_DESC(SYS_REVIDR_EL1), NULL, reset_revidr_el1 }, { SYS_DESC(SYS_AIDR_EL1), NULL, reset_aidr_el1 }, }; static int get_invariant_sys_reg(u64 id, u64 __user *uaddr) { const struct sys_reg_desc *r; r = get_reg_by_id(id, invariant_sys_regs, ARRAY_SIZE(invariant_sys_regs)); if (!r) return -ENOENT; return put_user(r->val, uaddr); } static int set_invariant_sys_reg(u64 id, u64 __user *uaddr) { const struct sys_reg_desc *r; u64 val; r = get_reg_by_id(id, invariant_sys_regs, ARRAY_SIZE(invariant_sys_regs)); if (!r) return -ENOENT; if (get_user(val, uaddr)) return -EFAULT; /* This is what we mean by invariant: you can't change it. */ if (r->val != val) return -EINVAL; return 0; } static int demux_c15_get(struct kvm_vcpu *vcpu, u64 id, void __user *uaddr) { u32 val; u32 __user *uval = uaddr; /* Fail if we have unknown bits set. */ if (id & ~(KVM_REG_ARCH_MASK|KVM_REG_SIZE_MASK|KVM_REG_ARM_COPROC_MASK | ((1 << KVM_REG_ARM_COPROC_SHIFT)-1))) return -ENOENT; switch (id & KVM_REG_ARM_DEMUX_ID_MASK) { case KVM_REG_ARM_DEMUX_ID_CCSIDR: if (KVM_REG_SIZE(id) != 4) return -ENOENT; val = (id & KVM_REG_ARM_DEMUX_VAL_MASK) >> KVM_REG_ARM_DEMUX_VAL_SHIFT; if (val >= CSSELR_MAX) return -ENOENT; return put_user(get_ccsidr(vcpu, val), uval); default: return -ENOENT; } } static int demux_c15_set(struct kvm_vcpu *vcpu, u64 id, void __user *uaddr) { u32 val, newval; u32 __user *uval = uaddr; /* Fail if we have unknown bits set. 
*/ if (id & ~(KVM_REG_ARCH_MASK|KVM_REG_SIZE_MASK|KVM_REG_ARM_COPROC_MASK | ((1 << KVM_REG_ARM_COPROC_SHIFT)-1))) return -ENOENT; switch (id & KVM_REG_ARM_DEMUX_ID_MASK) { case KVM_REG_ARM_DEMUX_ID_CCSIDR: if (KVM_REG_SIZE(id) != 4) return -ENOENT; val = (id & KVM_REG_ARM_DEMUX_VAL_MASK) >> KVM_REG_ARM_DEMUX_VAL_SHIFT; if (val >= CSSELR_MAX) return -ENOENT; if (get_user(newval, uval)) return -EFAULT; return set_ccsidr(vcpu, val, newval); default: return -ENOENT; } } int kvm_sys_reg_get_user(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg, const struct sys_reg_desc table[], unsigned int num) { u64 __user *uaddr = (u64 __user *)(unsigned long)reg->addr; const struct sys_reg_desc *r; u64 val; int ret; r = id_to_sys_reg_desc(vcpu, reg->id, table, num); if (!r || sysreg_hidden(vcpu, r)) return -ENOENT; if (r->get_user) { ret = (r->get_user)(vcpu, r, &val); } else { val = __vcpu_sys_reg(vcpu, r->reg); ret = 0; } if (!ret) ret = put_user(val, uaddr); return ret; } int kvm_arm_sys_reg_get_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) { void __user *uaddr = (void __user *)(unsigned long)reg->addr; int err; if ((reg->id & KVM_REG_ARM_COPROC_MASK) == KVM_REG_ARM_DEMUX) return demux_c15_get(vcpu, reg->id, uaddr); err = get_invariant_sys_reg(reg->id, uaddr); if (err != -ENOENT) return err; return kvm_sys_reg_get_user(vcpu, reg, sys_reg_descs, ARRAY_SIZE(sys_reg_descs)); } int kvm_sys_reg_set_user(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg, const struct sys_reg_desc table[], unsigned int num) { u64 __user *uaddr = (u64 __user *)(unsigned long)reg->addr; const struct sys_reg_desc *r; u64 val; int ret; if (get_user(val, uaddr)) return -EFAULT; r = id_to_sys_reg_desc(vcpu, reg->id, table, num); if (!r || sysreg_hidden(vcpu, r)) return -ENOENT; if (sysreg_user_write_ignore(vcpu, r)) return 0; if (r->set_user) { ret = (r->set_user)(vcpu, r, val); } else { __vcpu_sys_reg(vcpu, r->reg) = val; ret = 0; } return ret; } int kvm_arm_sys_reg_set_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) { void __user *uaddr = (void __user *)(unsigned long)reg->addr; int err; if ((reg->id & KVM_REG_ARM_COPROC_MASK) == KVM_REG_ARM_DEMUX) return demux_c15_set(vcpu, reg->id, uaddr); err = set_invariant_sys_reg(reg->id, uaddr); if (err != -ENOENT) return err; return kvm_sys_reg_set_user(vcpu, reg, sys_reg_descs, ARRAY_SIZE(sys_reg_descs)); } static unsigned int num_demux_regs(void) { return CSSELR_MAX; } static int write_demux_regids(u64 __user *uindices) { u64 val = KVM_REG_ARM64 | KVM_REG_SIZE_U32 | KVM_REG_ARM_DEMUX; unsigned int i; val |= KVM_REG_ARM_DEMUX_ID_CCSIDR; for (i = 0; i < CSSELR_MAX; i++) { if (put_user(val | i, uindices)) return -EFAULT; uindices++; } return 0; } static u64 sys_reg_to_index(const struct sys_reg_desc *reg) { return (KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM64_SYSREG | (reg->Op0 << KVM_REG_ARM64_SYSREG_OP0_SHIFT) | (reg->Op1 << KVM_REG_ARM64_SYSREG_OP1_SHIFT) | (reg->CRn << KVM_REG_ARM64_SYSREG_CRN_SHIFT) | (reg->CRm << KVM_REG_ARM64_SYSREG_CRM_SHIFT) | (reg->Op2 << KVM_REG_ARM64_SYSREG_OP2_SHIFT)); } static bool copy_reg_to_user(const struct sys_reg_desc *reg, u64 __user **uind) { if (!*uind) return true; if (put_user(sys_reg_to_index(reg), *uind)) return false; (*uind)++; return true; } static int walk_one_sys_reg(const struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, u64 __user **uind, unsigned int *total) { /* * Ignore registers we trap but don't save, * and for which no custom user accessor is provided. 
*/ if (!(rd->reg || rd->get_user)) return 0; if (sysreg_hidden(vcpu, rd)) return 0; if (!copy_reg_to_user(rd, uind)) return -EFAULT; (*total)++; return 0; } /* Assumed ordered tables, see kvm_sys_reg_table_init. */ static int walk_sys_regs(struct kvm_vcpu *vcpu, u64 __user *uind) { const struct sys_reg_desc *i2, *end2; unsigned int total = 0; int err; i2 = sys_reg_descs; end2 = sys_reg_descs + ARRAY_SIZE(sys_reg_descs); while (i2 != end2) { err = walk_one_sys_reg(vcpu, i2++, &uind, &total); if (err) return err; } return total; } unsigned long kvm_arm_num_sys_reg_descs(struct kvm_vcpu *vcpu) { return ARRAY_SIZE(invariant_sys_regs) + num_demux_regs() + walk_sys_regs(vcpu, (u64 __user *)NULL); } int kvm_arm_copy_sys_reg_indices(struct kvm_vcpu *vcpu, u64 __user *uindices) { unsigned int i; int err; /* Then give them all the invariant registers' indices. */ for (i = 0; i < ARRAY_SIZE(invariant_sys_regs); i++) { if (put_user(sys_reg_to_index(&invariant_sys_regs[i]), uindices)) return -EFAULT; uindices++; } err = walk_sys_regs(vcpu, uindices); if (err < 0) return err; uindices += err; return write_demux_regids(uindices); } #define KVM_ARM_FEATURE_ID_RANGE_INDEX(r) \ KVM_ARM_FEATURE_ID_RANGE_IDX(sys_reg_Op0(r), \ sys_reg_Op1(r), \ sys_reg_CRn(r), \ sys_reg_CRm(r), \ sys_reg_Op2(r)) int kvm_vm_ioctl_get_reg_writable_masks(struct kvm *kvm, struct reg_mask_range *range) { const void *zero_page = page_to_virt(ZERO_PAGE(0)); u64 __user *masks = (u64 __user *)range->addr; /* Only feature id range is supported, reserved[13] must be zero. */ if (range->range || memcmp(range->reserved, zero_page, sizeof(range->reserved))) return -EINVAL; /* Wipe the whole thing first */ if (clear_user(masks, KVM_ARM_FEATURE_ID_RANGE_SIZE * sizeof(__u64))) return -EFAULT; for (int i = 0; i < ARRAY_SIZE(sys_reg_descs); i++) { const struct sys_reg_desc *reg = &sys_reg_descs[i]; u32 encoding = reg_to_encoding(reg); u64 val; if (!is_feature_id_reg(encoding) || !reg->set_user) continue; if (!reg->val || (is_aa32_id_reg(encoding) && !kvm_supports_32bit_el0())) { continue; } val = reg->val; if (put_user(val, (masks + KVM_ARM_FEATURE_ID_RANGE_INDEX(encoding)))) return -EFAULT; } return 0; } static void vcpu_set_hcr(struct kvm_vcpu *vcpu) { struct kvm *kvm = vcpu->kvm; if (has_vhe() || has_hvhe()) vcpu->arch.hcr_el2 |= HCR_E2H; if (cpus_have_final_cap(ARM64_HAS_RAS_EXTN)) { /* route synchronous external abort exceptions to EL2 */ vcpu->arch.hcr_el2 |= HCR_TEA; /* trap error record accesses */ vcpu->arch.hcr_el2 |= HCR_TERR; } if (cpus_have_final_cap(ARM64_HAS_STAGE2_FWB)) vcpu->arch.hcr_el2 |= HCR_FWB; if (cpus_have_final_cap(ARM64_HAS_EVT) && !cpus_have_final_cap(ARM64_MISMATCHED_CACHE_TYPE) && kvm_read_vm_id_reg(kvm, SYS_CTR_EL0) == read_sanitised_ftr_reg(SYS_CTR_EL0)) vcpu->arch.hcr_el2 |= HCR_TID4; else vcpu->arch.hcr_el2 |= HCR_TID2; if (vcpu_el1_is_32bit(vcpu)) vcpu->arch.hcr_el2 &= ~HCR_RW; if (kvm_has_mte(vcpu->kvm)) vcpu->arch.hcr_el2 |= HCR_ATA; /* * In the absence of FGT, we cannot independently trap TLBI * Range instructions. This isn't great, but trapping all * TLBIs would be far worse. Live with it... */ if (!kvm_has_feat(kvm, ID_AA64ISAR0_EL1, TLB, OS)) vcpu->arch.hcr_el2 |= HCR_TTLBOS; } void kvm_calculate_traps(struct kvm_vcpu *vcpu) { struct kvm *kvm = vcpu->kvm; mutex_lock(&kvm->arch.config_lock); vcpu_set_hcr(vcpu); vcpu_set_ich_hcr(vcpu); if (cpus_have_final_cap(ARM64_HAS_HCX)) { /* * In general, all HCRX_EL2 bits are gated by a feature. 
* The only reason we can set SMPME without checking any * feature is that its effects are not directly observable * from the guest. */ vcpu->arch.hcrx_el2 = HCRX_EL2_SMPME; if (kvm_has_feat(kvm, ID_AA64ISAR2_EL1, MOPS, IMP)) vcpu->arch.hcrx_el2 |= (HCRX_EL2_MSCEn | HCRX_EL2_MCE2); if (kvm_has_tcr2(kvm)) vcpu->arch.hcrx_el2 |= HCRX_EL2_TCR2En; if (kvm_has_fpmr(kvm)) vcpu->arch.hcrx_el2 |= HCRX_EL2_EnFPM; } if (test_bit(KVM_ARCH_FLAG_FGU_INITIALIZED, &kvm->arch.flags)) goto out; kvm->arch.fgu[HFGxTR_GROUP] = (HFGxTR_EL2_nAMAIR2_EL1 | HFGxTR_EL2_nMAIR2_EL1 | HFGxTR_EL2_nS2POR_EL1 | HFGxTR_EL2_nACCDATA_EL1 | HFGxTR_EL2_nSMPRI_EL1_MASK | HFGxTR_EL2_nTPIDR2_EL0_MASK); if (!kvm_has_feat(kvm, ID_AA64ISAR0_EL1, TLB, OS)) kvm->arch.fgu[HFGITR_GROUP] |= (HFGITR_EL2_TLBIRVAALE1OS| HFGITR_EL2_TLBIRVALE1OS | HFGITR_EL2_TLBIRVAAE1OS | HFGITR_EL2_TLBIRVAE1OS | HFGITR_EL2_TLBIVAALE1OS | HFGITR_EL2_TLBIVALE1OS | HFGITR_EL2_TLBIVAAE1OS | HFGITR_EL2_TLBIASIDE1OS | HFGITR_EL2_TLBIVAE1OS | HFGITR_EL2_TLBIVMALLE1OS); if (!kvm_has_feat(kvm, ID_AA64ISAR0_EL1, TLB, RANGE)) kvm->arch.fgu[HFGITR_GROUP] |= (HFGITR_EL2_TLBIRVAALE1 | HFGITR_EL2_TLBIRVALE1 | HFGITR_EL2_TLBIRVAAE1 | HFGITR_EL2_TLBIRVAE1 | HFGITR_EL2_TLBIRVAALE1IS| HFGITR_EL2_TLBIRVALE1IS | HFGITR_EL2_TLBIRVAAE1IS | HFGITR_EL2_TLBIRVAE1IS | HFGITR_EL2_TLBIRVAALE1OS| HFGITR_EL2_TLBIRVALE1OS | HFGITR_EL2_TLBIRVAAE1OS | HFGITR_EL2_TLBIRVAE1OS); if (!kvm_has_feat(kvm, ID_AA64ISAR2_EL1, ATS1A, IMP)) kvm->arch.fgu[HFGITR_GROUP] |= HFGITR_EL2_ATS1E1A; if (!kvm_has_feat(kvm, ID_AA64MMFR1_EL1, PAN, PAN2)) kvm->arch.fgu[HFGITR_GROUP] |= (HFGITR_EL2_ATS1E1RP | HFGITR_EL2_ATS1E1WP); if (!kvm_has_s1pie(kvm)) kvm->arch.fgu[HFGxTR_GROUP] |= (HFGxTR_EL2_nPIRE0_EL1 | HFGxTR_EL2_nPIR_EL1); if (!kvm_has_s1poe(kvm)) kvm->arch.fgu[HFGxTR_GROUP] |= (HFGxTR_EL2_nPOR_EL1 | HFGxTR_EL2_nPOR_EL0); if (!kvm_has_feat(kvm, ID_AA64PFR0_EL1, AMU, IMP)) kvm->arch.fgu[HAFGRTR_GROUP] |= ~(HAFGRTR_EL2_RES0 | HAFGRTR_EL2_RES1); if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, BRBE, IMP)) { kvm->arch.fgu[HDFGRTR_GROUP] |= (HDFGRTR_EL2_nBRBDATA | HDFGRTR_EL2_nBRBCTL | HDFGRTR_EL2_nBRBIDR); kvm->arch.fgu[HFGITR_GROUP] |= (HFGITR_EL2_nBRBINJ | HFGITR_EL2_nBRBIALL); } set_bit(KVM_ARCH_FLAG_FGU_INITIALIZED, &kvm->arch.flags); out: mutex_unlock(&kvm->arch.config_lock); } /* * Perform last adjustments to the ID registers that are implied by the * configuration outside of the ID regs themselves, as well as any * initialisation that directly depend on these ID registers (such as * RES0/RES1 behaviours). This is not the place to configure traps though. * * Because this can be called once per CPU, changes must be idempotent. */ int kvm_finalize_sys_regs(struct kvm_vcpu *vcpu) { struct kvm *kvm = vcpu->kvm; guard(mutex)(&kvm->arch.config_lock); if (!(static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif) && irqchip_in_kernel(kvm) && kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3)) { kvm->arch.id_regs[IDREG_IDX(SYS_ID_AA64PFR0_EL1)] &= ~ID_AA64PFR0_EL1_GIC_MASK; kvm->arch.id_regs[IDREG_IDX(SYS_ID_PFR1_EL1)] &= ~ID_PFR1_EL1_GIC_MASK; } if (vcpu_has_nv(vcpu)) { int ret = kvm_init_nv_sysregs(vcpu); if (ret) return ret; } return 0; } int __init kvm_sys_reg_table_init(void) { bool valid = true; unsigned int i; int ret = 0; /* Make sure tables are unique and in order. 
*/ valid &= check_sysreg_table(sys_reg_descs, ARRAY_SIZE(sys_reg_descs), false); valid &= check_sysreg_table(cp14_regs, ARRAY_SIZE(cp14_regs), true); valid &= check_sysreg_table(cp14_64_regs, ARRAY_SIZE(cp14_64_regs), true); valid &= check_sysreg_table(cp15_regs, ARRAY_SIZE(cp15_regs), true); valid &= check_sysreg_table(cp15_64_regs, ARRAY_SIZE(cp15_64_regs), true); valid &= check_sysreg_table(invariant_sys_regs, ARRAY_SIZE(invariant_sys_regs), false); valid &= check_sysreg_table(sys_insn_descs, ARRAY_SIZE(sys_insn_descs), false); if (!valid) return -EINVAL; /* We abuse the reset function to overwrite the table itself. */ for (i = 0; i < ARRAY_SIZE(invariant_sys_regs); i++) invariant_sys_regs[i].reset(NULL, &invariant_sys_regs[i]); ret = populate_nv_trap_config(); for (i = 0; !ret && i < ARRAY_SIZE(sys_reg_descs); i++) ret = populate_sysreg_config(sys_reg_descs + i, i); for (i = 0; !ret && i < ARRAY_SIZE(sys_insn_descs); i++) ret = populate_sysreg_config(sys_insn_descs + i, i); return ret; } |
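/*
 * Illustrative userspace sketch (not part of the kernel sources above): a VMM
 * can address one of the registers handled by this file through the ONE_REG
 * API by rebuilding the same index that sys_reg_to_index() produces. The
 * helper names below (arm64_sysreg_index, read_midr_el1) are made up for the
 * example; the KVM_REG_* macros and KVM_GET_ONE_REG come from the uapi
 * headers, and the vCPU fd is assumed to have been created via the usual
 * KVM_CREATE_VM/KVM_CREATE_VCPU ioctls. Error handling is omitted.
 */
#include <linux/kvm.h>
#include <sys/ioctl.h>
#include <stdint.h>

static uint64_t arm64_sysreg_index(uint64_t op0, uint64_t op1, uint64_t crn,
				   uint64_t crm, uint64_t op2)
{
	return KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM64_SYSREG |
	       (op0 << KVM_REG_ARM64_SYSREG_OP0_SHIFT) |
	       (op1 << KVM_REG_ARM64_SYSREG_OP1_SHIFT) |
	       (crn << KVM_REG_ARM64_SYSREG_CRN_SHIFT) |
	       (crm << KVM_REG_ARM64_SYSREG_CRM_SHIFT) |
	       (op2 << KVM_REG_ARM64_SYSREG_OP2_SHIFT);
}

/* MIDR_EL1 (op0=3, op1=0, CRn=0, CRm=0, op2=0) is one of the invariant registers above. */
static int read_midr_el1(int vcpu_fd, uint64_t *midr)
{
	struct kvm_one_reg reg = {
		.id   = arm64_sysreg_index(3, 0, 0, 0, 0),
		.addr = (uint64_t)(uintptr_t)midr,
	};

	return ioctl(vcpu_fd, KVM_GET_ONE_REG, &reg);
}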
// SPDX-License-Identifier: GPL-2.0-only /* * This implements the various checks for CONFIG_HARDENED_USERCOPY*, * which are designed to protect kernel memory from needless exposure * and overwrite under many unintended conditions. This code is based * on PAX_USERCOPY, which is: * * Copyright (C) 2001-2016 PaX Team, Bradley Spengler, Open Source * Security Inc. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/mm.h> #include <linux/highmem.h> #include <linux/kstrtox.h> #include <linux/slab.h> #include <linux/sched.h> #include <linux/sched/task.h> #include <linux/sched/task_stack.h> #include <linux/thread_info.h> #include <linux/vmalloc.h> #include <linux/atomic.h> #include <linux/jump_label.h> #include <asm/sections.h> #include "slab.h" /* * Checks if a given pointer and length is contained by the current * stack frame (if possible). * * Returns: * NOT_STACK: not at all on the stack * GOOD_FRAME: fully within a valid stack frame * GOOD_STACK: within the current stack (when can't frame-check exactly) * BAD_STACK: error condition (invalid stack position or bad stack frame) */ static noinline int check_stack_object(const void *obj, unsigned long len) { const void * const stack = task_stack_page(current); const void * const stackend = stack + THREAD_SIZE; int ret; /* Object is not on the stack at all. */ if (obj + len <= stack || stackend <= obj) return NOT_STACK; /* * Reject: object partially overlaps the stack (passing the * check above means at least one end is within the stack, * so if this check fails, the other end is outside the stack). */ if (obj < stack || stackend < obj + len) return BAD_STACK; /* Check if object is safely within a valid frame. */ ret = arch_within_stack_frames(stack, stackend, obj, len); if (ret) return ret; /* Finally, check stack depth if possible. */ #ifdef CONFIG_ARCH_HAS_CURRENT_STACK_POINTER if (IS_ENABLED(CONFIG_STACK_GROWSUP)) { if ((void *)current_stack_pointer < obj + len) return BAD_STACK; } else { if (obj < (void *)current_stack_pointer) return BAD_STACK; } #endif return GOOD_STACK; } /* * If these functions are reached, then CONFIG_HARDENED_USERCOPY has found * an unexpected state during a copy_from_user() or copy_to_user() call. * There are several checks being performed on the buffer by the * __check_object_size() function. Normal stack buffer usage should never * trip the checks, and kernel text addressing will always trip the check.
* For cache objects, it is checking that only the whitelisted range of * bytes for a given cache is being accessed (via the cache's usersize and * useroffset fields). To adjust a cache whitelist, use the usercopy-aware * kmem_cache_create_usercopy() function to create the cache (and * carefully audit the whitelist range). */ void __noreturn usercopy_abort(const char *name, const char *detail, bool to_user, unsigned long offset, unsigned long len) { pr_emerg("Kernel memory %s attempt detected %s %s%s%s%s (offset %lu, size %lu)!\n", to_user ? "exposure" : "overwrite", to_user ? "from" : "to", name ? : "unknown?!", detail ? " '" : "", detail ? : "", detail ? "'" : "", offset, len); /* * For greater effect, it would be nice to do do_group_exit(), * but BUG() actually hooks all the lock-breaking and per-arch * Oops code, so that is used here instead. */ BUG(); } /* Returns true if any portion of [ptr,ptr+n) over laps with [low,high). */ static bool overlaps(const unsigned long ptr, unsigned long n, unsigned long low, unsigned long high) { const unsigned long check_low = ptr; unsigned long check_high = check_low + n; /* Does not overlap if entirely above or entirely below. */ if (check_low >= high || check_high <= low) return false; return true; } /* Is this address range in the kernel text area? */ static inline void check_kernel_text_object(const unsigned long ptr, unsigned long n, bool to_user) { unsigned long textlow = (unsigned long)_stext; unsigned long texthigh = (unsigned long)_etext; unsigned long textlow_linear, texthigh_linear; if (overlaps(ptr, n, textlow, texthigh)) usercopy_abort("kernel text", NULL, to_user, ptr - textlow, n); /* * Some architectures have virtual memory mappings with a secondary * mapping of the kernel text, i.e. there is more than one virtual * kernel address that points to the kernel image. It is usually * when there is a separate linear physical memory mapping, in that * __pa() is not just the reverse of __va(). This can be detected * and checked: */ textlow_linear = (unsigned long)lm_alias(textlow); /* No different mapping: we're done. */ if (textlow_linear == textlow) return; /* Check the secondary mapping... */ texthigh_linear = (unsigned long)lm_alias(texthigh); if (overlaps(ptr, n, textlow_linear, texthigh_linear)) usercopy_abort("linear kernel text", NULL, to_user, ptr - textlow_linear, n); } static inline void check_bogus_address(const unsigned long ptr, unsigned long n, bool to_user) { /* Reject if object wraps past end of memory. */ if (ptr + (n - 1) < ptr) usercopy_abort("wrapped address", NULL, to_user, 0, ptr + n); /* Reject if NULL or ZERO-allocation. */ if (ZERO_OR_NULL_PTR(ptr)) usercopy_abort("null address", NULL, to_user, ptr, n); } static inline void check_heap_object(const void *ptr, unsigned long n, bool to_user) { unsigned long addr = (unsigned long)ptr; unsigned long offset; struct folio *folio; if (is_kmap_addr(ptr)) { offset = offset_in_page(ptr); if (n > PAGE_SIZE - offset) usercopy_abort("kmap", NULL, to_user, offset, n); return; } if (is_vmalloc_addr(ptr) && !pagefault_disabled()) { struct vmap_area *area = find_vmap_area(addr); if (!area) usercopy_abort("vmalloc", "no area", to_user, 0, n); if (n > area->va_end - addr) { offset = addr - area->va_start; usercopy_abort("vmalloc", NULL, to_user, offset, n); } return; } if (!virt_addr_valid(ptr)) return; folio = virt_to_folio(ptr); if (folio_test_slab(folio)) { /* Check slab allocator for flags and size. 
*/ __check_heap_object(ptr, n, folio_slab(folio), to_user); } else if (folio_test_large(folio)) { offset = ptr - folio_address(folio); if (n > folio_size(folio) - offset) usercopy_abort("page alloc", NULL, to_user, offset, n); } } static DEFINE_STATIC_KEY_FALSE_RO(bypass_usercopy_checks); /* * Validates that the given object is: * - not bogus address * - fully contained by stack (or stack frame, when available) * - fully within SLAB object (or object whitelist area, when available) * - not in kernel text */ void __check_object_size(const void *ptr, unsigned long n, bool to_user) { if (static_branch_unlikely(&bypass_usercopy_checks)) return; /* Skip all tests if size is zero. */ if (!n) return; /* Check for invalid addresses. */ check_bogus_address((const unsigned long)ptr, n, to_user); /* Check for bad stack object. */ switch (check_stack_object(ptr, n)) { case NOT_STACK: /* Object is not touching the current process stack. */ break; case GOOD_FRAME: case GOOD_STACK: /* * Object is either in the correct frame (when it * is possible to check) or just generally on the * process stack (when frame checking not available). */ return; default: usercopy_abort("process stack", NULL, to_user, #ifdef CONFIG_ARCH_HAS_CURRENT_STACK_POINTER IS_ENABLED(CONFIG_STACK_GROWSUP) ? ptr - (void *)current_stack_pointer : (void *)current_stack_pointer - ptr, #else 0, #endif n); } /* Check for bad heap object. */ check_heap_object(ptr, n, to_user); /* Check for object in kernel to avoid text exposure. */ check_kernel_text_object((const unsigned long)ptr, n, to_user); } EXPORT_SYMBOL(__check_object_size); static bool enable_checks __initdata = true; static int __init parse_hardened_usercopy(char *str) { if (kstrtobool(str, &enable_checks)) pr_warn("Invalid option string for hardened_usercopy: '%s'\n", str); return 1; } __setup("hardened_usercopy=", parse_hardened_usercopy); static int __init set_hardened_usercopy(void) { if (enable_checks == false) static_branch_enable(&bypass_usercopy_checks); return 1; } late_initcall(set_hardened_usercopy); |
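/*
 * Illustrative sketch (not part of mm/usercopy.c above): how a driver can
 * cooperate with these checks by whitelisting only the user-facing part of
 * its slab objects, as the comment on usercopy_abort() suggests. The
 * struct foo_obj / foo_cache / foo_* names are made up for the example.
 */
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/stddef.h>
#include <linux/uaccess.h>

struct foo_obj {
	spinlock_t lock;	/* kernel-internal state, never copied out */
	u32 flags;		/* kernel-internal state */
	char payload[128];	/* the only region exchanged with userspace */
};

static struct kmem_cache *foo_cache;

static int __init foo_cache_init(void)
{
	/* Whitelist just ->payload; copies touching other fields trip usercopy_abort(). */
	foo_cache = kmem_cache_create_usercopy("foo_obj",
					       sizeof(struct foo_obj), 0,
					       SLAB_HWCACHE_ALIGN,
					       offsetof(struct foo_obj, payload),
					       sizeof_field(struct foo_obj, payload),
					       NULL);
	return foo_cache ? 0 : -ENOMEM;
}

/* Copying from the whitelisted range passes __check_object_size(). */
static int foo_copy_payload(struct foo_obj *obj, void __user *ubuf, size_t len)
{
	if (len > sizeof(obj->payload))
		return -EINVAL;
	return copy_to_user(ubuf, obj->payload, len) ? -EFAULT : 0;
}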
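/*
 * Illustrative sketch (not part of the lib/nlattr.c code that follows): a
 * typical consumer of these helpers declares an nla_policy table, parses a
 * nested attribute stream against it with nla_parse_nested(), and emits
 * attributes with nla_put_*(). The MYDRV_ATTR_* numbering and function
 * names are made up for the example.
 */
#include <linux/printk.h>
#include <net/netlink.h>

enum {
	MYDRV_ATTR_UNSPEC,
	MYDRV_ATTR_IFINDEX,		/* u32 */
	MYDRV_ATTR_NAME,		/* NUL-terminated string */
	__MYDRV_ATTR_MAX,
};
#define MYDRV_ATTR_MAX (__MYDRV_ATTR_MAX - 1)

static const struct nla_policy mydrv_policy[MYDRV_ATTR_MAX + 1] = {
	[MYDRV_ATTR_IFINDEX]	= { .type = NLA_U32 },
	[MYDRV_ATTR_NAME]	= { .type = NLA_NUL_STRING, .len = 15 },
};

/* Validate and index the attributes nested inside @nest. */
static int mydrv_parse(const struct nlattr *nest, struct netlink_ext_ack *extack)
{
	struct nlattr *tb[MYDRV_ATTR_MAX + 1];
	int err;

	err = nla_parse_nested(tb, MYDRV_ATTR_MAX, nest, mydrv_policy, extack);
	if (err)
		return err;

	if (tb[MYDRV_ATTR_IFINDEX])
		pr_info("ifindex: %u\n", nla_get_u32(tb[MYDRV_ATTR_IFINDEX]));
	return 0;
}

/* Emit a single u32 attribute into an outgoing message. */
static int mydrv_fill(struct sk_buff *skb, u32 ifindex)
{
	return nla_put_u32(skb, MYDRV_ATTR_IFINDEX, ifindex);
}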
// SPDX-License-Identifier: GPL-2.0 /* * NETLINK Netlink attributes * * Authors: Thomas Graf <tgraf@suug.ch> * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru> */ #include <linux/export.h> #include <linux/kernel.h> #include <linux/errno.h> #include <linux/jiffies.h> #include <linux/nospec.h> #include <linux/skbuff.h> #include <linux/string.h> #include <linux/types.h> #include <net/netlink.h> /* For these data types, attribute length should be exactly the given * size. However, to maintain compatibility with broken commands, if the * attribute length does not match the expected size a warning is emitted * to the user that the command is sending invalid data and needs to be fixed. */ static const u8 nla_attr_len[NLA_TYPE_MAX+1] = { [NLA_U8] = sizeof(u8), [NLA_U16] = sizeof(u16), [NLA_U32] = sizeof(u32), [NLA_U64] = sizeof(u64), [NLA_S8] = sizeof(s8), [NLA_S16] = sizeof(s16), [NLA_S32] = sizeof(s32), [NLA_S64] = sizeof(s64), [NLA_BE16] = sizeof(__be16), [NLA_BE32] = sizeof(__be32), }; static const u8 nla_attr_minlen[NLA_TYPE_MAX+1] = { [NLA_U8] = sizeof(u8), [NLA_U16] = sizeof(u16), [NLA_U32] = sizeof(u32), [NLA_U64] = sizeof(u64), [NLA_MSECS] = sizeof(u64), [NLA_NESTED] = NLA_HDRLEN, [NLA_S8] = sizeof(s8), [NLA_S16] = sizeof(s16), [NLA_S32] = sizeof(s32), [NLA_S64] = sizeof(s64), [NLA_BE16] = sizeof(__be16), [NLA_BE32] = sizeof(__be32), }; /* * Nested policies might refer back to the original * policy in some cases, and userspace could try to * abuse that and recurse by nesting in the right * ways. Limit recursion to avoid this problem.
*/ #define MAX_POLICY_RECURSION_DEPTH 10 static int __nla_validate_parse(const struct nlattr *head, int len, int maxtype, const struct nla_policy *policy, unsigned int validate, struct netlink_ext_ack *extack, struct nlattr **tb, unsigned int depth); static int validate_nla_bitfield32(const struct nlattr *nla, const u32 valid_flags_mask) { const struct nla_bitfield32 *bf = nla_data(nla); if (!valid_flags_mask) return -EINVAL; /*disallow invalid bit selector */ if (bf->selector & ~valid_flags_mask) return -EINVAL; /*disallow invalid bit values */ if (bf->value & ~valid_flags_mask) return -EINVAL; /*disallow valid bit values that are not selected*/ if (bf->value & ~bf->selector) return -EINVAL; return 0; } static int nla_validate_array(const struct nlattr *head, int len, int maxtype, const struct nla_policy *policy, struct netlink_ext_ack *extack, unsigned int validate, unsigned int depth) { const struct nlattr *entry; int rem; nla_for_each_attr(entry, head, len, rem) { int ret; if (nla_len(entry) == 0) continue; if (nla_len(entry) < NLA_HDRLEN) { NL_SET_ERR_MSG_ATTR_POL(extack, entry, policy, "Array element too short"); return -ERANGE; } ret = __nla_validate_parse(nla_data(entry), nla_len(entry), maxtype, policy, validate, extack, NULL, depth + 1); if (ret < 0) return ret; } return 0; } void nla_get_range_unsigned(const struct nla_policy *pt, struct netlink_range_validation *range) { WARN_ON_ONCE(pt->validation_type != NLA_VALIDATE_RANGE_PTR && (pt->min < 0 || pt->max < 0)); range->min = 0; switch (pt->type) { case NLA_U8: range->max = U8_MAX; break; case NLA_U16: case NLA_BE16: case NLA_BINARY: range->max = U16_MAX; break; case NLA_U32: case NLA_BE32: range->max = U32_MAX; break; case NLA_U64: case NLA_UINT: case NLA_MSECS: range->max = U64_MAX; break; default: WARN_ON_ONCE(1); return; } switch (pt->validation_type) { case NLA_VALIDATE_RANGE: case NLA_VALIDATE_RANGE_WARN_TOO_LONG: range->min = pt->min; range->max = pt->max; break; case NLA_VALIDATE_RANGE_PTR: *range = *pt->range; break; case NLA_VALIDATE_MIN: range->min = pt->min; break; case NLA_VALIDATE_MAX: range->max = pt->max; break; default: break; } } static int nla_validate_range_unsigned(const struct nla_policy *pt, const struct nlattr *nla, struct netlink_ext_ack *extack, unsigned int validate) { struct netlink_range_validation range; u64 value; switch (pt->type) { case NLA_U8: value = nla_get_u8(nla); break; case NLA_U16: value = nla_get_u16(nla); break; case NLA_U32: value = nla_get_u32(nla); break; case NLA_U64: value = nla_get_u64(nla); break; case NLA_UINT: value = nla_get_uint(nla); break; case NLA_MSECS: value = nla_get_u64(nla); break; case NLA_BINARY: value = nla_len(nla); break; case NLA_BE16: value = ntohs(nla_get_be16(nla)); break; case NLA_BE32: value = ntohl(nla_get_be32(nla)); break; default: return -EINVAL; } nla_get_range_unsigned(pt, &range); if (pt->validation_type == NLA_VALIDATE_RANGE_WARN_TOO_LONG && pt->type == NLA_BINARY && value > range.max) { pr_warn_ratelimited("netlink: '%s': attribute type %d has an invalid length.\n", current->comm, pt->type); if (validate & NL_VALIDATE_STRICT_ATTRS) { NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt, "invalid attribute length"); return -EINVAL; } /* this assumes min <= max (don't validate against min) */ return 0; } if (value < range.min || value > range.max) { bool binary = pt->type == NLA_BINARY; if (binary) NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt, "binary attribute size out of range"); else NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt, "integer out of range"); return -ERANGE; 
} return 0; } void nla_get_range_signed(const struct nla_policy *pt, struct netlink_range_validation_signed *range) { switch (pt->type) { case NLA_S8: range->min = S8_MIN; range->max = S8_MAX; break; case NLA_S16: range->min = S16_MIN; range->max = S16_MAX; break; case NLA_S32: range->min = S32_MIN; range->max = S32_MAX; break; case NLA_S64: case NLA_SINT: range->min = S64_MIN; range->max = S64_MAX; break; default: WARN_ON_ONCE(1); return; } switch (pt->validation_type) { case NLA_VALIDATE_RANGE: range->min = pt->min; range->max = pt->max; break; case NLA_VALIDATE_RANGE_PTR: *range = *pt->range_signed; break; case NLA_VALIDATE_MIN: range->min = pt->min; break; case NLA_VALIDATE_MAX: range->max = pt->max; break; default: break; } } static int nla_validate_int_range_signed(const struct nla_policy *pt, const struct nlattr *nla, struct netlink_ext_ack *extack) { struct netlink_range_validation_signed range; s64 value; switch (pt->type) { case NLA_S8: value = nla_get_s8(nla); break; case NLA_S16: value = nla_get_s16(nla); break; case NLA_S32: value = nla_get_s32(nla); break; case NLA_S64: value = nla_get_s64(nla); break; case NLA_SINT: value = nla_get_sint(nla); break; default: return -EINVAL; } nla_get_range_signed(pt, &range); if (value < range.min || value > range.max) { NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt, "integer out of range"); return -ERANGE; } return 0; } static int nla_validate_int_range(const struct nla_policy *pt, const struct nlattr *nla, struct netlink_ext_ack *extack, unsigned int validate) { switch (pt->type) { case NLA_U8: case NLA_U16: case NLA_U32: case NLA_U64: case NLA_UINT: case NLA_MSECS: case NLA_BINARY: case NLA_BE16: case NLA_BE32: return nla_validate_range_unsigned(pt, nla, extack, validate); case NLA_S8: case NLA_S16: case NLA_S32: case NLA_S64: case NLA_SINT: return nla_validate_int_range_signed(pt, nla, extack); default: WARN_ON(1); return -EINVAL; } } static int nla_validate_mask(const struct nla_policy *pt, const struct nlattr *nla, struct netlink_ext_ack *extack) { u64 value; switch (pt->type) { case NLA_U8: value = nla_get_u8(nla); break; case NLA_U16: value = nla_get_u16(nla); break; case NLA_U32: value = nla_get_u32(nla); break; case NLA_U64: value = nla_get_u64(nla); break; case NLA_UINT: value = nla_get_uint(nla); break; case NLA_BE16: value = ntohs(nla_get_be16(nla)); break; case NLA_BE32: value = ntohl(nla_get_be32(nla)); break; default: return -EINVAL; } if (value & ~(u64)pt->mask) { NL_SET_ERR_MSG_ATTR(extack, nla, "reserved bit set"); return -EINVAL; } return 0; } static int validate_nla(const struct nlattr *nla, int maxtype, const struct nla_policy *policy, unsigned int validate, struct netlink_ext_ack *extack, unsigned int depth) { u16 strict_start_type = policy[0].strict_start_type; const struct nla_policy *pt; int minlen = 0, attrlen = nla_len(nla), type = nla_type(nla); int err = -ERANGE; if (strict_start_type && type >= strict_start_type) validate |= NL_VALIDATE_STRICT; if (type <= 0 || type > maxtype) return 0; type = array_index_nospec(type, maxtype + 1); pt = &policy[type]; BUG_ON(pt->type > NLA_TYPE_MAX); if (nla_attr_len[pt->type] && attrlen != nla_attr_len[pt->type]) { pr_warn_ratelimited("netlink: '%s': attribute type %d has an invalid length.\n", current->comm, type); if (validate & NL_VALIDATE_STRICT_ATTRS) { NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt, "invalid attribute length"); return -EINVAL; } } if (validate & NL_VALIDATE_NESTED) { if ((pt->type == NLA_NESTED || pt->type == NLA_NESTED_ARRAY) && !(nla->nla_type & NLA_F_NESTED)) { 
NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt, "NLA_F_NESTED is missing"); return -EINVAL; } if (pt->type != NLA_NESTED && pt->type != NLA_NESTED_ARRAY && pt->type != NLA_UNSPEC && (nla->nla_type & NLA_F_NESTED)) { NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt, "NLA_F_NESTED not expected"); return -EINVAL; } } switch (pt->type) { case NLA_REJECT: if (extack && pt->reject_message) { NL_SET_BAD_ATTR(extack, nla); extack->_msg = pt->reject_message; return -EINVAL; } err = -EINVAL; goto out_err; case NLA_FLAG: if (attrlen > 0) goto out_err; break; case NLA_SINT: case NLA_UINT: if (attrlen != sizeof(u32) && attrlen != sizeof(u64)) { NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt, "invalid attribute length"); return -EINVAL; } break; case NLA_BITFIELD32: if (attrlen != sizeof(struct nla_bitfield32)) goto out_err; err = validate_nla_bitfield32(nla, pt->bitfield32_valid); if (err) goto out_err; break; case NLA_NUL_STRING: if (pt->len) minlen = min_t(int, attrlen, pt->len + 1); else minlen = attrlen; if (!minlen || memchr(nla_data(nla), '\0', minlen) == NULL) { err = -EINVAL; goto out_err; } fallthrough; case NLA_STRING: if (attrlen < 1) goto out_err; if (pt->len) { char *buf = nla_data(nla); if (buf[attrlen - 1] == '\0') attrlen--; if (attrlen > pt->len) goto out_err; } break; case NLA_BINARY: if (pt->len && attrlen > pt->len) goto out_err; break; case NLA_NESTED: /* a nested attributes is allowed to be empty; if its not, * it must have a size of at least NLA_HDRLEN. */ if (attrlen == 0) break; if (attrlen < NLA_HDRLEN) goto out_err; if (pt->nested_policy) { err = __nla_validate_parse(nla_data(nla), nla_len(nla), pt->len, pt->nested_policy, validate, extack, NULL, depth + 1); if (err < 0) { /* * return directly to preserve the inner * error message/attribute pointer */ return err; } } break; case NLA_NESTED_ARRAY: /* a nested array attribute is allowed to be empty; if its not, * it must have a size of at least NLA_HDRLEN. 
*/ if (attrlen == 0) break; if (attrlen < NLA_HDRLEN) goto out_err; if (pt->nested_policy) { int err; err = nla_validate_array(nla_data(nla), nla_len(nla), pt->len, pt->nested_policy, extack, validate, depth); if (err < 0) { /* * return directly to preserve the inner * error message/attribute pointer */ return err; } } break; case NLA_UNSPEC: if (validate & NL_VALIDATE_UNSPEC) { NL_SET_ERR_MSG_ATTR(extack, nla, "Unsupported attribute"); return -EINVAL; } if (attrlen < pt->len) goto out_err; break; default: if (pt->len) minlen = pt->len; else minlen = nla_attr_minlen[pt->type]; if (attrlen < minlen) goto out_err; } /* further validation */ switch (pt->validation_type) { case NLA_VALIDATE_NONE: /* nothing to do */ break; case NLA_VALIDATE_RANGE_PTR: case NLA_VALIDATE_RANGE: case NLA_VALIDATE_RANGE_WARN_TOO_LONG: case NLA_VALIDATE_MIN: case NLA_VALIDATE_MAX: err = nla_validate_int_range(pt, nla, extack, validate); if (err) return err; break; case NLA_VALIDATE_MASK: err = nla_validate_mask(pt, nla, extack); if (err) return err; break; case NLA_VALIDATE_FUNCTION: if (pt->validate) { err = pt->validate(nla, extack); if (err) return err; } break; } return 0; out_err: NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt, "Attribute failed policy validation"); return err; } static int __nla_validate_parse(const struct nlattr *head, int len, int maxtype, const struct nla_policy *policy, unsigned int validate, struct netlink_ext_ack *extack, struct nlattr **tb, unsigned int depth) { const struct nlattr *nla; int rem; if (depth >= MAX_POLICY_RECURSION_DEPTH) { NL_SET_ERR_MSG(extack, "allowed policy recursion depth exceeded"); return -EINVAL; } if (tb) memset(tb, 0, sizeof(struct nlattr *) * (maxtype + 1)); nla_for_each_attr(nla, head, len, rem) { u16 type = nla_type(nla); if (type == 0 || type > maxtype) { if (validate & NL_VALIDATE_MAXTYPE) { NL_SET_ERR_MSG_ATTR(extack, nla, "Unknown attribute type"); return -EINVAL; } continue; } type = array_index_nospec(type, maxtype + 1); if (policy) { int err = validate_nla(nla, maxtype, policy, validate, extack, depth); if (err < 0) return err; } if (tb) tb[type] = (struct nlattr *)nla; } if (unlikely(rem > 0)) { pr_warn_ratelimited("netlink: %d bytes leftover after parsing attributes in process `%s'.\n", rem, current->comm); NL_SET_ERR_MSG(extack, "bytes leftover after parsing attributes"); if (validate & NL_VALIDATE_TRAILING) return -EINVAL; } return 0; } /** * __nla_validate - Validate a stream of attributes * @head: head of attribute stream * @len: length of attribute stream * @maxtype: maximum attribute type to be expected * @policy: validation policy * @validate: validation strictness * @extack: extended ACK report struct * * Validates all attributes in the specified attribute stream against the * specified policy. Validation depends on the validate flags passed, see * &enum netlink_validation for more details on that. * See documentation of struct nla_policy for more details. * * Returns 0 on success or a negative error code. */ int __nla_validate(const struct nlattr *head, int len, int maxtype, const struct nla_policy *policy, unsigned int validate, struct netlink_ext_ack *extack) { return __nla_validate_parse(head, len, maxtype, policy, validate, extack, NULL, 0); } EXPORT_SYMBOL(__nla_validate); /** * nla_policy_len - Determine the max. length of a policy * @p: policy to use * @n: number of policies * * Determines the max. length of the policy. It is currently used * to allocated Netlink buffers roughly the size of the actual * message. 
* * Returns 0 on success or a negative error code. */ int nla_policy_len(const struct nla_policy *p, int n) { int i, len = 0; for (i = 0; i < n; i++, p++) { if (p->len) len += nla_total_size(p->len); else if (nla_attr_len[p->type]) len += nla_total_size(nla_attr_len[p->type]); else if (nla_attr_minlen[p->type]) len += nla_total_size(nla_attr_minlen[p->type]); } return len; } EXPORT_SYMBOL(nla_policy_len); /** * __nla_parse - Parse a stream of attributes into a tb buffer * @tb: destination array with maxtype+1 elements * @maxtype: maximum attribute type to be expected * @head: head of attribute stream * @len: length of attribute stream * @policy: validation policy * @validate: validation strictness * @extack: extended ACK pointer * * Parses a stream of attributes and stores a pointer to each attribute in * the tb array accessible via the attribute type. * Validation is controlled by the @validate parameter. * * Returns 0 on success or a negative error code. */ int __nla_parse(struct nlattr **tb, int maxtype, const struct nlattr *head, int len, const struct nla_policy *policy, unsigned int validate, struct netlink_ext_ack *extack) { return __nla_validate_parse(head, len, maxtype, policy, validate, extack, tb, 0); } EXPORT_SYMBOL(__nla_parse); /** * nla_find - Find a specific attribute in a stream of attributes * @head: head of attribute stream * @len: length of attribute stream * @attrtype: type of attribute to look for * * Returns the first attribute in the stream matching the specified type. */ struct nlattr *nla_find(const struct nlattr *head, int len, int attrtype) { const struct nlattr *nla; int rem; nla_for_each_attr(nla, head, len, rem) if (nla_type(nla) == attrtype) return (struct nlattr *)nla; return NULL; } EXPORT_SYMBOL(nla_find); /** * nla_strscpy - Copy string attribute payload into a sized buffer * @dst: Where to copy the string to. * @nla: Attribute to copy the string from. * @dstsize: Size of destination buffer. * * Copies at most dstsize - 1 bytes into the destination buffer. * Unlike strscpy() the destination buffer is always padded out. * * Return: * * srclen - Returns @nla length (not including the trailing %NUL). * * -E2BIG - If @dstsize is 0 or greater than U16_MAX or @nla length greater * than @dstsize. */ ssize_t nla_strscpy(char *dst, const struct nlattr *nla, size_t dstsize) { size_t srclen = nla_len(nla); char *src = nla_data(nla); ssize_t ret; size_t len; if (dstsize == 0 || WARN_ON_ONCE(dstsize > U16_MAX)) return -E2BIG; if (srclen > 0 && src[srclen - 1] == '\0') srclen--; if (srclen >= dstsize) { len = dstsize - 1; ret = -E2BIG; } else { len = srclen; ret = len; } memcpy(dst, src, len); /* Zero pad end of dst. */ memset(dst + len, 0, dstsize - len); return ret; } EXPORT_SYMBOL(nla_strscpy); /** * nla_strdup - Copy string attribute payload into a newly allocated buffer * @nla: attribute to copy the string from * @flags: the type of memory to allocate (see kmalloc). * * Returns a pointer to the allocated buffer or NULL on error. 
*/ char *nla_strdup(const struct nlattr *nla, gfp_t flags) { size_t srclen = nla_len(nla); char *src = nla_data(nla), *dst; if (srclen > 0 && src[srclen - 1] == '\0') srclen--; dst = kmalloc(srclen + 1, flags); if (dst != NULL) { memcpy(dst, src, srclen); dst[srclen] = '\0'; } return dst; } EXPORT_SYMBOL(nla_strdup); /** * nla_memcpy - Copy a netlink attribute into another memory area * @dest: where to copy to memcpy * @src: netlink attribute to copy from * @count: size of the destination area * * Note: The number of bytes copied is limited by the length of * attribute's payload. memcpy * * Returns the number of bytes copied. */ int nla_memcpy(void *dest, const struct nlattr *src, int count) { int minlen = min_t(int, count, nla_len(src)); memcpy(dest, nla_data(src), minlen); if (count > minlen) memset(dest + minlen, 0, count - minlen); return minlen; } EXPORT_SYMBOL(nla_memcpy); /** * nla_memcmp - Compare an attribute with sized memory area * @nla: netlink attribute * @data: memory area * @size: size of memory area */ int nla_memcmp(const struct nlattr *nla, const void *data, size_t size) { int d = nla_len(nla) - size; if (d == 0) d = memcmp(nla_data(nla), data, size); return d; } EXPORT_SYMBOL(nla_memcmp); /** * nla_strcmp - Compare a string attribute against a string * @nla: netlink string attribute * @str: another string */ int nla_strcmp(const struct nlattr *nla, const char *str) { int len = strlen(str); char *buf = nla_data(nla); int attrlen = nla_len(nla); int d; while (attrlen > 0 && buf[attrlen - 1] == '\0') attrlen--; d = attrlen - len; if (d == 0) d = memcmp(nla_data(nla), str, len); return d; } EXPORT_SYMBOL(nla_strcmp); #ifdef CONFIG_NET /** * __nla_reserve - reserve room for attribute on the skb * @skb: socket buffer to reserve room on * @attrtype: attribute type * @attrlen: length of attribute payload * * Adds a netlink attribute header to a socket buffer and reserves * room for the payload but does not copy it. * * The caller is responsible to ensure that the skb provides enough * tailroom for the attribute header and payload. */ struct nlattr *__nla_reserve(struct sk_buff *skb, int attrtype, int attrlen) { struct nlattr *nla; nla = skb_put(skb, nla_total_size(attrlen)); nla->nla_type = attrtype; nla->nla_len = nla_attr_size(attrlen); memset((unsigned char *) nla + nla->nla_len, 0, nla_padlen(attrlen)); return nla; } EXPORT_SYMBOL(__nla_reserve); /** * __nla_reserve_64bit - reserve room for attribute on the skb and align it * @skb: socket buffer to reserve room on * @attrtype: attribute type * @attrlen: length of attribute payload * @padattr: attribute type for the padding * * Adds a netlink attribute header to a socket buffer and reserves * room for the payload but does not copy it. It also ensure that this * attribute will have a 64-bit aligned nla_data() area. * * The caller is responsible to ensure that the skb provides enough * tailroom for the attribute header and payload. */ struct nlattr *__nla_reserve_64bit(struct sk_buff *skb, int attrtype, int attrlen, int padattr) { nla_align_64bit(skb, padattr); return __nla_reserve(skb, attrtype, attrlen); } EXPORT_SYMBOL(__nla_reserve_64bit); /** * __nla_reserve_nohdr - reserve room for attribute without header * @skb: socket buffer to reserve room on * @attrlen: length of attribute payload * * Reserves room for attribute payload without a header. * * The caller is responsible to ensure that the skb provides enough * tailroom for the payload. 
*/ void *__nla_reserve_nohdr(struct sk_buff *skb, int attrlen) { return skb_put_zero(skb, NLA_ALIGN(attrlen)); } EXPORT_SYMBOL(__nla_reserve_nohdr); /** * nla_reserve - reserve room for attribute on the skb * @skb: socket buffer to reserve room on * @attrtype: attribute type * @attrlen: length of attribute payload * * Adds a netlink attribute header to a socket buffer and reserves * room for the payload but does not copy it. * * Returns NULL if the tailroom of the skb is insufficient to store * the attribute header and payload. */ struct nlattr *nla_reserve(struct sk_buff *skb, int attrtype, int attrlen) { if (unlikely(skb_tailroom(skb) < nla_total_size(attrlen))) return NULL; return __nla_reserve(skb, attrtype, attrlen); } EXPORT_SYMBOL(nla_reserve); /** * nla_reserve_64bit - reserve room for attribute on the skb and align it * @skb: socket buffer to reserve room on * @attrtype: attribute type * @attrlen: length of attribute payload * @padattr: attribute type for the padding * * Adds a netlink attribute header to a socket buffer and reserves * room for the payload but does not copy it. It also ensure that this * attribute will have a 64-bit aligned nla_data() area. * * Returns NULL if the tailroom of the skb is insufficient to store * the attribute header and payload. */ struct nlattr *nla_reserve_64bit(struct sk_buff *skb, int attrtype, int attrlen, int padattr) { size_t len; if (nla_need_padding_for_64bit(skb)) len = nla_total_size_64bit(attrlen); else len = nla_total_size(attrlen); if (unlikely(skb_tailroom(skb) < len)) return NULL; return __nla_reserve_64bit(skb, attrtype, attrlen, padattr); } EXPORT_SYMBOL(nla_reserve_64bit); /** * nla_reserve_nohdr - reserve room for attribute without header * @skb: socket buffer to reserve room on * @attrlen: length of attribute payload * * Reserves room for attribute payload without a header. * * Returns NULL if the tailroom of the skb is insufficient to store * the attribute payload. */ void *nla_reserve_nohdr(struct sk_buff *skb, int attrlen) { if (unlikely(skb_tailroom(skb) < NLA_ALIGN(attrlen))) return NULL; return __nla_reserve_nohdr(skb, attrlen); } EXPORT_SYMBOL(nla_reserve_nohdr); /** * __nla_put - Add a netlink attribute to a socket buffer * @skb: socket buffer to add attribute to * @attrtype: attribute type * @attrlen: length of attribute payload * @data: head of attribute payload * * The caller is responsible to ensure that the skb provides enough * tailroom for the attribute header and payload. */ void __nla_put(struct sk_buff *skb, int attrtype, int attrlen, const void *data) { struct nlattr *nla; nla = __nla_reserve(skb, attrtype, attrlen); memcpy(nla_data(nla), data, attrlen); } EXPORT_SYMBOL(__nla_put); /** * __nla_put_64bit - Add a netlink attribute to a socket buffer and align it * @skb: socket buffer to add attribute to * @attrtype: attribute type * @attrlen: length of attribute payload * @data: head of attribute payload * @padattr: attribute type for the padding * * The caller is responsible to ensure that the skb provides enough * tailroom for the attribute header and payload. 
*/ void __nla_put_64bit(struct sk_buff *skb, int attrtype, int attrlen, const void *data, int padattr) { struct nlattr *nla; nla = __nla_reserve_64bit(skb, attrtype, attrlen, padattr); memcpy(nla_data(nla), data, attrlen); } EXPORT_SYMBOL(__nla_put_64bit); /** * __nla_put_nohdr - Add a netlink attribute without header * @skb: socket buffer to add attribute to * @attrlen: length of attribute payload * @data: head of attribute payload * * The caller is responsible to ensure that the skb provides enough * tailroom for the attribute payload. */ void __nla_put_nohdr(struct sk_buff *skb, int attrlen, const void *data) { void *start; start = __nla_reserve_nohdr(skb, attrlen); memcpy(start, data, attrlen); } EXPORT_SYMBOL(__nla_put_nohdr); /** * nla_put - Add a netlink attribute to a socket buffer * @skb: socket buffer to add attribute to * @attrtype: attribute type * @attrlen: length of attribute payload * @data: head of attribute payload * * Returns -EMSGSIZE if the tailroom of the skb is insufficient to store * the attribute header and payload. */ int nla_put(struct sk_buff *skb, int attrtype, int attrlen, const void *data) { if (unlikely(skb_tailroom(skb) < nla_total_size(attrlen))) return -EMSGSIZE; __nla_put(skb, attrtype, attrlen, data); return 0; } EXPORT_SYMBOL(nla_put); /** * nla_put_64bit - Add a netlink attribute to a socket buffer and align it * @skb: socket buffer to add attribute to * @attrtype: attribute type * @attrlen: length of attribute payload * @data: head of attribute payload * @padattr: attribute type for the padding * * Returns -EMSGSIZE if the tailroom of the skb is insufficient to store * the attribute header and payload. */ int nla_put_64bit(struct sk_buff *skb, int attrtype, int attrlen, const void *data, int padattr) { size_t len; if (nla_need_padding_for_64bit(skb)) len = nla_total_size_64bit(attrlen); else len = nla_total_size(attrlen); if (unlikely(skb_tailroom(skb) < len)) return -EMSGSIZE; __nla_put_64bit(skb, attrtype, attrlen, data, padattr); return 0; } EXPORT_SYMBOL(nla_put_64bit); /** * nla_put_nohdr - Add a netlink attribute without header * @skb: socket buffer to add attribute to * @attrlen: length of attribute payload * @data: head of attribute payload * * Returns -EMSGSIZE if the tailroom of the skb is insufficient to store * the attribute payload. */ int nla_put_nohdr(struct sk_buff *skb, int attrlen, const void *data) { if (unlikely(skb_tailroom(skb) < NLA_ALIGN(attrlen))) return -EMSGSIZE; __nla_put_nohdr(skb, attrlen, data); return 0; } EXPORT_SYMBOL(nla_put_nohdr); /** * nla_append - Add a netlink attribute without header or padding * @skb: socket buffer to add attribute to * @attrlen: length of attribute payload * @data: head of attribute payload * * Returns -EMSGSIZE if the tailroom of the skb is insufficient to store * the attribute payload. */ int nla_append(struct sk_buff *skb, int attrlen, const void *data) { if (unlikely(skb_tailroom(skb) < NLA_ALIGN(attrlen))) return -EMSGSIZE; skb_put_data(skb, data, attrlen); return 0; } EXPORT_SYMBOL(nla_append); #endif |
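/*
 * Illustrative sketch (not part of the file above): how the nla_put*() and
 * nla_parse() helpers are typically used together. The attribute enum, the
 * policy and the helper names below are hypothetical examples invented for
 * this sketch, not an existing kernel interface.
 */
#include <linux/errno.h>
#include <linux/printk.h>
#include <net/netlink.h>

enum {
	EXAMPLE_ATTR_UNSPEC,
	EXAMPLE_ATTR_ID,	/* u32 */
	EXAMPLE_ATTR_NAME,	/* NUL-terminated string */
	__EXAMPLE_ATTR_MAX,
};
#define EXAMPLE_ATTR_MAX (__EXAMPLE_ATTR_MAX - 1)

static const struct nla_policy example_policy[EXAMPLE_ATTR_MAX + 1] = {
	[EXAMPLE_ATTR_ID]   = { .type = NLA_U32 },
	[EXAMPLE_ATTR_NAME] = { .type = NLA_NUL_STRING, .len = 31 },
};

/* Emit the two attributes into an skb; nla_put*() checks the tailroom. */
static int example_fill(struct sk_buff *skb, u32 id, const char *name)
{
	if (nla_put_u32(skb, EXAMPLE_ATTR_ID, id) ||
	    nla_put_string(skb, EXAMPLE_ATTR_NAME, name))
		return -EMSGSIZE;
	return 0;
}

/* Validate and index a received attribute stream, then pick out the values. */
static int example_parse(const struct nlattr *head, int len,
			 struct netlink_ext_ack *extack)
{
	struct nlattr *tb[EXAMPLE_ATTR_MAX + 1];
	char name[32];
	int err;

	err = nla_parse(tb, EXAMPLE_ATTR_MAX, head, len, example_policy, extack);
	if (err)
		return err;

	if (tb[EXAMPLE_ATTR_ID])
		pr_debug("id=%u\n", nla_get_u32(tb[EXAMPLE_ATTR_ID]));
	if (tb[EXAMPLE_ATTR_NAME])
		nla_strscpy(name, tb[EXAMPLE_ATTR_NAME], sizeof(name));

	return 0;
}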
// SPDX-License-Identifier: GPL-2.0-only #include <linux/irqchip/arm-gic-v3.h> #include <linux/irq.h> #include <linux/irqdomain.h> #include <linux/kstrtox.h> #include <linux/kvm.h> #include <linux/kvm_host.h> #include <linux/string_choices.h> #include <kvm/arm_vgic.h> #include <asm/kvm_hyp.h> #include <asm/kvm_mmu.h> #include <asm/kvm_asm.h> #include "vgic.h" static bool group0_trap; static bool group1_trap; static bool common_trap; static bool dir_trap; static bool gicv4_enable; void vgic_v3_set_underflow(struct kvm_vcpu 
*vcpu) { struct vgic_v3_cpu_if *cpuif = &vcpu->arch.vgic_cpu.vgic_v3; cpuif->vgic_hcr |= ICH_HCR_UIE; } static bool lr_signals_eoi_mi(u64 lr_val) { return !(lr_val & ICH_LR_STATE) && (lr_val & ICH_LR_EOI) && !(lr_val & ICH_LR_HW); } void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu) { struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; struct vgic_v3_cpu_if *cpuif = &vgic_cpu->vgic_v3; u32 model = vcpu->kvm->arch.vgic.vgic_model; int lr; DEBUG_SPINLOCK_BUG_ON(!irqs_disabled()); cpuif->vgic_hcr &= ~ICH_HCR_UIE; for (lr = 0; lr < cpuif->used_lrs; lr++) { u64 val = cpuif->vgic_lr[lr]; u32 intid, cpuid; struct vgic_irq *irq; bool is_v2_sgi = false; bool deactivated; cpuid = val & GICH_LR_PHYSID_CPUID; cpuid >>= GICH_LR_PHYSID_CPUID_SHIFT; if (model == KVM_DEV_TYPE_ARM_VGIC_V3) { intid = val & ICH_LR_VIRTUAL_ID_MASK; } else { intid = val & GICH_LR_VIRTUALID; is_v2_sgi = vgic_irq_is_sgi(intid); } /* Notify fds when the guest EOI'ed a level-triggered IRQ */ if (lr_signals_eoi_mi(val) && vgic_valid_spi(vcpu->kvm, intid)) kvm_notify_acked_irq(vcpu->kvm, 0, intid - VGIC_NR_PRIVATE_IRQS); irq = vgic_get_vcpu_irq(vcpu, intid); if (!irq) /* An LPI could have been unmapped. */ continue; raw_spin_lock(&irq->irq_lock); /* Always preserve the active bit, note deactivation */ deactivated = irq->active && !(val & ICH_LR_ACTIVE_BIT); irq->active = !!(val & ICH_LR_ACTIVE_BIT); if (irq->active && is_v2_sgi) irq->active_source = cpuid; /* Edge is the only case where we preserve the pending bit */ if (irq->config == VGIC_CONFIG_EDGE && (val & ICH_LR_PENDING_BIT)) { irq->pending_latch = true; if (is_v2_sgi) irq->source |= (1 << cpuid); } /* * Clear soft pending state when level irqs have been acked. */ if (irq->config == VGIC_CONFIG_LEVEL && !(val & ICH_LR_STATE)) irq->pending_latch = false; /* Handle resampling for mapped interrupts if required */ vgic_irq_handle_resampling(irq, deactivated, val & ICH_LR_PENDING_BIT); raw_spin_unlock(&irq->irq_lock); vgic_put_irq(vcpu->kvm, irq); } cpuif->used_lrs = 0; } /* Requires the irq to be locked already */ void vgic_v3_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr) { u32 model = vcpu->kvm->arch.vgic.vgic_model; u64 val = irq->intid; bool allow_pending = true, is_v2_sgi; is_v2_sgi = (vgic_irq_is_sgi(irq->intid) && model == KVM_DEV_TYPE_ARM_VGIC_V2); if (irq->active) { val |= ICH_LR_ACTIVE_BIT; if (is_v2_sgi) val |= irq->active_source << GICH_LR_PHYSID_CPUID_SHIFT; if (vgic_irq_is_multi_sgi(irq)) { allow_pending = false; val |= ICH_LR_EOI; } } if (irq->hw && !vgic_irq_needs_resampling(irq)) { val |= ICH_LR_HW; val |= ((u64)irq->hwintid) << ICH_LR_PHYS_ID_SHIFT; /* * Never set pending+active on a HW interrupt, as the * pending state is kept at the physical distributor * level. */ if (irq->active) allow_pending = false; } else { if (irq->config == VGIC_CONFIG_LEVEL) { val |= ICH_LR_EOI; /* * Software resampling doesn't work very well * if we allow P+A, so let's not do that. 
*/ if (irq->active) allow_pending = false; } } if (allow_pending && irq_is_pending(irq)) { val |= ICH_LR_PENDING_BIT; if (irq->config == VGIC_CONFIG_EDGE) irq->pending_latch = false; if (vgic_irq_is_sgi(irq->intid) && model == KVM_DEV_TYPE_ARM_VGIC_V2) { u32 src = ffs(irq->source); if (WARN_RATELIMIT(!src, "No SGI source for INTID %d\n", irq->intid)) return; val |= (src - 1) << GICH_LR_PHYSID_CPUID_SHIFT; irq->source &= ~(1 << (src - 1)); if (irq->source) { irq->pending_latch = true; val |= ICH_LR_EOI; } } } /* * Level-triggered mapped IRQs are special because we only observe * rising edges as input to the VGIC. We therefore lower the line * level here, so that we can take new virtual IRQs. See * vgic_v3_fold_lr_state for more info. */ if (vgic_irq_is_mapped_level(irq) && (val & ICH_LR_PENDING_BIT)) irq->line_level = false; if (irq->group) val |= ICH_LR_GROUP; val |= (u64)irq->priority << ICH_LR_PRIORITY_SHIFT; vcpu->arch.vgic_cpu.vgic_v3.vgic_lr[lr] = val; } void vgic_v3_clear_lr(struct kvm_vcpu *vcpu, int lr) { vcpu->arch.vgic_cpu.vgic_v3.vgic_lr[lr] = 0; } void vgic_v3_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp) { struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3; u32 model = vcpu->kvm->arch.vgic.vgic_model; u32 vmcr; if (model == KVM_DEV_TYPE_ARM_VGIC_V2) { vmcr = (vmcrp->ackctl << ICH_VMCR_ACK_CTL_SHIFT) & ICH_VMCR_ACK_CTL_MASK; vmcr |= (vmcrp->fiqen << ICH_VMCR_FIQ_EN_SHIFT) & ICH_VMCR_FIQ_EN_MASK; } else { /* * When emulating GICv3 on GICv3 with SRE=1 on the * VFIQEn bit is RES1 and the VAckCtl bit is RES0. */ vmcr = ICH_VMCR_FIQ_EN_MASK; } vmcr |= (vmcrp->cbpr << ICH_VMCR_CBPR_SHIFT) & ICH_VMCR_CBPR_MASK; vmcr |= (vmcrp->eoim << ICH_VMCR_EOIM_SHIFT) & ICH_VMCR_EOIM_MASK; vmcr |= (vmcrp->abpr << ICH_VMCR_BPR1_SHIFT) & ICH_VMCR_BPR1_MASK; vmcr |= (vmcrp->bpr << ICH_VMCR_BPR0_SHIFT) & ICH_VMCR_BPR0_MASK; vmcr |= (vmcrp->pmr << ICH_VMCR_PMR_SHIFT) & ICH_VMCR_PMR_MASK; vmcr |= (vmcrp->grpen0 << ICH_VMCR_ENG0_SHIFT) & ICH_VMCR_ENG0_MASK; vmcr |= (vmcrp->grpen1 << ICH_VMCR_ENG1_SHIFT) & ICH_VMCR_ENG1_MASK; cpu_if->vgic_vmcr = vmcr; } void vgic_v3_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp) { struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3; u32 model = vcpu->kvm->arch.vgic.vgic_model; u32 vmcr; vmcr = cpu_if->vgic_vmcr; if (model == KVM_DEV_TYPE_ARM_VGIC_V2) { vmcrp->ackctl = (vmcr & ICH_VMCR_ACK_CTL_MASK) >> ICH_VMCR_ACK_CTL_SHIFT; vmcrp->fiqen = (vmcr & ICH_VMCR_FIQ_EN_MASK) >> ICH_VMCR_FIQ_EN_SHIFT; } else { /* * When emulating GICv3 on GICv3 with SRE=1 on the * VFIQEn bit is RES1 and the VAckCtl bit is RES0. */ vmcrp->fiqen = 1; vmcrp->ackctl = 0; } vmcrp->cbpr = (vmcr & ICH_VMCR_CBPR_MASK) >> ICH_VMCR_CBPR_SHIFT; vmcrp->eoim = (vmcr & ICH_VMCR_EOIM_MASK) >> ICH_VMCR_EOIM_SHIFT; vmcrp->abpr = (vmcr & ICH_VMCR_BPR1_MASK) >> ICH_VMCR_BPR1_SHIFT; vmcrp->bpr = (vmcr & ICH_VMCR_BPR0_MASK) >> ICH_VMCR_BPR0_SHIFT; vmcrp->pmr = (vmcr & ICH_VMCR_PMR_MASK) >> ICH_VMCR_PMR_SHIFT; vmcrp->grpen0 = (vmcr & ICH_VMCR_ENG0_MASK) >> ICH_VMCR_ENG0_SHIFT; vmcrp->grpen1 = (vmcr & ICH_VMCR_ENG1_MASK) >> ICH_VMCR_ENG1_SHIFT; } #define INITIAL_PENDBASER_VALUE \ (GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, RaWb) | \ GIC_BASER_CACHEABILITY(GICR_PENDBASER, OUTER, SameAsInner) | \ GIC_BASER_SHAREABILITY(GICR_PENDBASER, InnerShareable)) void vgic_v3_enable(struct kvm_vcpu *vcpu) { struct vgic_v3_cpu_if *vgic_v3 = &vcpu->arch.vgic_cpu.vgic_v3; /* * By forcing VMCR to zero, the GIC will restore the binary * points to their reset values. 
Anything else resets to zero * anyway. */ vgic_v3->vgic_vmcr = 0; /* * If we are emulating a GICv3, we do it in an non-GICv2-compatible * way, so we force SRE to 1 to demonstrate this to the guest. * Also, we don't support any form of IRQ/FIQ bypass. * This goes with the spec allowing the value to be RAO/WI. */ if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) { vgic_v3->vgic_sre = (ICC_SRE_EL1_DIB | ICC_SRE_EL1_DFB | ICC_SRE_EL1_SRE); vcpu->arch.vgic_cpu.pendbaser = INITIAL_PENDBASER_VALUE; } else { vgic_v3->vgic_sre = 0; } vcpu->arch.vgic_cpu.num_id_bits = (kvm_vgic_global_state.ich_vtr_el2 & ICH_VTR_ID_BITS_MASK) >> ICH_VTR_ID_BITS_SHIFT; vcpu->arch.vgic_cpu.num_pri_bits = ((kvm_vgic_global_state.ich_vtr_el2 & ICH_VTR_PRI_BITS_MASK) >> ICH_VTR_PRI_BITS_SHIFT) + 1; /* Get the show on the road... */ vgic_v3->vgic_hcr = ICH_HCR_EN; } void vcpu_set_ich_hcr(struct kvm_vcpu *vcpu) { struct vgic_v3_cpu_if *vgic_v3 = &vcpu->arch.vgic_cpu.vgic_v3; /* Hide GICv3 sysreg if necessary */ if (!kvm_has_gicv3(vcpu->kvm)) { vgic_v3->vgic_hcr |= ICH_HCR_TALL0 | ICH_HCR_TALL1 | ICH_HCR_TC; return; } if (group0_trap) vgic_v3->vgic_hcr |= ICH_HCR_TALL0; if (group1_trap) vgic_v3->vgic_hcr |= ICH_HCR_TALL1; if (common_trap) vgic_v3->vgic_hcr |= ICH_HCR_TC; if (dir_trap) vgic_v3->vgic_hcr |= ICH_HCR_TDIR; } int vgic_v3_lpi_sync_pending_status(struct kvm *kvm, struct vgic_irq *irq) { struct kvm_vcpu *vcpu; int byte_offset, bit_nr; gpa_t pendbase, ptr; bool status; u8 val; int ret; unsigned long flags; retry: vcpu = irq->target_vcpu; if (!vcpu) return 0; pendbase = GICR_PENDBASER_ADDRESS(vcpu->arch.vgic_cpu.pendbaser); byte_offset = irq->intid / BITS_PER_BYTE; bit_nr = irq->intid % BITS_PER_BYTE; ptr = pendbase + byte_offset; ret = kvm_read_guest_lock(kvm, ptr, &val, 1); if (ret) return ret; status = val & (1 << bit_nr); raw_spin_lock_irqsave(&irq->irq_lock, flags); if (irq->target_vcpu != vcpu) { raw_spin_unlock_irqrestore(&irq->irq_lock, flags); goto retry; } irq->pending_latch = status; vgic_queue_irq_unlock(vcpu->kvm, irq, flags); if (status) { /* clear consumed data */ val &= ~(1 << bit_nr); ret = vgic_write_guest_lock(kvm, ptr, &val, 1); if (ret) return ret; } return 0; } /* * The deactivation of the doorbell interrupt will trigger the * unmapping of the associated vPE. */ static void unmap_all_vpes(struct kvm *kvm) { struct vgic_dist *dist = &kvm->arch.vgic; int i; for (i = 0; i < dist->its_vm.nr_vpes; i++) free_irq(dist->its_vm.vpes[i]->irq, kvm_get_vcpu(kvm, i)); } static void map_all_vpes(struct kvm *kvm) { struct vgic_dist *dist = &kvm->arch.vgic; int i; for (i = 0; i < dist->its_vm.nr_vpes; i++) WARN_ON(vgic_v4_request_vpe_irq(kvm_get_vcpu(kvm, i), dist->its_vm.vpes[i]->irq)); } /* * vgic_v3_save_pending_tables - Save the pending tables into guest RAM * kvm lock and all vcpu lock must be held */ int vgic_v3_save_pending_tables(struct kvm *kvm) { struct vgic_dist *dist = &kvm->arch.vgic; struct vgic_irq *irq; gpa_t last_ptr = ~(gpa_t)0; bool vlpi_avail = false; unsigned long index; int ret = 0; u8 val; if (unlikely(!vgic_initialized(kvm))) return -ENXIO; /* * A preparation for getting any VLPI states. * The above vgic initialized check also ensures that the allocation * and enabling of the doorbells have already been done. 
*/ if (kvm_vgic_global_state.has_gicv4_1) { unmap_all_vpes(kvm); vlpi_avail = true; } xa_for_each(&dist->lpi_xa, index, irq) { int byte_offset, bit_nr; struct kvm_vcpu *vcpu; gpa_t pendbase, ptr; bool is_pending; bool stored; vcpu = irq->target_vcpu; if (!vcpu) continue; pendbase = GICR_PENDBASER_ADDRESS(vcpu->arch.vgic_cpu.pendbaser); byte_offset = irq->intid / BITS_PER_BYTE; bit_nr = irq->intid % BITS_PER_BYTE; ptr = pendbase + byte_offset; if (ptr != last_ptr) { ret = kvm_read_guest_lock(kvm, ptr, &val, 1); if (ret) goto out; last_ptr = ptr; } stored = val & (1U << bit_nr); is_pending = irq->pending_latch; if (irq->hw && vlpi_avail) vgic_v4_get_vlpi_state(irq, &is_pending); if (stored == is_pending) continue; if (is_pending) val |= 1 << bit_nr; else val &= ~(1 << bit_nr); ret = vgic_write_guest_lock(kvm, ptr, &val, 1); if (ret) goto out; } out: if (vlpi_avail) map_all_vpes(kvm); return ret; } /** * vgic_v3_rdist_overlap - check if a region overlaps with any * existing redistributor region * * @kvm: kvm handle * @base: base of the region * @size: size of region * * Return: true if there is an overlap */ bool vgic_v3_rdist_overlap(struct kvm *kvm, gpa_t base, size_t size) { struct vgic_dist *d = &kvm->arch.vgic; struct vgic_redist_region *rdreg; list_for_each_entry(rdreg, &d->rd_regions, list) { if ((base + size > rdreg->base) && (base < rdreg->base + vgic_v3_rd_region_size(kvm, rdreg))) return true; } return false; } /* * Check for overlapping regions and for regions crossing the end of memory * for base addresses which have already been set. */ bool vgic_v3_check_base(struct kvm *kvm) { struct vgic_dist *d = &kvm->arch.vgic; struct vgic_redist_region *rdreg; if (!IS_VGIC_ADDR_UNDEF(d->vgic_dist_base) && d->vgic_dist_base + KVM_VGIC_V3_DIST_SIZE < d->vgic_dist_base) return false; list_for_each_entry(rdreg, &d->rd_regions, list) { size_t sz = vgic_v3_rd_region_size(kvm, rdreg); if (vgic_check_iorange(kvm, VGIC_ADDR_UNDEF, rdreg->base, SZ_64K, sz)) return false; } if (IS_VGIC_ADDR_UNDEF(d->vgic_dist_base)) return true; return !vgic_v3_rdist_overlap(kvm, d->vgic_dist_base, KVM_VGIC_V3_DIST_SIZE); } /** * vgic_v3_rdist_free_slot - Look up registered rdist regions and identify one * which has free space to put a new rdist region. * * @rd_regions: redistributor region list head * * A redistributor regions maps n redistributors, n = region size / (2 x 64kB). * Stride between redistributors is 0 and regions are filled in the index order. * * Return: the redist region handle, if any, that has space to map a new rdist * region. 
*/ struct vgic_redist_region *vgic_v3_rdist_free_slot(struct list_head *rd_regions) { struct vgic_redist_region *rdreg; list_for_each_entry(rdreg, rd_regions, list) { if (!vgic_v3_redist_region_full(rdreg)) return rdreg; } return NULL; } struct vgic_redist_region *vgic_v3_rdist_region_from_index(struct kvm *kvm, u32 index) { struct list_head *rd_regions = &kvm->arch.vgic.rd_regions; struct vgic_redist_region *rdreg; list_for_each_entry(rdreg, rd_regions, list) { if (rdreg->index == index) return rdreg; } return NULL; } int vgic_v3_map_resources(struct kvm *kvm) { struct vgic_dist *dist = &kvm->arch.vgic; struct kvm_vcpu *vcpu; unsigned long c; kvm_for_each_vcpu(c, vcpu, kvm) { struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; if (IS_VGIC_ADDR_UNDEF(vgic_cpu->rd_iodev.base_addr)) { kvm_debug("vcpu %ld redistributor base not set\n", c); return -ENXIO; } } if (IS_VGIC_ADDR_UNDEF(dist->vgic_dist_base)) { kvm_debug("Need to set vgic distributor addresses first\n"); return -ENXIO; } if (!vgic_v3_check_base(kvm)) { kvm_debug("VGIC redist and dist frames overlap\n"); return -EINVAL; } /* * For a VGICv3 we require the userland to explicitly initialize * the VGIC before we need to use it. */ if (!vgic_initialized(kvm)) { return -EBUSY; } if (kvm_vgic_global_state.has_gicv4_1) vgic_v4_configure_vsgis(kvm); return 0; } DEFINE_STATIC_KEY_FALSE(vgic_v3_cpuif_trap); static int __init early_group0_trap_cfg(char *buf) { return kstrtobool(buf, &group0_trap); } early_param("kvm-arm.vgic_v3_group0_trap", early_group0_trap_cfg); static int __init early_group1_trap_cfg(char *buf) { return kstrtobool(buf, &group1_trap); } early_param("kvm-arm.vgic_v3_group1_trap", early_group1_trap_cfg); static int __init early_common_trap_cfg(char *buf) { return kstrtobool(buf, &common_trap); } early_param("kvm-arm.vgic_v3_common_trap", early_common_trap_cfg); static int __init early_gicv4_enable(char *buf) { return kstrtobool(buf, &gicv4_enable); } early_param("kvm-arm.vgic_v4_enable", early_gicv4_enable); static const struct midr_range broken_seis[] = { MIDR_ALL_VERSIONS(MIDR_APPLE_M1_ICESTORM), MIDR_ALL_VERSIONS(MIDR_APPLE_M1_FIRESTORM), MIDR_ALL_VERSIONS(MIDR_APPLE_M1_ICESTORM_PRO), MIDR_ALL_VERSIONS(MIDR_APPLE_M1_FIRESTORM_PRO), MIDR_ALL_VERSIONS(MIDR_APPLE_M1_ICESTORM_MAX), MIDR_ALL_VERSIONS(MIDR_APPLE_M1_FIRESTORM_MAX), MIDR_ALL_VERSIONS(MIDR_APPLE_M2_BLIZZARD), MIDR_ALL_VERSIONS(MIDR_APPLE_M2_AVALANCHE), MIDR_ALL_VERSIONS(MIDR_APPLE_M2_BLIZZARD_PRO), MIDR_ALL_VERSIONS(MIDR_APPLE_M2_AVALANCHE_PRO), MIDR_ALL_VERSIONS(MIDR_APPLE_M2_BLIZZARD_MAX), MIDR_ALL_VERSIONS(MIDR_APPLE_M2_AVALANCHE_MAX), {}, }; static bool vgic_v3_broken_seis(void) { return ((kvm_vgic_global_state.ich_vtr_el2 & ICH_VTR_SEIS_MASK) && is_midr_in_range_list(read_cpuid_id(), broken_seis)); } /** * vgic_v3_probe - probe for a VGICv3 compatible interrupt controller * @info: pointer to the GIC description * * Returns 0 if the VGICv3 has been probed successfully, returns an error code * otherwise */ int vgic_v3_probe(const struct gic_kvm_info *info) { u64 ich_vtr_el2 = kvm_call_hyp_ret(__vgic_v3_get_gic_config); bool has_v2; int ret; has_v2 = ich_vtr_el2 >> 63; ich_vtr_el2 = (u32)ich_vtr_el2; /* * The ListRegs field is 5 bits, but there is an architectural * maximum of 16 list registers. Just ignore bit 4... */ kvm_vgic_global_state.nr_lr = (ich_vtr_el2 & 0xf) + 1; kvm_vgic_global_state.can_emulate_gicv2 = false; kvm_vgic_global_state.ich_vtr_el2 = ich_vtr_el2; /* GICv4 support? 
*/ if (info->has_v4) { kvm_vgic_global_state.has_gicv4 = gicv4_enable; kvm_vgic_global_state.has_gicv4_1 = info->has_v4_1 && gicv4_enable; kvm_info("GICv4%s support %s\n", kvm_vgic_global_state.has_gicv4_1 ? ".1" : "", str_enabled_disabled(gicv4_enable)); } kvm_vgic_global_state.vcpu_base = 0; if (!info->vcpu.start) { kvm_info("GICv3: no GICV resource entry\n"); } else if (!has_v2) { pr_warn(FW_BUG "CPU interface incapable of MMIO access\n"); } else if (!PAGE_ALIGNED(info->vcpu.start)) { pr_warn("GICV physical address 0x%llx not page aligned\n", (unsigned long long)info->vcpu.start); } else if (kvm_get_mode() != KVM_MODE_PROTECTED) { kvm_vgic_global_state.vcpu_base = info->vcpu.start; kvm_vgic_global_state.can_emulate_gicv2 = true; ret = kvm_register_vgic_device(KVM_DEV_TYPE_ARM_VGIC_V2); if (ret) { kvm_err("Cannot register GICv2 KVM device.\n"); return ret; } kvm_info("vgic-v2@%llx\n", info->vcpu.start); } ret = kvm_register_vgic_device(KVM_DEV_TYPE_ARM_VGIC_V3); if (ret) { kvm_err("Cannot register GICv3 KVM device.\n"); kvm_unregister_device_ops(KVM_DEV_TYPE_ARM_VGIC_V2); return ret; } if (kvm_vgic_global_state.vcpu_base == 0) kvm_info("disabling GICv2 emulation\n"); if (cpus_have_final_cap(ARM64_WORKAROUND_CAVIUM_30115)) { group0_trap = true; group1_trap = true; } if (vgic_v3_broken_seis()) { kvm_info("GICv3 with broken locally generated SEI\n"); kvm_vgic_global_state.ich_vtr_el2 &= ~ICH_VTR_SEIS_MASK; group0_trap = true; group1_trap = true; if (ich_vtr_el2 & ICH_VTR_TDS_MASK) dir_trap = true; else common_trap = true; } if (group0_trap || group1_trap || common_trap | dir_trap) { kvm_info("GICv3 sysreg trapping enabled ([%s%s%s%s], reduced performance)\n", group0_trap ? "G0" : "", group1_trap ? "G1" : "", common_trap ? "C" : "", dir_trap ? "D" : ""); static_branch_enable(&vgic_v3_cpuif_trap); } kvm_vgic_global_state.vctrl_base = NULL; kvm_vgic_global_state.type = VGIC_V3; kvm_vgic_global_state.max_gic_vcpus = VGIC_V3_MAX_CPUS; return 0; } void vgic_v3_load(struct kvm_vcpu *vcpu) { struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3; if (likely(!is_protected_kvm_enabled())) kvm_call_hyp(__vgic_v3_restore_vmcr_aprs, cpu_if); if (has_vhe()) __vgic_v3_activate_traps(cpu_if); WARN_ON(vgic_v4_load(vcpu)); } void vgic_v3_put(struct kvm_vcpu *vcpu) { struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3; if (likely(!is_protected_kvm_enabled())) kvm_call_hyp(__vgic_v3_save_vmcr_aprs, cpu_if); WARN_ON(vgic_v4_put(vcpu)); if (has_vhe()) __vgic_v3_deactivate_traps(cpu_if); } |
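/*
 * Illustrative sketch (not part of the file above): how a List Register value
 * is composed for the simplest case handled by vgic_v3_populate_lr() - a
 * purely virtual, group-1 interrupt in the pending state. The helper name and
 * its arguments are invented for this example; the ICH_LR_* field definitions
 * come from <linux/irqchip/arm-gic-v3.h>, already included above.
 */
static u64 example_make_pending_lr(u32 intid, u8 priority)
{
	u64 val = intid & ICH_LR_VIRTUAL_ID_MASK;	/* vINTID in bits [31:0] */

	val |= ICH_LR_PENDING_BIT;			/* state = pending */
	val |= ICH_LR_GROUP;				/* group-1 interrupt */
	val |= (u64)priority << ICH_LR_PRIORITY_SHIFT;	/* 8-bit priority field */

	return val;
}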
/* SPDX-License-Identifier: GPL-2.0-or-later */ /* Generic I/O port emulation. * * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #ifndef __ASM_GENERIC_IO_H #define __ASM_GENERIC_IO_H #include <asm/page.h> /* I/O is all done through memory accesses */ #include <linux/string.h> /* for memset() and memcpy() */ #include <linux/sizes.h> #include <linux/types.h> #include <linux/instruction_pointer.h> #ifdef CONFIG_GENERIC_IOMAP #include <asm-generic/iomap.h> #endif #include <asm/mmiowb.h> #include <asm-generic/pci_iomap.h> #ifndef __io_br #define __io_br() barrier() #endif /* prevent prefetching of coherent DMA data ahead of a dma-complete */ #ifndef __io_ar #ifdef rmb #define __io_ar(v) rmb() #else #define __io_ar(v) barrier() #endif #endif /* flush writes to coherent DMA data before possibly triggering a DMA read */ #ifndef __io_bw #ifdef wmb #define __io_bw() wmb() #else #define __io_bw() barrier() #endif #endif /* serialize device access against a spin_unlock, usually handled there. */ #ifndef __io_aw #define __io_aw() mmiowb_set_pending() #endif #ifndef __io_pbw #define __io_pbw() __io_bw() #endif #ifndef __io_paw #define __io_paw() __io_aw() #endif #ifndef __io_pbr #define __io_pbr() __io_br() #endif #ifndef __io_par #define __io_par(v) __io_ar(v) #endif /* * "__DISABLE_TRACE_MMIO__" flag can be used to disable MMIO tracing for * specific kernel drivers in case of excessive/unwanted logging. * * Usage: Add a #define flag at the beginning of the driver file. * Ex: #define __DISABLE_TRACE_MMIO__ * #include <...> * ... 
*/ #if IS_ENABLED(CONFIG_TRACE_MMIO_ACCESS) && !(defined(__DISABLE_TRACE_MMIO__)) #include <linux/tracepoint-defs.h> DECLARE_TRACEPOINT(rwmmio_write); DECLARE_TRACEPOINT(rwmmio_post_write); DECLARE_TRACEPOINT(rwmmio_read); DECLARE_TRACEPOINT(rwmmio_post_read); void log_write_mmio(u64 val, u8 width, volatile void __iomem *addr, unsigned long caller_addr, unsigned long caller_addr0); void log_post_write_mmio(u64 val, u8 width, volatile void __iomem *addr, unsigned long caller_addr, unsigned long caller_addr0); void log_read_mmio(u8 width, const volatile void __iomem *addr, unsigned long caller_addr, unsigned long caller_addr0); void log_post_read_mmio(u64 val, u8 width, const volatile void __iomem *addr, unsigned long caller_addr, unsigned long caller_addr0); #else static inline void log_write_mmio(u64 val, u8 width, volatile void __iomem *addr, unsigned long caller_addr, unsigned long caller_addr0) {} static inline void log_post_write_mmio(u64 val, u8 width, volatile void __iomem *addr, unsigned long caller_addr, unsigned long caller_addr0) {} static inline void log_read_mmio(u8 width, const volatile void __iomem *addr, unsigned long caller_addr, unsigned long caller_addr0) {} static inline void log_post_read_mmio(u64 val, u8 width, const volatile void __iomem *addr, unsigned long caller_addr, unsigned long caller_addr0) {} #endif /* CONFIG_TRACE_MMIO_ACCESS */ /* * __raw_{read,write}{b,w,l,q}() access memory in native endianness. * * On some architectures memory mapped IO needs to be accessed differently. * On the simple architectures, we just read/write the memory location * directly. */ #ifndef __raw_readb #define __raw_readb __raw_readb static inline u8 __raw_readb(const volatile void __iomem *addr) { return *(const volatile u8 __force *)addr; } #endif #ifndef __raw_readw #define __raw_readw __raw_readw static inline u16 __raw_readw(const volatile void __iomem *addr) { return *(const volatile u16 __force *)addr; } #endif #ifndef __raw_readl #define __raw_readl __raw_readl static inline u32 __raw_readl(const volatile void __iomem *addr) { return *(const volatile u32 __force *)addr; } #endif #ifdef CONFIG_64BIT #ifndef __raw_readq #define __raw_readq __raw_readq static inline u64 __raw_readq(const volatile void __iomem *addr) { return *(const volatile u64 __force *)addr; } #endif #endif /* CONFIG_64BIT */ #ifndef __raw_writeb #define __raw_writeb __raw_writeb static inline void __raw_writeb(u8 value, volatile void __iomem *addr) { *(volatile u8 __force *)addr = value; } #endif #ifndef __raw_writew #define __raw_writew __raw_writew static inline void __raw_writew(u16 value, volatile void __iomem *addr) { *(volatile u16 __force *)addr = value; } #endif #ifndef __raw_writel #define __raw_writel __raw_writel static inline void __raw_writel(u32 value, volatile void __iomem *addr) { *(volatile u32 __force *)addr = value; } #endif #ifdef CONFIG_64BIT #ifndef __raw_writeq #define __raw_writeq __raw_writeq static inline void __raw_writeq(u64 value, volatile void __iomem *addr) { *(volatile u64 __force *)addr = value; } #endif #endif /* CONFIG_64BIT */ /* * {read,write}{b,w,l,q}() access little endian memory and return result in * native endianness. 
*/ #ifndef readb #define readb readb static inline u8 readb(const volatile void __iomem *addr) { u8 val; log_read_mmio(8, addr, _THIS_IP_, _RET_IP_); __io_br(); val = __raw_readb(addr); __io_ar(val); log_post_read_mmio(val, 8, addr, _THIS_IP_, _RET_IP_); return val; } #endif #ifndef readw #define readw readw static inline u16 readw(const volatile void __iomem *addr) { u16 val; log_read_mmio(16, addr, _THIS_IP_, _RET_IP_); __io_br(); val = __le16_to_cpu((__le16 __force)__raw_readw(addr)); __io_ar(val); log_post_read_mmio(val, 16, addr, _THIS_IP_, _RET_IP_); return val; } #endif #ifndef readl #define readl readl static inline u32 readl(const volatile void __iomem *addr) { u32 val; log_read_mmio(32, addr, _THIS_IP_, _RET_IP_); __io_br(); val = __le32_to_cpu((__le32 __force)__raw_readl(addr)); __io_ar(val); log_post_read_mmio(val, 32, addr, _THIS_IP_, _RET_IP_); return val; } #endif #ifdef CONFIG_64BIT #ifndef readq #define readq readq static inline u64 readq(const volatile void __iomem *addr) { u64 val; log_read_mmio(64, addr, _THIS_IP_, _RET_IP_); __io_br(); val = __le64_to_cpu((__le64 __force)__raw_readq(addr)); __io_ar(val); log_post_read_mmio(val, 64, addr, _THIS_IP_, _RET_IP_); return val; } #endif #endif /* CONFIG_64BIT */ #ifndef writeb #define writeb writeb static inline void writeb(u8 value, volatile void __iomem *addr) { log_write_mmio(value, 8, addr, _THIS_IP_, _RET_IP_); __io_bw(); __raw_writeb(value, addr); __io_aw(); log_post_write_mmio(value, 8, addr, _THIS_IP_, _RET_IP_); } #endif #ifndef writew #define writew writew static inline void writew(u16 value, volatile void __iomem *addr) { log_write_mmio(value, 16, addr, _THIS_IP_, _RET_IP_); __io_bw(); __raw_writew((u16 __force)cpu_to_le16(value), addr); __io_aw(); log_post_write_mmio(value, 16, addr, _THIS_IP_, _RET_IP_); } #endif #ifndef writel #define writel writel static inline void writel(u32 value, volatile void __iomem *addr) { log_write_mmio(value, 32, addr, _THIS_IP_, _RET_IP_); __io_bw(); __raw_writel((u32 __force)__cpu_to_le32(value), addr); __io_aw(); log_post_write_mmio(value, 32, addr, _THIS_IP_, _RET_IP_); } #endif #ifdef CONFIG_64BIT #ifndef writeq #define writeq writeq static inline void writeq(u64 value, volatile void __iomem *addr) { log_write_mmio(value, 64, addr, _THIS_IP_, _RET_IP_); __io_bw(); __raw_writeq((u64 __force)__cpu_to_le64(value), addr); __io_aw(); log_post_write_mmio(value, 64, addr, _THIS_IP_, _RET_IP_); } #endif #endif /* CONFIG_64BIT */ /* * {read,write}{b,w,l,q}_relaxed() are like the regular version, but * are not guaranteed to provide ordering against spinlocks or memory * accesses. 
*/ #ifndef readb_relaxed #define readb_relaxed readb_relaxed static inline u8 readb_relaxed(const volatile void __iomem *addr) { u8 val; log_read_mmio(8, addr, _THIS_IP_, _RET_IP_); val = __raw_readb(addr); log_post_read_mmio(val, 8, addr, _THIS_IP_, _RET_IP_); return val; } #endif #ifndef readw_relaxed #define readw_relaxed readw_relaxed static inline u16 readw_relaxed(const volatile void __iomem *addr) { u16 val; log_read_mmio(16, addr, _THIS_IP_, _RET_IP_); val = __le16_to_cpu((__le16 __force)__raw_readw(addr)); log_post_read_mmio(val, 16, addr, _THIS_IP_, _RET_IP_); return val; } #endif #ifndef readl_relaxed #define readl_relaxed readl_relaxed static inline u32 readl_relaxed(const volatile void __iomem *addr) { u32 val; log_read_mmio(32, addr, _THIS_IP_, _RET_IP_); val = __le32_to_cpu((__le32 __force)__raw_readl(addr)); log_post_read_mmio(val, 32, addr, _THIS_IP_, _RET_IP_); return val; } #endif #if defined(readq) && !defined(readq_relaxed) #define readq_relaxed readq_relaxed static inline u64 readq_relaxed(const volatile void __iomem *addr) { u64 val; log_read_mmio(64, addr, _THIS_IP_, _RET_IP_); val = __le64_to_cpu((__le64 __force)__raw_readq(addr)); log_post_read_mmio(val, 64, addr, _THIS_IP_, _RET_IP_); return val; } #endif #ifndef writeb_relaxed #define writeb_relaxed writeb_relaxed static inline void writeb_relaxed(u8 value, volatile void __iomem *addr) { log_write_mmio(value, 8, addr, _THIS_IP_, _RET_IP_); __raw_writeb(value, addr); log_post_write_mmio(value, 8, addr, _THIS_IP_, _RET_IP_); } #endif #ifndef writew_relaxed #define writew_relaxed writew_relaxed static inline void writew_relaxed(u16 value, volatile void __iomem *addr) { log_write_mmio(value, 16, addr, _THIS_IP_, _RET_IP_); __raw_writew((u16 __force)cpu_to_le16(value), addr); log_post_write_mmio(value, 16, addr, _THIS_IP_, _RET_IP_); } #endif #ifndef writel_relaxed #define writel_relaxed writel_relaxed static inline void writel_relaxed(u32 value, volatile void __iomem *addr) { log_write_mmio(value, 32, addr, _THIS_IP_, _RET_IP_); __raw_writel((u32 __force)__cpu_to_le32(value), addr); log_post_write_mmio(value, 32, addr, _THIS_IP_, _RET_IP_); } #endif #if defined(writeq) && !defined(writeq_relaxed) #define writeq_relaxed writeq_relaxed static inline void writeq_relaxed(u64 value, volatile void __iomem *addr) { log_write_mmio(value, 64, addr, _THIS_IP_, _RET_IP_); __raw_writeq((u64 __force)__cpu_to_le64(value), addr); log_post_write_mmio(value, 64, addr, _THIS_IP_, _RET_IP_); } #endif /* * {read,write}s{b,w,l,q}() repeatedly access the same memory address in * native endianness in 8-, 16-, 32- or 64-bit chunks (@count times). 
*/ #ifndef readsb #define readsb readsb static inline void readsb(const volatile void __iomem *addr, void *buffer, unsigned int count) { if (count) { u8 *buf = buffer; do { u8 x = __raw_readb(addr); *buf++ = x; } while (--count); } } #endif #ifndef readsw #define readsw readsw static inline void readsw(const volatile void __iomem *addr, void *buffer, unsigned int count) { if (count) { u16 *buf = buffer; do { u16 x = __raw_readw(addr); *buf++ = x; } while (--count); } } #endif #ifndef readsl #define readsl readsl static inline void readsl(const volatile void __iomem *addr, void *buffer, unsigned int count) { if (count) { u32 *buf = buffer; do { u32 x = __raw_readl(addr); *buf++ = x; } while (--count); } } #endif #ifdef CONFIG_64BIT #ifndef readsq #define readsq readsq static inline void readsq(const volatile void __iomem *addr, void *buffer, unsigned int count) { if (count) { u64 *buf = buffer; do { u64 x = __raw_readq(addr); *buf++ = x; } while (--count); } } #endif #endif /* CONFIG_64BIT */ #ifndef writesb #define writesb writesb static inline void writesb(volatile void __iomem *addr, const void *buffer, unsigned int count) { if (count) { const u8 *buf = buffer; do { __raw_writeb(*buf++, addr); } while (--count); } } #endif #ifndef writesw #define writesw writesw static inline void writesw(volatile void __iomem *addr, const void *buffer, unsigned int count) { if (count) { const u16 *buf = buffer; do { __raw_writew(*buf++, addr); } while (--count); } } #endif #ifndef writesl #define writesl writesl static inline void writesl(volatile void __iomem *addr, const void *buffer, unsigned int count) { if (count) { const u32 *buf = buffer; do { __raw_writel(*buf++, addr); } while (--count); } } #endif #ifdef CONFIG_64BIT #ifndef writesq #define writesq writesq static inline void writesq(volatile void __iomem *addr, const void *buffer, unsigned int count) { if (count) { const u64 *buf = buffer; do { __raw_writeq(*buf++, addr); } while (--count); } } #endif #endif /* CONFIG_64BIT */ #ifndef PCI_IOBASE #define PCI_IOBASE ((void __iomem *)0) #endif #ifndef IO_SPACE_LIMIT #define IO_SPACE_LIMIT 0xffff #endif /* * {in,out}{b,w,l}() access little endian I/O. {in,out}{b,w,l}_p() can be * implemented on hardware that needs an additional delay for I/O accesses to * take effect. 
*/ #if !defined(inb) && !defined(_inb) #define _inb _inb #ifdef CONFIG_HAS_IOPORT static inline u8 _inb(unsigned long addr) { u8 val; __io_pbr(); val = __raw_readb(PCI_IOBASE + addr); __io_par(val); return val; } #else u8 _inb(unsigned long addr) __compiletime_error("inb()) requires CONFIG_HAS_IOPORT"); #endif #endif #if !defined(inw) && !defined(_inw) #define _inw _inw #ifdef CONFIG_HAS_IOPORT static inline u16 _inw(unsigned long addr) { u16 val; __io_pbr(); val = __le16_to_cpu((__le16 __force)__raw_readw(PCI_IOBASE + addr)); __io_par(val); return val; } #else u16 _inw(unsigned long addr) __compiletime_error("inw() requires CONFIG_HAS_IOPORT"); #endif #endif #if !defined(inl) && !defined(_inl) #define _inl _inl #ifdef CONFIG_HAS_IOPORT static inline u32 _inl(unsigned long addr) { u32 val; __io_pbr(); val = __le32_to_cpu((__le32 __force)__raw_readl(PCI_IOBASE + addr)); __io_par(val); return val; } #else u32 _inl(unsigned long addr) __compiletime_error("inl() requires CONFIG_HAS_IOPORT"); #endif #endif #if !defined(outb) && !defined(_outb) #define _outb _outb #ifdef CONFIG_HAS_IOPORT static inline void _outb(u8 value, unsigned long addr) { __io_pbw(); __raw_writeb(value, PCI_IOBASE + addr); __io_paw(); } #else void _outb(u8 value, unsigned long addr) __compiletime_error("outb() requires CONFIG_HAS_IOPORT"); #endif #endif #if !defined(outw) && !defined(_outw) #define _outw _outw #ifdef CONFIG_HAS_IOPORT static inline void _outw(u16 value, unsigned long addr) { __io_pbw(); __raw_writew((u16 __force)cpu_to_le16(value), PCI_IOBASE + addr); __io_paw(); } #else void _outw(u16 value, unsigned long addr) __compiletime_error("outw() requires CONFIG_HAS_IOPORT"); #endif #endif #if !defined(outl) && !defined(_outl) #define _outl _outl #ifdef CONFIG_HAS_IOPORT static inline void _outl(u32 value, unsigned long addr) { __io_pbw(); __raw_writel((u32 __force)cpu_to_le32(value), PCI_IOBASE + addr); __io_paw(); } #else void _outl(u32 value, unsigned long addr) __compiletime_error("outl() requires CONFIG_HAS_IOPORT"); #endif #endif #include <linux/logic_pio.h> #ifndef inb #define inb _inb #endif #ifndef inw #define inw _inw #endif #ifndef inl #define inl _inl #endif #ifndef outb #define outb _outb #endif #ifndef outw #define outw _outw #endif #ifndef outl #define outl _outl #endif #ifndef inb_p #define inb_p inb_p static inline u8 inb_p(unsigned long addr) { return inb(addr); } #endif #ifndef inw_p #define inw_p inw_p static inline u16 inw_p(unsigned long addr) { return inw(addr); } #endif #ifndef inl_p #define inl_p inl_p static inline u32 inl_p(unsigned long addr) { return inl(addr); } #endif #ifndef outb_p #define outb_p outb_p static inline void outb_p(u8 value, unsigned long addr) { outb(value, addr); } #endif #ifndef outw_p #define outw_p outw_p static inline void outw_p(u16 value, unsigned long addr) { outw(value, addr); } #endif #ifndef outl_p #define outl_p outl_p static inline void outl_p(u32 value, unsigned long addr) { outl(value, addr); } #endif /* * {in,out}s{b,w,l}{,_p}() are variants of the above that repeatedly access a * single I/O port multiple times. 
*/ #ifndef insb #define insb insb #ifdef CONFIG_HAS_IOPORT static inline void insb(unsigned long addr, void *buffer, unsigned int count) { readsb(PCI_IOBASE + addr, buffer, count); } #else void insb(unsigned long addr, void *buffer, unsigned int count) __compiletime_error("insb() requires HAS_IOPORT"); #endif #endif #ifndef insw #define insw insw #ifdef CONFIG_HAS_IOPORT static inline void insw(unsigned long addr, void *buffer, unsigned int count) { readsw(PCI_IOBASE + addr, buffer, count); } #else void insw(unsigned long addr, void *buffer, unsigned int count) __compiletime_error("insw() requires HAS_IOPORT"); #endif #endif #ifndef insl #define insl insl #ifdef CONFIG_HAS_IOPORT static inline void insl(unsigned long addr, void *buffer, unsigned int count) { readsl(PCI_IOBASE + addr, buffer, count); } #else void insl(unsigned long addr, void *buffer, unsigned int count) __compiletime_error("insl() requires HAS_IOPORT"); #endif #endif #ifndef outsb #define outsb outsb #ifdef CONFIG_HAS_IOPORT static inline void outsb(unsigned long addr, const void *buffer, unsigned int count) { writesb(PCI_IOBASE + addr, buffer, count); } #else void outsb(unsigned long addr, const void *buffer, unsigned int count) __compiletime_error("outsb() requires HAS_IOPORT"); #endif #endif #ifndef outsw #define outsw outsw #ifdef CONFIG_HAS_IOPORT static inline void outsw(unsigned long addr, const void *buffer, unsigned int count) { writesw(PCI_IOBASE + addr, buffer, count); } #else void outsw(unsigned long addr, const void *buffer, unsigned int count) __compiletime_error("outsw() requires HAS_IOPORT"); #endif #endif #ifndef outsl #define outsl outsl #ifdef CONFIG_HAS_IOPORT static inline void outsl(unsigned long addr, const void *buffer, unsigned int count) { writesl(PCI_IOBASE + addr, buffer, count); } #else void outsl(unsigned long addr, const void *buffer, unsigned int count) __compiletime_error("outsl() requires HAS_IOPORT"); #endif #endif #ifndef insb_p #define insb_p insb_p static inline void insb_p(unsigned long addr, void *buffer, unsigned int count) { insb(addr, buffer, count); } #endif #ifndef insw_p #define insw_p insw_p static inline void insw_p(unsigned long addr, void *buffer, unsigned int count) { insw(addr, buffer, count); } #endif #ifndef insl_p #define insl_p insl_p static inline void insl_p(unsigned long addr, void *buffer, unsigned int count) { insl(addr, buffer, count); } #endif #ifndef outsb_p #define outsb_p outsb_p static inline void outsb_p(unsigned long addr, const void *buffer, unsigned int count) { outsb(addr, buffer, count); } #endif #ifndef outsw_p #define outsw_p outsw_p static inline void outsw_p(unsigned long addr, const void *buffer, unsigned int count) { outsw(addr, buffer, count); } #endif #ifndef outsl_p #define outsl_p outsl_p static inline void outsl_p(unsigned long addr, const void *buffer, unsigned int count) { outsl(addr, buffer, count); } #endif #ifndef CONFIG_GENERIC_IOMAP #ifndef ioread8 #define ioread8 ioread8 static inline u8 ioread8(const volatile void __iomem *addr) { return readb(addr); } #endif #ifndef ioread16 #define ioread16 ioread16 static inline u16 ioread16(const volatile void __iomem *addr) { return readw(addr); } #endif #ifndef ioread32 #define ioread32 ioread32 static inline u32 ioread32(const volatile void __iomem *addr) { return readl(addr); } #endif #ifdef CONFIG_64BIT #ifndef ioread64 #define ioread64 ioread64 static inline u64 ioread64(const volatile void __iomem *addr) { return readq(addr); } #endif #endif /* CONFIG_64BIT */ #ifndef iowrite8 #define 
iowrite8 iowrite8 static inline void iowrite8(u8 value, volatile void __iomem *addr) { writeb(value, addr); } #endif #ifndef iowrite16 #define iowrite16 iowrite16 static inline void iowrite16(u16 value, volatile void __iomem *addr) { writew(value, addr); } #endif #ifndef iowrite32 #define iowrite32 iowrite32 static inline void iowrite32(u32 value, volatile void __iomem *addr) { writel(value, addr); } #endif #ifdef CONFIG_64BIT #ifndef iowrite64 #define iowrite64 iowrite64 static inline void iowrite64(u64 value, volatile void __iomem *addr) { writeq(value, addr); } #endif #endif /* CONFIG_64BIT */ #ifndef ioread16be #define ioread16be ioread16be static inline u16 ioread16be(const volatile void __iomem *addr) { return swab16(readw(addr)); } #endif #ifndef ioread32be #define ioread32be ioread32be static inline u32 ioread32be(const volatile void __iomem *addr) { return swab32(readl(addr)); } #endif #ifdef CONFIG_64BIT #ifndef ioread64be #define ioread64be ioread64be static inline u64 ioread64be(const volatile void __iomem *addr) { return swab64(readq(addr)); } #endif #endif /* CONFIG_64BIT */ #ifndef iowrite16be #define iowrite16be iowrite16be static inline void iowrite16be(u16 value, void volatile __iomem *addr) { writew(swab16(value), addr); } #endif #ifndef iowrite32be #define iowrite32be iowrite32be static inline void iowrite32be(u32 value, volatile void __iomem *addr) { writel(swab32(value), addr); } #endif #ifdef CONFIG_64BIT #ifndef iowrite64be #define iowrite64be iowrite64be static inline void iowrite64be(u64 value, volatile void __iomem *addr) { writeq(swab64(value), addr); } #endif #endif /* CONFIG_64BIT */ #ifndef ioread8_rep #define ioread8_rep ioread8_rep static inline void ioread8_rep(const volatile void __iomem *addr, void *buffer, unsigned int count) { readsb(addr, buffer, count); } #endif #ifndef ioread16_rep #define ioread16_rep ioread16_rep static inline void ioread16_rep(const volatile void __iomem *addr, void *buffer, unsigned int count) { readsw(addr, buffer, count); } #endif #ifndef ioread32_rep #define ioread32_rep ioread32_rep static inline void ioread32_rep(const volatile void __iomem *addr, void *buffer, unsigned int count) { readsl(addr, buffer, count); } #endif #ifdef CONFIG_64BIT #ifndef ioread64_rep #define ioread64_rep ioread64_rep static inline void ioread64_rep(const volatile void __iomem *addr, void *buffer, unsigned int count) { readsq(addr, buffer, count); } #endif #endif /* CONFIG_64BIT */ #ifndef iowrite8_rep #define iowrite8_rep iowrite8_rep static inline void iowrite8_rep(volatile void __iomem *addr, const void *buffer, unsigned int count) { writesb(addr, buffer, count); } #endif #ifndef iowrite16_rep #define iowrite16_rep iowrite16_rep static inline void iowrite16_rep(volatile void __iomem *addr, const void *buffer, unsigned int count) { writesw(addr, buffer, count); } #endif #ifndef iowrite32_rep #define iowrite32_rep iowrite32_rep static inline void iowrite32_rep(volatile void __iomem *addr, const void *buffer, unsigned int count) { writesl(addr, buffer, count); } #endif #ifdef CONFIG_64BIT #ifndef iowrite64_rep #define iowrite64_rep iowrite64_rep static inline void iowrite64_rep(volatile void __iomem *addr, const void *buffer, unsigned int count) { writesq(addr, buffer, count); } #endif #endif /* CONFIG_64BIT */ #endif /* CONFIG_GENERIC_IOMAP */ #ifdef __KERNEL__ #define __io_virt(x) ((void __force *)(x)) /* * Change virtual addresses to physical addresses and vv. 
* These are pretty trivial */ #ifndef virt_to_phys #define virt_to_phys virt_to_phys static inline unsigned long virt_to_phys(volatile void *address) { return __pa((unsigned long)address); } #endif #ifndef phys_to_virt #define phys_to_virt phys_to_virt static inline void *phys_to_virt(unsigned long address) { return __va(address); } #endif /** * DOC: ioremap() and ioremap_*() variants * * Architectures with an MMU are expected to provide ioremap() and iounmap() * themselves or rely on GENERIC_IOREMAP. For NOMMU architectures we provide * a default no-op implementation that expects the physical addresses used * for MMIO to already be marked as uncached, and to be usable as kernel virtual * addresses. * * ioremap_wc() and ioremap_wt() can provide more relaxed caching attributes * for specific drivers if the architecture chooses to implement them. If they * are not implemented we fall back to plain ioremap(). Conversely, ioremap_np() * can provide stricter non-posted write semantics if the architecture * implements them. */ #ifndef CONFIG_MMU #ifndef ioremap #define ioremap ioremap static inline void __iomem *ioremap(phys_addr_t offset, size_t size) { return (void __iomem *)(unsigned long)offset; } #endif #ifndef iounmap #define iounmap iounmap static inline void iounmap(volatile void __iomem *addr) { } #endif #elif defined(CONFIG_GENERIC_IOREMAP) #include <linux/pgtable.h> void __iomem *generic_ioremap_prot(phys_addr_t phys_addr, size_t size, pgprot_t prot); void __iomem *ioremap_prot(phys_addr_t phys_addr, size_t size, unsigned long prot); void iounmap(volatile void __iomem *addr); void generic_iounmap(volatile void __iomem *addr); #ifndef ioremap #define ioremap ioremap static inline void __iomem *ioremap(phys_addr_t addr, size_t size) { /* _PAGE_IOREMAP needs to be supplied by the architecture */ return ioremap_prot(addr, size, _PAGE_IOREMAP); } #endif #endif /* !CONFIG_MMU || CONFIG_GENERIC_IOREMAP */ #ifndef ioremap_wc #define ioremap_wc ioremap #endif #ifndef ioremap_wt #define ioremap_wt ioremap #endif /* * ioremap_uc is special in that we do require an explicit architecture * implementation. In general you do not want to use this function in a * driver; use plain ioremap(), which is uncached by default, instead. Similarly, * architectures should not implement it unless they have a very good * reason. */ #ifndef ioremap_uc #define ioremap_uc ioremap_uc static inline void __iomem *ioremap_uc(phys_addr_t offset, size_t size) { return NULL; } #endif /* * ioremap_np needs an explicit architecture implementation, as it * requests stronger semantics than regular ioremap(). Portable drivers * should instead use one of the higher-level abstractions, like * devm_ioremap_resource(), to choose the correct variant for any given * device and bus. Portable drivers with a good reason to want non-posted * write semantics should always provide an ioremap() fallback in case * ioremap_np() is not available. */ #ifndef ioremap_np #define ioremap_np ioremap_np static inline void __iomem *ioremap_np(phys_addr_t offset, size_t size) { return NULL; } #endif #ifdef CONFIG_HAS_IOPORT_MAP #ifndef CONFIG_GENERIC_IOMAP #ifndef ioport_map #define ioport_map ioport_map static inline void __iomem *ioport_map(unsigned long port, unsigned int nr) { port &= IO_SPACE_LIMIT; return (port > MMIO_UPPER_LIMIT) ?
NULL : PCI_IOBASE + port; } #define ARCH_HAS_GENERIC_IOPORT_MAP #endif #ifndef ioport_unmap #define ioport_unmap ioport_unmap static inline void ioport_unmap(void __iomem *p) { } #endif #else /* CONFIG_GENERIC_IOMAP */ extern void __iomem *ioport_map(unsigned long port, unsigned int nr); extern void ioport_unmap(void __iomem *p); #endif /* CONFIG_GENERIC_IOMAP */ #endif /* CONFIG_HAS_IOPORT_MAP */ #ifndef CONFIG_GENERIC_IOMAP #ifndef pci_iounmap #define ARCH_WANTS_GENERIC_PCI_IOUNMAP #endif #endif #ifndef xlate_dev_mem_ptr #define xlate_dev_mem_ptr xlate_dev_mem_ptr static inline void *xlate_dev_mem_ptr(phys_addr_t addr) { return __va(addr); } #endif #ifndef unxlate_dev_mem_ptr #define unxlate_dev_mem_ptr unxlate_dev_mem_ptr static inline void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr) { } #endif #ifndef memset_io /** * memset_io() - Set a range of I/O memory to a constant value * @addr: The beginning of the I/O-memory range to set * @val: The value to set the memory to * @count: The number of bytes to set * * Set a range of I/O memory to a given value. */ void memset_io(volatile void __iomem *addr, int val, size_t count); #endif #ifndef memcpy_fromio /** * memcpy_fromio() - Copy a block of data from I/O memory * @dst: The (RAM) destination for the copy * @src: The (I/O memory) source for the data * @count: The number of bytes to copy * * Copy a block of data from I/O memory. */ void memcpy_fromio(void *dst, const volatile void __iomem *src, size_t count); #endif #ifndef memcpy_toio /** * memcpy_toio() - Copy a block of data into I/O memory * @dst: The (I/O memory) destination for the copy * @src: The (RAM) source for the data * @count: The number of bytes to copy * * Copy a block of data to I/O memory. */ void memcpy_toio(volatile void __iomem *dst, const void *src, size_t count); #endif extern int devmem_is_allowed(unsigned long pfn); #endif /* __KERNEL__ */ #endif /* __ASM_GENERIC_IO_H */
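As a minimal, hedged sketch (not part of asm-generic/io.h), the following shows one way a hypothetical platform driver could combine the accessors declared above: ioremap() maps the device's physical register window, ioread32()/iowrite32() perform the MMIO accesses, and iounmap() releases the mapping. The register offsets, the enable bit, and the function name are invented for the example and do not correspond to any real device.

/* Invented register layout for the example device. */
#define EXAMPLE_CTRL		0x00
#define EXAMPLE_STATUS		0x04
#define EXAMPLE_CTRL_ENABLE	0x1

static int example_hw_enable(phys_addr_t regs_phys, size_t regs_size)
{
	void __iomem *regs;
	u32 status;

	/* Map the device's physical register window into kernel virtual space. */
	regs = ioremap(regs_phys, regs_size);
	if (!regs)
		return -ENOMEM;

	/* MMIO goes through the accessors, never through plain pointer loads/stores. */
	iowrite32(EXAMPLE_CTRL_ENABLE, regs + EXAMPLE_CTRL);
	status = ioread32(regs + EXAMPLE_STATUS);

	iounmap(regs);

	return status ? 0 : -EIO;
}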
/* SPDX-License-Identifier: GPL-2.0-or-later */ /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Global definitions for the Ethernet IEEE 802.3 interface. * * Version: @(#)if_ether.h 1.0.1a 02/08/94 * * Author: Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Donald Becker, <becker@super.org> * Alan Cox, <alan@lxorguk.ukuu.org.uk> * Steve Whitehouse, <gw7rrm@eeshack3.swan.ac.uk> */ #ifndef _LINUX_IF_ETHER_H #define _LINUX_IF_ETHER_H #include <linux/skbuff.h> #include <uapi/linux/if_ether.h> static inline struct ethhdr *eth_hdr(const struct sk_buff *skb) { return (struct ethhdr *)skb_mac_header(skb); } /* Prefer this version in TX path, instead of * skb_reset_mac_header() + eth_hdr() */ static inline struct ethhdr *skb_eth_hdr(const struct sk_buff *skb) { return (struct ethhdr *)skb->data; } static inline struct ethhdr *inner_eth_hdr(const struct sk_buff *skb) { return (struct ethhdr *)skb_inner_mac_header(skb); } int eth_header_parse(const struct sk_buff *skb, unsigned char *haddr); extern ssize_t sysfs_format_mac(char *buf, const unsigned char *addr, int len); #endif /* _LINUX_IF_ETHER_H */
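As a small, hedged illustration (not part of if_ether.h), receive-path code might use eth_hdr() like this once the mac header has been set (for example after eth_type_trans()). The helper name is invented for the example.

static bool example_is_ipv4_frame(const struct sk_buff *skb)
{
	const struct ethhdr *eth = eth_hdr(skb);

	/* h_proto is in network byte order on the wire, so compare against htons(). */
	return eth->h_proto == htons(ETH_P_IP);
}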
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_SWAP_H #define _LINUX_SWAP_H #include <linux/spinlock.h> #include <linux/linkage.h> #include <linux/mmzone.h> #include <linux/list.h> #include <linux/memcontrol.h> #include <linux/sched.h> #include <linux/node.h> #include <linux/fs.h> #include <linux/pagemap.h> #include <linux/atomic.h> #include <linux/page-flags.h> #include <uapi/linux/mempolicy.h> #include <asm/page.h> struct notifier_block; struct bio; struct pagevec; #define SWAP_FLAG_PREFER 0x8000 /* set if swap priority specified */ #define SWAP_FLAG_PRIO_MASK 0x7fff #define SWAP_FLAG_PRIO_SHIFT 0 #define SWAP_FLAG_DISCARD 0x10000 /* enable discard for swap */ #define SWAP_FLAG_DISCARD_ONCE 0x20000 /* discard swap area at swapon-time */ #define SWAP_FLAG_DISCARD_PAGES 0x40000 /* discard
page-clusters after use */ #define SWAP_FLAGS_VALID (SWAP_FLAG_PRIO_MASK | SWAP_FLAG_PREFER | \ SWAP_FLAG_DISCARD | SWAP_FLAG_DISCARD_ONCE | \ SWAP_FLAG_DISCARD_PAGES) #define SWAP_BATCH 64 static inline int current_is_kswapd(void) { return current->flags & PF_KSWAPD; } /* * MAX_SWAPFILES defines the maximum number of swaptypes: things which can * be swapped to. The swap type and the offset into that swap type are * encoded into pte's and into pgoff_t's in the swapcache. Using five bits * for the type means that the maximum number of swapcache pages is 27 bits * on 32-bit-pgoff_t architectures. And that assumes that the architecture packs * the type/offset into the pte as 5/27 as well. */ #define MAX_SWAPFILES_SHIFT 5 /* * Use some of the swap files numbers for other purposes. This * is a convenient way to hook into the VM to trigger special * actions on faults. */ /* * PTE markers are used to persist information onto PTEs that otherwise * should be a none pte. As its name "PTE" hints, it should only be * applied to the leaves of pgtables. */ #define SWP_PTE_MARKER_NUM 1 #define SWP_PTE_MARKER (MAX_SWAPFILES + SWP_HWPOISON_NUM + \ SWP_MIGRATION_NUM + SWP_DEVICE_NUM) /* * Unaddressable device memory support. See include/linux/hmm.h and * Documentation/mm/hmm.rst. Short description is we need struct pages for * device memory that is unaddressable (inaccessible) by CPU, so that we can * migrate part of a process memory to device memory. * * When a page is migrated from CPU to device, we set the CPU page table entry * to a special SWP_DEVICE_{READ|WRITE} entry. * * When a page is mapped by the device for exclusive access we set the CPU page * table entries to special SWP_DEVICE_EXCLUSIVE_* entries. */ #ifdef CONFIG_DEVICE_PRIVATE #define SWP_DEVICE_NUM 4 #define SWP_DEVICE_WRITE (MAX_SWAPFILES+SWP_HWPOISON_NUM+SWP_MIGRATION_NUM) #define SWP_DEVICE_READ (MAX_SWAPFILES+SWP_HWPOISON_NUM+SWP_MIGRATION_NUM+1) #define SWP_DEVICE_EXCLUSIVE_WRITE (MAX_SWAPFILES+SWP_HWPOISON_NUM+SWP_MIGRATION_NUM+2) #define SWP_DEVICE_EXCLUSIVE_READ (MAX_SWAPFILES+SWP_HWPOISON_NUM+SWP_MIGRATION_NUM+3) #else #define SWP_DEVICE_NUM 0 #endif /* * Page migration support. * * SWP_MIGRATION_READ_EXCLUSIVE is only applicable to anonymous pages and * indicates that the referenced (part of) an anonymous page is exclusive to * a single process. For SWP_MIGRATION_WRITE, that information is implicit: * (part of) an anonymous page that are mapped writable are exclusive to a * single process. */ #ifdef CONFIG_MIGRATION #define SWP_MIGRATION_NUM 3 #define SWP_MIGRATION_READ (MAX_SWAPFILES + SWP_HWPOISON_NUM) #define SWP_MIGRATION_READ_EXCLUSIVE (MAX_SWAPFILES + SWP_HWPOISON_NUM + 1) #define SWP_MIGRATION_WRITE (MAX_SWAPFILES + SWP_HWPOISON_NUM + 2) #else #define SWP_MIGRATION_NUM 0 #endif /* * Handling of hardware poisoned pages with memory corruption. */ #ifdef CONFIG_MEMORY_FAILURE #define SWP_HWPOISON_NUM 1 #define SWP_HWPOISON MAX_SWAPFILES #else #define SWP_HWPOISON_NUM 0 #endif #define MAX_SWAPFILES \ ((1 << MAX_SWAPFILES_SHIFT) - SWP_DEVICE_NUM - \ SWP_MIGRATION_NUM - SWP_HWPOISON_NUM - \ SWP_PTE_MARKER_NUM) /* * Magic header for a swap area. The first part of the union is * what the swap magic looks like for the old (limited to 128MB) * swap area format, the second part of the union adds - in the * old reserved area - some extra information. Note that the first * kilobyte is reserved for boot loader or disk label stuff... 
* * Having the magic at the end of the PAGE_SIZE makes detecting swap * areas somewhat tricky on machines that support multiple page sizes. * For 2.5 we'll probably want to move the magic to just beyond the * bootbits... */ union swap_header { struct { char reserved[PAGE_SIZE - 10]; char magic[10]; /* SWAP-SPACE or SWAPSPACE2 */ } magic; struct { char bootbits[1024]; /* Space for disklabel etc. */ __u32 version; __u32 last_page; __u32 nr_badpages; unsigned char sws_uuid[16]; unsigned char sws_volume[16]; __u32 padding[117]; __u32 badpages[1]; } info; }; /* * current->reclaim_state points to one of these when a task is running * memory reclaim */ struct reclaim_state { /* pages reclaimed outside of LRU-based reclaim */ unsigned long reclaimed; #ifdef CONFIG_LRU_GEN /* per-thread mm walk data */ struct lru_gen_mm_walk *mm_walk; #endif }; /* * mm_account_reclaimed_pages(): account reclaimed pages outside of LRU-based * reclaim * @pages: number of pages reclaimed * * If the current process is undergoing a reclaim operation, increment the * number of reclaimed pages by @pages. */ static inline void mm_account_reclaimed_pages(unsigned long pages) { if (current->reclaim_state) current->reclaim_state->reclaimed += pages; } #ifdef __KERNEL__ struct address_space; struct sysinfo; struct writeback_control; struct zone; /* * A swap extent maps a range of a swapfile's PAGE_SIZE pages onto a range of * disk blocks. A rbtree of swap extents maps the entire swapfile (Where the * term `swapfile' refers to either a blockdevice or an IS_REG file). Apart * from setup, they're handled identically. * * We always assume that blocks are of size PAGE_SIZE. */ struct swap_extent { struct rb_node rb_node; pgoff_t start_page; pgoff_t nr_pages; sector_t start_block; }; /* * Max bad pages in the new format.. */ #define MAX_SWAP_BADPAGES \ ((offsetof(union swap_header, magic.magic) - \ offsetof(union swap_header, info.badpages)) / sizeof(int)) enum { SWP_USED = (1 << 0), /* is slot in swap_info[] used? */ SWP_WRITEOK = (1 << 1), /* ok to write to this swap? */ SWP_DISCARDABLE = (1 << 2), /* blkdev support discard */ SWP_DISCARDING = (1 << 3), /* now discarding a free cluster */ SWP_SOLIDSTATE = (1 << 4), /* blkdev seeks are cheap */ SWP_CONTINUED = (1 << 5), /* swap_map has count continuation */ SWP_BLKDEV = (1 << 6), /* its a block device */ SWP_ACTIVATED = (1 << 7), /* set after swap_activate success */ SWP_FS_OPS = (1 << 8), /* swapfile operations go through fs */ SWP_AREA_DISCARD = (1 << 9), /* single-time swap area discards */ SWP_PAGE_DISCARD = (1 << 10), /* freed swap page-cluster discards */ SWP_STABLE_WRITES = (1 << 11), /* no overwrite PG_writeback pages */ SWP_SYNCHRONOUS_IO = (1 << 12), /* synchronous IO is efficient */ /* add others here before... */ }; #define SWAP_CLUSTER_MAX 32UL #define SWAP_CLUSTER_MAX_SKIPPED (SWAP_CLUSTER_MAX << 10) #define COMPACT_CLUSTER_MAX SWAP_CLUSTER_MAX /* Bit flag in swap_map */ #define SWAP_HAS_CACHE 0x40 /* Flag page is cached, in first swap_map */ #define COUNT_CONTINUED 0x80 /* Flag swap_map continuation for full count */ /* Special value in first swap_map */ #define SWAP_MAP_MAX 0x3e /* Max count */ #define SWAP_MAP_BAD 0x3f /* Note page is bad */ #define SWAP_MAP_SHMEM 0xbf /* Owned by shmem/tmpfs */ /* Special value in each swap_map continuation */ #define SWAP_CONT_MAX 0x7f /* Max count */ /* * We use this to track usage of a cluster. A cluster is a block of swap disk * space with SWAPFILE_CLUSTER pages long and naturally aligns in disk. 
All * free clusters are organized into a list. We fetch an entry from the list to * get a free cluster. * * The flags field determines if a cluster is free. This is * protected by cluster lock. */ struct swap_cluster_info { spinlock_t lock; /* * Protect swap_cluster_info fields * other than list, and swap_info_struct->swap_map * elements corresponding to the swap cluster. */ u16 count; u8 flags; u8 order; struct list_head list; }; /* All on-list cluster must have a non-zero flag. */ enum swap_cluster_flags { CLUSTER_FLAG_NONE = 0, /* For temporary off-list cluster */ CLUSTER_FLAG_FREE, CLUSTER_FLAG_NONFULL, CLUSTER_FLAG_FRAG, /* Clusters with flags above are allocatable */ CLUSTER_FLAG_USABLE = CLUSTER_FLAG_FRAG, CLUSTER_FLAG_FULL, CLUSTER_FLAG_DISCARD, CLUSTER_FLAG_MAX, }; /* * The first page in the swap file is the swap header, which is always marked * bad to prevent it from being allocated as an entry. This also prevents the * cluster to which it belongs being marked free. Therefore 0 is safe to use as * a sentinel to indicate an entry is not valid. */ #define SWAP_ENTRY_INVALID 0 #ifdef CONFIG_THP_SWAP #define SWAP_NR_ORDERS (PMD_ORDER + 1) #else #define SWAP_NR_ORDERS 1 #endif /* * We assign a cluster to each CPU, so each CPU can allocate swap entry from * its own cluster and swapout sequentially. The purpose is to optimize swapout * throughput. */ struct percpu_cluster { local_lock_t lock; /* Protect the percpu_cluster above */ unsigned int next[SWAP_NR_ORDERS]; /* Likely next allocation offset */ }; /* * The in-memory structure used to track swap areas. */ struct swap_info_struct { struct percpu_ref users; /* indicate and keep swap device valid. */ unsigned long flags; /* SWP_USED etc: see above */ signed short prio; /* swap priority of this type */ struct plist_node list; /* entry in swap_active_head */ signed char type; /* strange name for an index */ unsigned int max; /* extent of the swap_map */ unsigned char *swap_map; /* vmalloc'ed array of usage counts */ unsigned long *zeromap; /* kvmalloc'ed bitmap to track zero pages */ struct swap_cluster_info *cluster_info; /* cluster info. Only for SSD */ struct list_head free_clusters; /* free clusters list */ struct list_head full_clusters; /* full clusters list */ struct list_head nonfull_clusters[SWAP_NR_ORDERS]; /* list of cluster that contains at least one free slot */ struct list_head frag_clusters[SWAP_NR_ORDERS]; /* list of cluster that are fragmented or contented */ atomic_long_t frag_cluster_nr[SWAP_NR_ORDERS]; unsigned int pages; /* total of usable pages of swap */ atomic_long_t inuse_pages; /* number of those currently in use */ struct percpu_cluster __percpu *percpu_cluster; /* per cpu's swap location */ struct percpu_cluster *global_cluster; /* Use one global cluster for rotating device */ spinlock_t global_cluster_lock; /* Serialize usage of global cluster */ struct rb_root swap_extent_root;/* root of the swap extent rbtree */ struct block_device *bdev; /* swap device or bdev of swap file */ struct file *swap_file; /* seldom referenced */ struct completion comp; /* seldom referenced */ spinlock_t lock; /* * protect map scan related fields like * swap_map, lowest_bit, highest_bit, * inuse_pages, cluster_next, * cluster_nr, lowest_alloc, * highest_alloc, free/discard cluster * list. other fields are only changed * at swapon/swapoff, so are protected * by swap_lock. changing flags need * hold this lock and swap_lock. If * both locks need hold, hold swap_lock * first. 
*/ spinlock_t cont_lock; /* * protect swap count continuation page * list. */ struct work_struct discard_work; /* discard worker */ struct work_struct reclaim_work; /* reclaim worker */ struct list_head discard_clusters; /* discard clusters list */ struct plist_node avail_lists[]; /* * entries in swap_avail_heads, one * entry per node. * Must be last as the number of the * array is nr_node_ids, which is not * a fixed value so have to allocate * dynamically. * And it has to be an array so that * plist_for_each_* can work. */ }; static inline swp_entry_t page_swap_entry(struct page *page) { struct folio *folio = page_folio(page); swp_entry_t entry = folio->swap; entry.val += folio_page_idx(folio, page); return entry; } /* linux/mm/workingset.c */ bool workingset_test_recent(void *shadow, bool file, bool *workingset, bool flush); void workingset_age_nonresident(struct lruvec *lruvec, unsigned long nr_pages); void *workingset_eviction(struct folio *folio, struct mem_cgroup *target_memcg); void workingset_refault(struct folio *folio, void *shadow); void workingset_activation(struct folio *folio); /* linux/mm/page_alloc.c */ extern unsigned long totalreserve_pages; /* Definition of global_zone_page_state not available yet */ #define nr_free_pages() global_zone_page_state(NR_FREE_PAGES) /* linux/mm/swap.c */ void lru_note_cost(struct lruvec *lruvec, bool file, unsigned int nr_io, unsigned int nr_rotated); void lru_note_cost_refault(struct folio *); void folio_add_lru(struct folio *); void folio_add_lru_vma(struct folio *, struct vm_area_struct *); void mark_page_accessed(struct page *); void folio_mark_accessed(struct folio *); extern atomic_t lru_disable_count; static inline bool lru_cache_disabled(void) { return atomic_read(&lru_disable_count); } static inline void lru_cache_enable(void) { atomic_dec(&lru_disable_count); } extern void lru_cache_disable(void); extern void lru_add_drain(void); extern void lru_add_drain_cpu(int cpu); extern void lru_add_drain_cpu_zone(struct zone *zone); extern void lru_add_drain_all(void); void folio_deactivate(struct folio *folio); void folio_mark_lazyfree(struct folio *folio); extern void swap_setup(void); /* linux/mm/vmscan.c */ extern unsigned long zone_reclaimable_pages(struct zone *zone); extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order, gfp_t gfp_mask, nodemask_t *mask); #define MEMCG_RECLAIM_MAY_SWAP (1 << 1) #define MEMCG_RECLAIM_PROACTIVE (1 << 2) #define MIN_SWAPPINESS 0 #define MAX_SWAPPINESS 200 extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, unsigned long nr_pages, gfp_t gfp_mask, unsigned int reclaim_options, int *swappiness); extern unsigned long mem_cgroup_shrink_node(struct mem_cgroup *mem, gfp_t gfp_mask, bool noswap, pg_data_t *pgdat, unsigned long *nr_scanned); extern unsigned long shrink_all_memory(unsigned long nr_pages); extern int vm_swappiness; long remove_mapping(struct address_space *mapping, struct folio *folio); #ifdef CONFIG_NUMA extern int node_reclaim_mode; extern int sysctl_min_unmapped_ratio; extern int sysctl_min_slab_ratio; #else #define node_reclaim_mode 0 #endif static inline bool node_reclaim_enabled(void) { /* Is any node_reclaim_mode bit set? 
*/ return node_reclaim_mode & (RECLAIM_ZONE|RECLAIM_WRITE|RECLAIM_UNMAP); } void check_move_unevictable_folios(struct folio_batch *fbatch); extern void __meminit kswapd_run(int nid); extern void __meminit kswapd_stop(int nid); #ifdef CONFIG_SWAP int add_swap_extent(struct swap_info_struct *sis, unsigned long start_page, unsigned long nr_pages, sector_t start_block); int generic_swapfile_activate(struct swap_info_struct *, struct file *, sector_t *); static inline unsigned long total_swapcache_pages(void) { return global_node_page_state(NR_SWAPCACHE); } void free_swap_cache(struct folio *folio); void free_page_and_swap_cache(struct page *); void free_pages_and_swap_cache(struct encoded_page **, int); /* linux/mm/swapfile.c */ extern atomic_long_t nr_swap_pages; extern long total_swap_pages; extern atomic_t nr_rotate_swap; extern bool has_usable_swap(void); /* Swap 50% full? Release swapcache more aggressively.. */ static inline bool vm_swap_full(void) { return atomic_long_read(&nr_swap_pages) * 2 < total_swap_pages; } static inline long get_nr_swap_pages(void) { return atomic_long_read(&nr_swap_pages); } extern void si_swapinfo(struct sysinfo *); swp_entry_t folio_alloc_swap(struct folio *folio); bool folio_free_swap(struct folio *folio); void put_swap_folio(struct folio *folio, swp_entry_t entry); extern swp_entry_t get_swap_page_of_type(int); extern int get_swap_pages(int n, swp_entry_t swp_entries[], int order); extern int add_swap_count_continuation(swp_entry_t, gfp_t); extern void swap_shmem_alloc(swp_entry_t, int); extern int swap_duplicate(swp_entry_t); extern int swapcache_prepare(swp_entry_t entry, int nr); extern void swap_free_nr(swp_entry_t entry, int nr_pages); extern void swapcache_free_entries(swp_entry_t *entries, int n); extern void free_swap_and_cache_nr(swp_entry_t entry, int nr); int swap_type_of(dev_t device, sector_t offset); int find_first_swap(dev_t *device); extern unsigned int count_swap_pages(int, int); extern sector_t swapdev_block(int, pgoff_t); extern int __swap_count(swp_entry_t entry); extern int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry); extern int swp_swapcount(swp_entry_t entry); struct swap_info_struct *swp_swap_info(swp_entry_t entry); struct backing_dev_info; extern int init_swap_address_space(unsigned int type, unsigned long nr_pages); extern void exit_swap_address_space(unsigned int type); extern struct swap_info_struct *get_swap_device(swp_entry_t entry); sector_t swap_folio_sector(struct folio *folio); static inline void put_swap_device(struct swap_info_struct *si) { percpu_ref_put(&si->users); } #else /* CONFIG_SWAP */ static inline struct swap_info_struct *swp_swap_info(swp_entry_t entry) { return NULL; } static inline struct swap_info_struct *get_swap_device(swp_entry_t entry) { return NULL; } static inline void put_swap_device(struct swap_info_struct *si) { } #define get_nr_swap_pages() 0L #define total_swap_pages 0L #define total_swapcache_pages() 0UL #define vm_swap_full() 0 #define si_swapinfo(val) \ do { (val)->freeswap = (val)->totalswap = 0; } while (0) /* only sparc can not include linux/pagemap.h in this file * so leave put_page and release_pages undeclared... 
*/ #define free_page_and_swap_cache(page) \ put_page(page) #define free_pages_and_swap_cache(pages, nr) \ release_pages((pages), (nr)); static inline void free_swap_and_cache_nr(swp_entry_t entry, int nr) { } static inline void free_swap_cache(struct folio *folio) { } static inline int add_swap_count_continuation(swp_entry_t swp, gfp_t gfp_mask) { return 0; } static inline void swap_shmem_alloc(swp_entry_t swp, int nr) { } static inline int swap_duplicate(swp_entry_t swp) { return 0; } static inline int swapcache_prepare(swp_entry_t swp, int nr) { return 0; } static inline void swap_free_nr(swp_entry_t entry, int nr_pages) { } static inline void put_swap_folio(struct folio *folio, swp_entry_t swp) { } static inline int __swap_count(swp_entry_t entry) { return 0; } static inline int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry) { return 0; } static inline int swp_swapcount(swp_entry_t entry) { return 0; } static inline swp_entry_t folio_alloc_swap(struct folio *folio) { swp_entry_t entry; entry.val = 0; return entry; } static inline bool folio_free_swap(struct folio *folio) { return false; } static inline int add_swap_extent(struct swap_info_struct *sis, unsigned long start_page, unsigned long nr_pages, sector_t start_block) { return -EINVAL; } #endif /* CONFIG_SWAP */ static inline void free_swap_and_cache(swp_entry_t entry) { free_swap_and_cache_nr(entry, 1); } static inline void swap_free(swp_entry_t entry) { swap_free_nr(entry, 1); } #ifdef CONFIG_MEMCG static inline int mem_cgroup_swappiness(struct mem_cgroup *memcg) { /* Cgroup2 doesn't have per-cgroup swappiness */ if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) return READ_ONCE(vm_swappiness); /* root ? */ if (mem_cgroup_disabled() || mem_cgroup_is_root(memcg)) return READ_ONCE(vm_swappiness); return READ_ONCE(memcg->swappiness); } #else static inline int mem_cgroup_swappiness(struct mem_cgroup *mem) { return READ_ONCE(vm_swappiness); } #endif #if defined(CONFIG_SWAP) && defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP) void __folio_throttle_swaprate(struct folio *folio, gfp_t gfp); static inline void folio_throttle_swaprate(struct folio *folio, gfp_t gfp) { if (mem_cgroup_disabled()) return; __folio_throttle_swaprate(folio, gfp); } #else static inline void folio_throttle_swaprate(struct folio *folio, gfp_t gfp) { } #endif #if defined(CONFIG_MEMCG) && defined(CONFIG_SWAP) void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry); int __mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry); static inline int mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry) { if (mem_cgroup_disabled()) return 0; return __mem_cgroup_try_charge_swap(folio, entry); } extern void __mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages); static inline void mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages) { if (mem_cgroup_disabled()) return; __mem_cgroup_uncharge_swap(entry, nr_pages); } extern long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg); extern bool mem_cgroup_swap_full(struct folio *folio); #else static inline void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry) { } static inline int mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry) { return 0; } static inline void mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages) { } static inline long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg) { return get_nr_swap_pages(); } static inline bool mem_cgroup_swap_full(struct folio *folio) { return vm_swap_full(); } 
#endif #endif /* __KERNEL__ */ #endif /* _LINUX_SWAP_H */
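As a hedged sketch (not part of linux/swap.h), the helpers above compose naturally; a caller could, for instance, approximate "swap is under pressure" from vm_swap_full() and get_nr_swap_pages(), both of which also resolve to stubs when CONFIG_SWAP is disabled. The helper name and the threshold are invented for the example.

static inline bool example_swap_under_pressure(void)
{
	/* "More than half of swap in use" or "fewer free pages than one cluster". */
	return vm_swap_full() || get_nr_swap_pages() < (long)SWAP_CLUSTER_MAX;
}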
// SPDX-License-Identifier: GPL-2.0-only /* * fs/crypto/hooks.c * * Encryption hooks for higher-level filesystem operations. */ #include "fscrypt_private.h" /** * fscrypt_file_open() - prepare to open a possibly-encrypted regular file * @inode: the inode being opened * @filp: the struct file being set up * * Currently, an encrypted regular file can only be opened if its encryption key * is available; access to the raw encrypted contents is not supported. * Therefore, we first set up the inode's encryption key (if not already done) * and return an error if it's unavailable. * * We also verify that if the parent directory (from the path via which the file * is being opened) is encrypted, then the inode being opened uses the same * encryption policy. This is needed as part of the enforcement that all files * in an encrypted directory tree use the same encryption policy, as a * protection against certain types of offline attacks. Note that this check is * needed even when opening an *unencrypted* file, since it's forbidden to have * an unencrypted file in an encrypted directory. * * Return: 0 on success, -ENOKEY if the key is missing, or another -errno code */ int fscrypt_file_open(struct inode *inode, struct file *filp) { int err; struct dentry *dentry, *dentry_parent; struct inode *inode_parent; err = fscrypt_require_key(inode); if (err) return err; dentry = file_dentry(filp); /* * Getting a reference to the parent dentry is needed for the actual * encryption policy comparison, but it's expensive on multi-core * systems.
Since this function runs on unencrypted files too, start * with a lightweight RCU-mode check for the parent directory being * unencrypted (in which case it's fine for the child to be either * unencrypted, or encrypted with any policy). Only continue on to the * full policy check if the parent directory is actually encrypted. */ rcu_read_lock(); dentry_parent = READ_ONCE(dentry->d_parent); inode_parent = d_inode_rcu(dentry_parent); if (inode_parent != NULL && !IS_ENCRYPTED(inode_parent)) { rcu_read_unlock(); return 0; } rcu_read_unlock(); dentry_parent = dget_parent(dentry); if (!fscrypt_has_permitted_context(d_inode(dentry_parent), inode)) { fscrypt_warn(inode, "Inconsistent encryption context (parent directory: %lu)", d_inode(dentry_parent)->i_ino); err = -EPERM; } dput(dentry_parent); return err; } EXPORT_SYMBOL_GPL(fscrypt_file_open); int __fscrypt_prepare_link(struct inode *inode, struct inode *dir, struct dentry *dentry) { if (fscrypt_is_nokey_name(dentry)) return -ENOKEY; /* * We don't need to separately check that the directory inode's key is * available, as it's implied by the dentry not being a no-key name. */ if (!fscrypt_has_permitted_context(dir, inode)) return -EXDEV; return 0; } EXPORT_SYMBOL_GPL(__fscrypt_prepare_link); int __fscrypt_prepare_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { if (fscrypt_is_nokey_name(old_dentry) || fscrypt_is_nokey_name(new_dentry)) return -ENOKEY; /* * We don't need to separately check that the directory inodes' keys are * available, as it's implied by the dentries not being no-key names. */ if (old_dir != new_dir) { if (IS_ENCRYPTED(new_dir) && !fscrypt_has_permitted_context(new_dir, d_inode(old_dentry))) return -EXDEV; if ((flags & RENAME_EXCHANGE) && IS_ENCRYPTED(old_dir) && !fscrypt_has_permitted_context(old_dir, d_inode(new_dentry))) return -EXDEV; } return 0; } EXPORT_SYMBOL_GPL(__fscrypt_prepare_rename); int __fscrypt_prepare_lookup(struct inode *dir, struct dentry *dentry, struct fscrypt_name *fname) { int err = fscrypt_setup_filename(dir, &dentry->d_name, 1, fname); if (err && err != -ENOENT) return err; fscrypt_prepare_dentry(dentry, fname->is_nokey_name); return err; } EXPORT_SYMBOL_GPL(__fscrypt_prepare_lookup); /** * fscrypt_prepare_lookup_partial() - prepare lookup without filename setup * @dir: the encrypted directory being searched * @dentry: the dentry being looked up in @dir * * This function should be used by the ->lookup and ->atomic_open methods of * filesystems that handle filename encryption and no-key name encoding * themselves and thus can't use fscrypt_prepare_lookup(). Like * fscrypt_prepare_lookup(), this will try to set up the directory's encryption * key and will set DCACHE_NOKEY_NAME on the dentry if the key is unavailable. * However, this function doesn't set up a struct fscrypt_name for the filename. * * Return: 0 on success; -errno on error. Note that the encryption key being * unavailable is not considered an error. It is also not an error if * the encryption policy is unsupported by this kernel; that is treated * like the key being unavailable, so that files can still be deleted. 
*/ int fscrypt_prepare_lookup_partial(struct inode *dir, struct dentry *dentry) { int err = fscrypt_get_encryption_info(dir, true); bool is_nokey_name = (!err && !fscrypt_has_encryption_key(dir)); fscrypt_prepare_dentry(dentry, is_nokey_name); return err; } EXPORT_SYMBOL_GPL(fscrypt_prepare_lookup_partial); int __fscrypt_prepare_readdir(struct inode *dir) { return fscrypt_get_encryption_info(dir, true); } EXPORT_SYMBOL_GPL(__fscrypt_prepare_readdir); int __fscrypt_prepare_setattr(struct dentry *dentry, struct iattr *attr) { if (attr->ia_valid & ATTR_SIZE) return fscrypt_require_key(d_inode(dentry)); return 0; } EXPORT_SYMBOL_GPL(__fscrypt_prepare_setattr); /** * fscrypt_prepare_setflags() - prepare to change flags with FS_IOC_SETFLAGS * @inode: the inode on which flags are being changed * @oldflags: the old flags * @flags: the new flags * * The caller should be holding i_rwsem for write. * * Return: 0 on success; -errno if the flags change isn't allowed or if * another error occurs. */ int fscrypt_prepare_setflags(struct inode *inode, unsigned int oldflags, unsigned int flags) { struct fscrypt_inode_info *ci; struct fscrypt_master_key *mk; int err; /* * When the CASEFOLD flag is set on an encrypted directory, we must * derive the secret key needed for the dirhash. This is only possible * if the directory uses a v2 encryption policy. */ if (IS_ENCRYPTED(inode) && (flags & ~oldflags & FS_CASEFOLD_FL)) { err = fscrypt_require_key(inode); if (err) return err; ci = inode->i_crypt_info; if (ci->ci_policy.version != FSCRYPT_POLICY_V2) return -EINVAL; mk = ci->ci_master_key; down_read(&mk->mk_sem); if (mk->mk_present) err = fscrypt_derive_dirhash_key(ci, mk); else err = -ENOKEY; up_read(&mk->mk_sem); return err; } return 0; } /** * fscrypt_prepare_symlink() - prepare to create a possibly-encrypted symlink * @dir: directory in which the symlink is being created * @target: plaintext symlink target * @len: length of @target excluding null terminator * @max_len: space the filesystem has available to store the symlink target * @disk_link: (out) the on-disk symlink target being prepared * * This function computes the size the symlink target will require on-disk, * stores it in @disk_link->len, and validates it against @max_len. An * encrypted symlink may be longer than the original. * * Additionally, @disk_link->name is set to @target if the symlink will be * unencrypted, but left NULL if the symlink will be encrypted. For encrypted * symlinks, the filesystem must call fscrypt_encrypt_symlink() to create the * on-disk target later. (The reason for the two-step process is that some * filesystems need to know the size of the symlink target before creating the * inode, e.g. to determine whether it will be a "fast" or "slow" symlink.) * * Return: 0 on success, -ENAMETOOLONG if the symlink target is too long, * -ENOKEY if the encryption key is missing, or another -errno code if a problem * occurred while setting up the encryption key. */ int fscrypt_prepare_symlink(struct inode *dir, const char *target, unsigned int len, unsigned int max_len, struct fscrypt_str *disk_link) { const union fscrypt_policy *policy; /* * To calculate the size of the encrypted symlink target we need to know * the amount of NUL padding, which is determined by the flags set in * the encryption policy which will be inherited from the directory. 
*/ policy = fscrypt_policy_to_inherit(dir); if (policy == NULL) { /* Not encrypted */ disk_link->name = (unsigned char *)target; disk_link->len = len + 1; if (disk_link->len > max_len) return -ENAMETOOLONG; return 0; } if (IS_ERR(policy)) return PTR_ERR(policy); /* * Calculate the size of the encrypted symlink and verify it won't * exceed max_len. Note that for historical reasons, encrypted symlink * targets are prefixed with the ciphertext length, despite this * actually being redundant with i_size. This decreases by 2 bytes the * longest symlink target we can accept. * * We could recover 1 byte by not counting a null terminator, but * counting it (even though it is meaningless for ciphertext) is simpler * for now since filesystems will assume it is there and subtract it. */ if (!__fscrypt_fname_encrypted_size(policy, len, max_len - sizeof(struct fscrypt_symlink_data) - 1, &disk_link->len)) return -ENAMETOOLONG; disk_link->len += sizeof(struct fscrypt_symlink_data) + 1; disk_link->name = NULL; return 0; } EXPORT_SYMBOL_GPL(fscrypt_prepare_symlink); int __fscrypt_encrypt_symlink(struct inode *inode, const char *target, unsigned int len, struct fscrypt_str *disk_link) { int err; struct qstr iname = QSTR_INIT(target, len); struct fscrypt_symlink_data *sd; unsigned int ciphertext_len; /* * fscrypt_prepare_new_inode() should have already set up the new * symlink inode's encryption key. We don't wait until now to do it, * since we may be in a filesystem transaction now. */ if (WARN_ON_ONCE(!fscrypt_has_encryption_key(inode))) return -ENOKEY; if (disk_link->name) { /* filesystem-provided buffer */ sd = (struct fscrypt_symlink_data *)disk_link->name; } else { sd = kmalloc(disk_link->len, GFP_NOFS); if (!sd) return -ENOMEM; } ciphertext_len = disk_link->len - sizeof(*sd) - 1; sd->len = cpu_to_le16(ciphertext_len); err = fscrypt_fname_encrypt(inode, &iname, sd->encrypted_path, ciphertext_len); if (err) goto err_free_sd; /* * Null-terminating the ciphertext doesn't make sense, but we still * count the null terminator in the length, so we might as well * initialize it just in case the filesystem writes it out. */ sd->encrypted_path[ciphertext_len] = '\0'; /* Cache the plaintext symlink target for later use by get_link() */ err = -ENOMEM; inode->i_link = kmemdup(target, len + 1, GFP_NOFS); if (!inode->i_link) goto err_free_sd; if (!disk_link->name) disk_link->name = (unsigned char *)sd; return 0; err_free_sd: if (!disk_link->name) kfree(sd); return err; } EXPORT_SYMBOL_GPL(__fscrypt_encrypt_symlink); /** * fscrypt_get_symlink() - get the target of an encrypted symlink * @inode: the symlink inode * @caddr: the on-disk contents of the symlink * @max_size: size of @caddr buffer * @done: if successful, will be set up to free the returned target if needed * * If the symlink's encryption key is available, we decrypt its target. * Otherwise, we encode its target for presentation. * * This may sleep, so the filesystem must have dropped out of RCU mode already. * * Return: the presentable symlink target or an ERR_PTR() */ const char *fscrypt_get_symlink(struct inode *inode, const void *caddr, unsigned int max_size, struct delayed_call *done) { const struct fscrypt_symlink_data *sd; struct fscrypt_str cstr, pstr; bool has_key; int err; /* This is for encrypted symlinks only */ if (WARN_ON_ONCE(!IS_ENCRYPTED(inode))) return ERR_PTR(-EINVAL); /* If the decrypted target is already cached, just return it. 
*/ pstr.name = READ_ONCE(inode->i_link); if (pstr.name) return pstr.name; /* * Try to set up the symlink's encryption key, but we can continue * regardless of whether the key is available or not. */ err = fscrypt_get_encryption_info(inode, false); if (err) return ERR_PTR(err); has_key = fscrypt_has_encryption_key(inode); /* * For historical reasons, encrypted symlink targets are prefixed with * the ciphertext length, even though this is redundant with i_size. */ if (max_size < sizeof(*sd) + 1) return ERR_PTR(-EUCLEAN); sd = caddr; cstr.name = (unsigned char *)sd->encrypted_path; cstr.len = le16_to_cpu(sd->len); if (cstr.len == 0) return ERR_PTR(-EUCLEAN); if (cstr.len + sizeof(*sd) > max_size) return ERR_PTR(-EUCLEAN); err = fscrypt_fname_alloc_buffer(cstr.len, &pstr); if (err) return ERR_PTR(err); err = fscrypt_fname_disk_to_usr(inode, 0, 0, &cstr, &pstr); if (err) goto err_kfree; err = -EUCLEAN; if (pstr.name[0] == '\0') goto err_kfree; pstr.name[pstr.len] = '\0'; /* * Cache decrypted symlink targets in i_link for later use. Don't cache * symlink targets encoded without the key, since those become outdated * once the key is added. This pairs with the READ_ONCE() above and in * the VFS path lookup code. */ if (!has_key || cmpxchg_release(&inode->i_link, NULL, pstr.name) != NULL) set_delayed_call(done, kfree_link, pstr.name); return pstr.name; err_kfree: kfree(pstr.name); return ERR_PTR(err); } EXPORT_SYMBOL_GPL(fscrypt_get_symlink); /** * fscrypt_symlink_getattr() - set the correct st_size for encrypted symlinks * @path: the path for the encrypted symlink being queried * @stat: the struct being filled with the symlink's attributes * * Override st_size of encrypted symlinks to be the length of the decrypted * symlink target (or the no-key encoded symlink target, if the key is * unavailable) rather than the length of the encrypted symlink target. This is * necessary for st_size to match the symlink target that userspace actually * sees. POSIX requires this, and some userspace programs depend on it. * * This requires reading the symlink target from disk if needed, setting up the * inode's encryption key if possible, and then decrypting or encoding the * symlink target. This makes lstat() more heavyweight than is normally the * case. However, decrypted symlink targets will be cached in ->i_link, so * usually the symlink won't have to be read and decrypted again later if/when * it is actually followed, readlink() is called, or lstat() is called again. * * Return: 0 on success, -errno on failure */ int fscrypt_symlink_getattr(const struct path *path, struct kstat *stat) { struct dentry *dentry = path->dentry; struct inode *inode = d_inode(dentry); const char *link; DEFINE_DELAYED_CALL(done); /* * To get the symlink target that userspace will see (whether it's the * decrypted target or the no-key encoded target), we can just get it in * the same way the VFS does during path resolution and readlink(). */ link = READ_ONCE(inode->i_link); if (!link) { link = inode->i_op->get_link(dentry, inode, &done); if (IS_ERR(link)) return PTR_ERR(link); } stat->size = strlen(link); do_delayed_call(&done); return 0; } EXPORT_SYMBOL_GPL(fscrypt_symlink_getattr); |
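As a hedged sketch (not part of fs/crypto/hooks.c), a filesystem typically wires fscrypt_file_open() into its ->open() method so that the key check and the parent-directory policy check above run on every open of a possibly-encrypted file. The "examplefs" name is invented for the illustration; real filesystems such as ext4 and f2fs follow a similar pattern in their own file_operations.

static int examplefs_file_open(struct inode *inode, struct file *filp)
{
	int err;

	/* Set up the key (if present) and enforce the directory's encryption policy. */
	err = fscrypt_file_open(inode, filp);
	if (err)
		return err;

	return generic_file_open(inode, filp);
}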
3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 
3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 
// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/kernel/printk.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *
 * Modified to make sys_syslog() more flexible: added commands to
 * return the last 4k of kernel messages, regardless of whether
 * they've been read or not. Added option to suppress kernel printk's
 * to the console. Added hook for sending the console messages
 * elsewhere, in preparation for a serial line console (someday).
 * Ted Ts'o, 2/11/93.
 * Modified for sysctl support, 1/8/97, Chris Horn.
* Fixed SMP synchronization, 08/08/99, Manfred Spraul * manfred@colorfullife.com * Rewrote bits to get rid of console_lock * 01Mar01 Andrew Morton */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kernel.h> #include <linux/mm.h> #include <linux/tty.h> #include <linux/tty_driver.h> #include <linux/console.h> #include <linux/init.h> #include <linux/jiffies.h> #include <linux/nmi.h> #include <linux/module.h> #include <linux/moduleparam.h> #include <linux/delay.h> #include <linux/smp.h> #include <linux/security.h> #include <linux/memblock.h> #include <linux/syscalls.h> #include <linux/syscore_ops.h> #include <linux/vmcore_info.h> #include <linux/ratelimit.h> #include <linux/kmsg_dump.h> #include <linux/syslog.h> #include <linux/cpu.h> #include <linux/rculist.h> #include <linux/poll.h> #include <linux/irq_work.h> #include <linux/ctype.h> #include <linux/uio.h> #include <linux/sched/clock.h> #include <linux/sched/debug.h> #include <linux/sched/task_stack.h> #include <linux/uaccess.h> #include <asm/sections.h> #include <trace/events/initcall.h> #define CREATE_TRACE_POINTS #include <trace/events/printk.h> #include "printk_ringbuffer.h" #include "console_cmdline.h" #include "braille.h" #include "internal.h" int console_printk[4] = { CONSOLE_LOGLEVEL_DEFAULT, /* console_loglevel */ MESSAGE_LOGLEVEL_DEFAULT, /* default_message_loglevel */ CONSOLE_LOGLEVEL_MIN, /* minimum_console_loglevel */ CONSOLE_LOGLEVEL_DEFAULT, /* default_console_loglevel */ }; EXPORT_SYMBOL_GPL(console_printk); atomic_t ignore_console_lock_warning __read_mostly = ATOMIC_INIT(0); EXPORT_SYMBOL(ignore_console_lock_warning); EXPORT_TRACEPOINT_SYMBOL_GPL(console); /* * Low level drivers may need that to know if they can schedule in * their unblank() callback or not. So let's export it. */ int oops_in_progress; EXPORT_SYMBOL(oops_in_progress); /* * console_mutex protects console_list updates and console->flags updates. * The flags are synchronized only for consoles that are registered, i.e. * accessible via the console list. */ static DEFINE_MUTEX(console_mutex); /* * console_sem protects updates to console->seq * and also provides serialization for console printing. */ static DEFINE_SEMAPHORE(console_sem, 1); HLIST_HEAD(console_list); EXPORT_SYMBOL_GPL(console_list); DEFINE_STATIC_SRCU(console_srcu); /* * System may need to suppress printk message under certain * circumstances, like after kernel panic happens. */ int __read_mostly suppress_printk; #ifdef CONFIG_LOCKDEP static struct lockdep_map console_lock_dep_map = { .name = "console_lock" }; void lockdep_assert_console_list_lock_held(void) { lockdep_assert_held(&console_mutex); } EXPORT_SYMBOL(lockdep_assert_console_list_lock_held); #endif #ifdef CONFIG_DEBUG_LOCK_ALLOC bool console_srcu_read_lock_is_held(void) { return srcu_read_lock_held(&console_srcu); } EXPORT_SYMBOL(console_srcu_read_lock_is_held); #endif enum devkmsg_log_bits { __DEVKMSG_LOG_BIT_ON = 0, __DEVKMSG_LOG_BIT_OFF, __DEVKMSG_LOG_BIT_LOCK, }; enum devkmsg_log_masks { DEVKMSG_LOG_MASK_ON = BIT(__DEVKMSG_LOG_BIT_ON), DEVKMSG_LOG_MASK_OFF = BIT(__DEVKMSG_LOG_BIT_OFF), DEVKMSG_LOG_MASK_LOCK = BIT(__DEVKMSG_LOG_BIT_LOCK), }; /* Keep both the 'on' and 'off' bits clear, i.e. 
ratelimit by default: */ #define DEVKMSG_LOG_MASK_DEFAULT 0 static unsigned int __read_mostly devkmsg_log = DEVKMSG_LOG_MASK_DEFAULT; static int __control_devkmsg(char *str) { size_t len; if (!str) return -EINVAL; len = str_has_prefix(str, "on"); if (len) { devkmsg_log = DEVKMSG_LOG_MASK_ON; return len; } len = str_has_prefix(str, "off"); if (len) { devkmsg_log = DEVKMSG_LOG_MASK_OFF; return len; } len = str_has_prefix(str, "ratelimit"); if (len) { devkmsg_log = DEVKMSG_LOG_MASK_DEFAULT; return len; } return -EINVAL; } static int __init control_devkmsg(char *str) { if (__control_devkmsg(str) < 0) { pr_warn("printk.devkmsg: bad option string '%s'\n", str); return 1; } /* * Set sysctl string accordingly: */ if (devkmsg_log == DEVKMSG_LOG_MASK_ON) strscpy(devkmsg_log_str, "on"); else if (devkmsg_log == DEVKMSG_LOG_MASK_OFF) strscpy(devkmsg_log_str, "off"); /* else "ratelimit" which is set by default. */ /* * Sysctl cannot change it anymore. The kernel command line setting of * this parameter is to force the setting to be permanent throughout the * runtime of the system. This is a precation measure against userspace * trying to be a smarta** and attempting to change it up on us. */ devkmsg_log |= DEVKMSG_LOG_MASK_LOCK; return 1; } __setup("printk.devkmsg=", control_devkmsg); char devkmsg_log_str[DEVKMSG_STR_MAX_SIZE] = "ratelimit"; #if defined(CONFIG_PRINTK) && defined(CONFIG_SYSCTL) int devkmsg_sysctl_set_loglvl(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { char old_str[DEVKMSG_STR_MAX_SIZE]; unsigned int old; int err; if (write) { if (devkmsg_log & DEVKMSG_LOG_MASK_LOCK) return -EINVAL; old = devkmsg_log; strscpy(old_str, devkmsg_log_str); } err = proc_dostring(table, write, buffer, lenp, ppos); if (err) return err; if (write) { err = __control_devkmsg(devkmsg_log_str); /* * Do not accept an unknown string OR a known string with * trailing crap... */ if (err < 0 || (err + 1 != *lenp)) { /* ... and restore old setting. */ devkmsg_log = old; strscpy(devkmsg_log_str, old_str); return -EINVAL; } } return 0; } #endif /* CONFIG_PRINTK && CONFIG_SYSCTL */ /** * console_list_lock - Lock the console list * * For console list or console->flags updates */ void console_list_lock(void) { /* * In unregister_console() and console_force_preferred_locked(), * synchronize_srcu() is called with the console_list_lock held. * Therefore it is not allowed that the console_list_lock is taken * with the srcu_lock held. * * Detecting if this context is really in the read-side critical * section is only possible if the appropriate debug options are * enabled. */ WARN_ON_ONCE(debug_lockdep_rcu_enabled() && srcu_read_lock_held(&console_srcu)); mutex_lock(&console_mutex); } EXPORT_SYMBOL(console_list_lock); /** * console_list_unlock - Unlock the console list * * Counterpart to console_list_lock() */ void console_list_unlock(void) { mutex_unlock(&console_mutex); } EXPORT_SYMBOL(console_list_unlock); /** * console_srcu_read_lock - Register a new reader for the * SRCU-protected console list * * Use for_each_console_srcu() to iterate the console list * * Context: Any context. * Return: A cookie to pass to console_srcu_read_unlock(). 
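 *
 * A minimal usage sketch (illustrative only; the console list walkers
 * further down in this file are the canonical users):
 *
 *	int cookie;
 *	struct console *con;
 *
 *	cookie = console_srcu_read_lock();
 *	for_each_console_srcu(con)
 *		...;	(inspect @con under SRCU protection)
 *	console_srcu_read_unlock(cookie);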
*/ int console_srcu_read_lock(void) __acquires(&console_srcu) { return srcu_read_lock_nmisafe(&console_srcu); } EXPORT_SYMBOL(console_srcu_read_lock); /** * console_srcu_read_unlock - Unregister an old reader from * the SRCU-protected console list * @cookie: cookie returned from console_srcu_read_lock() * * Counterpart to console_srcu_read_lock() */ void console_srcu_read_unlock(int cookie) __releases(&console_srcu) { srcu_read_unlock_nmisafe(&console_srcu, cookie); } EXPORT_SYMBOL(console_srcu_read_unlock); /* * Helper macros to handle lockdep when locking/unlocking console_sem. We use * macros instead of functions so that _RET_IP_ contains useful information. */ #define down_console_sem() do { \ down(&console_sem);\ mutex_acquire(&console_lock_dep_map, 0, 0, _RET_IP_);\ } while (0) static int __down_trylock_console_sem(unsigned long ip) { int lock_failed; unsigned long flags; /* * Here and in __up_console_sem() we need to be in safe mode, * because spindump/WARN/etc from under console ->lock will * deadlock in printk()->down_trylock_console_sem() otherwise. */ printk_safe_enter_irqsave(flags); lock_failed = down_trylock(&console_sem); printk_safe_exit_irqrestore(flags); if (lock_failed) return 1; mutex_acquire(&console_lock_dep_map, 0, 1, ip); return 0; } #define down_trylock_console_sem() __down_trylock_console_sem(_RET_IP_) static void __up_console_sem(unsigned long ip) { unsigned long flags; mutex_release(&console_lock_dep_map, ip); printk_safe_enter_irqsave(flags); up(&console_sem); printk_safe_exit_irqrestore(flags); } #define up_console_sem() __up_console_sem(_RET_IP_) static bool panic_in_progress(void) { return unlikely(atomic_read(&panic_cpu) != PANIC_CPU_INVALID); } /* Return true if a panic is in progress on the current CPU. */ bool this_cpu_in_panic(void) { /* * We can use raw_smp_processor_id() here because it is impossible for * the task to be migrated to the panic_cpu, or away from it. If * panic_cpu has already been set, and we're not currently executing on * that CPU, then we never will be. */ return unlikely(atomic_read(&panic_cpu) == raw_smp_processor_id()); } /* * Return true if a panic is in progress on a remote CPU. * * On true, the local CPU should immediately release any printing resources * that may be needed by the panic CPU. */ bool other_cpu_in_panic(void) { return (panic_in_progress() && !this_cpu_in_panic()); } /* * This is used for debugging the mess that is the VT code by * keeping track if we have the console semaphore held. It's * definitely not the perfect debug tool (we don't know if _WE_ * hold it and are racing, but it helps tracking those weird code * paths in the console code where we end up in places I want * locked without the console semaphore held). */ static int console_locked; /* * Array of consoles built from command line options (console=) */ #define MAX_CMDLINECONSOLES 8 static struct console_cmdline console_cmdline[MAX_CMDLINECONSOLES]; static int preferred_console = -1; int console_set_on_cmdline; EXPORT_SYMBOL(console_set_on_cmdline); /* Flag: console code may call schedule() */ static int console_may_schedule; enum con_msg_format_flags { MSG_FORMAT_DEFAULT = 0, MSG_FORMAT_SYSLOG = (1 << 0), }; static int console_msg_format = MSG_FORMAT_DEFAULT; /* * The printk log buffer consists of a sequenced collection of records, each * containing variable length message text. Every record also contains its * own meta-data (@info). 
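 *
 * (Illustrative sketch of how records are consumed in-kernel; the
 * syslog and /dev/kmsg readers below follow this pattern, with
 * @text_buf, @text_buf_size and @start_seq provided by the caller:
 *
 *	struct printk_info info;
 *	struct printk_record r;
 *	u64 seq;
 *
 *	prb_rec_init_rd(&r, &info, text_buf, text_buf_size);
 *	prb_for_each_record(start_seq, prb, seq, &r)
 *		...;
 * )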
* * Every record meta-data carries the timestamp in microseconds, as well as * the standard userspace syslog level and syslog facility. The usual kernel * messages use LOG_KERN; userspace-injected messages always carry a matching * syslog facility, by default LOG_USER. The origin of every message can be * reliably determined that way. * * The human readable log message of a record is available in @text, the * length of the message text in @text_len. The stored message is not * terminated. * * Optionally, a record can carry a dictionary of properties (key/value * pairs), to provide userspace with a machine-readable message context. * * Examples for well-defined, commonly used property names are: * DEVICE=b12:8 device identifier * b12:8 block dev_t * c127:3 char dev_t * n8 netdev ifindex * +sound:card0 subsystem:devname * SUBSYSTEM=pci driver-core subsystem name * * Valid characters in property names are [a-zA-Z0-9.-_]. Property names * and values are terminated by a '\0' character. * * Example of record values: * record.text_buf = "it's a line" (unterminated) * record.info.seq = 56 * record.info.ts_nsec = 36863 * record.info.text_len = 11 * record.info.facility = 0 (LOG_KERN) * record.info.flags = 0 * record.info.level = 3 (LOG_ERR) * record.info.caller_id = 299 (task 299) * record.info.dev_info.subsystem = "pci" (terminated) * record.info.dev_info.device = "+pci:0000:00:01.0" (terminated) * * The 'struct printk_info' buffer must never be directly exported to * userspace, it is a kernel-private implementation detail that might * need to be changed in the future, when the requirements change. * * /dev/kmsg exports the structured data in the following line format: * "<level>,<sequnum>,<timestamp>,<contflag>[,additional_values, ... ];<message text>\n" * * Users of the export format should ignore possible additional values * separated by ',', and find the message after the ';' character. * * The optional key/value pairs are attached as continuation lines starting * with a space character and terminated by a newline. All possible * non-prinatable characters are escaped in the "\xff" notation. */ /* syslog_lock protects syslog_* variables and write access to clear_seq. */ static DEFINE_MUTEX(syslog_lock); /* * Specifies if a legacy console is registered. If legacy consoles are * present, it is necessary to perform the console lock/unlock dance * whenever console flushing should occur. */ bool have_legacy_console; /* * Specifies if an nbcon console is registered. If nbcon consoles are present, * synchronous printing of legacy consoles will not occur during panic until * the backtrace has been stored to the ringbuffer. */ bool have_nbcon_console; /* * Specifies if a boot console is registered. If boot consoles are present, * nbcon consoles cannot print simultaneously and must be synchronized by * the console lock. This is because boot consoles and nbcon consoles may * have mapped the same hardware. */ bool have_boot_console; /* See printk_legacy_allow_panic_sync() for details. */ bool legacy_allow_panic_sync; #ifdef CONFIG_PRINTK DECLARE_WAIT_QUEUE_HEAD(log_wait); static DECLARE_WAIT_QUEUE_HEAD(legacy_wait); /* All 3 protected by @syslog_lock. */ /* the next printk record to read by syslog(READ) or /proc/kmsg */ static u64 syslog_seq; static size_t syslog_partial; static bool syslog_time; /* True when _all_ printer threads are available for printing. 
*/ bool printk_kthreads_running; struct latched_seq { seqcount_latch_t latch; u64 val[2]; }; /* * The next printk record to read after the last 'clear' command. There are * two copies (updated with seqcount_latch) so that reads can locklessly * access a valid value. Writers are synchronized by @syslog_lock. */ static struct latched_seq clear_seq = { .latch = SEQCNT_LATCH_ZERO(clear_seq.latch), .val[0] = 0, .val[1] = 0, }; #define LOG_LEVEL(v) ((v) & 0x07) #define LOG_FACILITY(v) ((v) >> 3 & 0xff) /* record buffer */ #define LOG_ALIGN __alignof__(unsigned long) #define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT) #define LOG_BUF_LEN_MAX ((u32)1 << 31) static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN); static char *log_buf = __log_buf; static u32 log_buf_len = __LOG_BUF_LEN; /* * Define the average message size. This only affects the number of * descriptors that will be available. Underestimating is better than * overestimating (too many available descriptors is better than not enough). */ #define PRB_AVGBITS 5 /* 32 character average length */ #if CONFIG_LOG_BUF_SHIFT <= PRB_AVGBITS #error CONFIG_LOG_BUF_SHIFT value too small. #endif _DEFINE_PRINTKRB(printk_rb_static, CONFIG_LOG_BUF_SHIFT - PRB_AVGBITS, PRB_AVGBITS, &__log_buf[0]); static struct printk_ringbuffer printk_rb_dynamic; struct printk_ringbuffer *prb = &printk_rb_static; /* * We cannot access per-CPU data (e.g. per-CPU flush irq_work) before * per_cpu_areas are initialised. This variable is set to true when * it's safe to access per-CPU data. */ static bool __printk_percpu_data_ready __ro_after_init; bool printk_percpu_data_ready(void) { return __printk_percpu_data_ready; } /* Must be called under syslog_lock. */ static void latched_seq_write(struct latched_seq *ls, u64 val) { write_seqcount_latch_begin(&ls->latch); ls->val[0] = val; write_seqcount_latch(&ls->latch); ls->val[1] = val; write_seqcount_latch_end(&ls->latch); } /* Can be called from any context. */ static u64 latched_seq_read_nolock(struct latched_seq *ls) { unsigned int seq; unsigned int idx; u64 val; do { seq = read_seqcount_latch(&ls->latch); idx = seq & 0x1; val = ls->val[idx]; } while (read_seqcount_latch_retry(&ls->latch, seq)); return val; } /* Return log buffer address */ char *log_buf_addr_get(void) { return log_buf; } /* Return log buffer size */ u32 log_buf_len_get(void) { return log_buf_len; } /* * Define how much of the log buffer we could take at maximum. The value * must be greater than two. Note that only half of the buffer is available * when the index points to the middle. */ #define MAX_LOG_TAKE_PART 4 static const char trunc_msg[] = "<truncated>"; static void truncate_msg(u16 *text_len, u16 *trunc_msg_len) { /* * The message should not take the whole buffer. Otherwise, it might * get removed too soon. */ u32 max_text_len = log_buf_len / MAX_LOG_TAKE_PART; if (*text_len > max_text_len) *text_len = max_text_len; /* enable the warning message (if there is room) */ *trunc_msg_len = strlen(trunc_msg); if (*text_len >= *trunc_msg_len) *text_len -= *trunc_msg_len; else *trunc_msg_len = 0; } int dmesg_restrict = IS_ENABLED(CONFIG_SECURITY_DMESG_RESTRICT); static int syslog_action_restricted(int type) { if (dmesg_restrict) return 1; /* * Unless restricted, we allow "read all" and "get buffer size" * for everybody. 
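 *
 * For instance, an unprivileged "dmesg"-like reader typically does
 * something like the following (illustrative userspace sketch; glibc
 * exposes these operations through klogctl() with the same numeric
 * action values):
 *
 *	len = klogctl(SYSLOG_ACTION_SIZE_BUFFER, NULL, 0);
 *	buf = malloc(len);
 *	klogctl(SYSLOG_ACTION_READ_ALL, buf, len);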
*/ return type != SYSLOG_ACTION_READ_ALL && type != SYSLOG_ACTION_SIZE_BUFFER; } static int check_syslog_permissions(int type, int source) { /* * If this is from /proc/kmsg and we've already opened it, then we've * already done the capabilities checks at open time. */ if (source == SYSLOG_FROM_PROC && type != SYSLOG_ACTION_OPEN) goto ok; if (syslog_action_restricted(type)) { if (capable(CAP_SYSLOG)) goto ok; return -EPERM; } ok: return security_syslog(type); } static void append_char(char **pp, char *e, char c) { if (*pp < e) *(*pp)++ = c; } static ssize_t info_print_ext_header(char *buf, size_t size, struct printk_info *info) { u64 ts_usec = info->ts_nsec; char caller[20]; #ifdef CONFIG_PRINTK_CALLER u32 id = info->caller_id; snprintf(caller, sizeof(caller), ",caller=%c%u", id & 0x80000000 ? 'C' : 'T', id & ~0x80000000); #else caller[0] = '\0'; #endif do_div(ts_usec, 1000); return scnprintf(buf, size, "%u,%llu,%llu,%c%s;", (info->facility << 3) | info->level, info->seq, ts_usec, info->flags & LOG_CONT ? 'c' : '-', caller); } static ssize_t msg_add_ext_text(char *buf, size_t size, const char *text, size_t text_len, unsigned char endc) { char *p = buf, *e = buf + size; size_t i; /* escape non-printable characters */ for (i = 0; i < text_len; i++) { unsigned char c = text[i]; if (c < ' ' || c >= 127 || c == '\\') p += scnprintf(p, e - p, "\\x%02x", c); else append_char(&p, e, c); } append_char(&p, e, endc); return p - buf; } static ssize_t msg_add_dict_text(char *buf, size_t size, const char *key, const char *val) { size_t val_len = strlen(val); ssize_t len; if (!val_len) return 0; len = msg_add_ext_text(buf, size, "", 0, ' '); /* dict prefix */ len += msg_add_ext_text(buf + len, size - len, key, strlen(key), '='); len += msg_add_ext_text(buf + len, size - len, val, val_len, '\n'); return len; } static ssize_t msg_print_ext_body(char *buf, size_t size, char *text, size_t text_len, struct dev_printk_info *dev_info) { ssize_t len; len = msg_add_ext_text(buf, size, text, text_len, '\n'); if (!dev_info) goto out; len += msg_add_dict_text(buf + len, size - len, "SUBSYSTEM", dev_info->subsystem); len += msg_add_dict_text(buf + len, size - len, "DEVICE", dev_info->device); out: return len; } /* /dev/kmsg - userspace message inject/listen interface */ struct devkmsg_user { atomic64_t seq; struct ratelimit_state rs; struct mutex lock; struct printk_buffers pbufs; }; static __printf(3, 4) __cold int devkmsg_emit(int facility, int level, const char *fmt, ...) { va_list args; int r; va_start(args, fmt); r = vprintk_emit(facility, level, NULL, fmt, args); va_end(args); return r; } static ssize_t devkmsg_write(struct kiocb *iocb, struct iov_iter *from) { char *buf, *line; int level = default_message_loglevel; int facility = 1; /* LOG_USER */ struct file *file = iocb->ki_filp; struct devkmsg_user *user = file->private_data; size_t len = iov_iter_count(from); ssize_t ret = len; if (len > PRINTKRB_RECORD_MAX) return -EINVAL; /* Ignore when user logging is disabled. */ if (devkmsg_log & DEVKMSG_LOG_MASK_OFF) return len; /* Ratelimit when not explicitly enabled. */ if (!(devkmsg_log & DEVKMSG_LOG_MASK_ON)) { if (!___ratelimit(&user->rs, current->comm)) return ret; } buf = kmalloc(len+1, GFP_KERNEL); if (buf == NULL) return -ENOMEM; buf[len] = '\0'; if (!copy_from_iter_full(buf, len, from)) { kfree(buf); return -EFAULT; } /* * Extract and skip the syslog prefix <[0-9]*>. Coming from userspace * the decimal value represents 32bit, the lower 3 bit are the log * level, the rest are the log facility. 
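 *
 * For example, a line written as "<30>myservice: hello" (a purely
 * illustrative value) decodes to level LOG_LEVEL(30) = 6 and facility
 * LOG_FACILITY(30) = 3, so the record is stored with that facility and
 * level instead of the LOG_USER/default fallbacks.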
* * If no prefix or no userspace facility is specified, we * enforce LOG_USER, to be able to reliably distinguish * kernel-generated messages from userspace-injected ones. */ line = buf; if (line[0] == '<') { char *endp = NULL; unsigned int u; u = simple_strtoul(line + 1, &endp, 10); if (endp && endp[0] == '>') { level = LOG_LEVEL(u); if (LOG_FACILITY(u) != 0) facility = LOG_FACILITY(u); endp++; line = endp; } } devkmsg_emit(facility, level, "%s", line); kfree(buf); return ret; } static ssize_t devkmsg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { struct devkmsg_user *user = file->private_data; char *outbuf = &user->pbufs.outbuf[0]; struct printk_message pmsg = { .pbufs = &user->pbufs, }; ssize_t ret; ret = mutex_lock_interruptible(&user->lock); if (ret) return ret; if (!printk_get_next_message(&pmsg, atomic64_read(&user->seq), true, false)) { if (file->f_flags & O_NONBLOCK) { ret = -EAGAIN; goto out; } /* * Guarantee this task is visible on the waitqueue before * checking the wake condition. * * The full memory barrier within set_current_state() of * prepare_to_wait_event() pairs with the full memory barrier * within wq_has_sleeper(). * * This pairs with __wake_up_klogd:A. */ ret = wait_event_interruptible(log_wait, printk_get_next_message(&pmsg, atomic64_read(&user->seq), true, false)); /* LMM(devkmsg_read:A) */ if (ret) goto out; } if (pmsg.dropped) { /* our last seen message is gone, return error and reset */ atomic64_set(&user->seq, pmsg.seq); ret = -EPIPE; goto out; } atomic64_set(&user->seq, pmsg.seq + 1); if (pmsg.outbuf_len > count) { ret = -EINVAL; goto out; } if (copy_to_user(buf, outbuf, pmsg.outbuf_len)) { ret = -EFAULT; goto out; } ret = pmsg.outbuf_len; out: mutex_unlock(&user->lock); return ret; } /* * Be careful when modifying this function!!! * * Only few operations are supported because the device works only with the * entire variable length messages (records). Non-standard values are * returned in the other cases and has been this way for quite some time. * User space applications might depend on this behavior. */ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence) { struct devkmsg_user *user = file->private_data; loff_t ret = 0; if (offset) return -ESPIPE; switch (whence) { case SEEK_SET: /* the first record */ atomic64_set(&user->seq, prb_first_valid_seq(prb)); break; case SEEK_DATA: /* * The first record after the last SYSLOG_ACTION_CLEAR, * like issued by 'dmesg -c'. Reading /dev/kmsg itself * changes no global state, and does not clear anything. 
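 *
 * An illustrative userspace sketch (error handling omitted):
 *
 *	fd = open("/dev/kmsg", O_RDONLY | O_NONBLOCK);
 *	lseek(fd, 0, SEEK_DATA);
 *	while (read(fd, line, sizeof(line)) > 0)
 *		...;	(one complete record per read() call)
 *
 * With O_NONBLOCK, read() fails with EAGAIN once the reader has
 * caught up with the ringbuffer.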
*/ atomic64_set(&user->seq, latched_seq_read_nolock(&clear_seq)); break; case SEEK_END: /* after the last record */ atomic64_set(&user->seq, prb_next_seq(prb)); break; default: ret = -EINVAL; } return ret; } static __poll_t devkmsg_poll(struct file *file, poll_table *wait) { struct devkmsg_user *user = file->private_data; struct printk_info info; __poll_t ret = 0; poll_wait(file, &log_wait, wait); if (prb_read_valid_info(prb, atomic64_read(&user->seq), &info, NULL)) { /* return error when data has vanished underneath us */ if (info.seq != atomic64_read(&user->seq)) ret = EPOLLIN|EPOLLRDNORM|EPOLLERR|EPOLLPRI; else ret = EPOLLIN|EPOLLRDNORM; } return ret; } static int devkmsg_open(struct inode *inode, struct file *file) { struct devkmsg_user *user; int err; if (devkmsg_log & DEVKMSG_LOG_MASK_OFF) return -EPERM; /* write-only does not need any file context */ if ((file->f_flags & O_ACCMODE) != O_WRONLY) { err = check_syslog_permissions(SYSLOG_ACTION_READ_ALL, SYSLOG_FROM_READER); if (err) return err; } user = kvmalloc(sizeof(struct devkmsg_user), GFP_KERNEL); if (!user) return -ENOMEM; ratelimit_default_init(&user->rs); ratelimit_set_flags(&user->rs, RATELIMIT_MSG_ON_RELEASE); mutex_init(&user->lock); atomic64_set(&user->seq, prb_first_valid_seq(prb)); file->private_data = user; return 0; } static int devkmsg_release(struct inode *inode, struct file *file) { struct devkmsg_user *user = file->private_data; ratelimit_state_exit(&user->rs); mutex_destroy(&user->lock); kvfree(user); return 0; } const struct file_operations kmsg_fops = { .open = devkmsg_open, .read = devkmsg_read, .write_iter = devkmsg_write, .llseek = devkmsg_llseek, .poll = devkmsg_poll, .release = devkmsg_release, }; #ifdef CONFIG_VMCORE_INFO /* * This appends the listed symbols to /proc/vmcore * * /proc/vmcore is used by various utilities, like crash and makedumpfile to * obtain access to symbols that are otherwise very difficult to locate. These * symbols are specifically used so that utilities can access and extract the * dmesg log from a vmcore file after a crash. */ void log_buf_vmcoreinfo_setup(void) { struct dev_printk_info *dev_info = NULL; VMCOREINFO_SYMBOL(prb); VMCOREINFO_SYMBOL(printk_rb_static); VMCOREINFO_SYMBOL(clear_seq); /* * Export struct size and field offsets. User space tools can * parse it and detect any changes to structure down the line. 
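 *
 * (Illustrative note: a post-mortem tool can locate the exported @prb
 * symbol in a memory image and then use these sizes and offsets to
 * walk the descriptor and text data rings and re-assemble the dmesg
 * text, without hard-coding the struct layout of the crashed kernel.)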
*/ VMCOREINFO_STRUCT_SIZE(printk_ringbuffer); VMCOREINFO_OFFSET(printk_ringbuffer, desc_ring); VMCOREINFO_OFFSET(printk_ringbuffer, text_data_ring); VMCOREINFO_OFFSET(printk_ringbuffer, fail); VMCOREINFO_STRUCT_SIZE(prb_desc_ring); VMCOREINFO_OFFSET(prb_desc_ring, count_bits); VMCOREINFO_OFFSET(prb_desc_ring, descs); VMCOREINFO_OFFSET(prb_desc_ring, infos); VMCOREINFO_OFFSET(prb_desc_ring, head_id); VMCOREINFO_OFFSET(prb_desc_ring, tail_id); VMCOREINFO_STRUCT_SIZE(prb_desc); VMCOREINFO_OFFSET(prb_desc, state_var); VMCOREINFO_OFFSET(prb_desc, text_blk_lpos); VMCOREINFO_STRUCT_SIZE(prb_data_blk_lpos); VMCOREINFO_OFFSET(prb_data_blk_lpos, begin); VMCOREINFO_OFFSET(prb_data_blk_lpos, next); VMCOREINFO_STRUCT_SIZE(printk_info); VMCOREINFO_OFFSET(printk_info, seq); VMCOREINFO_OFFSET(printk_info, ts_nsec); VMCOREINFO_OFFSET(printk_info, text_len); VMCOREINFO_OFFSET(printk_info, caller_id); VMCOREINFO_OFFSET(printk_info, dev_info); VMCOREINFO_STRUCT_SIZE(dev_printk_info); VMCOREINFO_OFFSET(dev_printk_info, subsystem); VMCOREINFO_LENGTH(printk_info_subsystem, sizeof(dev_info->subsystem)); VMCOREINFO_OFFSET(dev_printk_info, device); VMCOREINFO_LENGTH(printk_info_device, sizeof(dev_info->device)); VMCOREINFO_STRUCT_SIZE(prb_data_ring); VMCOREINFO_OFFSET(prb_data_ring, size_bits); VMCOREINFO_OFFSET(prb_data_ring, data); VMCOREINFO_OFFSET(prb_data_ring, head_lpos); VMCOREINFO_OFFSET(prb_data_ring, tail_lpos); VMCOREINFO_SIZE(atomic_long_t); VMCOREINFO_TYPE_OFFSET(atomic_long_t, counter); VMCOREINFO_STRUCT_SIZE(latched_seq); VMCOREINFO_OFFSET(latched_seq, val); } #endif /* requested log_buf_len from kernel cmdline */ static unsigned long __initdata new_log_buf_len; /* we practice scaling the ring buffer by powers of 2 */ static void __init log_buf_len_update(u64 size) { if (size > (u64)LOG_BUF_LEN_MAX) { size = (u64)LOG_BUF_LEN_MAX; pr_err("log_buf over 2G is not supported.\n"); } if (size) size = roundup_pow_of_two(size); if (size > log_buf_len) new_log_buf_len = (unsigned long)size; } /* save requested log_buf_len since it's too early to process it */ static int __init log_buf_len_setup(char *str) { u64 size; if (!str) return -EINVAL; size = memparse(str, &str); log_buf_len_update(size); return 0; } early_param("log_buf_len", log_buf_len_setup); #ifdef CONFIG_SMP #define __LOG_CPU_MAX_BUF_LEN (1 << CONFIG_LOG_CPU_MAX_BUF_SHIFT) static void __init log_buf_add_cpu(void) { unsigned int cpu_extra; /* * archs should set up cpu_possible_bits properly with * set_cpu_possible() after setup_arch() but just in * case lets ensure this is valid. 
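 *
 * Illustrative sizing example (the numbers depend on the .config):
 * with CONFIG_LOG_BUF_SHIFT=17 (128 KiB static buffer) and
 * CONFIG_LOG_CPU_MAX_BUF_SHIFT=12 (4 KiB per CPU), a machine with 64
 * possible CPUs contributes cpu_extra = 63 * 4 KiB = 252 KiB. That is
 * more than half of the static buffer, so the dynamic buffer is sized
 * to roundup_pow_of_two(252 KiB + 128 KiB) = 512 KiB.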
*/ if (num_possible_cpus() == 1) return; cpu_extra = (num_possible_cpus() - 1) * __LOG_CPU_MAX_BUF_LEN; /* by default this will only continue through for large > 64 CPUs */ if (cpu_extra <= __LOG_BUF_LEN / 2) return; pr_info("log_buf_len individual max cpu contribution: %d bytes\n", __LOG_CPU_MAX_BUF_LEN); pr_info("log_buf_len total cpu_extra contributions: %d bytes\n", cpu_extra); pr_info("log_buf_len min size: %d bytes\n", __LOG_BUF_LEN); log_buf_len_update(cpu_extra + __LOG_BUF_LEN); } #else /* !CONFIG_SMP */ static inline void log_buf_add_cpu(void) {} #endif /* CONFIG_SMP */ static void __init set_percpu_data_ready(void) { __printk_percpu_data_ready = true; } static unsigned int __init add_to_rb(struct printk_ringbuffer *rb, struct printk_record *r) { struct prb_reserved_entry e; struct printk_record dest_r; prb_rec_init_wr(&dest_r, r->info->text_len); if (!prb_reserve(&e, rb, &dest_r)) return 0; memcpy(&dest_r.text_buf[0], &r->text_buf[0], r->info->text_len); dest_r.info->text_len = r->info->text_len; dest_r.info->facility = r->info->facility; dest_r.info->level = r->info->level; dest_r.info->flags = r->info->flags; dest_r.info->ts_nsec = r->info->ts_nsec; dest_r.info->caller_id = r->info->caller_id; memcpy(&dest_r.info->dev_info, &r->info->dev_info, sizeof(dest_r.info->dev_info)); prb_final_commit(&e); return prb_record_text_space(&e); } static char setup_text_buf[PRINTKRB_RECORD_MAX] __initdata; static void print_log_buf_usage_stats(void) { unsigned int descs_count = log_buf_len >> PRB_AVGBITS; size_t meta_data_size; meta_data_size = descs_count * (sizeof(struct prb_desc) + sizeof(struct printk_info)); pr_info("log buffer data + meta data: %u + %zu = %zu bytes\n", log_buf_len, meta_data_size, log_buf_len + meta_data_size); } void __init setup_log_buf(int early) { struct printk_info *new_infos; unsigned int new_descs_count; struct prb_desc *new_descs; struct printk_info info; struct printk_record r; unsigned int text_size; size_t new_descs_size; size_t new_infos_size; unsigned long flags; char *new_log_buf; unsigned int free; u64 seq; /* * Some archs call setup_log_buf() multiple times - first is very * early, e.g. from setup_arch(), and second - when percpu_areas * are initialised. */ if (!early) set_percpu_data_ready(); if (log_buf != __log_buf) return; if (!early && !new_log_buf_len) log_buf_add_cpu(); if (!new_log_buf_len) { /* Show the memory stats only once. 
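 * Only the !early call jumps to the out: label below and prints them;
 * the early call simply returns, so the stats are not printed twice on
 * architectures that call setup_log_buf() more than once.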
*/ if (!early) goto out; return; } new_descs_count = new_log_buf_len >> PRB_AVGBITS; if (new_descs_count == 0) { pr_err("new_log_buf_len: %lu too small\n", new_log_buf_len); goto out; } new_log_buf = memblock_alloc(new_log_buf_len, LOG_ALIGN); if (unlikely(!new_log_buf)) { pr_err("log_buf_len: %lu text bytes not available\n", new_log_buf_len); goto out; } new_descs_size = new_descs_count * sizeof(struct prb_desc); new_descs = memblock_alloc(new_descs_size, LOG_ALIGN); if (unlikely(!new_descs)) { pr_err("log_buf_len: %zu desc bytes not available\n", new_descs_size); goto err_free_log_buf; } new_infos_size = new_descs_count * sizeof(struct printk_info); new_infos = memblock_alloc(new_infos_size, LOG_ALIGN); if (unlikely(!new_infos)) { pr_err("log_buf_len: %zu info bytes not available\n", new_infos_size); goto err_free_descs; } prb_rec_init_rd(&r, &info, &setup_text_buf[0], sizeof(setup_text_buf)); prb_init(&printk_rb_dynamic, new_log_buf, ilog2(new_log_buf_len), new_descs, ilog2(new_descs_count), new_infos); local_irq_save(flags); log_buf_len = new_log_buf_len; log_buf = new_log_buf; new_log_buf_len = 0; free = __LOG_BUF_LEN; prb_for_each_record(0, &printk_rb_static, seq, &r) { text_size = add_to_rb(&printk_rb_dynamic, &r); if (text_size > free) free = 0; else free -= text_size; } prb = &printk_rb_dynamic; local_irq_restore(flags); /* * Copy any remaining messages that might have appeared from * NMI context after copying but before switching to the * dynamic buffer. */ prb_for_each_record(seq, &printk_rb_static, seq, &r) { text_size = add_to_rb(&printk_rb_dynamic, &r); if (text_size > free) free = 0; else free -= text_size; } if (seq != prb_next_seq(&printk_rb_static)) { pr_err("dropped %llu messages\n", prb_next_seq(&printk_rb_static) - seq); } print_log_buf_usage_stats(); pr_info("early log buf free: %u(%u%%)\n", free, (free * 100) / __LOG_BUF_LEN); return; err_free_descs: memblock_free(new_descs, new_descs_size); err_free_log_buf: memblock_free(new_log_buf, new_log_buf_len); out: print_log_buf_usage_stats(); } static bool __read_mostly ignore_loglevel; static int __init ignore_loglevel_setup(char *str) { ignore_loglevel = true; pr_info("debug: ignoring loglevel setting.\n"); return 0; } early_param("ignore_loglevel", ignore_loglevel_setup); module_param(ignore_loglevel, bool, S_IRUGO | S_IWUSR); MODULE_PARM_DESC(ignore_loglevel, "ignore loglevel setting (prints all kernel messages to the console)"); static bool suppress_message_printing(int level) { return (level >= console_loglevel && !ignore_loglevel); } #ifdef CONFIG_BOOT_PRINTK_DELAY static int boot_delay; /* msecs delay after each printk during bootup */ static unsigned long long loops_per_msec; /* based on boot_delay */ static int __init boot_delay_setup(char *str) { unsigned long lpj; lpj = preset_lpj ? 
preset_lpj : 1000000; /* some guess */ loops_per_msec = (unsigned long long)lpj / 1000 * HZ; get_option(&str, &boot_delay); if (boot_delay > 10 * 1000) boot_delay = 0; pr_debug("boot_delay: %u, preset_lpj: %ld, lpj: %lu, " "HZ: %d, loops_per_msec: %llu\n", boot_delay, preset_lpj, lpj, HZ, loops_per_msec); return 0; } early_param("boot_delay", boot_delay_setup); static void boot_delay_msec(int level) { unsigned long long k; unsigned long timeout; bool suppress = !is_printk_force_console() && suppress_message_printing(level); if ((boot_delay == 0 || system_state >= SYSTEM_RUNNING) || suppress) return; k = (unsigned long long)loops_per_msec * boot_delay; timeout = jiffies + msecs_to_jiffies(boot_delay); while (k) { k--; cpu_relax(); /* * use (volatile) jiffies to prevent * compiler reduction; loop termination via jiffies * is secondary and may or may not happen. */ if (time_after(jiffies, timeout)) break; touch_nmi_watchdog(); } } #else static inline void boot_delay_msec(int level) { } #endif static bool printk_time = IS_ENABLED(CONFIG_PRINTK_TIME); module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR); static size_t print_syslog(unsigned int level, char *buf) { return sprintf(buf, "<%u>", level); } static size_t print_time(u64 ts, char *buf) { unsigned long rem_nsec = do_div(ts, 1000000000); return sprintf(buf, "[%5lu.%06lu]", (unsigned long)ts, rem_nsec / 1000); } #ifdef CONFIG_PRINTK_CALLER static size_t print_caller(u32 id, char *buf) { char caller[12]; snprintf(caller, sizeof(caller), "%c%u", id & 0x80000000 ? 'C' : 'T', id & ~0x80000000); return sprintf(buf, "[%6s]", caller); } #else #define print_caller(id, buf) 0 #endif static size_t info_print_prefix(const struct printk_info *info, bool syslog, bool time, char *buf) { size_t len = 0; if (syslog) len = print_syslog((info->facility << 3) | info->level, buf); if (time) len += print_time(info->ts_nsec, buf + len); len += print_caller(info->caller_id, buf + len); if (IS_ENABLED(CONFIG_PRINTK_CALLER) || time) { buf[len++] = ' '; buf[len] = '\0'; } return len; } /* * Prepare the record for printing. The text is shifted within the given * buffer to avoid a need for another one. The following operations are * done: * * - Add prefix for each line. * - Drop truncated lines that no longer fit into the buffer. * - Add the trailing newline that has been removed in vprintk_store(). * - Add a string terminator. * * Since the produced string is always terminated, the maximum possible * return value is @r->text_buf_size - 1; * * Return: The length of the updated/prepared text, including the added * prefixes and the newline. The terminator is not counted. The dropped * line(s) are not counted. */ static size_t record_print_text(struct printk_record *r, bool syslog, bool time) { size_t text_len = r->info->text_len; size_t buf_size = r->text_buf_size; char *text = r->text_buf; char prefix[PRINTK_PREFIX_MAX]; bool truncated = false; size_t prefix_len; size_t line_len; size_t len = 0; char *next; /* * If the message was truncated because the buffer was not large * enough, treat the available text as if it were the full text. */ if (text_len > buf_size) text_len = buf_size; prefix_len = info_print_prefix(r->info, syslog, time, prefix); /* * @text_len: bytes of unprocessed text * @line_len: bytes of current line _without_ newline * @text: pointer to beginning of current line * @len: number of bytes prepared in r->text_buf */ for (;;) { next = memchr(text, '\n', text_len); if (next) { line_len = next - text; } else { /* Drop truncated line(s). 
*/ if (truncated) break; line_len = text_len; } /* * Truncate the text if there is not enough space to add the * prefix and a trailing newline and a terminator. */ if (len + prefix_len + text_len + 1 + 1 > buf_size) { /* Drop even the current line if no space. */ if (len + prefix_len + line_len + 1 + 1 > buf_size) break; text_len = buf_size - len - prefix_len - 1 - 1; truncated = true; } memmove(text + prefix_len, text, text_len); memcpy(text, prefix, prefix_len); /* * Increment the prepared length to include the text and * prefix that were just moved+copied. Also increment for the * newline at the end of this line. If this is the last line, * there is no newline, but it will be added immediately below. */ len += prefix_len + line_len + 1; if (text_len == line_len) { /* * This is the last line. Add the trailing newline * removed in vprintk_store(). */ text[prefix_len + line_len] = '\n'; break; } /* * Advance beyond the added prefix and the related line with * its newline. */ text += prefix_len + line_len + 1; /* * The remaining text has only decreased by the line with its * newline. * * Note that @text_len can become zero. It happens when @text * ended with a newline (either due to truncation or the * original string ending with "\n\n"). The loop is correctly * repeated and (if not truncated) an empty line with a prefix * will be prepared. */ text_len -= line_len + 1; } /* * If a buffer was provided, it will be terminated. Space for the * string terminator is guaranteed to be available. The terminator is * not counted in the return value. */ if (buf_size > 0) r->text_buf[len] = 0; return len; } static size_t get_record_print_text_size(struct printk_info *info, unsigned int line_count, bool syslog, bool time) { char prefix[PRINTK_PREFIX_MAX]; size_t prefix_len; prefix_len = info_print_prefix(info, syslog, time, prefix); /* * Each line will be preceded with a prefix. The intermediate * newlines are already within the text, but a final trailing * newline will be added. */ return ((prefix_len * line_count) + info->text_len + 1); } /* * Beginning with @start_seq, find the first record where it and all following * records up to (but not including) @max_seq fit into @size. * * @max_seq is simply an upper bound and does not need to exist. If the caller * does not require an upper bound, -1 can be used for @max_seq. */ static u64 find_first_fitting_seq(u64 start_seq, u64 max_seq, size_t size, bool syslog, bool time) { struct printk_info info; unsigned int line_count; size_t len = 0; u64 seq; /* Determine the size of the records up to @max_seq. */ prb_for_each_info(start_seq, prb, seq, &info, &line_count) { if (info.seq >= max_seq) break; len += get_record_print_text_size(&info, line_count, syslog, time); } /* * Adjust the upper bound for the next loop to avoid subtracting * lengths that were never added. */ if (seq < max_seq) max_seq = seq; /* * Move first record forward until length fits into the buffer. Ignore * newest messages that were not counted in the above cycle. Messages * might appear and get lost in the meantime. This is a best effort * that prevents an infinite loop that could occur with a retry. */ prb_for_each_info(start_seq, prb, seq, &info, &line_count) { if (len <= size || info.seq >= max_seq) break; len -= get_record_print_text_size(&info, line_count, syslog, time); } return seq; } /* The caller is responsible for making sure @size is greater than 0. 
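 * (do_syslog() guarantees this for the SYSLOG_ACTION_READ path by
 * returning early when the requested length is zero.)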
*/ static int syslog_print(char __user *buf, int size) { struct printk_info info; struct printk_record r; char *text; int len = 0; u64 seq; text = kmalloc(PRINTK_MESSAGE_MAX, GFP_KERNEL); if (!text) return -ENOMEM; prb_rec_init_rd(&r, &info, text, PRINTK_MESSAGE_MAX); mutex_lock(&syslog_lock); /* * Wait for the @syslog_seq record to be available. @syslog_seq may * change while waiting. */ do { seq = syslog_seq; mutex_unlock(&syslog_lock); /* * Guarantee this task is visible on the waitqueue before * checking the wake condition. * * The full memory barrier within set_current_state() of * prepare_to_wait_event() pairs with the full memory barrier * within wq_has_sleeper(). * * This pairs with __wake_up_klogd:A. */ len = wait_event_interruptible(log_wait, prb_read_valid(prb, seq, NULL)); /* LMM(syslog_print:A) */ mutex_lock(&syslog_lock); if (len) goto out; } while (syslog_seq != seq); /* * Copy records that fit into the buffer. The above cycle makes sure * that the first record is always available. */ do { size_t n; size_t skip; int err; if (!prb_read_valid(prb, syslog_seq, &r)) break; if (r.info->seq != syslog_seq) { /* message is gone, move to next valid one */ syslog_seq = r.info->seq; syslog_partial = 0; } /* * To keep reading/counting partial line consistent, * use printk_time value as of the beginning of a line. */ if (!syslog_partial) syslog_time = printk_time; skip = syslog_partial; n = record_print_text(&r, true, syslog_time); if (n - syslog_partial <= size) { /* message fits into buffer, move forward */ syslog_seq = r.info->seq + 1; n -= syslog_partial; syslog_partial = 0; } else if (!len){ /* partial read(), remember position */ n = size; syslog_partial += n; } else n = 0; if (!n) break; mutex_unlock(&syslog_lock); err = copy_to_user(buf, text + skip, n); mutex_lock(&syslog_lock); if (err) { if (!len) len = -EFAULT; break; } len += n; size -= n; buf += n; } while (size); out: mutex_unlock(&syslog_lock); kfree(text); return len; } static int syslog_print_all(char __user *buf, int size, bool clear) { struct printk_info info; struct printk_record r; char *text; int len = 0; u64 seq; bool time; text = kmalloc(PRINTK_MESSAGE_MAX, GFP_KERNEL); if (!text) return -ENOMEM; time = printk_time; /* * Find first record that fits, including all following records, * into the user-provided buffer for this dump. 
*/ seq = find_first_fitting_seq(latched_seq_read_nolock(&clear_seq), -1, size, true, time); prb_rec_init_rd(&r, &info, text, PRINTK_MESSAGE_MAX); prb_for_each_record(seq, prb, seq, &r) { int textlen; textlen = record_print_text(&r, true, time); if (len + textlen > size) { seq--; break; } if (copy_to_user(buf + len, text, textlen)) len = -EFAULT; else len += textlen; if (len < 0) break; } if (clear) { mutex_lock(&syslog_lock); latched_seq_write(&clear_seq, seq); mutex_unlock(&syslog_lock); } kfree(text); return len; } static void syslog_clear(void) { mutex_lock(&syslog_lock); latched_seq_write(&clear_seq, prb_next_seq(prb)); mutex_unlock(&syslog_lock); } int do_syslog(int type, char __user *buf, int len, int source) { struct printk_info info; bool clear = false; static int saved_console_loglevel = LOGLEVEL_DEFAULT; int error; error = check_syslog_permissions(type, source); if (error) return error; switch (type) { case SYSLOG_ACTION_CLOSE: /* Close log */ break; case SYSLOG_ACTION_OPEN: /* Open log */ break; case SYSLOG_ACTION_READ: /* Read from log */ if (!buf || len < 0) return -EINVAL; if (!len) return 0; if (!access_ok(buf, len)) return -EFAULT; error = syslog_print(buf, len); break; /* Read/clear last kernel messages */ case SYSLOG_ACTION_READ_CLEAR: clear = true; fallthrough; /* Read last kernel messages */ case SYSLOG_ACTION_READ_ALL: if (!buf || len < 0) return -EINVAL; if (!len) return 0; if (!access_ok(buf, len)) return -EFAULT; error = syslog_print_all(buf, len, clear); break; /* Clear ring buffer */ case SYSLOG_ACTION_CLEAR: syslog_clear(); break; /* Disable logging to console */ case SYSLOG_ACTION_CONSOLE_OFF: if (saved_console_loglevel == LOGLEVEL_DEFAULT) saved_console_loglevel = console_loglevel; console_loglevel = minimum_console_loglevel; break; /* Enable logging to console */ case SYSLOG_ACTION_CONSOLE_ON: if (saved_console_loglevel != LOGLEVEL_DEFAULT) { console_loglevel = saved_console_loglevel; saved_console_loglevel = LOGLEVEL_DEFAULT; } break; /* Set level of messages printed to console */ case SYSLOG_ACTION_CONSOLE_LEVEL: if (len < 1 || len > 8) return -EINVAL; if (len < minimum_console_loglevel) len = minimum_console_loglevel; console_loglevel = len; /* Implicitly re-enable logging to console */ saved_console_loglevel = LOGLEVEL_DEFAULT; break; /* Number of chars in the log buffer */ case SYSLOG_ACTION_SIZE_UNREAD: mutex_lock(&syslog_lock); if (!prb_read_valid_info(prb, syslog_seq, &info, NULL)) { /* No unread messages. */ mutex_unlock(&syslog_lock); return 0; } if (info.seq != syslog_seq) { /* messages are gone, move to first one */ syslog_seq = info.seq; syslog_partial = 0; } if (source == SYSLOG_FROM_PROC) { /* * Short-cut for poll(/"proc/kmsg") which simply checks * for pending data, not the size; return the count of * records, not the length. */ error = prb_next_seq(prb) - syslog_seq; } else { bool time = syslog_partial ? syslog_time : printk_time; unsigned int line_count; u64 seq; prb_for_each_info(syslog_seq, prb, seq, &info, &line_count) { error += get_record_print_text_size(&info, line_count, true, time); time = printk_time; } error -= syslog_partial; } mutex_unlock(&syslog_lock); break; /* Size of the log buffer */ case SYSLOG_ACTION_SIZE_BUFFER: error = log_buf_len; break; default: error = -EINVAL; break; } return error; } SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len) { return do_syslog(type, buf, len, SYSLOG_FROM_READER); } /* * Special console_lock variants that help to reduce the risk of soft-lockups. 
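 * (A rough sketch of the owner side; the legacy console flushing code
 * further down is the real user:
 *
 *	cookie = console_srcu_read_lock();
 *	console_lock_spinning_enable();
 *	...emit one record to the console drivers...
 *	if (console_lock_spinning_disable_and_check(cookie))
 *		return;		- console_lock and the SRCU read lock
 *				  were handed over to the waiter
 *	console_srcu_read_unlock(cookie);
 * )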
* They allow to pass console_lock to another printk() call using a busy wait. */ #ifdef CONFIG_LOCKDEP static struct lockdep_map console_owner_dep_map = { .name = "console_owner" }; #endif static DEFINE_RAW_SPINLOCK(console_owner_lock); static struct task_struct *console_owner; static bool console_waiter; /** * console_lock_spinning_enable - mark beginning of code where another * thread might safely busy wait * * This basically converts console_lock into a spinlock. This marks * the section where the console_lock owner can not sleep, because * there may be a waiter spinning (like a spinlock). Also it must be * ready to hand over the lock at the end of the section. */ void console_lock_spinning_enable(void) { /* * Do not use spinning in panic(). The panic CPU wants to keep the lock. * Non-panic CPUs abandon the flush anyway. * * Just keep the lockdep annotation. The panic-CPU should avoid * taking console_owner_lock because it might cause a deadlock. * This looks like the easiest way how to prevent false lockdep * reports without handling races a lockless way. */ if (panic_in_progress()) goto lockdep; raw_spin_lock(&console_owner_lock); console_owner = current; raw_spin_unlock(&console_owner_lock); lockdep: /* The waiter may spin on us after setting console_owner */ spin_acquire(&console_owner_dep_map, 0, 0, _THIS_IP_); } /** * console_lock_spinning_disable_and_check - mark end of code where another * thread was able to busy wait and check if there is a waiter * @cookie: cookie returned from console_srcu_read_lock() * * This is called at the end of the section where spinning is allowed. * It has two functions. First, it is a signal that it is no longer * safe to start busy waiting for the lock. Second, it checks if * there is a busy waiter and passes the lock rights to her. * * Important: Callers lose both the console_lock and the SRCU read lock if * there was a busy waiter. They must not touch items synchronized by * console_lock or SRCU read lock in this case. * * Return: 1 if the lock rights were passed, 0 otherwise. */ int console_lock_spinning_disable_and_check(int cookie) { int waiter; /* * Ignore spinning waiters during panic() because they might get stopped * or blocked at any time, * * It is safe because nobody is allowed to start spinning during panic * in the first place. If there has been a waiter then non panic CPUs * might stay spinning. They would get stopped anyway. The panic context * will never start spinning and an interrupted spin on panic CPU will * never continue. */ if (panic_in_progress()) { /* Keep lockdep happy. */ spin_release(&console_owner_dep_map, _THIS_IP_); return 0; } raw_spin_lock(&console_owner_lock); waiter = READ_ONCE(console_waiter); console_owner = NULL; raw_spin_unlock(&console_owner_lock); if (!waiter) { spin_release(&console_owner_dep_map, _THIS_IP_); return 0; } /* The waiter is now free to continue */ WRITE_ONCE(console_waiter, false); spin_release(&console_owner_dep_map, _THIS_IP_); /* * Preserve lockdep lock ordering. Release the SRCU read lock before * releasing the console_lock. */ console_srcu_read_unlock(cookie); /* * Hand off console_lock to waiter. The waiter will perform * the up(). After this, the waiter is the console_lock owner. */ mutex_release(&console_lock_dep_map, _THIS_IP_); return 1; } /** * console_trylock_spinning - try to get console_lock by busy waiting * * This allows to busy wait for the console_lock when the current * owner is running in specially marked sections. 
It means that * the current owner is running and cannot reschedule until it * is ready to lose the lock. * * Return: 1 if we got the lock, 0 otherwise */ static int console_trylock_spinning(void) { struct task_struct *owner = NULL; bool waiter; bool spin = false; unsigned long flags; if (console_trylock()) return 1; /* * It's unsafe to spin once a panic has begun. If we are the * panic CPU, we may have already halted the owner of the * console_sem. If we are not the panic CPU, then we should * avoid taking console_sem, so the panic CPU has a better * chance of cleanly acquiring it later. */ if (panic_in_progress()) return 0; printk_safe_enter_irqsave(flags); raw_spin_lock(&console_owner_lock); owner = READ_ONCE(console_owner); waiter = READ_ONCE(console_waiter); if (!waiter && owner && owner != current) { WRITE_ONCE(console_waiter, true); spin = true; } raw_spin_unlock(&console_owner_lock); /* * If there is an active printk() writing to the * consoles, instead of having it write our data too, * see if we can offload that load from the active * printer, and do some printing ourselves. * Go into a spin only if there isn't already a waiter * spinning, and there is an active printer, and * that active printer isn't us (recursive printk?). */ if (!spin) { printk_safe_exit_irqrestore(flags); return 0; } /* We spin waiting for the owner to release us */ spin_acquire(&console_owner_dep_map, 0, 0, _THIS_IP_); /* Owner will clear console_waiter on hand off */ while (READ_ONCE(console_waiter)) cpu_relax(); spin_release(&console_owner_dep_map, _THIS_IP_); printk_safe_exit_irqrestore(flags); /* * The owner passed the console lock to us. * Since we did not spin on console lock, annotate * this as a trylock. Otherwise lockdep will * complain. */ mutex_acquire(&console_lock_dep_map, 0, 1, _THIS_IP_); /* * Update @console_may_schedule for trylock because the previous * owner may have been schedulable. */ console_may_schedule = 0; return 1; } /* * Recursion is tracked separately on each CPU. If NMIs are supported, an * additional NMI context per CPU is also separately tracked. Until per-CPU * data is available, a separate "early tracking" is performed. */ static DEFINE_PER_CPU(u8, printk_count); static u8 printk_count_early; #ifdef CONFIG_HAVE_NMI static DEFINE_PER_CPU(u8, printk_count_nmi); static u8 printk_count_nmi_early; #endif /* * Recursion is limited to keep the output sane. printk() should not require * more than 1 level of recursion (allowing, for example, printk() to trigger * a WARN), but a higher value is used in case some printk-internal errors * exist, such as the ringbuffer validation checks failing. */ #define PRINTK_MAX_RECURSION 3 /* * Return a pointer to the dedicated counter for the CPU+context of the * caller. */ static u8 *__printk_recursion_counter(void) { #ifdef CONFIG_HAVE_NMI if (in_nmi()) { if (printk_percpu_data_ready()) return this_cpu_ptr(&printk_count_nmi); return &printk_count_nmi_early; } #endif if (printk_percpu_data_ready()) return this_cpu_ptr(&printk_count); return &printk_count_early; } /* * Enter recursion tracking. Interrupts are disabled to simplify tracking. * The caller must check the boolean return value to see if the recursion is * allowed. On failure, interrupts are not disabled. * * @recursion_ptr must be a variable of type (u8 *) and is the same variable * that is passed to printk_exit_irqrestore().
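 *
 * A minimal usage sketch, mirroring what vprintk_store() does below
 * (the variable names are only illustrative):
 *
 *	u8 *recursion_ptr;
 *	unsigned long irqflags;
 *
 *	if (!printk_enter_irqsave(recursion_ptr, irqflags))
 *		return 0;
 *	... do the ringbuffer work ...
 *	printk_exit_irqrestore(recursion_ptr, irqflags);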
*/ #define printk_enter_irqsave(recursion_ptr, flags) \ ({ \ bool success = true; \ \ typecheck(u8 *, recursion_ptr); \ local_irq_save(flags); \ (recursion_ptr) = __printk_recursion_counter(); \ if (*(recursion_ptr) > PRINTK_MAX_RECURSION) { \ local_irq_restore(flags); \ success = false; \ } else { \ (*(recursion_ptr))++; \ } \ success; \ }) /* Exit recursion tracking, restoring interrupts. */ #define printk_exit_irqrestore(recursion_ptr, flags) \ do { \ typecheck(u8 *, recursion_ptr); \ (*(recursion_ptr))--; \ local_irq_restore(flags); \ } while (0) int printk_delay_msec __read_mostly; static inline void printk_delay(int level) { boot_delay_msec(level); if (unlikely(printk_delay_msec)) { int m = printk_delay_msec; while (m--) { mdelay(1); touch_nmi_watchdog(); } } } static inline u32 printk_caller_id(void) { return in_task() ? task_pid_nr(current) : 0x80000000 + smp_processor_id(); } /** * printk_parse_prefix - Parse level and control flags. * * @text: The terminated text message. * @level: A pointer to the current level value, will be updated. * @flags: A pointer to the current printk_info flags, will be updated. * * @level may be NULL if the caller is not interested in the parsed value. * Otherwise the variable pointed to by @level must be set to * LOGLEVEL_DEFAULT in order to be updated with the parsed value. * * @flags may be NULL if the caller is not interested in the parsed value. * Otherwise the variable pointed to by @flags will be OR'd with the parsed * value. * * Return: The length of the parsed level and control flags. */ u16 printk_parse_prefix(const char *text, int *level, enum printk_info_flags *flags) { u16 prefix_len = 0; int kern_level; while (*text) { kern_level = printk_get_level(text); if (!kern_level) break; switch (kern_level) { case '0' ... '7': if (level && *level == LOGLEVEL_DEFAULT) *level = kern_level - '0'; break; case 'c': /* KERN_CONT */ if (flags) *flags |= LOG_CONT; } prefix_len += 2; text += 2; } return prefix_len; } __printf(5, 0) static u16 printk_sprint(char *text, u16 size, int facility, enum printk_info_flags *flags, const char *fmt, va_list args) { u16 text_len; text_len = vscnprintf(text, size, fmt, args); /* Mark and strip a trailing newline. */ if (text_len && text[text_len - 1] == '\n') { text_len--; *flags |= LOG_NEWLINE; } /* Strip log level and control flags. */ if (facility == 0) { u16 prefix_len; prefix_len = printk_parse_prefix(text, NULL, NULL); if (prefix_len) { text_len -= prefix_len; memmove(text, text + prefix_len, text_len); } } trace_console(text, text_len); return text_len; } __printf(4, 0) int vprintk_store(int facility, int level, const struct dev_printk_info *dev_info, const char *fmt, va_list args) { struct prb_reserved_entry e; enum printk_info_flags flags = 0; struct printk_record r; unsigned long irqflags; u16 trunc_msg_len = 0; char prefix_buf[8]; u8 *recursion_ptr; u16 reserve_size; va_list args2; u32 caller_id; u16 text_len; int ret = 0; u64 ts_nsec; if (!printk_enter_irqsave(recursion_ptr, irqflags)) return 0; /* * Since the duration of printk() can vary depending on the message * and state of the ringbuffer, grab the timestamp now so that it is * close to the call of printk(). This provides a more deterministic * timestamp with respect to the caller. */ ts_nsec = local_clock(); caller_id = printk_caller_id(); /* * The sprintf needs to come first since the syslog prefix might be * passed in as a parameter. 
An extra byte must be reserved so that * later the vscnprintf() into the reserved buffer has room for the * terminating '\0', which is not counted by vsnprintf(). */ va_copy(args2, args); reserve_size = vsnprintf(&prefix_buf[0], sizeof(prefix_buf), fmt, args2) + 1; va_end(args2); if (reserve_size > PRINTKRB_RECORD_MAX) reserve_size = PRINTKRB_RECORD_MAX; /* Extract log level or control flags. */ if (facility == 0) printk_parse_prefix(&prefix_buf[0], &level, &flags); if (level == LOGLEVEL_DEFAULT) level = default_message_loglevel; if (dev_info) flags |= LOG_NEWLINE; if (is_printk_force_console()) flags |= LOG_FORCE_CON; if (flags & LOG_CONT) { prb_rec_init_wr(&r, reserve_size); if (prb_reserve_in_last(&e, prb, &r, caller_id, PRINTKRB_RECORD_MAX)) { text_len = printk_sprint(&r.text_buf[r.info->text_len], reserve_size, facility, &flags, fmt, args); r.info->text_len += text_len; if (flags & LOG_FORCE_CON) r.info->flags |= LOG_FORCE_CON; if (flags & LOG_NEWLINE) { r.info->flags |= LOG_NEWLINE; prb_final_commit(&e); } else { prb_commit(&e); } ret = text_len; goto out; } } /* * Explicitly initialize the record before every prb_reserve() call. * prb_reserve_in_last() and prb_reserve() purposely invalidate the * structure when they fail. */ prb_rec_init_wr(&r, reserve_size); if (!prb_reserve(&e, prb, &r)) { /* truncate the message if it is too long for empty buffer */ truncate_msg(&reserve_size, &trunc_msg_len); prb_rec_init_wr(&r, reserve_size + trunc_msg_len); if (!prb_reserve(&e, prb, &r)) goto out; } /* fill message */ text_len = printk_sprint(&r.text_buf[0], reserve_size, facility, &flags, fmt, args); if (trunc_msg_len) memcpy(&r.text_buf[text_len], trunc_msg, trunc_msg_len); r.info->text_len = text_len + trunc_msg_len; r.info->facility = facility; r.info->level = level & 7; r.info->flags = flags & 0x1f; r.info->ts_nsec = ts_nsec; r.info->caller_id = caller_id; if (dev_info) memcpy(&r.info->dev_info, dev_info, sizeof(r.info->dev_info)); /* A message without a trailing newline can be continued. */ if (!(flags & LOG_NEWLINE)) prb_commit(&e); else prb_final_commit(&e); ret = text_len + trunc_msg_len; out: printk_exit_irqrestore(recursion_ptr, irqflags); return ret; } /* * This acts as a one-way switch to allow legacy consoles to print from * the printk() caller context on a panic CPU. It also attempts to flush * the legacy consoles in this context. */ void printk_legacy_allow_panic_sync(void) { struct console_flush_type ft; legacy_allow_panic_sync = true; printk_get_console_flush_type(&ft); if (ft.legacy_direct) { if (console_trylock()) console_unlock(); } } asmlinkage int vprintk_emit(int facility, int level, const struct dev_printk_info *dev_info, const char *fmt, va_list args) { struct console_flush_type ft; int printed_len; /* Suppress unimportant messages after panic happens */ if (unlikely(suppress_printk)) return 0; /* * The messages on the panic CPU are the most important. If * non-panic CPUs are generating any messages, they will be * silently dropped. */ if (other_cpu_in_panic() && !panic_triggering_all_cpu_backtrace) return 0; printk_get_console_flush_type(&ft); /* If called from the scheduler, we can not call up(). 
*/ if (level == LOGLEVEL_SCHED) { level = LOGLEVEL_DEFAULT; ft.legacy_offload |= ft.legacy_direct; ft.legacy_direct = false; } printk_delay(level); printed_len = vprintk_store(facility, level, dev_info, fmt, args); if (ft.nbcon_atomic) nbcon_atomic_flush_pending(); if (ft.nbcon_offload) nbcon_kthreads_wake(); if (ft.legacy_direct) { /* * The caller may be holding system-critical or * timing-sensitive locks. Disable preemption during * printing of all remaining records to all consoles so that * this context can return as soon as possible. Hopefully * another printk() caller will take over the printing. */ preempt_disable(); /* * Try to acquire and then immediately release the console * semaphore. The release will print out buffers. With the * spinning variant, this context tries to take over the * printing from another printing context. */ if (console_trylock_spinning()) console_unlock(); preempt_enable(); } if (ft.legacy_offload) defer_console_output(); else wake_up_klogd(); return printed_len; } EXPORT_SYMBOL(vprintk_emit); int vprintk_default(const char *fmt, va_list args) { return vprintk_emit(0, LOGLEVEL_DEFAULT, NULL, fmt, args); } EXPORT_SYMBOL_GPL(vprintk_default); asmlinkage __visible int _printk(const char *fmt, ...) { va_list args; int r; va_start(args, fmt); r = vprintk(fmt, args); va_end(args); return r; } EXPORT_SYMBOL(_printk); static bool pr_flush(int timeout_ms, bool reset_on_progress); static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progress); #else /* CONFIG_PRINTK */ #define printk_time false #define prb_read_valid(rb, seq, r) false #define prb_first_valid_seq(rb) 0 #define prb_next_seq(rb) 0 static u64 syslog_seq; static bool pr_flush(int timeout_ms, bool reset_on_progress) { return true; } static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progress) { return true; } #endif /* CONFIG_PRINTK */ #ifdef CONFIG_EARLY_PRINTK struct console *early_console; asmlinkage __visible void early_printk(const char *fmt, ...) { va_list ap; char buf[512]; int n; if (!early_console) return; va_start(ap, fmt); n = vscnprintf(buf, sizeof(buf), fmt, ap); va_end(ap); early_console->write(early_console, buf, n); } #endif static void set_user_specified(struct console_cmdline *c, bool user_specified) { if (!user_specified) return; /* * @c console was defined by the user on the command line. * Do not clear when added twice also by SPCR or the device tree. */ c->user_specified = true; /* At least one console defined by the user on the command line. */ console_set_on_cmdline = 1; } static int __add_preferred_console(const char *name, const short idx, const char *devname, char *options, char *brl_options, bool user_specified) { struct console_cmdline *c; int i; if (!name && !devname) return -EINVAL; /* * We use a signed short index for struct console for device drivers to * indicate a not yet assigned index or port. However, a negative index * value is not valid when the console name and index are defined on * the command line. */ if (name && idx < 0) return -EINVAL; /* * See if this tty is not yet registered, and * if we have a slot free. 
*/ for (i = 0, c = console_cmdline; i < MAX_CMDLINECONSOLES && (c->name[0] || c->devname[0]); i++, c++) { if ((name && strcmp(c->name, name) == 0 && c->index == idx) || (devname && strcmp(c->devname, devname) == 0)) { if (!brl_options) preferred_console = i; set_user_specified(c, user_specified); return 0; } } if (i == MAX_CMDLINECONSOLES) return -E2BIG; if (!brl_options) preferred_console = i; if (name) strscpy(c->name, name); if (devname) strscpy(c->devname, devname); c->options = options; set_user_specified(c, user_specified); braille_set_options(c, brl_options); c->index = idx; return 0; } static int __init console_msg_format_setup(char *str) { if (!strcmp(str, "syslog")) console_msg_format = MSG_FORMAT_SYSLOG; if (!strcmp(str, "default")) console_msg_format = MSG_FORMAT_DEFAULT; return 1; } __setup("console_msg_format=", console_msg_format_setup); /* * Set up a console. Called via do_early_param() in init/main.c * for each "console=" parameter in the boot command line. */ static int __init console_setup(char *str) { static_assert(sizeof(console_cmdline[0].devname) >= sizeof(console_cmdline[0].name) + 4); char buf[sizeof(console_cmdline[0].devname)]; char *brl_options = NULL; char *ttyname = NULL; char *devname = NULL; char *options; char *s; int idx; /* * console="" or console=null have been suggested as a way to * disable console output. Use ttynull that has been created * for exactly this purpose. */ if (str[0] == 0 || strcmp(str, "null") == 0) { __add_preferred_console("ttynull", 0, NULL, NULL, NULL, true); return 1; } if (_braille_console_setup(&str, &brl_options)) return 1; /* For a DEVNAME:0.0 style console the character device is unknown early */ if (strchr(str, ':')) devname = buf; else ttyname = buf; /* * Decode str into name, index, options. */ if (ttyname && isdigit(str[0])) scnprintf(buf, sizeof(buf), "ttyS%s", str); else strscpy(buf, str); options = strchr(str, ','); if (options) *(options++) = 0; #ifdef __sparc__ if (!strcmp(str, "ttya")) strscpy(buf, "ttyS0"); if (!strcmp(str, "ttyb")) strscpy(buf, "ttyS1"); #endif for (s = buf; *s; s++) if ((ttyname && isdigit(*s)) || *s == ',') break; /* @idx will get defined when devname matches. */ if (devname) idx = -1; else idx = simple_strtoul(s, NULL, 10); *s = 0; __add_preferred_console(ttyname, idx, devname, options, brl_options, true); return 1; } __setup("console=", console_setup); /** * add_preferred_console - add a device to the list of preferred consoles. * @name: device name * @idx: device index * @options: options for this console * * The last preferred console added will be used for kernel messages * and stdin/out/err for init. Normally this is used by console_setup * above to handle user-supplied console arguments; however it can also * be used by arch-specific code either to override the user or more * commonly to provide a default console (ie from PROM variables) when * the user has not supplied one. */ int add_preferred_console(const char *name, const short idx, char *options) { return __add_preferred_console(name, idx, NULL, options, NULL, false); } /** * match_devname_and_update_preferred_console - Update a preferred console * when matching devname is found. * @devname: DEVNAME:0.0 style device name * @name: Name of the corresponding console driver, e.g. "ttyS" * @idx: Console index, e.g. port number. * * The function checks whether a device with the given @devname is * preferred via the console=DEVNAME:0.0 command line option. 
* It fills the missing console driver name and console index * so that a later register_console() call could find (match) * and enable this device. * * It might be used when a driver subsystem initializes particular * devices with already known DEVNAME:0.0 style names. And it * could predict which console driver name and index this device * would later get associated with. * * Return: 0 on success, negative error code on failure. */ int match_devname_and_update_preferred_console(const char *devname, const char *name, const short idx) { struct console_cmdline *c = console_cmdline; int i; if (!devname || !strlen(devname) || !name || !strlen(name) || idx < 0) return -EINVAL; for (i = 0; i < MAX_CMDLINECONSOLES && (c->name[0] || c->devname[0]); i++, c++) { if (!strcmp(devname, c->devname)) { pr_info("associate the preferred console \"%s\" with \"%s%d\"\n", devname, name, idx); strscpy(c->name, name); c->index = idx; return 0; } } return -ENOENT; } EXPORT_SYMBOL_GPL(match_devname_and_update_preferred_console); bool console_suspend_enabled = true; EXPORT_SYMBOL(console_suspend_enabled); static int __init console_suspend_disable(char *str) { console_suspend_enabled = false; return 1; } __setup("no_console_suspend", console_suspend_disable); module_param_named(console_suspend, console_suspend_enabled, bool, S_IRUGO | S_IWUSR); MODULE_PARM_DESC(console_suspend, "suspend console during suspend" " and hibernate operations"); static bool printk_console_no_auto_verbose; void console_verbose(void) { if (console_loglevel && !printk_console_no_auto_verbose) console_loglevel = CONSOLE_LOGLEVEL_MOTORMOUTH; } EXPORT_SYMBOL_GPL(console_verbose); module_param_named(console_no_auto_verbose, printk_console_no_auto_verbose, bool, 0644); MODULE_PARM_DESC(console_no_auto_verbose, "Disable console loglevel raise to highest on oops/panic/etc"); /** * suspend_console - suspend the console subsystem * * This disables printk() while we go into suspend states */ void suspend_console(void) { struct console *con; if (!console_suspend_enabled) return; pr_info("Suspending console(s) (use no_console_suspend to debug)\n"); pr_flush(1000, true); console_list_lock(); for_each_console(con) console_srcu_write_flags(con, con->flags | CON_SUSPENDED); console_list_unlock(); /* * Ensure that all SRCU list walks have completed. All printing * contexts must be able to see that they are suspended so that it * is guaranteed that all printing has stopped when this function * completes. */ synchronize_srcu(&console_srcu); } void resume_console(void) { struct console_flush_type ft; struct console *con; if (!console_suspend_enabled) return; console_list_lock(); for_each_console(con) console_srcu_write_flags(con, con->flags & ~CON_SUSPENDED); console_list_unlock(); /* * Ensure that all SRCU list walks have completed. All printing * contexts must be able to see they are no longer suspended so * that they are guaranteed to wake up and resume printing. */ synchronize_srcu(&console_srcu); printk_get_console_flush_type(&ft); if (ft.nbcon_offload) nbcon_kthreads_wake(); if (ft.legacy_offload) defer_console_output(); pr_flush(1000, true); } /** * console_cpu_notify - print deferred console messages after CPU hotplug * @cpu: unused * * If printk() is called from a CPU that is not online yet, the messages * will be printed on the console only if there are CON_ANYTIME consoles. * This function is called when a new CPU comes online (or fails to come * up) or goes offline. 
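 *
 * For reference, the hotplug wiring is done in printk_late_init() further
 * below:
 *
 *	cpuhp_setup_state_nocalls(CPUHP_PRINTK_DEAD, "printk:dead", NULL,
 *				  console_cpu_notify);
 *	cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "printk:online",
 *				  console_cpu_notify, NULL);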
*/ static int console_cpu_notify(unsigned int cpu) { struct console_flush_type ft; if (!cpuhp_tasks_frozen) { printk_get_console_flush_type(&ft); if (ft.nbcon_atomic) nbcon_atomic_flush_pending(); if (ft.legacy_direct) { if (console_trylock()) console_unlock(); } } return 0; } /** * console_lock - block the console subsystem from printing * * Acquires a lock which guarantees that no consoles will * be in or enter their write() callback. * * Can sleep, returns nothing. */ void console_lock(void) { might_sleep(); /* On panic, the console_lock must be left to the panic cpu. */ while (other_cpu_in_panic()) msleep(1000); down_console_sem(); console_locked = 1; console_may_schedule = 1; } EXPORT_SYMBOL(console_lock); /** * console_trylock - try to block the console subsystem from printing * * Try to acquire a lock which guarantees that no consoles will * be in or enter their write() callback. * * returns 1 on success, and 0 on failure to acquire the lock. */ int console_trylock(void) { /* On panic, the console_lock must be left to the panic cpu. */ if (other_cpu_in_panic()) return 0; if (down_trylock_console_sem()) return 0; console_locked = 1; console_may_schedule = 0; return 1; } EXPORT_SYMBOL(console_trylock); int is_console_locked(void) { return console_locked; } EXPORT_SYMBOL(is_console_locked); static void __console_unlock(void) { console_locked = 0; up_console_sem(); } #ifdef CONFIG_PRINTK /* * Prepend the message in @pmsg->pbufs->outbuf. This is achieved by shifting * the existing message over and inserting the scratchbuf message. * * @pmsg is the original printk message. * @fmt is the printf format of the message which will prepend the existing one. * * If there is not enough space in @pmsg->pbufs->outbuf, the existing * message text will be sufficiently truncated. * * If @pmsg->pbufs->outbuf is modified, @pmsg->outbuf_len is updated. */ __printf(2, 3) static void console_prepend_message(struct printk_message *pmsg, const char *fmt, ...) { struct printk_buffers *pbufs = pmsg->pbufs; const size_t scratchbuf_sz = sizeof(pbufs->scratchbuf); const size_t outbuf_sz = sizeof(pbufs->outbuf); char *scratchbuf = &pbufs->scratchbuf[0]; char *outbuf = &pbufs->outbuf[0]; va_list args; size_t len; va_start(args, fmt); len = vscnprintf(scratchbuf, scratchbuf_sz, fmt, args); va_end(args); /* * Make sure outbuf is sufficiently large before prepending. * Keep at least the prefix when the message must be truncated. * It is a rather theoretical problem when someone tries to * use a minimalist buffer. */ if (WARN_ON_ONCE(len + PRINTK_PREFIX_MAX >= outbuf_sz)) return; if (pmsg->outbuf_len + len >= outbuf_sz) { /* Truncate the message, but keep it terminated. */ pmsg->outbuf_len = outbuf_sz - (len + 1); outbuf[pmsg->outbuf_len] = 0; } memmove(outbuf + len, outbuf, pmsg->outbuf_len + 1); memcpy(outbuf, scratchbuf, len); pmsg->outbuf_len += len; } /* * Prepend the message in @pmsg->pbufs->outbuf with a "dropped message". * @pmsg->outbuf_len is updated appropriately. * * @pmsg is the printk message to prepend. * * @dropped is the dropped count to report in the dropped message. */ void console_prepend_dropped(struct printk_message *pmsg, unsigned long dropped) { console_prepend_message(pmsg, "** %lu printk messages dropped **\n", dropped); } /* * Prepend the message in @pmsg->pbufs->outbuf with a "replay message". * @pmsg->outbuf_len is updated appropriately. * * @pmsg is the printk message to prepend. 
*/ void console_prepend_replay(struct printk_message *pmsg) { console_prepend_message(pmsg, "** replaying previous printk message **\n"); } /* * Read and format the specified record (or a later record if the specified * record is not available). * * @pmsg will contain the formatted result. @pmsg->pbufs must point to a * struct printk_buffers. * * @seq is the record to read and format. If it is not available, the next * valid record is read. * * @is_extended specifies if the message should be formatted for extended * console output. * * @may_suppress specifies if records may be skipped based on loglevel. * * Returns false if no record is available. Otherwise true and all fields * of @pmsg are valid. (See the documentation of struct printk_message * for information about the @pmsg fields.) */ bool printk_get_next_message(struct printk_message *pmsg, u64 seq, bool is_extended, bool may_suppress) { struct printk_buffers *pbufs = pmsg->pbufs; const size_t scratchbuf_sz = sizeof(pbufs->scratchbuf); const size_t outbuf_sz = sizeof(pbufs->outbuf); char *scratchbuf = &pbufs->scratchbuf[0]; char *outbuf = &pbufs->outbuf[0]; struct printk_info info; struct printk_record r; size_t len = 0; bool force_con; /* * Formatting extended messages requires a separate buffer, so use the * scratch buffer to read in the ringbuffer text. * * Formatting normal messages is done in-place, so read the ringbuffer * text directly into the output buffer. */ if (is_extended) prb_rec_init_rd(&r, &info, scratchbuf, scratchbuf_sz); else prb_rec_init_rd(&r, &info, outbuf, outbuf_sz); if (!prb_read_valid(prb, seq, &r)) return false; pmsg->seq = r.info->seq; pmsg->dropped = r.info->seq - seq; force_con = r.info->flags & LOG_FORCE_CON; /* * Skip records that are not forced to be printed on consoles and that * have a level above the console loglevel. */ if (!force_con && may_suppress && suppress_message_printing(r.info->level)) goto out; if (is_extended) { len = info_print_ext_header(outbuf, outbuf_sz, r.info); len += msg_print_ext_body(outbuf + len, outbuf_sz - len, &r.text_buf[0], r.info->text_len, &r.info->dev_info); } else { len = record_print_text(&r, console_msg_format & MSG_FORMAT_SYSLOG, printk_time); } out: pmsg->outbuf_len = len; return true; } /* * Legacy console printing from printk() caller context does not respect * raw_spinlock/spinlock nesting. For !PREEMPT_RT the lockdep warning is a * false positive. For PREEMPT_RT the false positive condition does not * occur. * * This map is used to temporarily establish LD_WAIT_SLEEP context for the * console write() callback during legacy printing to avoid false positive * lockdep complaints, thus allowing lockdep to continue to function for * real issues. */ #ifdef CONFIG_PREEMPT_RT static inline void printk_legacy_allow_spinlock_enter(void) { } static inline void printk_legacy_allow_spinlock_exit(void) { } #else static DEFINE_WAIT_OVERRIDE_MAP(printk_legacy_map, LD_WAIT_SLEEP); static inline void printk_legacy_allow_spinlock_enter(void) { lock_map_acquire_try(&printk_legacy_map); } static inline void printk_legacy_allow_spinlock_exit(void) { lock_map_release(&printk_legacy_map); } #endif /* CONFIG_PREEMPT_RT */ /* * Used as the printk buffers for non-panic, serialized console printing. * This is for legacy (!CON_NBCON) as well as all boot (CON_BOOT) consoles. * Its usage requires the console_lock held. */ struct printk_buffers printk_shared_pbufs; /* * Print one record for the given console.
The record printed is whatever * record is the next available record for the given console. * * @handover will be set to true if a printk waiter has taken over the * console_lock, in which case the caller is no longer holding both the * console_lock and the SRCU read lock. Otherwise it is set to false. * * @cookie is the cookie from the SRCU read lock. * * Returns false if the given console has no next record to print, otherwise * true. * * Requires the console_lock and the SRCU read lock. */ static bool console_emit_next_record(struct console *con, bool *handover, int cookie) { bool is_extended = console_srcu_read_flags(con) & CON_EXTENDED; char *outbuf = &printk_shared_pbufs.outbuf[0]; struct printk_message pmsg = { .pbufs = &printk_shared_pbufs, }; unsigned long flags; *handover = false; if (!printk_get_next_message(&pmsg, con->seq, is_extended, true)) return false; con->dropped += pmsg.dropped; /* Skip messages of formatted length 0. */ if (pmsg.outbuf_len == 0) { con->seq = pmsg.seq + 1; goto skip; } if (con->dropped && !is_extended) { console_prepend_dropped(&pmsg, con->dropped); con->dropped = 0; } /* Write everything out to the hardware. */ if (force_legacy_kthread() && !panic_in_progress()) { /* * With forced threading this function is in a task context * (either legacy kthread or get_init_console_seq()). There * is no need for concern about printk reentrance, handovers, * or lockdep complaints. */ con->write(con, outbuf, pmsg.outbuf_len); con->seq = pmsg.seq + 1; } else { /* * While actively printing out messages, if another printk() * were to occur on another CPU, it may wait for this one to * finish. This task can not be preempted if there is a * waiter waiting to take over. * * Interrupts are disabled because the hand over to a waiter * must not be interrupted until the hand over is completed * (@console_waiter is cleared). */ printk_safe_enter_irqsave(flags); console_lock_spinning_enable(); /* Do not trace print latency. */ stop_critical_timings(); printk_legacy_allow_spinlock_enter(); con->write(con, outbuf, pmsg.outbuf_len); printk_legacy_allow_spinlock_exit(); start_critical_timings(); con->seq = pmsg.seq + 1; *handover = console_lock_spinning_disable_and_check(cookie); printk_safe_exit_irqrestore(flags); } skip: return true; } #else static bool console_emit_next_record(struct console *con, bool *handover, int cookie) { *handover = false; return false; } static inline void printk_kthreads_check_locked(void) { } #endif /* CONFIG_PRINTK */ /* * Print out all remaining records to all consoles. * * @do_cond_resched is set by the caller. It can be true only in schedulable * context. * * @next_seq is set to the sequence number after the last available record. * The value is valid only when this function returns true. It means that all * usable consoles are completely flushed. * * @handover will be set to true if a printk waiter has taken over the * console_lock, in which case the caller is no longer holding the * console_lock. Otherwise it is set to false. * * Returns true when there was at least one usable console and all messages * were flushed to all usable consoles. A returned false informs the caller * that everything was not flushed (either there were no usable consoles or * another context has taken over printing or it is a panic situation and this * is not the panic CPU). Regardless the reason, the caller should assume it * is not useful to immediately try again. * * Requires the console_lock. 
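 *
 * A sketch of the expected call pattern, with the console_lock already
 * held (this is what __console_flush_and_unlock() below does):
 *
 *	console_may_schedule = 0;
 *	flushed = console_flush_all(do_cond_resched, &next_seq, &handover);
 *	if (!handover)
 *		__console_unlock();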
*/ static bool console_flush_all(bool do_cond_resched, u64 *next_seq, bool *handover) { struct console_flush_type ft; bool any_usable = false; struct console *con; bool any_progress; int cookie; *next_seq = 0; *handover = false; do { any_progress = false; printk_get_console_flush_type(&ft); cookie = console_srcu_read_lock(); for_each_console_srcu(con) { short flags = console_srcu_read_flags(con); u64 printk_seq; bool progress; /* * console_flush_all() is only responsible for nbcon * consoles when the nbcon consoles cannot print via * their atomic or threaded flushing. */ if ((flags & CON_NBCON) && (ft.nbcon_atomic || ft.nbcon_offload)) continue; if (!console_is_usable(con, flags, !do_cond_resched)) continue; any_usable = true; if (flags & CON_NBCON) { progress = nbcon_legacy_emit_next_record(con, handover, cookie, !do_cond_resched); printk_seq = nbcon_seq_read(con); } else { progress = console_emit_next_record(con, handover, cookie); printk_seq = con->seq; } /* * If a handover has occurred, the SRCU read lock * is already released. */ if (*handover) return false; /* Track the next of the highest seq flushed. */ if (printk_seq > *next_seq) *next_seq = printk_seq; if (!progress) continue; any_progress = true; /* Allow panic_cpu to take over the consoles safely. */ if (other_cpu_in_panic()) goto abandon; if (do_cond_resched) cond_resched(); } console_srcu_read_unlock(cookie); } while (any_progress); return any_usable; abandon: console_srcu_read_unlock(cookie); return false; } static void __console_flush_and_unlock(void) { bool do_cond_resched; bool handover; bool flushed; u64 next_seq; /* * Console drivers are called with interrupts disabled, so * @console_may_schedule should be cleared before; however, we may * end up dumping a lot of lines, for example, if called from * console registration path, and should invoke cond_resched() * between lines if allowable. Not doing so can cause a very long * scheduling stall on a slow console leading to RCU stall and * softlockup warnings which exacerbate the issue with more * messages practically incapacitating the system. Therefore, create * a local to use for the printing loop. */ do_cond_resched = console_may_schedule; do { console_may_schedule = 0; flushed = console_flush_all(do_cond_resched, &next_seq, &handover); if (!handover) __console_unlock(); /* * Abort if there was a failure to flush all messages to all * usable consoles. Either it is not possible to flush (in * which case it would be an infinite loop of retrying) or * another context has taken over printing. */ if (!flushed) break; /* * Some context may have added new records after * console_flush_all() but before unlocking the console. * Re-check if there is a new record to flush. If the trylock * fails, another context is already handling the printing. */ } while (prb_read_valid(prb, next_seq, NULL) && console_trylock()); } /** * console_unlock - unblock the legacy console subsystem from printing * * Releases the console_lock which the caller holds to block printing of * the legacy console subsystem. * * While the console_lock was held, console output may have been buffered * by printk(). If this is the case, console_unlock() emits the output on * legacy consoles prior to releasing the lock. * * console_unlock(); may be called from any context. 
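 *
 * A typical usage sketch for code that must not race with legacy console
 * output (illustrative only):
 *
 *	console_lock();
 *	... inspect or modify console state ...
 *	console_unlock();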
*/ void console_unlock(void) { struct console_flush_type ft; printk_get_console_flush_type(&ft); if (ft.legacy_direct) __console_flush_and_unlock(); else __console_unlock(); } EXPORT_SYMBOL(console_unlock); /** * console_conditional_schedule - yield the CPU if required * * If the console code is currently allowed to sleep, and * if this CPU should yield the CPU to another task, do * so here. * * Must be called within console_lock();. */ void __sched console_conditional_schedule(void) { if (console_may_schedule) cond_resched(); } EXPORT_SYMBOL(console_conditional_schedule); void console_unblank(void) { bool found_unblank = false; struct console *c; int cookie; /* * First check if there are any consoles implementing the unblank() * callback. If not, there is no reason to continue and take the * console lock, which in particular can be dangerous if * @oops_in_progress is set. */ cookie = console_srcu_read_lock(); for_each_console_srcu(c) { if ((console_srcu_read_flags(c) & CON_ENABLED) && c->unblank) { found_unblank = true; break; } } console_srcu_read_unlock(cookie); if (!found_unblank) return; /* * Stop console printing because the unblank() callback may * assume the console is not within its write() callback. * * If @oops_in_progress is set, this may be an atomic context. * In that case, attempt a trylock as best-effort. */ if (oops_in_progress) { /* Semaphores are not NMI-safe. */ if (in_nmi()) return; /* * Attempting to trylock the console lock can deadlock * if another CPU was stopped while modifying the * semaphore. "Hope and pray" that this is not the * current situation. */ if (down_trylock_console_sem() != 0) return; } else console_lock(); console_locked = 1; console_may_schedule = 0; cookie = console_srcu_read_lock(); for_each_console_srcu(c) { if ((console_srcu_read_flags(c) & CON_ENABLED) && c->unblank) c->unblank(); } console_srcu_read_unlock(cookie); console_unlock(); if (!oops_in_progress) pr_flush(1000, true); } /* * Rewind all consoles to the oldest available record. * * IMPORTANT: The function is safe only when called under * console_lock(). It is not enforced because * it is used as a best effort in panic(). */ static void __console_rewind_all(void) { struct console *c; short flags; int cookie; u64 seq; seq = prb_first_valid_seq(prb); cookie = console_srcu_read_lock(); for_each_console_srcu(c) { flags = console_srcu_read_flags(c); if (flags & CON_NBCON) { nbcon_seq_force(c, seq); } else { /* * This assignment is safe only when called under * console_lock(). On panic, legacy consoles are * only best effort. */ c->seq = seq; } } console_srcu_read_unlock(cookie); } /** * console_flush_on_panic - flush console content on panic * @mode: flush all messages in buffer or just the pending ones * * Immediately output all pending messages no matter what. */ void console_flush_on_panic(enum con_flush_mode mode) { struct console_flush_type ft; bool handover; u64 next_seq; /* * Ignore the console lock and flush out the messages. Attempting a * trylock would not be useful because: * * - if it is contended, it must be ignored anyway * - console_lock() and console_trylock() block and fail * respectively in panic for non-panic CPUs * - semaphores are not NMI-safe */ /* * If another context is holding the console lock, * @console_may_schedule might be set. Clear it so that * this context does not call cond_resched() while flushing. 
*/ console_may_schedule = 0; if (mode == CONSOLE_REPLAY_ALL) __console_rewind_all(); printk_get_console_flush_type(&ft); if (ft.nbcon_atomic) nbcon_atomic_flush_pending(); /* Flush legacy consoles once allowed, even when dangerous. */ if (legacy_allow_panic_sync) console_flush_all(false, &next_seq, &handover); } /* * Return the console tty driver structure and its associated index */ struct tty_driver *console_device(int *index) { struct console *c; struct tty_driver *driver = NULL; int cookie; /* * Take console_lock to serialize device() callback with * other console operations. For example, fg_console is * modified under console_lock when switching vt. */ console_lock(); cookie = console_srcu_read_lock(); for_each_console_srcu(c) { if (!c->device) continue; driver = c->device(c, index); if (driver) break; } console_srcu_read_unlock(cookie); console_unlock(); return driver; } /* * Prevent further output on the passed console device so that (for example) * serial drivers can disable console output before suspending a port, and can * re-enable output afterwards. */ void console_stop(struct console *console) { __pr_flush(console, 1000, true); console_list_lock(); console_srcu_write_flags(console, console->flags & ~CON_ENABLED); console_list_unlock(); /* * Ensure that all SRCU list walks have completed. All contexts must * be able to see that this console is disabled so that (for example) * the caller can suspend the port without risk of another context * using the port. */ synchronize_srcu(&console_srcu); } EXPORT_SYMBOL(console_stop); void console_start(struct console *console) { struct console_flush_type ft; bool is_nbcon; console_list_lock(); console_srcu_write_flags(console, console->flags | CON_ENABLED); is_nbcon = console->flags & CON_NBCON; console_list_unlock(); /* * Ensure that all SRCU list walks have completed. The related * printing context must be able to see it is enabled so that * it is guaranteed to wake up and resume printing. */ synchronize_srcu(&console_srcu); printk_get_console_flush_type(&ft); if (is_nbcon && ft.nbcon_offload) nbcon_kthread_wake(console); else if (ft.legacy_offload) defer_console_output(); __pr_flush(console, 1000, true); } EXPORT_SYMBOL(console_start); #ifdef CONFIG_PRINTK static int unregister_console_locked(struct console *console); /* True when system boot is far enough to create printer threads. */ static bool printk_kthreads_ready __ro_after_init; static struct task_struct *printk_legacy_kthread; static bool legacy_kthread_should_wakeup(void) { struct console_flush_type ft; struct console *con; bool ret = false; int cookie; if (kthread_should_stop()) return true; printk_get_console_flush_type(&ft); cookie = console_srcu_read_lock(); for_each_console_srcu(con) { short flags = console_srcu_read_flags(con); u64 printk_seq; /* * The legacy printer thread is only responsible for nbcon * consoles when the nbcon consoles cannot print via their * atomic or threaded flushing. */ if ((flags & CON_NBCON) && (ft.nbcon_atomic || ft.nbcon_offload)) continue; if (!console_is_usable(con, flags, false)) continue; if (flags & CON_NBCON) { printk_seq = nbcon_seq_read(con); } else { /* * It is safe to read @seq because only this * thread context updates @seq. 
*/ printk_seq = con->seq; } if (prb_read_valid(prb, printk_seq, NULL)) { ret = true; break; } } console_srcu_read_unlock(cookie); return ret; } static int legacy_kthread_func(void *unused) { for (;;) { wait_event_interruptible(legacy_wait, legacy_kthread_should_wakeup()); if (kthread_should_stop()) break; console_lock(); __console_flush_and_unlock(); } return 0; } static bool legacy_kthread_create(void) { struct task_struct *kt; lockdep_assert_console_list_lock_held(); kt = kthread_run(legacy_kthread_func, NULL, "pr/legacy"); if (WARN_ON(IS_ERR(kt))) { pr_err("failed to start legacy printing thread\n"); return false; } printk_legacy_kthread = kt; /* * It is important that console printing threads are scheduled * shortly after a printk call and with generous runtime budgets. */ sched_set_normal(printk_legacy_kthread, -20); return true; } /** * printk_kthreads_shutdown - shutdown all threaded printers * * On system shutdown all threaded printers are stopped. This allows printk * to transition back to atomic printing, thus providing a robust mechanism * for the final shutdown/reboot messages to be output. */ static void printk_kthreads_shutdown(void) { struct console *con; console_list_lock(); if (printk_kthreads_running) { printk_kthreads_running = false; for_each_console(con) { if (con->flags & CON_NBCON) nbcon_kthread_stop(con); } /* * The threads may have been stopped while printing a * backlog. Flush any records left over. */ nbcon_atomic_flush_pending(); } console_list_unlock(); } static struct syscore_ops printk_syscore_ops = { .shutdown = printk_kthreads_shutdown, }; /* * If appropriate, start nbcon kthreads and set @printk_kthreads_running. * If any kthreads fail to start, those consoles are unregistered. * * Must be called under console_list_lock(). */ static void printk_kthreads_check_locked(void) { struct hlist_node *tmp; struct console *con; lockdep_assert_console_list_lock_held(); if (!printk_kthreads_ready) return; if (have_legacy_console || have_boot_console) { if (!printk_legacy_kthread && force_legacy_kthread() && !legacy_kthread_create()) { /* * All legacy consoles must be unregistered. If there * are any nbcon consoles, they will set up their own * kthread. */ hlist_for_each_entry_safe(con, tmp, &console_list, node) { if (con->flags & CON_NBCON) continue; unregister_console_locked(con); } } } else if (printk_legacy_kthread) { kthread_stop(printk_legacy_kthread); printk_legacy_kthread = NULL; } /* * Printer threads cannot be started as long as any boot console is * registered because there is no way to synchronize the hardware * registers between boot console code and regular console code. * It can only be known that there will be no new boot consoles when * an nbcon console is registered. */ if (have_boot_console || !have_nbcon_console) { /* Clear flag in case all nbcon consoles unregistered. 
*/ printk_kthreads_running = false; return; } if (printk_kthreads_running) return; hlist_for_each_entry_safe(con, tmp, &console_list, node) { if (!(con->flags & CON_NBCON)) continue; if (!nbcon_kthread_create(con)) unregister_console_locked(con); } printk_kthreads_running = true; } static int __init printk_set_kthreads_ready(void) { register_syscore_ops(&printk_syscore_ops); console_list_lock(); printk_kthreads_ready = true; printk_kthreads_check_locked(); console_list_unlock(); return 0; } early_initcall(printk_set_kthreads_ready); #endif /* CONFIG_PRINTK */ static int __read_mostly keep_bootcon; static int __init keep_bootcon_setup(char *str) { keep_bootcon = 1; pr_info("debug: skip boot console de-registration.\n"); return 0; } early_param("keep_bootcon", keep_bootcon_setup); static int console_call_setup(struct console *newcon, char *options) { int err; if (!newcon->setup) return 0; /* Synchronize with possible boot console. */ console_lock(); err = newcon->setup(newcon, options); console_unlock(); return err; } /* * This is called by register_console() to try to match * the newly registered console with any of the ones selected * by either the command line or add_preferred_console() and * setup/enable it. * * Care need to be taken with consoles that are statically * enabled such as netconsole */ static int try_enable_preferred_console(struct console *newcon, bool user_specified) { struct console_cmdline *c; int i, err; for (i = 0, c = console_cmdline; i < MAX_CMDLINECONSOLES && (c->name[0] || c->devname[0]); i++, c++) { /* Console not yet initialized? */ if (!c->name[0]) continue; if (c->user_specified != user_specified) continue; if (!newcon->match || newcon->match(newcon, c->name, c->index, c->options) != 0) { /* default matching */ BUILD_BUG_ON(sizeof(c->name) != sizeof(newcon->name)); if (strcmp(c->name, newcon->name) != 0) continue; if (newcon->index >= 0 && newcon->index != c->index) continue; if (newcon->index < 0) newcon->index = c->index; if (_braille_register_console(newcon, c)) return 0; err = console_call_setup(newcon, c->options); if (err) return err; } newcon->flags |= CON_ENABLED; if (i == preferred_console) newcon->flags |= CON_CONSDEV; return 0; } /* * Some consoles, such as pstore and netconsole, can be enabled even * without matching. Accept the pre-enabled consoles only when match() * and setup() had a chance to be called. */ if (newcon->flags & CON_ENABLED && c->user_specified == user_specified) return 0; return -ENOENT; } /* Try to enable the console unconditionally */ static void try_enable_default_console(struct console *newcon) { if (newcon->index < 0) newcon->index = 0; if (console_call_setup(newcon, NULL) != 0) return; newcon->flags |= CON_ENABLED; if (newcon->device) newcon->flags |= CON_CONSDEV; } /* Return the starting sequence number for a newly registered console. */ static u64 get_init_console_seq(struct console *newcon, bool bootcon_registered) { struct console *con; bool handover; u64 init_seq; if (newcon->flags & (CON_PRINTBUFFER | CON_BOOT)) { /* Get a consistent copy of @syslog_seq. */ mutex_lock(&syslog_lock); init_seq = syslog_seq; mutex_unlock(&syslog_lock); } else { /* Begin with next message added to ringbuffer. */ init_seq = prb_next_seq(prb); /* * If any enabled boot consoles are due to be unregistered * shortly, some may not be caught up and may be the same * device as @newcon. 
Since it is not known which boot console * is the same device, flush all consoles and, if necessary, * start with the message of the enabled boot console that is * the furthest behind. */ if (bootcon_registered && !keep_bootcon) { /* * Hold the console_lock to stop console printing and * guarantee safe access to console->seq. */ console_lock(); /* * Flush all consoles and set the console to start at * the next unprinted sequence number. */ if (!console_flush_all(true, &init_seq, &handover)) { /* * Flushing failed. Just choose the lowest * sequence of the enabled boot consoles. */ /* * If there was a handover, this context no * longer holds the console_lock. */ if (handover) console_lock(); init_seq = prb_next_seq(prb); for_each_console(con) { u64 seq; if (!(con->flags & CON_BOOT) || !(con->flags & CON_ENABLED)) { continue; } if (con->flags & CON_NBCON) seq = nbcon_seq_read(con); else seq = con->seq; if (seq < init_seq) init_seq = seq; } } console_unlock(); } } return init_seq; } #define console_first() \ hlist_entry(console_list.first, struct console, node) static int unregister_console_locked(struct console *console); /* * The console driver calls this routine during kernel initialization * to register the console printing procedure with printk() and to * print any messages that were printed by the kernel before the * console driver was initialized. * * This can happen pretty early during the boot process (because of * early_printk) - sometimes before setup_arch() completes - be careful * of what kernel features are used - they may not be initialised yet. * * There are two types of consoles - bootconsoles (early_printk) and * "real" consoles (everything which is not a bootconsole) which are * handled differently. * - Any number of bootconsoles can be registered at any time. * - As soon as a "real" console is registered, all bootconsoles * will be unregistered automatically. * - Once a "real" console is registered, any attempt to register a * bootconsoles will be rejected */ void register_console(struct console *newcon) { bool use_device_lock = (newcon->flags & CON_NBCON) && newcon->write_atomic; bool bootcon_registered = false; bool realcon_registered = false; struct console *con; unsigned long flags; u64 init_seq; int err; console_list_lock(); for_each_console(con) { if (WARN(con == newcon, "console '%s%d' already registered\n", con->name, con->index)) { goto unlock; } if (con->flags & CON_BOOT) bootcon_registered = true; else realcon_registered = true; } /* Do not register boot consoles when there already is a real one. */ if ((newcon->flags & CON_BOOT) && realcon_registered) { pr_info("Too late to register bootconsole %s%d\n", newcon->name, newcon->index); goto unlock; } if (newcon->flags & CON_NBCON) { /* * Ensure the nbcon console buffers can be allocated * before modifying any global data. */ if (!nbcon_alloc(newcon)) goto unlock; } /* * See if we want to enable this console driver by default. * * Nope when a console is preferred by the command line, device * tree, or SPCR. * * The first real console with tty binding (driver) wins. More * consoles might get enabled before the right one is found. * * Note that a console with tty binding will have CON_CONSDEV * flag set and will be first in the list. 
*/ if (preferred_console < 0) { if (hlist_empty(&console_list) || !console_first()->device || console_first()->flags & CON_BOOT) { try_enable_default_console(newcon); } } /* See if this console matches one we selected on the command line */ err = try_enable_preferred_console(newcon, true); /* If not, try to match against the platform default(s) */ if (err == -ENOENT) err = try_enable_preferred_console(newcon, false); /* printk() messages are not printed to the Braille console. */ if (err || newcon->flags & CON_BRL) { if (newcon->flags & CON_NBCON) nbcon_free(newcon); goto unlock; } /* * If we have a bootconsole, and are switching to a real console, * don't print everything out again, since when the boot console, and * the real console are the same physical device, it's annoying to * see the beginning boot messages twice */ if (bootcon_registered && ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV)) { newcon->flags &= ~CON_PRINTBUFFER; } newcon->dropped = 0; init_seq = get_init_console_seq(newcon, bootcon_registered); if (newcon->flags & CON_NBCON) { have_nbcon_console = true; nbcon_seq_force(newcon, init_seq); } else { have_legacy_console = true; newcon->seq = init_seq; } if (newcon->flags & CON_BOOT) have_boot_console = true; /* * If another context is actively using the hardware of this new * console, it will not be aware of the nbcon synchronization. This * is a risk that two contexts could access the hardware * simultaneously if this new console is used for atomic printing * and the other context is still using the hardware. * * Use the driver synchronization to ensure that the hardware is not * in use while this new console transitions to being registered. */ if (use_device_lock) newcon->device_lock(newcon, &flags); /* * Put this console in the list - keep the * preferred driver at the head of the list. */ if (hlist_empty(&console_list)) { /* Ensure CON_CONSDEV is always set for the head. */ newcon->flags |= CON_CONSDEV; hlist_add_head_rcu(&newcon->node, &console_list); } else if (newcon->flags & CON_CONSDEV) { /* Only the new head can have CON_CONSDEV set. */ console_srcu_write_flags(console_first(), console_first()->flags & ~CON_CONSDEV); hlist_add_head_rcu(&newcon->node, &console_list); } else { hlist_add_behind_rcu(&newcon->node, console_list.first); } /* * No need to synchronize SRCU here! The caller does not rely * on all contexts being able to see the new console before * register_console() completes. */ /* This new console is now registered. */ if (use_device_lock) newcon->device_unlock(newcon, flags); console_sysfs_notify(); /* * By unregistering the bootconsoles after we enable the real console * we get the "console xxx enabled" message on all the consoles - * boot consoles, real consoles, etc - this is to ensure that end * users know there might be something in the kernel's log buffer that * went to the bootconsole (that they do not see on the real console) */ con_printk(KERN_INFO, newcon, "enabled\n"); if (bootcon_registered && ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV) && !keep_bootcon) { struct hlist_node *tmp; hlist_for_each_entry_safe(con, tmp, &console_list, node) { if (con->flags & CON_BOOT) unregister_console_locked(con); } } /* Changed console list, may require printer threads to start/stop. */ printk_kthreads_check_locked(); unlock: console_list_unlock(); } EXPORT_SYMBOL(register_console); /* Must be called under console_list_lock(). 
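 *
 * For example, unregister_console() below wraps it as:
 *
 *	console_list_lock();
 *	res = unregister_console_locked(console);
 *	console_list_unlock();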
*/ static int unregister_console_locked(struct console *console) { bool use_device_lock = (console->flags & CON_NBCON) && console->write_atomic; bool found_legacy_con = false; bool found_nbcon_con = false; bool found_boot_con = false; unsigned long flags; struct console *c; int res; lockdep_assert_console_list_lock_held(); con_printk(KERN_INFO, console, "disabled\n"); res = _braille_unregister_console(console); if (res < 0) return res; if (res > 0) return 0; if (!console_is_registered_locked(console)) res = -ENODEV; else if (console_is_usable(console, console->flags, true)) __pr_flush(console, 1000, true); /* Disable it unconditionally */ console_srcu_write_flags(console, console->flags & ~CON_ENABLED); if (res < 0) return res; /* * Use the driver synchronization to ensure that the hardware is not * in use while this console transitions to being unregistered. */ if (use_device_lock) console->device_lock(console, &flags); hlist_del_init_rcu(&console->node); if (use_device_lock) console->device_unlock(console, flags); /* * <HISTORICAL> * If this isn't the last console and it has CON_CONSDEV set, we * need to set it on the next preferred console. * </HISTORICAL> * * The above makes no sense as there is no guarantee that the next * console has any device attached. Oh well.... */ if (!hlist_empty(&console_list) && console->flags & CON_CONSDEV) console_srcu_write_flags(console_first(), console_first()->flags | CON_CONSDEV); /* * Ensure that all SRCU list walks have completed. All contexts * must not be able to see this console in the list so that any * exit/cleanup routines can be performed safely. */ synchronize_srcu(&console_srcu); if (console->flags & CON_NBCON) nbcon_free(console); console_sysfs_notify(); if (console->exit) res = console->exit(console); /* * With this console gone, the global flags tracking registered * console types may have changed. Update them. */ for_each_console(c) { if (c->flags & CON_BOOT) found_boot_con = true; if (c->flags & CON_NBCON) found_nbcon_con = true; else found_legacy_con = true; } if (!found_boot_con) have_boot_console = found_boot_con; if (!found_legacy_con) have_legacy_console = found_legacy_con; if (!found_nbcon_con) have_nbcon_console = found_nbcon_con; /* Changed console list, may require printer threads to start/stop. */ printk_kthreads_check_locked(); return res; } int unregister_console(struct console *console) { int res; console_list_lock(); res = unregister_console_locked(console); console_list_unlock(); return res; } EXPORT_SYMBOL(unregister_console); /** * console_force_preferred_locked - force a registered console preferred * @con: The registered console to force preferred. * * Must be called under console_list_lock(). */ void console_force_preferred_locked(struct console *con) { struct console *cur_pref_con; if (!console_is_registered_locked(con)) return; cur_pref_con = console_first(); /* Already preferred? */ if (cur_pref_con == con) return; /* * Delete, but do not re-initialize the entry. This allows the console * to continue to appear registered (via any hlist_unhashed_lockless() * checks), even though it was briefly removed from the console list. */ hlist_del_rcu(&con->node); /* * Ensure that all SRCU list walks have completed so that the console * can be added to the beginning of the console list and its forward * list pointer can be re-initialized. */ synchronize_srcu(&console_srcu); con->flags |= CON_CONSDEV; WARN_ON(!con->device); /* Only the new head can have CON_CONSDEV set. 
*/ console_srcu_write_flags(cur_pref_con, cur_pref_con->flags & ~CON_CONSDEV); hlist_add_head_rcu(&con->node, &console_list); } EXPORT_SYMBOL(console_force_preferred_locked); /* * Initialize the console device. This is called *early*, so * we can't necessarily depend on lots of kernel help here. * Just do some early initializations, and do the complex setup * later. */ void __init console_init(void) { int ret; initcall_t call; initcall_entry_t *ce; /* Setup the default TTY line discipline. */ n_tty_init(); /* * set up the console device so that later boot sequences can * inform about problems etc.. */ ce = __con_initcall_start; trace_initcall_level("console"); while (ce < __con_initcall_end) { call = initcall_from_entry(ce); trace_initcall_start(call); ret = call(); trace_initcall_finish(call, ret); ce++; } } /* * Some boot consoles access data that is in the init section and which will * be discarded after the initcalls have been run. To make sure that no code * will access this data, unregister the boot consoles in a late initcall. * * If for some reason, such as deferred probe or the driver being a loadable * module, the real console hasn't registered yet at this point, there will * be a brief interval in which no messages are logged to the console, which * makes it difficult to diagnose problems that occur during this time. * * To mitigate this problem somewhat, only unregister consoles whose memory * intersects with the init section. Note that all other boot consoles will * get unregistered when the real preferred console is registered. */ static int __init printk_late_init(void) { struct hlist_node *tmp; struct console *con; int ret; console_list_lock(); hlist_for_each_entry_safe(con, tmp, &console_list, node) { if (!(con->flags & CON_BOOT)) continue; /* Check addresses that might be used for enabled consoles. */ if (init_section_intersects(con, sizeof(*con)) || init_section_contains(con->write, 0) || init_section_contains(con->read, 0) || init_section_contains(con->device, 0) || init_section_contains(con->unblank, 0) || init_section_contains(con->data, 0)) { /* * Please, consider moving the reported consoles out * of the init section. */ pr_warn("bootconsole [%s%d] uses init memory and must be disabled even before the real one is ready\n", con->name, con->index); unregister_console_locked(con); } } console_list_unlock(); ret = cpuhp_setup_state_nocalls(CPUHP_PRINTK_DEAD, "printk:dead", NULL, console_cpu_notify); WARN_ON(ret < 0); ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "printk:online", console_cpu_notify, NULL); WARN_ON(ret < 0); printk_sysctl_init(); return 0; } late_initcall(printk_late_init); #if defined CONFIG_PRINTK /* If @con is specified, only wait for that console. Otherwise wait for all. */ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progress) { unsigned long timeout_jiffies = msecs_to_jiffies(timeout_ms); unsigned long remaining_jiffies = timeout_jiffies; struct console_flush_type ft; struct console *c; u64 last_diff = 0; u64 printk_seq; short flags; int cookie; u64 diff; u64 seq; /* Sorry, pr_flush() will not work this early. */ if (system_state < SYSTEM_SCHEDULING) return false; might_sleep(); seq = prb_next_reserve_seq(prb); /* Flush the consoles so that records up to @seq are printed. 
*/ printk_get_console_flush_type(&ft); if (ft.nbcon_atomic) nbcon_atomic_flush_pending(); if (ft.legacy_direct) { console_lock(); console_unlock(); } for (;;) { unsigned long begin_jiffies; unsigned long slept_jiffies; diff = 0; /* * Hold the console_lock to guarantee safe access to * console->seq. Releasing console_lock flushes more * records in case @seq is still not printed on all * usable consoles. * * Holding the console_lock is not necessary if there * are no legacy or boot consoles. However, such a * console could register at any time. Always hold the * console_lock as a precaution rather than * synchronizing against register_console(). */ console_lock(); cookie = console_srcu_read_lock(); for_each_console_srcu(c) { if (con && con != c) continue; flags = console_srcu_read_flags(c); /* * If consoles are not usable, it cannot be expected * that they make forward progress, so only increment * @diff for usable consoles. */ if (!console_is_usable(c, flags, true) && !console_is_usable(c, flags, false)) { continue; } if (flags & CON_NBCON) { printk_seq = nbcon_seq_read(c); } else { printk_seq = c->seq; } if (printk_seq < seq) diff += seq - printk_seq; } console_srcu_read_unlock(cookie); if (diff != last_diff && reset_on_progress) remaining_jiffies = timeout_jiffies; console_unlock(); /* Note: @diff is 0 if there are no usable consoles. */ if (diff == 0 || remaining_jiffies == 0) break; /* msleep(1) might sleep much longer. Check time by jiffies. */ begin_jiffies = jiffies; msleep(1); slept_jiffies = jiffies - begin_jiffies; remaining_jiffies -= min(slept_jiffies, remaining_jiffies); last_diff = diff; } return (diff == 0); } /** * pr_flush() - Wait for printing threads to catch up. * * @timeout_ms: The maximum time (in ms) to wait. * @reset_on_progress: Reset the timeout if forward progress is seen. * * A value of 0 for @timeout_ms means no waiting will occur. A value of -1 * represents infinite waiting. * * If @reset_on_progress is true, the timeout will be reset whenever any * printer has been seen to make some forward progress. * * Context: Process context. May sleep while acquiring console lock. * Return: true if all usable printers are caught up. */ static bool pr_flush(int timeout_ms, bool reset_on_progress) { return __pr_flush(NULL, timeout_ms, reset_on_progress); } /* * Delayed printk version, for scheduler-internal messages: */ #define PRINTK_PENDING_WAKEUP 0x01 #define PRINTK_PENDING_OUTPUT 0x02 static DEFINE_PER_CPU(int, printk_pending); static void wake_up_klogd_work_func(struct irq_work *irq_work) { int pending = this_cpu_xchg(printk_pending, 0); if (pending & PRINTK_PENDING_OUTPUT) { if (force_legacy_kthread()) { if (printk_legacy_kthread) wake_up_interruptible(&legacy_wait); } else { if (console_trylock()) console_unlock(); } } if (pending & PRINTK_PENDING_WAKEUP) wake_up_interruptible(&log_wait); } static DEFINE_PER_CPU(struct irq_work, wake_up_klogd_work) = IRQ_WORK_INIT_LAZY(wake_up_klogd_work_func); static void __wake_up_klogd(int val) { if (!printk_percpu_data_ready()) return; preempt_disable(); /* * Guarantee any new records can be seen by tasks preparing to wait * before this context checks if the wait queue is empty. * * The full memory barrier within wq_has_sleeper() pairs with the full * memory barrier within set_current_state() of * prepare_to_wait_event(), which is called after ___wait_event() adds * the waiter but before it has checked the wait condition. * * This pairs with devkmsg_read:A and syslog_print:A. 
*/ if (wq_has_sleeper(&log_wait) || /* LMM(__wake_up_klogd:A) */ (val & PRINTK_PENDING_OUTPUT)) { this_cpu_or(printk_pending, val); irq_work_queue(this_cpu_ptr(&wake_up_klogd_work)); } preempt_enable(); } /** * wake_up_klogd - Wake kernel logging daemon * * Use this function when new records have been added to the ringbuffer * and the console printing of those records has already occurred or is * known to be handled by some other context. This function will only * wake the logging daemon. * * Context: Any context. */ void wake_up_klogd(void) { __wake_up_klogd(PRINTK_PENDING_WAKEUP); } /** * defer_console_output - Wake kernel logging daemon and trigger * console printing in a deferred context * * Use this function when new records have been added to the ringbuffer, * this context is responsible for console printing those records, but * the current context is not allowed to perform the console printing. * Trigger an irq_work context to perform the console printing. This * function also wakes the logging daemon. * * Context: Any context. */ void defer_console_output(void) { /* * New messages may have been added directly to the ringbuffer * using vprintk_store(), so wake any waiters as well. */ __wake_up_klogd(PRINTK_PENDING_WAKEUP | PRINTK_PENDING_OUTPUT); } void printk_trigger_flush(void) { defer_console_output(); } int vprintk_deferred(const char *fmt, va_list args) { return vprintk_emit(0, LOGLEVEL_SCHED, NULL, fmt, args); } int _printk_deferred(const char *fmt, ...) { va_list args; int r; va_start(args, fmt); r = vprintk_deferred(fmt, args); va_end(args); return r; } /* * printk rate limiting, lifted from the networking subsystem. * * This enforces a rate limit: not more than 10 kernel messages * every 5s to make a denial-of-service attack impossible. */ DEFINE_RATELIMIT_STATE(printk_ratelimit_state, 5 * HZ, 10); int __printk_ratelimit(const char *func) { return ___ratelimit(&printk_ratelimit_state, func); } EXPORT_SYMBOL(__printk_ratelimit); /** * printk_timed_ratelimit - caller-controlled printk ratelimiting * @caller_jiffies: pointer to caller's state * @interval_msecs: minimum interval between prints * * printk_timed_ratelimit() returns true if more than @interval_msecs * milliseconds have elapsed since the last time printk_timed_ratelimit() * returned true. */ bool printk_timed_ratelimit(unsigned long *caller_jiffies, unsigned int interval_msecs) { unsigned long elapsed = jiffies - *caller_jiffies; if (*caller_jiffies && elapsed <= msecs_to_jiffies(interval_msecs)) return false; *caller_jiffies = jiffies; return true; } EXPORT_SYMBOL(printk_timed_ratelimit); static DEFINE_SPINLOCK(dump_list_lock); static LIST_HEAD(dump_list); /** * kmsg_dump_register - register a kernel log dumper. * @dumper: pointer to the kmsg_dumper structure * * Adds a kernel log dumper to the system. The dump callback in the * structure will be called when the kernel oopses or panics and must be * set. Returns zero on success and %-EINVAL or %-EBUSY otherwise. */ int kmsg_dump_register(struct kmsg_dumper *dumper) { unsigned long flags; int err = -EBUSY; /* The dump callback needs to be set */ if (!dumper->dump) return -EINVAL; spin_lock_irqsave(&dump_list_lock, flags); /* Don't allow registering multiple times */ if (!dumper->registered) { dumper->registered = 1; list_add_tail_rcu(&dumper->list, &dump_list); err = 0; } spin_unlock_irqrestore(&dump_list_lock, flags); return err; } EXPORT_SYMBOL_GPL(kmsg_dump_register); /** * kmsg_dump_unregister - unregister a kmsg dumper. 
* @dumper: pointer to the kmsg_dumper structure * * Removes a dump device from the system. Returns zero on success and * %-EINVAL otherwise. */ int kmsg_dump_unregister(struct kmsg_dumper *dumper) { unsigned long flags; int err = -EINVAL; spin_lock_irqsave(&dump_list_lock, flags); if (dumper->registered) { dumper->registered = 0; list_del_rcu(&dumper->list); err = 0; } spin_unlock_irqrestore(&dump_list_lock, flags); synchronize_rcu(); return err; } EXPORT_SYMBOL_GPL(kmsg_dump_unregister); static bool always_kmsg_dump; module_param_named(always_kmsg_dump, always_kmsg_dump, bool, S_IRUGO | S_IWUSR); const char *kmsg_dump_reason_str(enum kmsg_dump_reason reason) { switch (reason) { case KMSG_DUMP_PANIC: return "Panic"; case KMSG_DUMP_OOPS: return "Oops"; case KMSG_DUMP_EMERG: return "Emergency"; case KMSG_DUMP_SHUTDOWN: return "Shutdown"; default: return "Unknown"; } } EXPORT_SYMBOL_GPL(kmsg_dump_reason_str); /** * kmsg_dump_desc - dump kernel log to kernel message dumpers. * @reason: the reason (oops, panic etc) for dumping * @desc: a short string to describe what caused the panic or oops. Can be NULL * if no additional description is available. * * Call each of the registered dumper's dump() callback, which can * retrieve the kmsg records with kmsg_dump_get_line() or * kmsg_dump_get_buffer(). */ void kmsg_dump_desc(enum kmsg_dump_reason reason, const char *desc) { struct kmsg_dumper *dumper; struct kmsg_dump_detail detail = { .reason = reason, .description = desc}; rcu_read_lock(); list_for_each_entry_rcu(dumper, &dump_list, list) { enum kmsg_dump_reason max_reason = dumper->max_reason; /* * If client has not provided a specific max_reason, default * to KMSG_DUMP_OOPS, unless always_kmsg_dump was set. */ if (max_reason == KMSG_DUMP_UNDEF) { max_reason = always_kmsg_dump ? KMSG_DUMP_MAX : KMSG_DUMP_OOPS; } if (reason > max_reason) continue; /* invoke dumper which will iterate over records */ dumper->dump(dumper, &detail); } rcu_read_unlock(); } /** * kmsg_dump_get_line - retrieve one kmsg log line * @iter: kmsg dump iterator * @syslog: include the "<4>" prefixes * @line: buffer to copy the line to * @size: maximum size of the buffer * @len: length of line placed into buffer * * Start at the beginning of the kmsg buffer, with the oldest kmsg * record, and copy one record into the provided buffer. * * Consecutive calls will return the next available record moving * towards the end of the buffer with the youngest messages. * * A return value of FALSE indicates that there are no more records to * read. */ bool kmsg_dump_get_line(struct kmsg_dump_iter *iter, bool syslog, char *line, size_t size, size_t *len) { u64 min_seq = latched_seq_read_nolock(&clear_seq); struct printk_info info; unsigned int line_count; struct printk_record r; size_t l = 0; bool ret = false; if (iter->cur_seq < min_seq) iter->cur_seq = min_seq; prb_rec_init_rd(&r, &info, line, size); /* Read text or count text lines? 
*/ if (line) { if (!prb_read_valid(prb, iter->cur_seq, &r)) goto out; l = record_print_text(&r, syslog, printk_time); } else { if (!prb_read_valid_info(prb, iter->cur_seq, &info, &line_count)) { goto out; } l = get_record_print_text_size(&info, line_count, syslog, printk_time); } iter->cur_seq = r.info->seq + 1; ret = true; out: if (len) *len = l; return ret; } EXPORT_SYMBOL_GPL(kmsg_dump_get_line); /** * kmsg_dump_get_buffer - copy kmsg log lines * @iter: kmsg dump iterator * @syslog: include the "<4>" prefixes * @buf: buffer to copy the line to * @size: maximum size of the buffer * @len_out: length of line placed into buffer * * Start at the end of the kmsg buffer and fill the provided buffer * with as many of the *youngest* kmsg records that fit into it. * If the buffer is large enough, all available kmsg records will be * copied with a single call. * * Consecutive calls will fill the buffer with the next block of * available older records, not including the earlier retrieved ones. * * A return value of FALSE indicates that there are no more records to * read. */ bool kmsg_dump_get_buffer(struct kmsg_dump_iter *iter, bool syslog, char *buf, size_t size, size_t *len_out) { u64 min_seq = latched_seq_read_nolock(&clear_seq); struct printk_info info; struct printk_record r; u64 seq; u64 next_seq; size_t len = 0; bool ret = false; bool time = printk_time; if (!buf || !size) goto out; if (iter->cur_seq < min_seq) iter->cur_seq = min_seq; if (prb_read_valid_info(prb, iter->cur_seq, &info, NULL)) { if (info.seq != iter->cur_seq) { /* messages are gone, move to first available one */ iter->cur_seq = info.seq; } } /* last entry */ if (iter->cur_seq >= iter->next_seq) goto out; /* * Find first record that fits, including all following records, * into the user-provided buffer for this dump. Pass in size-1 * because this function (by way of record_print_text()) will * not write more than size-1 bytes of text into @buf. */ seq = find_first_fitting_seq(iter->cur_seq, iter->next_seq, size - 1, syslog, time); /* * Next kmsg_dump_get_buffer() invocation will dump block of * older records stored right before this one. */ next_seq = seq; prb_rec_init_rd(&r, &info, buf, size); prb_for_each_record(seq, prb, seq, &r) { if (r.info->seq >= iter->next_seq) break; len += record_print_text(&r, syslog, time); /* Adjust record to store to remaining buffer space. */ prb_rec_init_rd(&r, &info, buf + len, size - len); } iter->next_seq = next_seq; ret = true; out: if (len_out) *len_out = len; return ret; } EXPORT_SYMBOL_GPL(kmsg_dump_get_buffer); /** * kmsg_dump_rewind - reset the iterator * @iter: kmsg dump iterator * * Reset the dumper's iterator so that kmsg_dump_get_line() and * kmsg_dump_get_buffer() can be called again and used multiple * times within the same dumper.dump() callback. */ void kmsg_dump_rewind(struct kmsg_dump_iter *iter) { iter->cur_seq = latched_seq_read_nolock(&clear_seq); iter->next_seq = prb_next_seq(prb); } EXPORT_SYMBOL_GPL(kmsg_dump_rewind); /** * console_try_replay_all - try to replay kernel log on consoles * * Try to obtain lock on console subsystem and replay all * available records in printk buffer on the consoles. * Does nothing if lock is not obtained. * * Context: Any, except for NMI. 
*/ void console_try_replay_all(void) { struct console_flush_type ft; printk_get_console_flush_type(&ft); if (console_trylock()) { __console_rewind_all(); if (ft.nbcon_atomic) nbcon_atomic_flush_pending(); if (ft.nbcon_offload) nbcon_kthreads_wake(); if (ft.legacy_offload) defer_console_output(); /* Consoles are flushed as part of console_unlock(). */ console_unlock(); } } #endif #ifdef CONFIG_SMP static atomic_t printk_cpu_sync_owner = ATOMIC_INIT(-1); static atomic_t printk_cpu_sync_nested = ATOMIC_INIT(0); bool is_printk_cpu_sync_owner(void) { return (atomic_read(&printk_cpu_sync_owner) == raw_smp_processor_id()); } /** * __printk_cpu_sync_wait() - Busy wait until the printk cpu-reentrant * spinning lock is not owned by any CPU. * * Context: Any context. */ void __printk_cpu_sync_wait(void) { do { cpu_relax(); } while (atomic_read(&printk_cpu_sync_owner) != -1); } EXPORT_SYMBOL(__printk_cpu_sync_wait); /** * __printk_cpu_sync_try_get() - Try to acquire the printk cpu-reentrant * spinning lock. * * If no processor has the lock, the calling processor takes the lock and * becomes the owner. If the calling processor is already the owner of the * lock, this function succeeds immediately. * * Context: Any context. Expects interrupts to be disabled. * Return: 1 on success, otherwise 0. */ int __printk_cpu_sync_try_get(void) { int cpu; int old; cpu = smp_processor_id(); /* * Guarantee loads and stores from this CPU when it is the lock owner * are _not_ visible to the previous lock owner. This pairs with * __printk_cpu_sync_put:B. * * Memory barrier involvement: * * If __printk_cpu_sync_try_get:A reads from __printk_cpu_sync_put:B, * then __printk_cpu_sync_put:A can never read from * __printk_cpu_sync_try_get:B. * * Relies on: * * RELEASE from __printk_cpu_sync_put:A to __printk_cpu_sync_put:B * of the previous CPU * matching * ACQUIRE from __printk_cpu_sync_try_get:A to * __printk_cpu_sync_try_get:B of this CPU */ old = atomic_cmpxchg_acquire(&printk_cpu_sync_owner, -1, cpu); /* LMM(__printk_cpu_sync_try_get:A) */ if (old == -1) { /* * This CPU is now the owner and begins loading/storing * data: LMM(__printk_cpu_sync_try_get:B) */ return 1; } else if (old == cpu) { /* This CPU is already the owner. */ atomic_inc(&printk_cpu_sync_nested); return 1; } return 0; } EXPORT_SYMBOL(__printk_cpu_sync_try_get); /** * __printk_cpu_sync_put() - Release the printk cpu-reentrant spinning lock. * * The calling processor must be the owner of the lock. * * Context: Any context. Expects interrupts to be disabled. */ void __printk_cpu_sync_put(void) { if (atomic_read(&printk_cpu_sync_nested)) { atomic_dec(&printk_cpu_sync_nested); return; } /* * This CPU is finished loading/storing data: * LMM(__printk_cpu_sync_put:A) */ /* * Guarantee loads and stores from this CPU when it was the * lock owner are visible to the next lock owner. This pairs * with __printk_cpu_sync_try_get:A. * * Memory barrier involvement: * * If __printk_cpu_sync_try_get:A reads from __printk_cpu_sync_put:B, * then __printk_cpu_sync_try_get:B reads from __printk_cpu_sync_put:A. * * Relies on: * * RELEASE from __printk_cpu_sync_put:A to __printk_cpu_sync_put:B * of this CPU * matching * ACQUIRE from __printk_cpu_sync_try_get:A to * __printk_cpu_sync_try_get:B of the next CPU */ atomic_set_release(&printk_cpu_sync_owner, -1); /* LMM(__printk_cpu_sync_put:B) */ } EXPORT_SYMBOL(__printk_cpu_sync_put); #endif /* CONFIG_SMP */ |
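/*
 * Illustrative usage sketch (not part of printk.c): the cpu-reentrant
 * spinning lock implemented above is normally taken through the
 * printk_cpu_sync_get_irqsave() and printk_cpu_sync_put_irqrestore()
 * helpers from <linux/printk.h>, e.g. to keep a multi-line report from
 * interleaving with printk() output of other CPUs. The caller below is
 * hypothetical and shown only as an example.
 */
#include <linux/printk.h>

static void example_report_unmixed(int err)
{
	unsigned long flags;

	/* Spin until this CPU owns the lock; interrupts are disabled. */
	printk_cpu_sync_get_irqsave(flags);

	pr_info("example: begin report\n");
	pr_info("example: status %d\n", err);
	pr_info("example: end report\n");

	/* Release ownership (or drop one nesting level) and restore IRQs. */
	printk_cpu_sync_put_irqrestore(flags);
}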
/* SPDX-License-Identifier: GPL-2.0-only */ #ifndef __KVM_HOST_H #define __KVM_HOST_H #include <linux/types.h> #include <linux/hardirq.h> #include <linux/list.h> #include <linux/mutex.h> #include <linux/spinlock.h> #include <linux/signal.h> #include <linux/sched.h> #include <linux/sched/stat.h> #include <linux/bug.h> #include <linux/minmax.h> #include <linux/mm.h> #include <linux/mmu_notifier.h> #include <linux/preempt.h> #include <linux/msi.h> #include <linux/slab.h> #include <linux/vmalloc.h> #include <linux/rcupdate.h> #include <linux/ratelimit.h> #include <linux/err.h> #include <linux/irqflags.h> #include <linux/context_tracking.h> #include <linux/irqbypass.h> #include <linux/rcuwait.h> #include <linux/refcount.h> #include <linux/nospec.h> #include <linux/notifier.h> #include <linux/ftrace.h> #include <linux/hashtable.h> #include <linux/instrumentation.h> #include <linux/interval_tree.h> #include <linux/rbtree.h> #include <linux/xarray.h> #include <asm/signal.h> #include <linux/kvm.h> #include <linux/kvm_para.h> #include <linux/kvm_types.h> #include <asm/kvm_host.h> #include <linux/kvm_dirty_ring.h> #ifndef KVM_MAX_VCPU_IDS #define KVM_MAX_VCPU_IDS KVM_MAX_VCPUS #endif /* * The bit 16 ~ bit 31 of kvm_userspace_memory_region::flags are internally * used in kvm, other bits are visible for userspace which are defined in * include/uapi/linux/kvm.h. */ #define KVM_MEMSLOT_INVALID (1UL << 16) /* * Bit 63 of the memslot generation number is an "update in-progress flag", * e.g. is temporarily set for the duration of kvm_swap_active_memslots(). * This flag effectively creates a unique generation number that is used to * mark cached memslot data, e.g. MMIO accesses, as potentially being stale, * i.e. may (or may not) have come from the previous memslots generation. * * This is necessary because the actual memslots update is not atomic with * respect to the generation number update. Updating the generation number * first would allow a vCPU to cache a spte from the old memslots using the * new generation number, and updating the generation number after switching * to the new memslots would allow cache hits using the old generation number * to reference the defunct memslots.
* * This mechanism is used to prevent getting hits in KVM's caches while a * memslot update is in-progress, and to prevent cache hits *after* updating * the actual generation number against accesses that were inserted into the * cache *before* the memslots were updated. */ #define KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS BIT_ULL(63) /* Two fragments for cross MMIO pages. */ #define KVM_MAX_MMIO_FRAGMENTS 2 #ifndef KVM_MAX_NR_ADDRESS_SPACES #define KVM_MAX_NR_ADDRESS_SPACES 1 #endif /* * For the normal pfn, the highest 12 bits should be zero, * so we can mask bit 62 ~ bit 52 to indicate the error pfn, * mask bit 63 to indicate the noslot pfn. */ #define KVM_PFN_ERR_MASK (0x7ffULL << 52) #define KVM_PFN_ERR_NOSLOT_MASK (0xfffULL << 52) #define KVM_PFN_NOSLOT (0x1ULL << 63) #define KVM_PFN_ERR_FAULT (KVM_PFN_ERR_MASK) #define KVM_PFN_ERR_HWPOISON (KVM_PFN_ERR_MASK + 1) #define KVM_PFN_ERR_RO_FAULT (KVM_PFN_ERR_MASK + 2) #define KVM_PFN_ERR_SIGPENDING (KVM_PFN_ERR_MASK + 3) #define KVM_PFN_ERR_NEEDS_IO (KVM_PFN_ERR_MASK + 4) /* * error pfns indicate that the gfn is in slot but faild to * translate it to pfn on host. */ static inline bool is_error_pfn(kvm_pfn_t pfn) { return !!(pfn & KVM_PFN_ERR_MASK); } /* * KVM_PFN_ERR_SIGPENDING indicates that fetching the PFN was interrupted * by a pending signal. Note, the signal may or may not be fatal. */ static inline bool is_sigpending_pfn(kvm_pfn_t pfn) { return pfn == KVM_PFN_ERR_SIGPENDING; } /* * error_noslot pfns indicate that the gfn can not be * translated to pfn - it is not in slot or failed to * translate it to pfn. */ static inline bool is_error_noslot_pfn(kvm_pfn_t pfn) { return !!(pfn & KVM_PFN_ERR_NOSLOT_MASK); } /* noslot pfn indicates that the gfn is not in slot. */ static inline bool is_noslot_pfn(kvm_pfn_t pfn) { return pfn == KVM_PFN_NOSLOT; } /* * architectures with KVM_HVA_ERR_BAD other than PAGE_OFFSET (e.g. s390) * provide own defines and kvm_is_error_hva */ #ifndef KVM_HVA_ERR_BAD #define KVM_HVA_ERR_BAD (PAGE_OFFSET) #define KVM_HVA_ERR_RO_BAD (PAGE_OFFSET + PAGE_SIZE) static inline bool kvm_is_error_hva(unsigned long addr) { return addr >= PAGE_OFFSET; } #endif static inline bool kvm_is_error_gpa(gpa_t gpa) { return gpa == INVALID_GPA; } #define KVM_REQUEST_MASK GENMASK(7,0) #define KVM_REQUEST_NO_WAKEUP BIT(8) #define KVM_REQUEST_WAIT BIT(9) #define KVM_REQUEST_NO_ACTION BIT(10) /* * Architecture-independent vcpu->requests bit members * Bits 3-7 are reserved for more arch-independent bits. */ #define KVM_REQ_TLB_FLUSH (0 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) #define KVM_REQ_VM_DEAD (1 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) #define KVM_REQ_UNBLOCK 2 #define KVM_REQ_DIRTY_RING_SOFT_FULL 3 #define KVM_REQUEST_ARCH_BASE 8 /* * KVM_REQ_OUTSIDE_GUEST_MODE exists is purely as way to force the vCPU to * OUTSIDE_GUEST_MODE. KVM_REQ_OUTSIDE_GUEST_MODE differs from a vCPU "kick" * in that it ensures the vCPU has reached OUTSIDE_GUEST_MODE before continuing * on. A kick only guarantees that the vCPU is on its way out, e.g. a previous * kick may have set vcpu->mode to EXITING_GUEST_MODE, and so there's no * guarantee the vCPU received an IPI and has actually exited guest mode. 
*/ #define KVM_REQ_OUTSIDE_GUEST_MODE (KVM_REQUEST_NO_ACTION | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) #define KVM_ARCH_REQ_FLAGS(nr, flags) ({ \ BUILD_BUG_ON((unsigned)(nr) >= (sizeof_field(struct kvm_vcpu, requests) * 8) - KVM_REQUEST_ARCH_BASE); \ (unsigned)(((nr) + KVM_REQUEST_ARCH_BASE) | (flags)); \ }) #define KVM_ARCH_REQ(nr) KVM_ARCH_REQ_FLAGS(nr, 0) bool kvm_make_vcpus_request_mask(struct kvm *kvm, unsigned int req, unsigned long *vcpu_bitmap); bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req); #define KVM_USERSPACE_IRQ_SOURCE_ID 0 #define KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID 1 extern struct mutex kvm_lock; extern struct list_head vm_list; struct kvm_io_range { gpa_t addr; int len; struct kvm_io_device *dev; }; #define NR_IOBUS_DEVS 1000 struct kvm_io_bus { int dev_count; int ioeventfd_count; struct kvm_io_range range[]; }; enum kvm_bus { KVM_MMIO_BUS, KVM_PIO_BUS, KVM_VIRTIO_CCW_NOTIFY_BUS, KVM_FAST_MMIO_BUS, KVM_IOCSR_BUS, KVM_NR_BUSES }; int kvm_io_bus_write(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr, int len, const void *val); int kvm_io_bus_write_cookie(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr, int len, const void *val, long cookie); int kvm_io_bus_read(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr, int len, void *val); int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, int len, struct kvm_io_device *dev); int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx, struct kvm_io_device *dev); struct kvm_io_device *kvm_io_bus_get_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr); #ifdef CONFIG_KVM_ASYNC_PF struct kvm_async_pf { struct work_struct work; struct list_head link; struct list_head queue; struct kvm_vcpu *vcpu; gpa_t cr2_or_gpa; unsigned long addr; struct kvm_arch_async_pf arch; bool wakeup_all; bool notpresent_injected; }; void kvm_clear_async_pf_completion_queue(struct kvm_vcpu *vcpu); void kvm_check_async_pf_completion(struct kvm_vcpu *vcpu); bool kvm_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, unsigned long hva, struct kvm_arch_async_pf *arch); int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu); #endif #ifdef CONFIG_KVM_GENERIC_MMU_NOTIFIER union kvm_mmu_notifier_arg { unsigned long attributes; }; enum kvm_gfn_range_filter { KVM_FILTER_SHARED = BIT(0), KVM_FILTER_PRIVATE = BIT(1), }; struct kvm_gfn_range { struct kvm_memory_slot *slot; gfn_t start; gfn_t end; union kvm_mmu_notifier_arg arg; enum kvm_gfn_range_filter attr_filter; bool may_block; }; bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range); bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range); bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range); #endif enum { OUTSIDE_GUEST_MODE, IN_GUEST_MODE, EXITING_GUEST_MODE, READING_SHADOW_PAGE_TABLES, }; struct kvm_host_map { /* * Only valid if the 'pfn' is managed by the host kernel (i.e. There is * a 'struct page' for it. When using mem= kernel parameter some memory * can be used as guest memory but they are not managed by host * kernel). */ struct page *pinned_page; struct page *page; void *hva; kvm_pfn_t pfn; kvm_pfn_t gfn; bool writable; }; /* * Used to check if the mapping is valid or not. Never use 'kvm_host_map' * directly to check for that. 
*/ static inline bool kvm_vcpu_mapped(struct kvm_host_map *map) { return !!map->hva; } static inline bool kvm_vcpu_can_poll(ktime_t cur, ktime_t stop) { return single_task_running() && !need_resched() && ktime_before(cur, stop); } /* * Sometimes a large or cross-page mmio needs to be broken up into separate * exits for userspace servicing. */ struct kvm_mmio_fragment { gpa_t gpa; void *data; unsigned len; }; struct kvm_vcpu { struct kvm *kvm; #ifdef CONFIG_PREEMPT_NOTIFIERS struct preempt_notifier preempt_notifier; #endif int cpu; int vcpu_id; /* id given by userspace at creation */ int vcpu_idx; /* index into kvm->vcpu_array */ int ____srcu_idx; /* Don't use this directly. You've been warned. */ #ifdef CONFIG_PROVE_RCU int srcu_depth; #endif int mode; u64 requests; unsigned long guest_debug; struct mutex mutex; struct kvm_run *run; #ifndef __KVM_HAVE_ARCH_WQP struct rcuwait wait; #endif struct pid *pid; rwlock_t pid_lock; int sigset_active; sigset_t sigset; unsigned int halt_poll_ns; bool valid_wakeup; #ifdef CONFIG_HAS_IOMEM int mmio_needed; int mmio_read_completed; int mmio_is_write; int mmio_cur_fragment; int mmio_nr_fragments; struct kvm_mmio_fragment mmio_fragments[KVM_MAX_MMIO_FRAGMENTS]; #endif #ifdef CONFIG_KVM_ASYNC_PF struct { u32 queued; struct list_head queue; struct list_head done; spinlock_t lock; } async_pf; #endif #ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT /* * Cpu relax intercept or pause loop exit optimization * in_spin_loop: set when a vcpu does a pause loop exit * or cpu relax intercepted. * dy_eligible: indicates whether vcpu is eligible for directed yield. */ struct { bool in_spin_loop; bool dy_eligible; } spin_loop; #endif bool wants_to_run; bool preempted; bool ready; bool scheduled_out; struct kvm_vcpu_arch arch; struct kvm_vcpu_stat stat; char stats_id[KVM_STATS_NAME_SIZE]; struct kvm_dirty_ring dirty_ring; /* * The most recently used memslot by this vCPU and the slots generation * for which it is valid. * No wraparound protection is needed since generations won't overflow in * thousands of years, even assuming 1M memslot operations per second. */ struct kvm_memory_slot *last_used_slot; u64 last_used_slot_gen; }; /* * Start accounting time towards a guest. * Must be called before entering guest context. */ static __always_inline void guest_timing_enter_irqoff(void) { /* * This is running in ioctl context so its safe to assume that it's the * stime pending cputime to flush. */ instrumentation_begin(); vtime_account_guest_enter(); instrumentation_end(); } /* * Enter guest context and enter an RCU extended quiescent state. * * Between guest_context_enter_irqoff() and guest_context_exit_irqoff() it is * unsafe to use any code which may directly or indirectly use RCU, tracing * (including IRQ flag tracing), or lockdep. All code in this period must be * non-instrumentable. */ static __always_inline void guest_context_enter_irqoff(void) { /* * KVM does not hold any references to rcu protected data when it * switches CPU into a guest mode. In fact switching to a guest mode * is very similar to exiting to userspace from rcu point of view. In * addition CPU may stay in a guest mode for quite a long time (up to * one time slice). Lets treat guest mode as quiescent state, just like * we do with user-mode execution. */ if (!context_tracking_guest_enter()) { instrumentation_begin(); rcu_virt_note_context_switch(); instrumentation_end(); } } /* * Deprecated. Architectures should move to guest_timing_enter_irqoff() and * guest_state_enter_irqoff(). 
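 *
 * A rough sketch of the replacement sequence (illustrative only, following
 * the requirements documented for guest_state_enter_irqoff() and
 * guest_state_exit_irqoff() later in this file):
 *
 *	guest_timing_enter_irqoff();
 *	guest_state_enter_irqoff();
 *	< run the vCPU in guest mode >
 *	guest_state_exit_irqoff();
 *	guest_timing_exit_irqoff();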
*/ static __always_inline void guest_enter_irqoff(void) { guest_timing_enter_irqoff(); guest_context_enter_irqoff(); } /** * guest_state_enter_irqoff - Fixup state when entering a guest * * Entry to a guest will enable interrupts, but the kernel state is interrupts * disabled when this is invoked. Also tell RCU about it. * * 1) Trace interrupts on state * 2) Invoke context tracking if enabled to adjust RCU state * 3) Tell lockdep that interrupts are enabled * * Invoked from architecture specific code before entering a guest. * Must be called with interrupts disabled and the caller must be * non-instrumentable. * The caller has to invoke guest_timing_enter_irqoff() before this. * * Note: this is analogous to exit_to_user_mode(). */ static __always_inline void guest_state_enter_irqoff(void) { instrumentation_begin(); trace_hardirqs_on_prepare(); lockdep_hardirqs_on_prepare(); instrumentation_end(); guest_context_enter_irqoff(); lockdep_hardirqs_on(CALLER_ADDR0); } /* * Exit guest context and exit an RCU extended quiescent state. * * Between guest_context_enter_irqoff() and guest_context_exit_irqoff() it is * unsafe to use any code which may directly or indirectly use RCU, tracing * (including IRQ flag tracing), or lockdep. All code in this period must be * non-instrumentable. */ static __always_inline void guest_context_exit_irqoff(void) { /* * Guest mode is treated as a quiescent state, see * guest_context_enter_irqoff() for more details. */ if (!context_tracking_guest_exit()) { instrumentation_begin(); rcu_virt_note_context_switch(); instrumentation_end(); } } /* * Stop accounting time towards a guest. * Must be called after exiting guest context. */ static __always_inline void guest_timing_exit_irqoff(void) { instrumentation_begin(); /* Flush the guest cputime we spent on the guest */ vtime_account_guest_exit(); instrumentation_end(); } /* * Deprecated. Architectures should move to guest_state_exit_irqoff() and * guest_timing_exit_irqoff(). */ static __always_inline void guest_exit_irqoff(void) { guest_context_exit_irqoff(); guest_timing_exit_irqoff(); } static inline void guest_exit(void) { unsigned long flags; local_irq_save(flags); guest_exit_irqoff(); local_irq_restore(flags); } /** * guest_state_exit_irqoff - Establish state when returning from guest mode * * Entry from a guest disables interrupts, but guest mode is traced as * interrupts enabled. Also with NO_HZ_FULL RCU might be idle. * * 1) Tell lockdep that interrupts are disabled * 2) Invoke context tracking if enabled to reactivate RCU * 3) Trace interrupts off state * * Invoked from architecture specific code after exiting a guest. * Must be invoked with interrupts disabled and the caller must be * non-instrumentable. * The caller has to invoke guest_timing_exit_irqoff() after this. * * Note: this is analogous to enter_from_user_mode(). */ static __always_inline void guest_state_exit_irqoff(void) { lockdep_hardirqs_off(CALLER_ADDR0); guest_context_exit_irqoff(); instrumentation_begin(); trace_hardirqs_off_finish(); instrumentation_end(); } static inline int kvm_vcpu_exiting_guest_mode(struct kvm_vcpu *vcpu) { /* * The memory barrier ensures a previous write to vcpu->requests cannot * be reordered with the read of vcpu->mode. It pairs with the general * memory barrier following the write of vcpu->mode in VCPU RUN. */ smp_mb__before_atomic(); return cmpxchg(&vcpu->mode, IN_GUEST_MODE, EXITING_GUEST_MODE); } /* * Some of the bitops functions do not support too long bitmaps. 
* This number must be determined not to exceed such limits. */ #define KVM_MEM_MAX_NR_PAGES ((1UL << 31) - 1) /* * Since at idle each memslot belongs to two memslot sets it has to contain * two embedded nodes for each data structure that it forms a part of. * * Two memslot sets (one active and one inactive) are necessary so the VM * continues to run on one memslot set while the other is being modified. * * These two memslot sets normally point to the same set of memslots. * They can, however, be desynchronized when performing a memslot management * operation by replacing the memslot to be modified by its copy. * After the operation is complete, both memslot sets once again point to * the same, common set of memslot data. * * The memslots themselves are independent of each other so they can be * individually added or deleted. */ struct kvm_memory_slot { struct hlist_node id_node[2]; struct interval_tree_node hva_node[2]; struct rb_node gfn_node[2]; gfn_t base_gfn; unsigned long npages; unsigned long *dirty_bitmap; struct kvm_arch_memory_slot arch; unsigned long userspace_addr; u32 flags; short id; u16 as_id; #ifdef CONFIG_KVM_PRIVATE_MEM struct { /* * Writes protected by kvm->slots_lock. Acquiring a * reference via kvm_gmem_get_file() is protected by * either kvm->slots_lock or kvm->srcu. */ struct file *file; pgoff_t pgoff; } gmem; #endif }; static inline bool kvm_slot_can_be_private(const struct kvm_memory_slot *slot) { return slot && (slot->flags & KVM_MEM_GUEST_MEMFD); } static inline bool kvm_slot_dirty_track_enabled(const struct kvm_memory_slot *slot) { return slot->flags & KVM_MEM_LOG_DIRTY_PAGES; } static inline unsigned long kvm_dirty_bitmap_bytes(struct kvm_memory_slot *memslot) { return ALIGN(memslot->npages, BITS_PER_LONG) / 8; } static inline unsigned long *kvm_second_dirty_bitmap(struct kvm_memory_slot *memslot) { unsigned long len = kvm_dirty_bitmap_bytes(memslot); return memslot->dirty_bitmap + len / sizeof(*memslot->dirty_bitmap); } #ifndef KVM_DIRTY_LOG_MANUAL_CAPS #define KVM_DIRTY_LOG_MANUAL_CAPS KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE #endif struct kvm_s390_adapter_int { u64 ind_addr; u64 summary_addr; u64 ind_offset; u32 summary_offset; u32 adapter_id; }; struct kvm_hv_sint { u32 vcpu; u32 sint; }; struct kvm_xen_evtchn { u32 port; u32 vcpu_id; int vcpu_idx; u32 priority; }; struct kvm_kernel_irq_routing_entry { u32 gsi; u32 type; int (*set)(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm, int irq_source_id, int level, bool line_status); union { struct { unsigned irqchip; unsigned pin; } irqchip; struct { u32 address_lo; u32 address_hi; u32 data; u32 flags; u32 devid; } msi; struct kvm_s390_adapter_int adapter; struct kvm_hv_sint hv_sint; struct kvm_xen_evtchn xen_evtchn; }; struct hlist_node link; }; #ifdef CONFIG_HAVE_KVM_IRQ_ROUTING struct kvm_irq_routing_table { int chip[KVM_NR_IRQCHIPS][KVM_IRQCHIP_NUM_PINS]; u32 nr_rt_entries; /* * Array indexed by gsi. Each entry contains list of irq chips * the gsi is connected to. 
*/ struct hlist_head map[] __counted_by(nr_rt_entries); }; #endif bool kvm_arch_irqchip_in_kernel(struct kvm *kvm); #ifndef KVM_INTERNAL_MEM_SLOTS #define KVM_INTERNAL_MEM_SLOTS 0 #endif #define KVM_MEM_SLOTS_NUM SHRT_MAX #define KVM_USER_MEM_SLOTS (KVM_MEM_SLOTS_NUM - KVM_INTERNAL_MEM_SLOTS) #if KVM_MAX_NR_ADDRESS_SPACES == 1 static inline int kvm_arch_nr_memslot_as_ids(struct kvm *kvm) { return KVM_MAX_NR_ADDRESS_SPACES; } static inline int kvm_arch_vcpu_memslots_id(struct kvm_vcpu *vcpu) { return 0; } #endif /* * Arch code must define kvm_arch_has_private_mem if support for private memory * is enabled. */ #if !defined(kvm_arch_has_private_mem) && !IS_ENABLED(CONFIG_KVM_PRIVATE_MEM) static inline bool kvm_arch_has_private_mem(struct kvm *kvm) { return false; } #endif #ifndef kvm_arch_has_readonly_mem static inline bool kvm_arch_has_readonly_mem(struct kvm *kvm) { return IS_ENABLED(CONFIG_HAVE_KVM_READONLY_MEM); } #endif struct kvm_memslots { u64 generation; atomic_long_t last_used_slot; struct rb_root_cached hva_tree; struct rb_root gfn_tree; /* * The mapping table from slot id to memslot. * * 7-bit bucket count matches the size of the old id to index array for * 512 slots, while giving good performance with this slot count. * Higher bucket counts bring only small performance improvements but * always result in higher memory usage (even for lower memslot counts). */ DECLARE_HASHTABLE(id_hash, 7); int node_idx; }; struct kvm { #ifdef KVM_HAVE_MMU_RWLOCK rwlock_t mmu_lock; #else spinlock_t mmu_lock; #endif /* KVM_HAVE_MMU_RWLOCK */ struct mutex slots_lock; /* * Protects the arch-specific fields of struct kvm_memory_slots in * use by the VM. To be used under the slots_lock (above) or in a * kvm->srcu critical section where acquiring the slots_lock would * lead to deadlock with the synchronize_srcu in * kvm_swap_active_memslots(). */ struct mutex slots_arch_lock; struct mm_struct *mm; /* userspace tied to this vm */ unsigned long nr_memslot_pages; /* The two memslot sets - active and inactive (per address space) */ struct kvm_memslots __memslots[KVM_MAX_NR_ADDRESS_SPACES][2]; /* The current active memslot set for each address space */ struct kvm_memslots __rcu *memslots[KVM_MAX_NR_ADDRESS_SPACES]; struct xarray vcpu_array; /* * Protected by slots_lock, but can be read outside if an * incorrect answer is acceptable. */ atomic_t nr_memslots_dirty_logging; /* Used to wait for completion of MMU notifiers. */ spinlock_t mn_invalidate_lock; unsigned long mn_active_invalidate_count; struct rcuwait mn_memslots_update_rcuwait; /* For management / invalidation of gfn_to_pfn_caches */ spinlock_t gpc_lock; struct list_head gpc_list; /* * created_vcpus is protected by kvm->lock, and is incremented * at the beginning of KVM_CREATE_VCPU. online_vcpus is only * incremented after storing the kvm_vcpu pointer in vcpus, * and is accessed atomically. */ atomic_t online_vcpus; int max_vcpus; int created_vcpus; int last_boosted_vcpu; struct list_head vm_list; struct mutex lock; struct kvm_io_bus __rcu *buses[KVM_NR_BUSES]; #ifdef CONFIG_HAVE_KVM_IRQCHIP struct { spinlock_t lock; struct list_head items; /* resampler_list update side is protected by resampler_lock. 
*/ struct list_head resampler_list; struct mutex resampler_lock; } irqfds; #endif struct list_head ioeventfds; struct kvm_vm_stat stat; struct kvm_arch arch; refcount_t users_count; #ifdef CONFIG_KVM_MMIO struct kvm_coalesced_mmio_ring *coalesced_mmio_ring; spinlock_t ring_lock; struct list_head coalesced_zones; #endif struct mutex irq_lock; #ifdef CONFIG_HAVE_KVM_IRQCHIP /* * Update side is protected by irq_lock. */ struct kvm_irq_routing_table __rcu *irq_routing; struct hlist_head irq_ack_notifier_list; #endif #ifdef CONFIG_KVM_GENERIC_MMU_NOTIFIER struct mmu_notifier mmu_notifier; unsigned long mmu_invalidate_seq; long mmu_invalidate_in_progress; gfn_t mmu_invalidate_range_start; gfn_t mmu_invalidate_range_end; #endif struct list_head devices; u64 manual_dirty_log_protect; struct dentry *debugfs_dentry; struct kvm_stat_data **debugfs_stat_data; struct srcu_struct srcu; struct srcu_struct irq_srcu; pid_t userspace_pid; bool override_halt_poll_ns; unsigned int max_halt_poll_ns; u32 dirty_ring_size; bool dirty_ring_with_bitmap; bool vm_bugged; bool vm_dead; #ifdef CONFIG_HAVE_KVM_PM_NOTIFIER struct notifier_block pm_notifier; #endif #ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES /* Protected by slots_locks (for writes) and RCU (for reads) */ struct xarray mem_attr_array; #endif char stats_id[KVM_STATS_NAME_SIZE]; }; #define kvm_err(fmt, ...) \ pr_err("kvm [%i]: " fmt, task_pid_nr(current), ## __VA_ARGS__) #define kvm_info(fmt, ...) \ pr_info("kvm [%i]: " fmt, task_pid_nr(current), ## __VA_ARGS__) #define kvm_debug(fmt, ...) \ pr_debug("kvm [%i]: " fmt, task_pid_nr(current), ## __VA_ARGS__) #define kvm_debug_ratelimited(fmt, ...) \ pr_debug_ratelimited("kvm [%i]: " fmt, task_pid_nr(current), \ ## __VA_ARGS__) #define kvm_pr_unimpl(fmt, ...) \ pr_err_ratelimited("kvm [%i]: " fmt, \ task_tgid_nr(current), ## __VA_ARGS__) /* The guest did something we don't support. */ #define vcpu_unimpl(vcpu, fmt, ...) \ kvm_pr_unimpl("vcpu%i, guest rIP: 0x%lx " fmt, \ (vcpu)->vcpu_id, kvm_rip_read(vcpu), ## __VA_ARGS__) #define vcpu_debug(vcpu, fmt, ...) \ kvm_debug("vcpu%i " fmt, (vcpu)->vcpu_id, ## __VA_ARGS__) #define vcpu_debug_ratelimited(vcpu, fmt, ...) \ kvm_debug_ratelimited("vcpu%i " fmt, (vcpu)->vcpu_id, \ ## __VA_ARGS__) #define vcpu_err(vcpu, fmt, ...) \ kvm_err("vcpu%i " fmt, (vcpu)->vcpu_id, ## __VA_ARGS__) static inline void kvm_vm_dead(struct kvm *kvm) { kvm->vm_dead = true; kvm_make_all_cpus_request(kvm, KVM_REQ_VM_DEAD); } static inline void kvm_vm_bugged(struct kvm *kvm) { kvm->vm_bugged = true; kvm_vm_dead(kvm); } #define KVM_BUG(cond, kvm, fmt...) \ ({ \ bool __ret = !!(cond); \ \ if (WARN_ONCE(__ret && !(kvm)->vm_bugged, fmt)) \ kvm_vm_bugged(kvm); \ unlikely(__ret); \ }) #define KVM_BUG_ON(cond, kvm) \ ({ \ bool __ret = !!(cond); \ \ if (WARN_ON_ONCE(__ret && !(kvm)->vm_bugged)) \ kvm_vm_bugged(kvm); \ unlikely(__ret); \ }) /* * Note, "data corruption" refers to corruption of host kernel data structures, * not guest data. Guest data corruption, suspected or confirmed, that is tied * and contained to a single VM should *never* BUG() and potentially panic the * host, i.e. use this variant of KVM_BUG() if and only if a KVM data structure * is corrupted and that corruption can have a cascading effect to other parts * of the hosts and/or to other VMs. 
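 *
 * A hypothetical call site, purely for illustration (not taken from KVM):
 *
 *	if (KVM_BUG_ON_DATA_CORRUPTION(atomic_read(&kvm->online_vcpus) < 0, kvm))
 *		return -EIO;
 *
 * With CONFIG_BUG_ON_DATA_CORRUPTION=y a true condition BUG()s the host;
 * otherwise it WARNs once, marks the VM as bugged, and still evaluates to
 * true so the caller can bail out.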
*/ #define KVM_BUG_ON_DATA_CORRUPTION(cond, kvm) \ ({ \ bool __ret = !!(cond); \ \ if (IS_ENABLED(CONFIG_BUG_ON_DATA_CORRUPTION)) \ BUG_ON(__ret); \ else if (WARN_ON_ONCE(__ret && !(kvm)->vm_bugged)) \ kvm_vm_bugged(kvm); \ unlikely(__ret); \ }) static inline void kvm_vcpu_srcu_read_lock(struct kvm_vcpu *vcpu) { #ifdef CONFIG_PROVE_RCU WARN_ONCE(vcpu->srcu_depth++, "KVM: Illegal vCPU srcu_idx LOCK, depth=%d", vcpu->srcu_depth - 1); #endif vcpu->____srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); } static inline void kvm_vcpu_srcu_read_unlock(struct kvm_vcpu *vcpu) { srcu_read_unlock(&vcpu->kvm->srcu, vcpu->____srcu_idx); #ifdef CONFIG_PROVE_RCU WARN_ONCE(--vcpu->srcu_depth, "KVM: Illegal vCPU srcu_idx UNLOCK, depth=%d", vcpu->srcu_depth); #endif } static inline bool kvm_dirty_log_manual_protect_and_init_set(struct kvm *kvm) { return !!(kvm->manual_dirty_log_protect & KVM_DIRTY_LOG_INITIALLY_SET); } static inline struct kvm_io_bus *kvm_get_bus(struct kvm *kvm, enum kvm_bus idx) { return srcu_dereference_check(kvm->buses[idx], &kvm->srcu, lockdep_is_held(&kvm->slots_lock) || !refcount_read(&kvm->users_count)); } static inline struct kvm_vcpu *kvm_get_vcpu(struct kvm *kvm, int i) { int num_vcpus = atomic_read(&kvm->online_vcpus); /* * Explicitly verify the target vCPU is online, as the anti-speculation * logic only limits the CPU's ability to speculate, e.g. given a "bad" * index, clamping the index to 0 would return vCPU0, not NULL. */ if (i >= num_vcpus) return NULL; i = array_index_nospec(i, num_vcpus); /* Pairs with smp_wmb() in kvm_vm_ioctl_create_vcpu. */ smp_rmb(); return xa_load(&kvm->vcpu_array, i); } #define kvm_for_each_vcpu(idx, vcpup, kvm) \ if (atomic_read(&kvm->online_vcpus)) \ xa_for_each_range(&kvm->vcpu_array, idx, vcpup, 0, \ (atomic_read(&kvm->online_vcpus) - 1)) static inline struct kvm_vcpu *kvm_get_vcpu_by_id(struct kvm *kvm, int id) { struct kvm_vcpu *vcpu = NULL; unsigned long i; if (id < 0) return NULL; if (id < KVM_MAX_VCPUS) vcpu = kvm_get_vcpu(kvm, id); if (vcpu && vcpu->vcpu_id == id) return vcpu; kvm_for_each_vcpu(i, vcpu, kvm) if (vcpu->vcpu_id == id) return vcpu; return NULL; } void kvm_destroy_vcpus(struct kvm *kvm); void vcpu_load(struct kvm_vcpu *vcpu); void vcpu_put(struct kvm_vcpu *vcpu); #ifdef __KVM_HAVE_IOAPIC void kvm_arch_post_irq_ack_notifier_list_update(struct kvm *kvm); void kvm_arch_post_irq_routing_update(struct kvm *kvm); #else static inline void kvm_arch_post_irq_ack_notifier_list_update(struct kvm *kvm) { } static inline void kvm_arch_post_irq_routing_update(struct kvm *kvm) { } #endif #ifdef CONFIG_HAVE_KVM_IRQCHIP int kvm_irqfd_init(void); void kvm_irqfd_exit(void); #else static inline int kvm_irqfd_init(void) { return 0; } static inline void kvm_irqfd_exit(void) { } #endif int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module); void kvm_exit(void); void kvm_get_kvm(struct kvm *kvm); bool kvm_get_kvm_safe(struct kvm *kvm); void kvm_put_kvm(struct kvm *kvm); bool file_is_kvm(struct file *file); void kvm_put_kvm_no_destroy(struct kvm *kvm); static inline struct kvm_memslots *__kvm_memslots(struct kvm *kvm, int as_id) { as_id = array_index_nospec(as_id, KVM_MAX_NR_ADDRESS_SPACES); return srcu_dereference_check(kvm->memslots[as_id], &kvm->srcu, lockdep_is_held(&kvm->slots_lock) || !refcount_read(&kvm->users_count)); } static inline struct kvm_memslots *kvm_memslots(struct kvm *kvm) { return __kvm_memslots(kvm, 0); } static inline struct kvm_memslots *kvm_vcpu_memslots(struct kvm_vcpu *vcpu) { int as_id = 
kvm_arch_vcpu_memslots_id(vcpu); return __kvm_memslots(vcpu->kvm, as_id); } static inline bool kvm_memslots_empty(struct kvm_memslots *slots) { return RB_EMPTY_ROOT(&slots->gfn_tree); } bool kvm_are_all_memslots_empty(struct kvm *kvm); #define kvm_for_each_memslot(memslot, bkt, slots) \ hash_for_each(slots->id_hash, bkt, memslot, id_node[slots->node_idx]) \ if (WARN_ON_ONCE(!memslot->npages)) { \ } else static inline struct kvm_memory_slot *id_to_memslot(struct kvm_memslots *slots, int id) { struct kvm_memory_slot *slot; int idx = slots->node_idx; hash_for_each_possible(slots->id_hash, slot, id_node[idx], id) { if (slot->id == id) return slot; } return NULL; } /* Iterator used for walking memslots that overlap a gfn range. */ struct kvm_memslot_iter { struct kvm_memslots *slots; struct rb_node *node; struct kvm_memory_slot *slot; }; static inline void kvm_memslot_iter_next(struct kvm_memslot_iter *iter) { iter->node = rb_next(iter->node); if (!iter->node) return; iter->slot = container_of(iter->node, struct kvm_memory_slot, gfn_node[iter->slots->node_idx]); } static inline void kvm_memslot_iter_start(struct kvm_memslot_iter *iter, struct kvm_memslots *slots, gfn_t start) { int idx = slots->node_idx; struct rb_node *tmp; struct kvm_memory_slot *slot; iter->slots = slots; /* * Find the so called "upper bound" of a key - the first node that has * its key strictly greater than the searched one (the start gfn in our case). */ iter->node = NULL; for (tmp = slots->gfn_tree.rb_node; tmp; ) { slot = container_of(tmp, struct kvm_memory_slot, gfn_node[idx]); if (start < slot->base_gfn) { iter->node = tmp; tmp = tmp->rb_left; } else { tmp = tmp->rb_right; } } /* * Find the slot with the lowest gfn that can possibly intersect with * the range, so we'll ideally have slot start <= range start */ if (iter->node) { /* * A NULL previous node means that the very first slot * already has a higher start gfn. * In this case slot start > range start. */ tmp = rb_prev(iter->node); if (tmp) iter->node = tmp; } else { /* a NULL node below means no slots */ iter->node = rb_last(&slots->gfn_tree); } if (iter->node) { iter->slot = container_of(iter->node, struct kvm_memory_slot, gfn_node[idx]); /* * It is possible in the slot start < range start case that the * found slot ends before or at range start (slot end <= range start) * and so it does not overlap the requested range. * * In such non-overlapping case the next slot (if it exists) will * already have slot start > range start, otherwise the logic above * would have found it instead of the current slot. 
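 * For example, with slots spanning gfns [0x100, 0x200) and [0x300, 0x400) and start == 0x250, the upper bound search selects the 0x300 slot, rb_prev() steps back to the 0x100 slot, and because that slot ends at 0x200 <= start the check below advances the iterator to the 0x300 slot, the first one that can intersect the range.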
*/ if (iter->slot->base_gfn + iter->slot->npages <= start) kvm_memslot_iter_next(iter); } } static inline bool kvm_memslot_iter_is_valid(struct kvm_memslot_iter *iter, gfn_t end) { if (!iter->node) return false; /* * If this slot starts beyond or at the end of the range so does * every next one */ return iter->slot->base_gfn < end; } /* Iterate over each memslot at least partially intersecting [start, end) range */ #define kvm_for_each_memslot_in_gfn_range(iter, slots, start, end) \ for (kvm_memslot_iter_start(iter, slots, start); \ kvm_memslot_iter_is_valid(iter, end); \ kvm_memslot_iter_next(iter)) struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn); struct kvm_memslots *kvm_vcpu_memslots(struct kvm_vcpu *vcpu); struct kvm_memory_slot *kvm_vcpu_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn); /* * KVM_SET_USER_MEMORY_REGION ioctl allows the following operations: * - create a new memory slot * - delete an existing memory slot * - modify an existing memory slot * -- move it in the guest physical memory space * -- just change its flags * * Since flags can be changed by some of these operations, the following * differentiation is the best we can do for kvm_set_memory_region(): */ enum kvm_mr_change { KVM_MR_CREATE, KVM_MR_DELETE, KVM_MR_MOVE, KVM_MR_FLAGS_ONLY, }; int kvm_set_internal_memslot(struct kvm *kvm, const struct kvm_userspace_memory_region2 *mem); void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot); void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen); int kvm_arch_prepare_memory_region(struct kvm *kvm, const struct kvm_memory_slot *old, struct kvm_memory_slot *new, enum kvm_mr_change change); void kvm_arch_commit_memory_region(struct kvm *kvm, struct kvm_memory_slot *old, const struct kvm_memory_slot *new, enum kvm_mr_change change); /* flush all memory translations */ void kvm_arch_flush_shadow_all(struct kvm *kvm); /* flush memory translations pointing to 'slot' */ void kvm_arch_flush_shadow_memslot(struct kvm *kvm, struct kvm_memory_slot *slot); int kvm_prefetch_pages(struct kvm_memory_slot *slot, gfn_t gfn, struct page **pages, int nr_pages); struct page *__gfn_to_page(struct kvm *kvm, gfn_t gfn, bool write); static inline struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn) { return __gfn_to_page(kvm, gfn, true); } unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn); unsigned long gfn_to_hva_prot(struct kvm *kvm, gfn_t gfn, bool *writable); unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot, gfn_t gfn); unsigned long gfn_to_hva_memslot_prot(struct kvm_memory_slot *slot, gfn_t gfn, bool *writable); static inline void kvm_release_page_unused(struct page *page) { if (!page) return; put_page(page); } void kvm_release_page_clean(struct page *page); void kvm_release_page_dirty(struct page *page); static inline void kvm_release_faultin_page(struct kvm *kvm, struct page *page, bool unused, bool dirty) { lockdep_assert_once(lockdep_is_held(&kvm->mmu_lock) || unused); if (!page) return; /* * If the page that KVM got from the *primary MMU* is writable, and KVM * installed or reused a SPTE, mark the page/folio dirty. Note, this * may mark a folio dirty even if KVM created a read-only SPTE, e.g. if * the GFN is write-protected. Folios can't be safely marked dirty * outside of mmu_lock as doing so could race with writeback on the * folio. As a result, KVM can't mark folios dirty in the fast page * fault handler, and so KVM must (somewhat) speculatively mark the * folio dirty if KVM could locklessly make the SPTE writable. 
*/ if (unused) kvm_release_page_unused(page); else if (dirty) kvm_release_page_dirty(page); else kvm_release_page_clean(page); } kvm_pfn_t __kvm_faultin_pfn(const struct kvm_memory_slot *slot, gfn_t gfn, unsigned int foll, bool *writable, struct page **refcounted_page); static inline kvm_pfn_t kvm_faultin_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, bool write, bool *writable, struct page **refcounted_page) { return __kvm_faultin_pfn(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn, write ? FOLL_WRITE : 0, writable, refcounted_page); } int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset, int len); int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len); int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, void *data, unsigned long len); int kvm_read_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, void *data, unsigned int offset, unsigned long len); int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data, int offset, int len); int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data, unsigned long len); int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, void *data, unsigned long len); int kvm_write_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, void *data, unsigned int offset, unsigned long len); int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc, gpa_t gpa, unsigned long len); #define __kvm_get_guest(kvm, gfn, offset, v) \ ({ \ unsigned long __addr = gfn_to_hva(kvm, gfn); \ typeof(v) __user *__uaddr = (typeof(__uaddr))(__addr + offset); \ int __ret = -EFAULT; \ \ if (!kvm_is_error_hva(__addr)) \ __ret = get_user(v, __uaddr); \ __ret; \ }) #define kvm_get_guest(kvm, gpa, v) \ ({ \ gpa_t __gpa = gpa; \ struct kvm *__kvm = kvm; \ \ __kvm_get_guest(__kvm, __gpa >> PAGE_SHIFT, \ offset_in_page(__gpa), v); \ }) #define __kvm_put_guest(kvm, gfn, offset, v) \ ({ \ unsigned long __addr = gfn_to_hva(kvm, gfn); \ typeof(v) __user *__uaddr = (typeof(__uaddr))(__addr + offset); \ int __ret = -EFAULT; \ \ if (!kvm_is_error_hva(__addr)) \ __ret = put_user(v, __uaddr); \ if (!__ret) \ mark_page_dirty(kvm, gfn); \ __ret; \ }) #define kvm_put_guest(kvm, gpa, v) \ ({ \ gpa_t __gpa = gpa; \ struct kvm *__kvm = kvm; \ \ __kvm_put_guest(__kvm, __gpa >> PAGE_SHIFT, \ offset_in_page(__gpa), v); \ }) int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len); bool kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn); bool kvm_vcpu_is_visible_gfn(struct kvm_vcpu *vcpu, gfn_t gfn); unsigned long kvm_host_page_size(struct kvm_vcpu *vcpu, gfn_t gfn); void mark_page_dirty_in_slot(struct kvm *kvm, const struct kvm_memory_slot *memslot, gfn_t gfn); void mark_page_dirty(struct kvm *kvm, gfn_t gfn); int __kvm_vcpu_map(struct kvm_vcpu *vcpu, gpa_t gpa, struct kvm_host_map *map, bool writable); void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map); static inline int kvm_vcpu_map(struct kvm_vcpu *vcpu, gpa_t gpa, struct kvm_host_map *map) { return __kvm_vcpu_map(vcpu, gpa, map, true); } static inline int kvm_vcpu_map_readonly(struct kvm_vcpu *vcpu, gpa_t gpa, struct kvm_host_map *map) { return __kvm_vcpu_map(vcpu, gpa, map, false); } unsigned long kvm_vcpu_gfn_to_hva(struct kvm_vcpu *vcpu, gfn_t gfn); unsigned long kvm_vcpu_gfn_to_hva_prot(struct kvm_vcpu *vcpu, gfn_t gfn, bool *writable); int kvm_vcpu_read_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, void *data, int offset, int len); int kvm_vcpu_read_guest_atomic(struct kvm_vcpu *vcpu, gpa_t gpa, void 
*data, unsigned long len); int kvm_vcpu_read_guest(struct kvm_vcpu *vcpu, gpa_t gpa, void *data, unsigned long len); int kvm_vcpu_write_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, const void *data, int offset, int len); int kvm_vcpu_write_guest(struct kvm_vcpu *vcpu, gpa_t gpa, const void *data, unsigned long len); void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn);
/** * kvm_gpc_init - initialize gfn_to_pfn_cache. * * @gpc: struct gfn_to_pfn_cache object. * @kvm: pointer to kvm instance. * * This sets up a gfn_to_pfn_cache by initializing locks and assigning the * immutable attributes. Note, the cache must be zero-allocated (or zeroed by * the caller before init). */ void kvm_gpc_init(struct gfn_to_pfn_cache *gpc, struct kvm *kvm);
/** * kvm_gpc_activate - prepare a cached kernel mapping and HPA for a given guest * physical address. * * @gpc: struct gfn_to_pfn_cache object. * @gpa: guest physical address to map. * @len: sanity check; the range being accessed must fit a single page. * * @return: 0 for success. * -EINVAL for a mapping which would cross a page boundary. * -EFAULT for an untranslatable guest physical address. * * This primes a gfn_to_pfn_cache and links it into the @gpc->kvm's list for * invalidations to be processed. Callers are required to use kvm_gpc_check() * to ensure that the cache is valid before accessing the target page. */ int kvm_gpc_activate(struct gfn_to_pfn_cache *gpc, gpa_t gpa, unsigned long len);
/** * kvm_gpc_activate_hva - prepare a cached kernel mapping and HPA for a given HVA. * * @gpc: struct gfn_to_pfn_cache object. * @hva: userspace virtual address to map. * @len: sanity check; the range being accessed must fit a single page. * * @return: 0 for success. * -EINVAL for a mapping which would cross a page boundary. * -EFAULT for an untranslatable guest physical address. * * The semantics of this function are the same as those of kvm_gpc_activate(). It * merely bypasses a layer of address translation. */ int kvm_gpc_activate_hva(struct gfn_to_pfn_cache *gpc, unsigned long hva, unsigned long len);
/** * kvm_gpc_check - check validity of a gfn_to_pfn_cache. * * @gpc: struct gfn_to_pfn_cache object. * @len: sanity check; the range being accessed must fit a single page. * * @return: %true if the cache is still valid and the address matches. * %false if the cache is not valid. * * Callers outside IN_GUEST_MODE context should hold a read lock on @gpc->lock * while calling this function, and then continue to hold the lock until the * access is complete. * * Callers in IN_GUEST_MODE may do so without locking, although they should * still hold a read lock on kvm->srcu for the memslot checks. */ bool kvm_gpc_check(struct gfn_to_pfn_cache *gpc, unsigned long len);
/** * kvm_gpc_refresh - update a previously initialized cache. * * @gpc: struct gfn_to_pfn_cache object. * @len: sanity check; the range being accessed must fit a single page. * * @return: 0 for success. * -EINVAL for a mapping which would cross a page boundary. * -EFAULT for an untranslatable guest physical address. * * This will attempt to refresh a gfn_to_pfn_cache. Note that a successful * return from this function does not mean the page can be immediately * accessed because it may have raced with an invalidation. Callers must * still lock and check the cache status, as this function does not return * with the lock still held to permit access. */ int kvm_gpc_refresh(struct gfn_to_pfn_cache *gpc, unsigned long len);
/** * kvm_gpc_deactivate - deactivate and unlink a gfn_to_pfn_cache. 
* * @gpc: struct gfn_to_pfn_cache object. * * This removes a cache from the VM's list to be processed on MMU notifier * invocation. */ void kvm_gpc_deactivate(struct gfn_to_pfn_cache *gpc); static inline bool kvm_gpc_is_gpa_active(struct gfn_to_pfn_cache *gpc) { return gpc->active && !kvm_is_error_gpa(gpc->gpa); } static inline bool kvm_gpc_is_hva_active(struct gfn_to_pfn_cache *gpc) { return gpc->active && kvm_is_error_gpa(gpc->gpa); } void kvm_sigset_activate(struct kvm_vcpu *vcpu); void kvm_sigset_deactivate(struct kvm_vcpu *vcpu); void kvm_vcpu_halt(struct kvm_vcpu *vcpu); bool kvm_vcpu_block(struct kvm_vcpu *vcpu); void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu); void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu); bool kvm_vcpu_wake_up(struct kvm_vcpu *vcpu); void kvm_vcpu_kick(struct kvm_vcpu *vcpu); int kvm_vcpu_yield_to(struct kvm_vcpu *target); void kvm_vcpu_on_spin(struct kvm_vcpu *vcpu, bool yield_to_kernel_mode); void kvm_flush_remote_tlbs(struct kvm *kvm); void kvm_flush_remote_tlbs_range(struct kvm *kvm, gfn_t gfn, u64 nr_pages); void kvm_flush_remote_tlbs_memslot(struct kvm *kvm, const struct kvm_memory_slot *memslot); #ifdef KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE int kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int min); int __kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int capacity, int min); int kvm_mmu_memory_cache_nr_free_objects(struct kvm_mmu_memory_cache *mc); void kvm_mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc); void *kvm_mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc); #endif void kvm_mmu_invalidate_begin(struct kvm *kvm); void kvm_mmu_invalidate_range_add(struct kvm *kvm, gfn_t start, gfn_t end); void kvm_mmu_invalidate_end(struct kvm *kvm); bool kvm_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range); long kvm_arch_dev_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg); long kvm_arch_vcpu_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg); vm_fault_t kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf); int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext); void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn_offset, unsigned long mask); void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot); #ifndef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log); int kvm_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log, int *is_dirty, struct kvm_memory_slot **memslot); #endif int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_level, bool line_status); int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap); int kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg); long kvm_arch_vm_compat_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg); int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu); int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu); int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, struct kvm_translation *tr); int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs); int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs); int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs); int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs); int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, 
struct kvm_mp_state *mp_state); int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, struct kvm_mp_state *mp_state); int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg); int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu); void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu); void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu); int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id); int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu); void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu); void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu); #ifdef CONFIG_HAVE_KVM_PM_NOTIFIER int kvm_arch_pm_notifier(struct kvm *kvm, unsigned long state); #endif #ifdef __KVM_HAVE_ARCH_VCPU_DEBUGFS void kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu, struct dentry *debugfs_dentry); #else static inline void kvm_create_vcpu_debugfs(struct kvm_vcpu *vcpu) {} #endif #ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING /* * kvm_arch_{enable,disable}_virtualization() are called on one CPU, under * kvm_usage_lock, immediately after/before 0=>1 and 1=>0 transitions of * kvm_usage_count, i.e. at the beginning of the generic hardware enabling * sequence, and at the end of the generic hardware disabling sequence. */ void kvm_arch_enable_virtualization(void); void kvm_arch_disable_virtualization(void); /* * kvm_arch_{enable,disable}_virtualization_cpu() are called on "every" CPU to * do the actual twiddling of hardware bits. The hooks are called on all * online CPUs when KVM enables/disabled virtualization, and on a single CPU * when that CPU is onlined/offlined (including for Resume/Suspend). */ int kvm_arch_enable_virtualization_cpu(void); void kvm_arch_disable_virtualization_cpu(void); #endif int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu); bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu); int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu); bool kvm_arch_dy_runnable(struct kvm_vcpu *vcpu); bool kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu); bool kvm_arch_vcpu_preempted_in_kernel(struct kvm_vcpu *vcpu); void kvm_arch_pre_destroy_vm(struct kvm *kvm); void kvm_arch_create_vm_debugfs(struct kvm *kvm); #ifndef __KVM_HAVE_ARCH_VM_ALLOC /* * All architectures that want to use vzalloc currently also * need their own kvm_arch_alloc_vm implementation. 
*/ static inline struct kvm *kvm_arch_alloc_vm(void) { return kzalloc(sizeof(struct kvm), GFP_KERNEL_ACCOUNT); } #endif static inline void __kvm_arch_free_vm(struct kvm *kvm) { kvfree(kvm); } #ifndef __KVM_HAVE_ARCH_VM_FREE static inline void kvm_arch_free_vm(struct kvm *kvm) { __kvm_arch_free_vm(kvm); } #endif #ifndef __KVM_HAVE_ARCH_FLUSH_REMOTE_TLBS static inline int kvm_arch_flush_remote_tlbs(struct kvm *kvm) { return -ENOTSUPP; } #else int kvm_arch_flush_remote_tlbs(struct kvm *kvm); #endif #ifndef __KVM_HAVE_ARCH_FLUSH_REMOTE_TLBS_RANGE static inline int kvm_arch_flush_remote_tlbs_range(struct kvm *kvm, gfn_t gfn, u64 nr_pages) { return -EOPNOTSUPP; } #else int kvm_arch_flush_remote_tlbs_range(struct kvm *kvm, gfn_t gfn, u64 nr_pages); #endif #ifdef __KVM_HAVE_ARCH_NONCOHERENT_DMA void kvm_arch_register_noncoherent_dma(struct kvm *kvm); void kvm_arch_unregister_noncoherent_dma(struct kvm *kvm); bool kvm_arch_has_noncoherent_dma(struct kvm *kvm); #else static inline void kvm_arch_register_noncoherent_dma(struct kvm *kvm) { } static inline void kvm_arch_unregister_noncoherent_dma(struct kvm *kvm) { } static inline bool kvm_arch_has_noncoherent_dma(struct kvm *kvm) { return false; } #endif #ifdef __KVM_HAVE_ARCH_ASSIGNED_DEVICE void kvm_arch_start_assignment(struct kvm *kvm); void kvm_arch_end_assignment(struct kvm *kvm); bool kvm_arch_has_assigned_device(struct kvm *kvm); #else static inline void kvm_arch_start_assignment(struct kvm *kvm) { } static inline void kvm_arch_end_assignment(struct kvm *kvm) { } static __always_inline bool kvm_arch_has_assigned_device(struct kvm *kvm) { return false; } #endif static inline struct rcuwait *kvm_arch_vcpu_get_wait(struct kvm_vcpu *vcpu) { #ifdef __KVM_HAVE_ARCH_WQP return vcpu->arch.waitp; #else return &vcpu->wait; #endif } /* * Wake a vCPU if necessary, but don't do any stats/metadata updates. Returns * true if the vCPU was blocking and was awakened, false otherwise. */ static inline bool __kvm_vcpu_wake_up(struct kvm_vcpu *vcpu) { return !!rcuwait_wake_up(kvm_arch_vcpu_get_wait(vcpu)); } static inline bool kvm_vcpu_is_blocking(struct kvm_vcpu *vcpu) { return rcuwait_active(kvm_arch_vcpu_get_wait(vcpu)); } #ifdef __KVM_HAVE_ARCH_INTC_INITIALIZED /* * returns true if the virtual interrupt controller is initialized and * ready to accept virtual IRQ. On some architectures the virtual interrupt * controller is dynamically instantiated and this is not always true. 
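 * (arm64 is the main user of this hook: its VGIC is instantiated at runtime via the KVM_CREATE_DEVICE or KVM_CREATE_IRQCHIP ioctls, so it may not exist yet when a vCPU first runs.)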
*/ bool kvm_arch_intc_initialized(struct kvm *kvm); #else static inline bool kvm_arch_intc_initialized(struct kvm *kvm) { return true; } #endif #ifdef CONFIG_GUEST_PERF_EVENTS unsigned long kvm_arch_vcpu_get_ip(struct kvm_vcpu *vcpu); void kvm_register_perf_callbacks(unsigned int (*pt_intr_handler)(void)); void kvm_unregister_perf_callbacks(void); #else static inline void kvm_register_perf_callbacks(void *ign) {} static inline void kvm_unregister_perf_callbacks(void) {} #endif /* CONFIG_GUEST_PERF_EVENTS */ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type); void kvm_arch_destroy_vm(struct kvm *kvm); void kvm_arch_sync_events(struct kvm *kvm); int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu); struct kvm_irq_ack_notifier { struct hlist_node link; unsigned gsi; void (*irq_acked)(struct kvm_irq_ack_notifier *kian); }; int kvm_irq_map_gsi(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *entries, int gsi); int kvm_irq_map_chip_pin(struct kvm *kvm, unsigned irqchip, unsigned pin); int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level, bool line_status); int kvm_set_msi(struct kvm_kernel_irq_routing_entry *irq_entry, struct kvm *kvm, int irq_source_id, int level, bool line_status); int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm, int irq_source_id, int level, bool line_status); bool kvm_irq_has_notifier(struct kvm *kvm, unsigned irqchip, unsigned pin); void kvm_notify_acked_gsi(struct kvm *kvm, int gsi); void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin); void kvm_register_irq_ack_notifier(struct kvm *kvm, struct kvm_irq_ack_notifier *kian); void kvm_unregister_irq_ack_notifier(struct kvm *kvm, struct kvm_irq_ack_notifier *kian); int kvm_request_irq_source_id(struct kvm *kvm); void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id); bool kvm_arch_irqfd_allowed(struct kvm *kvm, struct kvm_irqfd *args); /* * Returns a pointer to the memslot if it contains gfn. * Otherwise returns NULL. */ static inline struct kvm_memory_slot * try_get_memslot(struct kvm_memory_slot *slot, gfn_t gfn) { if (!slot) return NULL; if (gfn >= slot->base_gfn && gfn < slot->base_gfn + slot->npages) return slot; else return NULL; } /* * Returns a pointer to the memslot that contains gfn. Otherwise returns NULL. * * With "approx" set returns the memslot also when the address falls * in a hole. In that case one of the memslots bordering the hole is * returned. */ static inline struct kvm_memory_slot * search_memslots(struct kvm_memslots *slots, gfn_t gfn, bool approx) { struct kvm_memory_slot *slot; struct rb_node *node; int idx = slots->node_idx; slot = NULL; for (node = slots->gfn_tree.rb_node; node; ) { slot = container_of(node, struct kvm_memory_slot, gfn_node[idx]); if (gfn >= slot->base_gfn) { if (gfn < slot->base_gfn + slot->npages) return slot; node = node->rb_right; } else node = node->rb_left; } return approx ? slot : NULL; } static inline struct kvm_memory_slot * ____gfn_to_memslot(struct kvm_memslots *slots, gfn_t gfn, bool approx) { struct kvm_memory_slot *slot; slot = (struct kvm_memory_slot *)atomic_long_read(&slots->last_used_slot); slot = try_get_memslot(slot, gfn); if (slot) return slot; slot = search_memslots(slots, gfn, approx); if (slot) { atomic_long_set(&slots->last_used_slot, (unsigned long)slot); return slot; } return NULL; } /* * __gfn_to_memslot() and its descendants are here to allow arch code to inline * the lookups in hot paths. 
gfn_to_memslot() itself isn't here as an inline * because that would bloat other code too much. */ static inline struct kvm_memory_slot * __gfn_to_memslot(struct kvm_memslots *slots, gfn_t gfn) { return ____gfn_to_memslot(slots, gfn, false); } static inline unsigned long __gfn_to_hva_memslot(const struct kvm_memory_slot *slot, gfn_t gfn) { /* * The index was checked originally in search_memslots. To avoid * that a malicious guest builds a Spectre gadget out of e.g. page * table walks, do not let the processor speculate loads outside * the guest's registered memslots. */ unsigned long offset = gfn - slot->base_gfn; offset = array_index_nospec(offset, slot->npages); return slot->userspace_addr + offset * PAGE_SIZE; } static inline int memslot_id(struct kvm *kvm, gfn_t gfn) { return gfn_to_memslot(kvm, gfn)->id; } static inline gfn_t hva_to_gfn_memslot(unsigned long hva, struct kvm_memory_slot *slot) { gfn_t gfn_offset = (hva - slot->userspace_addr) >> PAGE_SHIFT; return slot->base_gfn + gfn_offset; } static inline gpa_t gfn_to_gpa(gfn_t gfn) { return (gpa_t)gfn << PAGE_SHIFT; } static inline gfn_t gpa_to_gfn(gpa_t gpa) { return (gfn_t)(gpa >> PAGE_SHIFT); } static inline hpa_t pfn_to_hpa(kvm_pfn_t pfn) { return (hpa_t)pfn << PAGE_SHIFT; } static inline bool kvm_is_gpa_in_memslot(struct kvm *kvm, gpa_t gpa) { unsigned long hva = gfn_to_hva(kvm, gpa_to_gfn(gpa)); return !kvm_is_error_hva(hva); } static inline void kvm_gpc_mark_dirty_in_slot(struct gfn_to_pfn_cache *gpc) { lockdep_assert_held(&gpc->lock); if (!gpc->memslot) return; mark_page_dirty_in_slot(gpc->kvm, gpc->memslot, gpa_to_gfn(gpc->gpa)); } enum kvm_stat_kind { KVM_STAT_VM, KVM_STAT_VCPU, }; struct kvm_stat_data { struct kvm *kvm; const struct _kvm_stats_desc *desc; enum kvm_stat_kind kind; }; struct _kvm_stats_desc { struct kvm_stats_desc desc; char name[KVM_STATS_NAME_SIZE]; }; #define STATS_DESC_COMMON(type, unit, base, exp, sz, bsz) \ .flags = type | unit | base | \ BUILD_BUG_ON_ZERO(type & ~KVM_STATS_TYPE_MASK) | \ BUILD_BUG_ON_ZERO(unit & ~KVM_STATS_UNIT_MASK) | \ BUILD_BUG_ON_ZERO(base & ~KVM_STATS_BASE_MASK), \ .exponent = exp, \ .size = sz, \ .bucket_size = bsz #define VM_GENERIC_STATS_DESC(stat, type, unit, base, exp, sz, bsz) \ { \ { \ STATS_DESC_COMMON(type, unit, base, exp, sz, bsz), \ .offset = offsetof(struct kvm_vm_stat, generic.stat) \ }, \ .name = #stat, \ } #define VCPU_GENERIC_STATS_DESC(stat, type, unit, base, exp, sz, bsz) \ { \ { \ STATS_DESC_COMMON(type, unit, base, exp, sz, bsz), \ .offset = offsetof(struct kvm_vcpu_stat, generic.stat) \ }, \ .name = #stat, \ } #define VM_STATS_DESC(stat, type, unit, base, exp, sz, bsz) \ { \ { \ STATS_DESC_COMMON(type, unit, base, exp, sz, bsz), \ .offset = offsetof(struct kvm_vm_stat, stat) \ }, \ .name = #stat, \ } #define VCPU_STATS_DESC(stat, type, unit, base, exp, sz, bsz) \ { \ { \ STATS_DESC_COMMON(type, unit, base, exp, sz, bsz), \ .offset = offsetof(struct kvm_vcpu_stat, stat) \ }, \ .name = #stat, \ } /* SCOPE: VM, VM_GENERIC, VCPU, VCPU_GENERIC */ #define STATS_DESC(SCOPE, stat, type, unit, base, exp, sz, bsz) \ SCOPE##_STATS_DESC(stat, type, unit, base, exp, sz, bsz) #define STATS_DESC_CUMULATIVE(SCOPE, name, unit, base, exponent) \ STATS_DESC(SCOPE, name, KVM_STATS_TYPE_CUMULATIVE, \ unit, base, exponent, 1, 0) #define STATS_DESC_INSTANT(SCOPE, name, unit, base, exponent) \ STATS_DESC(SCOPE, name, KVM_STATS_TYPE_INSTANT, \ unit, base, exponent, 1, 0) #define STATS_DESC_PEAK(SCOPE, name, unit, base, exponent) \ STATS_DESC(SCOPE, name, KVM_STATS_TYPE_PEAK, \ 
unit, base, exponent, 1, 0) #define STATS_DESC_LINEAR_HIST(SCOPE, name, unit, base, exponent, sz, bsz) \ STATS_DESC(SCOPE, name, KVM_STATS_TYPE_LINEAR_HIST, \ unit, base, exponent, sz, bsz) #define STATS_DESC_LOG_HIST(SCOPE, name, unit, base, exponent, sz) \ STATS_DESC(SCOPE, name, KVM_STATS_TYPE_LOG_HIST, \ unit, base, exponent, sz, 0) /* Cumulative counter, read/write */ #define STATS_DESC_COUNTER(SCOPE, name) \ STATS_DESC_CUMULATIVE(SCOPE, name, KVM_STATS_UNIT_NONE, \ KVM_STATS_BASE_POW10, 0) /* Instantaneous counter, read only */ #define STATS_DESC_ICOUNTER(SCOPE, name) \ STATS_DESC_INSTANT(SCOPE, name, KVM_STATS_UNIT_NONE, \ KVM_STATS_BASE_POW10, 0) /* Peak counter, read/write */ #define STATS_DESC_PCOUNTER(SCOPE, name) \ STATS_DESC_PEAK(SCOPE, name, KVM_STATS_UNIT_NONE, \ KVM_STATS_BASE_POW10, 0) /* Instantaneous boolean value, read only */ #define STATS_DESC_IBOOLEAN(SCOPE, name) \ STATS_DESC_INSTANT(SCOPE, name, KVM_STATS_UNIT_BOOLEAN, \ KVM_STATS_BASE_POW10, 0) /* Peak (sticky) boolean value, read/write */ #define STATS_DESC_PBOOLEAN(SCOPE, name) \ STATS_DESC_PEAK(SCOPE, name, KVM_STATS_UNIT_BOOLEAN, \ KVM_STATS_BASE_POW10, 0) /* Cumulative time in nanosecond */ #define STATS_DESC_TIME_NSEC(SCOPE, name) \ STATS_DESC_CUMULATIVE(SCOPE, name, KVM_STATS_UNIT_SECONDS, \ KVM_STATS_BASE_POW10, -9) /* Linear histogram for time in nanosecond */ #define STATS_DESC_LINHIST_TIME_NSEC(SCOPE, name, sz, bsz) \ STATS_DESC_LINEAR_HIST(SCOPE, name, KVM_STATS_UNIT_SECONDS, \ KVM_STATS_BASE_POW10, -9, sz, bsz) /* Logarithmic histogram for time in nanosecond */ #define STATS_DESC_LOGHIST_TIME_NSEC(SCOPE, name, sz) \ STATS_DESC_LOG_HIST(SCOPE, name, KVM_STATS_UNIT_SECONDS, \ KVM_STATS_BASE_POW10, -9, sz) #define KVM_GENERIC_VM_STATS() \ STATS_DESC_COUNTER(VM_GENERIC, remote_tlb_flush), \ STATS_DESC_COUNTER(VM_GENERIC, remote_tlb_flush_requests) #define KVM_GENERIC_VCPU_STATS() \ STATS_DESC_COUNTER(VCPU_GENERIC, halt_successful_poll), \ STATS_DESC_COUNTER(VCPU_GENERIC, halt_attempted_poll), \ STATS_DESC_COUNTER(VCPU_GENERIC, halt_poll_invalid), \ STATS_DESC_COUNTER(VCPU_GENERIC, halt_wakeup), \ STATS_DESC_TIME_NSEC(VCPU_GENERIC, halt_poll_success_ns), \ STATS_DESC_TIME_NSEC(VCPU_GENERIC, halt_poll_fail_ns), \ STATS_DESC_TIME_NSEC(VCPU_GENERIC, halt_wait_ns), \ STATS_DESC_LOGHIST_TIME_NSEC(VCPU_GENERIC, halt_poll_success_hist, \ HALT_POLL_HIST_COUNT), \ STATS_DESC_LOGHIST_TIME_NSEC(VCPU_GENERIC, halt_poll_fail_hist, \ HALT_POLL_HIST_COUNT), \ STATS_DESC_LOGHIST_TIME_NSEC(VCPU_GENERIC, halt_wait_hist, \ HALT_POLL_HIST_COUNT), \ STATS_DESC_IBOOLEAN(VCPU_GENERIC, blocking) ssize_t kvm_stats_read(char *id, const struct kvm_stats_header *header, const struct _kvm_stats_desc *desc, void *stats, size_t size_stats, char __user *user_buffer, size_t size, loff_t *offset); /** * kvm_stats_linear_hist_update() - Update bucket value for linear histogram * statistics data. * * @data: start address of the stats data * @size: the number of bucket of the stats data * @value: the new value used to update the linear histogram's bucket * @bucket_size: the size (width) of a bucket */ static inline void kvm_stats_linear_hist_update(u64 *data, size_t size, u64 value, size_t bucket_size) { size_t index = div64_u64(value, bucket_size); index = min(index, size - 1); ++data[index]; } /** * kvm_stats_log_hist_update() - Update bucket value for logarithmic histogram * statistics data. 
* * @data: start address of the stats data * @size: the number of bucket of the stats data * @value: the new value used to update the logarithmic histogram's bucket */ static inline void kvm_stats_log_hist_update(u64 *data, size_t size, u64 value) { size_t index = fls64(value); index = min(index, size - 1); ++data[index]; } #define KVM_STATS_LINEAR_HIST_UPDATE(array, value, bsize) \ kvm_stats_linear_hist_update(array, ARRAY_SIZE(array), value, bsize) #define KVM_STATS_LOG_HIST_UPDATE(array, value) \ kvm_stats_log_hist_update(array, ARRAY_SIZE(array), value) extern const struct kvm_stats_header kvm_vm_stats_header; extern const struct _kvm_stats_desc kvm_vm_stats_desc[]; extern const struct kvm_stats_header kvm_vcpu_stats_header; extern const struct _kvm_stats_desc kvm_vcpu_stats_desc[]; #ifdef CONFIG_KVM_GENERIC_MMU_NOTIFIER static inline int mmu_invalidate_retry(struct kvm *kvm, unsigned long mmu_seq) { if (unlikely(kvm->mmu_invalidate_in_progress)) return 1; /* * Ensure the read of mmu_invalidate_in_progress happens before * the read of mmu_invalidate_seq. This interacts with the * smp_wmb() in mmu_notifier_invalidate_range_end to make sure * that the caller either sees the old (non-zero) value of * mmu_invalidate_in_progress or the new (incremented) value of * mmu_invalidate_seq. * * PowerPC Book3s HV KVM calls this under a per-page lock rather * than under kvm->mmu_lock, for scalability, so can't rely on * kvm->mmu_lock to keep things ordered. */ smp_rmb(); if (kvm->mmu_invalidate_seq != mmu_seq) return 1; return 0; } static inline int mmu_invalidate_retry_gfn(struct kvm *kvm, unsigned long mmu_seq, gfn_t gfn) { lockdep_assert_held(&kvm->mmu_lock); /* * If mmu_invalidate_in_progress is non-zero, then the range maintained * by kvm_mmu_notifier_invalidate_range_start contains all addresses * that might be being invalidated. Note that it may include some false * positives, due to shortcuts when handing concurrent invalidations. */ if (unlikely(kvm->mmu_invalidate_in_progress)) { /* * Dropping mmu_lock after bumping mmu_invalidate_in_progress * but before updating the range is a KVM bug. */ if (WARN_ON_ONCE(kvm->mmu_invalidate_range_start == INVALID_GPA || kvm->mmu_invalidate_range_end == INVALID_GPA)) return 1; if (gfn >= kvm->mmu_invalidate_range_start && gfn < kvm->mmu_invalidate_range_end) return 1; } if (kvm->mmu_invalidate_seq != mmu_seq) return 1; return 0; } /* * This lockless version of the range-based retry check *must* be paired with a * call to the locked version after acquiring mmu_lock, i.e. this is safe to * use only as a pre-check to avoid contending mmu_lock. This version *will* * get false negatives and false positives. */ static inline bool mmu_invalidate_retry_gfn_unsafe(struct kvm *kvm, unsigned long mmu_seq, gfn_t gfn) { /* * Use READ_ONCE() to ensure the in-progress flag and sequence counter * are always read from memory, e.g. so that checking for retry in a * loop won't result in an infinite retry loop. Don't force loads for * start+end, as the key to avoiding infinite retry loops is observing * the 1=>0 transition of in-progress, i.e. getting false negatives * due to stale start+end values is acceptable. 
*/ if (unlikely(READ_ONCE(kvm->mmu_invalidate_in_progress)) && gfn >= kvm->mmu_invalidate_range_start && gfn < kvm->mmu_invalidate_range_end) return true; return READ_ONCE(kvm->mmu_invalidate_seq) != mmu_seq; } #endif #ifdef CONFIG_HAVE_KVM_IRQ_ROUTING #define KVM_MAX_IRQ_ROUTES 4096 /* might need extension/rework in the future */ bool kvm_arch_can_set_irq_routing(struct kvm *kvm); int kvm_set_irq_routing(struct kvm *kvm, const struct kvm_irq_routing_entry *entries, unsigned nr, unsigned flags); int kvm_init_irq_routing(struct kvm *kvm); int kvm_set_routing_entry(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e, const struct kvm_irq_routing_entry *ue); void kvm_free_irq_routing(struct kvm *kvm); #else static inline void kvm_free_irq_routing(struct kvm *kvm) {} static inline int kvm_init_irq_routing(struct kvm *kvm) { return 0; } #endif int kvm_send_userspace_msi(struct kvm *kvm, struct kvm_msi *msi); void kvm_eventfd_init(struct kvm *kvm); int kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args); #ifdef CONFIG_HAVE_KVM_IRQCHIP int kvm_irqfd(struct kvm *kvm, struct kvm_irqfd *args); void kvm_irqfd_release(struct kvm *kvm); bool kvm_notify_irqfd_resampler(struct kvm *kvm, unsigned int irqchip, unsigned int pin); void kvm_irq_routing_update(struct kvm *); #else static inline int kvm_irqfd(struct kvm *kvm, struct kvm_irqfd *args) { return -EINVAL; } static inline void kvm_irqfd_release(struct kvm *kvm) {} static inline bool kvm_notify_irqfd_resampler(struct kvm *kvm, unsigned int irqchip, unsigned int pin) { return false; } #endif /* CONFIG_HAVE_KVM_IRQCHIP */ void kvm_arch_irq_routing_update(struct kvm *kvm); static inline void __kvm_make_request(int req, struct kvm_vcpu *vcpu) { /* * Ensure the rest of the request is published to kvm_check_request's * caller. Paired with the smp_mb__after_atomic in kvm_check_request. */ smp_wmb(); set_bit(req & KVM_REQUEST_MASK, (void *)&vcpu->requests); } static __always_inline void kvm_make_request(int req, struct kvm_vcpu *vcpu) { /* * Request that don't require vCPU action should never be logged in * vcpu->requests. The vCPU won't clear the request, so it will stay * logged indefinitely and prevent the vCPU from entering the guest. */ BUILD_BUG_ON(!__builtin_constant_p(req) || (req & KVM_REQUEST_NO_ACTION)); __kvm_make_request(req, vcpu); } static inline bool kvm_request_pending(struct kvm_vcpu *vcpu) { return READ_ONCE(vcpu->requests); } static inline bool kvm_test_request(int req, struct kvm_vcpu *vcpu) { return test_bit(req & KVM_REQUEST_MASK, (void *)&vcpu->requests); } static inline void kvm_clear_request(int req, struct kvm_vcpu *vcpu) { clear_bit(req & KVM_REQUEST_MASK, (void *)&vcpu->requests); } static inline bool kvm_check_request(int req, struct kvm_vcpu *vcpu) { if (kvm_test_request(req, vcpu)) { kvm_clear_request(req, vcpu); /* * Ensure the rest of the request is visible to kvm_check_request's * caller. Paired with the smp_wmb in kvm_make_request. 
*/ smp_mb__after_atomic(); return true; } else { return false; } } #ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING extern bool kvm_rebooting; #endif extern unsigned int halt_poll_ns; extern unsigned int halt_poll_ns_grow; extern unsigned int halt_poll_ns_grow_start; extern unsigned int halt_poll_ns_shrink; struct kvm_device { const struct kvm_device_ops *ops; struct kvm *kvm; void *private; struct list_head vm_node; }; /* create, destroy, and name are mandatory */ struct kvm_device_ops { const char *name; /* * create is called holding kvm->lock and any operations not suitable * to do while holding the lock should be deferred to init (see * below). */ int (*create)(struct kvm_device *dev, u32 type); /* * init is called after create if create is successful and is called * outside of holding kvm->lock. */ void (*init)(struct kvm_device *dev); /* * Destroy is responsible for freeing dev. * * Destroy may be called before or after destructors are called * on emulated I/O regions, depending on whether a reference is * held by a vcpu or other kvm component that gets destroyed * after the emulated I/O. */ void (*destroy)(struct kvm_device *dev); /* * Release is an alternative method to free the device. It is * called when the device file descriptor is closed. Once * release is called, the destroy method will not be called * anymore as the device is removed from the device list of * the VM. kvm->lock is held. */ void (*release)(struct kvm_device *dev); int (*set_attr)(struct kvm_device *dev, struct kvm_device_attr *attr); int (*get_attr)(struct kvm_device *dev, struct kvm_device_attr *attr); int (*has_attr)(struct kvm_device *dev, struct kvm_device_attr *attr); long (*ioctl)(struct kvm_device *dev, unsigned int ioctl, unsigned long arg); int (*mmap)(struct kvm_device *dev, struct vm_area_struct *vma); }; struct kvm_device *kvm_device_from_filp(struct file *filp); int kvm_register_device_ops(const struct kvm_device_ops *ops, u32 type); void kvm_unregister_device_ops(u32 type); extern struct kvm_device_ops kvm_mpic_ops; extern struct kvm_device_ops kvm_arm_vgic_v2_ops; extern struct kvm_device_ops kvm_arm_vgic_v3_ops; #ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT static inline void kvm_vcpu_set_in_spin_loop(struct kvm_vcpu *vcpu, bool val) { vcpu->spin_loop.in_spin_loop = val; } static inline void kvm_vcpu_set_dy_eligible(struct kvm_vcpu *vcpu, bool val) { vcpu->spin_loop.dy_eligible = val; } #else /* !CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT */ static inline void kvm_vcpu_set_in_spin_loop(struct kvm_vcpu *vcpu, bool val) { } static inline void kvm_vcpu_set_dy_eligible(struct kvm_vcpu *vcpu, bool val) { } #endif /* CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT */ static inline bool kvm_is_visible_memslot(struct kvm_memory_slot *memslot) { return (memslot && memslot->id < KVM_USER_MEM_SLOTS && !(memslot->flags & KVM_MEMSLOT_INVALID)); } struct kvm_vcpu *kvm_get_running_vcpu(void); struct kvm_vcpu * __percpu *kvm_get_running_vcpus(void); #ifdef CONFIG_HAVE_KVM_IRQ_BYPASS bool kvm_arch_has_irq_bypass(void); int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *, struct irq_bypass_producer *); void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *, struct irq_bypass_producer *); void kvm_arch_irq_bypass_stop(struct irq_bypass_consumer *); void kvm_arch_irq_bypass_start(struct irq_bypass_consumer *); int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq, bool set); bool kvm_arch_irqfd_route_changed(struct kvm_kernel_irq_routing_entry *, struct 
kvm_kernel_irq_routing_entry *); #endif /* CONFIG_HAVE_KVM_IRQ_BYPASS */ #ifdef CONFIG_HAVE_KVM_INVALID_WAKEUPS /* If we wakeup during the poll time, was it a sucessful poll? */ static inline bool vcpu_valid_wakeup(struct kvm_vcpu *vcpu) { return vcpu->valid_wakeup; } #else static inline bool vcpu_valid_wakeup(struct kvm_vcpu *vcpu) { return true; } #endif /* CONFIG_HAVE_KVM_INVALID_WAKEUPS */ #ifdef CONFIG_HAVE_KVM_NO_POLL /* Callback that tells if we must not poll */ bool kvm_arch_no_poll(struct kvm_vcpu *vcpu); #else static inline bool kvm_arch_no_poll(struct kvm_vcpu *vcpu) { return false; } #endif /* CONFIG_HAVE_KVM_NO_POLL */ #ifdef CONFIG_HAVE_KVM_VCPU_ASYNC_IOCTL long kvm_arch_vcpu_async_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg); #else static inline long kvm_arch_vcpu_async_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { return -ENOIOCTLCMD; } #endif /* CONFIG_HAVE_KVM_VCPU_ASYNC_IOCTL */ void kvm_arch_guest_memory_reclaimed(struct kvm *kvm); #ifdef CONFIG_HAVE_KVM_VCPU_RUN_PID_CHANGE int kvm_arch_vcpu_run_pid_change(struct kvm_vcpu *vcpu); #else static inline int kvm_arch_vcpu_run_pid_change(struct kvm_vcpu *vcpu) { return 0; } #endif /* CONFIG_HAVE_KVM_VCPU_RUN_PID_CHANGE */ #ifdef CONFIG_KVM_XFER_TO_GUEST_WORK static inline void kvm_handle_signal_exit(struct kvm_vcpu *vcpu) { vcpu->run->exit_reason = KVM_EXIT_INTR; vcpu->stat.signal_exits++; } #endif /* CONFIG_KVM_XFER_TO_GUEST_WORK */ /* * If more than one page is being (un)accounted, @virt must be the address of * the first page of a block of pages what were allocated together (i.e * accounted together). * * kvm_account_pgtable_pages() is thread-safe because mod_lruvec_page_state() * is thread-safe. */ static inline void kvm_account_pgtable_pages(void *virt, int nr) { mod_lruvec_page_state(virt_to_page(virt), NR_SECONDARY_PAGETABLE, nr); } /* * This defines how many reserved entries we want to keep before we * kick the vcpu to the userspace to avoid dirty ring full. This * value can be tuned to higher if e.g. PML is enabled on the host. */ #define KVM_DIRTY_RING_RSVD_ENTRIES 64 /* Max number of entries allowed for each kvm dirty ring */ #define KVM_DIRTY_RING_MAX_ENTRIES 65536 static inline void kvm_prepare_memory_fault_exit(struct kvm_vcpu *vcpu, gpa_t gpa, gpa_t size, bool is_write, bool is_exec, bool is_private) { vcpu->run->exit_reason = KVM_EXIT_MEMORY_FAULT; vcpu->run->memory_fault.gpa = gpa; vcpu->run->memory_fault.size = size; /* RWX flags are not (yet) defined or communicated to userspace. 
*/ vcpu->run->memory_fault.flags = 0; if (is_private) vcpu->run->memory_fault.flags |= KVM_MEMORY_EXIT_FLAG_PRIVATE; } #ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES static inline unsigned long kvm_get_memory_attributes(struct kvm *kvm, gfn_t gfn) { return xa_to_value(xa_load(&kvm->mem_attr_array, gfn)); } bool kvm_range_has_memory_attributes(struct kvm *kvm, gfn_t start, gfn_t end, unsigned long mask, unsigned long attrs); bool kvm_arch_pre_set_memory_attributes(struct kvm *kvm, struct kvm_gfn_range *range); bool kvm_arch_post_set_memory_attributes(struct kvm *kvm, struct kvm_gfn_range *range); static inline bool kvm_mem_is_private(struct kvm *kvm, gfn_t gfn) { return IS_ENABLED(CONFIG_KVM_PRIVATE_MEM) && kvm_get_memory_attributes(kvm, gfn) & KVM_MEMORY_ATTRIBUTE_PRIVATE; } #else static inline bool kvm_mem_is_private(struct kvm *kvm, gfn_t gfn) { return false; } #endif /* CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES */ #ifdef CONFIG_KVM_PRIVATE_MEM int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn, kvm_pfn_t *pfn, struct page **page, int *max_order); #else static inline int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn, kvm_pfn_t *pfn, struct page **page, int *max_order) { KVM_BUG_ON(1, kvm); return -EIO; } #endif /* CONFIG_KVM_PRIVATE_MEM */ #ifdef CONFIG_HAVE_KVM_ARCH_GMEM_PREPARE int kvm_arch_gmem_prepare(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, int max_order); #endif #ifdef CONFIG_KVM_GENERIC_PRIVATE_MEM /** * kvm_gmem_populate() - Populate/prepare a GPA range with guest data * * @kvm: KVM instance * @gfn: starting GFN to be populated * @src: userspace-provided buffer containing data to copy into GFN range * (passed to @post_populate, and incremented on each iteration * if not NULL) * @npages: number of pages to copy from userspace-buffer * @post_populate: callback to issue for each gmem page that backs the GPA * range * @opaque: opaque data to pass to @post_populate callback * * This is primarily intended for cases where a gmem-backed GPA range needs * to be initialized with userspace-provided data prior to being mapped into * the guest as a private page. This should be called with the slots->lock * held so that caller-enforced invariants regarding the expected memory * attributes of the GPA range do not race with KVM_SET_MEMORY_ATTRIBUTES. * * Returns the number of pages that were populated. */ typedef int (*kvm_gmem_populate_cb)(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, void __user *src, int order, void *opaque); long kvm_gmem_populate(struct kvm *kvm, gfn_t gfn, void __user *src, long npages, kvm_gmem_populate_cb post_populate, void *opaque); #endif #ifdef CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE void kvm_arch_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end); #endif #ifdef CONFIG_KVM_GENERIC_PRE_FAULT_MEMORY long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu, struct kvm_pre_fault_memory *range); #endif #endif |
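/*
 * Illustrative usage sketch (not part of the header above): a hypothetical
 * helper showing the locking protocol that the kvm_gpc_check() and
 * kvm_gpc_refresh() comments describe for the gfn_to_pfn_cache API.  The
 * function name is made up; real users perform init/activate once at setup
 * and deactivate at teardown rather than per access.  It assumes
 * <linux/kvm_host.h> plus the "lock" (rwlock_t) and "khva" (kernel mapping)
 * fields of struct gfn_to_pfn_cache from <linux/kvm_types.h>.
 */
static int example_gpc_write_u32(struct kvm *kvm, struct gfn_to_pfn_cache *gpc,
				 gpa_t gpa, u32 val)
{
	int ret;

	/* The cache must be zero-allocated before kvm_gpc_init(). */
	kvm_gpc_init(gpc, kvm);

	ret = kvm_gpc_activate(gpc, gpa, sizeof(val));
	if (ret)
		return ret;

	read_lock(&gpc->lock);
	while (!kvm_gpc_check(gpc, sizeof(val))) {
		read_unlock(&gpc->lock);

		/* Refresh may sleep and does not return with the lock held. */
		ret = kvm_gpc_refresh(gpc, sizeof(val));
		if (ret)
			goto out;

		/* A successful refresh can still race; re-check under the lock. */
		read_lock(&gpc->lock);
	}

	/* The mapping is guaranteed stable only while the read lock is held. */
	*(u32 *)gpc->khva = val;
	kvm_gpc_mark_dirty_in_slot(gpc);
	read_unlock(&gpc->lock);
	ret = 0;
out:
	kvm_gpc_deactivate(gpc);
	return ret;
}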
/* SPDX-License-Identifier: GPL-2.0 */ /* * security/tomoyo/common.h * * Header file for TOMOYO. * * Copyright (C) 2005-2011 NTT DATA CORPORATION */ #ifndef _SECURITY_TOMOYO_COMMON_H #define _SECURITY_TOMOYO_COMMON_H #define pr_fmt(fmt) fmt #include <linux/ctype.h> #include <linux/string.h> #include <linux/mm.h> #include <linux/file.h> #include <linux/kmod.h> #include <linux/fs.h> #include <linux/sched.h> #include <linux/namei.h> #include <linux/mount.h> #include <linux/list.h> #include <linux/cred.h> #include <linux/poll.h> #include <linux/binfmts.h> #include <linux/highmem.h> #include <linux/net.h> #include <linux/inet.h> #include <linux/in.h> #include <linux/in6.h> #include <linux/un.h> #include <linux/lsm_hooks.h> #include <net/sock.h> #include <net/af_unix.h> #include <net/ip.h> #include <net/ipv6.h> #include <net/udp.h> /********** Constants definitions. **********/ /* * TOMOYO uses this hash only when appending a string into the string * table. Frequency of appending strings is very low. So we don't need * large (e.g. 64k) hash size. 256 will be sufficient. */ #define TOMOYO_HASH_BITS 8 #define TOMOYO_MAX_HASH (1u<<TOMOYO_HASH_BITS) /* * TOMOYO checks only SOCK_STREAM, SOCK_DGRAM, SOCK_RAW, SOCK_SEQPACKET. * Therefore, we don't need SOCK_MAX. */ #define TOMOYO_SOCK_MAX 6 #define TOMOYO_EXEC_TMPSIZE 4096 /* Garbage collector is trying to kfree() this element. */ #define TOMOYO_GC_IN_PROGRESS -1 /* Profile number is an integer between 0 and 255. */ #define TOMOYO_MAX_PROFILES 256 /* Group number is an integer between 0 and 255. */ #define TOMOYO_MAX_ACL_GROUPS 256 /* Index numbers for "struct tomoyo_condition". 
*/ enum tomoyo_conditions_index { TOMOYO_TASK_UID, /* current_uid() */ TOMOYO_TASK_EUID, /* current_euid() */ TOMOYO_TASK_SUID, /* current_suid() */ TOMOYO_TASK_FSUID, /* current_fsuid() */ TOMOYO_TASK_GID, /* current_gid() */ TOMOYO_TASK_EGID, /* current_egid() */ TOMOYO_TASK_SGID, /* current_sgid() */ TOMOYO_TASK_FSGID, /* current_fsgid() */ TOMOYO_TASK_PID, /* sys_getpid() */ TOMOYO_TASK_PPID, /* sys_getppid() */ TOMOYO_EXEC_ARGC, /* "struct linux_binprm *"->argc */ TOMOYO_EXEC_ENVC, /* "struct linux_binprm *"->envc */ TOMOYO_TYPE_IS_SOCKET, /* S_IFSOCK */ TOMOYO_TYPE_IS_SYMLINK, /* S_IFLNK */ TOMOYO_TYPE_IS_FILE, /* S_IFREG */ TOMOYO_TYPE_IS_BLOCK_DEV, /* S_IFBLK */ TOMOYO_TYPE_IS_DIRECTORY, /* S_IFDIR */ TOMOYO_TYPE_IS_CHAR_DEV, /* S_IFCHR */ TOMOYO_TYPE_IS_FIFO, /* S_IFIFO */ TOMOYO_MODE_SETUID, /* S_ISUID */ TOMOYO_MODE_SETGID, /* S_ISGID */ TOMOYO_MODE_STICKY, /* S_ISVTX */ TOMOYO_MODE_OWNER_READ, /* S_IRUSR */ TOMOYO_MODE_OWNER_WRITE, /* S_IWUSR */ TOMOYO_MODE_OWNER_EXECUTE, /* S_IXUSR */ TOMOYO_MODE_GROUP_READ, /* S_IRGRP */ TOMOYO_MODE_GROUP_WRITE, /* S_IWGRP */ TOMOYO_MODE_GROUP_EXECUTE, /* S_IXGRP */ TOMOYO_MODE_OTHERS_READ, /* S_IROTH */ TOMOYO_MODE_OTHERS_WRITE, /* S_IWOTH */ TOMOYO_MODE_OTHERS_EXECUTE, /* S_IXOTH */ TOMOYO_EXEC_REALPATH, TOMOYO_SYMLINK_TARGET, TOMOYO_PATH1_UID, TOMOYO_PATH1_GID, TOMOYO_PATH1_INO, TOMOYO_PATH1_MAJOR, TOMOYO_PATH1_MINOR, TOMOYO_PATH1_PERM, TOMOYO_PATH1_TYPE, TOMOYO_PATH1_DEV_MAJOR, TOMOYO_PATH1_DEV_MINOR, TOMOYO_PATH2_UID, TOMOYO_PATH2_GID, TOMOYO_PATH2_INO, TOMOYO_PATH2_MAJOR, TOMOYO_PATH2_MINOR, TOMOYO_PATH2_PERM, TOMOYO_PATH2_TYPE, TOMOYO_PATH2_DEV_MAJOR, TOMOYO_PATH2_DEV_MINOR, TOMOYO_PATH1_PARENT_UID, TOMOYO_PATH1_PARENT_GID, TOMOYO_PATH1_PARENT_INO, TOMOYO_PATH1_PARENT_PERM, TOMOYO_PATH2_PARENT_UID, TOMOYO_PATH2_PARENT_GID, TOMOYO_PATH2_PARENT_INO, TOMOYO_PATH2_PARENT_PERM, TOMOYO_MAX_CONDITION_KEYWORD, TOMOYO_NUMBER_UNION, TOMOYO_NAME_UNION, TOMOYO_ARGV_ENTRY, TOMOYO_ENVP_ENTRY, }; /* Index numbers for stat(). */ enum tomoyo_path_stat_index { /* Do not change this order. */ TOMOYO_PATH1, TOMOYO_PATH1_PARENT, TOMOYO_PATH2, TOMOYO_PATH2_PARENT, TOMOYO_MAX_PATH_STAT }; /* Index numbers for operation mode. */ enum tomoyo_mode_index { TOMOYO_CONFIG_DISABLED, TOMOYO_CONFIG_LEARNING, TOMOYO_CONFIG_PERMISSIVE, TOMOYO_CONFIG_ENFORCING, TOMOYO_CONFIG_MAX_MODE, TOMOYO_CONFIG_WANT_REJECT_LOG = 64, TOMOYO_CONFIG_WANT_GRANT_LOG = 128, TOMOYO_CONFIG_USE_DEFAULT = 255, }; /* Index numbers for entry type. */ enum tomoyo_policy_id { TOMOYO_ID_GROUP, TOMOYO_ID_ADDRESS_GROUP, TOMOYO_ID_PATH_GROUP, TOMOYO_ID_NUMBER_GROUP, TOMOYO_ID_TRANSITION_CONTROL, TOMOYO_ID_AGGREGATOR, TOMOYO_ID_MANAGER, TOMOYO_ID_CONDITION, TOMOYO_ID_NAME, TOMOYO_ID_ACL, TOMOYO_ID_DOMAIN, TOMOYO_MAX_POLICY }; /* Index numbers for domain's attributes. */ enum tomoyo_domain_info_flags_index { /* Quota warnning flag. */ TOMOYO_DIF_QUOTA_WARNED, /* * This domain was unable to create a new domain at * tomoyo_find_next_domain() because the name of the domain to be * created was too long or it could not allocate memory. * More than one process continued execve() without domain transition. */ TOMOYO_DIF_TRANSITION_FAILED, TOMOYO_MAX_DOMAIN_INFO_FLAGS }; /* Index numbers for audit type. */ enum tomoyo_grant_log { /* Follow profile's configuration. */ TOMOYO_GRANTLOG_AUTO, /* Do not generate grant log. */ TOMOYO_GRANTLOG_NO, /* Generate grant_log. */ TOMOYO_GRANTLOG_YES, }; /* Index numbers for group entries. 
*/ enum tomoyo_group_id { TOMOYO_PATH_GROUP, TOMOYO_NUMBER_GROUP, TOMOYO_ADDRESS_GROUP, TOMOYO_MAX_GROUP }; /* Index numbers for type of numeric values. */ enum tomoyo_value_type { TOMOYO_VALUE_TYPE_INVALID, TOMOYO_VALUE_TYPE_DECIMAL, TOMOYO_VALUE_TYPE_OCTAL, TOMOYO_VALUE_TYPE_HEXADECIMAL, }; /* Index numbers for domain transition control keywords. */ enum tomoyo_transition_type { /* Do not change this order, */ TOMOYO_TRANSITION_CONTROL_NO_RESET, TOMOYO_TRANSITION_CONTROL_RESET, TOMOYO_TRANSITION_CONTROL_NO_INITIALIZE, TOMOYO_TRANSITION_CONTROL_INITIALIZE, TOMOYO_TRANSITION_CONTROL_NO_KEEP, TOMOYO_TRANSITION_CONTROL_KEEP, TOMOYO_MAX_TRANSITION_TYPE }; /* Index numbers for Access Controls. */ enum tomoyo_acl_entry_type_index { TOMOYO_TYPE_PATH_ACL, TOMOYO_TYPE_PATH2_ACL, TOMOYO_TYPE_PATH_NUMBER_ACL, TOMOYO_TYPE_MKDEV_ACL, TOMOYO_TYPE_MOUNT_ACL, TOMOYO_TYPE_INET_ACL, TOMOYO_TYPE_UNIX_ACL, TOMOYO_TYPE_ENV_ACL, TOMOYO_TYPE_MANUAL_TASK_ACL, }; /* Index numbers for access controls with one pathname. */ enum tomoyo_path_acl_index { TOMOYO_TYPE_EXECUTE, TOMOYO_TYPE_READ, TOMOYO_TYPE_WRITE, TOMOYO_TYPE_APPEND, TOMOYO_TYPE_UNLINK, TOMOYO_TYPE_GETATTR, TOMOYO_TYPE_RMDIR, TOMOYO_TYPE_TRUNCATE, TOMOYO_TYPE_SYMLINK, TOMOYO_TYPE_CHROOT, TOMOYO_TYPE_UMOUNT, TOMOYO_MAX_PATH_OPERATION }; /* Index numbers for /sys/kernel/security/tomoyo/stat interface. */ enum tomoyo_memory_stat_type { TOMOYO_MEMORY_POLICY, TOMOYO_MEMORY_AUDIT, TOMOYO_MEMORY_QUERY, TOMOYO_MAX_MEMORY_STAT }; enum tomoyo_mkdev_acl_index { TOMOYO_TYPE_MKBLOCK, TOMOYO_TYPE_MKCHAR, TOMOYO_MAX_MKDEV_OPERATION }; /* Index numbers for socket operations. */ enum tomoyo_network_acl_index { TOMOYO_NETWORK_BIND, /* bind() operation. */ TOMOYO_NETWORK_LISTEN, /* listen() operation. */ TOMOYO_NETWORK_CONNECT, /* connect() operation. */ TOMOYO_NETWORK_SEND, /* send() operation. */ TOMOYO_MAX_NETWORK_OPERATION }; /* Index numbers for access controls with two pathnames. */ enum tomoyo_path2_acl_index { TOMOYO_TYPE_LINK, TOMOYO_TYPE_RENAME, TOMOYO_TYPE_PIVOT_ROOT, TOMOYO_MAX_PATH2_OPERATION }; /* Index numbers for access controls with one pathname and one number. */ enum tomoyo_path_number_acl_index { TOMOYO_TYPE_CREATE, TOMOYO_TYPE_MKDIR, TOMOYO_TYPE_MKFIFO, TOMOYO_TYPE_MKSOCK, TOMOYO_TYPE_IOCTL, TOMOYO_TYPE_CHMOD, TOMOYO_TYPE_CHOWN, TOMOYO_TYPE_CHGRP, TOMOYO_MAX_PATH_NUMBER_OPERATION }; /* Index numbers for /sys/kernel/security/tomoyo/ interfaces. */ enum tomoyo_securityfs_interface_index { TOMOYO_DOMAINPOLICY, TOMOYO_EXCEPTIONPOLICY, TOMOYO_PROCESS_STATUS, TOMOYO_STAT, TOMOYO_AUDIT, TOMOYO_VERSION, TOMOYO_PROFILE, TOMOYO_QUERY, TOMOYO_MANAGER }; /* Index numbers for special mount operations. */ enum tomoyo_special_mount { TOMOYO_MOUNT_BIND, /* mount --bind /source /dest */ TOMOYO_MOUNT_MOVE, /* mount --move /old /new */ TOMOYO_MOUNT_REMOUNT, /* mount -o remount /dir */ TOMOYO_MOUNT_MAKE_UNBINDABLE, /* mount --make-unbindable /dir */ TOMOYO_MOUNT_MAKE_PRIVATE, /* mount --make-private /dir */ TOMOYO_MOUNT_MAKE_SLAVE, /* mount --make-slave /dir */ TOMOYO_MOUNT_MAKE_SHARED, /* mount --make-shared /dir */ TOMOYO_MAX_SPECIAL_MOUNT }; /* Index numbers for functionality. 
*/ enum tomoyo_mac_index { TOMOYO_MAC_FILE_EXECUTE, TOMOYO_MAC_FILE_OPEN, TOMOYO_MAC_FILE_CREATE, TOMOYO_MAC_FILE_UNLINK, TOMOYO_MAC_FILE_GETATTR, TOMOYO_MAC_FILE_MKDIR, TOMOYO_MAC_FILE_RMDIR, TOMOYO_MAC_FILE_MKFIFO, TOMOYO_MAC_FILE_MKSOCK, TOMOYO_MAC_FILE_TRUNCATE, TOMOYO_MAC_FILE_SYMLINK, TOMOYO_MAC_FILE_MKBLOCK, TOMOYO_MAC_FILE_MKCHAR, TOMOYO_MAC_FILE_LINK, TOMOYO_MAC_FILE_RENAME, TOMOYO_MAC_FILE_CHMOD, TOMOYO_MAC_FILE_CHOWN, TOMOYO_MAC_FILE_CHGRP, TOMOYO_MAC_FILE_IOCTL, TOMOYO_MAC_FILE_CHROOT, TOMOYO_MAC_FILE_MOUNT, TOMOYO_MAC_FILE_UMOUNT, TOMOYO_MAC_FILE_PIVOT_ROOT, TOMOYO_MAC_NETWORK_INET_STREAM_BIND, TOMOYO_MAC_NETWORK_INET_STREAM_LISTEN, TOMOYO_MAC_NETWORK_INET_STREAM_CONNECT, TOMOYO_MAC_NETWORK_INET_DGRAM_BIND, TOMOYO_MAC_NETWORK_INET_DGRAM_SEND, TOMOYO_MAC_NETWORK_INET_RAW_BIND, TOMOYO_MAC_NETWORK_INET_RAW_SEND, TOMOYO_MAC_NETWORK_UNIX_STREAM_BIND, TOMOYO_MAC_NETWORK_UNIX_STREAM_LISTEN, TOMOYO_MAC_NETWORK_UNIX_STREAM_CONNECT, TOMOYO_MAC_NETWORK_UNIX_DGRAM_BIND, TOMOYO_MAC_NETWORK_UNIX_DGRAM_SEND, TOMOYO_MAC_NETWORK_UNIX_SEQPACKET_BIND, TOMOYO_MAC_NETWORK_UNIX_SEQPACKET_LISTEN, TOMOYO_MAC_NETWORK_UNIX_SEQPACKET_CONNECT, TOMOYO_MAC_ENVIRON, TOMOYO_MAX_MAC_INDEX }; /* Index numbers for category of functionality. */ enum tomoyo_mac_category_index { TOMOYO_MAC_CATEGORY_FILE, TOMOYO_MAC_CATEGORY_NETWORK, TOMOYO_MAC_CATEGORY_MISC, TOMOYO_MAX_MAC_CATEGORY_INDEX }; /* * Retry this request. Returned by tomoyo_supervisor() if policy violation has * occurred in enforcing mode and the userspace daemon decided to retry. * * We must choose a positive value in order to distinguish "granted" (which is * 0) and "rejected" (which is a negative value) and "retry". */ #define TOMOYO_RETRY_REQUEST 1 /* Index numbers for /sys/kernel/security/tomoyo/stat interface. */ enum tomoyo_policy_stat_type { /* Do not change this order. */ TOMOYO_STAT_POLICY_UPDATES, TOMOYO_STAT_POLICY_LEARNING, /* == TOMOYO_CONFIG_LEARNING */ TOMOYO_STAT_POLICY_PERMISSIVE, /* == TOMOYO_CONFIG_PERMISSIVE */ TOMOYO_STAT_POLICY_ENFORCING, /* == TOMOYO_CONFIG_ENFORCING */ TOMOYO_MAX_POLICY_STAT }; /* Index numbers for profile's PREFERENCE values. */ enum tomoyo_pref_index { TOMOYO_PREF_MAX_AUDIT_LOG, TOMOYO_PREF_MAX_LEARNING_ENTRY, TOMOYO_MAX_PREF }; /********** Structure definitions. **********/ /* Common header for holding ACL entries. */ struct tomoyo_acl_head { struct list_head list; s8 is_deleted; /* true or false or TOMOYO_GC_IN_PROGRESS */ } __packed; /* Common header for shared entries. */ struct tomoyo_shared_acl_head { struct list_head list; atomic_t users; } __packed; struct tomoyo_policy_namespace; /* Structure for request info. */ struct tomoyo_request_info { /* * For holding parameters specific to operations which deal files. * NULL if not dealing files. */ struct tomoyo_obj_info *obj; /* * For holding parameters specific to execve() request. * NULL if not dealing execve(). */ struct tomoyo_execve *ee; struct tomoyo_domain_info *domain; /* For holding parameters. */ union { struct { const struct tomoyo_path_info *filename; /* For using wildcards at tomoyo_find_next_domain(). */ const struct tomoyo_path_info *matched_path; /* One of values in "enum tomoyo_path_acl_index". */ u8 operation; } path; struct { const struct tomoyo_path_info *filename1; const struct tomoyo_path_info *filename2; /* One of values in "enum tomoyo_path2_acl_index". */ u8 operation; } path2; struct { const struct tomoyo_path_info *filename; unsigned int mode; unsigned int major; unsigned int minor; /* One of values in "enum tomoyo_mkdev_acl_index". 
*/ u8 operation; } mkdev; struct { const struct tomoyo_path_info *filename; unsigned long number; /* * One of values in * "enum tomoyo_path_number_acl_index". */ u8 operation; } path_number; struct { const struct tomoyo_path_info *name; } environ; struct { const __be32 *address; u16 port; /* One of values smaller than TOMOYO_SOCK_MAX. */ u8 protocol; /* One of values in "enum tomoyo_network_acl_index". */ u8 operation; bool is_ipv6; } inet_network; struct { const struct tomoyo_path_info *address; /* One of values smaller than TOMOYO_SOCK_MAX. */ u8 protocol; /* One of values in "enum tomoyo_network_acl_index". */ u8 operation; } unix_network; struct { const struct tomoyo_path_info *type; const struct tomoyo_path_info *dir; const struct tomoyo_path_info *dev; unsigned long flags; int need_dev; } mount; struct { const struct tomoyo_path_info *domainname; } task; } param; struct tomoyo_acl_info *matched_acl; u8 param_type; bool granted; u8 retry; u8 profile; u8 mode; /* One of tomoyo_mode_index . */ u8 type; }; /* Structure for holding a token. */ struct tomoyo_path_info { const char *name; u32 hash; /* = full_name_hash(name, strlen(name)) */ u16 const_len; /* = tomoyo_const_part_length(name) */ bool is_dir; /* = tomoyo_strendswith(name, "/") */ bool is_patterned; /* = tomoyo_path_contains_pattern(name) */ }; /* Structure for holding string data. */ struct tomoyo_name { struct tomoyo_shared_acl_head head; struct tomoyo_path_info entry; }; /* Structure for holding a word. */ struct tomoyo_name_union { /* Either @filename or @group is NULL. */ const struct tomoyo_path_info *filename; struct tomoyo_group *group; }; /* Structure for holding a number. */ struct tomoyo_number_union { unsigned long values[2]; struct tomoyo_group *group; /* Maybe NULL. */ /* One of values in "enum tomoyo_value_type". */ u8 value_type[2]; }; /* Structure for holding an IP address. */ struct tomoyo_ipaddr_union { struct in6_addr ip[2]; /* Big endian. */ struct tomoyo_group *group; /* Pointer to address group. */ bool is_ipv6; /* Valid only if @group == NULL. */ }; /* Structure for "path_group"/"number_group"/"address_group" directive. */ struct tomoyo_group { struct tomoyo_shared_acl_head head; const struct tomoyo_path_info *group_name; struct list_head member_list; }; /* Structure for "path_group" directive. */ struct tomoyo_path_group { struct tomoyo_acl_head head; const struct tomoyo_path_info *member_name; }; /* Structure for "number_group" directive. */ struct tomoyo_number_group { struct tomoyo_acl_head head; struct tomoyo_number_union number; }; /* Structure for "address_group" directive. */ struct tomoyo_address_group { struct tomoyo_acl_head head; /* Structure for holding an IP address. */ struct tomoyo_ipaddr_union address; }; /* Subset of "struct stat". Used by conditional ACL and audit logs. */ struct tomoyo_mini_stat { kuid_t uid; kgid_t gid; ino_t ino; umode_t mode; dev_t dev; dev_t rdev; }; /* Structure for dumping argv[] and envp[] of "struct linux_binprm". */ struct tomoyo_page_dump { struct page *page; /* Previously dumped page. */ char *data; /* Contents of "page". Size is PAGE_SIZE. */ }; /* Structure for attribute checks in addition to pathname checks. */ struct tomoyo_obj_info { /* * True if tomoyo_get_attributes() was already called, false otherwise. */ bool validate_done; /* True if @stat[] is valid. */ bool stat_valid[TOMOYO_MAX_PATH_STAT]; /* First pathname. Initialized with { NULL, NULL } if no path. */ struct path path1; /* Second pathname. Initialized with { NULL, NULL } if no path. 
*/ struct path path2; /* * Information on @path1, @path1's parent directory, @path2, @path2's * parent directory. */ struct tomoyo_mini_stat stat[TOMOYO_MAX_PATH_STAT]; /* * Content of symbolic link to be created. NULL for operations other * than symlink(). */ struct tomoyo_path_info *symlink_target; }; /* Structure for argv[]. */ struct tomoyo_argv { unsigned long index; const struct tomoyo_path_info *value; bool is_not; }; /* Structure for envp[]. */ struct tomoyo_envp { const struct tomoyo_path_info *name; const struct tomoyo_path_info *value; bool is_not; }; /* Structure for execve() operation. */ struct tomoyo_execve { struct tomoyo_request_info r; struct tomoyo_obj_info obj; struct linux_binprm *bprm; const struct tomoyo_path_info *transition; /* For dumping argv[] and envp[]. */ struct tomoyo_page_dump dump; /* For temporary use. */ char *tmp; /* Size is TOMOYO_EXEC_TMPSIZE bytes */ }; /* Structure for entries which follows "struct tomoyo_condition". */ struct tomoyo_condition_element { /* * Left hand operand. A "struct tomoyo_argv" for TOMOYO_ARGV_ENTRY, a * "struct tomoyo_envp" for TOMOYO_ENVP_ENTRY is attached to the tail * of the array of this struct. */ u8 left; /* * Right hand operand. A "struct tomoyo_number_union" for * TOMOYO_NUMBER_UNION, a "struct tomoyo_name_union" for * TOMOYO_NAME_UNION is attached to the tail of the array of this * struct. */ u8 right; /* Equation operator. True if equals or overlaps, false otherwise. */ bool equals; }; /* Structure for optional arguments. */ struct tomoyo_condition { struct tomoyo_shared_acl_head head; u32 size; /* Memory size allocated for this entry. */ u16 condc; /* Number of conditions in this struct. */ u16 numbers_count; /* Number of "struct tomoyo_number_union values". */ u16 names_count; /* Number of "struct tomoyo_name_union names". */ u16 argc; /* Number of "struct tomoyo_argv". */ u16 envc; /* Number of "struct tomoyo_envp". */ u8 grant_log; /* One of values in "enum tomoyo_grant_log". */ const struct tomoyo_path_info *transit; /* Maybe NULL. */ /* * struct tomoyo_condition_element condition[condc]; * struct tomoyo_number_union values[numbers_count]; * struct tomoyo_name_union names[names_count]; * struct tomoyo_argv argv[argc]; * struct tomoyo_envp envp[envc]; */ }; /* Common header for individual entries. */ struct tomoyo_acl_info { struct list_head list; struct tomoyo_condition *cond; /* Maybe NULL. */ s8 is_deleted; /* true or false or TOMOYO_GC_IN_PROGRESS */ u8 type; /* One of values in "enum tomoyo_acl_entry_type_index". */ } __packed; /* Structure for domain information. */ struct tomoyo_domain_info { struct list_head list; struct list_head acl_info_list; /* Name of this domain. Never NULL. */ const struct tomoyo_path_info *domainname; /* Namespace for this domain. Never NULL. */ struct tomoyo_policy_namespace *ns; /* Group numbers to use. */ unsigned long group[TOMOYO_MAX_ACL_GROUPS / BITS_PER_LONG]; u8 profile; /* Profile number to use. */ bool is_deleted; /* Delete flag. */ bool flags[TOMOYO_MAX_DOMAIN_INFO_FLAGS]; atomic_t users; /* Number of referring tasks. */ }; /* * Structure for "task manual_domain_transition" directive. */ struct tomoyo_task_acl { struct tomoyo_acl_info head; /* type = TOMOYO_TYPE_MANUAL_TASK_ACL */ /* Pointer to domainname. */ const struct tomoyo_path_info *domainname; }; /* * Structure for "file execute", "file read", "file write", "file append", * "file unlink", "file getattr", "file rmdir", "file truncate", * "file symlink", "file chroot" and "file unmount" directive. 
*/ struct tomoyo_path_acl { struct tomoyo_acl_info head; /* type = TOMOYO_TYPE_PATH_ACL */ u16 perm; /* Bitmask of values in "enum tomoyo_path_acl_index". */ struct tomoyo_name_union name; }; /* * Structure for "file create", "file mkdir", "file mkfifo", "file mksock", * "file ioctl", "file chmod", "file chown" and "file chgrp" directive. */ struct tomoyo_path_number_acl { struct tomoyo_acl_info head; /* type = TOMOYO_TYPE_PATH_NUMBER_ACL */ /* Bitmask of values in "enum tomoyo_path_number_acl_index". */ u8 perm; struct tomoyo_name_union name; struct tomoyo_number_union number; }; /* Structure for "file mkblock" and "file mkchar" directive. */ struct tomoyo_mkdev_acl { struct tomoyo_acl_info head; /* type = TOMOYO_TYPE_MKDEV_ACL */ u8 perm; /* Bitmask of values in "enum tomoyo_mkdev_acl_index". */ struct tomoyo_name_union name; struct tomoyo_number_union mode; struct tomoyo_number_union major; struct tomoyo_number_union minor; }; /* * Structure for "file rename", "file link" and "file pivot_root" directive. */ struct tomoyo_path2_acl { struct tomoyo_acl_info head; /* type = TOMOYO_TYPE_PATH2_ACL */ u8 perm; /* Bitmask of values in "enum tomoyo_path2_acl_index". */ struct tomoyo_name_union name1; struct tomoyo_name_union name2; }; /* Structure for "file mount" directive. */ struct tomoyo_mount_acl { struct tomoyo_acl_info head; /* type = TOMOYO_TYPE_MOUNT_ACL */ struct tomoyo_name_union dev_name; struct tomoyo_name_union dir_name; struct tomoyo_name_union fs_type; struct tomoyo_number_union flags; }; /* Structure for "misc env" directive in domain policy. */ struct tomoyo_env_acl { struct tomoyo_acl_info head; /* type = TOMOYO_TYPE_ENV_ACL */ const struct tomoyo_path_info *env; /* environment variable */ }; /* Structure for "network inet" directive. */ struct tomoyo_inet_acl { struct tomoyo_acl_info head; /* type = TOMOYO_TYPE_INET_ACL */ u8 protocol; u8 perm; /* Bitmask of values in "enum tomoyo_network_acl_index" */ struct tomoyo_ipaddr_union address; struct tomoyo_number_union port; }; /* Structure for "network unix" directive. */ struct tomoyo_unix_acl { struct tomoyo_acl_info head; /* type = TOMOYO_TYPE_UNIX_ACL */ u8 protocol; u8 perm; /* Bitmask of values in "enum tomoyo_network_acl_index" */ struct tomoyo_name_union name; }; /* Structure for holding a line from /sys/kernel/security/tomoyo/ interface. */ struct tomoyo_acl_param { char *data; struct list_head *list; struct tomoyo_policy_namespace *ns; bool is_delete; }; #define TOMOYO_MAX_IO_READ_QUEUE 64 /* * Structure for reading/writing policy via /sys/kernel/security/tomoyo * interfaces. */ struct tomoyo_io_buffer { void (*read)(struct tomoyo_io_buffer *head); int (*write)(struct tomoyo_io_buffer *head); __poll_t (*poll)(struct file *file, poll_table *wait); /* Exclusive lock for this structure. */ struct mutex io_sem; char __user *read_user_buf; size_t read_user_buf_avail; struct { struct list_head *ns; struct list_head *domain; struct list_head *group; struct list_head *acl; size_t avail; unsigned int step; unsigned int query_index; u16 index; u16 cond_index; u8 acl_group_index; u8 cond_step; u8 bit; u8 w_pos; bool eof; bool print_this_domain_only; bool print_transition_related_only; bool print_cond_part; const char *w[TOMOYO_MAX_IO_READ_QUEUE]; } r; struct { struct tomoyo_policy_namespace *ns; /* The position currently writing to. */ struct tomoyo_domain_info *domain; /* Bytes available for writing. */ size_t avail; bool is_delete; } w; /* Buffer for reading. */ char *read_buf; /* Size of read buffer. 
*/ size_t readbuf_size; /* Buffer for writing. */ char *write_buf; /* Size of write buffer. */ size_t writebuf_size; /* Type of this interface. */ enum tomoyo_securityfs_interface_index type; /* Users counter protected by tomoyo_io_buffer_list_lock. */ u8 users; /* List for telling GC not to kfree() elements. */ struct list_head list; }; /* * Structure for "initialize_domain"/"no_initialize_domain"/"keep_domain"/ * "no_keep_domain" keyword. */ struct tomoyo_transition_control { struct tomoyo_acl_head head; u8 type; /* One of values in "enum tomoyo_transition_type". */ /* True if the domainname is tomoyo_get_last_name(). */ bool is_last_name; const struct tomoyo_path_info *domainname; /* Maybe NULL */ const struct tomoyo_path_info *program; /* Maybe NULL */ }; /* Structure for "aggregator" keyword. */ struct tomoyo_aggregator { struct tomoyo_acl_head head; const struct tomoyo_path_info *original_name; const struct tomoyo_path_info *aggregated_name; }; /* Structure for policy manager. */ struct tomoyo_manager { struct tomoyo_acl_head head; /* A path to program or a domainname. */ const struct tomoyo_path_info *manager; }; struct tomoyo_preference { unsigned int learning_max_entry; bool enforcing_verbose; bool learning_verbose; bool permissive_verbose; }; /* Structure for /sys/kernel/security/tomnoyo/profile interface. */ struct tomoyo_profile { const struct tomoyo_path_info *comment; struct tomoyo_preference *learning; struct tomoyo_preference *permissive; struct tomoyo_preference *enforcing; struct tomoyo_preference preference; u8 default_config; u8 config[TOMOYO_MAX_MAC_INDEX + TOMOYO_MAX_MAC_CATEGORY_INDEX]; unsigned int pref[TOMOYO_MAX_PREF]; }; /* Structure for representing YYYY/MM/DD hh/mm/ss. */ struct tomoyo_time { u16 year; u8 month; u8 day; u8 hour; u8 min; u8 sec; }; /* Structure for policy namespace. */ struct tomoyo_policy_namespace { /* Profile table. Memory is allocated as needed. */ struct tomoyo_profile *profile_ptr[TOMOYO_MAX_PROFILES]; /* List of "struct tomoyo_group". */ struct list_head group_list[TOMOYO_MAX_GROUP]; /* List of policy. */ struct list_head policy_list[TOMOYO_MAX_POLICY]; /* The global ACL referred by "use_group" keyword. */ struct list_head acl_group[TOMOYO_MAX_ACL_GROUPS]; /* List for connecting to tomoyo_namespace_list list. */ struct list_head namespace_list; /* Profile version. Currently only 20150505 is defined. */ unsigned int profile_version; /* Name of this namespace (e.g. "<kernel>", "</usr/sbin/httpd>" ). */ const char *name; }; /* Structure for "struct task_struct"->security. */ struct tomoyo_task { struct tomoyo_domain_info *domain_info; struct tomoyo_domain_info *old_domain_info; }; /********** Function prototypes. 
**********/ bool tomoyo_address_matches_group(const bool is_ipv6, const __be32 *address, const struct tomoyo_group *group); bool tomoyo_compare_number_union(const unsigned long value, const struct tomoyo_number_union *ptr); bool tomoyo_condition(struct tomoyo_request_info *r, const struct tomoyo_condition *cond); bool tomoyo_correct_domain(const unsigned char *domainname); bool tomoyo_correct_path(const char *filename); bool tomoyo_correct_word(const char *string); bool tomoyo_domain_def(const unsigned char *buffer); bool tomoyo_domain_quota_is_ok(struct tomoyo_request_info *r); bool tomoyo_dump_page(struct linux_binprm *bprm, unsigned long pos, struct tomoyo_page_dump *dump); bool tomoyo_memory_ok(void *ptr); bool tomoyo_number_matches_group(const unsigned long min, const unsigned long max, const struct tomoyo_group *group); bool tomoyo_parse_ipaddr_union(struct tomoyo_acl_param *param, struct tomoyo_ipaddr_union *ptr); bool tomoyo_parse_name_union(struct tomoyo_acl_param *param, struct tomoyo_name_union *ptr); bool tomoyo_parse_number_union(struct tomoyo_acl_param *param, struct tomoyo_number_union *ptr); bool tomoyo_path_matches_pattern(const struct tomoyo_path_info *filename, const struct tomoyo_path_info *pattern); bool tomoyo_permstr(const char *string, const char *keyword); bool tomoyo_str_starts(char **src, const char *find); char *tomoyo_encode(const char *str); char *tomoyo_encode2(const char *str, int str_len); char *tomoyo_init_log(struct tomoyo_request_info *r, int len, const char *fmt, va_list args) __printf(3, 0); char *tomoyo_read_token(struct tomoyo_acl_param *param); char *tomoyo_realpath_from_path(const struct path *path); char *tomoyo_realpath_nofollow(const char *pathname); const char *tomoyo_get_exe(void); const struct tomoyo_path_info *tomoyo_compare_name_union (const struct tomoyo_path_info *name, const struct tomoyo_name_union *ptr); const struct tomoyo_path_info *tomoyo_get_domainname (struct tomoyo_acl_param *param); const struct tomoyo_path_info *tomoyo_get_name(const char *name); const struct tomoyo_path_info *tomoyo_path_matches_group (const struct tomoyo_path_info *pathname, const struct tomoyo_group *group); int tomoyo_check_open_permission(struct tomoyo_domain_info *domain, const struct path *path, const int flag); void tomoyo_close_control(struct tomoyo_io_buffer *head); int tomoyo_env_perm(struct tomoyo_request_info *r, const char *env); int tomoyo_execute_permission(struct tomoyo_request_info *r, const struct tomoyo_path_info *filename); int tomoyo_find_next_domain(struct linux_binprm *bprm); int tomoyo_get_mode(const struct tomoyo_policy_namespace *ns, const u8 profile, const u8 index); int tomoyo_init_request_info(struct tomoyo_request_info *r, struct tomoyo_domain_info *domain, const u8 index); int tomoyo_mkdev_perm(const u8 operation, const struct path *path, const unsigned int mode, unsigned int dev); int tomoyo_mount_permission(const char *dev_name, const struct path *path, const char *type, unsigned long flags, void *data_page); int tomoyo_open_control(const u8 type, struct file *file); int tomoyo_path2_perm(const u8 operation, const struct path *path1, const struct path *path2); int tomoyo_path_number_perm(const u8 operation, const struct path *path, unsigned long number); int tomoyo_path_perm(const u8 operation, const struct path *path, const char *target); __poll_t tomoyo_poll_control(struct file *file, poll_table *wait); __poll_t tomoyo_poll_log(struct file *file, poll_table *wait); int tomoyo_socket_bind_permission(struct socket *sock, struct 
sockaddr *addr, int addr_len); int tomoyo_socket_connect_permission(struct socket *sock, struct sockaddr *addr, int addr_len); int tomoyo_socket_listen_permission(struct socket *sock); int tomoyo_socket_sendmsg_permission(struct socket *sock, struct msghdr *msg, int size); int tomoyo_supervisor(struct tomoyo_request_info *r, const char *fmt, ...) __printf(2, 3); int tomoyo_update_domain(struct tomoyo_acl_info *new_entry, const int size, struct tomoyo_acl_param *param, bool (*check_duplicate) (const struct tomoyo_acl_info *, const struct tomoyo_acl_info *), bool (*merge_duplicate) (struct tomoyo_acl_info *, struct tomoyo_acl_info *, const bool)); int tomoyo_update_policy(struct tomoyo_acl_head *new_entry, const int size, struct tomoyo_acl_param *param, bool (*check_duplicate) (const struct tomoyo_acl_head *, const struct tomoyo_acl_head *)); int tomoyo_write_aggregator(struct tomoyo_acl_param *param); int tomoyo_write_file(struct tomoyo_acl_param *param); int tomoyo_write_group(struct tomoyo_acl_param *param, const u8 type); int tomoyo_write_misc(struct tomoyo_acl_param *param); int tomoyo_write_inet_network(struct tomoyo_acl_param *param); int tomoyo_write_transition_control(struct tomoyo_acl_param *param, const u8 type); int tomoyo_write_unix_network(struct tomoyo_acl_param *param); ssize_t tomoyo_read_control(struct tomoyo_io_buffer *head, char __user *buffer, const int buffer_len); ssize_t tomoyo_write_control(struct tomoyo_io_buffer *head, const char __user *buffer, const int buffer_len); struct tomoyo_condition *tomoyo_get_condition(struct tomoyo_acl_param *param); struct tomoyo_domain_info *tomoyo_assign_domain(const char *domainname, const bool transit); struct tomoyo_domain_info *tomoyo_domain(void); struct tomoyo_domain_info *tomoyo_find_domain(const char *domainname); struct tomoyo_group *tomoyo_get_group(struct tomoyo_acl_param *param, const u8 idx); struct tomoyo_policy_namespace *tomoyo_assign_namespace (const char *domainname); struct tomoyo_profile *tomoyo_profile(const struct tomoyo_policy_namespace *ns, const u8 profile); u8 tomoyo_parse_ulong(unsigned long *result, char **str); void *tomoyo_commit_ok(void *data, const unsigned int size); void __init tomoyo_load_builtin_policy(void); void __init tomoyo_mm_init(void); void tomoyo_check_acl(struct tomoyo_request_info *r, bool (*check_entry)(struct tomoyo_request_info *, const struct tomoyo_acl_info *)); void tomoyo_check_profile(void); void tomoyo_convert_time(time64_t time, struct tomoyo_time *stamp); void tomoyo_del_condition(struct list_head *element); void tomoyo_fill_path_info(struct tomoyo_path_info *ptr); void tomoyo_get_attributes(struct tomoyo_obj_info *obj); void tomoyo_init_policy_namespace(struct tomoyo_policy_namespace *ns); void tomoyo_load_policy(const char *filename); void tomoyo_normalize_line(unsigned char *buffer); void tomoyo_notify_gc(struct tomoyo_io_buffer *head, const bool is_register); void tomoyo_print_ip(char *buf, const unsigned int size, const struct tomoyo_ipaddr_union *ptr); void tomoyo_print_ulong(char *buffer, const int buffer_len, const unsigned long value, const u8 type); void tomoyo_put_name_union(struct tomoyo_name_union *ptr); void tomoyo_put_number_union(struct tomoyo_number_union *ptr); void tomoyo_read_log(struct tomoyo_io_buffer *head); void tomoyo_update_stat(const u8 index); void tomoyo_warn_oom(const char *function); void tomoyo_write_log(struct tomoyo_request_info *r, const char *fmt, ...) 
__printf(2, 3); void tomoyo_write_log2(struct tomoyo_request_info *r, int len, const char *fmt, va_list args) __printf(3, 0); /********** External variable definitions. **********/ extern bool tomoyo_policy_loaded; extern int tomoyo_enabled; extern const char * const tomoyo_condition_keyword [TOMOYO_MAX_CONDITION_KEYWORD]; extern const char * const tomoyo_dif[TOMOYO_MAX_DOMAIN_INFO_FLAGS]; extern const char * const tomoyo_mac_keywords[TOMOYO_MAX_MAC_INDEX + TOMOYO_MAX_MAC_CATEGORY_INDEX]; extern const char * const tomoyo_mode[TOMOYO_CONFIG_MAX_MODE]; extern const char * const tomoyo_path_keyword[TOMOYO_MAX_PATH_OPERATION]; extern const char * const tomoyo_proto_keyword[TOMOYO_SOCK_MAX]; extern const char * const tomoyo_socket_keyword[TOMOYO_MAX_NETWORK_OPERATION]; extern const u8 tomoyo_index2category[TOMOYO_MAX_MAC_INDEX]; extern const u8 tomoyo_pn2mac[TOMOYO_MAX_PATH_NUMBER_OPERATION]; extern const u8 tomoyo_pnnn2mac[TOMOYO_MAX_MKDEV_OPERATION]; extern const u8 tomoyo_pp2mac[TOMOYO_MAX_PATH2_OPERATION]; extern struct list_head tomoyo_condition_list; extern struct list_head tomoyo_domain_list; extern struct list_head tomoyo_name_list[TOMOYO_MAX_HASH]; extern struct list_head tomoyo_namespace_list; extern struct mutex tomoyo_policy_lock; extern struct srcu_struct tomoyo_ss; extern struct tomoyo_domain_info tomoyo_kernel_domain; extern struct tomoyo_policy_namespace tomoyo_kernel_namespace; extern unsigned int tomoyo_memory_quota[TOMOYO_MAX_MEMORY_STAT]; extern unsigned int tomoyo_memory_used[TOMOYO_MAX_MEMORY_STAT]; extern struct lsm_blob_sizes tomoyo_blob_sizes; /********** Inlined functions. **********/ /** * tomoyo_read_lock - Take lock for protecting policy. * * Returns index number for tomoyo_read_unlock(). */ static inline int tomoyo_read_lock(void) { return srcu_read_lock(&tomoyo_ss); } /** * tomoyo_read_unlock - Release lock for protecting policy. * * @idx: Index number returned by tomoyo_read_lock(). * * Returns nothing. */ static inline void tomoyo_read_unlock(int idx) { srcu_read_unlock(&tomoyo_ss, idx); } /** * tomoyo_sys_getppid - Copy of getppid(). * * Returns parent process's PID. * * Alpha does not have getppid() defined. To be able to build this module on * Alpha, I have to copy getppid() from kernel/timer.c. */ static inline pid_t tomoyo_sys_getppid(void) { pid_t pid; rcu_read_lock(); pid = task_tgid_vnr(rcu_dereference(current->real_parent)); rcu_read_unlock(); return pid; } /** * tomoyo_sys_getpid - Copy of getpid(). * * Returns current thread's PID. * * Alpha does not have getpid() defined. To be able to build this module on * Alpha, I have to copy getpid() from kernel/timer.c. */ static inline pid_t tomoyo_sys_getpid(void) { return task_tgid_vnr(current); } /** * tomoyo_pathcmp - strcmp() for "struct tomoyo_path_info" structure. * * @a: Pointer to "struct tomoyo_path_info". * @b: Pointer to "struct tomoyo_path_info". * * Returns true if @a == @b, false otherwise. */ static inline bool tomoyo_pathcmp(const struct tomoyo_path_info *a, const struct tomoyo_path_info *b) { return a->hash != b->hash || strcmp(a->name, b->name); } /** * tomoyo_put_name - Drop reference on "struct tomoyo_name". * * @name: Pointer to "struct tomoyo_path_info". Maybe NULL. * * Returns nothing. */ static inline void tomoyo_put_name(const struct tomoyo_path_info *name) { if (name) { struct tomoyo_name *ptr = container_of(name, typeof(*ptr), entry); atomic_dec(&ptr->head.users); } } /** * tomoyo_put_condition - Drop reference on "struct tomoyo_condition". 
* * @cond: Pointer to "struct tomoyo_condition". Maybe NULL. * * Returns nothing. */ static inline void tomoyo_put_condition(struct tomoyo_condition *cond) { if (cond) atomic_dec(&cond->head.users); } /** * tomoyo_put_group - Drop reference on "struct tomoyo_group". * * @group: Pointer to "struct tomoyo_group". Maybe NULL. * * Returns nothing. */ static inline void tomoyo_put_group(struct tomoyo_group *group) { if (group) atomic_dec(&group->head.users); } /** * tomoyo_task - Get "struct tomoyo_task" for specified thread. * * @task - Pointer to "struct task_struct". * * Returns pointer to "struct tomoyo_task" for specified thread. */ static inline struct tomoyo_task *tomoyo_task(struct task_struct *task) { return task->security + tomoyo_blob_sizes.lbs_task; } /** * tomoyo_same_name_union - Check for duplicated "struct tomoyo_name_union" entry. * * @a: Pointer to "struct tomoyo_name_union". * @b: Pointer to "struct tomoyo_name_union". * * Returns true if @a == @b, false otherwise. */ static inline bool tomoyo_same_name_union (const struct tomoyo_name_union *a, const struct tomoyo_name_union *b) { return a->filename == b->filename && a->group == b->group; } /** * tomoyo_same_number_union - Check for duplicated "struct tomoyo_number_union" entry. * * @a: Pointer to "struct tomoyo_number_union". * @b: Pointer to "struct tomoyo_number_union". * * Returns true if @a == @b, false otherwise. */ static inline bool tomoyo_same_number_union (const struct tomoyo_number_union *a, const struct tomoyo_number_union *b) { return a->values[0] == b->values[0] && a->values[1] == b->values[1] && a->group == b->group && a->value_type[0] == b->value_type[0] && a->value_type[1] == b->value_type[1]; } /** * tomoyo_same_ipaddr_union - Check for duplicated "struct tomoyo_ipaddr_union" entry. * * @a: Pointer to "struct tomoyo_ipaddr_union". * @b: Pointer to "struct tomoyo_ipaddr_union". * * Returns true if @a == @b, false otherwise. */ static inline bool tomoyo_same_ipaddr_union (const struct tomoyo_ipaddr_union *a, const struct tomoyo_ipaddr_union *b) { return !memcmp(a->ip, b->ip, sizeof(a->ip)) && a->group == b->group && a->is_ipv6 == b->is_ipv6; } /** * tomoyo_current_namespace - Get "struct tomoyo_policy_namespace" for current thread. * * Returns pointer to "struct tomoyo_policy_namespace" for current thread. */ static inline struct tomoyo_policy_namespace *tomoyo_current_namespace(void) { return tomoyo_domain()->ns; } /** * list_for_each_cookie - iterate over a list with cookie. * @pos: the &struct list_head to use as a loop cursor. * @head: the head for your list. */ #define list_for_each_cookie(pos, head) \ if (!pos) \ pos = srcu_dereference((head)->next, &tomoyo_ss); \ for ( ; pos != (head); pos = srcu_dereference(pos->next, &tomoyo_ss)) #endif /* !defined(_SECURITY_TOMOYO_COMMON_H) */ |
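A minimal usage sketch of the SRCU helpers and domain list declared above, assuming only the tomoyo_* symbols from this header; the function name example_walk_domains() and the pr_info() output are illustrative, not part of the file.

/* Usage sketch only: traverse the policy-protected domain list between
 * tomoyo_read_lock() and tomoyo_read_unlock().  example_walk_domains()
 * is a made-up name. */
static void example_walk_domains(void)
{
	struct tomoyo_domain_info *domain;
	int idx = tomoyo_read_lock();	/* srcu_read_lock(&tomoyo_ss) */

	list_for_each_entry_rcu(domain, &tomoyo_domain_list, list,
				srcu_read_lock_held(&tomoyo_ss)) {
		if (domain->is_deleted)
			continue;
		pr_info("domain %s uses profile %u\n",
			domain->domainname->name, domain->profile);
	}
	tomoyo_read_unlock(idx);	/* srcu_read_unlock(&tomoyo_ss, idx) */
}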
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Events for filesystem locks
 *
 * Copyright 2013 Jeff Layton <jlayton@poochiereds.net>
 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM filelock

#if !defined(_TRACE_FILELOCK_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_FILELOCK_H

#include <linux/tracepoint.h>
#include <linux/fs.h>
#include <linux/device.h>
#include <linux/kdev_t.h>

#define show_fl_flags(val)				\
	__print_flags(val, "|",				\
		{ FL_POSIX,		"FL_POSIX" },	\
		{ FL_FLOCK,		"FL_FLOCK" },	\
		{ FL_DELEG,		"FL_DELEG" },	\
		{ FL_ACCESS,		"FL_ACCESS" },	\
		{ FL_EXISTS,		"FL_EXISTS" },	\
		{ FL_LEASE,		"FL_LEASE" },	\
		{ FL_CLOSE,		"FL_CLOSE" },	\
		{ FL_SLEEP,		"FL_SLEEP" },	\
		{ FL_DOWNGRADE_PENDING,	"FL_DOWNGRADE_PENDING" },	\
		{ FL_UNLOCK_PENDING,	"FL_UNLOCK_PENDING" },		\
		{ FL_OFDLCK,		"FL_OFDLCK" })

#define show_fl_type(val)			\
	__print_symbolic(val,			\
			{ F_RDLCK, "F_RDLCK" },	\
			{ F_WRLCK, "F_WRLCK" },	\
			{ F_UNLCK, "F_UNLCK" })

TRACE_EVENT(locks_get_lock_context,
	TP_PROTO(struct inode *inode, int type, struct file_lock_context *ctx),

	TP_ARGS(inode, type, ctx),

	TP_STRUCT__entry(
		__field(unsigned long, i_ino)
		__field(dev_t, s_dev)
		__field(unsigned char, type)
		__field(struct file_lock_context *, ctx)
	),

	TP_fast_assign(
		__entry->s_dev = inode->i_sb->s_dev;
		__entry->i_ino = inode->i_ino;
		__entry->type = type;
		__entry->ctx = ctx;
	),

	TP_printk("dev=0x%x:0x%x ino=0x%lx type=%s ctx=%p",
		  MAJOR(__entry->s_dev), MINOR(__entry->s_dev),
		  __entry->i_ino, show_fl_type(__entry->type), __entry->ctx)
);

DECLARE_EVENT_CLASS(filelock_lock,
	TP_PROTO(struct inode *inode, struct file_lock *fl, int ret),

	TP_ARGS(inode, fl, ret),

	TP_STRUCT__entry(
		__field(struct file_lock *, fl)
		__field(unsigned long, i_ino)
		__field(dev_t, s_dev)
		__field(struct file_lock_core *, blocker)
		__field(fl_owner_t, owner)
		__field(unsigned int, pid)
		__field(unsigned int, flags)
		__field(unsigned char, type)
		__field(loff_t, fl_start)
		__field(loff_t, fl_end)
		__field(int, ret)
	),

	TP_fast_assign(
		__entry->fl = fl ? fl : NULL;
		__entry->s_dev = inode->i_sb->s_dev;
		__entry->i_ino = inode->i_ino;
		__entry->blocker = fl ? fl->c.flc_blocker : NULL;
		__entry->owner = fl ? fl->c.flc_owner : NULL;
		__entry->pid = fl ? fl->c.flc_pid : 0;
		__entry->flags = fl ? fl->c.flc_flags : 0;
		__entry->type = fl ? fl->c.flc_type : 0;
		__entry->fl_start = fl ? fl->fl_start : 0;
		__entry->fl_end = fl ?
fl->fl_end : 0; __entry->ret = ret; ), TP_printk("fl=%p dev=0x%x:0x%x ino=0x%lx fl_blocker=%p fl_owner=%p fl_pid=%u fl_flags=%s fl_type=%s fl_start=%lld fl_end=%lld ret=%d", __entry->fl, MAJOR(__entry->s_dev), MINOR(__entry->s_dev), __entry->i_ino, __entry->blocker, __entry->owner, __entry->pid, show_fl_flags(__entry->flags), show_fl_type(__entry->type), __entry->fl_start, __entry->fl_end, __entry->ret) ); DEFINE_EVENT(filelock_lock, posix_lock_inode, TP_PROTO(struct inode *inode, struct file_lock *fl, int ret), TP_ARGS(inode, fl, ret)); DEFINE_EVENT(filelock_lock, fcntl_setlk, TP_PROTO(struct inode *inode, struct file_lock *fl, int ret), TP_ARGS(inode, fl, ret)); DEFINE_EVENT(filelock_lock, locks_remove_posix, TP_PROTO(struct inode *inode, struct file_lock *fl, int ret), TP_ARGS(inode, fl, ret)); DEFINE_EVENT(filelock_lock, flock_lock_inode, TP_PROTO(struct inode *inode, struct file_lock *fl, int ret), TP_ARGS(inode, fl, ret)); DECLARE_EVENT_CLASS(filelock_lease, TP_PROTO(struct inode *inode, struct file_lease *fl), TP_ARGS(inode, fl), TP_STRUCT__entry( __field(struct file_lease *, fl) __field(unsigned long, i_ino) __field(dev_t, s_dev) __field(struct file_lock_core *, blocker) __field(fl_owner_t, owner) __field(unsigned int, flags) __field(unsigned char, type) __field(unsigned long, break_time) __field(unsigned long, downgrade_time) ), TP_fast_assign( __entry->fl = fl ? fl : NULL; __entry->s_dev = inode->i_sb->s_dev; __entry->i_ino = inode->i_ino; __entry->blocker = fl ? fl->c.flc_blocker : NULL; __entry->owner = fl ? fl->c.flc_owner : NULL; __entry->flags = fl ? fl->c.flc_flags : 0; __entry->type = fl ? fl->c.flc_type : 0; __entry->break_time = fl ? fl->fl_break_time : 0; __entry->downgrade_time = fl ? fl->fl_downgrade_time : 0; ), TP_printk("fl=%p dev=0x%x:0x%x ino=0x%lx fl_blocker=%p fl_owner=%p fl_flags=%s fl_type=%s fl_break_time=%lu fl_downgrade_time=%lu", __entry->fl, MAJOR(__entry->s_dev), MINOR(__entry->s_dev), __entry->i_ino, __entry->blocker, __entry->owner, show_fl_flags(__entry->flags), show_fl_type(__entry->type), __entry->break_time, __entry->downgrade_time) ); DEFINE_EVENT(filelock_lease, break_lease_noblock, TP_PROTO(struct inode *inode, struct file_lease *fl), TP_ARGS(inode, fl)); DEFINE_EVENT(filelock_lease, break_lease_block, TP_PROTO(struct inode *inode, struct file_lease *fl), TP_ARGS(inode, fl)); DEFINE_EVENT(filelock_lease, break_lease_unblock, TP_PROTO(struct inode *inode, struct file_lease *fl), TP_ARGS(inode, fl)); DEFINE_EVENT(filelock_lease, generic_delete_lease, TP_PROTO(struct inode *inode, struct file_lease *fl), TP_ARGS(inode, fl)); DEFINE_EVENT(filelock_lease, time_out_leases, TP_PROTO(struct inode *inode, struct file_lease *fl), TP_ARGS(inode, fl)); TRACE_EVENT(generic_add_lease, TP_PROTO(struct inode *inode, struct file_lease *fl), TP_ARGS(inode, fl), TP_STRUCT__entry( __field(unsigned long, i_ino) __field(int, wcount) __field(int, rcount) __field(int, icount) __field(dev_t, s_dev) __field(fl_owner_t, owner) __field(unsigned int, flags) __field(unsigned char, type) ), TP_fast_assign( __entry->s_dev = inode->i_sb->s_dev; __entry->i_ino = inode->i_ino; __entry->wcount = atomic_read(&inode->i_writecount); __entry->rcount = atomic_read(&inode->i_readcount); __entry->icount = atomic_read(&inode->i_count); __entry->owner = fl->c.flc_owner; __entry->flags = fl->c.flc_flags; __entry->type = fl->c.flc_type; ), TP_printk("dev=0x%x:0x%x ino=0x%lx wcount=%d rcount=%d icount=%d fl_owner=%p fl_flags=%s fl_type=%s", MAJOR(__entry->s_dev), MINOR(__entry->s_dev), 
__entry->i_ino, __entry->wcount, __entry->rcount, __entry->icount, __entry->owner, show_fl_flags(__entry->flags), show_fl_type(__entry->type)) ); TRACE_EVENT(leases_conflict, TP_PROTO(bool conflict, struct file_lease *lease, struct file_lease *breaker), TP_ARGS(conflict, lease, breaker), TP_STRUCT__entry( __field(void *, lease) __field(void *, breaker) __field(unsigned int, l_fl_flags) __field(unsigned int, b_fl_flags) __field(unsigned char, l_fl_type) __field(unsigned char, b_fl_type) __field(bool, conflict) ), TP_fast_assign( __entry->lease = lease; __entry->l_fl_flags = lease->c.flc_flags; __entry->l_fl_type = lease->c.flc_type; __entry->breaker = breaker; __entry->b_fl_flags = breaker->c.flc_flags; __entry->b_fl_type = breaker->c.flc_type; __entry->conflict = conflict; ), TP_printk("conflict %d: lease=%p fl_flags=%s fl_type=%s; breaker=%p fl_flags=%s fl_type=%s", __entry->conflict, __entry->lease, show_fl_flags(__entry->l_fl_flags), show_fl_type(__entry->l_fl_type), __entry->breaker, show_fl_flags(__entry->b_fl_flags), show_fl_type(__entry->b_fl_type)) ); #endif /* _TRACE_FILELOCK_H */ /* This part must be outside protection */ #include <trace/define_trace.h> |
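A minimal sketch of how one of the tracepoints defined above is emitted; the surrounding helper is illustrative and is not taken from fs/locks.c.

/* Illustrative caller only: fire the posix_lock_inode event declared above
 * after a (stubbed) lock attempt.  example_try_posix_lock() is a made-up
 * wrapper. */
static int example_try_posix_lock(struct inode *inode, struct file_lock *fl)
{
	int error = 0;

	/* ... real code would attempt to apply the lock here ... */
	trace_posix_lock_inode(inode, fl, error);
	return error;
}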
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * NUMA memory policies for Linux.
 * Copyright 2003,2004 Andi Kleen SuSE Labs
 */
#ifndef _LINUX_MEMPOLICY_H
#define _LINUX_MEMPOLICY_H 1

#include <linux/sched.h>
#include <linux/mmzone.h>
#include <linux/slab.h>
#include <linux/rbtree.h>
#include <linux/spinlock.h>
#include <linux/nodemask.h>
#include <linux/pagemap.h>
#include <uapi/linux/mempolicy.h>

struct mm_struct;

#define NO_INTERLEAVE_INDEX (-1UL)	/* use task il_prev for interleaving */

#ifdef CONFIG_NUMA

/*
 * Describe a memory policy.
 *
 * A mempolicy can be either associated with a process or with a VMA.
 * For VMA related allocations the VMA policy is preferred, otherwise
 * the process policy is used. Interrupts ignore the memory policy
 * of the current process.
 *
 * Locking policy for interleave:
 * In process context there is no locking because only the process accesses
 * its own state. All vma manipulation is somewhat protected by a down_read on
 * mmap_lock.
 *
 * Freeing policy:
 * Mempolicy objects are reference counted.  A mempolicy will be freed when
 * mpol_put() decrements the reference count to zero.
 *
 * Duplicating policy objects:
 * mpol_dup() allocates a new mempolicy and copies the specified mempolicy
 * to the new storage.  The reference count of the new object is initialized
 * to 1, representing the caller of mpol_dup().
 */
struct mempolicy {
	atomic_t refcnt;
	unsigned short mode;	/* See MPOL_* above */
	unsigned short flags;	/* See set_mempolicy() MPOL_F_* above */
	nodemask_t nodes;	/* interleave/bind/preferred/etc */
	int home_node;		/* Home node to use for MPOL_BIND and MPOL_PREFERRED_MANY */

	union {
		nodemask_t cpuset_mems_allowed;	/* relative to these nodes */
		nodemask_t user_nodemask;	/* nodemask passed by user */
	} w;
};

/*
 * Support for managing mempolicy data objects (clone, copy, destroy)
 * The default fast path of a NULL MPOL_DEFAULT policy is always inlined.
 */

extern void __mpol_put(struct mempolicy *pol);
static inline void mpol_put(struct mempolicy *pol)
{
	if (pol)
		__mpol_put(pol);
}

/*
 * Does mempolicy pol need explicit unref after use?
 * Currently only needed for shared policies.
*/ static inline int mpol_needs_cond_ref(struct mempolicy *pol) { return (pol && (pol->flags & MPOL_F_SHARED)); } static inline void mpol_cond_put(struct mempolicy *pol) { if (mpol_needs_cond_ref(pol)) __mpol_put(pol); } extern struct mempolicy *__mpol_dup(struct mempolicy *pol); static inline struct mempolicy *mpol_dup(struct mempolicy *pol) { if (pol) pol = __mpol_dup(pol); return pol; } static inline void mpol_get(struct mempolicy *pol) { if (pol) atomic_inc(&pol->refcnt); } extern bool __mpol_equal(struct mempolicy *a, struct mempolicy *b); static inline bool mpol_equal(struct mempolicy *a, struct mempolicy *b) { if (a == b) return true; return __mpol_equal(a, b); } /* * Tree of shared policies for a shared memory region. */ struct shared_policy { struct rb_root root; rwlock_t lock; }; struct sp_node { struct rb_node nd; pgoff_t start, end; struct mempolicy *policy; }; int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst); void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol); int mpol_set_shared_policy(struct shared_policy *sp, struct vm_area_struct *vma, struct mempolicy *mpol); void mpol_free_shared_policy(struct shared_policy *sp); struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp, pgoff_t idx); struct mempolicy *get_task_policy(struct task_struct *p); struct mempolicy *__get_vma_policy(struct vm_area_struct *vma, unsigned long addr, pgoff_t *ilx); struct mempolicy *get_vma_policy(struct vm_area_struct *vma, unsigned long addr, int order, pgoff_t *ilx); bool vma_policy_mof(struct vm_area_struct *vma); extern void numa_default_policy(void); extern void numa_policy_init(void); extern void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new); extern void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new); extern int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags, struct mempolicy **mpol, nodemask_t **nodemask); extern bool init_nodemask_of_mempolicy(nodemask_t *mask); extern bool mempolicy_in_oom_domain(struct task_struct *tsk, const nodemask_t *mask); extern unsigned int mempolicy_slab_node(void); extern enum zone_type policy_zone; static inline void check_highest_zone(enum zone_type k) { if (k > policy_zone && k != ZONE_MOVABLE) policy_zone = k; } int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, const nodemask_t *to, int flags); #ifdef CONFIG_TMPFS extern int mpol_parse_str(char *str, struct mempolicy **mpol); #endif extern void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol); /* Check if a vma is migratable */ extern bool vma_migratable(struct vm_area_struct *vma); int mpol_misplaced(struct folio *folio, struct vm_fault *vmf, unsigned long addr); extern void mpol_put_task_policy(struct task_struct *); static inline bool mpol_is_preferred_many(struct mempolicy *pol) { return (pol->mode == MPOL_PREFERRED_MANY); } extern bool apply_policy_zone(struct mempolicy *policy, enum zone_type zone); #else struct mempolicy {}; static inline struct mempolicy *get_task_policy(struct task_struct *p) { return NULL; } static inline bool mpol_equal(struct mempolicy *a, struct mempolicy *b) { return true; } static inline void mpol_put(struct mempolicy *pol) { } static inline void mpol_cond_put(struct mempolicy *pol) { } static inline void mpol_get(struct mempolicy *pol) { } struct shared_policy {}; static inline void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol) { } static inline void mpol_free_shared_policy(struct shared_policy *sp) { } 
static inline struct mempolicy * mpol_shared_policy_lookup(struct shared_policy *sp, pgoff_t idx) { return NULL; } static inline struct mempolicy *get_vma_policy(struct vm_area_struct *vma, unsigned long addr, int order, pgoff_t *ilx) { *ilx = 0; return NULL; } static inline int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst) { return 0; } static inline void numa_policy_init(void) { } static inline void numa_default_policy(void) { } static inline void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new) { } static inline void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new) { } static inline int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags, struct mempolicy **mpol, nodemask_t **nodemask) { *mpol = NULL; *nodemask = NULL; return 0; } static inline bool init_nodemask_of_mempolicy(nodemask_t *m) { return false; } static inline int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, const nodemask_t *to, int flags) { return 0; } static inline void check_highest_zone(int k) { } #ifdef CONFIG_TMPFS static inline int mpol_parse_str(char *str, struct mempolicy **mpol) { return 1; /* error */ } #endif static inline int mpol_misplaced(struct folio *folio, struct vm_fault *vmf, unsigned long address) { return -1; /* no node preference */ } static inline void mpol_put_task_policy(struct task_struct *task) { } static inline bool mpol_is_preferred_many(struct mempolicy *pol) { return false; } #endif /* CONFIG_NUMA */ #endif |
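A minimal sketch (assuming CONFIG_NUMA) of the reference-counting discipline documented above: a policy looked up through get_vma_policy() is released with mpol_cond_put(), which only drops the reference for shared (MPOL_F_SHARED) policies. The name example_vma_policy_mode() is illustrative, not part of this header.

/* Usage sketch, CONFIG_NUMA assumed; example_vma_policy_mode() is not part
 * of the header.  Shows the get/cond_put pairing described above. */
static unsigned short example_vma_policy_mode(struct vm_area_struct *vma,
					      unsigned long addr)
{
	pgoff_t ilx;
	struct mempolicy *pol = get_vma_policy(vma, addr, 0, &ilx);
	unsigned short mode = pol ? pol->mode : MPOL_DEFAULT;

	mpol_cond_put(pol);	/* unrefs only MPOL_F_SHARED policies */
	return mode;
}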
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * A security context is a set of security attributes
 * associated with each subject and object controlled
 * by the security policy.  Security contexts are
 * externally represented as variable-length strings
 * that can be interpreted by a user or application
 * with an understanding of the security policy.
 * Internally, the security server uses a simple
 * structure.  This structure is private to the
 * security server and can be changed without affecting
 * clients of the security server.
 *
 * Author : Stephen Smalley, <stephen.smalley.work@gmail.com>
 */
#ifndef _SS_CONTEXT_H_
#define _SS_CONTEXT_H_

#include "ebitmap.h"
#include "mls_types.h"
#include "security.h"

/*
 * A security context consists of an authenticated user
 * identity, a role, a type and a MLS range.
 */
struct context {
	u32 user;
	u32 role;
	u32 type;
	u32 len;	/* length of string in bytes */
	struct mls_range range;
	char *str;	/* string representation if context cannot be mapped. */
};

static inline void mls_context_init(struct context *c)
{
	memset(&c->range, 0, sizeof(c->range));
}

static inline int mls_context_cpy(struct context *dst,
				  const struct context *src)
{
	int rc;

	dst->range.level[0].sens = src->range.level[0].sens;
	rc = ebitmap_cpy(&dst->range.level[0].cat, &src->range.level[0].cat);
	if (rc)
		goto out;

	dst->range.level[1].sens = src->range.level[1].sens;
	rc = ebitmap_cpy(&dst->range.level[1].cat, &src->range.level[1].cat);
	if (rc)
		ebitmap_destroy(&dst->range.level[0].cat);
out:
	return rc;
}

/*
 * Sets both levels in the MLS range of 'dst' to the low level of 'src'.
 */
static inline int mls_context_cpy_low(struct context *dst,
				      const struct context *src)
{
	int rc;

	dst->range.level[0].sens = src->range.level[0].sens;
	rc = ebitmap_cpy(&dst->range.level[0].cat, &src->range.level[0].cat);
	if (rc)
		goto out;

	dst->range.level[1].sens = src->range.level[0].sens;
	rc = ebitmap_cpy(&dst->range.level[1].cat, &src->range.level[0].cat);
	if (rc)
		ebitmap_destroy(&dst->range.level[0].cat);
out:
	return rc;
}

/*
 * Sets both levels in the MLS range of 'dst' to the high level of 'src'.
*/ static inline int mls_context_cpy_high(struct context *dst, const struct context *src) { int rc; dst->range.level[0].sens = src->range.level[1].sens; rc = ebitmap_cpy(&dst->range.level[0].cat, &src->range.level[1].cat); if (rc) goto out; dst->range.level[1].sens = src->range.level[1].sens; rc = ebitmap_cpy(&dst->range.level[1].cat, &src->range.level[1].cat); if (rc) ebitmap_destroy(&dst->range.level[0].cat); out: return rc; } static inline int mls_context_glblub(struct context *dst, const struct context *c1, const struct context *c2) { struct mls_range *dr = &dst->range; const struct mls_range *r1 = &c1->range, *r2 = &c2->range; int rc = 0; if (r1->level[1].sens < r2->level[0].sens || r2->level[1].sens < r1->level[0].sens) /* These ranges have no common sensitivities */ return -EINVAL; /* Take the greatest of the low */ dr->level[0].sens = max(r1->level[0].sens, r2->level[0].sens); /* Take the least of the high */ dr->level[1].sens = min(r1->level[1].sens, r2->level[1].sens); rc = ebitmap_and(&dr->level[0].cat, &r1->level[0].cat, &r2->level[0].cat); if (rc) goto out; rc = ebitmap_and(&dr->level[1].cat, &r1->level[1].cat, &r2->level[1].cat); if (rc) goto out; out: return rc; } static inline bool mls_context_equal(const struct context *c1, const struct context *c2) { return ((c1->range.level[0].sens == c2->range.level[0].sens) && ebitmap_equal(&c1->range.level[0].cat, &c2->range.level[0].cat) && (c1->range.level[1].sens == c2->range.level[1].sens) && ebitmap_equal(&c1->range.level[1].cat, &c2->range.level[1].cat)); } static inline void mls_context_destroy(struct context *c) { ebitmap_destroy(&c->range.level[0].cat); ebitmap_destroy(&c->range.level[1].cat); mls_context_init(c); } static inline void context_init(struct context *c) { memset(c, 0, sizeof(*c)); } static inline int context_cpy(struct context *dst, const struct context *src) { int rc; dst->user = src->user; dst->role = src->role; dst->type = src->type; if (src->str) { dst->str = kstrdup(src->str, GFP_ATOMIC); if (!dst->str) return -ENOMEM; dst->len = src->len; } else { dst->str = NULL; dst->len = 0; } rc = mls_context_cpy(dst, src); if (rc) { kfree(dst->str); dst->str = NULL; dst->len = 0; return rc; } return 0; } static inline void context_destroy(struct context *c) { c->user = c->role = c->type = 0; kfree(c->str); c->str = NULL; c->len = 0; mls_context_destroy(c); } static inline bool context_equal(const struct context *c1, const struct context *c2) { if (c1->len && c2->len) return (c1->len == c2->len && !strcmp(c1->str, c2->str)); if (c1->len || c2->len) return 0; return ((c1->user == c2->user) && (c1->role == c2->role) && (c1->type == c2->type) && mls_context_equal(c1, c2)); } u32 context_compute_hash(const struct context *c); #endif /* _SS_CONTEXT_H_ */ |
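A minimal sketch of the copy/compare/destroy contract above; example_clone_and_compare() is an illustrative caller, not part of the header.

/* Usage sketch only: duplicate a context with context_cpy(), compare it with
 * context_equal(), then release the copy.  example_clone_and_compare() is a
 * made-up name. */
static int example_clone_and_compare(const struct context *src)
{
	struct context tmp;
	int rc;

	context_init(&tmp);		/* zero everything, including the MLS range */
	rc = context_cpy(&tmp, src);	/* may fail with -ENOMEM */
	if (rc)
		return rc;

	rc = context_equal(&tmp, src) ? 0 : -EINVAL;
	context_destroy(&tmp);		/* frees str and the category bitmaps */
	return rc;
}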
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2012,2013 - ARM Ltd
 * Author: Marc Zyngier <marc.zyngier@arm.com>
 *
 * Derived from arch/arm/kvm/handle_exit.c:
 * Copyright (C) 2012 - Virtual Open Systems and Columbia University
 * Author: Christoffer Dall <c.dall@virtualopensystems.com>
 */

#include <linux/kvm.h>
#include <linux/kvm_host.h>

#include <asm/esr.h>
#include <asm/exception.h>
#include <asm/kvm_asm.h>
#include <asm/kvm_emulate.h>
#include <asm/kvm_mmu.h>
#include <asm/kvm_nested.h>
#include <asm/debug-monitors.h>
#include <asm/stacktrace/nvhe.h>
#include <asm/traps.h>

#include <kvm/arm_hypercalls.h>

#define CREATE_TRACE_POINTS
#include "trace_handle_exit.h"

typedef int (*exit_handle_fn)(struct kvm_vcpu *);

static void kvm_handle_guest_serror(struct kvm_vcpu *vcpu, u64 esr)
{
        if (!arm64_is_ras_serror(esr) || arm64_is_fatal_ras_serror(NULL, esr))
                kvm_inject_vabt(vcpu);
}

static int handle_hvc(struct kvm_vcpu *vcpu)
{
        trace_kvm_hvc_arm64(*vcpu_pc(vcpu), vcpu_get_reg(vcpu, 0),
                            kvm_vcpu_hvc_get_imm(vcpu));
        vcpu->stat.hvc_exit_stat++;

        /* Forward hvc instructions to the virtual EL2 if the guest has EL2. */
        if (vcpu_has_nv(vcpu)) {
                if (vcpu_read_sys_reg(vcpu, HCR_EL2) & HCR_HCD)
                        kvm_inject_undefined(vcpu);
                else
                        kvm_inject_nested_sync(vcpu, kvm_vcpu_get_esr(vcpu));

                return 1;
        }

        return kvm_smccc_call_handler(vcpu);
}

static int handle_smc(struct kvm_vcpu *vcpu)
{
        /*
         * Forward this trapped smc instruction to the virtual EL2 if
         * the guest has asked for it.
         */
        if (forward_smc_trap(vcpu))
                return 1;

        /*
         * "If an SMC instruction executed at Non-secure EL1 is
         * trapped to EL2 because HCR_EL2.TSC is 1, the exception is a
         * Trap exception, not a Secure Monitor Call exception [...]"
         *
         * We need to advance the PC after the trap, as it would
         * otherwise return to the same address. Furthermore, pre-incrementing
         * the PC before potentially exiting to userspace maintains the same
         * abstraction for both SMCs and HVCs.
         */
        kvm_incr_pc(vcpu);

        /*
         * SMCs with a nonzero immediate are reserved according to DEN0028E 2.9
         * "SMC and HVC immediate value".
         */
        if (kvm_vcpu_hvc_get_imm(vcpu)) {
                vcpu_set_reg(vcpu, 0, ~0UL);
                return 1;
        }

        /*
         * If imm is zero then it is likely an SMCCC call.
         *
         * Note that on ARMv8.3, even if EL3 is not implemented, SMC executed
         * at Non-secure EL1 is trapped to EL2 if HCR_EL2.TSC==1, rather than
         * being treated as UNDEFINED.
         */
        return kvm_smccc_call_handler(vcpu);
}

/*
 * This handles the cases where the system does not support FP/ASIMD or when
 * we are running nested virtualization and the guest hypervisor is trapping
 * FP/ASIMD accesses by its own guest.
 *
 * All other handling of guest vs. host FP/ASIMD register state is handled in
 * fixup_guest_exit().
 */
static int kvm_handle_fpasimd(struct kvm_vcpu *vcpu)
{
        if (guest_hyp_fpsimd_traps_enabled(vcpu))
                return kvm_inject_nested_sync(vcpu, kvm_vcpu_get_esr(vcpu));

        /* This is the case when the system doesn't support FP/ASIMD. */
        kvm_inject_undefined(vcpu);
        return 1;
}

/**
 * kvm_handle_wfx - handle a wait-for-interrupts or wait-for-event
 *                  instruction executed by a guest
 *
 * @vcpu: the vcpu pointer
 *
 * WFE[T]: Yield the CPU and come back to this vcpu when the scheduler
 *         decides to.
 * WFI:    Simply call kvm_vcpu_halt(), which will halt execution of
 *         world-switches and schedule other host processes until there is an
 *         incoming IRQ or FIQ to the VM.
 * WFIT:   Same as WFI, with a timed wakeup implemented as a background timer
 *
 * WF{I,E}T can immediately return if the deadline has already expired.
 */
static int kvm_handle_wfx(struct kvm_vcpu *vcpu)
{
        u64 esr = kvm_vcpu_get_esr(vcpu);

        if (esr & ESR_ELx_WFx_ISS_WFE) {
                trace_kvm_wfx_arm64(*vcpu_pc(vcpu), true);
                vcpu->stat.wfe_exit_stat++;
        } else {
                trace_kvm_wfx_arm64(*vcpu_pc(vcpu), false);
                vcpu->stat.wfi_exit_stat++;
        }

        if (esr & ESR_ELx_WFx_ISS_WFxT) {
                if (esr & ESR_ELx_WFx_ISS_RV) {
                        u64 val, now;

                        now = kvm_arm_timer_get_reg(vcpu, KVM_REG_ARM_TIMER_CNT);
                        val = vcpu_get_reg(vcpu, kvm_vcpu_sys_get_rt(vcpu));

                        if (now >= val)
                                goto out;
                } else {
                        /* Treat WFxT as WFx if RN is invalid */
                        esr &= ~ESR_ELx_WFx_ISS_WFxT;
                }
        }

        if (esr & ESR_ELx_WFx_ISS_WFE) {
                kvm_vcpu_on_spin(vcpu, vcpu_mode_priv(vcpu));
        } else {
                if (esr & ESR_ELx_WFx_ISS_WFxT)
                        vcpu_set_flag(vcpu, IN_WFIT);

                kvm_vcpu_wfi(vcpu);
        }
out:
        kvm_incr_pc(vcpu);

        return 1;
}

/**
 * kvm_handle_guest_debug - handle a debug exception instruction
 *
 * @vcpu: the vcpu pointer
 *
 * We route all debug exceptions through the same handler. If both the
 * guest and host are using the same debug facilities it will be up to
 * userspace to re-inject the correct exception for guest delivery.
 *
 * @return: 0 (while setting vcpu->run->exit_reason)
 */
static int kvm_handle_guest_debug(struct kvm_vcpu *vcpu)
{
        struct kvm_run *run = vcpu->run;
        u64 esr = kvm_vcpu_get_esr(vcpu);

        if (!vcpu->guest_debug && forward_debug_exception(vcpu))
                return 1;

        run->exit_reason = KVM_EXIT_DEBUG;
        run->debug.arch.hsr = lower_32_bits(esr);
        run->debug.arch.hsr_high = upper_32_bits(esr);
        run->flags = KVM_DEBUG_ARCH_HSR_HIGH_VALID;

        switch (ESR_ELx_EC(esr)) {
        case ESR_ELx_EC_WATCHPT_LOW:
                run->debug.arch.far = vcpu->arch.fault.far_el2;
                break;
        case ESR_ELx_EC_SOFTSTP_LOW:
                *vcpu_cpsr(vcpu) |= DBG_SPSR_SS;
                break;
        }

        return 0;
}

static int kvm_handle_unknown_ec(struct kvm_vcpu *vcpu)
{
        u64 esr = kvm_vcpu_get_esr(vcpu);

        kvm_pr_unimpl("Unknown exception class: esr: %#016llx -- %s\n",
                      esr, esr_get_class_string(esr));

        kvm_inject_undefined(vcpu);
        return 1;
}

/*
 * Guest access to SVE registers should be routed to this handler only
 * when the system doesn't support SVE.
 */
static int handle_sve(struct kvm_vcpu *vcpu)
{
        if (guest_hyp_sve_traps_enabled(vcpu))
                return kvm_inject_nested_sync(vcpu, kvm_vcpu_get_esr(vcpu));

        kvm_inject_undefined(vcpu);
        return 1;
}

/*
 * Two possibilities to handle a trapping ptrauth instruction:
 *
 * - Guest usage of a ptrauth instruction (which the guest EL1 did not
 *   turn into a NOP). If we get here, it is because we didn't enable
 *   ptrauth for the guest. This results in an UNDEF, as it isn't
 *   supposed to use ptrauth without being told it could.
 *
 * - Running an L2 NV guest while L1 has left HCR_EL2.API==0, and for
 *   which we reinject the exception into L1.
 *
 * Anything else is an emulation bug (hence the WARN_ON + UNDEF).
 */
static int kvm_handle_ptrauth(struct kvm_vcpu *vcpu)
{
        if (!vcpu_has_ptrauth(vcpu)) {
                kvm_inject_undefined(vcpu);
                return 1;
        }

        if (vcpu_has_nv(vcpu) && !is_hyp_ctxt(vcpu)) {
                kvm_inject_nested_sync(vcpu, kvm_vcpu_get_esr(vcpu));
                return 1;
        }

        /* Really shouldn't be here! */
        WARN_ON_ONCE(1);
        kvm_inject_undefined(vcpu);
        return 1;
}

static int kvm_handle_eret(struct kvm_vcpu *vcpu)
{
        if (esr_iss_is_eretax(kvm_vcpu_get_esr(vcpu)) &&
            !vcpu_has_ptrauth(vcpu))
                return kvm_handle_ptrauth(vcpu);

        /*
         * If we got here, two possibilities:
         *
         * - the guest is in EL2, and we need to fully emulate ERET
         *
         * - the guest is in EL1, and we need to reinject the
         *   exception into the L1 hypervisor.
         *
         * If KVM ever traps ERET for its own use, we'll have to
         * revisit this.
         */
        if (is_hyp_ctxt(vcpu))
                kvm_emulate_nested_eret(vcpu);
        else
                kvm_inject_nested_sync(vcpu, kvm_vcpu_get_esr(vcpu));

        return 1;
}

static int handle_svc(struct kvm_vcpu *vcpu)
{
        /*
         * So far, SVC traps only for NV via HFGITR_EL2. An SVC from a
         * 32-bit guest would be caught by vcpu_mode_is_bad_32bit(), so
         * we should only have to deal with a 64-bit exception.
         */
        kvm_inject_nested_sync(vcpu, kvm_vcpu_get_esr(vcpu));
        return 1;
}

static exit_handle_fn arm_exit_handlers[] = {
        [0 ... ESR_ELx_EC_MAX]   = kvm_handle_unknown_ec,
        [ESR_ELx_EC_WFx]         = kvm_handle_wfx,
        [ESR_ELx_EC_CP15_32]     = kvm_handle_cp15_32,
        [ESR_ELx_EC_CP15_64]     = kvm_handle_cp15_64,
        [ESR_ELx_EC_CP14_MR]     = kvm_handle_cp14_32,
        [ESR_ELx_EC_CP14_LS]     = kvm_handle_cp14_load_store,
        [ESR_ELx_EC_CP10_ID]     = kvm_handle_cp10_id,
        [ESR_ELx_EC_CP14_64]     = kvm_handle_cp14_64,
        [ESR_ELx_EC_HVC32]       = handle_hvc,
        [ESR_ELx_EC_SMC32]       = handle_smc,
        [ESR_ELx_EC_HVC64]       = handle_hvc,
        [ESR_ELx_EC_SMC64]       = handle_smc,
        [ESR_ELx_EC_SVC64]       = handle_svc,
        [ESR_ELx_EC_SYS64]       = kvm_handle_sys_reg,
        [ESR_ELx_EC_SVE]         = handle_sve,
        [ESR_ELx_EC_ERET]        = kvm_handle_eret,
        [ESR_ELx_EC_IABT_LOW]    = kvm_handle_guest_abort,
        [ESR_ELx_EC_DABT_LOW]    = kvm_handle_guest_abort,
        [ESR_ELx_EC_SOFTSTP_LOW] = kvm_handle_guest_debug,
        [ESR_ELx_EC_WATCHPT_LOW] = kvm_handle_guest_debug,
        [ESR_ELx_EC_BREAKPT_LOW] = kvm_handle_guest_debug,
        [ESR_ELx_EC_BKPT32]      = kvm_handle_guest_debug,
        [ESR_ELx_EC_BRK64]       = kvm_handle_guest_debug,
        [ESR_ELx_EC_FP_ASIMD]    = kvm_handle_fpasimd,
        [ESR_ELx_EC_PAC]         = kvm_handle_ptrauth,
};

static exit_handle_fn kvm_get_exit_handler(struct kvm_vcpu *vcpu)
{
        u64 esr = kvm_vcpu_get_esr(vcpu);
        u8 esr_ec = ESR_ELx_EC(esr);

        return arm_exit_handlers[esr_ec];
}

/*
 * We may be single-stepping an emulated instruction. If the emulation
 * has been completed in the kernel, we can return to userspace with a
 * KVM_EXIT_DEBUG, otherwise userspace needs to complete its
 * emulation first.
 */
static int handle_trap_exceptions(struct kvm_vcpu *vcpu)
{
        int handled;

        /*
         * See ARM ARM B1.14.1: "Hyp traps on instructions
         * that fail their condition code check"
         */
        if (!kvm_condition_valid(vcpu)) {
                kvm_incr_pc(vcpu);
                handled = 1;
        } else {
                exit_handle_fn exit_handler;

                exit_handler = kvm_get_exit_handler(vcpu);
                handled = exit_handler(vcpu);
        }

        return handled;
}

/*
 * Return > 0 to return to guest, < 0 on error, 0 (and set exit_reason) on
 * proper exit to userspace.
 */
int handle_exit(struct kvm_vcpu *vcpu, int exception_index)
{
        struct kvm_run *run = vcpu->run;

        if (ARM_SERROR_PENDING(exception_index)) {
                /*
                 * The SError is handled by handle_exit_early(). If the guest
                 * survives it will re-execute the original instruction.
                 */
                return 1;
        }

        exception_index = ARM_EXCEPTION_CODE(exception_index);

        switch (exception_index) {
        case ARM_EXCEPTION_IRQ:
                return 1;
        case ARM_EXCEPTION_EL1_SERROR:
                return 1;
        case ARM_EXCEPTION_TRAP:
                return handle_trap_exceptions(vcpu);
        case ARM_EXCEPTION_HYP_GONE:
                /*
                 * EL2 has been reset to the hyp-stub. This happens when a
                 * guest is pre-empted by kvm_reboot()'s shutdown call.
                 */
                run->exit_reason = KVM_EXIT_FAIL_ENTRY;
                return 0;
        case ARM_EXCEPTION_IL:
                /*
                 * We attempted an illegal exception return.  Guest state must
                 * have been corrupted somehow.  Give up.
                 */
                run->exit_reason = KVM_EXIT_FAIL_ENTRY;
                return -EINVAL;
        default:
                kvm_pr_unimpl("Unsupported exception type: %d", exception_index);
                run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
                return 0;
        }
}

/* For exit types that need handling before we can be preempted */
void handle_exit_early(struct kvm_vcpu *vcpu, int exception_index)
{
        if (ARM_SERROR_PENDING(exception_index)) {
                if (this_cpu_has_cap(ARM64_HAS_RAS_EXTN)) {
                        u64 disr = kvm_vcpu_get_disr(vcpu);

                        kvm_handle_guest_serror(vcpu, disr_to_esr(disr));
                } else {
                        kvm_inject_vabt(vcpu);
                }

                return;
        }

        exception_index = ARM_EXCEPTION_CODE(exception_index);

        if (exception_index == ARM_EXCEPTION_EL1_SERROR)
                kvm_handle_guest_serror(vcpu, kvm_vcpu_get_esr(vcpu));
}

static void print_nvhe_hyp_panic(const char *name, u64 panic_addr)
{
        kvm_err("nVHE hyp %s at: [<%016llx>] %pB!\n", name, panic_addr,
                (void *)(panic_addr + kaslr_offset()));
}

static void kvm_nvhe_report_cfi_failure(u64 panic_addr)
{
        print_nvhe_hyp_panic("CFI failure", panic_addr);

        if (IS_ENABLED(CONFIG_CFI_PERMISSIVE))
                kvm_err(" (CONFIG_CFI_PERMISSIVE ignored for hyp failures)\n");
}

void __noreturn __cold nvhe_hyp_panic_handler(u64 esr, u64 spsr,
                                              u64 elr_virt, u64 elr_phys,
                                              u64 par, uintptr_t vcpu,
                                              u64 far, u64 hpfar)
{
        u64 elr_in_kimg = __phys_to_kimg(elr_phys);
        u64 hyp_offset = elr_in_kimg - kaslr_offset() - elr_virt;
        u64 mode = spsr & PSR_MODE_MASK;
        u64 panic_addr = elr_virt + hyp_offset;

        if (mode != PSR_MODE_EL2t && mode != PSR_MODE_EL2h) {
                kvm_err("Invalid host exception to nVHE hyp!\n");
        } else if (ESR_ELx_EC(esr) == ESR_ELx_EC_BRK64 &&
                   esr_brk_comment(esr) == BUG_BRK_IMM) {
                const char *file = NULL;
                unsigned int line = 0;

                /* All hyp bugs, including warnings, are treated as fatal. */
                if (!is_protected_kvm_enabled() ||
                    IS_ENABLED(CONFIG_NVHE_EL2_DEBUG)) {
                        struct bug_entry *bug = find_bug(elr_in_kimg);

                        if (bug)
                                bug_get_file_line(bug, &file, &line);
                }

                if (file)
                        kvm_err("nVHE hyp BUG at: %s:%u!\n", file, line);
                else
                        print_nvhe_hyp_panic("BUG", panic_addr);
        } else if (IS_ENABLED(CONFIG_CFI_CLANG) && esr_is_cfi_brk(esr)) {
                kvm_nvhe_report_cfi_failure(panic_addr);
        } else {
                print_nvhe_hyp_panic("panic", panic_addr);
        }

        /* Dump the nVHE hypervisor backtrace */
        kvm_nvhe_dump_backtrace(hyp_offset);

        /*
         * Hyp has panicked and we're going to handle that by panicking the
         * kernel. The kernel offset will be revealed in the panic so we're
         * also safe to reveal the hyp offset as a debugging aid for
         * translating hyp VAs to vmlinux addresses.
         */
        kvm_err("Hyp Offset: 0x%llx\n", hyp_offset);

        panic("HYP panic:\nPS:%08llx PC:%016llx ESR:%016llx\nFAR:%016llx HPFAR:%016llx PAR:%016llx\nVCPU:%016lx\n",
              spsr, elr_virt, esr, far, hpfar, par, vcpu);
}
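
/*
 * Editor's illustrative sketch -- not part of the file above. It shows how a
 * caller is expected to consume the return-value convention of handle_exit()
 * (> 0: re-enter the guest, 0: exit to userspace with run->exit_reason set,
 * < 0: error). The function name is hypothetical and heavily simplified; the
 * real consumer is the vcpu run loop in arch/arm64/kvm/arm.c, which also
 * re-enables interrupts between the two calls.
 */
static __maybe_unused int example_consume_exit(struct kvm_vcpu *vcpu,
                                               int exception_index)
{
        int ret;

        /* SError-related work that must happen before we can be preempted. */
        handle_exit_early(vcpu, exception_index);

        ret = handle_exit(vcpu, exception_index);
        if (ret > 0)
                return 1;       /* go back into the guest                    */
        if (ret == 0)
                return 0;       /* vcpu->run->exit_reason tells userspace why */
        return ret;             /* negative errno: give up                   */
}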
/* SPDX-License-Identifier: GPL-2.0 */
#include <linux/syscalls.h>
#include <linux/export.h>
#include <linux/uaccess.h>
#include <linux/fs_struct.h>
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/prefetch.h>
#include "mount.h"
#include "internal.h"

struct prepend_buffer {
        char *buf;
        int len;
};
#define DECLARE_BUFFER(__name, __buf, __len) \
        struct prepend_buffer __name = {.buf = __buf + __len, .len = __len}

static char *extract_string(struct prepend_buffer *p)
{
        if (likely(p->len >= 0))
                return p->buf;
        return ERR_PTR(-ENAMETOOLONG);
}

static bool prepend_char(struct prepend_buffer *p, unsigned char c)
{
        if (likely(p->len > 0)) {
                p->len--;
                *--p->buf = c;
                return true;
        }
        p->len = -1;
        return false;
}

/*
 * The source of the prepend data can be an optimistic load
 * of a dentry name and length. And because we don't hold any
 * locks, the length and the pointer to the name may not be
 * in sync if a concurrent rename happens, and the kernel
 * copy might fault as a result.
 *
 * The end result will correct itself when we check the
 * rename sequence count, but we need to be able to handle
 * the fault gracefully.
 */
static bool prepend_copy(void *dst, const void *src, int len)
{
        if (unlikely(copy_from_kernel_nofault(dst, src, len))) {
                memset(dst, 'x', len);
                return false;
        }
        return true;
}

static bool prepend(struct prepend_buffer *p, const char *str, int namelen)
{
        // Already overflowed?
        if (p->len < 0)
                return false;

        // Will overflow?
        if (p->len < namelen) {
                // Fill as much as possible from the end of the name
                str += namelen - p->len;
                p->buf -= p->len;
                prepend_copy(p->buf, str, p->len);
                p->len = -1;
                return false;
        }

        // Fits fully
        p->len -= namelen;
        p->buf -= namelen;
        return prepend_copy(p->buf, str, namelen);
}

/**
 * prepend_name - prepend a pathname in front of current buffer pointer
 * @p: prepend buffer which contains buffer pointer and allocated length
 * @name: name string and length qstr structure
 *
 * With RCU path tracing, it may race with d_move(). Use READ_ONCE() to
 * make sure that either the old or the new name pointer and length are
 * fetched. However, there may be a mismatch between the length and the
 * pointer. Since the length cannot be trusted, we need to copy the name
 * very carefully when doing the prepend_copy(). It also prepends "/" at
 * the beginning of the name. The sequence number check at the caller will
 * retry when a d_move() does happen, so any garbage in the buffer due to
 * a mismatched pointer and length will be discarded.
 *
 * Load acquire is needed to make sure that we see the new name data even
 * if we might get the length wrong.
 */
static bool prepend_name(struct prepend_buffer *p, const struct qstr *name)
{
        const char *dname = smp_load_acquire(&name->name); /* ^^^ */
        u32 dlen = READ_ONCE(name->len);

        return prepend(p, dname, dlen) && prepend_char(p, '/');
}

static int __prepend_path(const struct dentry *dentry, const struct mount *mnt,
                          const struct path *root, struct prepend_buffer *p)
{
        while (dentry != root->dentry || &mnt->mnt != root->mnt) {
                const struct dentry *parent = READ_ONCE(dentry->d_parent);

                if (dentry == mnt->mnt.mnt_root) {
                        struct mount *m = READ_ONCE(mnt->mnt_parent);
                        struct mnt_namespace *mnt_ns;

                        if (likely(mnt != m)) {
                                dentry = READ_ONCE(mnt->mnt_mountpoint);
                                mnt = m;
                                continue;
                        }
                        /* Global root */
                        mnt_ns = READ_ONCE(mnt->mnt_ns);
                        /* open-coded is_mounted() to use local mnt_ns */
                        if (!IS_ERR_OR_NULL(mnt_ns) && !is_anon_ns(mnt_ns))
                                return 1;       // absolute root
                        else
                                return 2;       // detached or not attached yet
                }

                if (unlikely(dentry == parent))
                        /* Escaped? */
                        return 3;

                prefetch(parent);
                if (!prepend_name(p, &dentry->d_name))
                        break;
                dentry = parent;
        }
        return 0;
}
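
/*
 * Editor's illustrative sketch -- not part of the original file. It shows how
 * the prepend helpers above build a string right to left: reserve the
 * terminating NUL first, then push components from last to first, and read
 * the result back with extract_string(). The function name is hypothetical.
 */
static __maybe_unused char *example_join(const char *dir, const char *base,
                                         char *buf, int buflen)
{
        DECLARE_BUFFER(b, buf, buflen);

        prepend_char(&b, 0);                    /* terminating NUL goes in first */
        prepend(&b, base, strlen(base));        /* then the final component      */
        prepend_char(&b, '/');
        prepend(&b, dir, strlen(dir));          /* then everything before it     */
        return extract_string(&b);              /* pointer into the tail of buf,
                                                 * or ERR_PTR(-ENAMETOOLONG)     */
}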
/**
 * prepend_path - Prepend path string to a buffer
 * @path: the dentry/vfsmount to report
 * @root: root vfsmnt/dentry
 * @p: prepend buffer which contains buffer pointer and allocated length
 *
 * The function will first try to write out the pathname without taking any
 * lock other than the RCU read lock to make sure that dentries won't go away.
 * It only checks the sequence number of the global rename_lock as any change
 * in the dentry's d_seq will be preceded by changes in the rename_lock
 * sequence number. If the sequence number had been changed, it will restart
 * the whole pathname back-tracing sequence again by taking the rename_lock.
 * In this case, there is no need to take the RCU read lock as the recursive
 * parent pointer references will keep the dentry chain alive as long as no
 * rename operation is performed.
 */
static int prepend_path(const struct path *path,
                        const struct path *root,
                        struct prepend_buffer *p)
{
        unsigned seq, m_seq = 0;
        struct prepend_buffer b;
        int error;

        rcu_read_lock();
restart_mnt:
        read_seqbegin_or_lock(&mount_lock, &m_seq);
        seq = 0;
        rcu_read_lock();
restart:
        b = *p;
        read_seqbegin_or_lock(&rename_lock, &seq);
        error = __prepend_path(path->dentry, real_mount(path->mnt), root, &b);
        if (!(seq & 1))
                rcu_read_unlock();
        if (need_seqretry(&rename_lock, seq)) {
                seq = 1;
                goto restart;
        }
        done_seqretry(&rename_lock, seq);

        if (!(m_seq & 1))
                rcu_read_unlock();
        if (need_seqretry(&mount_lock, m_seq)) {
                m_seq = 1;
                goto restart_mnt;
        }
        done_seqretry(&mount_lock, m_seq);

        if (unlikely(error == 3))
                b = *p;

        if (b.len == p->len)
                prepend_char(&b, '/');

        *p = b;
        return error;
}

/**
 * __d_path - return the path of a dentry
 * @path: the dentry/vfsmount to report
 * @root: root vfsmnt/dentry
 * @buf: buffer to return value in
 * @buflen: buffer length
 *
 * Convert a dentry into an ASCII path name.
 *
 * Returns a pointer into the buffer or an error code if the
 * path was too long.
 *
 * "buflen" should be positive.
 *
 * If the path is not reachable from the supplied root, return %NULL.
 */
char *__d_path(const struct path *path,
               const struct path *root,
               char *buf, int buflen)
{
        DECLARE_BUFFER(b, buf, buflen);

        prepend_char(&b, 0);
        if (unlikely(prepend_path(path, root, &b) > 0))
                return NULL;
        return extract_string(&b);
}

char *d_absolute_path(const struct path *path,
                      char *buf, int buflen)
{
        struct path root = {};
        DECLARE_BUFFER(b, buf, buflen);

        prepend_char(&b, 0);
        if (unlikely(prepend_path(path, &root, &b) > 1))
                return ERR_PTR(-EINVAL);
        return extract_string(&b);
}

static void get_fs_root_rcu(struct fs_struct *fs, struct path *root)
{
        unsigned seq;

        do {
                seq = read_seqcount_begin(&fs->seq);
                *root = fs->root;
        } while (read_seqcount_retry(&fs->seq, seq));
}

/**
 * d_path - return the path of a dentry
 * @path: path to report
 * @buf: buffer to return value in
 * @buflen: buffer length
 *
 * Convert a dentry into an ASCII path name. If the entry has been deleted
 * the string " (deleted)" is appended. Note that this is ambiguous.
 *
 * Returns a pointer into the buffer or an error code if the path was
 * too long. Note: Callers should use the returned pointer, not the passed
 * in buffer, to use the name! The implementation often starts at an offset
 * into the buffer, and may leave 0 bytes at the start.
 *
 * "buflen" should be positive.
 */
char *d_path(const struct path *path, char *buf, int buflen)
{
        DECLARE_BUFFER(b, buf, buflen);
        struct path root;

        /*
         * We have various synthetic filesystems that never get mounted.  On
         * these filesystems dentries are never used for lookup purposes, and
         * thus don't need to be hashed.  They also don't need a name until a
         * user wants to identify the object in /proc/pid/fd/.  The little hack
         * below allows us to generate a name for these objects on demand:
         *
         * Some pseudo inodes